doberst committed on
Commit
9af6eae
1 Parent(s): d8ae236

Upload 8 files

config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "architectures": [
+     "DeciLMForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_decilm.DeciLMConfig",
+     "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM"
+   },
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 4096,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads_per_layer": [4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4],
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {"type": "dynamic", "factor": 2.0},
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "use_bfloat16": true,
+   "transformers_version": "4.31.0",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
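The auto_map block above routes AutoConfig and AutoModelForCausalLM to the custom classes shipped in this repo, so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch, assuming the checkpoint is available under a repo id such as Deci/DeciLM-6b (the id used in hf_benchmark_example.py below) or a local clone of this repo:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Deci/DeciLM-6b"  # assumption: substitute the actual repo id or local path of this checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code=True lets transformers import configuration_decilm.py and
# modeling_decilm.py through the auto_map entries in config.json.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

inputs = tokenizer("Measuring GPU inference latency correctly requires", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))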
configuration_decilm.py ADDED
@@ -0,0 +1,46 @@
+ from packaging import version
+ import transformers
+ 
+ if version.parse(transformers.__version__) < version.parse("4.31.0"):
+     raise ImportError(
+         f"You are using transformers=={transformers.__version__}, but transformers>=4.31.0 is required to use DeciLM. Please upgrade transformers."
+     )
+ 
+ from transformers.models.llama.configuration_llama import LlamaConfig
+ from transformers.utils import logging
+ 
+ 
+ logger = logging.get_logger(__name__)
+ 
+ LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+ 
+ 
+ class DeciLMConfig(LlamaConfig):
+     r"""
+     Args:
+         num_key_value_heads_per_layer (`List[int]`):
+             The number of key-value heads per layer.
+         naive_attention_prefill (`bool`, *optional*, defaults to `False`):
+             Whether to use naive matmul instead of scaled dot-product attention during prefill.
+         naive_attention_decode_batched (`bool`, *optional*, defaults to `False`):
+             Whether to use naive matmul instead of scaled dot-product attention during decode for batch_size > 1.
+         naive_attention_decode_single (`bool`, *optional*, defaults to `False`):
+             Whether to use naive matmul instead of scaled dot-product attention during decode for batch_size == 1.
+     """
+     keys_to_ignore_at_inference = ["past_key_values"]
+ 
+     def __init__(
+         self,
+         num_key_value_heads_per_layer: list = None,
+         naive_attention_prefill: bool = False,
+         naive_attention_decode_batched: bool = False,
+         naive_attention_decode_single: bool = False,
+         **kwargs,
+     ):
+         self.num_key_value_heads_per_layer = num_key_value_heads_per_layer
+         self.naive_attention_prefill = naive_attention_prefill
+         self.naive_attention_decode_batched = naive_attention_decode_batched
+         self.naive_attention_decode_single = naive_attention_decode_single
+         super().__init__(**kwargs)
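DeciLMConfig adds one field on top of LlamaConfig, a per-layer list of key/value head counts, which is what lets DeciLM vary its grouped-query attention across layers, plus three flags that switch individual attention paths to a naive matmul implementation. A small construction sketch, assuming configuration_decilm.py is importable from the current directory (e.g. a local clone of this repo); the values mirror config.json above:

from configuration_decilm import DeciLMConfig

config = DeciLMConfig(
    hidden_size=4096,
    intermediate_size=11008,
    num_attention_heads=32,
    num_hidden_layers=32,
    # One entry per decoder layer: the first layer uses 4 KV heads,
    # most middle layers use 2 or 1, and the last two use 4 again.
    num_key_value_heads_per_layer=[4] + [2] * 9 + [1] * 20 + [4, 4],
    vocab_size=32000,
)

assert len(config.num_key_value_heads_per_layer) == config.num_hidden_layers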
hf_benchmark_example.py ADDED
@@ -0,0 +1,214 @@
+ """
+ Command-line examples.
+ You need a text file (default path "sample.txt") whose tokens are used as the prompt, or pass --text_file "path/to/text.txt" to point at a different text file.
+ You can use the attached "sample.txt" file, which contains one of Deci's blog posts, as the prompt.
+ 
+ # Run this and record tokens per second (652 tokens per second on A10 for DeciLM-6b)
+ python hf_benchmark_example.py --model Deci/DeciLM-6b
+ 
+ # Run this and record tokens per second (136 tokens per second on A10 for meta-llama/Llama-2-7b-hf), CUDA OOM above batch size 8
+ python hf_benchmark_example.py --model meta-llama/Llama-2-7b-hf --batch_size 8
+ """
+ 
+ import json
+ from argparse import ArgumentParser
+ 
+ import datasets
+ import torch
+ import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
+ 
+ def parse_args():
+     parser = ArgumentParser()
+ 
+     parser.add_argument(
+         "--model",
+         required=True,
+         help="Model to evaluate, provide a repo name in Hugging Face hub or a local path",
+     )
+     parser.add_argument(
+         "--temperature",
+         default=0.2,
+         type=float,
+     )
+     parser.add_argument(
+         "--top_p",
+         default=0.95,
+         type=float,
+     )
+     parser.add_argument(
+         "--top_k",
+         default=0,
+         type=int,
+     )
+     parser.add_argument(
+         "--revision",
+         default=None,
+         help="Model revision to use",
+     )
+     parser.add_argument(
+         "--iterations",
+         type=int,
+         default=6,
+         help="Number of timed iterations to run",
+     )
+     parser.add_argument(
+         "--batch_size",
+         type=int,
+         default=64,
+         help="Batch size used for generation",
+     )
+     parser.add_argument(
+         "--prompt_length",
+         type=int,
+         default=512,
+         help="Number of prompt tokens taken from the text file",
+     )
+     parser.add_argument(
+         "--max_new_tokens",
+         type=int,
+         default=512,
+         help="Maximum number of new tokens to generate",
+     )
+     parser.add_argument(
+         "--precision",
+         type=str,
+         default="bf16",
+         help="Model precision, from: fp32, fp16 or bf16",
+     )
+     parser.add_argument(
+         "--text_file",
+         type=str,
+         default="sample.txt",
+         help="Text file that will be used to generate tokens for prompts",
+     )
+     parser.add_argument(
+         "--load_in_8bit",
+         action="store_true",
+         help="Load model in 8bit",
+     )
+     parser.add_argument(
+         "--load_in_4bit",
+         action="store_true",
+         help="Load model in 4bit",
+     )
+     return parser.parse_args()
+ 
+ 
+ def main():
+     args = parse_args()
+     transformers.logging.set_verbosity_error()
+     datasets.logging.set_verbosity_error()
+ 
+     dict_precisions = {
+         "fp32": torch.float32,
+         "fp16": torch.float16,
+         "bf16": torch.bfloat16,
+     }
+     if args.precision not in dict_precisions:
+         raise ValueError(
+             f"Non valid precision {args.precision}, choose from: fp16, fp32, bf16"
+         )
+     if args.load_in_8bit:
+         print("Loading model in 8bit")
+         # the model needs to fit in one GPU
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model,
+             revision=args.revision,
+             load_in_8bit=True,
+             trust_remote_code=True,
+             use_auth_token=True,
+             device_map={"": "cuda"},
+         )
+     elif args.load_in_4bit:
+         print("Loading model in 4bit")
+         # the model needs to fit in one GPU
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model,
+             revision=args.revision,
+             load_in_4bit=True,
+             trust_remote_code=True,
+             use_auth_token=True,
+             device_map={"": "cuda"},
+         )
+     else:
+         print(f"Loading model in {args.precision}")
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model,
+             revision=args.revision,
+             torch_dtype=dict_precisions[args.precision],
+             trust_remote_code=True,
+             use_auth_token=True,
+         )
+ 
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.model,
+         revision=args.revision,
+         trust_remote_code=True,
+         use_auth_token=True,
+     )
+ 
+     starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+     if not (args.load_in_8bit or args.load_in_4bit):
+         # quantized models are already placed on the GPU via device_map
+         model.cuda()
+     model.eval()
+ 
+     with open(args.text_file, "r") as f:
+         prompt = f.read()
+ 
+     prompt = torch.tensor(tokenizer.encode(prompt))[:args.prompt_length].cuda()
+ 
+     results = {'prefill': [], 'gen': [], 'max_new_tokens': args.max_new_tokens, 'prompt_length': args.prompt_length, 'model': args.model, 'batch_size': args.batch_size}
+     inputs = prompt.repeat(args.batch_size, 1)
+ 
+     # warmup
+     print('start warmup')
+     for _ in range(10):
+         with torch.no_grad():
+             _ = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=1,
+                 do_sample=False,
+             )
+     print('finish warmup')
+     torch.cuda.synchronize()
+ 
+     for prefill_iter in range(args.iterations):
+         starter.record()
+         with torch.no_grad():
+             _ = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=1,
+                 do_sample=False,
+             )
+         ender.record()
+         torch.cuda.synchronize()
+         t = starter.elapsed_time(ender) / 1000
+         results['prefill'].append(t)
+         print(f'{args.batch_size} prefill iter {prefill_iter} took: {t}')
+ 
+     for gen_iter in range(args.iterations):
+         starter.record()
+         with torch.no_grad():
+             _ = model.generate(
+                 input_ids=inputs,
+                 max_new_tokens=args.max_new_tokens,
+                 do_sample=False,
+             )
+         ender.record()
+         torch.cuda.synchronize()
+         t = starter.elapsed_time(ender) / 1000
+         results['gen'].append(t)
+ 
+         print(f'{args.batch_size} total generation iter {gen_iter} took: {t}')
+         print(f'{args.batch_size * args.max_new_tokens / t} tokens per second')
+     model_str = args.model.split('/')[-1]
+     with open(f'timing_{model_str}_{args.batch_size}.json', 'w') as f:
+         json.dump(results, f)
+ 
+ 
+ if __name__ == "__main__":
+     main()
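The script dumps its raw prefill and generation timings to timing_<model>_<batch_size>.json. A small, hypothetical post-processing sketch for turning that file into a tokens-per-second summary; the file name shown assumes the default --batch_size 64 with --model Deci/DeciLM-6b, and the keys match what the script writes:

import json
import statistics

# File name depends on the --model and --batch_size used for the run.
with open("timing_DeciLM-6b_64.json") as f:
    results = json.load(f)

batch_size = results["batch_size"]
max_new_tokens = results["max_new_tokens"]

prefill_times = results["prefill"]  # seconds per 1-new-token (prefill-dominated) call
gen_times = results["gen"]          # seconds per full generation call

mean_gen = statistics.mean(gen_times)
print(f"mean prefill time: {statistics.mean(prefill_times):.3f} s")
print(f"mean generation time: {mean_gen:.3f} s")
print(f"throughput: {batch_size * max_new_tokens / mean_gen:.1f} tokens per second")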
modeling_decilm.py ADDED
@@ -0,0 +1,254 @@
+ # coding=utf-8
+ # Copyright and license here
+ """ PyTorch DeciLM model."""
+ import math
+ from typing import Optional, Tuple
+ 
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from torch import nn
+ from packaging import version
+ import transformers
+ 
+ if version.parse(transformers.__version__) < version.parse("4.31.0"):
+     raise ImportError(
+         f"You are using transformers=={transformers.__version__}, but transformers>=4.31.0 is required to use DeciLM. Please upgrade transformers."
+     )
+ 
+ from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, LlamaAttention, apply_rotary_pos_emb, \
+     repeat_kv, LlamaPreTrainedModel, LLAMA_START_DOCSTRING, LlamaDecoderLayer, LlamaForCausalLM, LlamaModel
+ from transformers.utils import add_start_docstrings
+ 
+ from .configuration_decilm import DeciLMConfig
+ 
+ _CONFIG_FOR_DOC = "DeciLMConfig"
+ 
+ 
+ class DeciLMAttention(LlamaAttention):
+     """Multi-headed attention from 'Attention Is All You Need' paper"""
+ 
+     def __init__(self, config: DeciLMConfig, layer_idx: int):
+         nn.Module.__init__(self)
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.hidden_size // self.num_heads
+         self.layer_idx = layer_idx
+         self.num_key_value_heads = config.num_key_value_heads_per_layer[layer_idx]
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+         self.pretraining_tp = config.pretraining_tp
+         self.max_position_embeddings = config.max_position_embeddings
+         self.rope_theta = getattr(config, 'rope_theta', None)
+ 
+         if (self.head_dim * self.num_heads) != self.hidden_size:
+             raise ValueError(
+                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                 f" and `num_heads`: {self.num_heads})."
+             )
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ 
+         self.naive_attention_prefill = config.naive_attention_prefill
+         self.naive_attention_decode_batched = config.naive_attention_decode_batched
+         self.naive_attention_decode_single = config.naive_attention_decode_single
+         self._init_rope()
+ 
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Tuple[torch.Tensor]] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+         padding_mask: Optional[torch.LongTensor] = None,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         bsz, q_len, _ = hidden_states.size()
+         is_decode = past_key_value is not None
+         if self.pretraining_tp > 1:
+             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp
+             query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0)
+             key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+             value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+ 
+             query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)]
+             query_states = torch.cat(query_states, dim=-1)
+ 
+             key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)]
+             key_states = torch.cat(key_states, dim=-1)
+ 
+             value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)]
+             value_states = torch.cat(value_states, dim=-1)
+ 
+         else:
+             query_states = self.q_proj(hidden_states)
+             key_states = self.k_proj(hidden_states)
+             value_states = self.v_proj(hidden_states)
+ 
+         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ 
+         kv_seq_len = key_states.shape[-2]
+         if past_key_value is not None:
+             kv_seq_len += past_key_value[0].shape[-2]
+         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ 
+         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+ 
+         if past_key_value is not None:
+             # reuse k, v, self_attention
+             key_states = torch.cat([past_key_value[0], key_states], dim=2)
+             value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ 
+         past_key_value = (key_states, value_states) if use_cache else None
+ 
+         # repeat k/v heads if n_kv_heads < n_heads
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+ 
+         if is_decode:
+             if (self.naive_attention_decode_batched and bsz > 1) or (self.naive_attention_decode_single and bsz == 1):
+                 attn_weights = (query_states @ key_states.transpose(-2, -1)) / math.sqrt(key_states.size(-1))
+                 if attention_mask is not None:
+                     if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                         raise ValueError(
+                             f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                         )
+                     # apply the mask before the softmax
+                     attn_weights = attn_weights + attention_mask
+                 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+                 attn_output = torch.matmul(attn_weights, value_states)
+             else:
+                 attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, is_causal=False,
+                                                              dropout_p=0.0)
+             attn_output = attn_output.contiguous().view(bsz, q_len, self.hidden_size)
+ 
+         else:
+             if not self.naive_attention_prefill:
+                 with torch.backends.cuda.sdp_kernel(enable_math=True, enable_flash=False, enable_mem_efficient=False):
+                     attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, is_causal=True,
+                                                                  dropout_p=0.0)
+             else:
+                 attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+                 if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+                     raise ValueError(
+                         f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                         f" {attn_weights.size()}"
+                     )
+ 
+                 if attention_mask is not None:
+                     if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                         raise ValueError(
+                             f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                         )
+                     attn_weights = attn_weights + attention_mask
+ 
+                 # upcast attention to fp32
+                 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+                 attn_output = torch.matmul(attn_weights, value_states)
+ 
+             if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+                 raise ValueError(
+                     f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                     f" {attn_output.size()}"
+                 )
+ 
+             attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
+ 
+         if self.pretraining_tp > 1:
+             attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
+             o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1)
+             attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)])
+         else:
+             attn_output = self.o_proj(attn_output)
+ 
+         if not output_attentions:
+             attn_weights = None
+ 
+         return attn_output, attn_weights, past_key_value
+ 
+ 
+ class DeciLMDecoderLayer(LlamaDecoderLayer):
+     def __init__(self, config: DeciLMConfig, layer_idx: int):
+         nn.Module.__init__(self)
+         self.hidden_size = config.hidden_size
+         self.layer_idx = layer_idx
+         self.self_attn = DeciLMAttention(config=config, layer_idx=layer_idx)
+         self.mlp = LlamaMLP(config)
+         self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ 
+ 
+ @add_start_docstrings(
+     "The bare DeciLM Model outputting raw hidden-states without any specific head on top.",
+     LLAMA_START_DOCSTRING,
+ )
+ class DeciLMPreTrainedModel(LlamaPreTrainedModel):
+     config_class = DeciLMConfig
+     _no_split_modules = ["DeciLMDecoderLayer"]
+     _keys_to_ignore_on_load_missing = ["self_attn.rotary_emb.inv_freq"]
+ 
+ 
+ @add_start_docstrings(
+     "The bare DeciLM Model outputting raw hidden-states without any specific head on top.",
+     LLAMA_START_DOCSTRING,
+ )
+ class DeciLMModel(LlamaModel, DeciLMPreTrainedModel):
+     """
+     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeciLMDecoderLayer`].
+ 
+     Args:
+         config: DeciLMConfig
+     """
+ 
+     def __init__(self, config: DeciLMConfig):
+         DeciLMPreTrainedModel.__init__(self, config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+ 
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+         self.layers = nn.ModuleList([DeciLMDecoderLayer(config, layer_idx) for layer_idx
+                                      in range(config.num_hidden_layers)])
+         self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ 
+         self.gradient_checkpointing = False
+         # Initialize weights and apply final processing
+         self.post_init()
+ 
+     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+         self._validate_config_supports_attention_mask(attention_mask, input_shape, past_key_values_length)
+         return LlamaModel._prepare_decoder_attention_mask(
+             self, attention_mask, input_shape, inputs_embeds, past_key_values_length)
+ 
+     def _validate_config_supports_attention_mask(self, attention_mask, input_shape, past_key_values_length):
+         is_decode = past_key_values_length > 0
+         if not torch.all(torch.eq(attention_mask, 1)).item():
+             if is_decode:
+                 if input_shape[0] == 1 and not self.config.naive_attention_decode_single:
+                     raise ValueError(
+                         "For support of custom attention masks please set naive_attention_decode_single to True in "
+                         "the config")
+                 elif input_shape[0] > 1 and not self.config.naive_attention_decode_batched:
+                     raise ValueError(
+                         "For support of custom attention masks please set naive_attention_decode_batched to True in "
+                         "the config")
+             else:
+                 if not self.config.naive_attention_prefill:
+                     raise ValueError("For support of custom attention masks please set naive_attention_prefill to "
+                                      "True in the config")
+ 
+ 
+ class DeciLMForCausalLM(LlamaForCausalLM, DeciLMPreTrainedModel):
+     def __init__(self, config):
+         DeciLMPreTrainedModel.__init__(self, config)
+         self.model = DeciLMModel(config)
+         self.pretraining_tp = config.pretraining_tp
+         self.vocab_size = config.vocab_size
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ 
+         # Initialize weights and apply final processing
+         self.post_init()
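Each DeciLMAttention layer sizes its k_proj and v_proj according to that layer's entry in num_key_value_heads_per_layer, which is the variable grouped-query attention that distinguishes DeciLM from a plain Llama block. A small inspection sketch, assuming `model` is a DeciLMForCausalLM loaded with trust_remote_code=True as sketched earlier:

# Print per-layer KV projection shapes to see the variable grouped-query attention.
def summarize_kv_heads(model):
    for idx, layer in enumerate(model.model.layers):
        attn = layer.self_attn
        out_features, in_features = attn.k_proj.weight.shape
        print(f"layer {idx:2d}: num_key_value_heads={attn.num_key_value_heads}, "
              f"k_proj weight shape=({out_features}, {in_features})")

# With the config above (head_dim = 4096 / 32 = 128), layer 0 prints 4 KV heads and a
# (512, 4096) k_proj weight, while a middle layer with 1 KV head prints (128, 4096).
summarize_kv_heads(model)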
sample.txt ADDED
@@ -0,0 +1,363 @@
+ The network latency is one of the more crucial aspects of deploying a deep network into a production environment. Most real-world applications require blazingly fast inference time, varying anywhere from a few milliseconds to one second. But the task of correctly and meaningfully measuring the inference time, or latency, of a neural network requires profound understanding. Even experienced programmers often make common mistakes that lead to inaccurate latency measurements. The impact of these mistakes has the potential to trigger bad decisions and unnecessary expenditures.
+ 
+ In this post, we review some of the main issues that should be addressed to measure latency time correctly. We review the main processes that make GPU execution unique, including asynchronous execution and GPU warm up. We then share code samples for measuring time correctly on a GPU. Finally, we review some of the common mistakes people make when quantifying inference time on GPUs.
+ Asynchronous execution
+ 
+ We begin by discussing the GPU execution mechanism. In multithreaded or multi-device programming, two blocks of code that are independent can be executed in parallel; this means that the second block may be executed before the first is finished. This process is referred to as asynchronous execution. In the deep learning context, we often use this execution because the GPU operations are asynchronous by default. More specifically, when calling a function using a GPU, the operations are enqueued to the specific device, but not necessarily to other devices. This allows us to execute computations in parallel on the CPU or another GPU.
+ 
+ Figure 1. Asynchronous execution. Left: Synchronous process where process A waits for a response from process B before it can continue working. Right: Asynchronous process A continues working without waiting for process B to finish.
+ 
+ Asynchronous execution offers huge advantages for deep learning, such as the ability to decrease run-time by a large factor. For example, at the inference of multiple batches, the second batch can be preprocessed on the CPU while the first batch is fed forward through the network on the GPU. Clearly, it would be beneficial to use asynchronism whenever possible at inference time.
+ 
+ The effect of asynchronous execution is invisible to the user; but, when it comes to time measurements, it can be the cause of many headaches. When you calculate time with the “time” library in Python, the measurements are performed on the CPU device. Due to the asynchronous nature of the GPU, the line of code that stops the timing will be executed before the GPU process finishes. As a result, the timing will be inaccurate or irrelevant to the actual inference time. Keeping in mind that we want to use asynchronism, later in this post we explain how to correctly measure time despite the asynchronous processes.
+ GPU warm-up
+ 
+ A modern GPU device can exist in one of several different power states. When the GPU is not being used for any purpose and persistence mode (i.e., which keeps the GPU on) is not enabled, the GPU will automatically reduce its power state to a very low level, sometimes even a complete shutdown. In lower power state, the GPU shuts down different pieces of hardware, including memory subsystems, internal subsystems, or even compute cores and caches.
+ 
+ The invocation of any program that attempts to interact with the GPU will cause the driver to load and/or initialize the GPU. This driver load behavior is noteworthy. Applications that trigger GPU initialization can incur up to 3 seconds of latency, due to the scrubbing behavior of the error correcting code. For instance, if we measure time for a network that takes 10 milliseconds for one example, running over 1000 examples may result in most of our running time being wasted on initializing the GPU. Naturally, we don’t want to measure such side effects because the timing is not accurate. Nor does it reflect a production environment where usually the GPU is already initialized or working in persistence mode.
+ 
+ Since we want to enable the GPU power-saving mode whenever possible, let’s look at how to overcome the initialization of the GPU while measuring time.
+ The correct way to measure inference time
+ 
+ The PyTorch code snippet below shows how to measure time correctly. Here we use Efficient-net-b0 but you can use any other network. In the code, we deal with the two caveats described above. Before we make any time measurements, we run some dummy examples through the network to do a ‘GPU warm-up.’ This will automatically initialize the GPU and prevent it from going into power-saving mode when we measure time. Next, we use tr.cuda.event to measure time on the GPU. It is crucial here to use torch.cuda.synchronize(). This line of code performs synchronization between the host and device (i.e., GPU and CPU), so the time recording takes place only after the process running on the GPU is finished. This overcomes the issue of unsynchronized execution.
+ 
+ model = EfficientNet.from_pretrained('efficientnet-b0')
+ device = torch.device("cuda")
+ model.to(device)
+ dummy_input = torch.randn(1, 3,224,224, dtype=torch.float).to(device)
+ 
+ # INIT LOGGERS
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+ repetitions = 300
+ timings=np.zeros((repetitions,1))
+ #GPU-WARM-UP
+ for _ in range(10):
+     _ = model(dummy_input)
+ # MEASURE PERFORMANCE
+ with torch.no_grad():
+     for rep in range(repetitions):
+         starter.record()
+         _ = model(dummy_input)
+         ender.record()
+         # WAIT FOR GPU SYNC
+         torch.cuda.synchronize()
+         curr_time = starter.elapsed_time(ender)
+         timings[rep] = curr_time
+ 
+ mean_syn = np.sum(timings) / repetitions
+ std_syn = np.std(timings)
+ print(mean_syn)
+ 
+ Common mistakes when measuring time
+ 
+ When we measure the latency of a network, our goal is to measure only the feed-forward of the network, not more and not less. Often, even experts will make certain common mistakes in their measurements. Here are some of them, along with their consequences:
+ 
+ 1. Transferring data between the host and the device. The point of view of this post is to measure only the inference time of a neural network. Under this point of view, one of the most common mistakes involves the transfer of data between the CPU and GPU while taking time measurements. This is usually done unintentionally when a tensor is created on the CPU and inference is then performed on the GPU. This memory allocation takes a considerable amount of time, which subsequently enlarges the time for inference. The effect of this mistake over the mean and variance of the measurements can be seen below:
+ 
+ Figure 2: Impact of transferring between CPU and GPU while measuring time. Left: The correct measurements for mean and standard deviation (bar). Right: The mean and standard deviation when the input tensor is transferred between CPU and GPU at each call for the network. The X-axis is the timing method and the Y-axis is the time in milliseconds.
+ 
+ 2. Not using GPU warm-up. As mentioned above, the first run on the GPU prompts its initialization. GPU initialization can take up to 3 seconds, which makes a huge difference when the timing is in terms of milliseconds.
+ 
+ 3. Using standard CPU timing. The most common mistake made is to measure time without synchronization. Even experienced programmers have been known to use the following piece of code.
+ 
+ s = time.time()
+ _ = model(dummy_input)
+ curr_time = (time.time()-s )*1000
+ 
+ This of course completely ignores the asynchronous execution mentioned earlier and hence outputs incorrect times. The impact of this mistake on the mean and variance of the measurements are shown below:
+ 
+ Figure 3: Impact of measuring time on CPU. Left: The correct measurements for mean and standard deviation (bar). Right: The mean and standard deviation when processes are not synchronized. The X-axis is the timing method and the Y-axis is the time in milliseconds.
+ 
+ 4. Taking one sample. Like many processes in computer science, feed forward of the neural network has a (small) stochastic component. The variance of the run-time can be significant, especially when measuring a low latency network. To this end, it is essential to run the network over several examples and then average the results (300 examples can be a good number). A common mistake is to use one sample and refer to it as the run-time. This, of course, won’t represent the true run-time.
+ Measuring Throughput
+ 
+ The throughput of a neural network is defined as the maximal number of input instances the network can process in a unit of time (e.g., a second). Unlike latency, which involves the processing of a single instance, to achieve maximal throughput we would like to process in parallel as many instances as possible. The effective parallelism is obviously data-, model-, and device-dependent. Thus, to correctly measure throughput we perform the following two steps: (1) we estimate the optimal batch size that allows for maximum parallelism; and (2), given this optimal batch size, we measure the number of instances the network can process in one second.
+ To find the optimal batch size, a good rule of thumb is to reach the memory limit of our GPU for the given data type. This size of course depends on the hardware type and the size of the network. The quickest way to find this maximal batch size is by performing a binary search. When time is of no concern a simple sequential search is sufficient. To this end, using a for loop we increase the batch size by one until a runtime error is raised; this identifies the largest batch size the GPU can process, for our neural network model and the input data it processes.
+ After finding the optimal batch size, we calculate the actual throughput. To this end, we would like to process many batches (100 batches will be a sufficient number) and then use the following formula:
+ 
+ (number of batches X batch size)/(total time in seconds)
+ 
+ This formula gives the number of examples our network can process in one second. The code below provides a simple way to perform the above calculation (given the optimal batch size):
+ 
+ model = EfficientNet.from_pretrained('efficientnet-b0')
+ device = torch.device("cuda")
+ model.to(device)
+ dummy_input = torch.randn(optimal_batch_size, 3,224,224, dtype=torch.float).to(device)
+ 
+ repetitions=100
+ total_time = 0
+ with torch.no_grad():
+     for rep in range(repetitions):
+         starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+         starter.record()
+         _ = model(dummy_input)
+         ender.record()
+         torch.cuda.synchronize()
+         curr_time = starter.elapsed_time(ender)/1000
+         total_time += curr_time
+ Throughput = (repetitions*optimal_batch_size)/total_time
+ print('Final Throughput:',Throughput)
+ 
+ Conclusion
+ 
+ To accurately measure inference time of neural networks is not as trivial as it sounds. We detailed several issues that deep learning practitioners should be aware of, such as asynchronous execution and GPU power-saving modes. The PyTorch code presented here demonstrates how to correctly measure the timing in neural networks, despite the aforementioned caveats. Finally, we mentioned some common mistakes that cause people to measure inference time incorrectly. In future posts, we will dive even deeper into this topic and explain existing deep learning profilers which enable us to achieve even more accurate time measurements of networks. If you are interested in how to reduce the latency of the network without compromising its accuracy you are invited to book a demo with one of our experts, or read more about this topic in Deci's Guide to Inference Acceleration.
+ To accurately measure inference time of neural networks is not as trivial as it sounds. We detailed several issues that deep learning practitioners should be aware of, such as asynchronous execution and GPU power-saving modes. The PyTorch code presented here demonstrates how to correctly measure the timing in neural networks, despite the aforementioned caveats. Finally, we mentioned some common mistakes that cause people to measure inference time incorrectly. In future posts, we will dive even deeper into this topic and explain existing deep learning profilers which enable us to achieve even more accurate time measurements of networks. If you are interested in how to reduce the latency of the network without compromising its accuracy you are invited to book a demo with one of our experts, or read more about this topic in Deci’s Guide to Inference Acceleration.
104
+
105
+ The network latency is one of the more crucial aspects of deploying a deep network into a production environment. Most real-world applications require blazingly fast inference time, varying anywhere from a few milliseconds to one second. But the task of correctly and meaningfully measuring the inference time, or latency, of a neural network requires profound understanding. Even experienced programmers often make common mistakes that lead to inaccurate latency measurements. The impact of these mistakes has the potential to trigger bad decisions and unnecessary expenditures.
106
+
107
+ In this post, we review some of the main issues that should be addressed to measure latency time correctly. We review the main processes that make GPU execution unique, including asynchronous execution and GPU warm up. We then share code samples for measuring time correctly on a GPU. Finally, we review some of the common mistakes people make when quantifying inference time on GPUs.
108
+ Asynchronous execution
109
+
110
+ We begin by discussing the GPU execution mechanism. In multithreaded or multi-device programming, two blocks of code that are independent can be executed in parallel; this means that the second block may be executed before the first is finished. This process is referred to as asynchronous execution. In the deep learning context, we often use this execution because the GPU operations are asynchronous by default. More specifically, when calling a function using a GPU, the operations are enqueued to the specific device, but not necessarily to other devices. This allows us to execute computations in parallel on the CPU or another GPU.
111
+
112
+ Figure 1. Asynchronous execution. Left: Synchronous process where process A waits for a response from process B before it can continue working. Right: Asynchronous process A continues working without waiting for process B to finish.
113
+
114
+ Asynchronous execution offers huge advantages for deep learning, such as the ability to decrease run-time by a large factor. For example, at the inference of multiple batches, the second batch can be preprocessed on the CPU while the first batch is fed forward through the network on the GPU. Clearly, it would be beneficial to use asynchronism whenever possible at inference time.
115
+
116
+ The effect of asynchronous execution is invisible to the user; but, when it comes to time measurements, it can be the cause of many headaches. When you calculate time with the ?~@~\time?~@~] library in Python, the measurements are performed on the CPU device. Due to the asynchronous nature of the GPU, the line of code that stops the timing will be executed before the GPU process finishes. As a result, the timing will be inaccurate or irrelevant to the actual inference time. Keeping in mind that we want to use asynchronism, later in this post we explain how to correctly measure time despite the asynchronous processes.
117
+ GPU warm-up
118
+
119
+ A modern GPU device can exist in one of several different power states. When the GPU is not being used for any purpose and persistence mode (i.e., which keeps the GPU on) is not enabled, the GPU will automatically reduce its power state to a very low level, sometimes even a complete shutdown. In lower power state, the GPU shuts down different pieces of hardware, including memory subsystems, internal subsystems, or even compute cores and caches.
120
+
121
+ The invocation of any program that attempts to interact with the GPU will cause the driver to load and/or initialize the GPU. This driver load behavior is noteworthy. Applications that trigger GPU initialization can incur up to 3 seconds of latency, due to the scrubbing behavior of the error correcting code. For instance, if we measure time for a network that takes 10 milliseconds for one example, running over 1000 examples may result in most of our running time being wasted on initializing the GPU. Naturally, we don?~@~Yt want to measure such side effects because the timing is not accurate. Nor does it reflect a production environment where usually the GPU is already initialized or working in persistence mode.
122
+
123
+ Since, we want to enable the GPU power-saving mode whenever possible, let?~@~Ys look at how to overcome the initialization of the GPU while measuring time.
124
+ The correct way to measure inference time
125
+
126
+ The PyTorch code snippet below shows how to measure time correctly. Here we use Efficient-net-b0 but you can use any other network. In the code, we deal with the two caveats described above. Before we make any time measurements, we run some dummy examples through the network to do a ?~@~XGPU warm-up.?~@~Y This will automatically initialize the GPU and prevent it from going into power-saving mode when we measure time. Next, we use tr.cuda.event to measure time on the GPU. It is crucial here to use torch.cuda.synchronize(). This line of code performs synchronization between the host and device (i.e., GPU and CPU), so the time recording takes place only after the process running on the GPU is finished. This overcomes the issue of unsynchronized execution.
127
+
128
+ model = EfficientNet.from_pretrained('efficientnet-b0')
129
+ device = torch.device("cuda")
130
+ model.to(device)
131
+ dummy_input = torch.randn(1, 3,224,224, dtype=torch.float).to(device)
132
+
133
+ # INIT LOGGERS
134
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
135
+ repetitions = 300
136
+ timings=np.zeros((repetitions,1))
137
+ #GPU-WARM-UP
138
+ for _ in range(10):
139
+ _ = model(dummy_input)
140
+ # MEASURE PERFORMANCE
141
+ with torch.no_grad():
142
+ for rep in range(repetitions):
143
+ starter.record()
144
+ _ = model(dummy_input)
145
+ ender.record()
146
+ # WAIT FOR GPU SYNC
147
+ torch.cuda.synchronize()
148
+ curr_time = starter.elapsed_time(ender)
149
+ timings[rep] = curr_time
150
+
151
+ mean_syn = np.sum(timings) / repetitions
152
+ std_syn = np.std(timings)
153
+ print(mean_syn)
154
+
155
+ Common mistakes when measuring time
156
+
157
+ When we measure the latency of a network, our goal is to measure only the feed-forward of the network, not more and not less. Often, even experts will make certain common mistakes in their measurements. Here are some of them, along with their consequences:
158
+
159
+ 1. Transferring data between the host and the device. The point of view of this post is to measure only the inference time of a neural network. Under this point of view, one of the most common mistakes involves the transfer of data between the CPU and GPU while taking time measurements. This is usually done unintentionally when a tensor is created on the CPU and inference is then performed on the GPU. This memory allocation takes a considerable amount of time, which subsequently enlarges the time for inference. The effect of this mistake over the mean and variance of the measurements can be seen below:
160
+
161
+ Figure 2: Impact of transferring between CPU and GPU while measuring time. Left: The correct measurements for mean and standard deviation (bar). Right: The mean and standard deviation when the input tensor is transferred between CPU and GPU at each call for the network. The X-axis is the timing method and the Y-axis is the time in milliseconds.
162
+
163
+ 2. Not using GPU warm-up. As mentioned above, the first run on the GPU prompts its initialization. GPU initialization can take up to 3 seconds, which makes a huge difference when the timing is in terms of milliseconds.
164
+
165
+ 3. Using standard CPU timing. The most common mistake made is to measure time without synchronization. Even experienced programmers have been known to use the following piece of code.
166
+
167
+ s = time.time()
168
+ _ = model(dummy_input)
169
+ curr_time = (time.time()-s )*1000
170
+
171
+ This of course completely ignores the asynchronous execution mentioned earlier and hence outputs incorrect times. The impact of this mistake on the mean and variance of the measurements are shown below:
172
+
173
+ Figure 3: Impact of measuring time on CPU. Left: The correct measurements for mean and standard deviation (bar). Right: The mean and standard deviation when processes are not synchronized. The X-axis is the timing method and the Y-axis is the time in milliseconds.
174
+
175
+ 4. Taking one sample. Like many processes in computer science, feed forward of the neural network has a (small) stochastic component. The variance of the run-time can be significant, especially when measuring a low latency network. To this end, it is essential to run the network over several examples and then average the results (300 examples can be a good number). A common mistake is to use one sample and refer to it as the run-time. This, of course, won?~@~Yt represent the true run-time.
176
+ Measuring Throughput
177
+
178
+ The throughput of a neural network is defined as the maximal number of input instances the network can process in time a unit (e.g., a second). Unlike latency, which involves the processing of a single instance, to achieve maximal throughput we would like to process in parallel as many instances as possible. The effective parallelism is obviously data-, model-, and device-dependent. Thus, to correctly measure throughput we perform the following two steps: (1) we estimate the optimal batch size that allows for maximum parallelism; and (2), given this optimal batch size, we measure the number of instances the network can process in one second.
179
+ To find the optimal batch size, a good rule of thumb is to reach the memory limit of our GPU for the given data type. This size of course depends on the hardware type and the size of the network. The quickest way to find this maximal batch size is by performing a binary search. When time is of no concern a simple sequential search is sufficient. To this end, using a for loop we increase by one the batch size until Run Time error is achieved, this identifies the largest batch size the GPU can process, for our neural network model and the input data it processes.
180
+ After finding the optimal batch size, we calculate the actual throughput. To this end, we would like to process many batches (100 batches will be a sufficient number) and then use the following formula:
181
+
182
+ (number of batches X batch size)/(total time in seconds)
183
+
184
+ This formula gives the number of examples our network can process in one second. The code below provides a simple way to perform the above calculation (given the optimal batch size):
185
+
186
+ model = EfficientNet.from_pretrained('efficientnet-b0')
187
+ device = torch.device("cuda")
188
+ model.to(device)
189
+ dummy_input = torch.randn(optimal_batch_size, 3,224,224, dtype=torch.float).to(device)
190
+
191
+ The network latency is one of the more crucial aspects of deploying a deep network into a production environment. Most real-world applications require blazingly fast inference time, varying anywhere from a few milliseconds to one second. But the task of correctly and meaningfully measuring the inference time, or latency, of a neural network requires profound understanding. Even experienced programmers often make common mistakes that lead to inaccurate latency measurements. The impact of these mistakes has the potential to trigger bad decisions and unnecessary expenditures.
192
+
193
+ In this post, we review some of the main issues that should be addressed to measure latency time correctly. We review the main processes that make GPU execution unique, including asynchronous execution and GPU warm up. We then share code samples for measuring time correctly on a GPU. Finally, we review some of the common mistakes people make when quantifying inference time on GPUs.
194
+ Asynchronous execution
195
+
196
+ We begin by discussing the GPU execution mechanism. In multithreaded or multi-device programming, two blocks of code that are independent can be executed in parallel; this means that the second block may be executed before the first is finished. This process is referred to as asynchronous execution. In the deep learning context, we often use this execution because the GPU operations are asynchronous by default. More specifically, when calling a function using a GPU, the operations are enqueued to the specific device, but not necessarily to other devices. This allows us to execute computations in parallel on the CPU or another GPU.
197
+
198
+ Figure 1. Asynchronous execution. Left: Synchronous process where process A waits for a response from process B before it can continue working. Right: Asynchronous process A continues working without waiting for process B to finish.
199
+
200
+ Asynchronous execution offers huge advantages for deep learning, such as the ability to decrease run-time by a large factor. For example, at the inference of multiple batches, the second batch can be preprocessed on the CPU while the first batch is fed forward through the network on the GPU. Clearly, it would be beneficial to use asynchronism whenever possible at inference time.
201
+
202
+ The effect of asynchronous execution is invisible to the user; but, when it comes to time measurements, it can be the cause of many headaches. When you calculate time with the ?~@~\time?~@~] library in Python, the measurements are performed on the CPU device. Due to the asynchronous nature of the GPU, the line of code that stops the timing will be executed before the GPU process finishes. As a result, the timing will be inaccurate or irrelevant to the actual inference time. Keeping in mind that we want to use asynchronism, later in this post we explain how to correctly measure time despite the asynchronous processes.
203
+ GPU warm-up
204
+
205
+ A modern GPU device can exist in one of several different power states. When the GPU is not being used for any purpose and persistence mode (i.e., which keeps the GPU on) is not enabled, the GPU will automatically reduce its power state to a very low level, sometimes even a complete shutdown. In lower power state, the GPU shuts down different pieces of hardware, including memory subsystems, internal subsystems, or even compute cores and caches.
206
+
207
+ The invocation of any program that attempts to interact with the GPU will cause the driver to load and/or initialize the GPU. This driver load behavior is noteworthy. Applications that trigger GPU initialization can incur up to 3 seconds of latency, due to the scrubbing behavior of the error correcting code. For instance, if we measure time for a network that takes 10 milliseconds for one example, running over 1000 examples may result in most of our running time being wasted on initializing the GPU. Naturally, we don?~@~Yt want to measure such side effects because the timing is not accurate. Nor does it reflect a production environment where usually the GPU is already initialized or working in persistence mode.
208
+
209
+ Since, we want to enable the GPU power-saving mode whenever possible, let?~@~Ys look at how to overcome the initialization of the GPU while measuring time.
210
+ The correct way to measure inference time
211
+
212
+ The PyTorch code snippet below shows how to measure time correctly. Here we use Efficient-net-b0 but you can use any other network. In the code, we deal with the two caveats described above. Before we make any time measurements, we run some dummy examples through the network to do a ?~@~XGPU warm-up.?~@~Y This will automatically initialize the GPU and prevent it from going into power-saving mode when we measure time. Next, we use tr.cuda.event to measure time on the GPU. It is crucial here to use torch.cuda.synchronize(). This line of code performs synchronization between the host and device (i.e., GPU and CPU), so the time recording takes place only after the process running on the GPU is finished. This overcomes the issue of unsynchronized execution.
213
+
214
+ model = EfficientNet.from_pretrained('efficientnet-b0')
215
+ device = torch.device("cuda")
216
+ model.to(device)
217
+ dummy_input = torch.randn(1, 3,224,224, dtype=torch.float).to(device)
218
+
219
+ # INIT LOGGERS
220
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
221
+ repetitions = 300
222
+ timings=np.zeros((repetitions,1))
223
+ #GPU-WARM-UP
224
+ for _ in range(10):
225
+ _ = model(dummy_input)
226
+ # MEASURE PERFORMANCE
227
+ with torch.no_grad():
228
+ for rep in range(repetitions):
229
+ starter.record()
230
+ _ = model(dummy_input)
231
+ ender.record()
232
+ 1,1 Top
233
+
234
+
235
+ The network latency is one of the more crucial aspects of deploying a deep network into a production environment. Most real-world applications require blazingly fast inference time, varying anywhere from a few milliseconds to one second. But the task of correctly and meaningfully measuring the inference time, or latency, of a neural network requires profound understanding. Even experienced programmers often make common mistakes that lead to inaccurate latency measurements. The impact of these mistakes has the potential to trigger bad decisions and unnecessary expenditures.
236
+
237
+ In this post, we review some of the main issues that should be addressed to measure latency time correctly. We review the main processes that make GPU execution unique, including asynchronous execution and GPU warm up. We then share code samples for measuring time correctly on a GPU. Finally, we review some of the common mistakes people make when quantifying inference time on GPUs.
238
+ Asynchronous execution
239
+
240
+ We begin by discussing the GPU execution mechanism. In multithreaded or multi-device programming, two blocks of code that are independent can be executed in parallel; this means that the second block may be executed before the first is finished. This process is referred to as asynchronous execution. In the deep learning context, we often use this execution because the GPU operations are asynchronous by default. More specifically, when calling a function using a GPU, the operations are enqueued to the specific device, but not necessarily to other devices. This allows us to execute computations in parallel on the CPU or another GPU.
241
+
242
+ Figure 1. Asynchronous execution. Left: Synchronous process where process A waits for a response from process B before it can continue working. Right: Asynchronous process A continues working without waiting for process B to finish.
243
+
244
+ Asynchronous execution offers huge advantages for deep learning, such as the ability to decrease run-time by a large factor. For example, at the inference of multiple batches, the second batch can be preprocessed on the CPU while the first batch is fed forward through the network on the GPU. Clearly, it would be beneficial to use asynchronism whenever possible at inference time.
245
+
246
+ The effect of asynchronous execution is invisible to the user; but, when it comes to time measurements, it can be the cause of many headaches. When you calculate time with the ?~@~\time?~@~] library in Python, the measurements are performed on the CPU device. Due to the asynchronous nature of the GPU, the line of code that stops the timing will be executed before the GPU process finishes. As a result, the timing will be inaccurate or irrelevant to the actual inference time. Keeping in mind that we want to use asynchronism, later in this post we explain how to correctly measure time despite the asynchronous processes.
247
+ GPU warm-up
248
+
249
+ A modern GPU device can exist in one of several different power states. When the GPU is not being used for any purpose and persistence mode (i.e., which keeps the GPU on) is not enabled, the GPU will automatically reduce its power state to a very low level, sometimes even a complete shutdown. In lower power state, the GPU shuts down different pieces of hardware, including memory subsystems, internal subsystems, or even compute cores and caches.
250
+
251
+ The invocation of any program that attempts to interact with the GPU will cause the driver to load and/or initialize the GPU. This driver load behavior is noteworthy. Applications that trigger GPU initialization can incur up to 3 seconds of latency, due to the scrubbing behavior of the error correcting code. For instance, if we measure time for a network that takes 10 milliseconds for one example, running over 1000 examples may result in most of our running time being wasted on initializing the GPU. Naturally, we don?~@~Yt want to measure such side effects because the timing is not accurate. Nor does it reflect a production environment where usually the GPU is already initialized or working in persistence mode.
252
+
253
+ Since, we want to enable the GPU power-saving mode whenever possible, let?~@~Ys look at how to overcome the initialization of the GPU while measuring time.
254
+ The correct way to measure inference time
255
+
256
+ The PyTorch code snippet below shows how to measure time correctly. Here we use Efficient-net-b0 but you can use any other network. In the code, we deal with the two caveats described above. Before we make any time measurements, we run some dummy examples through the network to do a ?~@~XGPU warm-up.?~@~Y This will automatically initialize the GPU and prevent it from going into power-saving mode when we measure time. Next, we use tr.cuda.event to measure time on the GPU. It is crucial here to use torch.cuda.synchronize(). This line of code performs synchronization between the host and device (i.e., GPU and CPU), so the time recording takes place only after the process running on the GPU is finished. This overcomes the issue of unsynchronized execution.
257
+
258
+ model = EfficientNet.from_pretrained('efficientnet-b0')
259
+ device = torch.device("cuda")
260
+ model.to(device)
261
+ dummy_input = torch.randn(1, 3,224,224, dtype=torch.float).to(device)
262
+
263
+ # INIT LOGGERS
264
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
265
+ repetitions = 300
266
+ timings = np.zeros((repetitions, 1))
267
+ # GPU WARM-UP
+ for _ in range(10):
+     _ = model(dummy_input)
270
+ # MEASURE PERFORMANCE
271
+ with torch.no_grad():
+     for rep in range(repetitions):
+         starter.record()
+         _ = model(dummy_input)
+         ender.record()
+         # WAIT FOR GPU SYNC, THEN READ THE EVENT TIMER (milliseconds)
+         torch.cuda.synchronize()
+         curr_time = starter.elapsed_time(ender)
+         timings[rep] = curr_time
+
+ mean_syn = np.sum(timings) / repetitions
+ std_syn = np.std(timings)
+ print(mean_syn, std_syn)
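+ As a sanity check, an equivalent host-clock measurement (a sketch reusing the same model, dummy_input, and repetitions defined above) gives comparable numbers, as long as the device is synchronized on both sides of the measured region:
+
+ import time
+
+ with torch.no_grad():
+     torch.cuda.synchronize()
+     start = time.perf_counter()
+     for _ in range(repetitions):
+         _ = model(dummy_input)
+     torch.cuda.synchronize()
+ total_ms = (time.perf_counter() - start) * 1000
+ print(f"mean latency: {total_ms / repetitions:.2f} ms")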
277
+
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "legacy": false,
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "pad_token": null,
24
+ "padding_side": "right",
25
+ "sp_model_kwargs": {},
26
+ "tokenizer_class": "LlamaTokenizer",
27
+ "unk_token": {
28
+ "__type": "AddedToken",
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ }
35
+ }