Manu101 committed on
Commit dc0d378
1 Parent(s): 895e50f

Upload 4 files

Files changed (4)
  1. app.py +90 -0
  2. model_00350.pt +3 -0
  3. model_file.py +354 -0
  4. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,90 @@
+ import torch
+ from transformers import GPT2Tokenizer
+ import gradio as gr
+ import tiktoken
+ import model_file
+ from dataclasses import dataclass
+ import time
+ import os
+ import torch.nn.functional as F
+
+ num_return_sequences = 1
+ max_length = 100
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50304
+     n_layer: int = 12
+     n_head: int = 12
+     n_embd: int = 768
+
+
+ # tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ tokenizer = tiktoken.get_encoding("gpt2")
+
+ device = "cpu"
+ if torch.cuda.is_available():
+     device = "cuda"
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+     device = "mps"
+
+ device = torch.device(device)
+
+ try:
+     model = model_file.get_model().to(device)
+     checkpoint = torch.load(os.path.join(os.path.dirname(__file__), "model_00350.pt"), map_location=device)
+     state_dict = {key.replace("_orig_mod.", ""): value for key, value in checkpoint['model'].items()}  # strip the _orig_mod. prefix added by torch.compile
+     model.load_state_dict(state_dict=state_dict)
+     model.eval()
+     print("Model loaded successfully.")
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     raise e
+
+ examples = [
+     "Who are you?",
+     "Write a short Shakespearean poem.",
+     "Tell me a joke.",
+     "What is the meaning of life?",
+ ]
+
+
+ def chat_fn(message, history):
+     # Tokenize
+     print(f"message: {message}")
+     tokens = tokenizer.encode(message)
+     tokens = torch.tensor(tokens, dtype=torch.int32)
+     tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
+     x = tokens.to(device)
+     while x.size(1) < max_length:
+         # forward pass through the model to get logits
+         with torch.no_grad():
+             logits = model(x)[0]  # (batch_size, T, vocab_size)
+             logits = logits[:, -1, :]  # last-position logits, (B, vocab_size)
+
+             # calculate probabilities
+             probs = F.softmax(logits, dim=-1)
+
+             # top-k sampling; the HF default k is 50
+             # topk_probs is (B, 50), topk_indices is (B, 50) too
+             topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+
+             # sample a token from the top-k distribution
+             ix = torch.multinomial(input=topk_probs, num_samples=1)  # (B, 1)
+
+             # gather the corresponding vocabulary indices
+             xcol = torch.gather(input=topk_indices, dim=-1, index=ix)
+             # append to the sequence
+             x = torch.cat([x, xcol], dim=1)
+
+     for i in range(num_return_sequences):
+         tokens = x[i, :max_length].tolist()
+         decoded = tokenizer.decode(tokens)
+
+         yield decoded
+
+
+ gr.ChatInterface(chat_fn, examples=examples).launch()
+ # interface.launch()
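The chat handler above is a plain Python generator, so it can also be smoke-tested without launching the Gradio UI. A minimal sketch (not part of the commit), assuming the module-level names defined in app.py (tokenizer, model, chat_fn) have already been set up:

    # hypothetical quick test: drive chat_fn directly instead of gr.ChatInterface(...).launch()
    for reply in chat_fn("Tell me a joke.", history=[]):
        print(reply)  # one decoded continuation per return sequence (a single string here)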
model_00350.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1e55d3d3c97b288aeb1659b766d405b8fcee33051dac8281869a0063c77a3d0
+ size 548296450
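The entry above is a Git LFS pointer rather than the weights themselves; the roughly 548 MB checkpoint is fetched when the repository is cloned with LFS enabled. A minimal inspection sketch (not part of the commit), assuming the checkpoint layout written by the commented-out training loop in model_file.py:

    import torch

    ckpt = torch.load("model_00350.pt", map_location="cpu")
    print(list(ckpt.keys()))          # expected: ['model', 'config', 'step']
    print(ckpt["step"])               # training step at which the weights were saved
    # keys in ckpt['model'] may carry an '_orig_mod.' prefix from torch.compile;
    # app.py strips it before calling load_state_dict
    print(next(iter(ckpt["model"])))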
model_file.py ADDED
@@ -0,0 +1,354 @@
+ import torch
+ import tiktoken
+ import inspect
+ from dataclasses import dataclass
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024  # max sequence length
+     vocab_size: int = 50304  # 50257 (50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> special token), padded up
+     n_layer: int = 12  # number of layers
+     n_head: int = 12  # total number of attention heads
+     n_embd: int = 768  # embedding dimension
+
+
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         n_head = config.n_head
+         n_embd = config.n_embd
+
+         assert n_embd % n_head == 0
+
+         # query, key, value projections, all combined
+         self.c_attn = nn.Linear(n_embd, 3 * n_embd)
+
+         # output projection, after `v` is already multiplied with attention_scores
+         self.c_proj = nn.Linear(n_embd, n_embd)
+
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+
+         block_size = config.block_size
+
+         self.register_buffer('bias', torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))
+
+         self.n_embd = n_embd
+         self.n_head = n_head
+
+     def forward(self, x):
+         B, T, C = x.size()  # batch_size, sequence_len, embedding_dim (n_embd)
+         # total dim = n_head * head_size
+         # e.g. GPT-2 has 12 heads with head_size 64, thus C = 12 * 64 = 768
+
+         qkv = self.c_attn(x)  # combined qkv matrix: B, T, 3 * n_embd (768 * 3 = 2304)
+
+         q, k, v = qkv.split(self.n_embd, dim=2)  # each gets n_embd channels, split along dim=2
+
+         # b, seq, n_embd -> b, seq, n_heads, head_size -> b, n_heads, seq_len, head_size
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         # final shape: B, n_heads, seq_len, head_size
+
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
+         # # print(f"shape of q: {q.shape}... shape of k : {k.shape}")
+
+         # attn = (q @ k.transpose(-2, -1)) / (math.sqrt(k.shape[-1]))
+
+         # # apply masked_fill where mask == 0; remember tril is the lower triangle
+         # attn = attn.masked_fill(mask=self.bias[:, :, :T, :T] == 0, value=float('-inf'))
+
+         # attn = F.softmax(attn, dim=-1)
+
+         # y = attn @ v  # (B, n_heads, T, T) @ (B, n_heads, T, head_size) -> (B, n_heads, T, head_size)
+
+         y = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # flash attention
+
+         # transpose y to merge all heads: B, n_heads, T, head_size -> B, T, n_heads, head_size -> view B, T, C (n_embd = 768)
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+
+         # output projection, B, T, C -> B, T, C
+         y = self.c_proj(y)
+
+         return y
+
+     def generate(self, prompt):
+         if not isinstance(prompt, str) or len(prompt) == 0:
+             return "Say something!"
+
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+         self.gelu = nn.GELU(approximate='tanh')
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         return x
+
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+
+ class GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+
+         self.transformer = nn.ModuleDict(
+             dict(
+                 wte=nn.Embedding(config.vocab_size, config.n_embd),
+                 wpe=nn.Embedding(config.block_size, config.n_embd),
+                 h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                 ln_f=nn.LayerNorm(config.n_embd)
+             ))
+
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # weight sharing between the token embedding and the LM head
+         self.transformer.wte.weight = self.lm_head.weight
+
+         # weight initialization
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             std = 0.02
+             if hasattr(module, 'NANOGPT_SCALE_INIT'):
+                 std *= (2 * self.config.n_layer) ** -0.5
+
+             torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, idx, targets=None):
+         B, T = idx.size()  # batch, seq_len
+
+         # check that the incoming seq_len of idx is within limits
+         assert T <= self.config.block_size, f"Cannot proceed: sequence length {T} exceeds block_size {self.config.block_size}"
+
+         # forward the token and position embeddings
+         # shape (T)
+         pos = torch.arange(0, T, dtype=torch.int32, device=idx.device)
+         pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
+         token_emb = self.transformer.wte(idx)  # token embeddings of shape (Batch, T/seq_len, n_embd)
+
+         x = pos_emb + token_emb
+
+         # now forward through the transformer blocks
+         for block in self.transformer.h:
+             x = block(x)
+
+         # pass through the final layernorm
+         x = self.transformer.ln_f(x)
+
+         # pass through the final LM head
+         logits = self.lm_head(x)  # shape (Batch_size, T, vocab_size)
+
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+
+         return logits, loss
+
+     def configure_optimizers(self, weight_decay, learning_rate, device_type):
+         # start with all of the candidate parameters (that require grad)
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+         # create optim groups. Any parameter that is 2D will be weight decayed, others won't.
+         # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+         decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
+         nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
+         optim_groups = [
+             {'params': decay_params, 'weight_decay': weight_decay},
+             {'params': nodecay_params, 'weight_decay': 0.0}
+         ]
+         num_decay_params = sum(p.numel() for p in decay_params)
+         num_nodecay_params = sum(p.numel() for p in nodecay_params)
+
+         print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+         print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+         # create the AdamW optimizer and use the fused version if it is available
+         fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+         use_fused = fused_available and device_type == "cuda"
+
+         print(f"using fused AdamW: {use_fused}")
+         optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
+         return optimizer
+
+
+ class DataLoaderLite:
+     def __init__(self, B, T, process_rank, num_processes):
+         self.B = B
+         self.T = T
+         self.process_rank = process_rank
+         self.num_processes = num_processes
+
+         with open('input.txt', 'r') as f:
+             text = f.read()
+         enc = tiktoken.get_encoding('gpt2')
+         tokens = enc.encode(text)
+         self.tokens = torch.tensor(tokens)
+         print(f'loaded len : {len(self.tokens)}')
+         # print(f'1 epoch = {len(self.tokens)//(B*T)} batches ')
+         self.current_position = self.B * self.T * self.process_rank
+
+     def next_batch(self):
+         B, T = self.B, self.T
+         buf = self.tokens[self.current_position: self.current_position + (B * T) + 1]
+         y = buf[1:].view(B, T)
+         x = buf[:-1].view(B, T)
+
+         self.current_position += (B * T * self.num_processes)
+
+         if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
+             self.current_position = self.B * self.T * self.process_rank
+         return x, y
+
+
+ def get_model():
+     model = GPT(GPTConfig())
+     return model
+
+ # cuda = torch.cuda.is_available()
+ # torch.set_float32_matmul_precision('high')
+
+ # max_lr = 6e-4
+ # min_lr = 0.1 * max_lr
+ # warmup_steps = 10
+ # max_steps = 5000
+
+ # def get_lr(iteration):
+ #     if iteration < warmup_steps:
+ #         return max_lr * (iteration + 1) / warmup_steps
+ #     if iteration > max_steps:
+ #         return min_lr
+
+ #     decay_ratio = (iteration - warmup_steps) / (max_steps - warmup_steps)
+
+ #     assert 0 <= decay_ratio <= 1
+
+ #     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
+ #     return min_lr + coeff * (max_lr - min_lr)
+
+
+ # model = GPT(GPTConfig()).to(device=device)
+
+ # model = torch.compile(model, mode='default')
+
+ # if ddp:
+ #     print("\n\n====================================\nDDP")
+ #     model = DDP(module=model, device_ids=[ddp_local_rank])
+
+ # raw_model = model.module if ddp else model
+
+ # # optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-8)
+ # optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
+
+
+ # total_batch_size = 524288
+
+ # B = 16
+ # T = 1024
+
+ # assert total_batch_size % (B * T * ddp_world_size) == 0, "just to make sure total batch size is divisible by B*T"
+
+ # grad_accumulation_steps = total_batch_size // (B * T * ddp_world_size)
+
+ # if master_process:
+ #     print(f"\nGradient accumulation steps needed with B: {B} and T: {T} for total batch size: {total_batch_size} = {grad_accumulation_steps}")
+ #     print(f"total params: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
+
+
+ # train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size)
+
+ # # torch.cuda.amp.autocast(enabled=True)
+ # torch.backends.cuda.matmul.allow_tf32 = True
+ # torch.backends.cudnn.allow_tf32 = True
+
+ # log_dir = "logs"
+ # os.makedirs(log_dir, exist_ok=True)
+
+ # start = time.time()
+
+ # for step in range(max_steps):
+ #     t0 = time.time()
+ #     optimizer.zero_grad()
+
+ #     loss_mini = 0.0
+ #     for micro_step in range(grad_accumulation_steps):
+ #         x, y = train_loader.next_batch()
+ #         x, y = x.to(device=device), y.to(device)
+ #         with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ #             logits, loss = model(x, y)
+ #         # if i == 0:
+ #         #     assert logits.dtype == torch.bfloat16
+ #         #     assert loss.dtype == torch.float32
+ #         #     assert model.transformer.wte.weight.dtype == torch.float32
+
+ #         loss = loss / grad_accumulation_steps
+ #         loss_mini += loss.detach()
+ #         if ddp:
+ #             model.require_backward_grad_sync = (micro_step == grad_accumulation_steps - 1)
+ #         loss.backward()
+ #     if ddp:
+ #         dist.all_reduce(loss_mini, op=dist.ReduceOp.AVG)
+ #     if master_process and step % 50 == 0 and step > 100:
+ #         print(f"saving at: {step}")
+ #         checkpoint_path = os.path.join(log_dir, f"model_{step:05d}.pt")
+ #         checkpoint = {
+ #             'model': raw_model.state_dict(),
+ #             'config': raw_model.config,
+ #             'step': step
+ #         }
+ #         torch.save(checkpoint, checkpoint_path)
+ #     # grad clip
+ #     norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+
+ #     lr = get_lr(step)
+ #     for param_group in optimizer.param_groups:
+ #         param_group['lr'] = lr
+
+ #     optimizer.step()
+ #     torch.cuda.synchronize()
+
+ #     t1 = time.time()
+ #     dt = (t1 - t0)
+ #     tokens_per_sec = (train_loader.B * train_loader.T * grad_accumulation_steps * ddp_world_size) / (dt)
+ #     if master_process:
+ #         # print happens via CPU, hence wait (synchronize GPU)
+ #         print(f'step : {step+1} | loss: {loss_mini.item()} | lr: {lr:.7f} | dt: {dt * 1000:.2f} ms | tokens/sec: {tokens_per_sec:_.6f} | norm: {norm:.2f}')
+
+
+ # end = time.time()
+ # print("final loss: ", loss * grad_accumulation_steps)
+ # print(f"total time: {end - start} seconds")
+ # torch.save(model.state_dict(), "5k-run-new-DDP.pt")
+
+ # if ddp:
+ #     destroy_process_group()
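For orientation (not part of the commit): get_model() builds a randomly initialized GPT(GPTConfig()), and GPT.forward returns a (logits, loss) pair, with loss left as None when no targets are passed. A minimal sketch of a single forward pass, assuming the gpt2 tiktoken encoding used in app.py:

    import torch
    import tiktoken
    import model_file

    model = model_file.get_model().eval()             # GPT(GPTConfig()); weights are random until a checkpoint is loaded
    enc = tiktoken.get_encoding("gpt2")
    idx = torch.tensor([enc.encode("Hello, world")])  # shape (B=1, T)

    with torch.no_grad():
        logits, loss = model(idx)                     # logits: (1, T, 50304); loss is None without targets
    print(logits.shape, loss)

app.py does exactly this, except it first loads the model_00350.pt weights into the module and then samples from the logits with top-k.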
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ tiktoken
+ transformers
+ gradio