Ateeb committed
Commit 00974c5
1 parent: 555360f

Updated version of the your-model-name model and tokenizer.

Files changed (3)
  1. main.py +36 -37
  2. preprocess.py +0 -166
  3. pytorch_model.bin +1 -1
main.py CHANGED
@@ -6,56 +6,55 @@ import torch
 import subprocess
 
 data = Model()
-data.ModelExecution()
-# train_contexts, train_questions, train_answers = data.ArrangeData("livecheckcontainer")
-# val_contexts, val_questions, val_answers = data.ArrangeData("livecheckcontainer")
-# print(train_answers)
-
-# train_answers, train_contexts = data.add_end_idx(train_answers, train_contexts)
-# val_answers, val_contexts = data.add_end_idx(val_answers, val_contexts)
-
-# train_encodings, val_encodings = data.Tokenizer(train_contexts, train_questions, val_contexts, val_questions)
-
-# train_encodings = data.add_token_positions(train_encodings, train_answers)
-# val_encodings = data.add_token_positions(val_encodings, val_answers)
-
-# train_dataset = SquadDataset(train_encodings)
-# val_dataset = SquadDataset(val_encodings)
-
-
-# model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
-
-# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-
-# model.to(device)
-# model.train()
-
-# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
-
-# optim = AdamW(model.parameters(), lr=5e-5)
-
-# for epoch in range(2):
-#     print(epoch)
-#     for batch in train_loader:
-#         optim.zero_grad()
-#         input_ids = batch['input_ids'].to(device)
-#         attention_mask = batch['attention_mask'].to(device)
-#         start_positions = batch['start_positions'].to(device)
-#         end_positions = batch['end_positions'].to(device)
-#         outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
-#         loss = outputs[0]
-#         loss.backward()
-#         optim.step()
-# print("Done")
-# model.eval()
-# model.save_pretrained("./")
-# data.tokenizer.save_pretrained("./")
-
-# subprocess.call(["git", "add", "--all"])
-# subprocess.call(["git", "status"])
-# subprocess.call(["git", "commit", "-m", "First version of the your-model-name model and tokenizer."])
-# subprocess.call(["git", "push"])
+train_contexts, train_questions, train_answers = data.ArrangeData("livecheckcontainer")
+val_contexts, val_questions, val_answers = data.ArrangeData("livecheckcontainer")
+print(train_answers)
+
+train_answers, train_contexts = data.add_end_idx(train_answers, train_contexts)
+val_answers, val_contexts = data.add_end_idx(val_answers, val_contexts)
+
+train_encodings, val_encodings = data.Tokenizer(train_contexts, train_questions, val_contexts, val_questions)
+
+train_encodings = data.add_token_positions(train_encodings, train_answers)
+val_encodings = data.add_token_positions(val_encodings, val_answers)
+
+train_dataset = SquadDataset(train_encodings)
+val_dataset = SquadDataset(val_encodings)
+
+
+model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+model.to(device)
+model.train()
+
+train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+
+optim = AdamW(model.parameters(), lr=5e-5)
+
+for epoch in range(2):
+    print(epoch)
+    for batch in train_loader:
+        optim.zero_grad()
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        start_positions = batch['start_positions'].to(device)
+        end_positions = batch['end_positions'].to(device)
+        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
+        loss = outputs[0]
+        loss.backward()
+        optim.step()
+print("Done")
+model.eval()
+model.save_pretrained("./")
+data.tokenizer.save_pretrained("./")
+
+subprocess.call(["git", "add", "--all"])
+subprocess.call(["git", "status"])
+subprocess.call(["git", "commit", "-m", "First version of the your-model-name model and tokenizer."])
+subprocess.call(["git", "push"])
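The new main.py inlines the pipeline that ModelExecution() used to wrap, so the script now trains and publishes in one pass. Note it assumes Model, SquadDataset, DataLoader, AdamW, and DistilBertForQuestionAnswering are already in scope (the hunk starts at line 6, so the defining imports sit above it). Once the run finishes, save_pretrained("./") leaves the weights and tokenizer in the repo root; below is a minimal inference sketch, not part of the commit, reusing the sample question and context from the commented-out Inference API query in preprocess.py (deleted below):

    from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, pipeline

    # Load the checkpoint that main.py saved with save_pretrained("./").
    model = DistilBertForQuestionAnswering.from_pretrained("./")
    tokenizer = DistilBertTokenizerFast.from_pretrained("./")

    qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
    result = qa(question="What is my name?",
                context="My name is Clara and I live in Berkeley.")
    print(result)  # e.g. {'answer': 'Clara', 'start': 11, 'end': 16, 'score': ...}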
 
preprocess.py DELETED
@@ -1,166 +0,0 @@
-import json
-from os import close
-from pathlib import Path
-from azure.cosmos import CosmosClient, PartitionKey, exceptions
-from transformers import DistilBertTokenizerFast
-import torch
-from transformers import DistilBertForQuestionAnswering, AdamW
-from torch.utils.data import DataLoader
-import subprocess
-
-class Model:
-
-    def __init__(self) -> None:
-        self.endPoint = "https://productdevelopmentstorage.documents.azure.com:443/"
-        self.primaryKey = "nVds9dPOkPuKu8RyWqigA1DIah4SVZtl1DIM0zDuRKd95an04QC0qv9TQIgrdtgluZo7Z0HXACFQgKgOQEAx1g=="
-        self.client = CosmosClient(self.endPoint, self.primaryKey)
-        self.tokenizer = None
-
-    def GetData(self, type):
-        database = self.client.get_database_client("squadstorage")
-        container = database.get_container_client(type)
-        item_list = list(container.read_all_items(max_item_count=10))
-        return item_list
-
-    def ArrangeData(self, type):
-        squad_dict = self.GetData(type)
-
-        contexts = []
-        questions = []
-        answers = []
-
-        for i in squad_dict:
-            contexts.append(i["context"])
-            questions.append(i["question"])
-            answers.append(i["answers"])
-
-        return contexts, questions, answers
-
-    def add_end_idx(self, answers, contexts):
-        for answer, context in zip(answers, contexts):
-            gold_text = answer['text'][0]
-            start_idx = answer['answer_start'][0]
-            end_idx = start_idx + len(gold_text)
-
-            if context[start_idx:end_idx] == gold_text:
-                answer['answer_end'] = end_idx
-            elif context[start_idx-1:end_idx-1] == gold_text:
-                answer['answer_start'] = start_idx - 1
-                answer['answer_end'] = end_idx - 1  # When the gold label is off by one character
-            elif context[start_idx-2:end_idx-2] == gold_text:
-                answer['answer_start'] = start_idx - 2
-                answer['answer_end'] = end_idx - 2  # When the gold label is off by two characters
-
-        return answers, contexts
-
-    def Tokenizer(self, train_contexts, train_questions, val_contexts, val_questions):
-        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
-
-        train_encodings = self.tokenizer(train_contexts, train_questions, truncation=True, padding=True)
-        val_encodings = self.tokenizer(val_contexts, val_questions, truncation=True, padding=True)
-
-        return train_encodings, val_encodings
-
-    def add_token_positions(self, encodings, answers):
-        start_positions = []
-        end_positions = []
-        for i in range(len(answers)):
-            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'][0]))
-            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
-
-            # if start position is None, the answer passage has been truncated
-            if start_positions[-1] is None:
-                start_positions[-1] = self.tokenizer.model_max_length
-            if end_positions[-1] is None:
-                end_positions[-1] = self.tokenizer.model_max_length
-
-        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
-        return encodings
-
-    # train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
-    # val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
-
-    def ModelExecution(self):
-        train_contexts, train_questions, train_answers = self.ArrangeData("livecheckcontainer")
-        val_contexts, val_questions, val_answers = self.ArrangeData("livecheckcontainer")
-        print(train_answers)
-
-        train_answers, train_contexts = self.add_end_idx(train_answers, train_contexts)
-        val_answers, val_contexts = self.add_end_idx(val_answers, val_contexts)
-
-        train_encodings, val_encodings = self.Tokenizer(train_contexts, train_questions, val_contexts, val_questions)
-
-        train_encodings = self.add_token_positions(train_encodings, train_answers)
-        val_encodings = self.add_token_positions(val_encodings, val_answers)
-
-        train_dataset = SquadDataset(train_encodings)
-        val_dataset = SquadDataset(val_encodings)
-
-        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
-
-        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-
-        model.to(device)
-        model.train()
-
-        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
-
-        optim = AdamW(model.parameters(), lr=5e-5)
-
-        for epoch in range(2):
-            print(epoch)
-            for batch in train_loader:
-                optim.zero_grad()
-                input_ids = batch['input_ids'].to(device)
-                attention_mask = batch['attention_mask'].to(device)
-                start_positions = batch['start_positions'].to(device)
-                end_positions = batch['end_positions'].to(device)
-                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
-                loss = outputs[0]
-                loss.backward()
-                optim.step()
-        print("Done")
-        model.eval()
-        model.save_pretrained("./")
-        self.tokenizer.save_pretrained("./")
-
-        subprocess.call(["git", "add", "--all"])
-        subprocess.call(["git", "status"])
-        subprocess.call(["git", "commit", "-m", "First version of the your-model-name model and tokenizer."])
-        subprocess.call(["git", "push"])
-
-
-class SquadDataset(torch.utils.data.Dataset):
-    def __init__(self, encodings):
-        self.encodings = encodings
-
-    def __getitem__(self, idx):
-        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-
-    def __len__(self):
-        return len(self.encodings.input_ids)
-
-# import requests
-# API_URL = "https://api-inference.huggingface.co/models/Ateeb/QA"
-# headers = {"Authorization": "Bearer api_DHnvjPKdjmjkmEYQubgvmIKJqWaNNYljaF"}
-
-# def query(payload):
-#     data = json.dumps(payload)
-#     response = requests.request("POST", API_URL, headers=headers, data=data)
-#     return json.loads(response.content.decode("utf-8"))
-
-# data = query(
-#     {
-#         "inputs": {
-#             "question": "What is my name?",
-#             "context": "My name is Clara and I live in Berkeley.",
-#         }
-#     }
-# )
-# print(data)
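The deleted file closely follows the Hugging Face custom-dataset QA tutorial: GetData/ArrangeData pull SQuAD-style records from Cosmos DB, add_end_idx repairs gold spans that are off by one or two characters, and add_token_positions converts character offsets into token indices with the fast tokenizer's char_to_token (falling back to model_max_length when truncation dropped the answer). A small illustrative sketch of that character-to-token mapping, with the context and answer span taken from the commented-out sample query above as assumptions:

    from transformers import DistilBertTokenizerFast

    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    context = "My name is Clara and I live in Berkeley."
    enc = tokenizer(context, "What is my name?", truncation=True, padding=True)

    # Character span of the gold answer "Clara" is [11, 16).
    start_tok = enc.char_to_token(11)      # token index of the first answer character
    end_tok = enc.char_to_token(16 - 1)    # token index of the last answer character
    print(start_tok, end_tok)              # None here would mean the span was truncated away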
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e93e749fc2915653de7b297c5bae0125876890474e01ad3fd9c196680bd2fa3
+oid sha256:b8cccd7fb599db7567c7ad9506415596698f1de0cdda9f473af71a8edd791450
 size 265498527
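The weights live in Git LFS, so the diff touches only the three-line pointer file: the spec version, the SHA-256 oid of the new blob, and its byte size (unchanged at 265498527, consistent with retrained weights for the same architecture). A quick integrity check of a downloaded pytorch_model.bin against the new pointer, a sketch assuming the file sits in the current directory:

    import hashlib

    # Stream the ~265 MB file in 1 MiB chunks so it never sits in memory at once.
    digest = hashlib.sha256()
    with open("pytorch_model.bin", "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)

    # Should print b8cccd7fb599db7567c7ad9506415596698f1de0cdda9f473af71a8edd791450
    print(digest.hexdigest())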