Rajan Ghimire committed on
Commit
c3cfb6a
1 Parent(s): ab2290c
Test.ipynb ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 31,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch\n",
10
+ "import numpy as np\n",
11
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
12
+ "max_len = 45"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 32,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "tag2idx = {'X': 0,\n",
22
+ " 'YM': 1,\n",
23
+ " '[CLS]': 2,\n",
24
+ " 'DUM': 3,\n",
25
+ " 'VBF': 4,\n",
26
+ " 'RP': 5,\n",
27
+ " 'VBKO': 6,\n",
28
+ " 'CS': 7,\n",
29
+ " 'VBX': 8,\n",
30
+ " 'VBNE': 9,\n",
31
+ " 'CC': 10,\n",
32
+ " 'Unknown': 11,\n",
33
+ " 'PKO': 12,\n",
34
+ " 'JJM': 13,\n",
35
+ " 'PLE': 14,\n",
36
+ " 'VBO': 15,\n",
37
+ " 'HRU': 16,\n",
38
+ " 'YF': 17,\n",
39
+ " 'NN': 18,\n",
40
+ " 'YQ': 19,\n",
41
+ " 'VBI': 20,\n",
42
+ " '[SEP]': 21,\n",
43
+ " 'JJ': 22,\n",
44
+ " 'POP': 23,\n",
45
+ " 'PLAI': 24,\n",
46
+ " 'RBO': 25,\n",
47
+ " 'PP': 26,\n",
48
+ " 'CD': 27,\n",
49
+ " 'NNP': 28}\n",
50
+ "\n",
51
+ "# Mapping index to name\n",
52
+ "tag2name={tag2idx[key] : key for key in tag2idx.keys()}\n"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 33,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "tag_2_nees = {'NN': 'Noun',\n",
62
+ "'JJ': 'Normal/Unmarked Adjective', \n",
63
+ "'NNP': 'Noun Plural',\n",
64
+ "'POP': 'Other Postpositions',\n",
65
+ "'PKO': 'Ko-Postpositions', \n",
66
+ "'YF': 'Sentence-final Punctuation',\n",
67
+ "'CD': 'Cardinal Digits',\n",
68
+ "'PLE':'Postpositions(Le- postpositions)',\n",
69
+ "'VBF': 'Finite Verb', \n",
70
+ "'HRU': 'Plural Marker',\n",
71
+ "'YM': 'Sentence-medial punctuation',\n",
72
+ "'VBX': 'Auxiliary Verb',\n",
73
+ "'VBKO': 'Verb aspectual participle',\n",
74
+ "'CC': 'Coordinating conjunction',\n",
75
+ " 'DUM':'Pronoun unmarked demonstrative',\n",
76
+ " 'VBNE': 'Verb(Prospective participle)',\n",
77
+ " 'VBO':'Other participle verb',\n",
78
+ "'PLAI': 'Postpositions(Lai-Postpositions)',\n",
79
+ " 'RBO': 'Adverb(Other Adverb)',\n",
80
+ " 'VBI': 'Verb Infinitive',\n",
81
+ " 'YQ': 'Quotation Marks',\n",
82
+ " 'PP':'Possessive pronoun',\n",
83
+ " 'JJM': 'Marked adjective',\n",
84
+ " 'CS': 'Subordinating conjunction appearing before/after the clause it subordinates',\n",
85
+ " 'RP': 'Particle'}"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 34,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "# ! pip install transformers\n"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 35,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "from transformers import BertForMaskedLM\n",
104
+ "from transformers import BertTokenizer\n",
105
+ "model = BertForMaskedLM.from_pretrained('./models/bert_out_model/en09',\n",
106
+ " num_labels=len(tag2idx),\n",
107
+ " output_attentions = False,\n",
108
+ " output_hidden_states = False\n",
109
+ " )\n",
110
+ "vocab_file_dir = './models/bert_out_model/en09' \n",
111
+ "tokenizer = BertTokenizer.from_pretrained(vocab_file_dir,\n",
112
+ " strip_accents=False,\n",
113
+ " clean_text=False )"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 36,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "def Get_POS(test_query):\n",
123
+ " tokenized_texts = []\n",
124
+ " temp_token = []\n",
125
+ " # Add [CLS] at the front \n",
126
+ " temp_token.append('[CLS]')\n",
127
+ " token_list = tokenizer.tokenize(test_query)\n",
128
+ " for m,token in enumerate(token_list):\n",
129
+ " temp_token.append(token)\n",
130
+ " # Trim the token to fit the length requirement\n",
131
+ " if len(temp_token) > max_len-1:\n",
132
+ " temp_token= temp_token[:max_len-1]\n",
133
+ " # Add [SEP] at the end\n",
134
+ " temp_token.append('[SEP]')\n",
135
+ " tokenized_texts.append(temp_token)\n",
136
+ " # Convert the text tokens to their vocabulary ids\n",
137
+ " input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],\n",
138
+ " maxlen=max_len, dtype=\"long\", truncating=\"post\", padding=\"post\")\n",
139
+ " # print(input_ids[0])\n",
140
+ " \n",
141
+ " # Attention mask for fine-tuning or prediction: 1 for a real token, 0 for a pad token\n",
142
+ " attention_masks = [[int(i>0) for i in ii] for ii in input_ids]\n",
143
+ " attention_masks[0];\n",
144
+ " segment_ids = [[0] * len(input_id) for input_id in input_ids]\n",
145
+ " segment_ids[0];\n",
146
+ " input_ids = torch.tensor(input_ids)\n",
147
+ " attention_masks = torch.tensor(attention_masks)\n",
148
+ " segment_ids = torch.tensor(segment_ids)\n",
149
+ " # Put the loaded model into evaluation mode\n",
150
+ " model.eval();\n",
151
+ " # Get model predict result\n",
152
+ " with torch.no_grad():\n",
153
+ " outputs = model(input_ids, token_type_ids=None,\n",
154
+ " attention_mask=None,)\n",
155
+ " # For eval mode, the first result of outputs is logits\n",
156
+ " logits = outputs[0]\n",
157
+ " \n",
158
+ " # Convert the logits into a NumPy array of prediction results\n",
159
+ " # The prediction results contain, for each token, the scores of all tags\n",
160
+ " predict_results = logits.detach().cpu().numpy()\n",
161
+ "\n",
162
+ " predict_results.shape\n",
163
+ "\n",
164
+ " from scipy.special import softmax\n",
165
+ "\n",
166
+ " result_arrays_soft = softmax(predict_results[0])\n",
167
+ "\n",
168
+ " result_array = result_arrays_soft\n",
169
+ "\n",
170
+ " # Get the index of the final predicted tag for each token\n",
171
+ " result_list = np.argmax(result_array,axis=-1)\n",
172
+ "\n",
173
+ " \n",
174
+ " x = list()\n",
175
+ " y = list()\n",
176
+ " new_tokens, new_labels = [], []\n",
177
+ " for i, mark in enumerate(attention_masks[0]):\n",
178
+ " if mark>0:\n",
179
+ " print(\"Token:%s\"%(temp_token[i]))\n",
180
+ " x.append(temp_token[i])\n",
181
+ " # print(\"Tag:%s\"%(result_list[i]))\n",
182
+ " print(\"Predict_Tag:%s\"%(tag2name[result_list[i]]))\n",
183
+ " y.append(result_list[i])\n",
184
+ " # print(\"Posibility:%f\"%(result_array[i][result_list[i]]))\n",
185
+ " \n",
186
+ " for token, label_idx in zip(x, y):\n",
187
+ " if token.startswith(\"##\"):\n",
188
+ " new_tokens[-1] = new_tokens[-1] + token[2:]\n",
189
+ " else:\n",
190
+ " new_labels.append(tag2name[label_idx])\n",
191
+ " new_tokens.append(token)\n",
192
+ " \n",
193
+ " # for token, label in zip(new_tokens, new_labels):\n",
194
+ " # print(\"{} ---------------> {}\".format(token, label))\n",
195
+ " \n",
196
+ " \n",
197
+ " tag_names = []\n",
198
+ " for i in new_labels[1:-1]:\n",
199
+ " tag_names.append(\n",
200
+ " tag_2_nees[i]\n",
201
+ " )\n",
202
+ " \n",
203
+ " return new_tokens[1:-1],tag_names"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": 37,
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "Token:[CLS]\n",
216
+ "Predict_Tag:[CLS]\n",
217
+ "Token:हाल\n",
218
+ "Predict_Tag:RBO\n",
219
+ "Token:नेपालका\n",
220
+ "Predict_Tag:JJ\n",
221
+ "Token:विभिन्न\n",
222
+ "Predict_Tag:JJ\n",
223
+ "Token:राजनैतिक\n",
224
+ "Predict_Tag:JJ\n",
225
+ "Token:दलहरूबीच\n",
226
+ "Predict_Tag:JJ\n",
227
+ "Token:एमसीसी\n",
228
+ "Predict_Tag:JJ\n",
229
+ "Token:कार्यक्रमबारे\n",
230
+ "Predict_Tag:NN\n",
231
+ "Token:मतैक्य\n",
232
+ "Predict_Tag:NN\n",
233
+ "Token:##ता\n",
234
+ "Predict_Tag:X\n",
235
+ "Token:हुन\n",
236
+ "Predict_Tag:VBI\n",
237
+ "Token:नसकेका\n",
238
+ "Predict_Tag:VBKO\n",
239
+ "Token:कारण\n",
240
+ "Predict_Tag:NN\n",
241
+ "Token:आन्दोलन\n",
242
+ "Predict_Tag:NN\n",
243
+ "Token:पनि\n",
244
+ "Predict_Tag:RP\n",
245
+ "Token:चर्क\n",
246
+ "Predict_Tag:VBO\n",
247
+ "Token:##िरहेको\n",
248
+ "Predict_Tag:X\n",
249
+ "Token:छ\n",
250
+ "Predict_Tag:VBX\n",
251
+ "Token:।\n",
252
+ "Predict_Tag:YF\n",
253
+ "Token:[SEP]\n",
254
+ "Predict_Tag:[SEP]\n"
255
+ ]
256
+ }
257
+ ],
258
+ "source": [
259
+ "x,y = Get_POS(\"हाल नेपालका विभिन्न राजनैतिक दलहरूबीच एमसीसी कार्यक्रमबारे मतैक्यता हुन नसकेका कारण आन्दोलन पनि चर्किरहेको छ।\")"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 38,
265
+ "metadata": {},
266
+ "outputs": [
267
+ {
268
+ "data": {
269
+ "text/plain": [
270
+ "(['हाल',\n",
271
+ " 'नेपालका',\n",
272
+ " 'विभिन्न',\n",
273
+ " 'राजनैतिक',\n",
274
+ " 'दलहरूबीच',\n",
275
+ " 'एमसीसी',\n",
276
+ " 'कार्यक्रमबारे',\n",
277
+ " 'मतैक्यता',\n",
278
+ " 'हुन',\n",
279
+ " 'नसकेका',\n",
280
+ " 'कारण',\n",
281
+ " 'आन्दोलन',\n",
282
+ " 'पनि',\n",
283
+ " 'चर्किरहेको',\n",
284
+ " 'छ',\n",
285
+ " '।'],\n",
286
+ " ['Adverb(Other Adverb)',\n",
287
+ " 'Normal/Unmarked Adjective',\n",
288
+ " 'Normal/Unmarked Adjective',\n",
289
+ " 'Normal/Unmarked Adjective',\n",
290
+ " 'Normal/Unmarked Adjective',\n",
291
+ " 'Normal/Unmarked Adjective',\n",
292
+ " 'Noun',\n",
293
+ " 'Noun',\n",
294
+ " 'Verb Infinitive',\n",
295
+ " 'Verb aspectual participle',\n",
296
+ " 'Noun',\n",
297
+ " 'Noun',\n",
298
+ " 'Particle',\n",
299
+ " 'Other participle verb',\n",
300
+ " 'Auxiliary Verb',\n",
301
+ " 'Sentence-final Punctuation'])"
302
+ ]
303
+ },
304
+ "execution_count": 38,
305
+ "metadata": {},
306
+ "output_type": "execute_result"
307
+ }
308
+ ],
309
+ "source": [
310
+ "x,y"
311
+ ]
312
+ }
313
+ ],
314
+ "metadata": {
315
+ "interpreter": {
316
+ "hash": "ca894e04cc6fd3e8c60826e0ca22793858ad83aa785622f3d49ff6f88f1ccbf8"
317
+ },
318
+ "kernelspec": {
319
+ "display_name": "Python 3.7.0 64-bit ('pt3.7': conda)",
320
+ "name": "python3"
321
+ },
322
+ "language_info": {
323
+ "codemirror_mode": {
324
+ "name": "ipython",
325
+ "version": 3
326
+ },
327
+ "file_extension": ".py",
328
+ "mimetype": "text/x-python",
329
+ "name": "python",
330
+ "nbconvert_exporter": "python",
331
+ "pygments_lexer": "ipython3",
332
+ "version": "3.7.5"
333
+ },
334
+ "orig_nbformat": 4
335
+ },
336
+ "nbformat": 4,
337
+ "nbformat_minor": 2
338
+ }
models/bert_out_model/en09/config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../input/nepalibert",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0",
14
+ "1": "LABEL_1",
15
+ "2": "LABEL_2",
16
+ "3": "LABEL_3",
17
+ "4": "LABEL_4",
18
+ "5": "LABEL_5",
19
+ "6": "LABEL_6",
20
+ "7": "LABEL_7",
21
+ "8": "LABEL_8",
22
+ "9": "LABEL_9",
23
+ "10": "LABEL_10",
24
+ "11": "LABEL_11",
25
+ "12": "LABEL_12",
26
+ "13": "LABEL_13",
27
+ "14": "LABEL_14",
28
+ "15": "LABEL_15",
29
+ "16": "LABEL_16",
30
+ "17": "LABEL_17",
31
+ "18": "LABEL_18",
32
+ "19": "LABEL_19",
33
+ "20": "LABEL_20",
34
+ "21": "LABEL_21",
35
+ "22": "LABEL_22",
36
+ "23": "LABEL_23",
37
+ "24": "LABEL_24",
38
+ "25": "LABEL_25",
39
+ "26": "LABEL_26",
40
+ "27": "LABEL_27",
41
+ "28": "LABEL_28"
42
+ },
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "label2id": {
46
+ "LABEL_0": 0,
47
+ "LABEL_1": 1,
48
+ "LABEL_10": 10,
49
+ "LABEL_11": 11,
50
+ "LABEL_12": 12,
51
+ "LABEL_13": 13,
52
+ "LABEL_14": 14,
53
+ "LABEL_15": 15,
54
+ "LABEL_16": 16,
55
+ "LABEL_17": 17,
56
+ "LABEL_18": 18,
57
+ "LABEL_19": 19,
58
+ "LABEL_2": 2,
59
+ "LABEL_20": 20,
60
+ "LABEL_21": 21,
61
+ "LABEL_22": 22,
62
+ "LABEL_23": 23,
63
+ "LABEL_24": 24,
64
+ "LABEL_25": 25,
65
+ "LABEL_26": 26,
66
+ "LABEL_27": 27,
67
+ "LABEL_28": 28,
68
+ "LABEL_3": 3,
69
+ "LABEL_4": 4,
70
+ "LABEL_5": 5,
71
+ "LABEL_6": 6,
72
+ "LABEL_7": 7,
73
+ "LABEL_8": 8,
74
+ "LABEL_9": 9
75
+ },
76
+ "layer_norm_eps": 1e-12,
77
+ "max_position_embeddings": 512,
78
+ "model_type": "bert",
79
+ "num_attention_heads": 12,
80
+ "num_hidden_layers": 6,
81
+ "pad_token_id": 0,
82
+ "position_embedding_type": "absolute",
83
+ "transformers_version": "4.15.0",
84
+ "type_vocab_size": 2,
85
+ "use_cache": true,
86
+ "vocab_size": 50000
87
+ }
models/bert_out_model/en09/eval_results.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ f1 score:
2
+ 0.9330855682813086
3
+
4
+ Accuracy score:
5
+ 0.9458905242268894
6
+
7
+ precision recall f1-score support
8
+
9
+ BF 0.9538 0.9253 0.9393 937
10
+ BI 0.9129 0.9402 0.9263 468
11
+ BKO 0.9785 0.9287 0.9529 785
12
+ BNE 0.9429 0.9319 0.9374 514
13
+ BO 0.8293 0.8872 0.8573 931
14
+ BX 0.9570 0.9547 0.9558 816
15
+ C 0.9943 0.9914 0.9929 701
16
+ D 0.9007 0.8772 0.8888 920
17
+ F 0.9963 0.9945 0.9954 1083
18
+ J 0.8835 0.8817 0.8826 2520
19
+ JM 0.8914 0.8914 0.8914 221
20
+ KO 0.9942 0.9976 0.9959 2070
21
+ LAI 0.9980 0.9980 0.9980 496
22
+ LE 0.9972 0.9945 0.9959 1088
23
+ M 0.9265 0.8164 0.8680 757
24
+ N 0.9304 0.9202 0.9253 6655
25
+ NP 0.8689 0.9005 0.8844 1648
26
+ OP 0.9880 0.9816 0.9848 2015
27
+ P 0.9833 0.9883 0.9858 597
28
+ Q 0.9513 0.8729 0.9104 425
29
+ RU 0.9977 0.9953 0.9965 859
30
+ S 0.9482 0.9337 0.9409 196
31
+ UM 0.9709 0.9799 0.9754 647
32
+ _ 0.0000 0.0000 0.0000 0
33
+ nknown 0.8970 0.8172 0.8552 629
34
+
35
+ micro avg 0.9329 0.9333 0.9331 27978
36
+ macro avg 0.9077 0.8960 0.9015 27978
37
+ weighted avg 0.9413 0.9333 0.9370 27978
models/bert_out_model/en09/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:779b44ae9309548a82a0f7631bde4e740cdeaf1c7117d200db157da46222c6ef
3
+ size 327908843
models/bert_out_model/en09/vocab.txt ADDED
The diff for this file is too large to render. See raw diff