KoichiYasuoka committed on
Commit
82ddd5c
1 Parent(s): 5b035a5

release after refining the tokenizer

Files changed (8)
  1. README.md +30 -0
  2. config.json +379 -0
  3. maker.py +128 -0
  4. pytorch_model.bin +3 -0
  5. special_tokens_map.json +1249 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +0 -0
  8. ud.py +142 -0
README.md ADDED
@@ -0,0 +1,30 @@
1
+ ---
2
+ language:
3
+ - "ja"
4
+ tags:
5
+ - "japanese"
6
+ - "pos"
7
+ - "dependency-parsing"
8
+ base_model: goldfish-models/jpn_jpan_1000mb
9
+ datasets:
10
+ - "universal_dependencies"
11
+ license: "apache-2.0"
12
+ pipeline_tag: "token-classification"
13
+ widget:
14
+ - text: "全学年にわたって小学校の国語の教科書に挿し絵が用いられている"
15
+ ---
16
+
17
+ # goldfish-gpt2-japanese-1000mb-ud-causal
18
+
19
+ ## Model Description
20
+
21
+ This is a GPT-2 model for POS-tagging and dependency-parsing, derived from [jpn_jpan_1000mb](https://huggingface.co/goldfish-models/jpn_jpan_1000mb) and refined with [UD_Japanese-GSDLUW](https://github.com/UniversalDependencies/UD_Japanese-GSDLUW).
22
+
23
+ ## How to Use
24
+
25
+ ```py
26
+ from transformers import pipeline
27
+ nlp=pipeline("universal-dependencies","KoichiYasuoka/goldfish-gpt2-japanese-1000mb-ud-causal",trust_remote_code=True)
28
+ print(nlp("全学年にわたって小学校の国語の教科書に挿し絵が用いられている"))
29
+ ```
30
+
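Note: as implemented in ud.py below, the `universal-dependencies` pipeline returns its parse as a CoNLL-U formatted string (a `# text = …` comment followed by ten tab-separated columns per word), so the example above can be written straight to a `.conllu` file. A minimal sketch — the output filename is illustrative:

```
from transformers import pipeline
nlp=pipeline("universal-dependencies","KoichiYasuoka/goldfish-gpt2-japanese-1000mb-ud-causal",trust_remote_code=True)
with open("parsed.conllu","w",encoding="utf-8") as w:
  w.write(nlp("全学年にわたって小学校の国語の教科書に挿し絵が用いられている"))
```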
config.json ADDED
@@ -0,0 +1,379 @@
1
+ {
2
+ "activation_function": "gelu",
3
+ "architectures": [
4
+ "GPT2ForTokenClassification"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50000,
8
+ "custom_pipelines": {
9
+ "upos": {
10
+ "impl": "ud.BellmanFordTokenClassificationPipeline",
11
+ "pt": "AutoModelForTokenClassification"
12
+ },
13
+ "universal-dependencies": {
14
+ "impl": "ud.UniversalDependenciesCausalPipeline",
15
+ "pt": "AutoModelForTokenClassification"
16
+ }
17
+ },
18
+ "embd_pdrop": 0.1,
19
+ "eos_token_id": 50001,
20
+ "id2label": {
21
+ "0": "ADJ",
22
+ "1": "ADJ|l-acl",
23
+ "2": "ADJ|l-advcl",
24
+ "3": "ADJ|l-amod",
25
+ "4": "ADJ|l-ccomp",
26
+ "5": "ADJ|l-csubj",
27
+ "6": "ADJ|l-csubj:outer",
28
+ "7": "ADJ|l-nmod",
29
+ "8": "ADJ|l-nsubj",
30
+ "9": "ADJ|l-obj",
31
+ "10": "ADJ|l-obl",
32
+ "11": "ADJ|r-acl",
33
+ "12": "ADJ|r-amod",
34
+ "13": "ADJ|r-dep",
35
+ "14": "ADJ|root",
36
+ "15": "ADP",
37
+ "16": "ADP|l-case",
38
+ "17": "ADP|r-case",
39
+ "18": "ADP|r-fixed",
40
+ "19": "ADV",
41
+ "20": "ADV|l-advcl",
42
+ "21": "ADV|l-advmod",
43
+ "22": "ADV|l-obj",
44
+ "23": "ADV|r-dep",
45
+ "24": "ADV|root",
46
+ "25": "AUX",
47
+ "26": "AUX|Polarity=Neg",
48
+ "27": "AUX|Polarity=Neg|r-aux",
49
+ "28": "AUX|Polarity=Neg|r-fixed",
50
+ "29": "AUX|r-aux",
51
+ "30": "AUX|r-cop",
52
+ "31": "AUX|r-fixed",
53
+ "32": "AUX|root",
54
+ "33": "B-ADJ",
55
+ "34": "B-ADP",
56
+ "35": "B-ADV",
57
+ "36": "B-AUX",
58
+ "37": "B-AUX|Polarity=Neg",
59
+ "38": "B-CCONJ",
60
+ "39": "B-DET",
61
+ "40": "B-INTJ",
62
+ "41": "B-NOUN",
63
+ "42": "B-NOUN|Polarity=Neg",
64
+ "43": "B-NUM",
65
+ "44": "B-PART",
66
+ "45": "B-PRON",
67
+ "46": "B-PROPN",
68
+ "47": "B-PUNCT",
69
+ "48": "B-SCONJ",
70
+ "49": "B-SYM",
71
+ "50": "B-VERB",
72
+ "51": "B-X",
73
+ "52": "CCONJ",
74
+ "53": "CCONJ|l-cc",
75
+ "54": "CCONJ|r-cc",
76
+ "55": "DET",
77
+ "56": "DET|l-det",
78
+ "57": "I-ADJ",
79
+ "58": "I-ADP",
80
+ "59": "I-ADV",
81
+ "60": "I-AUX",
82
+ "61": "I-AUX|Polarity=Neg",
83
+ "62": "I-CCONJ",
84
+ "63": "I-DET",
85
+ "64": "I-INTJ",
86
+ "65": "I-NOUN",
87
+ "66": "I-NOUN|Polarity=Neg",
88
+ "67": "I-NUM",
89
+ "68": "I-PART",
90
+ "69": "I-PRON",
91
+ "70": "I-PROPN",
92
+ "71": "I-PUNCT",
93
+ "72": "I-SCONJ",
94
+ "73": "I-SYM",
95
+ "74": "I-VERB",
96
+ "75": "I-X",
97
+ "76": "INTJ",
98
+ "77": "INTJ|l-discourse",
99
+ "78": "INTJ|r-discourse",
100
+ "79": "INTJ|root",
101
+ "80": "NOUN",
102
+ "81": "NOUN|Polarity=Neg",
103
+ "82": "NOUN|Polarity=Neg|l-obl",
104
+ "83": "NOUN|Polarity=Neg|root",
105
+ "84": "NOUN|l-acl",
106
+ "85": "NOUN|l-advcl",
107
+ "86": "NOUN|l-ccomp",
108
+ "87": "NOUN|l-compound",
109
+ "88": "NOUN|l-csubj",
110
+ "89": "NOUN|l-csubj:outer",
111
+ "90": "NOUN|l-nmod",
112
+ "91": "NOUN|l-nsubj",
113
+ "92": "NOUN|l-nsubj:outer",
114
+ "93": "NOUN|l-obj",
115
+ "94": "NOUN|l-obl",
116
+ "95": "NOUN|r-compound",
117
+ "96": "NOUN|r-nmod",
118
+ "97": "NOUN|r-nsubj",
119
+ "98": "NOUN|root",
120
+ "99": "NUM",
121
+ "100": "NUM|l-advcl",
122
+ "101": "NUM|l-compound",
123
+ "102": "NUM|l-nmod",
124
+ "103": "NUM|l-nsubj",
125
+ "104": "NUM|l-nsubj:outer",
126
+ "105": "NUM|l-nummod",
127
+ "106": "NUM|l-obj",
128
+ "107": "NUM|l-obl",
129
+ "108": "NUM|r-compound",
130
+ "109": "NUM|root",
131
+ "110": "PART",
132
+ "111": "PART|l-mark",
133
+ "112": "PART|r-mark",
134
+ "113": "PRON",
135
+ "114": "PRON|l-acl",
136
+ "115": "PRON|l-advcl",
137
+ "116": "PRON|l-nmod",
138
+ "117": "PRON|l-nsubj",
139
+ "118": "PRON|l-nsubj:outer",
140
+ "119": "PRON|l-obj",
141
+ "120": "PRON|l-obl",
142
+ "121": "PRON|root",
143
+ "122": "PROPN",
144
+ "123": "PROPN|l-acl",
145
+ "124": "PROPN|l-advcl",
146
+ "125": "PROPN|l-compound",
147
+ "126": "PROPN|l-nmod",
148
+ "127": "PROPN|l-nsubj",
149
+ "128": "PROPN|l-nsubj:outer",
150
+ "129": "PROPN|l-obj",
151
+ "130": "PROPN|l-obl",
152
+ "131": "PROPN|r-compound",
153
+ "132": "PROPN|r-nmod",
154
+ "133": "PROPN|root",
155
+ "134": "PUNCT",
156
+ "135": "PUNCT|l-punct",
157
+ "136": "PUNCT|r-punct",
158
+ "137": "SCONJ",
159
+ "138": "SCONJ|l-dep",
160
+ "139": "SCONJ|r-fixed",
161
+ "140": "SCONJ|r-mark",
162
+ "141": "SYM",
163
+ "142": "SYM|l-compound",
164
+ "143": "SYM|l-dep",
165
+ "144": "SYM|l-nmod",
166
+ "145": "SYM|l-obl",
167
+ "146": "SYM|r-compound",
168
+ "147": "SYM|r-dep",
169
+ "148": "VERB",
170
+ "149": "VERB|l-acl",
171
+ "150": "VERB|l-advcl",
172
+ "151": "VERB|l-ccomp",
173
+ "152": "VERB|l-compound",
174
+ "153": "VERB|l-csubj",
175
+ "154": "VERB|l-csubj:outer",
176
+ "155": "VERB|l-nmod",
177
+ "156": "VERB|l-obj",
178
+ "157": "VERB|l-obl",
179
+ "158": "VERB|r-acl",
180
+ "159": "VERB|r-advcl",
181
+ "160": "VERB|r-compound",
182
+ "161": "VERB|root",
183
+ "162": "X",
184
+ "163": "X|l-nmod",
185
+ "164": "X|r-dep"
186
+ },
187
+ "initializer_range": 0.02,
188
+ "label2id": {
189
+ "ADJ": 0,
190
+ "ADJ|l-acl": 1,
191
+ "ADJ|l-advcl": 2,
192
+ "ADJ|l-amod": 3,
193
+ "ADJ|l-ccomp": 4,
194
+ "ADJ|l-csubj": 5,
195
+ "ADJ|l-csubj:outer": 6,
196
+ "ADJ|l-nmod": 7,
197
+ "ADJ|l-nsubj": 8,
198
+ "ADJ|l-obj": 9,
199
+ "ADJ|l-obl": 10,
200
+ "ADJ|r-acl": 11,
201
+ "ADJ|r-amod": 12,
202
+ "ADJ|r-dep": 13,
203
+ "ADJ|root": 14,
204
+ "ADP": 15,
205
+ "ADP|l-case": 16,
206
+ "ADP|r-case": 17,
207
+ "ADP|r-fixed": 18,
208
+ "ADV": 19,
209
+ "ADV|l-advcl": 20,
210
+ "ADV|l-advmod": 21,
211
+ "ADV|l-obj": 22,
212
+ "ADV|r-dep": 23,
213
+ "ADV|root": 24,
214
+ "AUX": 25,
215
+ "AUX|Polarity=Neg": 26,
216
+ "AUX|Polarity=Neg|r-aux": 27,
217
+ "AUX|Polarity=Neg|r-fixed": 28,
218
+ "AUX|r-aux": 29,
219
+ "AUX|r-cop": 30,
220
+ "AUX|r-fixed": 31,
221
+ "AUX|root": 32,
222
+ "B-ADJ": 33,
223
+ "B-ADP": 34,
224
+ "B-ADV": 35,
225
+ "B-AUX": 36,
226
+ "B-AUX|Polarity=Neg": 37,
227
+ "B-CCONJ": 38,
228
+ "B-DET": 39,
229
+ "B-INTJ": 40,
230
+ "B-NOUN": 41,
231
+ "B-NOUN|Polarity=Neg": 42,
232
+ "B-NUM": 43,
233
+ "B-PART": 44,
234
+ "B-PRON": 45,
235
+ "B-PROPN": 46,
236
+ "B-PUNCT": 47,
237
+ "B-SCONJ": 48,
238
+ "B-SYM": 49,
239
+ "B-VERB": 50,
240
+ "B-X": 51,
241
+ "CCONJ": 52,
242
+ "CCONJ|l-cc": 53,
243
+ "CCONJ|r-cc": 54,
244
+ "DET": 55,
245
+ "DET|l-det": 56,
246
+ "I-ADJ": 57,
247
+ "I-ADP": 58,
248
+ "I-ADV": 59,
249
+ "I-AUX": 60,
250
+ "I-AUX|Polarity=Neg": 61,
251
+ "I-CCONJ": 62,
252
+ "I-DET": 63,
253
+ "I-INTJ": 64,
254
+ "I-NOUN": 65,
255
+ "I-NOUN|Polarity=Neg": 66,
256
+ "I-NUM": 67,
257
+ "I-PART": 68,
258
+ "I-PRON": 69,
259
+ "I-PROPN": 70,
260
+ "I-PUNCT": 71,
261
+ "I-SCONJ": 72,
262
+ "I-SYM": 73,
263
+ "I-VERB": 74,
264
+ "I-X": 75,
265
+ "INTJ": 76,
266
+ "INTJ|l-discourse": 77,
267
+ "INTJ|r-discourse": 78,
268
+ "INTJ|root": 79,
269
+ "NOUN": 80,
270
+ "NOUN|Polarity=Neg": 81,
271
+ "NOUN|Polarity=Neg|l-obl": 82,
272
+ "NOUN|Polarity=Neg|root": 83,
273
+ "NOUN|l-acl": 84,
274
+ "NOUN|l-advcl": 85,
275
+ "NOUN|l-ccomp": 86,
276
+ "NOUN|l-compound": 87,
277
+ "NOUN|l-csubj": 88,
278
+ "NOUN|l-csubj:outer": 89,
279
+ "NOUN|l-nmod": 90,
280
+ "NOUN|l-nsubj": 91,
281
+ "NOUN|l-nsubj:outer": 92,
282
+ "NOUN|l-obj": 93,
283
+ "NOUN|l-obl": 94,
284
+ "NOUN|r-compound": 95,
285
+ "NOUN|r-nmod": 96,
286
+ "NOUN|r-nsubj": 97,
287
+ "NOUN|root": 98,
288
+ "NUM": 99,
289
+ "NUM|l-advcl": 100,
290
+ "NUM|l-compound": 101,
291
+ "NUM|l-nmod": 102,
292
+ "NUM|l-nsubj": 103,
293
+ "NUM|l-nsubj:outer": 104,
294
+ "NUM|l-nummod": 105,
295
+ "NUM|l-obj": 106,
296
+ "NUM|l-obl": 107,
297
+ "NUM|r-compound": 108,
298
+ "NUM|root": 109,
299
+ "PART": 110,
300
+ "PART|l-mark": 111,
301
+ "PART|r-mark": 112,
302
+ "PRON": 113,
303
+ "PRON|l-acl": 114,
304
+ "PRON|l-advcl": 115,
305
+ "PRON|l-nmod": 116,
306
+ "PRON|l-nsubj": 117,
307
+ "PRON|l-nsubj:outer": 118,
308
+ "PRON|l-obj": 119,
309
+ "PRON|l-obl": 120,
310
+ "PRON|root": 121,
311
+ "PROPN": 122,
312
+ "PROPN|l-acl": 123,
313
+ "PROPN|l-advcl": 124,
314
+ "PROPN|l-compound": 125,
315
+ "PROPN|l-nmod": 126,
316
+ "PROPN|l-nsubj": 127,
317
+ "PROPN|l-nsubj:outer": 128,
318
+ "PROPN|l-obj": 129,
319
+ "PROPN|l-obl": 130,
320
+ "PROPN|r-compound": 131,
321
+ "PROPN|r-nmod": 132,
322
+ "PROPN|root": 133,
323
+ "PUNCT": 134,
324
+ "PUNCT|l-punct": 135,
325
+ "PUNCT|r-punct": 136,
326
+ "SCONJ": 137,
327
+ "SCONJ|l-dep": 138,
328
+ "SCONJ|r-fixed": 139,
329
+ "SCONJ|r-mark": 140,
330
+ "SYM": 141,
331
+ "SYM|l-compound": 142,
332
+ "SYM|l-dep": 143,
333
+ "SYM|l-nmod": 144,
334
+ "SYM|l-obl": 145,
335
+ "SYM|r-compound": 146,
336
+ "SYM|r-dep": 147,
337
+ "VERB": 148,
338
+ "VERB|l-acl": 149,
339
+ "VERB|l-advcl": 150,
340
+ "VERB|l-ccomp": 151,
341
+ "VERB|l-compound": 152,
342
+ "VERB|l-csubj": 153,
343
+ "VERB|l-csubj:outer": 154,
344
+ "VERB|l-nmod": 155,
345
+ "VERB|l-obj": 156,
346
+ "VERB|l-obl": 157,
347
+ "VERB|r-acl": 158,
348
+ "VERB|r-advcl": 159,
349
+ "VERB|r-compound": 160,
350
+ "VERB|root": 161,
351
+ "X": 162,
352
+ "X|l-nmod": 163,
353
+ "X|r-dep": 164
354
+ },
355
+ "layer_norm_epsilon": 1e-05,
356
+ "model_type": "gpt2",
357
+ "n_ctx": 512,
358
+ "n_embd": 768,
359
+ "n_head": 12,
360
+ "n_inner": 3072,
361
+ "n_layer": 12,
362
+ "n_positions": 512,
363
+ "pad_token_id": 50002,
364
+ "prefix": "[CLS]",
365
+ "reorder_and_upcast_attn": false,
366
+ "resid_pdrop": 0.1,
367
+ "scale_attn_by_inverse_layer_idx": false,
368
+ "scale_attn_weights": true,
369
+ "summary_activation": null,
370
+ "summary_first_dropout": 0.1,
371
+ "summary_proj_to_labels": true,
372
+ "summary_type": "cls_index",
373
+ "summary_use_proj": true,
374
+ "tokenizer_class": "PreTrainedTokenizerFast",
375
+ "torch_dtype": "float32",
376
+ "transformers_version": "4.42.4",
377
+ "use_cache": true,
378
+ "vocab_size": 51200
379
+ }
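The composite labels in `id2label` pack three things into one tag: the UPOS (optionally with a FEATS item such as `Polarity=Neg`), a `B-`/`I-` prefix for tokens that begin or continue a multi-token word, and a dependency suffix, where `|root` marks the root, `|l-…` a dependent whose head comes later in the sentence, and `|r-…` one whose head comes earlier (see how maker.py derives them from the CoNLL-U HEAD column). A hypothetical `split_label` helper, not part of the commit, illustrating the decomposition:

```
def split_label(label):
  w=label.split("|")
  upos=w[0]                               # UPOS, possibly carrying a B-/I- prefix
  feats=[x for x in w[1:] if "=" in x]    # e.g. ["Polarity=Neg"]
  dep=[x for x in w[1:] if "=" not in x]  # e.g. ["l-advcl"] or ["root"]
  return upos,feats,dep[0] if dep else None

print(split_label("VERB|l-advcl"))            # ('VERB', [], 'l-advcl')
print(split_label("AUX|Polarity=Neg|r-aux"))  # ('AUX', ['Polarity=Neg'], 'r-aux')
```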
maker.py ADDED
@@ -0,0 +1,128 @@
1
+ #! /usr/bin/python3
2
+ src="goldfish-models/jpn_jpan_1000mb"
3
+ tgt="KoichiYasuoka/goldfish-gpt2-japanese-1000mb-ud-causal"
4
+ url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
5
+
6
+ import os,json
7
+ from transformers import AutoTokenizer,PreTrainedTokenizerFast,AutoConfig,GPT2ForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
8
+ from tokenizers import pre_tokenizers,decoders
9
+ d=os.path.basename(url)
10
+ os.system("test -d "+d+" || git clone --depth=1 "+url)
11
+ os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
12
+ tkz=AutoTokenizer.from_pretrained(src,add_prefix_space=False,legacy=False,model_max_length=768)
13
+ tkz.backend_tokenizer.pre_tokenizer=pre_tokenizers.Metaspace(prepend_scheme="never")
14
+ tkz.backend_tokenizer.decoder=decoders.Metaspace(prepend_scheme="never")
15
+ tkz.save_pretrained("tmpdir")
16
+ d=json.loads(tkz.backend_tokenizer.to_str())
17
+ form=set()
18
+ with open("train.conllu","r",encoding="utf-8") as r:
19
+ for s in r:
20
+ w=s.split("\t")
21
+ if len(w)==10 and w[0].isdecimal():
22
+ form.add(w[1])
23
+ for t in d["model"]["vocab"]:
24
+ if t[0] not in form:
25
+ t[1]*=len(t[0])
26
+ tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/tokenizer.json")
27
+ tkz=PreTrainedTokenizerFast.from_pretrained("tmpdir")
28
+
29
+ class UDCausalDataset(object):
30
+ def __init__(self,conllu,tokenizer,embeddings=None):
31
+ self.conllu=open(conllu,"r",encoding="utf-8")
32
+ self.tokenizer=tokenizer
33
+ self.embeddings=embeddings
34
+ self.max_tokens=3
35
+ self.seeks=[(0,0)]
36
+ label=set(["SYM"])
37
+ dep=set()
38
+ s=self.conllu.readline()
39
+ while s!="":
40
+ if s=="\n":
41
+ self.seeks.append((self.conllu.tell(),0))
42
+ else:
43
+ w=s.split("\t")
44
+ if len(w)==10:
45
+ if w[0].isdecimal():
46
+ p=w[3] if w[5]=="_" else w[3]+"|"+w[5]
47
+ label.add(p)
48
+ dep.add(p+("|" if w[6]=="0" else "|l-" if int(w[0])<int(w[6]) else "|r-")+w[7])
49
+ self.seeks.append((self.seeks[-1][0],int(w[0])))
50
+ self.max_tokens=max(self.max_tokens,int(w[0])*2+1)
51
+ s=self.conllu.readline()
52
+ lid={}
53
+ for i,l in enumerate(sorted(label)):
54
+ lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2
55
+ for i,d in enumerate(sorted(dep),len(lid)):
56
+ lid[d]=i
57
+ self.label2id=lid
58
+ def __call__(*args):
59
+ lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
60
+ for t in args:
61
+ t.label2id=lid
62
+ return lid
63
+ def __del__(self):
64
+ self.conllu.close()
65
+ __len__=lambda self:len(self.seeks)-1
66
+ def __getitem__(self,i):
67
+ s,t=self.seeks[i]
68
+ self.conllu.seek(s)
69
+ form,upos,deps,w=[],[],[],[""]
70
+ while w[0]!="\n":
71
+ w=self.conllu.readline().split("\t")
72
+ if len(w)==10:
73
+ form.append(w[1])
74
+ if w[0].isdecimal():
75
+ upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
76
+ deps.append((int(w[6]),w[7]))
77
+ v=self.tokenizer(form,add_special_tokens=False)
78
+ if t==0:
79
+ i,u=[self.tokenizer.cls_token_id],["SYM"]
80
+ for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
81
+ if x!=[]:
82
+ i+=x
83
+ u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
84
+ emb=self.embeddings
85
+ pad=self.tokenizer.pad_token_id
86
+ else:
87
+ import torch
88
+ m=[]
89
+ for x in v["input_ids"]:
90
+ if x==[]:
91
+ m.append(self.embeddings[self.tokenizer.unk_token_id,:])
92
+ else:
93
+ m.append(self.embeddings[x,:].sum(axis=0))
94
+ m.append(self.embeddings[self.tokenizer.sep_token_id,:])
95
+ m.append(self.embeddings[self.tokenizer.pad_token_id,:])
96
+ m.append(self.embeddings[self.tokenizer.cls_token_id,:])
97
+ emb=torch.stack(m)
98
+ i,u=list(range(-1,len(upos)+1)),["SYM"]+upos+["SYM"]
99
+ i.append(t-1)
100
+ k,d=deps[t-1]
101
+ u.append(upos[t-1]+"|"+d if k==0 else upos[t-1])
102
+ for j in range(t,len(upos)):
103
+ i.append(j)
104
+ a,b=deps[j]
105
+ u.append(upos[j]+"|r-"+b if a==t else upos[t-1]+"|l-"+d if j+1==k else upos[j])
106
+ pad=-2
107
+ j=self.max_tokens-len(i)
108
+ if j>0:
109
+ ids=i+[pad]*j
110
+ upos=u+["SYM"]*j
111
+ else:
112
+ ids=i[0:self.max_tokens]
113
+ upos=u[0:self.max_tokens]
114
+ return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
115
+
116
+ trainDS=UDCausalDataset("train.conllu",tkz)
117
+ devDS=UDCausalDataset("dev.conllu",tkz)
118
+ testDS=UDCausalDataset("test.conllu",tkz)
119
+ lid=trainDS(devDS,testDS)
120
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
121
+ mdl=GPT2ForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
122
+ trainDS.embeddings=mdl.get_input_embeddings().weight
123
+ trainDS.max_tokens=min(trainDS.max_tokens,cfg.max_position_embeddings)
124
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
125
+ trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
126
+ trn.train()
127
+ trn.save_model(tgt)
128
+ tkz.save_pretrained(tgt)
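maker.py is the training script behind this release: it clones UD_Japanese-GSDLUW, scales the unigram score of every tokenizer vocabulary item that never occurs as a long-unit-word form in train.conllu (the `t[1]*=len(t[0])` line, which makes unseen multi-character pieces less likely — apparently the tokenizer refinement named in the commit message), then fine-tunes GPT2ForTokenClassification for three epochs. A quick way to inspect the effect of the refined tokenizer, assuming the released model is used as-is:

```
from transformers import AutoTokenizer
tkz=AutoTokenizer.from_pretrained("KoichiYasuoka/goldfish-gpt2-japanese-1000mb-ud-causal")
print(tkz.tokenize("全学年にわたって小学校の国語の教科書に挿し絵が用いられている"))
```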
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23bae96766c8421ee8f7868d4e78c7c70a6534c1e6b759470397aa5d7ff81134
3
+ size 499639714
special_tokens_map.json ADDED
@@ -0,0 +1,1249 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "[XXXXX0]",
4
+ "[XXXXX1]",
5
+ "[XXXXX2]",
6
+ "[XXXXX3]",
7
+ "[XXXXX4]",
8
+ "[XXXXX5]",
9
+ "[XXXXX6]",
10
+ "[XXXXX7]",
11
+ "[XXXXX8]",
12
+ "[XXXXX9]",
13
+ "[XXXXX10]",
14
+ "[XXXXX11]",
15
+ "[XXXXX12]",
16
+ "[XXXXX13]",
17
+ "[XXXXX14]",
18
+ "[XXXXX15]",
19
+ "[XXXXX16]",
20
+ "[XXXXX17]",
21
+ "[XXXXX18]",
22
+ "[XXXXX19]",
23
+ "[XXXXX20]",
24
+ "[XXXXX21]",
25
+ "[XXXXX22]",
26
+ "[XXXXX23]",
27
+ "[XXXXX24]",
28
+ "[XXXXX25]",
29
+ "[XXXXX26]",
30
+ "[XXXXX27]",
31
+ "[XXXXX28]",
32
+ "[XXXXX29]",
33
+ "[XXXXX30]",
34
+ "[XXXXX31]",
35
+ "[XXXXX32]",
36
+ "[XXXXX33]",
37
+ "[XXXXX34]",
38
+ "[XXXXX35]",
39
+ "[XXXXX36]",
40
+ "[XXXXX37]",
41
+ "[XXXXX38]",
42
+ "[XXXXX39]",
43
+ "[XXXXX40]",
44
+ "[XXXXX41]",
45
+ "[XXXXX42]",
46
+ "[XXXXX43]",
47
+ "[XXXXX44]",
48
+ "[XXXXX45]",
49
+ "[XXXXX46]",
50
+ "[XXXXX47]",
51
+ "[XXXXX48]",
52
+ "[XXXXX49]",
53
+ "[XXXXX50]",
54
+ "[XXXXX51]",
55
+ "[XXXXX52]",
56
+ "[XXXXX53]",
57
+ "[XXXXX54]",
58
+ "[XXXXX55]",
59
+ "[XXXXX56]",
60
+ "[XXXXX57]",
61
+ "[XXXXX58]",
62
+ "[XXXXX59]",
63
+ "[XXXXX60]",
64
+ "[XXXXX61]",
65
+ "[XXXXX62]",
66
+ "[XXXXX63]",
67
+ "[XXXXX64]",
68
+ "[XXXXX65]",
69
+ "[XXXXX66]",
70
+ "[XXXXX67]",
71
+ "[XXXXX68]",
72
+ "[XXXXX69]",
73
+ "[XXXXX70]",
74
+ "[XXXXX71]",
75
+ "[XXXXX72]",
76
+ "[XXXXX73]",
77
+ "[XXXXX74]",
78
+ "[XXXXX75]",
79
+ "[XXXXX76]",
80
+ "[XXXXX77]",
81
+ "[XXXXX78]",
82
+ "[XXXXX79]",
83
+ "[XXXXX80]",
84
+ "[XXXXX81]",
85
+ "[XXXXX82]",
86
+ "[XXXXX83]",
87
+ "[XXXXX84]",
88
+ "[XXXXX85]",
89
+ "[XXXXX86]",
90
+ "[XXXXX87]",
91
+ "[XXXXX88]",
92
+ "[XXXXX89]",
93
+ "[XXXXX90]",
94
+ "[XXXXX91]",
95
+ "[XXXXX92]",
96
+ "[XXXXX93]",
97
+ "[XXXXX94]",
98
+ "[XXXXX95]",
99
+ "[XXXXX96]",
100
+ "[XXXXX97]",
101
+ "[XXXXX98]",
102
+ "[XXXXX99]",
103
+ "[XXXXX100]",
104
+ "[XXXXX101]",
105
+ "[XXXXX102]",
106
+ "[XXXXX103]",
107
+ "[XXXXX104]",
108
+ "[XXXXX105]",
109
+ "[XXXXX106]",
110
+ "[XXXXX107]",
111
+ "[XXXXX108]",
112
+ "[XXXXX109]",
113
+ "[XXXXX110]",
114
+ "[XXXXX111]",
115
+ "[XXXXX112]",
116
+ "[XXXXX113]",
117
+ "[XXXXX114]",
118
+ "[XXXXX115]",
119
+ "[XXXXX116]",
120
+ "[XXXXX117]",
121
+ "[XXXXX118]",
122
+ "[XXXXX119]",
123
+ "[XXXXX120]",
124
+ "[XXXXX121]",
125
+ "[XXXXX122]",
126
+ "[XXXXX123]",
127
+ "[XXXXX124]",
128
+ "[XXXXX125]",
129
+ "[XXXXX126]",
130
+ "[XXXXX127]",
131
+ "[XXXXX128]",
132
+ "[XXXXX129]",
133
+ "[XXXXX130]",
134
+ "[XXXXX131]",
135
+ "[XXXXX132]",
136
+ "[XXXXX133]",
137
+ "[XXXXX134]",
138
+ "[XXXXX135]",
139
+ "[XXXXX136]",
140
+ "[XXXXX137]",
141
+ "[XXXXX138]",
142
+ "[XXXXX139]",
143
+ "[XXXXX140]",
144
+ "[XXXXX141]",
145
+ "[XXXXX142]",
146
+ "[XXXXX143]",
147
+ "[XXXXX144]",
148
+ "[XXXXX145]",
149
+ "[XXXXX146]",
150
+ "[XXXXX147]",
151
+ "[XXXXX148]",
152
+ "[XXXXX149]",
153
+ "[XXXXX150]",
154
+ "[XXXXX151]",
155
+ "[XXXXX152]",
156
+ "[XXXXX153]",
157
+ "[XXXXX154]",
158
+ "[XXXXX155]",
159
+ "[XXXXX156]",
160
+ "[XXXXX157]",
161
+ "[XXXXX158]",
162
+ "[XXXXX159]",
163
+ "[XXXXX160]",
164
+ "[XXXXX161]",
165
+ "[XXXXX162]",
166
+ "[XXXXX163]",
167
+ "[XXXXX164]",
168
+ "[XXXXX165]",
169
+ "[XXXXX166]",
170
+ "[XXXXX167]",
171
+ "[XXXXX168]",
172
+ "[XXXXX169]",
173
+ "[XXXXX170]",
174
+ "[XXXXX171]",
175
+ "[XXXXX172]",
176
+ "[XXXXX173]",
177
+ "[XXXXX174]",
178
+ "[XXXXX175]",
179
+ "[XXXXX176]",
180
+ "[XXXXX177]",
181
+ "[XXXXX178]",
182
+ "[XXXXX179]",
183
+ "[XXXXX180]",
184
+ "[XXXXX181]",
185
+ "[XXXXX182]",
186
+ "[XXXXX183]",
187
+ "[XXXXX184]",
188
+ "[XXXXX185]",
189
+ "[XXXXX186]",
190
+ "[XXXXX187]",
191
+ "[XXXXX188]",
192
+ "[XXXXX189]",
193
+ "[XXXXX190]",
194
+ "[XXXXX191]",
195
+ "[XXXXX192]",
196
+ "[XXXXX193]",
197
+ "[XXXXX194]",
198
+ "[XXXXX195]",
199
+ "[XXXXX196]",
200
+ "[XXXXX197]",
201
+ "[XXXXX198]",
202
+ "[XXXXX199]",
203
+ "[XXXXX200]",
204
+ "[XXXXX201]",
205
+ "[XXXXX202]",
206
+ "[XXXXX203]",
207
+ "[XXXXX204]",
208
+ "[XXXXX205]",
209
+ "[XXXXX206]",
210
+ "[XXXXX207]",
211
+ "[XXXXX208]",
212
+ "[XXXXX209]",
213
+ "[XXXXX210]",
214
+ "[XXXXX211]",
215
+ "[XXXXX212]",
216
+ "[XXXXX213]",
217
+ "[XXXXX214]",
218
+ "[XXXXX215]",
219
+ "[XXXXX216]",
220
+ "[XXXXX217]",
221
+ "[XXXXX218]",
222
+ "[XXXXX219]",
223
+ "[XXXXX220]",
224
+ "[XXXXX221]",
225
+ "[XXXXX222]",
226
+ "[XXXXX223]",
227
+ "[XXXXX224]",
228
+ "[XXXXX225]",
229
+ "[XXXXX226]",
230
+ "[XXXXX227]",
231
+ "[XXXXX228]",
232
+ "[XXXXX229]",
233
+ "[XXXXX230]",
234
+ "[XXXXX231]",
235
+ "[XXXXX232]",
236
+ "[XXXXX233]",
237
+ "[XXXXX234]",
238
+ "[XXXXX235]",
239
+ "[XXXXX236]",
240
+ "[XXXXX237]",
241
+ "[XXXXX238]",
242
+ "[XXXXX239]",
243
+ "[XXXXX240]",
244
+ "[XXXXX241]",
245
+ "[XXXXX242]",
246
+ "[XXXXX243]",
247
+ "[XXXXX244]",
248
+ "[XXXXX245]",
249
+ "[XXXXX246]",
250
+ "[XXXXX247]",
251
+ "[XXXXX248]",
252
+ "[XXXXX249]",
253
+ "[XXXXX250]",
254
+ "[XXXXX251]",
255
+ "[XXXXX252]",
256
+ "[XXXXX253]",
257
+ "[XXXXX254]",
258
+ "[XXXXX255]",
259
+ "[XXXXX256]",
260
+ "[XXXXX257]",
261
+ "[XXXXX258]",
262
+ "[XXXXX259]",
263
+ "[XXXXX260]",
264
+ "[XXXXX261]",
265
+ "[XXXXX262]",
266
+ "[XXXXX263]",
267
+ "[XXXXX264]",
268
+ "[XXXXX265]",
269
+ "[XXXXX266]",
270
+ "[XXXXX267]",
271
+ "[XXXXX268]",
272
+ "[XXXXX269]",
273
+ "[XXXXX270]",
274
+ "[XXXXX271]",
275
+ "[XXXXX272]",
276
+ "[XXXXX273]",
277
+ "[XXXXX274]",
278
+ "[XXXXX275]",
279
+ "[XXXXX276]",
280
+ "[XXXXX277]",
281
+ "[XXXXX278]",
282
+ "[XXXXX279]",
283
+ "[XXXXX280]",
284
+ "[XXXXX281]",
285
+ "[XXXXX282]",
286
+ "[XXXXX283]",
287
+ "[XXXXX284]",
288
+ "[XXXXX285]",
289
+ "[XXXXX286]",
290
+ "[XXXXX287]",
291
+ "[XXXXX288]",
292
+ "[XXXXX289]",
293
+ "[XXXXX290]",
294
+ "[XXXXX291]",
295
+ "[XXXXX292]",
296
+ "[XXXXX293]",
297
+ "[XXXXX294]",
298
+ "[XXXXX295]",
299
+ "[XXXXX296]",
300
+ "[XXXXX297]",
301
+ "[XXXXX298]",
302
+ "[XXXXX299]",
303
+ "[XXXXX300]",
304
+ "[XXXXX301]",
305
+ "[XXXXX302]",
306
+ "[XXXXX303]",
307
+ "[XXXXX304]",
308
+ "[XXXXX305]",
309
+ "[XXXXX306]",
310
+ "[XXXXX307]",
311
+ "[XXXXX308]",
312
+ "[XXXXX309]",
313
+ "[XXXXX310]",
314
+ "[XXXXX311]",
315
+ "[XXXXX312]",
316
+ "[XXXXX313]",
317
+ "[XXXXX314]",
318
+ "[XXXXX315]",
319
+ "[XXXXX316]",
320
+ "[XXXXX317]",
321
+ "[XXXXX318]",
322
+ "[XXXXX319]",
323
+ "[XXXXX320]",
324
+ "[XXXXX321]",
325
+ "[XXXXX322]",
326
+ "[XXXXX323]",
327
+ "[XXXXX324]",
328
+ "[XXXXX325]",
329
+ "[XXXXX326]",
330
+ "[XXXXX327]",
331
+ "[XXXXX328]",
332
+ "[XXXXX329]",
333
+ "[XXXXX330]",
334
+ "[XXXXX331]",
335
+ "[XXXXX332]",
336
+ "[XXXXX333]",
337
+ "[XXXXX334]",
338
+ "[XXXXX335]",
339
+ "[XXXXX336]",
340
+ "[XXXXX337]",
341
+ "[XXXXX338]",
342
+ "[XXXXX339]",
343
+ "[XXXXX340]",
344
+ "[XXXXX341]",
345
+ "[XXXXX342]",
346
+ "[XXXXX343]",
347
+ "[XXXXX344]",
348
+ "[XXXXX345]",
349
+ "[XXXXX346]",
350
+ "[XXXXX347]",
351
+ "[XXXXX348]",
352
+ "[XXXXX349]",
353
+ "[XXXXX350]",
354
+ "[XXXXX351]",
355
+ "[XXXXX352]",
356
+ "[XXXXX353]",
357
+ "[XXXXX354]",
358
+ "[XXXXX355]",
359
+ "[XXXXX356]",
360
+ "[XXXXX357]",
361
+ "[XXXXX358]",
362
+ "[XXXXX359]",
363
+ "[XXXXX360]",
364
+ "[XXXXX361]",
365
+ "[XXXXX362]",
366
+ "[XXXXX363]",
367
+ "[XXXXX364]",
368
+ "[XXXXX365]",
369
+ "[XXXXX366]",
370
+ "[XXXXX367]",
371
+ "[XXXXX368]",
372
+ "[XXXXX369]",
373
+ "[XXXXX370]",
374
+ "[XXXXX371]",
375
+ "[XXXXX372]",
376
+ "[XXXXX373]",
377
+ "[XXXXX374]",
378
+ "[XXXXX375]",
379
+ "[XXXXX376]",
380
+ "[XXXXX377]",
381
+ "[XXXXX378]",
382
+ "[XXXXX379]",
383
+ "[XXXXX380]",
384
+ "[XXXXX381]",
385
+ "[XXXXX382]",
386
+ "[XXXXX383]",
387
+ "[XXXXX384]",
388
+ "[XXXXX385]",
389
+ "[XXXXX386]",
390
+ "[XXXXX387]",
391
+ "[XXXXX388]",
392
+ "[XXXXX389]",
393
+ "[XXXXX390]",
394
+ "[XXXXX391]",
395
+ "[XXXXX392]",
396
+ "[XXXXX393]",
397
+ "[XXXXX394]",
398
+ "[XXXXX395]",
399
+ "[XXXXX396]",
400
+ "[XXXXX397]",
401
+ "[XXXXX398]",
402
+ "[XXXXX399]",
403
+ "[XXXXX400]",
404
+ "[XXXXX401]",
405
+ "[XXXXX402]",
406
+ "[XXXXX403]",
407
+ "[XXXXX404]",
408
+ "[XXXXX405]",
409
+ "[XXXXX406]",
410
+ "[XXXXX407]",
411
+ "[XXXXX408]",
412
+ "[XXXXX409]",
413
+ "[XXXXX410]",
414
+ "[XXXXX411]",
415
+ "[XXXXX412]",
416
+ "[XXXXX413]",
417
+ "[XXXXX414]",
418
+ "[XXXXX415]",
419
+ "[XXXXX416]",
420
+ "[XXXXX417]",
421
+ "[XXXXX418]",
422
+ "[XXXXX419]",
423
+ "[XXXXX420]",
424
+ "[XXXXX421]",
425
+ "[XXXXX422]",
426
+ "[XXXXX423]",
427
+ "[XXXXX424]",
428
+ "[XXXXX425]",
429
+ "[XXXXX426]",
430
+ "[XXXXX427]",
431
+ "[XXXXX428]",
432
+ "[XXXXX429]",
433
+ "[XXXXX430]",
434
+ "[XXXXX431]",
435
+ "[XXXXX432]",
436
+ "[XXXXX433]",
437
+ "[XXXXX434]",
438
+ "[XXXXX435]",
439
+ "[XXXXX436]",
440
+ "[XXXXX437]",
441
+ "[XXXXX438]",
442
+ "[XXXXX439]",
443
+ "[XXXXX440]",
444
+ "[XXXXX441]",
445
+ "[XXXXX442]",
446
+ "[XXXXX443]",
447
+ "[XXXXX444]",
448
+ "[XXXXX445]",
449
+ "[XXXXX446]",
450
+ "[XXXXX447]",
451
+ "[XXXXX448]",
452
+ "[XXXXX449]",
453
+ "[XXXXX450]",
454
+ "[XXXXX451]",
455
+ "[XXXXX452]",
456
+ "[XXXXX453]",
457
+ "[XXXXX454]",
458
+ "[XXXXX455]",
459
+ "[XXXXX456]",
460
+ "[XXXXX457]",
461
+ "[XXXXX458]",
462
+ "[XXXXX459]",
463
+ "[XXXXX460]",
464
+ "[XXXXX461]",
465
+ "[XXXXX462]",
466
+ "[XXXXX463]",
467
+ "[XXXXX464]",
468
+ "[XXXXX465]",
469
+ "[XXXXX466]",
470
+ "[XXXXX467]",
471
+ "[XXXXX468]",
472
+ "[XXXXX469]",
473
+ "[XXXXX470]",
474
+ "[XXXXX471]",
475
+ "[XXXXX472]",
476
+ "[XXXXX473]",
477
+ "[XXXXX474]",
478
+ "[XXXXX475]",
479
+ "[XXXXX476]",
480
+ "[XXXXX477]",
481
+ "[XXXXX478]",
482
+ "[XXXXX479]",
483
+ "[XXXXX480]",
484
+ "[XXXXX481]",
485
+ "[XXXXX482]",
486
+ "[XXXXX483]",
487
+ "[XXXXX484]",
488
+ "[XXXXX485]",
489
+ "[XXXXX486]",
490
+ "[XXXXX487]",
491
+ "[XXXXX488]",
492
+ "[XXXXX489]",
493
+ "[XXXXX490]",
494
+ "[XXXXX491]",
495
+ "[XXXXX492]",
496
+ "[XXXXX493]",
497
+ "[XXXXX494]",
498
+ "[XXXXX495]",
499
+ "[XXXXX496]",
500
+ "[XXXXX497]",
501
+ "[XXXXX498]",
502
+ "[XXXXX499]",
503
+ "[XXXXX500]",
504
+ "[XXXXX501]",
505
+ "[XXXXX502]",
506
+ "[XXXXX503]",
507
+ "[XXXXX504]",
508
+ "[XXXXX505]",
509
+ "[XXXXX506]",
510
+ "[XXXXX507]",
511
+ "[XXXXX508]",
512
+ "[XXXXX509]",
513
+ "[XXXXX510]",
514
+ "[XXXXX511]",
515
+ "[XXXXX512]",
516
+ "[XXXXX513]",
517
+ "[XXXXX514]",
518
+ "[XXXXX515]",
519
+ "[XXXXX516]",
520
+ "[XXXXX517]",
521
+ "[XXXXX518]",
522
+ "[XXXXX519]",
523
+ "[XXXXX520]",
524
+ "[XXXXX521]",
525
+ "[XXXXX522]",
526
+ "[XXXXX523]",
527
+ "[XXXXX524]",
528
+ "[XXXXX525]",
529
+ "[XXXXX526]",
530
+ "[XXXXX527]",
531
+ "[XXXXX528]",
532
+ "[XXXXX529]",
533
+ "[XXXXX530]",
534
+ "[XXXXX531]",
535
+ "[XXXXX532]",
536
+ "[XXXXX533]",
537
+ "[XXXXX534]",
538
+ "[XXXXX535]",
539
+ "[XXXXX536]",
540
+ "[XXXXX537]",
541
+ "[XXXXX538]",
542
+ "[XXXXX539]",
543
+ "[XXXXX540]",
544
+ "[XXXXX541]",
545
+ "[XXXXX542]",
546
+ "[XXXXX543]",
547
+ "[XXXXX544]",
548
+ "[XXXXX545]",
549
+ "[XXXXX546]",
550
+ "[XXXXX547]",
551
+ "[XXXXX548]",
552
+ "[XXXXX549]",
553
+ "[XXXXX550]",
554
+ "[XXXXX551]",
555
+ "[XXXXX552]",
556
+ "[XXXXX553]",
557
+ "[XXXXX554]",
558
+ "[XXXXX555]",
559
+ "[XXXXX556]",
560
+ "[XXXXX557]",
561
+ "[XXXXX558]",
562
+ "[XXXXX559]",
563
+ "[XXXXX560]",
564
+ "[XXXXX561]",
565
+ "[XXXXX562]",
566
+ "[XXXXX563]",
567
+ "[XXXXX564]",
568
+ "[XXXXX565]",
569
+ "[XXXXX566]",
570
+ "[XXXXX567]",
571
+ "[XXXXX568]",
572
+ "[XXXXX569]",
573
+ "[XXXXX570]",
574
+ "[XXXXX571]",
575
+ "[XXXXX572]",
576
+ "[XXXXX573]",
577
+ "[XXXXX574]",
578
+ "[XXXXX575]",
579
+ "[XXXXX576]",
580
+ "[XXXXX577]",
581
+ "[XXXXX578]",
582
+ "[XXXXX579]",
583
+ "[XXXXX580]",
584
+ "[XXXXX581]",
585
+ "[XXXXX582]",
586
+ "[XXXXX583]",
587
+ "[XXXXX584]",
588
+ "[XXXXX585]",
589
+ "[XXXXX586]",
590
+ "[XXXXX587]",
591
+ "[XXXXX588]",
592
+ "[XXXXX589]",
593
+ "[XXXXX590]",
594
+ "[XXXXX591]",
595
+ "[XXXXX592]",
596
+ "[XXXXX593]",
597
+ "[XXXXX594]",
598
+ "[XXXXX595]",
599
+ "[XXXXX596]",
600
+ "[XXXXX597]",
601
+ "[XXXXX598]",
602
+ "[XXXXX599]",
603
+ "[XXXXX600]",
604
+ "[XXXXX601]",
605
+ "[XXXXX602]",
606
+ "[XXXXX603]",
607
+ "[XXXXX604]",
608
+ "[XXXXX605]",
609
+ "[XXXXX606]",
610
+ "[XXXXX607]",
611
+ "[XXXXX608]",
612
+ "[XXXXX609]",
613
+ "[XXXXX610]",
614
+ "[XXXXX611]",
615
+ "[XXXXX612]",
616
+ "[XXXXX613]",
617
+ "[XXXXX614]",
618
+ "[XXXXX615]",
619
+ "[XXXXX616]",
620
+ "[XXXXX617]",
621
+ "[XXXXX618]",
622
+ "[XXXXX619]",
623
+ "[XXXXX620]",
624
+ "[XXXXX621]",
625
+ "[XXXXX622]",
626
+ "[XXXXX623]",
627
+ "[XXXXX624]",
628
+ "[XXXXX625]",
629
+ "[XXXXX626]",
630
+ "[XXXXX627]",
631
+ "[XXXXX628]",
632
+ "[XXXXX629]",
633
+ "[XXXXX630]",
634
+ "[XXXXX631]",
635
+ "[XXXXX632]",
636
+ "[XXXXX633]",
637
+ "[XXXXX634]",
638
+ "[XXXXX635]",
639
+ "[XXXXX636]",
640
+ "[XXXXX637]",
641
+ "[XXXXX638]",
642
+ "[XXXXX639]",
643
+ "[XXXXX640]",
644
+ "[XXXXX641]",
645
+ "[XXXXX642]",
646
+ "[XXXXX643]",
647
+ "[XXXXX644]",
648
+ "[XXXXX645]",
649
+ "[XXXXX646]",
650
+ "[XXXXX647]",
651
+ "[XXXXX648]",
652
+ "[XXXXX649]",
653
+ "[XXXXX650]",
654
+ "[XXXXX651]",
655
+ "[XXXXX652]",
656
+ "[XXXXX653]",
657
+ "[XXXXX654]",
658
+ "[XXXXX655]",
659
+ "[XXXXX656]",
660
+ "[XXXXX657]",
661
+ "[XXXXX658]",
662
+ "[XXXXX659]",
663
+ "[XXXXX660]",
664
+ "[XXXXX661]",
665
+ "[XXXXX662]",
666
+ "[XXXXX663]",
667
+ "[XXXXX664]",
668
+ "[XXXXX665]",
669
+ "[XXXXX666]",
670
+ "[XXXXX667]",
671
+ "[XXXXX668]",
672
+ "[XXXXX669]",
673
+ "[XXXXX670]",
674
+ "[XXXXX671]",
675
+ "[XXXXX672]",
676
+ "[XXXXX673]",
677
+ "[XXXXX674]",
678
+ "[XXXXX675]",
679
+ "[XXXXX676]",
680
+ "[XXXXX677]",
681
+ "[XXXXX678]",
682
+ "[XXXXX679]",
683
+ "[XXXXX680]",
684
+ "[XXXXX681]",
685
+ "[XXXXX682]",
686
+ "[XXXXX683]",
687
+ "[XXXXX684]",
688
+ "[XXXXX685]",
689
+ "[XXXXX686]",
690
+ "[XXXXX687]",
691
+ "[XXXXX688]",
692
+ "[XXXXX689]",
693
+ "[XXXXX690]",
694
+ "[XXXXX691]",
695
+ "[XXXXX692]",
696
+ "[XXXXX693]",
697
+ "[XXXXX694]",
698
+ "[XXXXX695]",
699
+ "[XXXXX696]",
700
+ "[XXXXX697]",
701
+ "[XXXXX698]",
702
+ "[XXXXX699]",
703
+ "[XXXXX700]",
704
+ "[XXXXX701]",
705
+ "[XXXXX702]",
706
+ "[XXXXX703]",
707
+ "[XXXXX704]",
708
+ "[XXXXX705]",
709
+ "[XXXXX706]",
710
+ "[XXXXX707]",
711
+ "[XXXXX708]",
712
+ "[XXXXX709]",
713
+ "[XXXXX710]",
714
+ "[XXXXX711]",
715
+ "[XXXXX712]",
716
+ "[XXXXX713]",
717
+ "[XXXXX714]",
718
+ "[XXXXX715]",
719
+ "[XXXXX716]",
720
+ "[XXXXX717]",
721
+ "[XXXXX718]",
722
+ "[XXXXX719]",
723
+ "[XXXXX720]",
724
+ "[XXXXX721]",
725
+ "[XXXXX722]",
726
+ "[XXXXX723]",
727
+ "[XXXXX724]",
728
+ "[XXXXX725]",
729
+ "[XXXXX726]",
730
+ "[XXXXX727]",
731
+ "[XXXXX728]",
732
+ "[XXXXX729]",
733
+ "[XXXXX730]",
734
+ "[XXXXX731]",
735
+ "[XXXXX732]",
736
+ "[XXXXX733]",
737
+ "[XXXXX734]",
738
+ "[XXXXX735]",
739
+ "[XXXXX736]",
740
+ "[XXXXX737]",
741
+ "[XXXXX738]",
742
+ "[XXXXX739]",
743
+ "[XXXXX740]",
744
+ "[XXXXX741]",
745
+ "[XXXXX742]",
746
+ "[XXXXX743]",
747
+ "[XXXXX744]",
748
+ "[XXXXX745]",
749
+ "[XXXXX746]",
750
+ "[XXXXX747]",
751
+ "[XXXXX748]",
752
+ "[XXXXX749]",
753
+ "[XXXXX750]",
754
+ "[XXXXX751]",
755
+ "[XXXXX752]",
756
+ "[XXXXX753]",
757
+ "[XXXXX754]",
758
+ "[XXXXX755]",
759
+ "[XXXXX756]",
760
+ "[XXXXX757]",
761
+ "[XXXXX758]",
762
+ "[XXXXX759]",
763
+ "[XXXXX760]",
764
+ "[XXXXX761]",
765
+ "[XXXXX762]",
766
+ "[XXXXX763]",
767
+ "[XXXXX764]",
768
+ "[XXXXX765]",
769
+ "[XXXXX766]",
770
+ "[XXXXX767]",
771
+ "[XXXXX768]",
772
+ "[XXXXX769]",
773
+ "[XXXXX770]",
774
+ "[XXXXX771]",
775
+ "[XXXXX772]",
776
+ "[XXXXX773]",
777
+ "[XXXXX774]",
778
+ "[XXXXX775]",
779
+ "[XXXXX776]",
780
+ "[XXXXX777]",
781
+ "[XXXXX778]",
782
+ "[XXXXX779]",
783
+ "[XXXXX780]",
784
+ "[XXXXX781]",
785
+ "[XXXXX782]",
786
+ "[XXXXX783]",
787
+ "[XXXXX784]",
788
+ "[XXXXX785]",
789
+ "[XXXXX786]",
790
+ "[XXXXX787]",
791
+ "[XXXXX788]",
792
+ "[XXXXX789]",
793
+ "[XXXXX790]",
794
+ "[XXXXX791]",
795
+ "[XXXXX792]",
796
+ "[XXXXX793]",
797
+ "[XXXXX794]",
798
+ "[XXXXX795]",
799
+ "[XXXXX796]",
800
+ "[XXXXX797]",
801
+ "[XXXXX798]",
802
+ "[XXXXX799]",
803
+ "[XXXXX800]",
804
+ "[XXXXX801]",
805
+ "[XXXXX802]",
806
+ "[XXXXX803]",
807
+ "[XXXXX804]",
808
+ "[XXXXX805]",
809
+ "[XXXXX806]",
810
+ "[XXXXX807]",
811
+ "[XXXXX808]",
812
+ "[XXXXX809]",
813
+ "[XXXXX810]",
814
+ "[XXXXX811]",
815
+ "[XXXXX812]",
816
+ "[XXXXX813]",
817
+ "[XXXXX814]",
818
+ "[XXXXX815]",
819
+ "[XXXXX816]",
820
+ "[XXXXX817]",
821
+ "[XXXXX818]",
822
+ "[XXXXX819]",
823
+ "[XXXXX820]",
824
+ "[XXXXX821]",
825
+ "[XXXXX822]",
826
+ "[XXXXX823]",
827
+ "[XXXXX824]",
828
+ "[XXXXX825]",
829
+ "[XXXXX826]",
830
+ "[XXXXX827]",
831
+ "[XXXXX828]",
832
+ "[XXXXX829]",
833
+ "[XXXXX830]",
834
+ "[XXXXX831]",
835
+ "[XXXXX832]",
836
+ "[XXXXX833]",
837
+ "[XXXXX834]",
838
+ "[XXXXX835]",
839
+ "[XXXXX836]",
840
+ "[XXXXX837]",
841
+ "[XXXXX838]",
842
+ "[XXXXX839]",
843
+ "[XXXXX840]",
844
+ "[XXXXX841]",
845
+ "[XXXXX842]",
846
+ "[XXXXX843]",
847
+ "[XXXXX844]",
848
+ "[XXXXX845]",
849
+ "[XXXXX846]",
850
+ "[XXXXX847]",
851
+ "[XXXXX848]",
852
+ "[XXXXX849]",
853
+ "[XXXXX850]",
854
+ "[XXXXX851]",
855
+ "[XXXXX852]",
856
+ "[XXXXX853]",
857
+ "[XXXXX854]",
858
+ "[XXXXX855]",
859
+ "[XXXXX856]",
860
+ "[XXXXX857]",
861
+ "[XXXXX858]",
862
+ "[XXXXX859]",
863
+ "[XXXXX860]",
864
+ "[XXXXX861]",
865
+ "[XXXXX862]",
866
+ "[XXXXX863]",
867
+ "[XXXXX864]",
868
+ "[XXXXX865]",
869
+ "[XXXXX866]",
870
+ "[XXXXX867]",
871
+ "[XXXXX868]",
872
+ "[XXXXX869]",
873
+ "[XXXXX870]",
874
+ "[XXXXX871]",
875
+ "[XXXXX872]",
876
+ "[XXXXX873]",
877
+ "[XXXXX874]",
878
+ "[XXXXX875]",
879
+ "[XXXXX876]",
880
+ "[XXXXX877]",
881
+ "[XXXXX878]",
882
+ "[XXXXX879]",
883
+ "[XXXXX880]",
884
+ "[XXXXX881]",
885
+ "[XXXXX882]",
886
+ "[XXXXX883]",
887
+ "[XXXXX884]",
888
+ "[XXXXX885]",
889
+ "[XXXXX886]",
890
+ "[XXXXX887]",
891
+ "[XXXXX888]",
892
+ "[XXXXX889]",
893
+ "[XXXXX890]",
894
+ "[XXXXX891]",
895
+ "[XXXXX892]",
896
+ "[XXXXX893]",
897
+ "[XXXXX894]",
898
+ "[XXXXX895]",
899
+ "[XXXXX896]",
900
+ "[XXXXX897]",
901
+ "[XXXXX898]",
902
+ "[XXXXX899]",
903
+ "[XXXXX900]",
904
+ "[XXXXX901]",
905
+ "[XXXXX902]",
906
+ "[XXXXX903]",
907
+ "[XXXXX904]",
908
+ "[XXXXX905]",
909
+ "[XXXXX906]",
910
+ "[XXXXX907]",
911
+ "[XXXXX908]",
912
+ "[XXXXX909]",
913
+ "[XXXXX910]",
914
+ "[XXXXX911]",
915
+ "[XXXXX912]",
916
+ "[XXXXX913]",
917
+ "[XXXXX914]",
918
+ "[XXXXX915]",
919
+ "[XXXXX916]",
920
+ "[XXXXX917]",
921
+ "[XXXXX918]",
922
+ "[XXXXX919]",
923
+ "[XXXXX920]",
924
+ "[XXXXX921]",
925
+ "[XXXXX922]",
926
+ "[XXXXX923]",
927
+ "[XXXXX924]",
928
+ "[XXXXX925]",
929
+ "[XXXXX926]",
930
+ "[XXXXX927]",
931
+ "[XXXXX928]",
932
+ "[XXXXX929]",
933
+ "[XXXXX930]",
934
+ "[XXXXX931]",
935
+ "[XXXXX932]",
936
+ "[XXXXX933]",
937
+ "[XXXXX934]",
938
+ "[XXXXX935]",
939
+ "[XXXXX936]",
940
+ "[XXXXX937]",
941
+ "[XXXXX938]",
942
+ "[XXXXX939]",
943
+ "[XXXXX940]",
944
+ "[XXXXX941]",
945
+ "[XXXXX942]",
946
+ "[XXXXX943]",
947
+ "[XXXXX944]",
948
+ "[XXXXX945]",
949
+ "[XXXXX946]",
950
+ "[XXXXX947]",
951
+ "[XXXXX948]",
952
+ "[XXXXX949]",
953
+ "[XXXXX950]",
954
+ "[XXXXX951]",
955
+ "[XXXXX952]",
956
+ "[XXXXX953]",
957
+ "[XXXXX954]",
958
+ "[XXXXX955]",
959
+ "[XXXXX956]",
960
+ "[XXXXX957]",
961
+ "[XXXXX958]",
962
+ "[XXXXX959]",
963
+ "[XXXXX960]",
964
+ "[XXXXX961]",
965
+ "[XXXXX962]",
966
+ "[XXXXX963]",
967
+ "[XXXXX964]",
968
+ "[XXXXX965]",
969
+ "[XXXXX966]",
970
+ "[XXXXX967]",
971
+ "[XXXXX968]",
972
+ "[XXXXX969]",
973
+ "[XXXXX970]",
974
+ "[XXXXX971]",
975
+ "[XXXXX972]",
976
+ "[XXXXX973]",
977
+ "[XXXXX974]",
978
+ "[XXXXX975]",
979
+ "[XXXXX976]",
980
+ "[XXXXX977]",
981
+ "[XXXXX978]",
982
+ "[XXXXX979]",
983
+ "[XXXXX980]",
984
+ "[XXXXX981]",
985
+ "[XXXXX982]",
986
+ "[XXXXX983]",
987
+ "[XXXXX984]",
988
+ "[XXXXX985]",
989
+ "[XXXXX986]",
990
+ "[XXXXX987]",
991
+ "[XXXXX988]",
992
+ "[XXXXX989]",
993
+ "[XXXXX990]",
994
+ "[XXXXX991]",
995
+ "[XXXXX992]",
996
+ "[XXXXX993]",
997
+ "[XXXXX994]",
998
+ "[XXXXX995]",
999
+ "[XXXXX996]",
1000
+ "[XXXXX997]",
1001
+ "[XXXXX998]",
1002
+ "[XXXXX999]",
1003
+ "[XXXXX1000]",
1004
+ "[XXXXX1001]",
1005
+ "[XXXXX1002]",
1006
+ "[XXXXX1003]",
1007
+ "[XXXXX1004]",
1008
+ "[XXXXX1005]",
1009
+ "[XXXXX1006]",
1010
+ "[XXXXX1007]",
1011
+ "[XXXXX1008]",
1012
+ "[XXXXX1009]",
1013
+ "[XXXXX1010]",
1014
+ "[XXXXX1011]",
1015
+ "[XXXXX1012]",
1016
+ "[XXXXX1013]",
1017
+ "[XXXXX1014]",
1018
+ "[XXXXX1015]",
1019
+ "[XXXXX1016]",
1020
+ "[XXXXX1017]",
1021
+ "[XXXXX1018]",
1022
+ "[XXXXX1019]",
1023
+ "[XXXXX1020]",
1024
+ "[XXXXX1021]",
1025
+ "[XXXXX1022]",
1026
+ "[XXXXX1023]",
1027
+ "[XXXXX1024]",
1028
+ "[XXXXX1025]",
1029
+ "[XXXXX1026]",
1030
+ "[XXXXX1027]",
1031
+ "[XXXXX1028]",
1032
+ "[XXXXX1029]",
1033
+ "[XXXXX1030]",
1034
+ "[XXXXX1031]",
1035
+ "[XXXXX1032]",
1036
+ "[XXXXX1033]",
1037
+ "[XXXXX1034]",
1038
+ "[XXXXX1035]",
1039
+ "[XXXXX1036]",
1040
+ "[XXXXX1037]",
1041
+ "[XXXXX1038]",
1042
+ "[XXXXX1039]",
1043
+ "[XXXXX1040]",
1044
+ "[XXXXX1041]",
1045
+ "[XXXXX1042]",
1046
+ "[XXXXX1043]",
1047
+ "[XXXXX1044]",
1048
+ "[XXXXX1045]",
1049
+ "[XXXXX1046]",
1050
+ "[XXXXX1047]",
1051
+ "[XXXXX1048]",
1052
+ "[XXXXX1049]",
1053
+ "[XXXXX1050]",
1054
+ "[XXXXX1051]",
1055
+ "[XXXXX1052]",
1056
+ "[XXXXX1053]",
1057
+ "[XXXXX1054]",
1058
+ "[XXXXX1055]",
1059
+ "[XXXXX1056]",
1060
+ "[XXXXX1057]",
1061
+ "[XXXXX1058]",
1062
+ "[XXXXX1059]",
1063
+ "[XXXXX1060]",
1064
+ "[XXXXX1061]",
1065
+ "[XXXXX1062]",
1066
+ "[XXXXX1063]",
1067
+ "[XXXXX1064]",
1068
+ "[XXXXX1065]",
1069
+ "[XXXXX1066]",
1070
+ "[XXXXX1067]",
1071
+ "[XXXXX1068]",
1072
+ "[XXXXX1069]",
1073
+ "[XXXXX1070]",
1074
+ "[XXXXX1071]",
1075
+ "[XXXXX1072]",
1076
+ "[XXXXX1073]",
1077
+ "[XXXXX1074]",
1078
+ "[XXXXX1075]",
1079
+ "[XXXXX1076]",
1080
+ "[XXXXX1077]",
1081
+ "[XXXXX1078]",
1082
+ "[XXXXX1079]",
1083
+ "[XXXXX1080]",
1084
+ "[XXXXX1081]",
1085
+ "[XXXXX1082]",
1086
+ "[XXXXX1083]",
1087
+ "[XXXXX1084]",
1088
+ "[XXXXX1085]",
1089
+ "[XXXXX1086]",
1090
+ "[XXXXX1087]",
1091
+ "[XXXXX1088]",
1092
+ "[XXXXX1089]",
1093
+ "[XXXXX1090]",
1094
+ "[XXXXX1091]",
1095
+ "[XXXXX1092]",
1096
+ "[XXXXX1093]",
1097
+ "[XXXXX1094]",
1098
+ "[XXXXX1095]",
1099
+ "[XXXXX1096]",
1100
+ "[XXXXX1097]",
1101
+ "[XXXXX1098]",
1102
+ "[XXXXX1099]",
1103
+ "[XXXXX1100]",
1104
+ "[XXXXX1101]",
1105
+ "[XXXXX1102]",
1106
+ "[XXXXX1103]",
1107
+ "[XXXXX1104]",
1108
+ "[XXXXX1105]",
1109
+ "[XXXXX1106]",
1110
+ "[XXXXX1107]",
1111
+ "[XXXXX1108]",
1112
+ "[XXXXX1109]",
1113
+ "[XXXXX1110]",
1114
+ "[XXXXX1111]",
1115
+ "[XXXXX1112]",
1116
+ "[XXXXX1113]",
1117
+ "[XXXXX1114]",
1118
+ "[XXXXX1115]",
1119
+ "[XXXXX1116]",
1120
+ "[XXXXX1117]",
1121
+ "[XXXXX1118]",
1122
+ "[XXXXX1119]",
1123
+ "[XXXXX1120]",
1124
+ "[XXXXX1121]",
1125
+ "[XXXXX1122]",
1126
+ "[XXXXX1123]",
1127
+ "[XXXXX1124]",
1128
+ "[XXXXX1125]",
1129
+ "[XXXXX1126]",
1130
+ "[XXXXX1127]",
1131
+ "[XXXXX1128]",
1132
+ "[XXXXX1129]",
1133
+ "[XXXXX1130]",
1134
+ "[XXXXX1131]",
1135
+ "[XXXXX1132]",
1136
+ "[XXXXX1133]",
1137
+ "[XXXXX1134]",
1138
+ "[XXXXX1135]",
1139
+ "[XXXXX1136]",
1140
+ "[XXXXX1137]",
1141
+ "[XXXXX1138]",
1142
+ "[XXXXX1139]",
1143
+ "[XXXXX1140]",
1144
+ "[XXXXX1141]",
1145
+ "[XXXXX1142]",
1146
+ "[XXXXX1143]",
1147
+ "[XXXXX1144]",
1148
+ "[XXXXX1145]",
1149
+ "[XXXXX1146]",
1150
+ "[XXXXX1147]",
1151
+ "[XXXXX1148]",
1152
+ "[XXXXX1149]",
1153
+ "[XXXXX1150]",
1154
+ "[XXXXX1151]",
1155
+ "[XXXXX1152]",
1156
+ "[XXXXX1153]",
1157
+ "[XXXXX1154]",
1158
+ "[XXXXX1155]",
1159
+ "[XXXXX1156]",
1160
+ "[XXXXX1157]",
1161
+ "[XXXXX1158]",
1162
+ "[XXXXX1159]",
1163
+ "[XXXXX1160]",
1164
+ "[XXXXX1161]",
1165
+ "[XXXXX1162]",
1166
+ "[XXXXX1163]",
1167
+ "[XXXXX1164]",
1168
+ "[XXXXX1165]",
1169
+ "[XXXXX1166]",
1170
+ "[XXXXX1167]",
1171
+ "[XXXXX1168]",
1172
+ "[XXXXX1169]",
1173
+ "[XXXXX1170]",
1174
+ "[XXXXX1171]",
1175
+ "[XXXXX1172]",
1176
+ "[XXXXX1173]",
1177
+ "[XXXXX1174]",
1178
+ "[XXXXX1175]",
1179
+ "[XXXXX1176]",
1180
+ "[XXXXX1177]",
1181
+ "[XXXXX1178]",
1182
+ "[XXXXX1179]",
1183
+ "[XXXXX1180]",
1184
+ "[XXXXX1181]",
1185
+ "[XXXXX1182]",
1186
+ "[XXXXX1183]",
1187
+ "[XXXXX1184]",
1188
+ "[XXXXX1185]",
1189
+ "[XXXXX1186]",
1190
+ "[XXXXX1187]",
1191
+ "[XXXXX1188]",
1192
+ "[XXXXX1189]",
1193
+ "[XXXXX1190]",
1194
+ "[XXXXX1191]",
1195
+ "[XXXXX1192]",
1196
+ "[XXXXX1193]",
1197
+ "[XXXXX1194]",
1198
+ "[XXXXX1195]"
1199
+ ],
1200
+ "bos_token": {
1201
+ "content": "[CLS]",
1202
+ "lstrip": false,
1203
+ "normalized": false,
1204
+ "rstrip": false,
1205
+ "single_word": false
1206
+ },
1207
+ "cls_token": {
1208
+ "content": "[CLS]",
1209
+ "lstrip": false,
1210
+ "normalized": false,
1211
+ "rstrip": false,
1212
+ "single_word": false
1213
+ },
1214
+ "eos_token": {
1215
+ "content": "[SEP]",
1216
+ "lstrip": false,
1217
+ "normalized": false,
1218
+ "rstrip": false,
1219
+ "single_word": false
1220
+ },
1221
+ "mask_token": {
1222
+ "content": "[MASK]",
1223
+ "lstrip": true,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false
1227
+ },
1228
+ "pad_token": {
1229
+ "content": "<pad>",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false
1234
+ },
1235
+ "sep_token": {
1236
+ "content": "[SEP]",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false
1241
+ },
1242
+ "unk_token": {
1243
+ "content": "<unk>",
1244
+ "lstrip": false,
1245
+ "normalized": false,
1246
+ "rstrip": false,
1247
+ "single_word": false
1248
+ }
1249
+ }
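The 1,196 `[XXXXX0]`…`[XXXXX1195]` entries are placeholders, presumably inherited from the base tokenizer and reserved so that the special tokens round the vocabulary out to the `vocab_size` of 51200 declared in config.json. A quick count, for what it's worth:

```
from transformers import AutoTokenizer
tkz=AutoTokenizer.from_pretrained("KoichiYasuoka/goldfish-gpt2-japanese-1000mb-ud-causal")
print(len(tkz.additional_special_tokens))  # 1196
```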
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
ud.py ADDED
@@ -0,0 +1,142 @@
1
+ import numpy
2
+ from transformers import TokenClassificationPipeline
3
+
4
+ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
5
+ def __init__(self,**kwargs):
6
+ super().__init__(**kwargs)
7
+ x=self.model.config.label2id
8
+ y=[k for k in x if k.startswith("B-") or not (k.startswith("I-") or k.endswith("|root") or k.find("|l-")>0 or k.find("|r-")>0)]
9
+ self.transition=numpy.full((len(x),len(x)),numpy.nan)
10
+ for k,v in x.items():
11
+ for j in ["I-"+k[2:]] if k.startswith("B-") else [k]+y if k.startswith("I-") else y:
12
+ self.transition[v,x[j]]=0
13
+ def check_model_type(self,supported_models):
14
+ pass
15
+ def postprocess(self,model_outputs,**kwargs):
16
+ if "logits" not in model_outputs:
17
+ return self.postprocess(model_outputs[0],**kwargs)
18
+ m=model_outputs["logits"][0].numpy()
19
+ e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))
20
+ z=e/e.sum(axis=-1,keepdims=True)
21
+ for i in range(m.shape[0]-1,0,-1):
22
+ m[i-1]+=numpy.nanmax(m[i]+self.transition,axis=1)
23
+ k=[numpy.nanargmax(m[0]+self.transition[0])]
24
+ for i in range(1,m.shape[0]):
25
+ k.append(numpy.nanargmax(m[i]+self.transition[k[-1]]))
26
+ w=[{"entity":self.model.config.id2label[j],"start":s,"end":e,"score":z[i,j]} for i,((s,e),j) in enumerate(zip(model_outputs["offset_mapping"][0].tolist(),k)) if s<e]
27
+ if "aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none":
28
+ for i,t in reversed(list(enumerate(w))):
29
+ p=t.pop("entity")
30
+ if p.startswith("I-"):
31
+ w[i-1]["score"]=min(w[i-1]["score"],t["score"])
32
+ w[i-1]["end"]=w.pop(i)["end"]
33
+ elif p.startswith("B-"):
34
+ t["entity_group"]=p[2:]
35
+ else:
36
+ t["entity_group"]=p
37
+ for t in w:
38
+ t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
39
+ return w
40
+
41
+ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline):
42
+ def __init__(self,**kwargs):
43
+ kwargs["aggregation_strategy"]="simple"
44
+ super().__init__(**kwargs)
45
+ x=self.model.config.label2id
46
+ self.root=numpy.full((len(x)),numpy.nan)
47
+ self.left_arc=numpy.full((len(x)),numpy.nan)
48
+ self.right_arc=numpy.full((len(x)),numpy.nan)
49
+ for k,v in x.items():
50
+ if k.endswith("|root"):
51
+ self.root[v]=0
52
+ elif k.find("|l-")>0:
53
+ self.left_arc[v]=0
54
+ elif k.find("|r-")>0:
55
+ self.right_arc[v]=0
56
+ def postprocess(self,model_outputs,**kwargs):
57
+ import torch
58
+ if "logits" not in model_outputs:
59
+ return self.postprocess(model_outputs[0],**kwargs)
60
+ m=model_outputs["logits"][0].numpy()
61
+ for i in range(m.shape[0]-1,0,-1):
62
+ m[i-1]+=numpy.nanmax(m[i]+self.transition,axis=1)
63
+ k=[numpy.nanargmax(m[0]+self.transition[0])]
64
+ for i in range(1,m.shape[0]):
65
+ k.append(numpy.nanargmax(m[i]+self.transition[k[-1]]))
66
+ w=[{"entity":self.model.config.id2label[j],"start":s,"end":e} for i,((s,e),j) in enumerate(zip(model_outputs["offset_mapping"][0].tolist(),k)) if s<e]
67
+ for i,t in reversed(list(enumerate(w))):
68
+ p=t.pop("entity")
69
+ if p.startswith("I-"):
70
+ w[i-1]["end"]=max(w.pop(i)["end"],w[i-1]["end"])
71
+ elif i>0 and w[i-1]["end"]>w[i]["start"]:
72
+ w[i-1]["end"]=max(w.pop(i)["end"],w[i-1]["end"])
73
+ elif p.startswith("B-"):
74
+ t["entity_group"]=p[2:]
75
+ else:
76
+ t["entity_group"]=p
77
+ d=[model_outputs["sentence"][t["start"]:t["end"]] for t in w]
78
+ for i in range(len(d)-1,-1,-1):
79
+ if d[i].startswith(" "):
80
+ j=len(d[i])-len(d[i].lstrip())
81
+ d[i]=d[i].lstrip()
82
+ w[i]["start"]+=j
83
+ if d[i].endswith(" "):
84
+ j=len(d[i])-len(d[i].rstrip())
85
+ d[i]=d[i].rstrip()
86
+ w[i]["end"]-=j
87
+ if d[i].strip()=="":
88
+ d.pop(i)
89
+ w.pop(i)
90
+ v=self.tokenizer(d,add_special_tokens=False)
91
+ e=self.model.get_input_embeddings().weight
92
+ m=[]
93
+ for x in v["input_ids"]:
94
+ if x==[]:
95
+ x=[self.tokenizer.unk_token_id]
96
+ m.append(e[x,:].sum(axis=0))
97
+ m.append(e[self.tokenizer.sep_token_id,:])
98
+ m.append(e[self.tokenizer.pad_token_id,:])
99
+ m.append(e[self.tokenizer.cls_token_id,:])
100
+ m=torch.stack(m).to(self.device)
101
+ k=list(range(-1,len(d)+1))
102
+ e=[]
103
+ with torch.no_grad():
104
+ for i in range(len(d)):
105
+ e.append(self.model(inputs_embeds=torch.unsqueeze(m[k+list(range(i,len(d)))+[-2]*i,:],0)).logits[0,-len(d):,:])
106
+ e=torch.stack(e).cpu().numpy()
107
+ for i in range(len(d)):
108
+ for j in range(i):
109
+ e[-j-1,-i-1],e[-i-1,-j-1]=e[-i-1,i-j]+self.left_arc,e[-i-1,i-j]+self.right_arc
110
+ e[-i-1,-i-1]=e[-i-1,0]+self.root
111
+ m,p=numpy.nanmax(e,axis=2),numpy.nanargmax(e,axis=2)
112
+ h=self.chu_liu_edmonds(m)
113
+ z=[i for i,j in enumerate(h) if i==j]
114
+ if len(z)>1:
115
+ k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
116
+ m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
117
+ h=self.chu_liu_edmonds(m)
118
+ q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
119
+ t=model_outputs["sentence"].replace("\n"," ")
120
+ u="# text = "+t+"\n"
121
+ for i,j in enumerate(d):
122
+ u+="\t".join([str(i+1),j,"_",q[i][0],"_","_" if len(q[i])<3 else "|".join(q[i][1:-1]),str(0 if h[i]==i else h[i]+1),"root" if q[i][-1]=="root" else q[i][-1][2:],"_","_" if i+1<len(d) and w[i]["end"]<w[i+1]["start"] else "SpaceAfter=No"])+"\n"
123
+ return u+"\n"
124
+ def chu_liu_edmonds(self,matrix):
125
+ h=numpy.nanargmax(matrix,axis=0)
126
+ x=[-1 if i==j else j for i,j in enumerate(h)]
127
+ for b in [lambda x,i,j:-1 if i not in x else x[i],lambda x,i,j:-1 if j<0 else x[j]]:
128
+ y=[]
129
+ while x!=y:
130
+ y=list(x)
131
+ for i,j in enumerate(x):
132
+ x[i]=b(x,i,j)
133
+ if max(x)<0:
134
+ return h
135
+ y,x=[i for i,j in enumerate(x) if j==max(x)],[i for i,j in enumerate(x) if j<max(x)]
136
+ z=matrix-numpy.nanmax(matrix,axis=0)
137
+ m=numpy.block([[z[x,:][:,x],numpy.nanmax(z[x,:][:,y],axis=1).reshape(len(x),1)],[numpy.nanmax(z[y,:][:,x],axis=0),numpy.nanmax(z[y,y])]])
138
+ k=[j if i==len(x) else x[j] if j<len(x) else y[numpy.nanargmax(z[y,x[i]])] for i,j in enumerate(self.chu_liu_edmonds(m))]
139
+ h=[j if i in y else k[x.index(i)] for i,j in enumerate(h)]
140
+ i=y[numpy.nanargmax(z[x[k[-1]],y] if k[-1]<len(x) else z[y,y])]
141
+ h[i]=x[k[-1]] if k[-1]<len(x) else i
142
+ return h
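ud.py couples two decoders: `BellmanFordTokenClassificationPipeline` runs a max-sum (Viterbi-style) pass over the label logits with a transition matrix whose `nan` entries forbid invalid tag sequences (e.g. an `I-` tag that does not follow its `B-`), and `UniversalDependenciesCausalPipeline` then scores every head candidate per word and extracts a tree with `chu_liu_edmonds`, where `matrix[i,j]` holds the score of token `i` heading token `j` and `h[j]==j` marks the root. A toy, self-contained sketch of the same max-sum decoding — the three-label scheme and all scores below are made up for illustration:

```
import numpy
labels=["B-X","I-X","O"]
# transition[a,b]=0 if label b may follow label a, nan otherwise,
# mirroring how BellmanFordTokenClassificationPipeline.__init__ builds it
transition=numpy.full((3,3),numpy.nan)
transition[0,1]=0.0      # B-X -> I-X only
transition[1,:]=0.0      # I-X -> anything
transition[2,[0,2]]=0.0  # O   -> B-X or O
m=numpy.array([[2.,0.,1.],[0.,3.,1.],[1.,0.,2.],[0.,2.,1.]])  # per-token label scores
for i in range(m.shape[0]-1,0,-1):            # backward max-sum pass
  m[i-1]+=numpy.nanmax(m[i]+transition,axis=1)
k=[int(numpy.nanargmax(m[0]+transition[2]))]  # the "O" row admits every valid start
for i in range(1,m.shape[0]):                 # greedy forward readout
  k.append(int(numpy.nanargmax(m[i]+transition[k[-1]])))
print([labels[j] for j in k])                 # ['B-X', 'I-X', 'B-X', 'I-X']
```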