abhaskumarsinha
commited on
Commit
•
70d3cae
1
Parent(s):
73f96f1
Upload 21 files
Browse files- subword/.ipynb_checkpoints/encoding-checkpoint.ipynb +700 -0
- subword/__init__.py +0 -0
- subword/__pycache__/__init__.cpython-39.pyc +0 -0
- subword/__pycache__/apply_bpe.cpython-39.pyc +0 -0
- subword/apply_bpe.py +457 -0
- subword/bpe_toy.py +51 -0
- subword/chrF.py +139 -0
- subword/dataset/codec.txt +0 -0
- subword/encoding.ipynb +700 -0
- subword/get_vocab.py +87 -0
- subword/learn_bpe.py +372 -0
- subword/learn_joint_bpe_and_vocab.py +166 -0
- subword/segment_char_ngrams.py +95 -0
- subword/subword_nmt.py +97 -0
- subword/tests/__init__.py +0 -0
- subword/tests/data/.gitignore +1 -0
- subword/tests/data/bpe.ref +1001 -0
- subword/tests/data/corpus.bpe.ref.en +0 -0
- subword/tests/data/corpus.en +0 -0
- subword/tests/test_bpe.py +83 -0
- subword/tests/test_glossaries.py +137 -0
subword/.ipynb_checkpoints/encoding-checkpoint.ipynb
ADDED
@@ -0,0 +1,700 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 11,
|
6 |
+
"id": "9644db35",
|
7 |
+
"metadata": {
|
8 |
+
"scrolled": true
|
9 |
+
},
|
10 |
+
"outputs": [
|
11 |
+
{
|
12 |
+
"name": "stderr",
|
13 |
+
"output_type": "stream",
|
14 |
+
"text": [
|
15 |
+
"\n",
|
16 |
+
" 0%| | 0/20000 [00:00<?, ?it/s]\n",
|
17 |
+
" 0%| | 1/20000 [00:00<38:40, 8.62it/s]\n",
|
18 |
+
" 0%| | 2/20000 [00:00<1:31:59, 3.62it/s]\n",
|
19 |
+
" 0%| | 3/20000 [00:00<1:21:11, 4.11it/s]\n",
|
20 |
+
" 0%| | 4/20000 [00:01<1:48:20, 3.08it/s]\n",
|
21 |
+
" 0%| | 6/20000 [00:01<1:03:27, 5.25it/s]\n",
|
22 |
+
" 0%| | 7/20000 [00:01<1:12:17, 4.61it/s]\n",
|
23 |
+
" 0%| | 8/20000 [00:01<1:10:13, 4.74it/s]\n",
|
24 |
+
" 0%| | 10/20000 [00:02<1:09:39, 4.78it/s]\n",
|
25 |
+
" 0%| | 13/20000 [00:02<42:59, 7.75it/s] \n",
|
26 |
+
" 0%| | 16/20000 [00:02<30:25, 10.95it/s]\n",
|
27 |
+
" 0%| | 19/20000 [00:02<24:50, 13.41it/s]\n",
|
28 |
+
" 0%| | 21/20000 [00:03<36:58, 9.01it/s]\n",
|
29 |
+
" 0%| | 23/20000 [00:03<35:20, 9.42it/s]\n",
|
30 |
+
" 0%| | 25/20000 [00:03<31:30, 10.56it/s]\n",
|
31 |
+
" 0%| | 27/20000 [00:03<33:12, 10.03it/s]\n",
|
32 |
+
" 0%| | 29/20000 [00:03<33:30, 9.93it/s]\n",
|
33 |
+
" 0%| | 31/20000 [00:03<35:06, 9.48it/s]\n",
|
34 |
+
" 0%| | 33/20000 [00:04<37:03, 8.98it/s]\n",
|
35 |
+
" 0%| | 37/20000 [00:04<26:59, 12.32it/s]\n",
|
36 |
+
" 0%| | 39/20000 [00:04<26:54, 12.37it/s]\n",
|
37 |
+
" 0%| | 42/20000 [00:04<22:51, 14.55it/s]\n",
|
38 |
+
" 0%| | 46/20000 [00:04<19:15, 17.27it/s]\n",
|
39 |
+
" 0%| | 48/20000 [00:05<22:34, 14.73it/s]\n",
|
40 |
+
" 0%| | 50/20000 [00:05<23:39, 14.06it/s]\n",
|
41 |
+
" 0%| | 52/20000 [00:05<23:30, 14.14it/s]\n",
|
42 |
+
" 0%| | 55/20000 [00:05<20:12, 16.45it/s]\n",
|
43 |
+
" 0%| | 58/20000 [00:05<19:09, 17.35it/s]\n",
|
44 |
+
" 0%| | 60/20000 [00:05<19:25, 17.11it/s]\n",
|
45 |
+
" 0%| | 63/20000 [00:06<28:45, 11.56it/s]\n",
|
46 |
+
" 0%| | 70/20000 [00:06<16:14, 20.45it/s]\n",
|
47 |
+
" 0%| | 75/20000 [00:06<14:15, 23.28it/s]\n",
|
48 |
+
" 0%| | 78/20000 [00:06<14:28, 22.94it/s]\n",
|
49 |
+
" 0%| | 81/20000 [00:06<16:06, 20.62it/s]\n",
|
50 |
+
" 0%| | 85/20000 [00:06<15:13, 21.81it/s]\n",
|
51 |
+
" 0%| | 88/20000 [00:07<15:39, 21.20it/s]\n",
|
52 |
+
" 0%| | 92/20000 [00:07<13:28, 24.61it/s]\n",
|
53 |
+
" 0%| | 95/20000 [00:07<13:32, 24.50it/s]\n",
|
54 |
+
" 0%| | 99/20000 [00:07<12:06, 27.40it/s]\n",
|
55 |
+
" 1%| | 102/20000 [00:07<12:55, 25.65it/s]\n",
|
56 |
+
" 1%| | 105/20000 [00:07<12:46, 25.95it/s]\n",
|
57 |
+
" 1%| | 112/20000 [00:07<12:02, 27.54it/s]\n",
|
58 |
+
" 1%| | 118/20000 [00:08<10:02, 33.00it/s]\n",
|
59 |
+
" 1%| | 122/20000 [00:08<10:12, 32.46it/s]\n",
|
60 |
+
" 1%| | 127/20000 [00:08<10:07, 32.73it/s]\n",
|
61 |
+
" 1%| | 138/20000 [00:08<06:56, 47.66it/s]\n",
|
62 |
+
" 1%| | 144/20000 [00:08<06:47, 48.73it/s]\n",
|
63 |
+
" 1%| | 150/20000 [00:08<07:00, 47.21it/s]\n",
|
64 |
+
" 1%| | 156/20000 [00:08<06:44, 49.01it/s]\n",
|
65 |
+
" 1%| | 162/20000 [00:08<06:47, 48.71it/s]\n",
|
66 |
+
" 1%| | 169/20000 [00:09<06:38, 49.81it/s]\n",
|
67 |
+
" 1%| | 176/20000 [00:09<06:28, 51.03it/s]\n",
|
68 |
+
" 1%| | 184/20000 [00:09<05:58, 55.22it/s]\n",
|
69 |
+
" 1%| | 190/20000 [00:09<08:05, 40.78it/s]\n",
|
70 |
+
" 1%| | 197/20000 [00:09<07:30, 43.91it/s]\n",
|
71 |
+
" 1%|1 | 202/20000 [00:09<07:53, 41.79it/s]\n",
|
72 |
+
" 1%|1 | 210/20000 [00:10<06:51, 48.15it/s]\n",
|
73 |
+
" 1%|1 | 220/20000 [00:10<05:35, 59.01it/s]\n",
|
74 |
+
" 1%|1 | 229/20000 [00:10<05:05, 64.72it/s]\n",
|
75 |
+
" 1%|1 | 236/20000 [00:10<05:37, 58.47it/s]\n",
|
76 |
+
" 1%|1 | 244/20000 [00:10<05:22, 61.18it/s]\n",
|
77 |
+
" 1%|1 | 251/20000 [00:10<05:31, 59.65it/s]\n",
|
78 |
+
" 1%|1 | 259/20000 [00:10<05:11, 63.33it/s]\n",
|
79 |
+
" 1%|1 | 266/20000 [00:10<05:25, 60.70it/s]\n",
|
80 |
+
" 1%|1 | 273/20000 [00:10<05:31, 59.42it/s]\n",
|
81 |
+
" 1%|1 | 282/20000 [00:11<04:57, 66.26it/s]\n",
|
82 |
+
" 1%|1 | 289/20000 [00:11<05:00, 65.52it/s]\n",
|
83 |
+
" 1%|1 | 296/20000 [00:11<05:10, 63.47it/s]\n",
|
84 |
+
" 2%|1 | 303/20000 [00:11<07:06, 46.23it/s]\n",
|
85 |
+
" 2%|1 | 313/20000 [00:11<05:41, 57.60it/s]\n",
|
86 |
+
" 2%|1 | 324/20000 [00:11<04:54, 66.87it/s]\n",
|
87 |
+
" 2%|1 | 335/20000 [00:11<04:16, 76.56it/s]\n",
|
88 |
+
" 2%|1 | 346/20000 [00:11<03:50, 85.09it/s]\n",
|
89 |
+
" 2%|1 | 357/20000 [00:12<03:38, 90.06it/s]\n",
|
90 |
+
" 2%|1 | 367/20000 [00:12<03:46, 86.85it/s]\n",
|
91 |
+
" 2%|1 | 377/20000 [00:12<03:41, 88.70it/s]\n",
|
92 |
+
" 2%|1 | 387/20000 [00:12<03:43, 87.95it/s]\n",
|
93 |
+
" 2%|1 | 396/20000 [00:12<03:49, 85.48it/s]\n",
|
94 |
+
" 2%|2 | 405/20000 [00:12<04:04, 80.15it/s]\n",
|
95 |
+
" 2%|2 | 416/20000 [00:12<03:42, 87.82it/s]\n",
|
96 |
+
" 2%|2 | 429/20000 [00:12<03:19, 98.03it/s]\n",
|
97 |
+
" 2%|2 | 439/20000 [00:13<03:35, 90.74it/s]\n",
|
98 |
+
" 2%|2 | 450/20000 [00:13<03:24, 95.65it/s]\n",
|
99 |
+
" 2%|2 | 462/20000 [00:13<03:13, 100.98it/s]\n",
|
100 |
+
" 2%|2 | 473/20000 [00:13<03:16, 99.60it/s] \n",
|
101 |
+
" 2%|2 | 484/20000 [00:13<03:28, 93.64it/s]\n",
|
102 |
+
" 2%|2 | 494/20000 [00:13<03:30, 92.86it/s]\n",
|
103 |
+
" 3%|2 | 504/20000 [00:13<04:34, 70.99it/s]\n",
|
104 |
+
" 3%|2 | 520/20000 [00:13<03:34, 90.96it/s]\n",
|
105 |
+
" 3%|2 | 534/20000 [00:14<03:12, 101.38it/s]\n",
|
106 |
+
" 3%|2 | 547/20000 [00:14<03:01, 107.03it/s]\n",
|
107 |
+
" 3%|2 | 559/20000 [00:14<03:02, 106.25it/s]\n",
|
108 |
+
" 3%|2 | 571/20000 [00:14<03:14, 99.81it/s] \n",
|
109 |
+
" 3%|2 | 582/20000 [00:14<03:13, 100.11it/s]\n",
|
110 |
+
" 3%|2 | 595/20000 [00:14<03:03, 105.49it/s]\n",
|
111 |
+
" 3%|3 | 606/20000 [00:14<03:07, 103.63it/s]\n",
|
112 |
+
" 3%|3 | 625/20000 [00:14<02:33, 126.08it/s]\n",
|
113 |
+
" 3%|3 | 643/20000 [00:14<02:17, 140.29it/s]\n",
|
114 |
+
" 3%|3 | 658/20000 [00:15<02:23, 135.01it/s]\n",
|
115 |
+
" 3%|3 | 672/20000 [00:15<02:32, 126.59it/s]\n",
|
116 |
+
" 3%|3 | 685/20000 [00:15<02:42, 119.19it/s]\n",
|
117 |
+
" 3%|3 | 698/20000 [00:15<02:46, 116.22it/s]\n",
|
118 |
+
" 4%|3 | 710/20000 [00:15<02:49, 113.91it/s]\n",
|
119 |
+
" 4%|3 | 727/20000 [00:15<02:31, 127.58it/s]\n",
|
120 |
+
" 4%|3 | 744/20000 [00:15<02:18, 139.24it/s]\n",
|
121 |
+
" 4%|3 | 759/20000 [00:15<03:10, 101.19it/s]\n",
|
122 |
+
" 4%|3 | 771/20000 [00:16<03:03, 104.67it/s]\n",
|
123 |
+
" 4%|3 | 783/20000 [00:16<03:02, 105.07it/s]\n",
|
124 |
+
" 4%|3 | 795/20000 [00:16<03:14, 98.92it/s] \n",
|
125 |
+
" 4%|4 | 807/20000 [00:16<03:06, 102.82it/s]\n",
|
126 |
+
" 4%|4 | 822/20000 [00:16<02:50, 112.34it/s]\n",
|
127 |
+
" 4%|4 | 834/20000 [00:16<02:55, 109.45it/s]\n",
|
128 |
+
" 4%|4 | 847/20000 [00:16<02:47, 114.28it/s]\n",
|
129 |
+
" 4%|4 | 860/20000 [00:16<02:42, 117.94it/s]\n",
|
130 |
+
" 4%|4 | 873/20000 [00:16<02:46, 114.58it/s]\n",
|
131 |
+
" 4%|4 | 885/20000 [00:17<02:58, 106.97it/s]\n",
|
132 |
+
" 4%|4 | 896/20000 [00:17<03:07, 102.08it/s]\n",
|
133 |
+
" 5%|4 | 908/20000 [00:17<03:01, 105.42it/s]\n",
|
134 |
+
" 5%|4 | 924/20000 [00:17<02:42, 117.67it/s]\n",
|
135 |
+
" 5%|4 | 940/20000 [00:17<02:28, 128.26it/s]\n",
|
136 |
+
" 5%|4 | 954/20000 [00:17<02:24, 131.54it/s]\n",
|
137 |
+
" 5%|4 | 968/20000 [00:17<02:34, 123.37it/s]\n",
|
138 |
+
" 5%|4 | 982/20000 [00:17<02:31, 125.85it/s]\n",
|
139 |
+
" 5%|4 | 995/20000 [00:18<02:39, 119.06it/s]\n",
|
140 |
+
" 5%|5 | 1008/20000 [00:18<03:38, 86.92it/s]\n",
|
141 |
+
" 5%|5 | 1024/20000 [00:18<03:04, 102.72it/s]\n",
|
142 |
+
" 5%|5 | 1041/20000 [00:18<02:40, 118.03it/s]\n",
|
143 |
+
" 5%|5 | 1055/20000 [00:18<02:34, 122.63it/s]\n",
|
144 |
+
" 5%|5 | 1069/20000 [00:18<02:31, 124.89it/s]\n",
|
145 |
+
" 5%|5 | 1083/20000 [00:18<02:34, 122.68it/s]\n",
|
146 |
+
" 5%|5 | 1096/20000 [00:18<02:39, 118.60it/s]\n",
|
147 |
+
" 6%|5 | 1110/20000 [00:19<02:32, 123.65it/s]\n",
|
148 |
+
" 6%|5 | 1127/20000 [00:19<02:18, 136.02it/s]\n",
|
149 |
+
" 6%|5 | 1145/20000 [00:19<02:08, 146.74it/s]\n",
|
150 |
+
" 6%|5 | 1161/20000 [00:19<02:06, 148.82it/s]\n",
|
151 |
+
" 6%|5 | 1177/20000 [00:19<02:04, 151.15it/s]\n",
|
152 |
+
" 6%|5 | 1193/20000 [00:19<02:09, 145.70it/s]\n",
|
153 |
+
" 6%|6 | 1208/20000 [00:19<02:10, 144.47it/s]\n",
|
154 |
+
" 6%|6 | 1227/20000 [00:19<02:00, 156.04it/s]\n",
|
155 |
+
" 6%|6 | 1244/20000 [00:19<01:57, 159.13it/s]\n",
|
156 |
+
" 6%|6 | 1261/20000 [00:19<02:01, 154.24it/s]\n",
|
157 |
+
" 6%|6 | 1277/20000 [00:20<02:09, 145.11it/s]\n",
|
158 |
+
" 6%|6 | 1292/20000 [00:20<02:10, 143.32it/s]\n",
|
159 |
+
" 7%|6 | 1307/20000 [00:20<03:02, 102.65it/s]\n",
|
160 |
+
" 7%|6 | 1330/20000 [00:20<02:23, 130.14it/s]\n",
|
161 |
+
" 7%|6 | 1348/20000 [00:20<02:13, 139.97it/s]\n",
|
162 |
+
" 7%|6 | 1368/20000 [00:20<02:02, 152.70it/s]\n",
|
163 |
+
" 7%|6 | 1385/20000 [00:20<02:00, 153.99it/s]\n",
|
164 |
+
" 7%|7 | 1402/20000 [00:21<02:07, 146.16it/s]\n",
|
165 |
+
" 7%|7 | 1423/20000 [00:21<01:55, 161.53it/s]\n",
|
166 |
+
" 7%|7 | 1441/20000 [00:21<01:52, 165.17it/s]\n",
|
167 |
+
" 7%|7 | 1459/20000 [00:21<01:55, 160.82it/s]\n",
|
168 |
+
" 7%|7 | 1476/20000 [00:21<02:03, 149.82it/s]\n",
|
169 |
+
" 7%|7 | 1492/20000 [00:21<02:08, 143.79it/s]\n",
|
170 |
+
" 8%|7 | 1507/20000 [00:21<02:10, 142.06it/s]\n",
|
171 |
+
" 8%|7 | 1530/20000 [00:21<01:52, 164.72it/s]\n",
|
172 |
+
" 8%|7 | 1548/20000 [00:21<01:50, 167.09it/s]\n",
|
173 |
+
" 8%|7 | 1565/20000 [00:22<01:49, 167.90it/s]\n",
|
174 |
+
" 8%|7 | 1582/20000 [00:22<01:53, 161.57it/s]\n",
|
175 |
+
" 8%|7 | 1599/20000 [00:22<01:56, 158.15it/s]\n",
|
176 |
+
" 8%|8 | 1617/20000 [00:22<01:51, 164.25it/s]\n",
|
177 |
+
" 8%|8 | 1637/20000 [00:22<01:45, 174.45it/s]\n",
|
178 |
+
" 8%|8 | 1657/20000 [00:22<01:41, 181.32it/s]\n",
|
179 |
+
" 8%|8 | 1676/20000 [00:22<01:40, 182.25it/s]\n",
|
180 |
+
" 8%|8 | 1695/20000 [00:22<01:46, 171.94it/s]\n",
|
181 |
+
" 9%|8 | 1718/20000 [00:22<01:38, 186.12it/s]\n",
|
182 |
+
" 9%|8 | 1739/20000 [00:22<01:34, 192.48it/s]\n",
|
183 |
+
" 9%|8 | 1759/20000 [00:23<02:13, 136.76it/s]\n",
|
184 |
+
" 9%|8 | 1777/20000 [00:23<02:04, 145.80it/s]\n",
|
185 |
+
" 9%|8 | 1794/20000 [00:23<02:04, 146.68it/s]\n",
|
186 |
+
" 9%|9 | 1814/20000 [00:23<01:53, 159.63it/s]\n",
|
187 |
+
" 9%|9 | 1836/20000 [00:23<01:43, 175.04it/s]\n",
|
188 |
+
" 9%|9 | 1856/20000 [00:23<01:41, 179.30it/s]\n",
|
189 |
+
" 9%|9 | 1875/20000 [00:23<01:42, 176.01it/s]\n",
|
190 |
+
" 9%|9 | 1894/20000 [00:23<01:45, 171.34it/s]\n",
|
191 |
+
" 10%|9 | 1915/20000 [00:24<01:39, 180.93it/s]\n",
|
192 |
+
" 10%|9 | 1937/20000 [00:24<01:34, 190.79it/s]\n",
|
193 |
+
" 10%|9 | 1957/20000 [00:24<01:35, 189.63it/s]\n",
|
194 |
+
" 10%|9 | 1977/20000 [00:24<01:36, 186.73it/s]\n",
|
195 |
+
" 10%|9 | 1996/20000 [00:24<01:42, 175.72it/s]\n",
|
196 |
+
" 10%|# | 2018/20000 [00:24<01:35, 187.87it/s]\n",
|
197 |
+
" 10%|# | 2046/20000 [00:24<01:24, 212.03it/s]\n",
|
198 |
+
" 10%|# | 2068/20000 [00:24<01:27, 204.39it/s]\n",
|
199 |
+
" 10%|# | 2089/20000 [00:24<01:31, 195.56it/s]\n",
|
200 |
+
" 11%|# | 2109/20000 [00:25<01:33, 192.02it/s]\n",
|
201 |
+
" 11%|# | 2140/20000 [00:25<01:19, 224.10it/s]\n",
|
202 |
+
" 11%|# | 2165/20000 [00:25<01:17, 230.78it/s]\n",
|
203 |
+
" 11%|# | 2189/20000 [00:25<01:18, 225.64it/s]\n",
|
204 |
+
" 11%|#1 | 2212/20000 [00:25<01:24, 210.15it/s]\n",
|
205 |
+
" 11%|#1 | 2236/20000 [00:25<01:21, 217.71it/s]\n",
|
206 |
+
" 11%|#1 | 2259/20000 [00:25<01:22, 215.12it/s]\n",
|
207 |
+
" 11%|#1 | 2281/20000 [00:25<01:24, 208.87it/s]\n",
|
208 |
+
" 12%|#1 | 2303/20000 [00:25<01:35, 185.14it/s]\n",
|
209 |
+
" 12%|#1 | 2333/20000 [00:26<01:22, 213.67it/s]\n",
|
210 |
+
" 12%|#1 | 2357/20000 [00:26<01:19, 220.73it/s]\n",
|
211 |
+
" 12%|#1 | 2380/20000 [00:26<01:21, 214.95it/s]\n",
|
212 |
+
" 12%|#2 | 2402/20000 [00:26<02:03, 142.71it/s]\n",
|
213 |
+
" 12%|#2 | 2432/20000 [00:26<01:40, 174.08it/s]\n",
|
214 |
+
" 12%|#2 | 2459/20000 [00:26<01:29, 195.81it/s]\n",
|
215 |
+
" 12%|#2 | 2482/20000 [00:26<01:28, 198.82it/s]\n",
|
216 |
+
" 13%|#2 | 2505/20000 [00:27<01:29, 195.33it/s]\n",
|
217 |
+
" 13%|#2 | 2538/20000 [00:27<01:16, 228.52it/s]\n",
|
218 |
+
" 13%|#2 | 2566/20000 [00:27<01:11, 242.22it/s]\n",
|
219 |
+
" 13%|#2 | 2592/20000 [00:27<01:15, 230.01it/s]\n",
|
220 |
+
" 13%|#3 | 2620/20000 [00:27<01:11, 243.40it/s]\n",
|
221 |
+
" 13%|#3 | 2651/20000 [00:27<01:06, 261.84it/s]\n",
|
222 |
+
" 13%|#3 | 2678/20000 [00:27<01:06, 260.46it/s]\n",
|
223 |
+
" 14%|#3 | 2705/20000 [00:27<01:08, 252.37it/s]\n",
|
224 |
+
" 14%|#3 | 2740/20000 [00:27<01:02, 278.24it/s]\n",
|
225 |
+
" 14%|#3 | 2769/20000 [00:27<01:05, 264.95it/s]\n",
|
226 |
+
" 14%|#3 | 2796/20000 [00:28<01:09, 247.16it/s]\n",
|
227 |
+
" 14%|#4 | 2828/20000 [00:28<01:04, 264.60it/s]\n",
|
228 |
+
" 14%|#4 | 2855/20000 [00:28<01:05, 260.34it/s]\n",
|
229 |
+
" 14%|#4 | 2882/20000 [00:28<01:09, 247.20it/s]\n",
|
230 |
+
" 15%|#4 | 2908/20000 [00:28<01:12, 236.53it/s]\n",
|
231 |
+
" 15%|#4 | 2952/20000 [00:28<00:58, 291.10it/s]\n",
|
232 |
+
" 15%|#4 | 2982/20000 [00:28<01:03, 266.27it/s]\n",
|
233 |
+
" 15%|#5 | 3010/20000 [00:28<01:03, 267.07it/s]\n",
|
234 |
+
" 15%|#5 | 3039/20000 [00:29<01:02, 270.37it/s]\n",
|
235 |
+
" 15%|#5 | 3068/20000 [00:29<01:01, 273.53it/s]\n",
|
236 |
+
" 15%|#5 | 3096/20000 [00:29<01:04, 263.45it/s]\n",
|
237 |
+
" 16%|#5 | 3129/20000 [00:29<00:59, 281.96it/s]\n",
|
238 |
+
" 16%|#5 | 3160/20000 [00:29<00:58, 287.48it/s]\n",
|
239 |
+
" 16%|#5 | 3190/20000 [00:29<01:00, 279.05it/s]\n",
|
240 |
+
" 16%|#6 | 3226/20000 [00:29<00:55, 301.05it/s]\n",
|
241 |
+
" 16%|#6 | 3257/20000 [00:29<00:55, 303.61it/s]\n",
|
242 |
+
" 16%|#6 | 3288/20000 [00:29<00:56, 293.52it/s]\n",
|
243 |
+
" 17%|#6 | 3318/20000 [00:29<00:56, 293.68it/s]\n",
|
244 |
+
" 17%|#6 | 3357/20000 [00:30<00:52, 318.68it/s]\n",
|
245 |
+
" 17%|#6 | 3390/20000 [00:30<00:58, 284.80it/s]\n",
|
246 |
+
" 17%|#7 | 3420/20000 [00:30<01:21, 204.06it/s]\n",
|
247 |
+
" 17%|#7 | 3459/20000 [00:30<01:08, 242.62it/s]\n",
|
248 |
+
" 17%|#7 | 3491/20000 [00:30<01:03, 260.00it/s]\n",
|
249 |
+
" 18%|#7 | 3535/20000 [00:30<00:54, 304.04it/s]\n",
|
250 |
+
" 18%|#7 | 3573/20000 [00:30<00:50, 323.92it/s]\n",
|
251 |
+
" 18%|#8 | 3608/20000 [00:31<00:55, 296.34it/s]\n",
|
252 |
+
" 18%|#8 | 3653/20000 [00:31<00:48, 336.01it/s]\n",
|
253 |
+
" 18%|#8 | 3689/20000 [00:31<00:49, 329.16it/s]\n",
|
254 |
+
" 19%|#8 | 3733/20000 [00:31<00:45, 358.11it/s]\n",
|
255 |
+
" 19%|#8 | 3771/20000 [00:31<00:44, 361.17it/s]\n",
|
256 |
+
" 19%|#9 | 3809/20000 [00:31<00:47, 342.31it/s]\n",
|
257 |
+
" 19%|#9 | 3861/20000 [00:31<00:41, 390.94it/s]\n",
|
258 |
+
" 20%|#9 | 3902/20000 [00:31<00:42, 378.22it/s]\n",
|
259 |
+
" 20%|#9 | 3968/20000 [00:31<00:35, 455.02it/s]\n",
|
260 |
+
" 20%|## | 4015/20000 [00:32<00:37, 427.77it/s]\n",
|
261 |
+
" 20%|## | 4066/20000 [00:32<00:35, 449.03it/s]\n",
|
262 |
+
" 21%|## | 4112/20000 [00:32<00:39, 404.45it/s]\n",
|
263 |
+
" 21%|## | 4174/20000 [00:32<00:34, 458.89it/s]\n",
|
264 |
+
" 21%|##1 | 4222/20000 [00:32<00:35, 442.90it/s]\n",
|
265 |
+
" 21%|##1 | 4271/20000 [00:32<00:34, 454.41it/s]\n",
|
266 |
+
" 22%|##1 | 4329/20000 [00:32<00:32, 489.36it/s]\n",
|
267 |
+
" 22%|##1 | 4387/20000 [00:32<00:30, 515.14it/s]\n",
|
268 |
+
" 22%|##2 | 4447/20000 [00:32<00:28, 538.10it/s]\n",
|
269 |
+
" 23%|##2 | 4502/20000 [00:33<00:32, 478.73it/s]\n",
|
270 |
+
" 23%|##2 | 4563/20000 [00:33<00:30, 512.67it/s]\n",
|
271 |
+
" 23%|##3 | 4616/20000 [00:33<00:30, 496.81it/s]\n",
|
272 |
+
" 23%|##3 | 4677/20000 [00:33<00:29, 527.98it/s]\n",
|
273 |
+
" 24%|##3 | 4733/20000 [00:33<00:28, 537.01it/s]\n",
|
274 |
+
" 24%|##3 | 4788/20000 [00:33<00:28, 534.59it/s]\n",
|
275 |
+
" 24%|##4 | 4864/20000 [00:33<00:25, 599.65it/s]\n",
|
276 |
+
" 25%|##4 | 4925/20000 [00:33<00:25, 595.70it/s]\n",
|
277 |
+
" 25%|##4 | 4994/20000 [00:33<00:24, 617.81it/s]\n",
|
278 |
+
" 25%|##5 | 5079/20000 [00:33<00:21, 683.71it/s]\n",
|
279 |
+
" 26%|##5 | 5148/20000 [00:34<00:35, 419.97it/s]\n",
|
280 |
+
" 26%|##6 | 5203/20000 [00:34<00:33, 446.58it/s]\n",
|
281 |
+
" 26%|##6 | 5289/20000 [00:34<00:27, 538.90it/s]\n",
|
282 |
+
" 27%|##6 | 5377/20000 [00:34<00:23, 622.07it/s]\n",
|
283 |
+
" 27%|##7 | 5471/20000 [00:34<00:20, 703.42it/s]\n",
|
284 |
+
" 28%|##7 | 5549/20000 [00:36<01:35, 150.73it/s]\n",
|
285 |
+
" 28%|##8 | 5606/20000 [00:36<01:37, 147.12it/s]\n",
|
286 |
+
" 28%|##8 | 5650/20000 [00:36<01:34, 151.57it/s]\n",
|
287 |
+
" 28%|##8 | 5686/20000 [00:37<01:33, 153.50it/s]\n",
|
288 |
+
" 29%|##8 | 5716/20000 [00:37<01:32, 154.45it/s]\n",
|
289 |
+
" 29%|##8 | 5742/20000 [00:37<01:29, 158.75it/s]\n",
|
290 |
+
" 29%|##8 | 5766/20000 [00:37<01:28, 160.05it/s]\n",
|
291 |
+
" 29%|##8 | 5788/20000 [00:37<01:29, 159.44it/s]\n",
|
292 |
+
" 29%|##9 | 5808/20000 [00:37<01:29, 158.22it/s]\n",
|
293 |
+
" 29%|##9 | 5827/20000 [00:37<01:27, 162.78it/s]\n",
|
294 |
+
" 29%|##9 | 5846/20000 [00:38<01:25, 165.07it/s]\n",
|
295 |
+
" 29%|##9 | 5864/20000 [00:38<01:25, 164.71it/s]\n",
|
296 |
+
" 29%|##9 | 5882/20000 [00:38<01:26, 162.88it/s]\n",
|
297 |
+
" 29%|##9 | 5899/20000 [00:38<01:30, 155.66it/s]\n",
|
298 |
+
" 30%|##9 | 5916/20000 [00:38<01:29, 158.09it/s]\n",
|
299 |
+
" 30%|##9 | 5935/20000 [00:38<01:24, 166.09it/s]\n",
|
300 |
+
" 30%|##9 | 5954/20000 [00:38<01:22, 169.84it/s]\n",
|
301 |
+
" 30%|##9 | 5972/20000 [00:38<01:21, 171.23it/s]\n",
|
302 |
+
" 30%|##9 | 5990/20000 [00:38<01:22, 170.81it/s]\n",
|
303 |
+
" 30%|### | 6008/20000 [00:39<01:23, 167.70it/s]\n",
|
304 |
+
" 30%|### | 6027/20000 [00:39<01:20, 173.01it/s]\n",
|
305 |
+
" 30%|### | 6046/20000 [00:39<01:19, 175.35it/s]\n",
|
306 |
+
" 30%|### | 6064/20000 [00:39<01:20, 172.23it/s]\n",
|
307 |
+
" 30%|### | 6082/20000 [00:39<01:21, 170.55it/s]\n",
|
308 |
+
" 30%|### | 6100/20000 [00:39<01:23, 167.05it/s]\n",
|
309 |
+
" 31%|### | 6118/20000 [00:39<01:21, 170.70it/s]\n",
|
310 |
+
" 31%|### | 6138/20000 [00:39<01:17, 178.09it/s]\n",
|
311 |
+
" 31%|### | 6157/20000 [00:39<01:16, 179.96it/s]\n",
|
312 |
+
" 31%|### | 6176/20000 [00:39<01:18, 177.21it/s]\n",
|
313 |
+
" 31%|### | 6194/20000 [00:40<01:18, 174.99it/s]\n",
|
314 |
+
" 31%|###1 | 6212/20000 [00:40<01:19, 173.44it/s]\n",
|
315 |
+
" 31%|###1 | 6232/20000 [00:40<01:16, 180.04it/s]\n",
|
316 |
+
" 31%|###1 | 6251/20000 [00:40<01:16, 179.80it/s]\n",
|
317 |
+
" 31%|###1 | 6270/20000 [00:40<01:19, 172.28it/s]\n",
|
318 |
+
" 31%|###1 | 6288/20000 [00:40<01:20, 170.14it/s]\n",
|
319 |
+
" 32%|###1 | 6306/20000 [00:40<01:22, 165.92it/s]\n",
|
320 |
+
" 32%|###1 | 6327/20000 [00:40<01:16, 178.17it/s]\n",
|
321 |
+
" 32%|###1 | 6347/20000 [00:40<01:14, 183.85it/s]\n",
|
322 |
+
" 32%|###1 | 6366/20000 [00:41<01:14, 182.46it/s]\n",
|
323 |
+
" 32%|###1 | 6385/20000 [00:41<01:17, 175.53it/s]\n",
|
324 |
+
" 32%|###2 | 6403/20000 [00:41<01:21, 166.92it/s]\n",
|
325 |
+
" 32%|###2 | 6423/20000 [00:41<01:17, 174.57it/s]\n",
|
326 |
+
" 32%|###2 | 6443/20000 [00:41<01:15, 179.69it/s]\n",
|
327 |
+
" 32%|###2 | 6462/20000 [00:41<01:16, 178.06it/s]\n",
|
328 |
+
" 32%|###2 | 6480/20000 [00:41<01:17, 174.17it/s]\n",
|
329 |
+
" 32%|###2 | 6498/20000 [00:41<01:19, 170.04it/s]\n",
|
330 |
+
" 33%|###2 | 6517/20000 [00:41<01:16, 175.15it/s]\n",
|
331 |
+
" 33%|###2 | 6538/20000 [00:42<01:13, 184.06it/s]\n",
|
332 |
+
" 33%|###2 | 6558/20000 [00:42<01:11, 187.58it/s]\n",
|
333 |
+
" 33%|###2 | 6577/20000 [00:42<01:12, 183.99it/s]\n",
|
334 |
+
" 33%|###2 | 6596/20000 [00:42<01:14, 180.51it/s]\n",
|
335 |
+
" 33%|###3 | 6615/20000 [00:42<01:14, 180.64it/s]\n",
|
336 |
+
" 33%|###3 | 6636/20000 [00:42<01:11, 187.45it/s]\n",
|
337 |
+
" 33%|###3 | 6656/20000 [00:42<01:10, 189.43it/s]\n",
|
338 |
+
" 33%|###3 | 6675/20000 [00:42<01:11, 185.29it/s]\n",
|
339 |
+
" 33%|###3 | 6694/20000 [00:42<01:14, 177.91it/s]\n",
|
340 |
+
" 34%|###3 | 6712/20000 [00:42<01:15, 176.02it/s]\n",
|
341 |
+
" 34%|###3 | 6733/20000 [00:43<01:11, 185.68it/s]\n",
|
342 |
+
" 34%|###3 | 6752/20000 [00:43<01:10, 186.91it/s]\n",
|
343 |
+
" 34%|###3 | 6771/20000 [00:43<01:12, 183.53it/s]\n",
|
344 |
+
" 34%|###3 | 6790/20000 [00:43<01:15, 175.73it/s]\n",
|
345 |
+
" 34%|###4 | 6808/20000 [00:43<01:17, 170.68it/s]\n",
|
346 |
+
" 34%|###4 | 6828/20000 [00:43<01:13, 178.87it/s]\n",
|
347 |
+
" 34%|###4 | 6849/20000 [00:43<01:10, 186.16it/s]\n",
|
348 |
+
" 34%|###4 | 6868/20000 [00:43<01:10, 187.26it/s]\n",
|
349 |
+
" 34%|###4 | 6887/20000 [00:43<01:10, 185.36it/s]\n",
|
350 |
+
" 35%|###4 | 6906/20000 [00:44<01:12, 180.40it/s]\n",
|
351 |
+
" 35%|###4 | 6929/20000 [00:44<01:07, 193.50it/s]\n",
|
352 |
+
" 35%|###4 | 6950/20000 [00:44<01:05, 198.27it/s]\n",
|
353 |
+
" 35%|###4 | 6970/20000 [00:44<01:06, 197.04it/s]\n",
|
354 |
+
" 35%|###4 | 6990/20000 [00:44<01:08, 190.65it/s]\n",
|
355 |
+
" 35%|###5 | 7010/20000 [00:44<01:10, 184.33it/s]\n",
|
356 |
+
" 35%|###5 | 7029/20000 [00:44<01:10, 183.34it/s]\n",
|
357 |
+
" 35%|###5 | 7049/20000 [00:44<01:08, 188.06it/s]\n",
|
358 |
+
" 35%|###5 | 7068/20000 [00:44<01:08, 188.62it/s]\n",
|
359 |
+
" 35%|###5 | 7087/20000 [00:44<01:09, 184.71it/s]\n",
|
360 |
+
" 36%|###5 | 7106/20000 [00:45<01:11, 179.97it/s]\n",
|
361 |
+
" 36%|###5 | 7129/20000 [00:45<01:06, 192.60it/s]\n",
|
362 |
+
" 36%|###5 | 7151/20000 [00:45<01:04, 198.21it/s]\n",
|
363 |
+
" 36%|###5 | 7172/20000 [00:45<01:04, 198.74it/s]\n",
|
364 |
+
" 36%|###5 | 7192/20000 [00:45<01:04, 197.39it/s]\n",
|
365 |
+
" 36%|###6 | 7212/20000 [00:45<01:05, 195.87it/s]\n",
|
366 |
+
" 36%|###6 | 7235/20000 [00:45<01:02, 205.80it/s]\n",
|
367 |
+
" 36%|###6 | 7256/20000 [00:45<01:01, 205.81it/s]\n",
|
368 |
+
" 36%|###6 | 7277/20000 [00:45<01:03, 201.69it/s]\n",
|
369 |
+
" 36%|###6 | 7298/20000 [00:46<01:05, 193.96it/s]\n",
|
370 |
+
" 37%|###6 | 7320/20000 [00:46<01:03, 200.19it/s]\n",
|
371 |
+
" 37%|###6 | 7343/20000 [00:46<01:00, 208.13it/s]\n",
|
372 |
+
" 37%|###6 | 7364/20000 [00:46<01:00, 208.67it/s]\n",
|
373 |
+
" 37%|###6 | 7385/20000 [00:46<01:01, 204.27it/s]\n",
|
374 |
+
" 37%|###7 | 7406/20000 [00:46<01:03, 199.56it/s]\n",
|
375 |
+
" 37%|###7 | 7429/20000 [00:46<01:00, 207.10it/s]\n",
|
376 |
+
" 37%|###7 | 7450/20000 [00:46<01:00, 207.93it/s]\n",
|
377 |
+
" 37%|###7 | 7471/20000 [00:46<01:00, 207.32it/s]\n",
|
378 |
+
" 37%|###7 | 7492/20000 [00:46<01:01, 204.50it/s]\n",
|
379 |
+
" 38%|###7 | 7513/20000 [00:47<01:00, 205.49it/s]\n",
|
380 |
+
" 38%|###7 | 7537/20000 [00:47<00:57, 215.00it/s]\n",
|
381 |
+
" 38%|###7 | 7559/20000 [00:47<00:57, 215.82it/s]\n",
|
382 |
+
" 38%|###7 | 7581/20000 [00:47<00:58, 210.82it/s]\n",
|
383 |
+
" 38%|###8 | 7603/20000 [00:47<01:01, 200.64it/s]\n",
|
384 |
+
" 38%|###8 | 7627/20000 [00:47<00:58, 211.11it/s]\n",
|
385 |
+
" 38%|###8 | 7650/20000 [00:47<00:57, 215.27it/s]\n",
|
386 |
+
" 38%|###8 | 7672/20000 [00:47<00:58, 211.14it/s]\n",
|
387 |
+
" 38%|###8 | 7694/20000 [00:47<01:00, 203.74it/s]\n",
|
388 |
+
" 39%|###8 | 7716/20000 [00:48<00:59, 206.56it/s]\n",
|
389 |
+
" 39%|###8 | 7742/20000 [00:48<00:55, 219.27it/s]\n",
|
390 |
+
" 39%|###8 | 7765/20000 [00:48<00:55, 222.34it/s]\n",
|
391 |
+
" 39%|###8 | 7788/20000 [00:48<00:55, 220.09it/s]\n",
|
392 |
+
" 39%|###9 | 7811/20000 [00:48<00:56, 217.29it/s]\n",
|
393 |
+
" 39%|###9 | 7837/20000 [00:48<00:53, 227.59it/s]\n",
|
394 |
+
" 39%|###9 | 7860/20000 [00:48<00:53, 225.68it/s]\n",
|
395 |
+
" 39%|###9 | 7883/20000 [00:48<00:57, 210.45it/s]\n",
|
396 |
+
" 40%|###9 | 7905/20000 [00:48<00:59, 203.50it/s]\n",
|
397 |
+
" 40%|###9 | 7931/20000 [00:49<00:55, 218.40it/s]\n",
|
398 |
+
" 40%|###9 | 7955/20000 [00:49<00:53, 223.84it/s]\n",
|
399 |
+
" 40%|###9 | 7978/20000 [00:49<00:54, 222.42it/s]\n",
|
400 |
+
" 40%|#### | 8001/20000 [00:49<00:56, 211.24it/s]\n",
|
401 |
+
" 40%|#### | 8028/20000 [00:49<00:52, 226.96it/s]\n",
|
402 |
+
" 40%|#### | 8052/20000 [00:49<00:52, 229.36it/s]\n",
|
403 |
+
" 40%|#### | 8076/20000 [00:49<00:52, 226.55it/s]\n",
|
404 |
+
" 40%|#### | 8099/20000 [00:49<00:54, 217.59it/s]\n",
|
405 |
+
" 41%|#### | 8121/20000 [00:50<01:26, 136.56it/s]\n",
|
406 |
+
" 41%|#### | 8144/20000 [00:50<01:16, 154.69it/s]\n",
|
407 |
+
" 41%|#### | 8165/20000 [00:50<01:11, 165.14it/s]\n",
|
408 |
+
" 41%|#### | 8186/20000 [00:50<01:07, 174.19it/s]\n",
|
409 |
+
" 41%|####1 | 8206/20000 [00:50<01:05, 179.79it/s]\n",
|
410 |
+
" 41%|####1 | 8234/20000 [00:50<00:57, 205.18it/s]\n",
|
411 |
+
" 41%|####1 | 8259/20000 [00:50<00:54, 215.64it/s]\n",
|
412 |
+
" 41%|####1 | 8282/20000 [00:50<00:53, 219.03it/s]\n",
|
413 |
+
" 42%|####1 | 8305/20000 [00:50<00:55, 209.63it/s]\n",
|
414 |
+
" 42%|####1 | 8334/20000 [00:51<00:50, 229.98it/s]\n",
|
415 |
+
" 42%|####1 | 8359/20000 [00:51<00:49, 234.96it/s]\n",
|
416 |
+
" 42%|####1 | 8383/20000 [00:51<00:50, 230.45it/s]\n",
|
417 |
+
" 42%|####2 | 8407/20000 [00:51<00:52, 222.38it/s]\n",
|
418 |
+
" 42%|####2 | 8436/20000 [00:51<00:48, 240.62it/s]\n",
|
419 |
+
" 42%|####2 | 8461/20000 [00:51<00:47, 242.60it/s]\n",
|
420 |
+
" 42%|####2 | 8486/20000 [00:51<00:47, 239.88it/s]\n",
|
421 |
+
" 43%|####2 | 8511/20000 [00:51<00:48, 236.67it/s]\n",
|
422 |
+
" 43%|####2 | 8539/20000 [00:51<00:46, 247.65it/s]\n",
|
423 |
+
" 43%|####2 | 8564/20000 [00:51<00:46, 244.78it/s]\n",
|
424 |
+
" 43%|####2 | 8589/20000 [00:52<00:48, 236.04it/s]\n",
|
425 |
+
" 43%|####3 | 8613/20000 [00:52<00:48, 235.15it/s]\n",
|
426 |
+
" 43%|####3 | 8643/20000 [00:52<00:45, 252.29it/s]\n",
|
427 |
+
" 43%|####3 | 8669/20000 [00:52<00:45, 248.06it/s]\n",
|
428 |
+
" 43%|####3 | 8694/20000 [00:52<00:47, 240.33it/s]\n",
|
429 |
+
" 44%|####3 | 8720/20000 [00:52<00:46, 243.84it/s]\n",
|
430 |
+
" 44%|####3 | 8748/20000 [00:52<00:44, 254.21it/s]\n",
|
431 |
+
" 44%|####3 | 8777/20000 [00:52<00:42, 263.08it/s]\n",
|
432 |
+
" 44%|####4 | 8808/20000 [00:52<00:40, 276.78it/s]\n",
|
433 |
+
" 44%|####4 | 8856/20000 [00:53<00:33, 336.62it/s]\n",
|
434 |
+
" 44%|####4 | 8896/20000 [00:53<00:31, 353.29it/s]\n",
|
435 |
+
" 45%|####4 | 8955/20000 [00:53<00:26, 422.10it/s]\n",
|
436 |
+
" 45%|####5 | 9001/20000 [00:53<00:25, 425.76it/s]\n",
|
437 |
+
" 45%|####5 | 9070/20000 [00:53<00:21, 501.92it/s]\n",
|
438 |
+
" 46%|####5 | 9128/20000 [00:53<00:20, 523.43it/s]\n",
|
439 |
+
" 46%|####5 | 9183/20000 [00:53<00:20, 529.69it/s]\n",
|
440 |
+
" 46%|####6 | 9237/20000 [00:53<00:20, 531.16it/s]\n",
|
441 |
+
" 46%|####6 | 9291/20000 [00:53<00:21, 508.14it/s]\n",
|
442 |
+
" 47%|####6 | 9346/20000 [00:53<00:20, 520.17it/s]\n",
|
443 |
+
" 47%|####6 | 9399/20000 [00:54<00:20, 509.66it/s]\n",
|
444 |
+
" 47%|####7 | 9451/20000 [00:54<00:20, 509.69it/s]\n",
|
445 |
+
" 48%|####7 | 9503/20000 [00:54<00:21, 494.01it/s]\n",
|
446 |
+
" 48%|####7 | 9580/20000 [00:54<00:18, 569.61it/s]\n",
|
447 |
+
" 48%|####8 | 9649/20000 [00:54<00:17, 602.61it/s]\n",
|
448 |
+
" 49%|####8 | 9710/20000 [00:54<00:17, 597.78it/s]\n",
|
449 |
+
" 49%|####8 | 9792/20000 [00:54<00:15, 660.63it/s]\n",
|
450 |
+
" 49%|####9 | 9874/20000 [00:54<00:14, 707.37it/s]\n",
|
451 |
+
" 50%|####9 | 9946/20000 [00:54<00:14, 698.70it/s]\n",
|
452 |
+
" 50%|##### | 10029/20000 [00:54<00:13, 735.01it/s]\n",
|
453 |
+
" 51%|##### | 10103/20000 [00:55<00:14, 703.33it/s]\n",
|
454 |
+
" 51%|##### | 10187/20000 [00:55<00:13, 742.49it/s]\n",
|
455 |
+
" 51%|#####1 | 10275/20000 [00:55<00:12, 782.42it/s]\n",
|
456 |
+
" 52%|#####1 | 10372/20000 [00:55<00:11, 837.41it/s]\n",
|
457 |
+
" 52%|#####2 | 10463/20000 [00:55<00:11, 858.83it/s]\n",
|
458 |
+
" 53%|#####2 | 10550/20000 [00:55<00:10, 862.08it/s]\n",
|
459 |
+
" 53%|#####3 | 10640/20000 [00:55<00:10, 873.35it/s]\n",
|
460 |
+
" 54%|#####3 | 10728/20000 [00:55<00:10, 857.41it/s]\n",
|
461 |
+
" 54%|#####4 | 10815/20000 [00:55<00:10, 858.61it/s]\n",
|
462 |
+
" 55%|#####4 | 10902/20000 [00:56<00:10, 861.95it/s]\n",
|
463 |
+
" 55%|#####5 | 11034/20000 [00:56<00:08, 997.87it/s]\n",
|
464 |
+
" 56%|#####5 | 11179/20000 [00:56<00:07, 1132.48it/s]\n",
|
465 |
+
" 56%|#####6 | 11296/20000 [00:56<00:07, 1143.65it/s]\n",
|
466 |
+
" 57%|#####7 | 11420/20000 [00:56<00:07, 1172.43it/s]\n",
|
467 |
+
" 58%|#####7 | 11579/20000 [00:56<00:06, 1297.33it/s]\n",
|
468 |
+
" 59%|#####8 | 11758/20000 [00:56<00:05, 1444.84it/s]\n",
|
469 |
+
" 60%|#####9 | 11971/20000 [00:56<00:04, 1650.09it/s]\n",
|
470 |
+
" 61%|###### | 12137/20000 [00:58<00:32, 241.50it/s] \n",
|
471 |
+
" 61%|######1 | 12256/20000 [00:59<00:34, 221.55it/s]\n",
|
472 |
+
" 62%|######1 | 12344/20000 [00:59<00:36, 211.18it/s]\n",
|
473 |
+
" 62%|######2 | 12411/20000 [01:00<00:37, 204.44it/s]\n",
|
474 |
+
" 62%|######2 | 12464/20000 [01:00<00:37, 201.86it/s]\n",
|
475 |
+
" 63%|######2 | 12507/20000 [01:00<00:37, 197.48it/s]\n",
|
476 |
+
" 63%|######2 | 12542/20000 [01:00<00:35, 209.76it/s]\n",
|
477 |
+
" 63%|######2 | 12576/20000 [01:01<00:33, 220.35it/s]\n",
|
478 |
+
" 63%|######3 | 12609/20000 [01:01<00:32, 226.03it/s]\n",
|
479 |
+
" 63%|######3 | 12640/20000 [01:01<00:31, 234.04it/s]\n",
|
480 |
+
" 63%|######3 | 12670/20000 [01:01<00:29, 246.31it/s]\n",
|
481 |
+
" 64%|######3 | 12700/20000 [01:01<00:28, 251.76it/s]\n",
|
482 |
+
" 64%|######3 | 12731/20000 [01:01<00:27, 263.76it/s]\n",
|
483 |
+
" 64%|######3 | 12761/20000 [01:01<00:26, 272.59it/s]\n",
|
484 |
+
" 64%|######3 | 12791/20000 [01:01<00:26, 271.94it/s]\n",
|
485 |
+
" 64%|######4 | 12820/20000 [01:01<00:26, 274.53it/s]\n",
|
486 |
+
" 64%|######4 | 12850/20000 [01:02<00:25, 281.46it/s]\n",
|
487 |
+
" 64%|######4 | 12879/20000 [01:02<00:25, 277.63it/s]\n",
|
488 |
+
" 65%|######4 | 12908/20000 [01:02<00:26, 264.03it/s]\n",
|
489 |
+
" 65%|######4 | 12939/20000 [01:02<00:25, 276.61it/s]\n",
|
490 |
+
" 65%|######4 | 12969/20000 [01:02<00:24, 281.60it/s]\n",
|
491 |
+
" 65%|######4 | 12998/20000 [01:02<00:24, 280.78it/s]\n",
|
492 |
+
" 65%|######5 | 13027/20000 [01:02<00:25, 275.51it/s]\n",
|
493 |
+
" 65%|######5 | 13058/20000 [01:02<00:24, 285.34it/s]\n",
|
494 |
+
" 65%|######5 | 13087/20000 [01:02<00:24, 285.03it/s]\n",
|
495 |
+
" 66%|######5 | 13117/20000 [01:03<00:23, 287.71it/s]\n",
|
496 |
+
" 66%|######5 | 13151/20000 [01:03<00:22, 301.25it/s]\n",
|
497 |
+
" 66%|######5 | 13182/20000 [01:03<00:22, 299.42it/s]\n",
|
498 |
+
" 66%|######6 | 13213/20000 [01:03<00:23, 288.18it/s]\n",
|
499 |
+
" 66%|######6 | 13247/20000 [01:03<00:22, 302.07it/s]\n",
|
500 |
+
" 66%|######6 | 13280/20000 [01:03<00:21, 309.23it/s]\n",
|
501 |
+
" 67%|######6 | 13312/20000 [01:03<00:21, 306.12it/s]\n",
|
502 |
+
" 67%|######6 | 13348/20000 [01:03<00:20, 321.72it/s]\n",
|
503 |
+
" 67%|######6 | 13381/20000 [01:03<00:20, 320.39it/s]\n",
|
504 |
+
" 67%|######7 | 13414/20000 [01:04<00:35, 183.90it/s]\n",
|
505 |
+
" 67%|######7 | 13448/20000 [01:04<00:30, 213.47it/s]\n",
|
506 |
+
" 67%|######7 | 13478/20000 [01:04<00:28, 232.06it/s]\n",
|
507 |
+
" 68%|######7 | 13508/20000 [01:04<00:26, 246.85it/s]\n",
|
508 |
+
" 68%|######7 | 13546/20000 [01:04<00:23, 278.79it/s]\n",
|
509 |
+
" 68%|######7 | 13578/20000 [01:04<00:22, 289.60it/s]\n",
|
510 |
+
" 68%|######8 | 13610/20000 [01:04<00:21, 290.75it/s]\n",
|
511 |
+
" 68%|######8 | 13650/20000 [01:04<00:19, 319.96it/s]\n",
|
512 |
+
" 68%|######8 | 13684/20000 [01:05<00:19, 322.87it/s]\n",
|
513 |
+
" 69%|######8 | 13718/20000 [01:05<00:19, 324.97it/s]\n",
|
514 |
+
" 69%|######8 | 13753/20000 [01:05<00:18, 332.16it/s]\n",
|
515 |
+
" 69%|######8 | 13787/20000 [01:05<00:19, 323.16it/s]\n",
|
516 |
+
" 69%|######9 | 13820/20000 [01:05<00:19, 317.82it/s]\n",
|
517 |
+
" 69%|######9 | 13857/20000 [01:05<00:18, 332.74it/s]\n",
|
518 |
+
" 69%|######9 | 13891/20000 [01:05<00:18, 333.86it/s]\n",
|
519 |
+
" 70%|######9 | 13927/20000 [01:05<00:17, 340.50it/s]\n",
|
520 |
+
" 70%|######9 | 13963/20000 [01:05<00:17, 345.20it/s]\n",
|
521 |
+
" 70%|######9 | 13998/20000 [01:05<00:17, 340.60it/s]\n",
|
522 |
+
" 70%|####### | 14036/20000 [01:06<00:16, 351.09it/s]\n",
|
523 |
+
" 70%|####### | 14073/20000 [01:06<00:16, 356.65it/s]\n",
|
524 |
+
" 71%|####### | 14109/20000 [01:06<00:16, 353.45it/s]\n",
|
525 |
+
" 71%|####### | 14150/20000 [01:06<00:15, 369.02it/s]\n",
|
526 |
+
" 71%|####### | 14187/20000 [01:06<00:15, 368.21it/s]\n",
|
527 |
+
" 71%|#######1 | 14227/20000 [01:06<00:15, 375.42it/s]\n",
|
528 |
+
" 71%|#######1 | 14265/20000 [01:06<00:16, 345.08it/s]\n",
|
529 |
+
" 72%|#######1 | 14301/20000 [01:06<00:16, 347.30it/s]\n",
|
530 |
+
" 72%|#######1 | 14349/20000 [01:06<00:14, 383.90it/s]\n",
|
531 |
+
" 72%|#######1 | 14388/20000 [01:06<00:14, 376.96it/s]\n",
|
532 |
+
" 72%|#######2 | 14430/20000 [01:07<00:14, 389.28it/s]\n",
|
533 |
+
" 72%|#######2 | 14471/20000 [01:07<00:13, 395.30it/s]\n",
|
534 |
+
" 73%|#######2 | 14511/20000 [01:07<00:14, 389.82it/s]\n",
|
535 |
+
" 73%|#######2 | 14554/20000 [01:07<00:13, 401.53it/s]\n",
|
536 |
+
" 73%|#######2 | 14595/20000 [01:07<00:14, 378.41it/s]\n",
|
537 |
+
" 73%|#######3 | 14643/20000 [01:07<00:13, 405.95it/s]\n",
|
538 |
+
" 73%|#######3 | 14687/20000 [01:07<00:12, 415.69it/s]\n",
|
539 |
+
" 74%|#######3 | 14730/20000 [01:07<00:12, 418.62it/s]\n",
|
540 |
+
" 74%|#######3 | 14774/20000 [01:07<00:12, 422.40it/s]\n",
|
541 |
+
" 74%|#######4 | 14817/20000 [01:08<00:12, 418.48it/s]\n",
|
542 |
+
" 74%|#######4 | 14868/20000 [01:08<00:11, 443.95it/s]\n",
|
543 |
+
" 75%|#######4 | 14913/20000 [01:08<00:11, 444.41it/s]\n",
|
544 |
+
" 75%|#######4 | 14962/20000 [01:08<00:11, 457.86it/s]\n",
|
545 |
+
" 75%|#######5 | 15008/20000 [01:08<00:11, 438.97it/s]\n",
|
546 |
+
" 75%|#######5 | 15067/20000 [01:08<00:10, 481.14it/s]\n",
|
547 |
+
" 76%|#######5 | 15116/20000 [01:08<00:10, 483.71it/s]\n",
|
548 |
+
" 76%|#######5 | 15173/20000 [01:08<00:09, 509.06it/s]\n",
|
549 |
+
" 76%|#######6 | 15227/20000 [01:08<00:09, 518.19it/s]\n",
|
550 |
+
" 76%|#######6 | 15285/20000 [01:08<00:08, 534.95it/s]\n",
|
551 |
+
" 77%|#######6 | 15351/20000 [01:09<00:08, 570.41it/s]\n",
|
552 |
+
" 77%|#######7 | 15409/20000 [01:09<00:08, 569.86it/s]\n",
|
553 |
+
" 77%|#######7 | 15477/20000 [01:09<00:07, 602.56it/s]\n",
|
554 |
+
" 78%|#######7 | 15538/20000 [01:09<00:07, 602.96it/s]\n",
|
555 |
+
" 78%|#######7 | 15599/20000 [01:09<00:07, 585.87it/s]\n",
|
556 |
+
" 78%|#######8 | 15658/20000 [01:09<00:07, 581.97it/s]\n",
|
557 |
+
" 79%|#######8 | 15722/20000 [01:09<00:07, 598.93it/s]\n",
|
558 |
+
" 79%|#######8 | 15799/20000 [01:09<00:06, 647.41it/s]\n",
|
559 |
+
" 79%|#######9 | 15877/20000 [01:09<00:06, 684.57it/s]\n",
|
560 |
+
" 80%|#######9 | 15957/20000 [01:09<00:05, 718.72it/s]\n",
|
561 |
+
" 80%|######## | 16037/20000 [01:10<00:05, 740.70it/s]\n",
|
562 |
+
" 81%|######## | 16112/20000 [01:10<00:05, 730.42it/s]\n",
|
563 |
+
" 81%|######## | 16195/20000 [01:10<00:05, 757.50it/s]\n",
|
564 |
+
" 81%|########1 | 16288/20000 [01:10<00:04, 808.47it/s]\n",
|
565 |
+
" 82%|########1 | 16369/20000 [01:10<00:04, 797.07it/s]\n",
|
566 |
+
" 82%|########2 | 16467/20000 [01:10<00:04, 850.97it/s]\n",
|
567 |
+
" 83%|########2 | 16563/20000 [01:10<00:03, 883.26it/s]\n",
|
568 |
+
" 83%|########3 | 16659/20000 [01:10<00:03, 906.02it/s]\n",
|
569 |
+
" 84%|########3 | 16767/20000 [01:10<00:03, 957.87it/s]\n",
|
570 |
+
" 84%|########4 | 16881/20000 [01:10<00:03, 1012.25it/s]\n",
|
571 |
+
" 85%|########4 | 16990/20000 [01:11<00:02, 1035.48it/s]\n",
|
572 |
+
" 86%|########5 | 17120/20000 [01:11<00:02, 1114.63it/s]\n",
|
573 |
+
" 86%|########6 | 17240/20000 [01:11<00:02, 1136.79it/s]\n",
|
574 |
+
" 87%|########6 | 17379/20000 [01:11<00:02, 1212.48it/s]\n",
|
575 |
+
" 88%|########7 | 17514/20000 [01:11<00:01, 1249.92it/s]\n",
|
576 |
+
" 88%|########8 | 17656/20000 [01:11<00:01, 1300.74it/s]\n",
|
577 |
+
" 89%|########9 | 17812/20000 [01:11<00:01, 1378.28it/s]\n",
|
578 |
+
" 90%|######### | 18001/20000 [01:11<00:01, 1522.37it/s]\n",
|
579 |
+
" 91%|#########1| 18201/20000 [01:11<00:01, 1664.77it/s]\n",
|
580 |
+
" 92%|#########2| 18455/20000 [01:11<00:00, 1926.29it/s]\n",
|
581 |
+
" 94%|#########3| 18729/20000 [01:13<00:03, 331.40it/s] \n",
|
582 |
+
" 94%|#########4| 18869/20000 [01:14<00:04, 279.90it/s]\n",
|
583 |
+
" 95%|#########4| 18972/20000 [01:15<00:04, 253.37it/s]\n",
|
584 |
+
" 95%|#########5| 19050/20000 [01:15<00:03, 238.36it/s]\n",
|
585 |
+
" 96%|#########5| 19110/20000 [01:16<00:03, 223.98it/s]\n",
|
586 |
+
" 96%|#########5| 19157/20000 [01:16<00:03, 218.87it/s]\n",
|
587 |
+
" 96%|#########5| 19196/20000 [01:16<00:03, 212.50it/s]\n",
|
588 |
+
" 96%|#########6| 19229/20000 [01:16<00:03, 208.06it/s]\n",
|
589 |
+
" 96%|#########6| 19258/20000 [01:16<00:03, 205.77it/s]\n",
|
590 |
+
" 96%|#########6| 19284/20000 [01:17<00:03, 202.04it/s]\n",
|
591 |
+
" 97%|#########6| 19308/20000 [01:17<00:03, 197.39it/s]\n",
|
592 |
+
" 97%|#########6| 19330/20000 [01:17<00:03, 197.54it/s]\n",
|
593 |
+
" 97%|#########6| 19352/20000 [01:17<00:03, 196.16it/s]\n",
|
594 |
+
" 97%|#########6| 19373/20000 [01:17<00:03, 194.10it/s]\n",
|
595 |
+
" 97%|#########6| 19394/20000 [01:17<00:03, 191.14it/s]\n",
|
596 |
+
" 97%|#########7| 19414/20000 [01:17<00:03, 190.06it/s]\n",
|
597 |
+
" 97%|#########7| 19434/20000 [01:17<00:02, 192.10it/s]\n",
|
598 |
+
" 97%|#########7| 19454/20000 [01:17<00:02, 188.68it/s]\n",
|
599 |
+
" 97%|#########7| 19474/20000 [01:18<00:02, 188.67it/s]\n",
|
600 |
+
" 97%|#########7| 19493/20000 [01:18<00:02, 188.00it/s]\n",
|
601 |
+
" 98%|#########7| 19512/20000 [01:18<00:02, 187.50it/s]\n",
|
602 |
+
" 98%|#########7| 19533/20000 [01:18<00:02, 193.36it/s]\n",
|
603 |
+
" 98%|#########7| 19553/20000 [01:18<00:02, 194.71it/s]\n",
|
604 |
+
" 98%|#########7| 19573/20000 [01:18<00:02, 194.55it/s]\n",
|
605 |
+
" 98%|#########7| 19593/20000 [01:18<00:02, 192.76it/s]\n",
|
606 |
+
" 98%|#########8| 19613/20000 [01:18<00:02, 190.98it/s]\n",
|
607 |
+
" 98%|#########8| 19634/20000 [01:18<00:01, 194.23it/s]\n",
|
608 |
+
" 98%|#########8| 19654/20000 [01:18<00:01, 193.65it/s]\n",
|
609 |
+
" 98%|#########8| 19674/20000 [01:19<00:01, 192.69it/s]\n",
|
610 |
+
" 98%|#########8| 19694/20000 [01:19<00:01, 192.02it/s]\n",
|
611 |
+
" 99%|#########8| 19714/20000 [01:19<00:01, 192.65it/s]\n",
|
612 |
+
" 99%|#########8| 19736/20000 [01:19<00:01, 198.30it/s]\n",
|
613 |
+
" 99%|#########8| 19757/20000 [01:19<00:01, 200.54it/s]\n",
|
614 |
+
" 99%|#########8| 19778/20000 [01:19<00:01, 198.65it/s]\n",
|
615 |
+
" 99%|#########8| 19798/20000 [01:19<00:01, 197.32it/s]\n",
|
616 |
+
" 99%|#########9| 19818/20000 [01:19<00:00, 197.53it/s]\n",
|
617 |
+
" 99%|#########9| 19839/20000 [01:19<00:00, 200.59it/s]\n",
|
618 |
+
" 99%|#########9| 19860/20000 [01:19<00:00, 196.98it/s]\n",
|
619 |
+
" 99%|#########9| 19881/20000 [01:20<00:00, 198.45it/s]\n",
|
620 |
+
"100%|#########9| 19901/20000 [01:20<00:00, 193.05it/s]\n",
|
621 |
+
"100%|#########9| 19924/20000 [01:20<00:00, 201.34it/s]\n",
|
622 |
+
"100%|#########9| 19946/20000 [01:20<00:00, 205.53it/s]\n",
|
623 |
+
"100%|#########9| 19967/20000 [01:20<00:00, 205.63it/s]\n",
|
624 |
+
"100%|#########9| 19988/20000 [01:20<00:00, 203.92it/s]\n",
|
625 |
+
"100%|##########| 20000/20000 [01:20<00:00, 247.89it/s]\n"
|
626 |
+
]
|
627 |
+
}
|
628 |
+
],
|
629 |
+
"source": [
|
630 |
+
"!python learn_bpe.py -s 20000 -i dataset/output.txt -o dataset/codec.txt"
|
631 |
+
]
|
632 |
+
},
|
633 |
+
{
|
634 |
+
"cell_type": "code",
|
635 |
+
"execution_count": 12,
|
636 |
+
"id": "68a4113a",
|
637 |
+
"metadata": {},
|
638 |
+
"outputs": [],
|
639 |
+
"source": [
|
640 |
+
"!apply_bpe.py -i ./dataset/output.txt -o ./dataset/output_dataset.txt -c ./dataset/codec.txt"
|
641 |
+
]
|
642 |
+
},
|
643 |
+
{
|
644 |
+
"cell_type": "code",
|
645 |
+
"execution_count": 13,
|
646 |
+
"id": "06254f0d",
|
647 |
+
"metadata": {},
|
648 |
+
"outputs": [
|
649 |
+
{
|
650 |
+
"name": "stdout",
|
651 |
+
"output_type": "stream",
|
652 |
+
"text": [
|
653 |
+
"Vocabulary size: 20217\n"
|
654 |
+
]
|
655 |
+
}
|
656 |
+
],
|
657 |
+
"source": [
|
658 |
+
"def count_tokens(file_path):\n",
|
659 |
+
" try:\n",
|
660 |
+
" with open(file_path, 'r', encoding='utf-8') as file:\n",
|
661 |
+
" text = file.read()\n",
|
662 |
+
" # Split the text into tokens based on spaces\n",
|
663 |
+
" tokens = text.split()\n",
|
664 |
+
" # Count the vocabulary size (number of unique tokens)\n",
|
665 |
+
" vocabulary_size = len(set(tokens))\n",
|
666 |
+
" return vocabulary_size\n",
|
667 |
+
" except IOError:\n",
|
668 |
+
" print(f\"Error: Could not open or read the file '{file_path}'\")\n",
|
669 |
+
" return -1\n",
|
670 |
+
"\n",
|
671 |
+
"# Example usage\n",
|
672 |
+
"file_path = './dataset/output_dataset.txt' # Replace with the actual file path\n",
|
673 |
+
"vocabulary_size = count_tokens(file_path)\n",
|
674 |
+
"if vocabulary_size != -1:\n",
|
675 |
+
" print(f\"Vocabulary size: {vocabulary_size}\")\n"
|
676 |
+
]
|
677 |
+
}
|
678 |
+
],
|
679 |
+
"metadata": {
|
680 |
+
"kernelspec": {
|
681 |
+
"display_name": "Python 3 (ipykernel)",
|
682 |
+
"language": "python",
|
683 |
+
"name": "python3"
|
684 |
+
},
|
685 |
+
"language_info": {
|
686 |
+
"codemirror_mode": {
|
687 |
+
"name": "ipython",
|
688 |
+
"version": 3
|
689 |
+
},
|
690 |
+
"file_extension": ".py",
|
691 |
+
"mimetype": "text/x-python",
|
692 |
+
"name": "python",
|
693 |
+
"nbconvert_exporter": "python",
|
694 |
+
"pygments_lexer": "ipython3",
|
695 |
+
"version": "3.9.5"
|
696 |
+
}
|
697 |
+
},
|
698 |
+
"nbformat": 4,
|
699 |
+
"nbformat_minor": 5
|
700 |
+
}
|
subword/__init__.py
ADDED
File without changes
|
subword/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (166 Bytes). View file
|
|
subword/__pycache__/apply_bpe.cpython-39.pyc
ADDED
Binary file (13.4 kB). View file
|
|
subword/apply_bpe.py
ADDED
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Author: Rico Sennrich
|
4 |
+
|
5 |
+
"""Use operations learned with learn_bpe.py to encode a new text.
|
6 |
+
The text will not be smaller, but use only a fixed vocabulary, with rare words
|
7 |
+
encoded as variable-length sequences of subword units.
|
8 |
+
|
9 |
+
Reference:
|
10 |
+
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
|
11 |
+
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
|
12 |
+
"""
|
13 |
+
|
14 |
+
from __future__ import unicode_literals, division
|
15 |
+
|
16 |
+
import sys
|
17 |
+
import os
|
18 |
+
import inspect
|
19 |
+
import codecs
|
20 |
+
import io
|
21 |
+
import argparse
|
22 |
+
import re
|
23 |
+
import warnings
|
24 |
+
import random
|
25 |
+
import tempfile
|
26 |
+
from multiprocessing import Pool, cpu_count
|
27 |
+
|
28 |
+
# hack for python2/3 compatibility
|
29 |
+
from io import open
|
30 |
+
argparse.open = open
|
31 |
+
|
32 |
+
class BPE(object):
    """Apply a learned list of BPE merge operations to text.

    The merge operations are read from an open, seekable text file (one
    whitespace-separated symbol pair per line, optionally preceded by a
    ``#version:`` header). Segmentation itself is delegated to the
    module-level ``encode`` function; this class holds the merge table, the
    optional vocabulary/glossary filters and a per-instance result cache.
    """

    def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):
        """Load merge operations from `codes`.

        Args:
            codes: open, seekable text file with one merge operation per line.
            merges: number of merge operations to keep (-1 keeps all of them).
            separator: suffix appended to every non-final subword unit.
            vocab: optional collection of permitted subword units, passed on
                to ``encode``.
            glossaries: optional list of words/regexes that must never be
                segmented or merged with neighbouring symbols.

        Exits the process with status 1 if a line of `codes` does not consist
        of exactly two units.
        """

        codes.seek(0)
        # 1-based line number of the first merge operation, used in error messages
        offset = 1

        # check version information
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            # trailing ".0" components are dropped, so "0.2.0" parses as (0, 2)
            self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")])
            offset += 1
        else:
            # no header: legacy format, re-read the first line as a merge operation
            self.version = (0, 1)
            codes.seek(0)

        self.bpe_codes = [tuple(item.strip('\r\n ').split(' ')) for (n, item) in enumerate(codes.read().rstrip('\n').split('\n')) if (n < merges or merges == -1)]

        for i, item in enumerate(self.bpe_codes):
            if len(item) != 2:
                sys.stderr.write('Error: invalid line {0} in BPE codes file: {1}\n'.format(i+offset, ' '.join(item)))
                sys.stderr.write('The line should consist of exactly two subword units, separated by whitespace\n')
                sys.exit(1)

        # some hacking to deal with duplicates (only consider first instance):
        # iterating in reverse lets the earliest index overwrite later ones
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        # merged symbol -> the pair it was built from (used to undo merges)
        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        # matches a segment that is, as a whole, one of the glossary entries
        self.glossaries_regex = re.compile('^({})$'.format('|'.join(glossaries))) if glossaries else None

        # word -> segmentation, shared by all calls on this instance
        self.cache = {}

    def process_lines(self, filename, outfile, dropout=0, num_workers=1):
        """Segment every line of the file `filename` and write it to `outfile`.

        With ``num_workers == 1`` the file is processed in this process. With
        more workers the file is cut into chunks at line boundaries, each
        chunk is segmented by a pool worker into its own temporary file, and
        the temporary files are copied to `outfile` in order.

        Args:
            filename: path of the UTF-8 input file.
            outfile: writable text file object receiving the result.
            dropout: BPE dropout probability passed to ``process_line``.
            num_workers: number of worker processes (must be positive).

        Raises:
            ValueError: if `num_workers` is not positive.
            Exception: any exception raised inside a worker is re-raised here.
        """

        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        if num_workers == 1:
            _process_lines(self, filename, outfile, dropout, 0, 0)
        elif num_workers > 1:
            with open(filename, encoding="utf-8") as f:
                size = os.fstat(f.fileno()).st_size
                chunk_size = size // num_workers
                # offsets[i] is where chunk i starts; the final 0 tells the
                # last worker to read until end of file
                offsets = [0 for _ in range(num_workers + 1)]
                for i in range(1, num_workers):
                    f.seek(chunk_size * i)
                    pos = f.tell()
                    while True:
                        try:
                            # skip to the end of the current line so that the
                            # chunk starts on a line boundary
                            f.readline()
                            break
                        except UnicodeDecodeError:
                            # landed inside a multi-byte character: back up one byte
                            pos -= 1
                            f.seek(pos)
                    offsets[i] = f.tell()
                    assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"
            res_files = []
            pending = []
            pool = Pool(processes=num_workers)
            try:
                for i in range(num_workers):
                    tmp = tempfile.NamedTemporaryFile(delete=False)
                    tmp.close()
                    res_files.append(tmp)
                    pending.append(pool.apply_async(_process_lines, (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1])))
                pool.close()
                pool.join()
                # apply_async stores a worker's exception in its result object;
                # get() re-raises it instead of silently producing truncated output
                for result in pending:
                    result.get()
                for tmp in res_files:
                    with open(tmp.name, encoding="utf-8") as fi:
                        for line in fi:
                            outfile.write(line)
            finally:
                pool.terminate()
                # always remove the temporary chunk files, also on failure
                for tmp in res_files:
                    if os.path.exists(tmp.name):
                        os.remove(tmp.name)
        else:
            raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))

    def process_line(self, line, dropout=0):
        """segment line, dealing with leading and trailing whitespace"""

        out = ""

        leading_whitespace = len(line)-len(line.lstrip('\r\n '))
        if leading_whitespace:
            out += line[:leading_whitespace]

        out += self.segment(line, dropout)

        trailing_whitespace = len(line)-len(line.rstrip('\r\n '))
        # a whitespace-only line was already copied in full as leading whitespace
        if trailing_whitespace and trailing_whitespace != len(line):
            out += line[-trailing_whitespace:]

        return out

    def segment(self, sentence, dropout=0):
        """segment single sentence (whitespace-tokenized string) with BPE encoding"""
        segments = self.segment_tokens(sentence.strip('\r\n ').split(' '), dropout)
        return ' '.join(segments)

    def segment_tokens(self, tokens, dropout=0):
        """segment a sequence of tokens with BPE encoding

        Returns a flat list of subword units in which every unit that is not
        the last one of its word carries the separator as suffix.
        """
        output = []
        for word in tokens:
            # eliminate double spaces
            if not word:
                continue
            new_word = [out for segment in self._isolate_glossaries(word)
                        for out in encode(segment,
                                          self.bpe_codes,
                                          self.bpe_codes_reverse,
                                          self.vocab,
                                          self.separator,
                                          self.version,
                                          self.cache,
                                          self.glossaries_regex,
                                          dropout)]

            for item in new_word[:-1]:
                output.append(item + self.separator)
            output.append(new_word[-1])

        return output

    def _isolate_glossaries(self, word):
        """Split `word` so that every glossary occurrence is its own segment."""
        word_segments = [word]
        for gloss in self.glossaries:
            word_segments = [out_segments for segment in word_segments
                             for out_segments in isolate_glossary(segment, gloss)]
        return word_segments
|
165 |
+
|
166 |
+
def _process_lines(bpe, filename, outfile, dropout, begin, end):
    """Segment the lines of `filename` that start in the range [begin, end].

    Args:
        bpe: object providing ``process_line(line, dropout)``.
        filename: path of the UTF-8 input file.
        outfile: either a path (the file is created, written and closed here)
            or an already open writable text file (left open for the caller).
        dropout: BPE dropout probability handed to ``process_line``.
        begin: file position at which reading starts.
        end: lines ending after this position are not processed; 0 (or any
            non-positive value) means "read until end of file".
    """
    owns_output = isinstance(outfile, str)
    if owns_output:
        fo = open(outfile, "w", encoding="utf-8")
    else:
        fo = outfile
    try:
        with open(filename, encoding="utf-8") as f:
            f.seek(begin)
            line = f.readline()
            while line:
                pos = f.tell()
                assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
                if end > 0 and pos > end:
                    break
                fo.write(bpe.process_line(line, dropout))
                line = f.readline()
    finally:
        # close the file we opened even if reading or segmentation failed
        if owns_output:
            fo.close()
|
183 |
+
|
184 |
+
def create_parser(subparsers=None):
    """Build the command-line parser for applying BPE.

    Args:
        subparsers: optional argparse sub-parser collection; when given, the
            options are registered as the ``apply-bpe`` sub-command instead of
            on a new stand-alone parser.

    Returns:
        The parser (or sub-parser) with all options added.
    """

    description = "apply BPE-based word segmentation"

    if subparsers:
        parser = subparsers.add_parser('apply-bpe',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)
    else:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    parser.add_argument(
        '--codes', '-c', type=argparse.FileType('r'), metavar='PATH',
        required=True,
        help="File with BPE codes (created by learn_bpe.py).")
    parser.add_argument(
        '--merges', '-m', type=int, default=-1,
        metavar='INT',
        help="Use this many BPE operations (<= number of learned symbols); "+
             "default: apply all the learned merge operations")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    parser.add_argument(
        '--separator', '-s', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    parser.add_argument(
        '--vocabulary', type=argparse.FileType('r'), default=None,
        metavar="PATH",
        help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.")
    parser.add_argument(
        '--vocabulary-threshold', type=int, default=None,
        metavar="INT",
        help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV")
    parser.add_argument(
        '--dropout', type=float, default=0,
        metavar="P",
        help="Dropout BPE merge operations with probability P (Provilkov et al., 2019). Use this on training data only.")
    parser.add_argument(
        '--glossaries', type=str, nargs='+', default=None,
        metavar="STR",
        help="Glossaries. Words matching any of the words/regex provided in glossaries will not be affected "+
             "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords). "+
             "Can be provided as a list of words/regex after the --glossaries argument. Enclose each regex in quotes.")
    parser.add_argument(
        '--seed', type=int, default=None,
        metavar="S",
        help="Random seed for the random number generators (e.g. for BPE dropout with --dropout).")
    parser.add_argument(
        '--num-workers', type=int, default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")

    return parser
|
242 |
+
|
243 |
+
def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries_regex=None, dropout=0):
    """Encode word based on list of BPE merge operations, which are applied consecutively

    Args:
        orig: the word to segment.
        bpe_codes: dict mapping a symbol pair to its rank (lower = applied first).
        bpe_codes_reverse: dict mapping a merged symbol to the pair it came
            from; only used when `vocab` is given.
        vocab: optional collection of permitted units; when truthy, the result
            is post-processed by ``check_vocab_and_split``.
        separator: subword separator, forwarded to ``check_vocab_and_split``.
        version: codes-file version, ``(0, 1)`` or ``(0, 2)``; decides how the
            end-of-word marker is attached.
        cache: dict used to memoise results of dropout-free calls.
        glossaries_regex: optional compiled regex; a word it matches is
            returned unsegmented.
        dropout: probability with which each candidate merge is skipped.

    Returns:
        The subword units of `orig` without separators or end-of-word markers
        (a tuple, or the list produced by ``check_vocab_and_split``). A
        single-character word is returned unchanged as a string.

    Raises:
        NotImplementedError: for any other `version`.
    """

    if not dropout and orig in cache:
        return cache[orig]

    if glossaries_regex and glossaries_regex.match(orig):
        cache[orig] = (orig,)
        return (orig,)

    if len(orig) == 1:
        return orig

    if version == (0, 1):
        word = list(orig) + ['</w>']
    elif version == (0, 2): # more consistent handling of word-final segments
        word = list(orig[:-1]) + [orig[-1] + '</w>']
    else:
        raise NotImplementedError

    while len(word) > 1:

        # get list of symbol pairs; optionally apply dropout
        pairs = [(bpe_codes[pair],i,pair) for (i,pair) in enumerate(zip(word, word[1:])) if (not dropout or random.random() > dropout) and pair in bpe_codes]

        if not pairs:
            break

        #get first merge operation in list of BPE codes
        bigram = min(pairs)[2]

        # find start position of all pairs that we want to merge
        positions = [i for (rank,i,pair) in pairs if pair == bigram]

        i = 0
        new_word = []
        bigram = ''.join(bigram)
        for j in positions:
            # merges are invalid if they start before current position. This can happen if there are overlapping pairs: (x x x -> xx x)
            if j < i:
                continue
            new_word.extend(word[i:j]) # all symbols before merged pair
            new_word.append(bigram) # merged pair
            i = j+2 # continue after merged pair
        new_word.extend(word[i:]) # add all symbols until end of word
        word = new_word

    # don't print end-of-word symbols
    if word[-1] == '</w>':
        word = word[:-1]
    elif word[-1].endswith('</w>'):
        word[-1] = word[-1][:-4]

    word = tuple(word)
    if vocab:
        word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator)

    # A segmentation produced under dropout is random; storing it would make
    # later dropout-free calls sharing this cache return an under-merged result.
    if not dropout:
        cache[orig] = word
    return word
|
303 |
+
|
304 |
+
def recursive_split(segment, bpe_codes, vocab, separator, final=False):
    """Recursively split segment into smaller units (by reversing BPE merges)
    until all units are either in-vocabulary, or cannot be split further.

    Args:
        segment: the unit to split.
        bpe_codes: dict mapping a merged symbol to the pair it was built from.
        vocab: collection of permitted units; non-final units are looked up
            with `separator` appended.
        separator: suffix that marks non-final units in `vocab`.
        final: True if `segment` is the last unit of its word, in which case
            it is looked up in `bpe_codes` with the '</w>' marker attached.

    Yields:
        The resulting units, left to right.
    """

    try:
        if final:
            left, right = bpe_codes[segment + '</w>']
            right = right[:-4]
        else:
            left, right = bpe_codes[segment]
    except KeyError:
        # segment was not produced by a merge operation: cannot split further
        #sys.stderr.write('cannot split {0} further.\n'.format(segment))
        yield segment
        return

    if left + separator in vocab:
        yield left
    else:
        for item in recursive_split(left, bpe_codes, vocab, separator, False):
            yield item

    if (final and right in vocab) or (not final and right + separator in vocab):
        yield right
    else:
        for item in recursive_split(right, bpe_codes, vocab, separator, final):
            yield item
|
330 |
+
|
331 |
+
def check_vocab_and_split(orig, bpe_codes, vocab, separator):
    """Return the units of `orig` with every out-of-vocabulary unit broken up.

    Non-final units are looked up in `vocab` with `separator` appended, the
    final unit is looked up as is. A unit that is not found is replaced by
    whatever ``recursive_split`` yields for it (reversing BPE merges).
    """

    inner_units, last_unit = orig[:-1], orig[-1]
    result = []

    for unit in inner_units:
        if unit + separator in vocab:
            result.append(unit)
            continue
        #sys.stderr.write('OOV: {0}\n'.format(unit))
        result.extend(recursive_split(unit, bpe_codes, vocab, separator, False))

    if last_unit in vocab:
        result.append(last_unit)
    else:
        #sys.stderr.write('OOV: {0}\n'.format(last_unit))
        result.extend(recursive_split(last_unit, bpe_codes, vocab, separator, True))

    return result
|
354 |
+
|
355 |
+
|
356 |
+
def read_vocabulary(vocab_file, threshold):
    """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold.

    Args:
        vocab_file: iterable of lines, each of the form "<word> <frequency>".
        threshold: minimum frequency a word needs to be kept, or None to keep
            every word.

    Returns:
        The set of words that pass the threshold.
    """

    vocabulary = set()

    for line in vocab_file:
        entry = line.strip('\r\n ')
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not entry:
            continue
        word, freq = entry.split(' ')
        freq = int(freq)
        if threshold is None or freq >= threshold:
            vocabulary.add(word)

    return vocabulary
|
369 |
+
|
370 |
+
def isolate_glossary(word, glossary):
    """
    Cut `word` into pieces so that each occurrence of `glossary` stands alone.

    `glossary` is used as a regular expression. The word is returned as a
    one-element list when it matches the glossary as a whole or does not
    contain it at all.

    For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is:
        ['1934', 'USA', 'B', 'USA']
    """
    # regex equivalent of (if word == glossary or glossary not in word)
    whole_word = re.match('^'+glossary+'$', word)
    if whole_word or not re.search(glossary, word):
        return [word]

    # the capturing group makes re.split keep the matched glossaries
    parts = re.split(r'({})'.format(glossary), word)
    tail = parts.pop()
    # drop the empty strings re.split produces around adjacent/leading matches
    parts = [part for part in parts if part]
    if tail != '':
        parts.append(tail.strip('\r\n '))
    return parts
|
387 |
+
|
388 |
+
if __name__ == '__main__':
    # Command-line entry point: parse the options, load the BPE codes and
    # segment the input line by line (or in parallel chunks).

    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)

    parser = create_parser()
    args = parser.parse_args()

    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    # read/write files as UTF-8: argparse opened the files with the default
    # encoding, so each one is closed and reopened explicitly as UTF-8
    # (closing first avoids leaking the handle argparse created)
    codes_path = args.codes.name
    args.codes.close()
    args.codes = codecs.open(codes_path, encoding='utf-8')
    if args.input.name != '<stdin>':
        input_path = args.input.name
        args.input.close()
        args.input = codecs.open(input_path, encoding='utf-8')
    if args.output.name != '<stdout>':
        output_path = args.output.name
        args.output.close()
        args.output = codecs.open(output_path, 'w', encoding='utf-8')
    if args.vocabulary:
        vocabulary_path = args.vocabulary.name
        args.vocabulary.close()
        args.vocabulary = codecs.open(vocabulary_path, encoding='utf-8')

    if args.vocabulary:
        vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
    else:
        vocabulary = None

    if sys.version_info < (3, 0):
        args.separator = args.separator.decode('UTF-8')
        if args.glossaries:
            args.glossaries = [g.decode('UTF-8') for g in args.glossaries]
        if args.num_workers > 1:
            args.num_workers = 1
            warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    if args.seed is not None:
        random.seed(args.seed)

    bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

    if args.input.name == '<stdin>' or args.num_workers == 1:
        if args.num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for line in args.input:
            args.output.write(bpe.process_line(line, args.dropout))
    else:
        # parallel mode re-opens the input by path inside each worker
        bpe.process_lines(args.input.name, args.output, args.dropout, args.num_workers)

    # close files
    args.codes.close()
    if args.input.name != '<stdin>':
        args.input.close()
    if args.output.name != '<stdout>':
        args.output.close()
    if args.vocabulary:
        args.vocabulary.close()
|
subword/bpe_toy.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Author: Rico Sennrich
|
4 |
+
|
5 |
+
"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
|
6 |
+
Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
|
7 |
+
of a text to a configurable number of symbols, with only a small increase in the number of tokens.
|
8 |
+
This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets,
|
9 |
+
indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py).
|
10 |
+
|
11 |
+
Reference:
|
12 |
+
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
|
13 |
+
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
|
14 |
+
"""
|
15 |
+
|
16 |
+
|
17 |
+
import re
|
18 |
+
import sys
|
19 |
+
import collections
|
20 |
+
|
21 |
+
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    vocab maps a word, written as whitespace-separated symbols, to its
    frequency. The result maps (left_symbol, right_symbol) tuples to the
    summed frequency of every word in which the two symbols are adjacent.
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split()
        # pair every symbol with its right-hand neighbour
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts
|
28 |
+
|
29 |
+
def merge_vocab(pair, v_in):
    """Return a copy of the vocabulary with every occurrence of pair merged.

    pair is a 2-tuple of symbols; v_in maps words (whitespace-separated
    symbols) to frequencies. Each occurrence of the two symbols as adjacent,
    whole symbols is replaced by their concatenation. v_in is not modified.
    """
    merged = ''.join(pair)
    bigram_pattern = re.escape(' '.join(pair))
    # the lookarounds make sure we only match whole symbols, not substrings
    p = re.compile(r'(?<!\S)' + bigram_pattern + r'(?!\S)')
    # Use a callable replacement: a plain replacement string is treated as a
    # template by re.sub, so a symbol containing a backslash would be
    # reinterpreted as an escape sequence (or rejected as a bad escape).
    return {p.sub(lambda match: merged, word): freq
            for word, freq in v_in.items()}
|
37 |
+
|
38 |
+
# Toy vocabulary: words are spelled as space-separated symbols, with the
# last symbol carrying the end-of-word marker '</w>'; values are frequencies.
vocab = {
    'l o w</w>': 5,
    'l o w e r</w>': 2,
    'n e w e s t</w>': 6,
    'w i d e s t</w>': 3,
}
num_merges = 15
for i in range(num_merges):
    pairs = get_stats(vocab)
    # nothing left to merge: every word is a single symbol
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    if pairs[best] < 2:
        sys.stderr.write('no pair has frequency > 1. Stopping\n')
        break
    vocab = merge_vocab(best, vocab)
    print(best)
|
subword/chrF.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Author: Rico Sennrich
|
4 |
+
|
5 |
+
"""Compute chrF3 for machine translation evaluation
|
6 |
+
|
7 |
+
Reference:
|
8 |
+
Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translation, pages 392–395, Lisbon, Portugal.
|
9 |
+
"""
|
10 |
+
|
11 |
+
from __future__ import print_function, unicode_literals, division
|
12 |
+
|
13 |
+
import sys
|
14 |
+
import codecs
|
15 |
+
import io
|
16 |
+
import argparse
|
17 |
+
|
18 |
+
from collections import defaultdict
|
19 |
+
|
20 |
+
# hack for python2/3 compatibility
|
21 |
+
from io import open
|
22 |
+
argparse.open = open
|
23 |
+
|
24 |
+
def create_parser():
    """Build the command-line parser for the chrF scoring script.

    Returns an argparse.ArgumentParser with the options --ref (required),
    --hyp (defaults to stdin), --beta, --ngram, --space, --precision and
    --recall.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # describe this tool, not the BPE learner
        description="compute chrF (character n-gram F-score) for machine translation evaluation")

    parser.add_argument(
        '--ref', '-r', type=argparse.FileType('r'), required=True,
        metavar='PATH',
        help="Reference file")
    parser.add_argument(
        '--hyp', type=argparse.FileType('r'), metavar='PATH',
        default=sys.stdin,
        help="Hypothesis file (default: stdin).")
    parser.add_argument(
        '--beta', '-b', type=float, default=3,
        metavar='FLOAT',
        help="beta parameter (default: '%(default)s')")
    parser.add_argument(
        '--ngram', '-n', type=int, default=6,
        metavar='INT',
        help="ngram order (default: '%(default)s')")
    parser.add_argument(
        '--space', '-s', action='store_true',
        help="take spaces into account (default: '%(default)s')")
    parser.add_argument(
        '--precision', action='store_true',
        help="report precision (default: '%(default)s')")
    parser.add_argument(
        '--recall', action='store_true',
        help="report recall (default: '%(default)s')")

    return parser
|
56 |
+
|
57 |
+
def extract_ngrams(words, max_length=4, spaces=False):
    """Count the character n-grams of a line for orders 1..max_length.

    When spaces is false all whitespace is removed first; otherwise only
    leading and trailing whitespace is stripped. The result is a nested
    defaultdict: result[k] maps each (k+1)-character tuple to its count.
    """
    text = words.strip() if spaces else ''.join(words.split())
    n_chars = len(text)

    counts = defaultdict(lambda: defaultdict(int))
    for order in range(1, max_length + 1):
        # the last valid window of this order starts at n_chars - order
        for begin in range(n_chars - order + 1):
            counts[order - 1][tuple(text[begin:begin + order])] += 1
    return counts
|
71 |
+
|
72 |
+
|
73 |
+
def get_correct(ngrams_ref, ngrams_test, correct, total):
    """Accumulate hypothesis n-gram totals and clipped matches per order.

    For every n-gram in ngrams_test its count is added to total[rank]; if
    the n-gram also occurs in ngrams_ref[rank], the smaller of the two
    counts is added to correct[rank]. Both lists are updated in place and
    returned as the tuple (correct, total).
    """
    for rank, hyp_counts in ngrams_test.items():
        for chain, hyp_count in hyp_counts.items():
            total[rank] += hyp_count
            ref_counts = ngrams_ref[rank]
            if chain in ref_counts:
                # a match is credited at most as often as the reference has it
                correct[rank] += min(hyp_count, ref_counts[chain])

    return correct, total
|
82 |
+
|
83 |
+
|
84 |
+
def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0):
    """Compute the F-beta score from per-order n-gram statistics.

    correct, total_hyp and total_ref are indexable by n-gram order index
    (0 .. max_length-1). Precision and recall are computed per order,
    summed over the orders where both smoothed totals are non-zero, and
    divided by max_length. Despite the name, the score is the general
    F-beta measure, weighted by beta.

    Returns the tuple (f_score, precision, recall). The score is 0.0 when
    precision and recall are both zero, instead of dividing by zero.
    """
    precision = 0
    recall = 0

    for i in range(max_length):
        # orders with no n-grams on either side contribute nothing
        if total_hyp[i] + smooth and total_ref[i] + smooth:
            precision += (correct[i] + smooth) / (total_hyp[i] + smooth)
            recall += (correct[i] + smooth) / (total_ref[i] + smooth)

    precision /= max_length
    recall /= max_length

    denominator = (beta**2 * precision) + recall
    if not denominator:
        # no overlap at all (or empty input): the score is defined as zero
        return 0.0, precision, recall

    return (1 + beta**2) * (precision*recall) / denominator, precision, recall
|
98 |
+
|
99 |
+
def main(args):
    """Score the hypothesis file against the reference file and print chrF.

    Reads args.ref line by line, pairing each line with the next line read
    from args.hyp, accumulates n-gram statistics up to order args.ngram,
    and prints the score (plus precision and/or recall when requested).
    """
    order = args.ngram
    correct = [0] * order
    total = [0] * order
    total_ref = [0] * order

    for ref_line in args.ref:
        hyp_line = args.hyp.readline()

        ngrams_ref = extract_ngrams(ref_line, max_length=order, spaces=args.space)
        ngrams_test = extract_ngrams(hyp_line, max_length=order, spaces=args.space)

        get_correct(ngrams_ref, ngrams_test, correct, total)

        # reference totals are needed for recall
        for rank, ref_counts in ngrams_ref.items():
            total_ref[rank] += sum(ref_counts.values())

    chrf, precision, recall = f1(correct, total, total_ref, order, args.beta)

    print('chrF3: {0:.4f}'.format(chrf))
    if args.precision:
        print('chrPrec: {0:.4f}'.format(precision))
    if args.recall:
        print('chrRec: {0:.4f}'.format(recall))
|
123 |
+
|
124 |
+
if __name__ == '__main__':

    # python 2/3 compatibility: force UTF-8 on the standard streams
    if sys.version_info >= (3, 0):
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    # parse the command line and run the scorer
    parser = create_parser()
    args = parser.parse_args()

    main(args)
|
subword/dataset/codec.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
subword/encoding.ipynb
ADDED
@@ -0,0 +1,700 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 11,
|
6 |
+
"id": "9644db35",
|
7 |
+
"metadata": {
|
8 |
+
"scrolled": true
|
9 |
+
},
|
10 |
+
"outputs": [
|
11 |
+
{
|
12 |
+
"name": "stderr",
|
13 |
+
"output_type": "stream",
|
14 |
+
"text": [
|
15 |
+
"\n",
|
16 |
+
" 0%| | 0/20000 [00:00<?, ?it/s]\n",
|
17 |
+
" 0%| | 1/20000 [00:00<38:40, 8.62it/s]\n",
|
18 |
+
" 0%| | 2/20000 [00:00<1:31:59, 3.62it/s]\n",
|
19 |
+
" 0%| | 3/20000 [00:00<1:21:11, 4.11it/s]\n",
|
20 |
+
" 0%| | 4/20000 [00:01<1:48:20, 3.08it/s]\n",
|
21 |
+
" 0%| | 6/20000 [00:01<1:03:27, 5.25it/s]\n",
|
22 |
+
" 0%| | 7/20000 [00:01<1:12:17, 4.61it/s]\n",
|
23 |
+
" 0%| | 8/20000 [00:01<1:10:13, 4.74it/s]\n",
|
24 |
+
" 0%| | 10/20000 [00:02<1:09:39, 4.78it/s]\n",
|
25 |
+
" 0%| | 13/20000 [00:02<42:59, 7.75it/s] \n",
|
26 |
+
" 0%| | 16/20000 [00:02<30:25, 10.95it/s]\n",
|
27 |
+
" 0%| | 19/20000 [00:02<24:50, 13.41it/s]\n",
|
28 |
+
" 0%| | 21/20000 [00:03<36:58, 9.01it/s]\n",
|
29 |
+
" 0%| | 23/20000 [00:03<35:20, 9.42it/s]\n",
|
30 |
+
" 0%| | 25/20000 [00:03<31:30, 10.56it/s]\n",
|
31 |
+
" 0%| | 27/20000 [00:03<33:12, 10.03it/s]\n",
|
32 |
+
" 0%| | 29/20000 [00:03<33:30, 9.93it/s]\n",
|
33 |
+
" 0%| | 31/20000 [00:03<35:06, 9.48it/s]\n",
|
34 |
+
" 0%| | 33/20000 [00:04<37:03, 8.98it/s]\n",
|
35 |
+
" 0%| | 37/20000 [00:04<26:59, 12.32it/s]\n",
|
36 |
+
" 0%| | 39/20000 [00:04<26:54, 12.37it/s]\n",
|
37 |
+
" 0%| | 42/20000 [00:04<22:51, 14.55it/s]\n",
|
38 |
+
" 0%| | 46/20000 [00:04<19:15, 17.27it/s]\n",
|
39 |
+
" 0%| | 48/20000 [00:05<22:34, 14.73it/s]\n",
|
40 |
+
" 0%| | 50/20000 [00:05<23:39, 14.06it/s]\n",
|
41 |
+
" 0%| | 52/20000 [00:05<23:30, 14.14it/s]\n",
|
42 |
+
" 0%| | 55/20000 [00:05<20:12, 16.45it/s]\n",
|
43 |
+
" 0%| | 58/20000 [00:05<19:09, 17.35it/s]\n",
|
44 |
+
" 0%| | 60/20000 [00:05<19:25, 17.11it/s]\n",
|
45 |
+
" 0%| | 63/20000 [00:06<28:45, 11.56it/s]\n",
|
46 |
+
" 0%| | 70/20000 [00:06<16:14, 20.45it/s]\n",
|
47 |
+
" 0%| | 75/20000 [00:06<14:15, 23.28it/s]\n",
|
48 |
+
" 0%| | 78/20000 [00:06<14:28, 22.94it/s]\n",
|
49 |
+
" 0%| | 81/20000 [00:06<16:06, 20.62it/s]\n",
|
50 |
+
" 0%| | 85/20000 [00:06<15:13, 21.81it/s]\n",
|
51 |
+
" 0%| | 88/20000 [00:07<15:39, 21.20it/s]\n",
|
52 |
+
" 0%| | 92/20000 [00:07<13:28, 24.61it/s]\n",
|
53 |
+
" 0%| | 95/20000 [00:07<13:32, 24.50it/s]\n",
|
54 |
+
" 0%| | 99/20000 [00:07<12:06, 27.40it/s]\n",
|
55 |
+
" 1%| | 102/20000 [00:07<12:55, 25.65it/s]\n",
|
56 |
+
" 1%| | 105/20000 [00:07<12:46, 25.95it/s]\n",
|
57 |
+
" 1%| | 112/20000 [00:07<12:02, 27.54it/s]\n",
|
58 |
+
" 1%| | 118/20000 [00:08<10:02, 33.00it/s]\n",
|
59 |
+
" 1%| | 122/20000 [00:08<10:12, 32.46it/s]\n",
|
60 |
+
" 1%| | 127/20000 [00:08<10:07, 32.73it/s]\n",
|
61 |
+
" 1%| | 138/20000 [00:08<06:56, 47.66it/s]\n",
|
62 |
+
" 1%| | 144/20000 [00:08<06:47, 48.73it/s]\n",
|
63 |
+
" 1%| | 150/20000 [00:08<07:00, 47.21it/s]\n",
|
64 |
+
" 1%| | 156/20000 [00:08<06:44, 49.01it/s]\n",
|
65 |
+
" 1%| | 162/20000 [00:08<06:47, 48.71it/s]\n",
|
66 |
+
" 1%| | 169/20000 [00:09<06:38, 49.81it/s]\n",
|
67 |
+
" 1%| | 176/20000 [00:09<06:28, 51.03it/s]\n",
|
68 |
+
" 1%| | 184/20000 [00:09<05:58, 55.22it/s]\n",
|
69 |
+
" 1%| | 190/20000 [00:09<08:05, 40.78it/s]\n",
|
70 |
+
" 1%| | 197/20000 [00:09<07:30, 43.91it/s]\n",
|
71 |
+
" 1%|1 | 202/20000 [00:09<07:53, 41.79it/s]\n",
|
72 |
+
" 1%|1 | 210/20000 [00:10<06:51, 48.15it/s]\n",
|
73 |
+
" 1%|1 | 220/20000 [00:10<05:35, 59.01it/s]\n",
|
74 |
+
" 1%|1 | 229/20000 [00:10<05:05, 64.72it/s]\n",
|
75 |
+
" 1%|1 | 236/20000 [00:10<05:37, 58.47it/s]\n",
|
76 |
+
" 1%|1 | 244/20000 [00:10<05:22, 61.18it/s]\n",
|
77 |
+
" 1%|1 | 251/20000 [00:10<05:31, 59.65it/s]\n",
|
78 |
+
" 1%|1 | 259/20000 [00:10<05:11, 63.33it/s]\n",
|
79 |
+
" 1%|1 | 266/20000 [00:10<05:25, 60.70it/s]\n",
|
80 |
+
" 1%|1 | 273/20000 [00:10<05:31, 59.42it/s]\n",
|
81 |
+
" 1%|1 | 282/20000 [00:11<04:57, 66.26it/s]\n",
|
82 |
+
" 1%|1 | 289/20000 [00:11<05:00, 65.52it/s]\n",
|
83 |
+
" 1%|1 | 296/20000 [00:11<05:10, 63.47it/s]\n",
|
84 |
+
" 2%|1 | 303/20000 [00:11<07:06, 46.23it/s]\n",
|
85 |
+
" 2%|1 | 313/20000 [00:11<05:41, 57.60it/s]\n",
|
86 |
+
" 2%|1 | 324/20000 [00:11<04:54, 66.87it/s]\n",
|
87 |
+
" 2%|1 | 335/20000 [00:11<04:16, 76.56it/s]\n",
|
88 |
+
" 2%|1 | 346/20000 [00:11<03:50, 85.09it/s]\n",
|
89 |
+
" 2%|1 | 357/20000 [00:12<03:38, 90.06it/s]\n",
|
90 |
+
" 2%|1 | 367/20000 [00:12<03:46, 86.85it/s]\n",
|
91 |
+
" 2%|1 | 377/20000 [00:12<03:41, 88.70it/s]\n",
|
92 |
+
" 2%|1 | 387/20000 [00:12<03:43, 87.95it/s]\n",
|
93 |
+
" 2%|1 | 396/20000 [00:12<03:49, 85.48it/s]\n",
|
94 |
+
" 2%|2 | 405/20000 [00:12<04:04, 80.15it/s]\n",
|
95 |
+
" 2%|2 | 416/20000 [00:12<03:42, 87.82it/s]\n",
|
96 |
+
" 2%|2 | 429/20000 [00:12<03:19, 98.03it/s]\n",
|
97 |
+
" 2%|2 | 439/20000 [00:13<03:35, 90.74it/s]\n",
|
98 |
+
" 2%|2 | 450/20000 [00:13<03:24, 95.65it/s]\n",
|
99 |
+
" 2%|2 | 462/20000 [00:13<03:13, 100.98it/s]\n",
|
100 |
+
" 2%|2 | 473/20000 [00:13<03:16, 99.60it/s] \n",
|
101 |
+
" 2%|2 | 484/20000 [00:13<03:28, 93.64it/s]\n",
|
102 |
+
" 2%|2 | 494/20000 [00:13<03:30, 92.86it/s]\n",
|
103 |
+
" 3%|2 | 504/20000 [00:13<04:34, 70.99it/s]\n",
|
104 |
+
" 3%|2 | 520/20000 [00:13<03:34, 90.96it/s]\n",
|
105 |
+
" 3%|2 | 534/20000 [00:14<03:12, 101.38it/s]\n",
|
106 |
+
" 3%|2 | 547/20000 [00:14<03:01, 107.03it/s]\n",
|
107 |
+
" 3%|2 | 559/20000 [00:14<03:02, 106.25it/s]\n",
|
108 |
+
" 3%|2 | 571/20000 [00:14<03:14, 99.81it/s] \n",
|
109 |
+
" 3%|2 | 582/20000 [00:14<03:13, 100.11it/s]\n",
|
110 |
+
" 3%|2 | 595/20000 [00:14<03:03, 105.49it/s]\n",
|
111 |
+
" 3%|3 | 606/20000 [00:14<03:07, 103.63it/s]\n",
|
112 |
+
" 3%|3 | 625/20000 [00:14<02:33, 126.08it/s]\n",
|
113 |
+
" 3%|3 | 643/20000 [00:14<02:17, 140.29it/s]\n",
|
114 |
+
" 3%|3 | 658/20000 [00:15<02:23, 135.01it/s]\n",
|
115 |
+
" 3%|3 | 672/20000 [00:15<02:32, 126.59it/s]\n",
|
116 |
+
" 3%|3 | 685/20000 [00:15<02:42, 119.19it/s]\n",
|
117 |
+
" 3%|3 | 698/20000 [00:15<02:46, 116.22it/s]\n",
|
118 |
+
" 4%|3 | 710/20000 [00:15<02:49, 113.91it/s]\n",
|
119 |
+
" 4%|3 | 727/20000 [00:15<02:31, 127.58it/s]\n",
|
120 |
+
" 4%|3 | 744/20000 [00:15<02:18, 139.24it/s]\n",
|
121 |
+
" 4%|3 | 759/20000 [00:15<03:10, 101.19it/s]\n",
|
122 |
+
" 4%|3 | 771/20000 [00:16<03:03, 104.67it/s]\n",
|
123 |
+
" 4%|3 | 783/20000 [00:16<03:02, 105.07it/s]\n",
|
124 |
+
" 4%|3 | 795/20000 [00:16<03:14, 98.92it/s] \n",
|
125 |
+
" 4%|4 | 807/20000 [00:16<03:06, 102.82it/s]\n",
|
126 |
+
" 4%|4 | 822/20000 [00:16<02:50, 112.34it/s]\n",
|
127 |
+
" 4%|4 | 834/20000 [00:16<02:55, 109.45it/s]\n",
|
128 |
+
" 4%|4 | 847/20000 [00:16<02:47, 114.28it/s]\n",
|
129 |
+
" 4%|4 | 860/20000 [00:16<02:42, 117.94it/s]\n",
|
130 |
+
" 4%|4 | 873/20000 [00:16<02:46, 114.58it/s]\n",
|
131 |
+
" 4%|4 | 885/20000 [00:17<02:58, 106.97it/s]\n",
|
132 |
+
" 4%|4 | 896/20000 [00:17<03:07, 102.08it/s]\n",
|
133 |
+
" 5%|4 | 908/20000 [00:17<03:01, 105.42it/s]\n",
|
134 |
+
" 5%|4 | 924/20000 [00:17<02:42, 117.67it/s]\n",
|
135 |
+
" 5%|4 | 940/20000 [00:17<02:28, 128.26it/s]\n",
|
136 |
+
" 5%|4 | 954/20000 [00:17<02:24, 131.54it/s]\n",
|
137 |
+
" 5%|4 | 968/20000 [00:17<02:34, 123.37it/s]\n",
|
138 |
+
" 5%|4 | 982/20000 [00:17<02:31, 125.85it/s]\n",
|
139 |
+
" 5%|4 | 995/20000 [00:18<02:39, 119.06it/s]\n",
|
140 |
+
" 5%|5 | 1008/20000 [00:18<03:38, 86.92it/s]\n",
|
141 |
+
" 5%|5 | 1024/20000 [00:18<03:04, 102.72it/s]\n",
|
142 |
+
" 5%|5 | 1041/20000 [00:18<02:40, 118.03it/s]\n",
|
143 |
+
" 5%|5 | 1055/20000 [00:18<02:34, 122.63it/s]\n",
|
144 |
+
" 5%|5 | 1069/20000 [00:18<02:31, 124.89it/s]\n",
|
145 |
+
" 5%|5 | 1083/20000 [00:18<02:34, 122.68it/s]\n",
|
146 |
+
" 5%|5 | 1096/20000 [00:18<02:39, 118.60it/s]\n",
|
147 |
+
" 6%|5 | 1110/20000 [00:19<02:32, 123.65it/s]\n",
|
148 |
+
" 6%|5 | 1127/20000 [00:19<02:18, 136.02it/s]\n",
|
149 |
+
" 6%|5 | 1145/20000 [00:19<02:08, 146.74it/s]\n",
|
150 |
+
" 6%|5 | 1161/20000 [00:19<02:06, 148.82it/s]\n",
|
151 |
+
" 6%|5 | 1177/20000 [00:19<02:04, 151.15it/s]\n",
|
152 |
+
" 6%|5 | 1193/20000 [00:19<02:09, 145.70it/s]\n",
|
153 |
+
" 6%|6 | 1208/20000 [00:19<02:10, 144.47it/s]\n",
|
154 |
+
" 6%|6 | 1227/20000 [00:19<02:00, 156.04it/s]\n",
|
155 |
+
" 6%|6 | 1244/20000 [00:19<01:57, 159.13it/s]\n",
|
156 |
+
" 6%|6 | 1261/20000 [00:19<02:01, 154.24it/s]\n",
|
157 |
+
" 6%|6 | 1277/20000 [00:20<02:09, 145.11it/s]\n",
|
158 |
+
" 6%|6 | 1292/20000 [00:20<02:10, 143.32it/s]\n",
|
159 |
+
" 7%|6 | 1307/20000 [00:20<03:02, 102.65it/s]\n",
|
160 |
+
" 7%|6 | 1330/20000 [00:20<02:23, 130.14it/s]\n",
|
161 |
+
" 7%|6 | 1348/20000 [00:20<02:13, 139.97it/s]\n",
|
162 |
+
" 7%|6 | 1368/20000 [00:20<02:02, 152.70it/s]\n",
|
163 |
+
" 7%|6 | 1385/20000 [00:20<02:00, 153.99it/s]\n",
|
164 |
+
" 7%|7 | 1402/20000 [00:21<02:07, 146.16it/s]\n",
|
165 |
+
" 7%|7 | 1423/20000 [00:21<01:55, 161.53it/s]\n",
|
166 |
+
" 7%|7 | 1441/20000 [00:21<01:52, 165.17it/s]\n",
|
167 |
+
" 7%|7 | 1459/20000 [00:21<01:55, 160.82it/s]\n",
|
168 |
+
" 7%|7 | 1476/20000 [00:21<02:03, 149.82it/s]\n",
|
169 |
+
" 7%|7 | 1492/20000 [00:21<02:08, 143.79it/s]\n",
|
170 |
+
" 8%|7 | 1507/20000 [00:21<02:10, 142.06it/s]\n",
|
171 |
+
" 8%|7 | 1530/20000 [00:21<01:52, 164.72it/s]\n",
|
172 |
+
" 8%|7 | 1548/20000 [00:21<01:50, 167.09it/s]\n",
|
173 |
+
" 8%|7 | 1565/20000 [00:22<01:49, 167.90it/s]\n",
|
174 |
+
" 8%|7 | 1582/20000 [00:22<01:53, 161.57it/s]\n",
|
175 |
+
" 8%|7 | 1599/20000 [00:22<01:56, 158.15it/s]\n",
|
176 |
+
" 8%|8 | 1617/20000 [00:22<01:51, 164.25it/s]\n",
|
177 |
+
" 8%|8 | 1637/20000 [00:22<01:45, 174.45it/s]\n",
|
178 |
+
" 8%|8 | 1657/20000 [00:22<01:41, 181.32it/s]\n",
|
179 |
+
" 8%|8 | 1676/20000 [00:22<01:40, 182.25it/s]\n",
|
180 |
+
" 8%|8 | 1695/20000 [00:22<01:46, 171.94it/s]\n",
|
181 |
+
" 9%|8 | 1718/20000 [00:22<01:38, 186.12it/s]\n",
|
182 |
+
" 9%|8 | 1739/20000 [00:22<01:34, 192.48it/s]\n",
|
183 |
+
" 9%|8 | 1759/20000 [00:23<02:13, 136.76it/s]\n",
|
184 |
+
" 9%|8 | 1777/20000 [00:23<02:04, 145.80it/s]\n",
|
185 |
+
" 9%|8 | 1794/20000 [00:23<02:04, 146.68it/s]\n",
|
186 |
+
" 9%|9 | 1814/20000 [00:23<01:53, 159.63it/s]\n",
|
187 |
+
" 9%|9 | 1836/20000 [00:23<01:43, 175.04it/s]\n",
|
188 |
+
" 9%|9 | 1856/20000 [00:23<01:41, 179.30it/s]\n",
|
189 |
+
" 9%|9 | 1875/20000 [00:23<01:42, 176.01it/s]\n",
|
190 |
+
" 9%|9 | 1894/20000 [00:23<01:45, 171.34it/s]\n",
|
191 |
+
" 10%|9 | 1915/20000 [00:24<01:39, 180.93it/s]\n",
|
192 |
+
" 10%|9 | 1937/20000 [00:24<01:34, 190.79it/s]\n",
|
193 |
+
" 10%|9 | 1957/20000 [00:24<01:35, 189.63it/s]\n",
|
194 |
+
" 10%|9 | 1977/20000 [00:24<01:36, 186.73it/s]\n",
|
195 |
+
" 10%|9 | 1996/20000 [00:24<01:42, 175.72it/s]\n",
|
196 |
+
" 10%|# | 2018/20000 [00:24<01:35, 187.87it/s]\n",
|
197 |
+
" 10%|# | 2046/20000 [00:24<01:24, 212.03it/s]\n",
|
198 |
+
" 10%|# | 2068/20000 [00:24<01:27, 204.39it/s]\n",
|
199 |
+
" 10%|# | 2089/20000 [00:24<01:31, 195.56it/s]\n",
|
200 |
+
" 11%|# | 2109/20000 [00:25<01:33, 192.02it/s]\n",
|
201 |
+
" 11%|# | 2140/20000 [00:25<01:19, 224.10it/s]\n",
|
202 |
+
" 11%|# | 2165/20000 [00:25<01:17, 230.78it/s]\n",
|
203 |
+
" 11%|# | 2189/20000 [00:25<01:18, 225.64it/s]\n",
|
204 |
+
" 11%|#1 | 2212/20000 [00:25<01:24, 210.15it/s]\n",
|
205 |
+
" 11%|#1 | 2236/20000 [00:25<01:21, 217.71it/s]\n",
|
206 |
+
" 11%|#1 | 2259/20000 [00:25<01:22, 215.12it/s]\n",
|
207 |
+
" 11%|#1 | 2281/20000 [00:25<01:24, 208.87it/s]\n",
|
208 |
+
" 12%|#1 | 2303/20000 [00:25<01:35, 185.14it/s]\n",
|
209 |
+
" 12%|#1 | 2333/20000 [00:26<01:22, 213.67it/s]\n",
|
210 |
+
" 12%|#1 | 2357/20000 [00:26<01:19, 220.73it/s]\n",
|
211 |
+
" 12%|#1 | 2380/20000 [00:26<01:21, 214.95it/s]\n",
|
212 |
+
" 12%|#2 | 2402/20000 [00:26<02:03, 142.71it/s]\n",
|
213 |
+
" 12%|#2 | 2432/20000 [00:26<01:40, 174.08it/s]\n",
|
214 |
+
" 12%|#2 | 2459/20000 [00:26<01:29, 195.81it/s]\n",
|
215 |
+
" 12%|#2 | 2482/20000 [00:26<01:28, 198.82it/s]\n",
|
216 |
+
" 13%|#2 | 2505/20000 [00:27<01:29, 195.33it/s]\n",
|
217 |
+
" 13%|#2 | 2538/20000 [00:27<01:16, 228.52it/s]\n",
|
218 |
+
" 13%|#2 | 2566/20000 [00:27<01:11, 242.22it/s]\n",
|
219 |
+
" 13%|#2 | 2592/20000 [00:27<01:15, 230.01it/s]\n",
|
220 |
+
" 13%|#3 | 2620/20000 [00:27<01:11, 243.40it/s]\n",
|
221 |
+
" 13%|#3 | 2651/20000 [00:27<01:06, 261.84it/s]\n",
|
222 |
+
" 13%|#3 | 2678/20000 [00:27<01:06, 260.46it/s]\n",
|
223 |
+
" 14%|#3 | 2705/20000 [00:27<01:08, 252.37it/s]\n",
|
224 |
+
" 14%|#3 | 2740/20000 [00:27<01:02, 278.24it/s]\n",
|
225 |
+
" 14%|#3 | 2769/20000 [00:27<01:05, 264.95it/s]\n",
|
226 |
+
" 14%|#3 | 2796/20000 [00:28<01:09, 247.16it/s]\n",
|
227 |
+
" 14%|#4 | 2828/20000 [00:28<01:04, 264.60it/s]\n",
|
228 |
+
" 14%|#4 | 2855/20000 [00:28<01:05, 260.34it/s]\n",
|
229 |
+
" 14%|#4 | 2882/20000 [00:28<01:09, 247.20it/s]\n",
|
230 |
+
" 15%|#4 | 2908/20000 [00:28<01:12, 236.53it/s]\n",
|
231 |
+
" 15%|#4 | 2952/20000 [00:28<00:58, 291.10it/s]\n",
|
232 |
+
" 15%|#4 | 2982/20000 [00:28<01:03, 266.27it/s]\n",
|
233 |
+
" 15%|#5 | 3010/20000 [00:28<01:03, 267.07it/s]\n",
|
234 |
+
" 15%|#5 | 3039/20000 [00:29<01:02, 270.37it/s]\n",
|
235 |
+
" 15%|#5 | 3068/20000 [00:29<01:01, 273.53it/s]\n",
|
236 |
+
" 15%|#5 | 3096/20000 [00:29<01:04, 263.45it/s]\n",
|
237 |
+
" 16%|#5 | 3129/20000 [00:29<00:59, 281.96it/s]\n",
|
238 |
+
" 16%|#5 | 3160/20000 [00:29<00:58, 287.48it/s]\n",
|
239 |
+
" 16%|#5 | 3190/20000 [00:29<01:00, 279.05it/s]\n",
|
240 |
+
" 16%|#6 | 3226/20000 [00:29<00:55, 301.05it/s]\n",
|
241 |
+
" 16%|#6 | 3257/20000 [00:29<00:55, 303.61it/s]\n",
|
242 |
+
" 16%|#6 | 3288/20000 [00:29<00:56, 293.52it/s]\n",
|
243 |
+
" 17%|#6 | 3318/20000 [00:29<00:56, 293.68it/s]\n",
|
244 |
+
" 17%|#6 | 3357/20000 [00:30<00:52, 318.68it/s]\n",
|
245 |
+
" 17%|#6 | 3390/20000 [00:30<00:58, 284.80it/s]\n",
|
246 |
+
" 17%|#7 | 3420/20000 [00:30<01:21, 204.06it/s]\n",
|
247 |
+
" 17%|#7 | 3459/20000 [00:30<01:08, 242.62it/s]\n",
|
248 |
+
" 17%|#7 | 3491/20000 [00:30<01:03, 260.00it/s]\n",
|
249 |
+
" 18%|#7 | 3535/20000 [00:30<00:54, 304.04it/s]\n",
|
250 |
+
" 18%|#7 | 3573/20000 [00:30<00:50, 323.92it/s]\n",
|
251 |
+
" 18%|#8 | 3608/20000 [00:31<00:55, 296.34it/s]\n",
|
252 |
+
" 18%|#8 | 3653/20000 [00:31<00:48, 336.01it/s]\n",
|
253 |
+
" 18%|#8 | 3689/20000 [00:31<00:49, 329.16it/s]\n",
|
254 |
+
" 19%|#8 | 3733/20000 [00:31<00:45, 358.11it/s]\n",
|
255 |
+
" 19%|#8 | 3771/20000 [00:31<00:44, 361.17it/s]\n",
|
256 |
+
" 19%|#9 | 3809/20000 [00:31<00:47, 342.31it/s]\n",
|
257 |
+
" 19%|#9 | 3861/20000 [00:31<00:41, 390.94it/s]\n",
|
258 |
+
" 20%|#9 | 3902/20000 [00:31<00:42, 378.22it/s]\n",
|
259 |
+
" 20%|#9 | 3968/20000 [00:31<00:35, 455.02it/s]\n",
|
260 |
+
" 20%|## | 4015/20000 [00:32<00:37, 427.77it/s]\n",
|
261 |
+
" 20%|## | 4066/20000 [00:32<00:35, 449.03it/s]\n",
|
262 |
+
" 21%|## | 4112/20000 [00:32<00:39, 404.45it/s]\n",
|
263 |
+
" 21%|## | 4174/20000 [00:32<00:34, 458.89it/s]\n",
|
264 |
+
" 21%|##1 | 4222/20000 [00:32<00:35, 442.90it/s]\n",
|
265 |
+
" 21%|##1 | 4271/20000 [00:32<00:34, 454.41it/s]\n",
|
266 |
+
" 22%|##1 | 4329/20000 [00:32<00:32, 489.36it/s]\n",
|
267 |
+
" 22%|##1 | 4387/20000 [00:32<00:30, 515.14it/s]\n",
|
268 |
+
" 22%|##2 | 4447/20000 [00:32<00:28, 538.10it/s]\n",
|
269 |
+
" 23%|##2 | 4502/20000 [00:33<00:32, 478.73it/s]\n",
|
270 |
+
" 23%|##2 | 4563/20000 [00:33<00:30, 512.67it/s]\n",
|
271 |
+
" 23%|##3 | 4616/20000 [00:33<00:30, 496.81it/s]\n",
|
272 |
+
" 23%|##3 | 4677/20000 [00:33<00:29, 527.98it/s]\n",
|
273 |
+
" 24%|##3 | 4733/20000 [00:33<00:28, 537.01it/s]\n",
|
274 |
+
" 24%|##3 | 4788/20000 [00:33<00:28, 534.59it/s]\n",
|
275 |
+
" 24%|##4 | 4864/20000 [00:33<00:25, 599.65it/s]\n",
|
276 |
+
" 25%|##4 | 4925/20000 [00:33<00:25, 595.70it/s]\n",
|
277 |
+
" 25%|##4 | 4994/20000 [00:33<00:24, 617.81it/s]\n",
|
278 |
+
" 25%|##5 | 5079/20000 [00:33<00:21, 683.71it/s]\n",
|
279 |
+
" 26%|##5 | 5148/20000 [00:34<00:35, 419.97it/s]\n",
|
280 |
+
" 26%|##6 | 5203/20000 [00:34<00:33, 446.58it/s]\n",
|
281 |
+
" 26%|##6 | 5289/20000 [00:34<00:27, 538.90it/s]\n",
|
282 |
+
" 27%|##6 | 5377/20000 [00:34<00:23, 622.07it/s]\n",
|
283 |
+
" 27%|##7 | 5471/20000 [00:34<00:20, 703.42it/s]\n",
|
284 |
+
" 28%|##7 | 5549/20000 [00:36<01:35, 150.73it/s]\n",
|
285 |
+
" 28%|##8 | 5606/20000 [00:36<01:37, 147.12it/s]\n",
|
286 |
+
" 28%|##8 | 5650/20000 [00:36<01:34, 151.57it/s]\n",
|
287 |
+
" 28%|##8 | 5686/20000 [00:37<01:33, 153.50it/s]\n",
|
288 |
+
" 29%|##8 | 5716/20000 [00:37<01:32, 154.45it/s]\n",
|
289 |
+
" 29%|##8 | 5742/20000 [00:37<01:29, 158.75it/s]\n",
|
290 |
+
" 29%|##8 | 5766/20000 [00:37<01:28, 160.05it/s]\n",
|
291 |
+
" 29%|##8 | 5788/20000 [00:37<01:29, 159.44it/s]\n",
|
292 |
+
" 29%|##9 | 5808/20000 [00:37<01:29, 158.22it/s]\n",
|
293 |
+
" 29%|##9 | 5827/20000 [00:37<01:27, 162.78it/s]\n",
|
294 |
+
" 29%|##9 | 5846/20000 [00:38<01:25, 165.07it/s]\n",
|
295 |
+
" 29%|##9 | 5864/20000 [00:38<01:25, 164.71it/s]\n",
|
296 |
+
" 29%|##9 | 5882/20000 [00:38<01:26, 162.88it/s]\n",
|
297 |
+
" 29%|##9 | 5899/20000 [00:38<01:30, 155.66it/s]\n",
|
298 |
+
" 30%|##9 | 5916/20000 [00:38<01:29, 158.09it/s]\n",
|
299 |
+
" 30%|##9 | 5935/20000 [00:38<01:24, 166.09it/s]\n",
|
300 |
+
" 30%|##9 | 5954/20000 [00:38<01:22, 169.84it/s]\n",
|
301 |
+
" 30%|##9 | 5972/20000 [00:38<01:21, 171.23it/s]\n",
|
302 |
+
" 30%|##9 | 5990/20000 [00:38<01:22, 170.81it/s]\n",
|
303 |
+
" 30%|### | 6008/20000 [00:39<01:23, 167.70it/s]\n",
|
304 |
+
" 30%|### | 6027/20000 [00:39<01:20, 173.01it/s]\n",
|
305 |
+
" 30%|### | 6046/20000 [00:39<01:19, 175.35it/s]\n",
|
306 |
+
" 30%|### | 6064/20000 [00:39<01:20, 172.23it/s]\n",
|
307 |
+
" 30%|### | 6082/20000 [00:39<01:21, 170.55it/s]\n",
|
308 |
+
" 30%|### | 6100/20000 [00:39<01:23, 167.05it/s]\n",
|
309 |
+
" 31%|### | 6118/20000 [00:39<01:21, 170.70it/s]\n",
|
310 |
+
" 31%|### | 6138/20000 [00:39<01:17, 178.09it/s]\n",
|
311 |
+
" 31%|### | 6157/20000 [00:39<01:16, 179.96it/s]\n",
|
312 |
+
" 31%|### | 6176/20000 [00:39<01:18, 177.21it/s]\n",
|
313 |
+
" 31%|### | 6194/20000 [00:40<01:18, 174.99it/s]\n",
|
314 |
+
" 31%|###1 | 6212/20000 [00:40<01:19, 173.44it/s]\n",
|
315 |
+
" 31%|###1 | 6232/20000 [00:40<01:16, 180.04it/s]\n",
|
316 |
+
" 31%|###1 | 6251/20000 [00:40<01:16, 179.80it/s]\n",
|
317 |
+
" 31%|###1 | 6270/20000 [00:40<01:19, 172.28it/s]\n",
|
318 |
+
" 31%|###1 | 6288/20000 [00:40<01:20, 170.14it/s]\n",
|
319 |
+
" 32%|###1 | 6306/20000 [00:40<01:22, 165.92it/s]\n",
|
320 |
+
" 32%|###1 | 6327/20000 [00:40<01:16, 178.17it/s]\n",
|
321 |
+
" 32%|###1 | 6347/20000 [00:40<01:14, 183.85it/s]\n",
|
322 |
+
" 32%|###1 | 6366/20000 [00:41<01:14, 182.46it/s]\n",
|
323 |
+
" 32%|###1 | 6385/20000 [00:41<01:17, 175.53it/s]\n",
|
324 |
+
" 32%|###2 | 6403/20000 [00:41<01:21, 166.92it/s]\n",
|
325 |
+
" 32%|###2 | 6423/20000 [00:41<01:17, 174.57it/s]\n",
|
326 |
+
" 32%|###2 | 6443/20000 [00:41<01:15, 179.69it/s]\n",
|
327 |
+
" 32%|###2 | 6462/20000 [00:41<01:16, 178.06it/s]\n",
|
328 |
+
" 32%|###2 | 6480/20000 [00:41<01:17, 174.17it/s]\n",
|
329 |
+
" 32%|###2 | 6498/20000 [00:41<01:19, 170.04it/s]\n",
|
330 |
+
" 33%|###2 | 6517/20000 [00:41<01:16, 175.15it/s]\n",
|
331 |
+
" 33%|###2 | 6538/20000 [00:42<01:13, 184.06it/s]\n",
|
332 |
+
" 33%|###2 | 6558/20000 [00:42<01:11, 187.58it/s]\n",
|
333 |
+
" 33%|###2 | 6577/20000 [00:42<01:12, 183.99it/s]\n",
|
334 |
+
" 33%|###2 | 6596/20000 [00:42<01:14, 180.51it/s]\n",
|
335 |
+
" 33%|###3 | 6615/20000 [00:42<01:14, 180.64it/s]\n",
|
336 |
+
" 33%|###3 | 6636/20000 [00:42<01:11, 187.45it/s]\n",
|
337 |
+
" 33%|###3 | 6656/20000 [00:42<01:10, 189.43it/s]\n",
|
338 |
+
" 33%|###3 | 6675/20000 [00:42<01:11, 185.29it/s]\n",
|
339 |
+
" 33%|###3 | 6694/20000 [00:42<01:14, 177.91it/s]\n",
|
340 |
+
" 34%|###3 | 6712/20000 [00:42<01:15, 176.02it/s]\n",
|
341 |
+
" 34%|###3 | 6733/20000 [00:43<01:11, 185.68it/s]\n",
|
342 |
+
" 34%|###3 | 6752/20000 [00:43<01:10, 186.91it/s]\n",
|
343 |
+
" 34%|###3 | 6771/20000 [00:43<01:12, 183.53it/s]\n",
|
344 |
+
" 34%|###3 | 6790/20000 [00:43<01:15, 175.73it/s]\n",
|
345 |
+
" 34%|###4 | 6808/20000 [00:43<01:17, 170.68it/s]\n",
|
346 |
+
" 34%|###4 | 6828/20000 [00:43<01:13, 178.87it/s]\n",
|
347 |
+
" 34%|###4 | 6849/20000 [00:43<01:10, 186.16it/s]\n",
|
348 |
+
" 34%|###4 | 6868/20000 [00:43<01:10, 187.26it/s]\n",
|
349 |
+
" 34%|###4 | 6887/20000 [00:43<01:10, 185.36it/s]\n",
|
350 |
+
" 35%|###4 | 6906/20000 [00:44<01:12, 180.40it/s]\n",
|
351 |
+
" 35%|###4 | 6929/20000 [00:44<01:07, 193.50it/s]\n",
|
352 |
+
" 35%|###4 | 6950/20000 [00:44<01:05, 198.27it/s]\n",
|
353 |
+
" 35%|###4 | 6970/20000 [00:44<01:06, 197.04it/s]\n",
|
354 |
+
" 35%|###4 | 6990/20000 [00:44<01:08, 190.65it/s]\n",
|
355 |
+
" 35%|###5 | 7010/20000 [00:44<01:10, 184.33it/s]\n",
|
356 |
+
" 35%|###5 | 7029/20000 [00:44<01:10, 183.34it/s]\n",
|
357 |
+
" 35%|###5 | 7049/20000 [00:44<01:08, 188.06it/s]\n",
|
358 |
+
" 35%|###5 | 7068/20000 [00:44<01:08, 188.62it/s]\n",
|
359 |
+
" 35%|###5 | 7087/20000 [00:44<01:09, 184.71it/s]\n",
|
360 |
+
" 36%|###5 | 7106/20000 [00:45<01:11, 179.97it/s]\n",
|
361 |
+
" 36%|###5 | 7129/20000 [00:45<01:06, 192.60it/s]\n",
|
362 |
+
" 36%|###5 | 7151/20000 [00:45<01:04, 198.21it/s]\n",
|
363 |
+
" 36%|###5 | 7172/20000 [00:45<01:04, 198.74it/s]\n",
|
364 |
+
" 36%|###5 | 7192/20000 [00:45<01:04, 197.39it/s]\n",
|
365 |
+
" 36%|###6 | 7212/20000 [00:45<01:05, 195.87it/s]\n",
|
366 |
+
" 36%|###6 | 7235/20000 [00:45<01:02, 205.80it/s]\n",
|
367 |
+
" 36%|###6 | 7256/20000 [00:45<01:01, 205.81it/s]\n",
|
368 |
+
" 36%|###6 | 7277/20000 [00:45<01:03, 201.69it/s]\n",
|
369 |
+
" 36%|###6 | 7298/20000 [00:46<01:05, 193.96it/s]\n",
|
370 |
+
" 37%|###6 | 7320/20000 [00:46<01:03, 200.19it/s]\n",
|
371 |
+
" 37%|###6 | 7343/20000 [00:46<01:00, 208.13it/s]\n",
|
372 |
+
" 37%|###6 | 7364/20000 [00:46<01:00, 208.67it/s]\n",
|
373 |
+
" 37%|###6 | 7385/20000 [00:46<01:01, 204.27it/s]\n",
|
374 |
+
" 37%|###7 | 7406/20000 [00:46<01:03, 199.56it/s]\n",
|
375 |
+
" 37%|###7 | 7429/20000 [00:46<01:00, 207.10it/s]\n",
|
376 |
+
" 37%|###7 | 7450/20000 [00:46<01:00, 207.93it/s]\n",
|
377 |
+
" 37%|###7 | 7471/20000 [00:46<01:00, 207.32it/s]\n",
|
378 |
+
" 37%|###7 | 7492/20000 [00:46<01:01, 204.50it/s]\n",
|
379 |
+
" 38%|###7 | 7513/20000 [00:47<01:00, 205.49it/s]\n",
|
380 |
+
" 38%|###7 | 7537/20000 [00:47<00:57, 215.00it/s]\n",
|
381 |
+
" 38%|###7 | 7559/20000 [00:47<00:57, 215.82it/s]\n",
|
382 |
+
" 38%|###7 | 7581/20000 [00:47<00:58, 210.82it/s]\n",
|
383 |
+
" 38%|###8 | 7603/20000 [00:47<01:01, 200.64it/s]\n",
|
384 |
+
" 38%|###8 | 7627/20000 [00:47<00:58, 211.11it/s]\n",
|
385 |
+
" 38%|###8 | 7650/20000 [00:47<00:57, 215.27it/s]\n",
|
386 |
+
" 38%|###8 | 7672/20000 [00:47<00:58, 211.14it/s]\n",
|
387 |
+
" 38%|###8 | 7694/20000 [00:47<01:00, 203.74it/s]\n",
|
388 |
+
" 39%|###8 | 7716/20000 [00:48<00:59, 206.56it/s]\n",
|
389 |
+
" 39%|###8 | 7742/20000 [00:48<00:55, 219.27it/s]\n",
|
390 |
+
" 39%|###8 | 7765/20000 [00:48<00:55, 222.34it/s]\n",
|
391 |
+
" 39%|###8 | 7788/20000 [00:48<00:55, 220.09it/s]\n",
|
392 |
+
" 39%|###9 | 7811/20000 [00:48<00:56, 217.29it/s]\n",
|
393 |
+
" 39%|###9 | 7837/20000 [00:48<00:53, 227.59it/s]\n",
|
394 |
+
" 39%|###9 | 7860/20000 [00:48<00:53, 225.68it/s]\n",
|
395 |
+
" 39%|###9 | 7883/20000 [00:48<00:57, 210.45it/s]\n",
|
396 |
+
" 40%|###9 | 7905/20000 [00:48<00:59, 203.50it/s]\n",
|
397 |
+
" 40%|###9 | 7931/20000 [00:49<00:55, 218.40it/s]\n",
|
398 |
+
" 40%|###9 | 7955/20000 [00:49<00:53, 223.84it/s]\n",
|
399 |
+
" 40%|###9 | 7978/20000 [00:49<00:54, 222.42it/s]\n",
|
400 |
+
" 40%|#### | 8001/20000 [00:49<00:56, 211.24it/s]\n",
|
401 |
+
" 40%|#### | 8028/20000 [00:49<00:52, 226.96it/s]\n",
|
402 |
+
" 40%|#### | 8052/20000 [00:49<00:52, 229.36it/s]\n",
|
403 |
+
" 40%|#### | 8076/20000 [00:49<00:52, 226.55it/s]\n",
|
404 |
+
" 40%|#### | 8099/20000 [00:49<00:54, 217.59it/s]\n",
|
405 |
+
" 41%|#### | 8121/20000 [00:50<01:26, 136.56it/s]\n",
|
406 |
+
" 41%|#### | 8144/20000 [00:50<01:16, 154.69it/s]\n",
|
407 |
+
" 41%|#### | 8165/20000 [00:50<01:11, 165.14it/s]\n",
|
408 |
+
" 41%|#### | 8186/20000 [00:50<01:07, 174.19it/s]\n",
|
409 |
+
" 41%|####1 | 8206/20000 [00:50<01:05, 179.79it/s]\n",
|
410 |
+
" 41%|####1 | 8234/20000 [00:50<00:57, 205.18it/s]\n",
|
411 |
+
" 41%|####1 | 8259/20000 [00:50<00:54, 215.64it/s]\n",
|
412 |
+
" 41%|####1 | 8282/20000 [00:50<00:53, 219.03it/s]\n",
|
413 |
+
" 42%|####1 | 8305/20000 [00:50<00:55, 209.63it/s]\n",
|
414 |
+
" 42%|####1 | 8334/20000 [00:51<00:50, 229.98it/s]\n",
|
415 |
+
" 42%|####1 | 8359/20000 [00:51<00:49, 234.96it/s]\n",
|
416 |
+
" 42%|####1 | 8383/20000 [00:51<00:50, 230.45it/s]\n",
|
417 |
+
" 42%|####2 | 8407/20000 [00:51<00:52, 222.38it/s]\n",
|
418 |
+
" 42%|####2 | 8436/20000 [00:51<00:48, 240.62it/s]\n",
|
419 |
+
" 42%|####2 | 8461/20000 [00:51<00:47, 242.60it/s]\n",
|
420 |
+
" 42%|####2 | 8486/20000 [00:51<00:47, 239.88it/s]\n",
|
421 |
+
" 43%|####2 | 8511/20000 [00:51<00:48, 236.67it/s]\n",
|
422 |
+
" 43%|####2 | 8539/20000 [00:51<00:46, 247.65it/s]\n",
|
423 |
+
" 43%|####2 | 8564/20000 [00:51<00:46, 244.78it/s]\n",
|
424 |
+
" 43%|####2 | 8589/20000 [00:52<00:48, 236.04it/s]\n",
|
425 |
+
" 43%|####3 | 8613/20000 [00:52<00:48, 235.15it/s]\n",
|
426 |
+
" 43%|####3 | 8643/20000 [00:52<00:45, 252.29it/s]\n",
|
427 |
+
" 43%|####3 | 8669/20000 [00:52<00:45, 248.06it/s]\n",
|
428 |
+
" 43%|####3 | 8694/20000 [00:52<00:47, 240.33it/s]\n",
|
429 |
+
" 44%|####3 | 8720/20000 [00:52<00:46, 243.84it/s]\n",
|
430 |
+
" 44%|####3 | 8748/20000 [00:52<00:44, 254.21it/s]\n",
|
431 |
+
" 44%|####3 | 8777/20000 [00:52<00:42, 263.08it/s]\n",
|
432 |
+
" 44%|####4 | 8808/20000 [00:52<00:40, 276.78it/s]\n",
|
433 |
+
" 44%|####4 | 8856/20000 [00:53<00:33, 336.62it/s]\n",
|
434 |
+
" 44%|####4 | 8896/20000 [00:53<00:31, 353.29it/s]\n",
|
435 |
+
" 45%|####4 | 8955/20000 [00:53<00:26, 422.10it/s]\n",
|
436 |
+
" 45%|####5 | 9001/20000 [00:53<00:25, 425.76it/s]\n",
|
437 |
+
" 45%|####5 | 9070/20000 [00:53<00:21, 501.92it/s]\n",
|
438 |
+
" 46%|####5 | 9128/20000 [00:53<00:20, 523.43it/s]\n",
|
439 |
+
" 46%|####5 | 9183/20000 [00:53<00:20, 529.69it/s]\n",
|
440 |
+
" 46%|####6 | 9237/20000 [00:53<00:20, 531.16it/s]\n",
|
441 |
+
" 46%|####6 | 9291/20000 [00:53<00:21, 508.14it/s]\n",
|
442 |
+
" 47%|####6 | 9346/20000 [00:53<00:20, 520.17it/s]\n",
|
443 |
+
" 47%|####6 | 9399/20000 [00:54<00:20, 509.66it/s]\n",
|
444 |
+
" 47%|####7 | 9451/20000 [00:54<00:20, 509.69it/s]\n",
|
445 |
+
" 48%|####7 | 9503/20000 [00:54<00:21, 494.01it/s]\n",
|
446 |
+
" 48%|####7 | 9580/20000 [00:54<00:18, 569.61it/s]\n",
|
447 |
+
" 48%|####8 | 9649/20000 [00:54<00:17, 602.61it/s]\n",
|
448 |
+
" 49%|####8 | 9710/20000 [00:54<00:17, 597.78it/s]\n",
|
449 |
+
" 49%|####8 | 9792/20000 [00:54<00:15, 660.63it/s]\n",
|
450 |
+
" 49%|####9 | 9874/20000 [00:54<00:14, 707.37it/s]\n",
|
451 |
+
" 50%|####9 | 9946/20000 [00:54<00:14, 698.70it/s]\n",
|
452 |
+
" 50%|##### | 10029/20000 [00:54<00:13, 735.01it/s]\n",
|
453 |
+
" 51%|##### | 10103/20000 [00:55<00:14, 703.33it/s]\n",
|
454 |
+
" 51%|##### | 10187/20000 [00:55<00:13, 742.49it/s]\n",
|
455 |
+
" 51%|#####1 | 10275/20000 [00:55<00:12, 782.42it/s]\n",
|
456 |
+
" 52%|#####1 | 10372/20000 [00:55<00:11, 837.41it/s]\n",
|
457 |
+
" 52%|#####2 | 10463/20000 [00:55<00:11, 858.83it/s]\n",
|
458 |
+
" 53%|#####2 | 10550/20000 [00:55<00:10, 862.08it/s]\n",
|
459 |
+
" 53%|#####3 | 10640/20000 [00:55<00:10, 873.35it/s]\n",
|
460 |
+
" 54%|#####3 | 10728/20000 [00:55<00:10, 857.41it/s]\n",
|
461 |
+
" 54%|#####4 | 10815/20000 [00:55<00:10, 858.61it/s]\n",
|
462 |
+
" 55%|#####4 | 10902/20000 [00:56<00:10, 861.95it/s]\n",
|
463 |
+
" 55%|#####5 | 11034/20000 [00:56<00:08, 997.87it/s]\n",
|
464 |
+
" 56%|#####5 | 11179/20000 [00:56<00:07, 1132.48it/s]\n",
|
465 |
+
" 56%|#####6 | 11296/20000 [00:56<00:07, 1143.65it/s]\n",
|
466 |
+
" 57%|#####7 | 11420/20000 [00:56<00:07, 1172.43it/s]\n",
|
467 |
+
" 58%|#####7 | 11579/20000 [00:56<00:06, 1297.33it/s]\n",
|
468 |
+
" 59%|#####8 | 11758/20000 [00:56<00:05, 1444.84it/s]\n",
|
469 |
+
" 60%|#####9 | 11971/20000 [00:56<00:04, 1650.09it/s]\n",
|
470 |
+
" 61%|###### | 12137/20000 [00:58<00:32, 241.50it/s] \n",
|
471 |
+
" 61%|######1 | 12256/20000 [00:59<00:34, 221.55it/s]\n",
|
472 |
+
" 62%|######1 | 12344/20000 [00:59<00:36, 211.18it/s]\n",
|
473 |
+
" 62%|######2 | 12411/20000 [01:00<00:37, 204.44it/s]\n",
|
474 |
+
" 62%|######2 | 12464/20000 [01:00<00:37, 201.86it/s]\n",
|
475 |
+
" 63%|######2 | 12507/20000 [01:00<00:37, 197.48it/s]\n",
|
476 |
+
" 63%|######2 | 12542/20000 [01:00<00:35, 209.76it/s]\n",
|
477 |
+
" 63%|######2 | 12576/20000 [01:01<00:33, 220.35it/s]\n",
|
478 |
+
" 63%|######3 | 12609/20000 [01:01<00:32, 226.03it/s]\n",
|
479 |
+
" 63%|######3 | 12640/20000 [01:01<00:31, 234.04it/s]\n",
|
480 |
+
" 63%|######3 | 12670/20000 [01:01<00:29, 246.31it/s]\n",
|
481 |
+
" 64%|######3 | 12700/20000 [01:01<00:28, 251.76it/s]\n",
|
482 |
+
" 64%|######3 | 12731/20000 [01:01<00:27, 263.76it/s]\n",
|
483 |
+
" 64%|######3 | 12761/20000 [01:01<00:26, 272.59it/s]\n",
|
484 |
+
" 64%|######3 | 12791/20000 [01:01<00:26, 271.94it/s]\n",
|
485 |
+
" 64%|######4 | 12820/20000 [01:01<00:26, 274.53it/s]\n",
|
486 |
+
" 64%|######4 | 12850/20000 [01:02<00:25, 281.46it/s]\n",
|
487 |
+
" 64%|######4 | 12879/20000 [01:02<00:25, 277.63it/s]\n",
|
488 |
+
" 65%|######4 | 12908/20000 [01:02<00:26, 264.03it/s]\n",
|
489 |
+
" 65%|######4 | 12939/20000 [01:02<00:25, 276.61it/s]\n",
|
490 |
+
" 65%|######4 | 12969/20000 [01:02<00:24, 281.60it/s]\n",
|
491 |
+
" 65%|######4 | 12998/20000 [01:02<00:24, 280.78it/s]\n",
|
492 |
+
" 65%|######5 | 13027/20000 [01:02<00:25, 275.51it/s]\n",
|
493 |
+
" 65%|######5 | 13058/20000 [01:02<00:24, 285.34it/s]\n",
|
494 |
+
" 65%|######5 | 13087/20000 [01:02<00:24, 285.03it/s]\n",
|
495 |
+
" 66%|######5 | 13117/20000 [01:03<00:23, 287.71it/s]\n",
|
496 |
+
" 66%|######5 | 13151/20000 [01:03<00:22, 301.25it/s]\n",
|
497 |
+
" 66%|######5 | 13182/20000 [01:03<00:22, 299.42it/s]\n",
|
498 |
+
" 66%|######6 | 13213/20000 [01:03<00:23, 288.18it/s]\n",
|
499 |
+
" 66%|######6 | 13247/20000 [01:03<00:22, 302.07it/s]\n",
|
500 |
+
" 66%|######6 | 13280/20000 [01:03<00:21, 309.23it/s]\n",
|
501 |
+
" 67%|######6 | 13312/20000 [01:03<00:21, 306.12it/s]\n",
|
502 |
+
" 67%|######6 | 13348/20000 [01:03<00:20, 321.72it/s]\n",
|
503 |
+
" 67%|######6 | 13381/20000 [01:03<00:20, 320.39it/s]\n",
|
504 |
+
" 67%|######7 | 13414/20000 [01:04<00:35, 183.90it/s]\n",
|
505 |
+
" 67%|######7 | 13448/20000 [01:04<00:30, 213.47it/s]\n",
|
506 |
+
" 67%|######7 | 13478/20000 [01:04<00:28, 232.06it/s]\n",
|
507 |
+
" 68%|######7 | 13508/20000 [01:04<00:26, 246.85it/s]\n",
|
508 |
+
" 68%|######7 | 13546/20000 [01:04<00:23, 278.79it/s]\n",
|
509 |
+
" 68%|######7 | 13578/20000 [01:04<00:22, 289.60it/s]\n",
|
510 |
+
" 68%|######8 | 13610/20000 [01:04<00:21, 290.75it/s]\n",
|
511 |
+
" 68%|######8 | 13650/20000 [01:04<00:19, 319.96it/s]\n",
|
512 |
+
" 68%|######8 | 13684/20000 [01:05<00:19, 322.87it/s]\n",
|
513 |
+
" 69%|######8 | 13718/20000 [01:05<00:19, 324.97it/s]\n",
|
514 |
+
" 69%|######8 | 13753/20000 [01:05<00:18, 332.16it/s]\n",
|
515 |
+
" 69%|######8 | 13787/20000 [01:05<00:19, 323.16it/s]\n",
|
516 |
+
" 69%|######9 | 13820/20000 [01:05<00:19, 317.82it/s]\n",
|
517 |
+
" 69%|######9 | 13857/20000 [01:05<00:18, 332.74it/s]\n",
|
518 |
+
" 69%|######9 | 13891/20000 [01:05<00:18, 333.86it/s]\n",
|
519 |
+
" 70%|######9 | 13927/20000 [01:05<00:17, 340.50it/s]\n",
|
520 |
+
" 70%|######9 | 13963/20000 [01:05<00:17, 345.20it/s]\n",
|
521 |
+
" 70%|######9 | 13998/20000 [01:05<00:17, 340.60it/s]\n",
|
522 |
+
" 70%|####### | 14036/20000 [01:06<00:16, 351.09it/s]\n",
|
523 |
+
" 70%|####### | 14073/20000 [01:06<00:16, 356.65it/s]\n",
|
524 |
+
" 71%|####### | 14109/20000 [01:06<00:16, 353.45it/s]\n",
|
525 |
+
" 71%|####### | 14150/20000 [01:06<00:15, 369.02it/s]\n",
|
526 |
+
" 71%|####### | 14187/20000 [01:06<00:15, 368.21it/s]\n",
|
527 |
+
" 71%|#######1 | 14227/20000 [01:06<00:15, 375.42it/s]\n",
|
528 |
+
" 71%|#######1 | 14265/20000 [01:06<00:16, 345.08it/s]\n",
|
529 |
+
" 72%|#######1 | 14301/20000 [01:06<00:16, 347.30it/s]\n",
|
530 |
+
" 72%|#######1 | 14349/20000 [01:06<00:14, 383.90it/s]\n",
|
531 |
+
" 72%|#######1 | 14388/20000 [01:06<00:14, 376.96it/s]\n",
|
532 |
+
" 72%|#######2 | 14430/20000 [01:07<00:14, 389.28it/s]\n",
|
533 |
+
" 72%|#######2 | 14471/20000 [01:07<00:13, 395.30it/s]\n",
|
534 |
+
" 73%|#######2 | 14511/20000 [01:07<00:14, 389.82it/s]\n",
|
535 |
+
" 73%|#######2 | 14554/20000 [01:07<00:13, 401.53it/s]\n",
|
536 |
+
" 73%|#######2 | 14595/20000 [01:07<00:14, 378.41it/s]\n",
|
537 |
+
" 73%|#######3 | 14643/20000 [01:07<00:13, 405.95it/s]\n",
|
538 |
+
" 73%|#######3 | 14687/20000 [01:07<00:12, 415.69it/s]\n",
|
539 |
+
" 74%|#######3 | 14730/20000 [01:07<00:12, 418.62it/s]\n",
|
540 |
+
" 74%|#######3 | 14774/20000 [01:07<00:12, 422.40it/s]\n",
|
541 |
+
" 74%|#######4 | 14817/20000 [01:08<00:12, 418.48it/s]\n",
|
542 |
+
" 74%|#######4 | 14868/20000 [01:08<00:11, 443.95it/s]\n",
|
543 |
+
" 75%|#######4 | 14913/20000 [01:08<00:11, 444.41it/s]\n",
|
544 |
+
" 75%|#######4 | 14962/20000 [01:08<00:11, 457.86it/s]\n",
|
545 |
+
" 75%|#######5 | 15008/20000 [01:08<00:11, 438.97it/s]\n",
|
546 |
+
" 75%|#######5 | 15067/20000 [01:08<00:10, 481.14it/s]\n",
|
547 |
+
" 76%|#######5 | 15116/20000 [01:08<00:10, 483.71it/s]\n",
|
548 |
+
" 76%|#######5 | 15173/20000 [01:08<00:09, 509.06it/s]\n",
|
549 |
+
" 76%|#######6 | 15227/20000 [01:08<00:09, 518.19it/s]\n",
|
550 |
+
" 76%|#######6 | 15285/20000 [01:08<00:08, 534.95it/s]\n",
|
551 |
+
" 77%|#######6 | 15351/20000 [01:09<00:08, 570.41it/s]\n",
|
552 |
+
" 77%|#######7 | 15409/20000 [01:09<00:08, 569.86it/s]\n",
|
553 |
+
" 77%|#######7 | 15477/20000 [01:09<00:07, 602.56it/s]\n",
|
554 |
+
" 78%|#######7 | 15538/20000 [01:09<00:07, 602.96it/s]\n",
|
555 |
+
" 78%|#######7 | 15599/20000 [01:09<00:07, 585.87it/s]\n",
|
556 |
+
" 78%|#######8 | 15658/20000 [01:09<00:07, 581.97it/s]\n",
|
557 |
+
" 79%|#######8 | 15722/20000 [01:09<00:07, 598.93it/s]\n",
|
558 |
+
" 79%|#######8 | 15799/20000 [01:09<00:06, 647.41it/s]\n",
|
559 |
+
" 79%|#######9 | 15877/20000 [01:09<00:06, 684.57it/s]\n",
|
560 |
+
" 80%|#######9 | 15957/20000 [01:09<00:05, 718.72it/s]\n",
|
561 |
+
" 80%|######## | 16037/20000 [01:10<00:05, 740.70it/s]\n",
|
562 |
+
" 81%|######## | 16112/20000 [01:10<00:05, 730.42it/s]\n",
|
563 |
+
" 81%|######## | 16195/20000 [01:10<00:05, 757.50it/s]\n",
|
564 |
+
" 81%|########1 | 16288/20000 [01:10<00:04, 808.47it/s]\n",
|
565 |
+
" 82%|########1 | 16369/20000 [01:10<00:04, 797.07it/s]\n",
|
566 |
+
" 82%|########2 | 16467/20000 [01:10<00:04, 850.97it/s]\n",
|
567 |
+
" 83%|########2 | 16563/20000 [01:10<00:03, 883.26it/s]\n",
|
568 |
+
" 83%|########3 | 16659/20000 [01:10<00:03, 906.02it/s]\n",
|
569 |
+
" 84%|########3 | 16767/20000 [01:10<00:03, 957.87it/s]\n",
|
570 |
+
" 84%|########4 | 16881/20000 [01:10<00:03, 1012.25it/s]\n",
|
571 |
+
" 85%|########4 | 16990/20000 [01:11<00:02, 1035.48it/s]\n",
|
572 |
+
" 86%|########5 | 17120/20000 [01:11<00:02, 1114.63it/s]\n",
|
573 |
+
" 86%|########6 | 17240/20000 [01:11<00:02, 1136.79it/s]\n",
|
574 |
+
" 87%|########6 | 17379/20000 [01:11<00:02, 1212.48it/s]\n",
|
575 |
+
" 88%|########7 | 17514/20000 [01:11<00:01, 1249.92it/s]\n",
|
576 |
+
" 88%|########8 | 17656/20000 [01:11<00:01, 1300.74it/s]\n",
|
577 |
+
" 89%|########9 | 17812/20000 [01:11<00:01, 1378.28it/s]\n",
|
578 |
+
" 90%|######### | 18001/20000 [01:11<00:01, 1522.37it/s]\n",
|
579 |
+
" 91%|#########1| 18201/20000 [01:11<00:01, 1664.77it/s]\n",
|
580 |
+
" 92%|#########2| 18455/20000 [01:11<00:00, 1926.29it/s]\n",
|
581 |
+
" 94%|#########3| 18729/20000 [01:13<00:03, 331.40it/s] \n",
|
582 |
+
" 94%|#########4| 18869/20000 [01:14<00:04, 279.90it/s]\n",
|
583 |
+
" 95%|#########4| 18972/20000 [01:15<00:04, 253.37it/s]\n",
|
584 |
+
" 95%|#########5| 19050/20000 [01:15<00:03, 238.36it/s]\n",
|
585 |
+
" 96%|#########5| 19110/20000 [01:16<00:03, 223.98it/s]\n",
|
586 |
+
" 96%|#########5| 19157/20000 [01:16<00:03, 218.87it/s]\n",
|
587 |
+
" 96%|#########5| 19196/20000 [01:16<00:03, 212.50it/s]\n",
|
588 |
+
" 96%|#########6| 19229/20000 [01:16<00:03, 208.06it/s]\n",
|
589 |
+
" 96%|#########6| 19258/20000 [01:16<00:03, 205.77it/s]\n",
|
590 |
+
" 96%|#########6| 19284/20000 [01:17<00:03, 202.04it/s]\n",
|
591 |
+
" 97%|#########6| 19308/20000 [01:17<00:03, 197.39it/s]\n",
|
592 |
+
" 97%|#########6| 19330/20000 [01:17<00:03, 197.54it/s]\n",
|
593 |
+
" 97%|#########6| 19352/20000 [01:17<00:03, 196.16it/s]\n",
|
594 |
+
" 97%|#########6| 19373/20000 [01:17<00:03, 194.10it/s]\n",
|
595 |
+
" 97%|#########6| 19394/20000 [01:17<00:03, 191.14it/s]\n",
|
596 |
+
" 97%|#########7| 19414/20000 [01:17<00:03, 190.06it/s]\n",
|
597 |
+
" 97%|#########7| 19434/20000 [01:17<00:02, 192.10it/s]\n",
|
598 |
+
" 97%|#########7| 19454/20000 [01:17<00:02, 188.68it/s]\n",
|
599 |
+
" 97%|#########7| 19474/20000 [01:18<00:02, 188.67it/s]\n",
|
600 |
+
" 97%|#########7| 19493/20000 [01:18<00:02, 188.00it/s]\n",
|
601 |
+
" 98%|#########7| 19512/20000 [01:18<00:02, 187.50it/s]\n",
|
602 |
+
" 98%|#########7| 19533/20000 [01:18<00:02, 193.36it/s]\n",
|
603 |
+
" 98%|#########7| 19553/20000 [01:18<00:02, 194.71it/s]\n",
|
604 |
+
" 98%|#########7| 19573/20000 [01:18<00:02, 194.55it/s]\n",
|
605 |
+
" 98%|#########7| 19593/20000 [01:18<00:02, 192.76it/s]\n",
|
606 |
+
" 98%|#########8| 19613/20000 [01:18<00:02, 190.98it/s]\n",
|
607 |
+
" 98%|#########8| 19634/20000 [01:18<00:01, 194.23it/s]\n",
|
608 |
+
" 98%|#########8| 19654/20000 [01:18<00:01, 193.65it/s]\n",
|
609 |
+
" 98%|#########8| 19674/20000 [01:19<00:01, 192.69it/s]\n",
|
610 |
+
" 98%|#########8| 19694/20000 [01:19<00:01, 192.02it/s]\n",
|
611 |
+
" 99%|#########8| 19714/20000 [01:19<00:01, 192.65it/s]\n",
|
612 |
+
" 99%|#########8| 19736/20000 [01:19<00:01, 198.30it/s]\n",
|
613 |
+
" 99%|#########8| 19757/20000 [01:19<00:01, 200.54it/s]\n",
|
614 |
+
" 99%|#########8| 19778/20000 [01:19<00:01, 198.65it/s]\n",
|
615 |
+
" 99%|#########8| 19798/20000 [01:19<00:01, 197.32it/s]\n",
|
616 |
+
" 99%|#########9| 19818/20000 [01:19<00:00, 197.53it/s]\n",
|
617 |
+
" 99%|#########9| 19839/20000 [01:19<00:00, 200.59it/s]\n",
|
618 |
+
" 99%|#########9| 19860/20000 [01:19<00:00, 196.98it/s]\n",
|
619 |
+
" 99%|#########9| 19881/20000 [01:20<00:00, 198.45it/s]\n",
|
620 |
+
"100%|#########9| 19901/20000 [01:20<00:00, 193.05it/s]\n",
|
621 |
+
"100%|#########9| 19924/20000 [01:20<00:00, 201.34it/s]\n",
|
622 |
+
"100%|#########9| 19946/20000 [01:20<00:00, 205.53it/s]\n",
|
623 |
+
"100%|#########9| 19967/20000 [01:20<00:00, 205.63it/s]\n",
|
624 |
+
"100%|#########9| 19988/20000 [01:20<00:00, 203.92it/s]\n",
|
625 |
+
"100%|##########| 20000/20000 [01:20<00:00, 247.89it/s]\n"
|
626 |
+
]
|
627 |
+
}
|
628 |
+
],
|
629 |
+
"source": [
|
630 |
+
"!python learn_bpe.py -s 20000 -i dataset/output.txt -o dataset/codec.txt"
|
631 |
+
]
|
632 |
+
},
|
633 |
+
{
|
634 |
+
"cell_type": "code",
|
635 |
+
"execution_count": 12,
|
636 |
+
"id": "68a4113a",
|
637 |
+
"metadata": {},
|
638 |
+
"outputs": [],
|
639 |
+
"source": [
|
640 |
+
"!apply_bpe.py -i ./dataset/output.txt -o ./dataset/output_dataset.txt -c ./dataset/codec.txt"
|
641 |
+
]
|
642 |
+
},
|
643 |
+
{
|
644 |
+
"cell_type": "code",
|
645 |
+
"execution_count": 13,
|
646 |
+
"id": "06254f0d",
|
647 |
+
"metadata": {},
|
648 |
+
"outputs": [
|
649 |
+
{
|
650 |
+
"name": "stdout",
|
651 |
+
"output_type": "stream",
|
652 |
+
"text": [
|
653 |
+
"Vocabulary size: 20217\n"
|
654 |
+
]
|
655 |
+
}
|
656 |
+
],
|
657 |
+
"source": [
|
658 |
+
"def count_tokens(file_path):\n",
|
659 |
+
" try:\n",
|
660 |
+
" with open(file_path, 'r', encoding='utf-8') as file:\n",
|
661 |
+
" text = file.read()\n",
|
662 |
+
" # Split the text into tokens based on spaces\n",
|
663 |
+
" tokens = text.split()\n",
|
664 |
+
" # Count the vocabulary size (number of unique tokens)\n",
|
665 |
+
" vocabulary_size = len(set(tokens))\n",
|
666 |
+
" return vocabulary_size\n",
|
667 |
+
" except IOError:\n",
|
668 |
+
" print(f\"Error: Could not open or read the file '{file_path}'\")\n",
|
669 |
+
" return -1\n",
|
670 |
+
"\n",
|
671 |
+
"# Example usage\n",
|
672 |
+
"file_path = './dataset/output_dataset.txt' # Replace with the actual file path\n",
|
673 |
+
"vocabulary_size = count_tokens(file_path)\n",
|
674 |
+
"if vocabulary_size != -1:\n",
|
675 |
+
" print(f\"Vocabulary size: {vocabulary_size}\")\n"
|
676 |
+
]
|
677 |
+
}
|
678 |
+
],
|
679 |
+
"metadata": {
|
680 |
+
"kernelspec": {
|
681 |
+
"display_name": "Python 3 (ipykernel)",
|
682 |
+
"language": "python",
|
683 |
+
"name": "python3"
|
684 |
+
},
|
685 |
+
"language_info": {
|
686 |
+
"codemirror_mode": {
|
687 |
+
"name": "ipython",
|
688 |
+
"version": 3
|
689 |
+
},
|
690 |
+
"file_extension": ".py",
|
691 |
+
"mimetype": "text/x-python",
|
692 |
+
"name": "python",
|
693 |
+
"nbconvert_exporter": "python",
|
694 |
+
"pygments_lexer": "ipython3",
|
695 |
+
"version": "3.9.5"
|
696 |
+
}
|
697 |
+
},
|
698 |
+
"nbformat": 4,
|
699 |
+
"nbformat_minor": 5
|
700 |
+
}
|
subword/get_vocab.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#! /usr/bin/env python
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
import inspect
|
7 |
+
import warnings
|
8 |
+
import argparse
|
9 |
+
import codecs
|
10 |
+
|
11 |
+
from collections import Counter
|
12 |
+
|
13 |
+
# hack for python2/3 compatibility
|
14 |
+
from io import open
|
15 |
+
argparse.open = open
|
16 |
+
|
17 |
+
def create_parser(subparsers=None):
    """Build the command-line parser for the vocabulary extraction tool.

    If ``subparsers`` is truthy, the parser is registered on it as the
    ``get-vocab`` sub-command; otherwise a standalone ``ArgumentParser`` is
    created. Both variants accept ``--input/-i`` and ``--output/-o``, which
    default to standard input and standard output respectively.
    """
    shared_options = {
        'formatter_class': argparse.RawDescriptionHelpFormatter,
        'description': "Generates vocabulary",
    }
    if subparsers:
        parser = subparsers.add_parser('get-vocab', **shared_options)
    else:
        parser = argparse.ArgumentParser(**shared_options)

    # (option strings, file mode, fallback stream, help text)
    stream_arguments = (
        (('--input', '-i'), 'r', sys.stdin,
         "Input file (default: standard input)."),
        (('--output', '-o'), 'w', sys.stdout,
         "Output file (default: standard output)"),
    )
    for flags, mode, fallback, help_text in stream_arguments:
        parser.add_argument(
            *flags,
            type=argparse.FileType(mode),
            default=fallback,
            metavar='PATH',
            help=help_text)

    return parser
|
39 |
+
|
40 |
+
def get_vocab(train_file, vocab_file):
    """Count space-separated tokens and write them out by descending frequency.

    ``train_file`` is iterated line by line; each line is stripped of
    surrounding carriage returns, newlines and spaces and split on single
    spaces, with empty tokens ignored. One ``"<token> <count>"`` line per
    distinct token is then written to ``vocab_file``, most frequent first
    (ties keep first-seen order).
    """
    frequencies = Counter()

    for raw_line in train_file:
        tokens = raw_line.strip('\r\n ').split(' ')
        frequencies.update(token for token in tokens if token)

    # most_common() is a stable sort on the count, descending.
    for token, count in frequencies.most_common():
        vocab_file.write(token + " " + str(count) + "\n")
|
51 |
+
|
52 |
+
if __name__ == "__main__":

    # Warn when the script is run from its legacy location next to a
    # 'subword_nmt' package directory.
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: force UTF-8 on the standard streams
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    input_is_file = args.input.name != '<stdin>'
    output_is_file = args.output.name != '<stdout>'

    # read/write files as UTF-8. argparse already opened the given paths, so
    # close those handles before re-opening the same paths with an explicit
    # encoding; otherwise they stay open for the life of the process.
    if input_is_file:
        input_path = args.input.name
        args.input.close()
        args.input = codecs.open(input_path, encoding='utf-8')
    if output_is_file:
        output_path = args.output.name
        args.output.close()
        args.output = codecs.open(output_path, 'w', encoding='utf-8')

    try:
        get_vocab(args.input, args.output)
    finally:
        # close files even if get_vocab fails part-way through
        if input_is_file:
            args.input.close()
        if output_is_file:
            args.output.close()
|
subword/learn_bpe.py
ADDED
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Author: Rico Sennrich
|
4 |
+
|
5 |
+
"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
|
6 |
+
Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
|
7 |
+
of a text to a configurable number of symbols, with only a small increase in the number of tokens.
|
8 |
+
|
9 |
+
Reference:
|
10 |
+
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
|
11 |
+
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
|
12 |
+
"""
|
13 |
+
|
14 |
+
from __future__ import unicode_literals
|
15 |
+
|
16 |
+
import os
|
17 |
+
import sys
|
18 |
+
import inspect
|
19 |
+
import codecs
|
20 |
+
import re
|
21 |
+
import copy
|
22 |
+
import argparse
|
23 |
+
import warnings
|
24 |
+
import tempfile
|
25 |
+
from multiprocessing import Pool, cpu_count
|
26 |
+
from collections import defaultdict, Counter
|
27 |
+
|
28 |
+
try:
|
29 |
+
from tqdm import tqdm
|
30 |
+
except ImportError:
|
31 |
+
def tqdm(iterator, *args, **kwargs):
|
32 |
+
return iterator
|
33 |
+
|
34 |
+
# hack for python2/3 compatibility
|
35 |
+
from io import open
|
36 |
+
argparse.open = open
|
37 |
+
|
38 |
+
def create_parser(subparsers=None):
    """Build the command-line parser for learning BPE merge operations.

    If ``subparsers`` is truthy, the parser is registered on it as the
    ``learn-bpe`` sub-command; otherwise a standalone ``ArgumentParser`` is
    created. The options are added in a fixed order so that the generated
    help output stays the same.
    """
    shared_options = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")
    if subparsers:
        parser = subparsers.add_parser('learn-bpe', **shared_options)
    else:
        parser = argparse.ArgumentParser(**shared_options)

    # Each entry: (option strings, keyword arguments for add_argument).
    option_table = [
        (('--input', '-i'), dict(
            type=argparse.FileType('r'), default=sys.stdin,
            metavar='PATH',
            help="Input text (default: standard input).")),
        (('--output', '-o'), dict(
            type=argparse.FileType('w'), default=sys.stdout,
            metavar='PATH',
            help="Output file for BPE codes (default: standard output)")),
        (('--symbols', '-s'), dict(
            type=int, default=10000,
            help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")),
        (('--min-frequency',), dict(
            type=int, default=2, metavar='FREQ',
            help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')),
        (('--dict-input',), dict(
            action="store_true",
            help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")),
        (('--total-symbols', '-t'), dict(
            action="store_true",
            help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")),
        (('--num-workers',), dict(
            type=int, default=1,
            help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")),
        (('--verbose', '-v'), dict(
            action="store_true",
            help="verbose mode.")),
    ]
    for option_strings, settings in option_table:
        parser.add_argument(*option_strings, **settings)

    return parser
|
77 |
+
|
78 |
+
def get_vocabulary(fobj, is_dict=False, num_workers=1):
    """Read text and return a Counter that maps each word to its frequency.

    Args:
        fobj: iterable of text lines. When ``num_workers`` is not 1 its
            ``name`` attribute is consulted, and in parallel mode the file is
            re-opened by that name in every worker process.
        is_dict: if True, every line is expected to be a ``"word count"``
            pair instead of running text.
        num_workers: number of processes used to count words. Parallel
            counting is skipped (with a warning) when the input is stdin.

    Returns:
        collections.Counter of word frequencies.

    Raises:
        SystemExit: on a dictionary line that is not exactly two
            space-separated fields, or when parallel mode is requested on
            Python 2.
        ValueError: if ``num_workers`` is neither 1 nor greater than 1 (and
            the input is not stdin).
    """
    vocab = Counter()
    if is_dict:
        for i, line in enumerate(fobj):
            try:
                word, count = line.strip('\r\n ').split(' ')
            except ValueError:
                # Only a wrong number of fields is expected here; a bare
                # except would also swallow KeyboardInterrupt/SystemExit.
                print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                sys.exit(1)
            vocab[word] += int(count)
    elif num_workers == 1 or fobj.name == '<stdin>':
        if num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for line in fobj:
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
    elif num_workers > 1:

        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        import pickle

        # Split the file into num_workers chunks whose boundaries fall on
        # line starts. offsets[num_workers] stays 0, which the worker treats
        # as "read to end of file".
        with open(fobj.name, encoding="utf8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = int(size / num_workers)
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                while True:
                    try:
                        # Skip the (possibly partial) line at the seek point.
                        f.readline()
                        break
                    except UnicodeDecodeError:
                        # Landed inside a multi-byte character: back up.
                        pos -= 1
                        f.seek(pos)
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"

        vocab_paths = []
        pending = []
        pool = Pool(processes=num_workers)
        try:
            for i in range(num_workers):
                tmp = tempfile.NamedTemporaryFile(delete=False)
                tmp.close()
                vocab_paths.append(tmp.name)
                pending.append(pool.apply_async(
                    _get_vocabulary,
                    (fobj.name, tmp.name, offsets[i], offsets[i + 1])))
            pool.close()
            pool.join()
            for result, path in zip(pending, vocab_paths):
                # Re-raise any exception from the worker instead of failing
                # later on an empty temp file.
                result.get()
                # The pickle was written by our own worker process.
                with open(path, 'rb') as f:
                    vocab += pickle.load(f)
        finally:
            pool.terminate()
            # Always remove the temp files, including after a failure.
            for path in vocab_paths:
                try:
                    os.remove(path)
                except OSError:
                    pass
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
    return vocab
|
137 |
+
|
138 |
+
def _get_vocabulary(infile, outfile, begin, end):
    """Count the words of one slice of ``infile`` and pickle the result.

    Reading starts at offset ``begin``. A line is counted only if the file
    position after reading it does not exceed ``end``; ``end <= 0`` disables
    that check, so reading continues to the end of the file. The resulting
    Counter is pickled to ``outfile``.
    """
    import pickle
    counts = Counter()
    with open(infile, encoding="utf8") as source:
        source.seek(begin)
        # readline() (not file iteration) keeps tell() usable inside the loop.
        for text in iter(source.readline, ''):
            position = source.tell()
            assert 0 <= position < 1e20, "Bad new line separator, e.g. '\\r'"
            if end > 0 and position > end:
                break
            counts.update(token for token in text.strip('\r\n ').split(' ') if token)
    with open(outfile, 'wb') as sink:
        pickle.dump(counts, sink)
|
155 |
+
|
156 |
+
def update_pair_statistics(pair, changed, stats, indices):
    """Incrementally update pair frequencies and the pair-to-word index after a merge.

    Merging `pair` only affects pairs that overlap one of its occurrences, so
    only the words listed in `changed` are revisited.

    `changed` holds tuples `(word index, new word, old word, frequency)` as
    produced by `replace_pair`. `stats` maps a symbol pair to its frequency
    and `indices` maps a symbol pair to `{word index: occurrence count}`;
    both are modified in place.
    """
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    merged = first + second
    for word_idx, new_word, old_word, freq in changed:

        # Pass 1: walk the old word and discount the pairs destroyed by the merge.
        old_len = len(old_word)
        pos = 0
        while True:
            # locate the next candidate for the first symbol
            try:
                pos = old_word.index(first, pos)
            except ValueError:
                break
            # not followed by the second symbol: no occurrence of the pair here
            if not (pos < old_len - 1 and old_word[pos + 1] == second):
                pos += 1
                continue
            # sequence "A B C" with "B C" merged: "A B" disappears
            if pos:
                left = old_word[pos - 1:pos + 1]
                stats[left] -= freq
                indices[left][word_idx] -= 1
            if pos < old_len - 2:
                # sequence "A B C B" with "B C" merged: "C B" disappears.
                # In "A B C B C" the "C B" pair is discounted as the left
                # neighbour of the next occurrence, so it is skipped here.
                followed_by_pair = (
                    old_word[pos + 2] == first
                    and pos < old_len - 3
                    and old_word[pos + 3] == second
                )
                if not followed_by_pair:
                    right = old_word[pos + 1:pos + 3]
                    stats[right] -= freq
                    indices[right][word_idx] -= 1
            pos += 2

        # Pass 2: walk the new word and count the pairs created by the merge.
        new_len = len(new_word)
        pos = 0
        while True:
            try:
                pos = new_word.index(merged, pos)
            except ValueError:
                break
            # sequence "A BC D": "A BC" appears
            if pos:
                left = new_word[pos - 1:pos + 1]
                stats[left] += freq
                indices[left][word_idx] += 1
            # sequence "A BC B": "BC B" appears. In "A BC BC" the "BC BC" pair
            # is counted as the left neighbour of the next occurrence instead.
            if pos < new_len - 1 and new_word[pos + 1] != merged:
                right = new_word[pos:pos + 2]
                stats[right] += freq
                indices[right][word_idx] += 1
            pos += 1
|
213 |
+
|
214 |
+
|
215 |
+
def get_pair_statistics(vocab):
    """Count every adjacent symbol pair in `vocab` and index where each pair occurs.

    `vocab` is a sequence of `(word, frequency)` entries, each word being a
    sequence of symbols. Returns `(stats, indices)`: `stats` maps a pair to
    its total frequency, `indices` maps a pair to `{position of the word in
    vocab: number of occurrences in that word}`.
    """
    # pair -> summed frequency
    stats = defaultdict(int)
    # pair -> {word position -> occurrences of the pair in that word}
    indices = defaultdict(lambda: defaultdict(int))

    for word_idx, (symbols, count) in enumerate(vocab):
        left = symbols[0]
        for right in symbols[1:]:
            bigram = (left, right)
            stats[bigram] += count
            indices[bigram][word_idx] += 1
            left = right

    return stats, indices
|
232 |
+
|
233 |
+
|
234 |
+
def replace_pair(pair, vocab, indices):
    """Merge every occurrence of the symbol pair ('A', 'B') into the symbol 'AB'.

    Only the words listed under `indices[pair]` with a positive count are
    rewritten. `vocab` (a list of `(word, frequency)` entries) is updated in
    place. Returns a list of `(word index, new word, old word, frequency)`
    tuples, one per rewritten word.
    """
    first, second = pair
    merged = ''.join(pair)
    # match "A B" only when delimited by whitespace or the ends of the string
    pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)')
    occurrences = indices[pair]
    if sys.version_info < (3, 0):
        entries = occurrences.iteritems()
    else:
        entries = occurrences.items()

    changes = []
    for j, count in entries:
        if count < 1:
            continue
        old_word, freq = vocab[j]
        # a callable replacement inserts the merged symbol literally,
        # so backslashes in it need no escaping
        joined = pattern.sub(lambda _match: merged, ' '.join(old_word))
        new_word = tuple(joined.split(' '))

        vocab[j] = (new_word, freq)
        changes.append((j, new_word, old_word, freq))

    return changes
|
257 |
+
|
258 |
+
def prune_stats(stats, big_stats, threshold):
    """Drop low-frequency pairs from `stats` so that max() has less to scan.

    The frequency of a symbol pair never increases, so pruning is generally safe
    (until the most frequent pair is less frequent than a pair we previously pruned).
    big_stats keeps full statistics for when we need to access pruned items:
    a pruned non-negative count overwrites the entry in big_stats, a pruned
    negative count is added to it.
    """
    below_threshold = [(item, freq) for item, freq in stats.items() if freq < threshold]
    for item, freq in below_threshold:
        del stats[item]
        if freq < 0:
            big_stats[item] += freq
        else:
            big_stats[item] = freq
|
272 |
+
|
273 |
+
|
274 |
+
def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False, total_symbols=False, num_workers=1):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.

    infile is handed to get_vocabulary() together with is_dict and
    num_workers. outfile receives a version header followed by one
    "<first> <second>" line per learned merge, most frequent first.

    Learning stops early once the best remaining pair occurs fewer than
    min_frequency times. If total_symbols is set, the number of distinct
    word-internal and word-final characters is subtracted from num_symbols.
    With verbose set, every merge is also reported on stderr.

    If the vocabulary contains no symbol pair at all (empty input, or only
    single-character words), only the header is written and the function
    returns instead of failing on max() of an empty sequence.
    """

    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')

    vocab = get_vocabulary(infile, is_dict, num_workers)
    # split each word into characters and mark the last one as word-final
    vocab = {tuple(x[:-1]) + (x[-1] + '</w>',): y for (x, y) in vocab.items()}
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)

    if total_symbols:
        uniq_char_internal = set()
        uniq_char_final = set()
        for word in vocab:
            for char in word[:-1]:
                uniq_char_internal.add(char)
            uniq_char_final.add(word[-1])
        sys.stderr.write('Number of word-internal characters: {0}\n'.format(len(uniq_char_internal)))
        sys.stderr.write('Number of word-final characters: {0}\n'.format(len(uniq_char_final)))
        sys.stderr.write('Reducing number of merge operations by {0}\n'.format(len(uniq_char_internal) + len(uniq_char_final)))
        num_symbols -= len(uniq_char_internal) + len(uniq_char_final)

    # nothing can be merged when there is not a single pair
    if not stats:
        sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
        return

    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        # prune periodically rather than on every merge
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
|
328 |
+
|
329 |
+
|
330 |
+
if __name__ == '__main__':

    # warn when this file is run from the legacy location next to a 'subword_nmt' directory
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: make the standard streams speak UTF-8
    if sys.version_info >= (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    args = parser.parse_args()

    # a non-positive worker count means "use every available CPU"
    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    if sys.version_info < (3, 0) and args.num_workers > 1:
        args.num_workers = 1
        warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    # read/write files as UTF-8
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

    learn_bpe(
        args.input,
        args.output,
        args.symbols,
        args.min_frequency,
        args.verbose,
        is_dict=args.dict_input,
        total_symbols=args.total_symbols,
        num_workers=args.num_workers,
    )

    # close files (the standard streams are left open)
    if args.input.name != '<stdin>':
        args.input.close()
    if args.output.name != '<stdout>':
        args.output.close()
|
subword/learn_joint_bpe_and_vocab.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Author: Rico Sennrich
|
4 |
+
|
5 |
+
"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
|
6 |
+
This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus),
|
7 |
+
applies the learned operation to each and (optionally) returns the resulting vocabulary of each text.
|
8 |
+
The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text.
|
9 |
+
|
10 |
+
Reference:
|
11 |
+
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
|
12 |
+
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
|
13 |
+
"""
|
14 |
+
|
15 |
+
from __future__ import unicode_literals
|
16 |
+
|
17 |
+
import sys
|
18 |
+
import os
|
19 |
+
import inspect
|
20 |
+
import codecs
|
21 |
+
import argparse
|
22 |
+
import tempfile
|
23 |
+
import warnings
|
24 |
+
from collections import Counter
|
25 |
+
from multiprocessing import cpu_count
|
26 |
+
|
27 |
+
#hack to get imports working if running this as a script, or within a package
|
28 |
+
if __name__ == '__main__':
|
29 |
+
import learn_bpe
|
30 |
+
import apply_bpe
|
31 |
+
else:
|
32 |
+
from . import learn_bpe
|
33 |
+
from . import apply_bpe
|
34 |
+
|
35 |
+
# hack for python2/3 compatibility
|
36 |
+
from io import open
|
37 |
+
argparse.open = open
|
38 |
+
|
39 |
+
def create_parser(subparsers=None):
    """Build the command-line parser for joint BPE and vocabulary learning.

    When `subparsers` is given, the parser is registered on it as the
    'learn-joint-bpe-and-vocab' sub-command; otherwise a standalone
    ArgumentParser is created. The configured parser is returned.
    """
    description = "learn BPE-based word segmentation"
    if subparsers:
        parser = subparsers.add_parser(
            'learn-joint-bpe-and-vocab',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)
    else:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)

    add = parser.add_argument

    # files
    add('--input', '-i', type=argparse.FileType('r'), required=True, nargs='+',
        metavar='PATH',
        help="Input texts (multiple allowed).")
    add('--output', '-o', type=argparse.FileType('w'), required=True,
        metavar='PATH',
        help="Output file for BPE codes.")

    # learning parameters
    add('--symbols', '-s', type=int, default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")
    add('--separator', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    add('--write-vocabulary', type=argparse.FileType('w'), required=True, nargs='+', default=None,
        metavar='PATH', dest='vocab',
        help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py')
    add('--min-frequency', type=int, default=2, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    add('--total-symbols', '-t', action="store_true",
        help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")

    # runtime behaviour
    add('--num-workers', type=int, default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")
    add('--verbose', '-v', action="store_true",
        help="verbose mode.")

    return parser
|
82 |
+
|
83 |
+
def learn_joint_bpe_and_vocab(args):
    """Learn BPE jointly over all input texts, then write one vocabulary per text.

    `args` is the namespace produced by this module's parser. The function
    reads `input`, `vocab`, `output`, `symbols`, `min_frequency`, `verbose`,
    `total_symbols`, `separator` and `num_workers` from it, and replaces
    `args.input` / `args.vocab` with UTF-8 handles reopened by file name.
    Exits with status 1 if the number of inputs and vocabulary files differ.
    """
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(handle.name, encoding='UTF-8') for handle in args.input]
    args.vocab = [codecs.open(handle.name, 'w', encoding='UTF-8') for handle in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for corpus in args.input:
        full_vocab += learn_bpe.get_vocabulary(corpus, num_workers=args.num_workers)
        corpus.seek(0)

    # learn_bpe expects dictionary input as "<word> <count>" entries
    vocab_list = []
    for key, freq in full_vocab.items():
        vocab_list.append('{0} {1}'.format(key, freq))

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        # scratch file that receives the segmented corpus
        scratch = tempfile.NamedTemporaryFile(delete=False)
        scratch.close()

        segmented_out = codecs.open(scratch.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        bpe.process_lines(train_file.name, segmented_out, num_workers=args.num_workers)

        segmented_out.close()
        segmented_in = codecs.open(scratch.name, encoding='UTF-8')

        vocab = learn_bpe.get_vocabulary(segmented_in, num_workers=args.num_workers)
        segmented_in.close()
        os.remove(scratch.name)

        # most frequent symbols first
        ranked = sorted(vocab.items(), key=lambda entry: entry[1], reverse=True)
        for key, freq in ranked:
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()
        vocab_file.close()
|
130 |
+
|
131 |
+
|
132 |
+
if __name__ == '__main__':

    # warn when this file is run from the legacy location next to a 'subword_nmt' directory
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: make the standard streams speak UTF-8
    if sys.version_info >= (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    args = parser.parse_args()

    # a non-positive worker count means "use every available CPU"
    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    if sys.version_info < (3, 0):
        # command-line arguments arrive as byte strings on Python 2
        args.separator = args.separator.decode('UTF-8')
        if args.num_workers > 1:
            args.num_workers = 1
            warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    assert(len(args.input) == len(args.vocab))

    learn_joint_bpe_and_vocab(args)
|
subword/segment_char_ngrams.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Author: Rico Sennrich
|
4 |
+
|
5 |
+
from __future__ import unicode_literals, division
|
6 |
+
|
7 |
+
import sys
|
8 |
+
import codecs
|
9 |
+
import argparse
|
10 |
+
|
11 |
+
# hack for python2/3 compatibility
|
12 |
+
from io import open
|
13 |
+
argparse.open = open
|
14 |
+
|
15 |
+
def create_parser(subparsers=None):
    """Build the command-line parser for character n-gram segmentation.

    When `subparsers` is given, the parser is registered on it as the
    'segment-char-ngrams' sub-command; otherwise a standalone ArgumentParser
    is created. The configured parser is returned.
    """
    description = "segment rare words into character n-grams"
    if subparsers:
        parser = subparsers.add_parser(
            'segment-char-ngrams',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)
    else:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)

    add = parser.add_argument

    add('--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    add('--vocab', type=argparse.FileType('r'), metavar='PATH',
        required=True,
        help="Vocabulary file.")
    add('--shortlist', type=int, metavar='INT', default=0,
        help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).")
    add('-n', type=int, metavar='INT', default=2,
        help="segment rare words into character n-grams of size INT (default: '%(default)s')).")
    add('--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    add('--separator', '-s', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s'))")

    return parser
|
49 |
+
|
50 |
+
def segment_char_ngrams(args):
    """Segment rare words of `args.input` into character n-grams.

    `args.vocab` is read line by line; lines with exactly two
    whitespace-separated fields contribute their first field as a word, and a
    word's rank is its 0-based position among those lines. The
    `args.shortlist` best-ranked words (rank < shortlist) are copied
    unchanged; every other word, including words missing from the vocabulary,
    is cut into chunks of `args.n` characters. Each non-final chunk is
    followed by `args.separator`, and every emitted token is followed by a
    single space. One output line is written to `args.output` per input line.
    """
    words = []
    for line in args.vocab:
        fields = line.split()
        if len(fields) == 2:
            words.append(fields[0])
    # word -> rank (for a repeated word the last position wins)
    rank = dict((word, position) for (position, word) in enumerate(words))

    size = args.n
    # non-final chunks carry the separator and are space-delimited like words
    chunk_glue = args.separator + ' '

    for line in args.input:
        tokens = []
        for word in line.split():
            # ranks are 0-based, so exactly `shortlist` words satisfy rank < shortlist
            if word not in rank or rank[word] >= args.shortlist:
                chunks = [word[start:start + size] for start in range(0, len(word), size)]
                tokens.append(chunk_glue.join(chunks) + ' ')
            else:
                tokens.append(word + ' ')
        args.output.write(''.join(tokens) + '\n')
|
68 |
+
|
69 |
+
|
70 |
+
if __name__ == '__main__':

    # python 2/3 compatibility: make the standard streams speak UTF-8
    if sys.version_info >= (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        # command-line arguments arrive as byte strings on Python 2
        args.separator = args.separator.decode('UTF-8')

    # read/write files as UTF-8 (the standard streams are already wrapped above)
    args.vocab = codecs.open(args.vocab.name, encoding='utf-8')
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

    segment_char_ngrams(args)
|
subword/subword_nmt.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import io
|
5 |
+
import sys
|
6 |
+
import codecs
|
7 |
+
import argparse
|
8 |
+
|
9 |
+
from .learn_bpe import learn_bpe
|
10 |
+
from .apply_bpe import BPE, read_vocabulary
|
11 |
+
from .get_vocab import get_vocab
|
12 |
+
from .learn_joint_bpe_and_vocab import learn_joint_bpe_and_vocab
|
13 |
+
|
14 |
+
from .learn_bpe import create_parser as create_learn_bpe_parser
|
15 |
+
from .apply_bpe import create_parser as create_apply_bpe_parser
|
16 |
+
from .get_vocab import create_parser as create_get_vocab_parser
|
17 |
+
from .learn_joint_bpe_and_vocab import create_parser as create_learn_joint_bpe_and_vocab_parser
|
18 |
+
|
19 |
+
# hack for python2/3 compatibility
|
20 |
+
argparse.open = io.open
|
21 |
+
|
22 |
+
def main():
    """Entry point of the `subword-nmt` command: parse the sub-command and run it.

    Registers the learn-bpe, apply-bpe, get-vocab and
    learn-joint-bpe-and-vocab sub-commands, reopens named input/output files
    as UTF-8 where the branch does so, and dispatches to the matching
    function. Raises Exception('Invalid command provided') when no known
    sub-command was selected.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation ")
    subparsers = parser.add_subparsers(dest='command',
                                       help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    # each factory registers its sub-command on `subparsers`
    create_learn_bpe_parser(subparsers)
    create_apply_bpe_parser(subparsers)
    create_get_vocab_parser(subparsers)
    create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose,
                  is_dict=args.dict_input, total_symbols=args.total_symbols)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')

        if args.vocabulary:
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None

        if sys.version_info < (3, 0):
            # command-line arguments arrive as byte strings on Python 2
            args.separator = args.separator.decode('UTF-8')
            if args.glossaries:
                args.glossaries = [g.decode('UTF-8') for g in args.glossaries]

        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

        for line in args.input:
            args.output.write(bpe.process_line(line, args.dropout))

    elif args.command == 'get-vocab':
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        # decode the separator BEFORE running the workflow; doing it afterwards
        # (as before) had no effect on the segmentation that was produced
        if sys.version_info < (3, 0):
            args.separator = args.separator.decode('UTF-8')
        learn_joint_bpe_and_vocab(args)
    else:
        raise Exception('Invalid command provided')
|
87 |
+
|
88 |
+
|
89 |
+
# python 2/3 compatibility: wrap the standard streams so they read/write UTF-8.
# Python 3 exposes the underlying byte stream as `.buffer`; Python 2 streams
# are byte-oriented already and are wrapped directly.
if sys.version_info >= (3, 0):
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
else:
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
|
subword/tests/__init__.py
ADDED
File without changes
|
subword/tests/data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
bpe.out
|
subword/tests/data/bpe.ref
ADDED
@@ -0,0 +1,1001 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#version: 0.2
|
2 |
+
t h
|
3 |
+
th e</w>
|
4 |
+
i n
|
5 |
+
a n
|
6 |
+
e r
|
7 |
+
r e
|
8 |
+
o r
|
9 |
+
t i
|
10 |
+
a r
|
11 |
+
an d</w>
|
12 |
+
e n
|
13 |
+
o f</w>
|
14 |
+
o u
|
15 |
+
o n
|
16 |
+
t o</w>
|
17 |
+
o n</w>
|
18 |
+
i s</w>
|
19 |
+
e d</w>
|
20 |
+
in g</w>
|
21 |
+
a l
|
22 |
+
i n</w>
|
23 |
+
e r</w>
|
24 |
+
i t
|
25 |
+
s t
|
26 |
+
e s</w>
|
27 |
+
a t
|
28 |
+
o r</w>
|
29 |
+
a t</w>
|
30 |
+
r o
|
31 |
+
i c
|
32 |
+
o m
|
33 |
+
e s
|
34 |
+
i l
|
35 |
+
e n</w>
|
36 |
+
o u</w>
|
37 |
+
a s
|
38 |
+
a s</w>
|
39 |
+
e l
|
40 |
+
u s
|
41 |
+
a n</w>
|
42 |
+
e c
|
43 |
+
i s
|
44 |
+
o s
|
45 |
+
a c
|
46 |
+
ti on</w>
|
47 |
+
y ou</w>
|
48 |
+
o t
|
49 |
+
f or</w>
|
50 |
+
w h
|
51 |
+
i t</w>
|
52 |
+
a l</w>
|
53 |
+
v e</w>
|
54 |
+
p l
|
55 |
+
a p
|
56 |
+
s h
|
57 |
+
o l
|
58 |
+
d i
|
59 |
+
th e
|
60 |
+
q u
|
61 |
+
th at</w>
|
62 |
+
e t
|
63 |
+
m a
|
64 |
+
ar e</w>
|
65 |
+
al l</w>
|
66 |
+
th is</w>
|
67 |
+
c om
|
68 |
+
c h
|
69 |
+
r i
|
70 |
+
u n
|
71 |
+
en t</w>
|
72 |
+
b e</w>
|
73 |
+
b l
|
74 |
+
n o
|
75 |
+
a m
|
76 |
+
e v
|
77 |
+
c e</w>
|
78 |
+
@ -
|
79 |
+
@- @</w>
|
80 |
+
f or
|
81 |
+
s i
|
82 |
+
u r
|
83 |
+
l o
|
84 |
+
it h</w>
|
85 |
+
er s</w>
|
86 |
+
t s</w>
|
87 |
+
ou r</w>
|
88 |
+
w ith</w>
|
89 |
+
re s
|
90 |
+
h a
|
91 |
+
p ro
|
92 |
+
qu ot
|
93 |
+
quot ;</w>
|
94 |
+
& quot;</w>
|
95 |
+
e m
|
96 |
+
ti on
|
97 |
+
a d
|
98 |
+
l y</w>
|
99 |
+
e t</w>
|
100 |
+
b e
|
101 |
+
or d
|
102 |
+
c on
|
103 |
+
er e</w>
|
104 |
+
i g
|
105 |
+
n e
|
106 |
+
a y</w>
|
107 |
+
ro m</w>
|
108 |
+
f rom</w>
|
109 |
+
b u
|
110 |
+
n d</w>
|
111 |
+
ap os
|
112 |
+
& apos
|
113 |
+
o w
|
114 |
+
i r
|
115 |
+
w or
|
116 |
+
b y</w>
|
117 |
+
a tion</w>
|
118 |
+
o p
|
119 |
+
&apos ;
|
120 |
+
f f
|
121 |
+
t r
|
122 |
+
l i
|
123 |
+
s u
|
124 |
+
y our</w>
|
125 |
+
no t</w>
|
126 |
+
the y</w>
|
127 |
+
ic h</w>
|
128 |
+
s p
|
129 |
+
c an</w>
|
130 |
+
ou t</w>
|
131 |
+
e x
|
132 |
+
e ar
|
133 |
+
l d</w>
|
134 |
+
d e
|
135 |
+
v er
|
136 |
+
t a
|
137 |
+
g e</w>
|
138 |
+
wh ich</w>
|
139 |
+
d s</w>
|
140 |
+
bl e</w>
|
141 |
+
p ar
|
142 |
+
on e</w>
|
143 |
+
a y
|
144 |
+
w il
|
145 |
+
in g
|
146 |
+
d at
|
147 |
+
t er</w>
|
148 |
+
t er
|
149 |
+
ha ve</w>
|
150 |
+
sh all</w>
|
151 |
+
tion s</w>
|
152 |
+
m an
|
153 |
+
it y</w>
|
154 |
+
d e</w>
|
155 |
+
wil l</w>
|
156 |
+
p a
|
157 |
+
o d</w>
|
158 |
+
& #
|
159 |
+
th er</w>
|
160 |
+
c l
|
161 |
+
. .
|
162 |
+
.. .</w>
|
163 |
+
u l
|
164 |
+
es s</w>
|
165 |
+
0 0
|
166 |
+
i f</w>
|
167 |
+
a b
|
168 |
+
h e</w>
|
169 |
+
ou ld</w>
|
170 |
+
i r</w>
|
171 |
+
c h</w>
|
172 |
+
t h</w>
|
173 |
+
r a
|
174 |
+
m er
|
175 |
+
1 2
|
176 |
+
p u
|
177 |
+
A nd</w>
|
178 |
+
un to</w>
|
179 |
+
s it
|
180 |
+
res s</w>
|
181 |
+
p e
|
182 |
+
h t</w>
|
183 |
+
en ts</w>
|
184 |
+
4 ;</w>
|
185 |
+
12 4;</w>
|
186 |
+
&# 124;</w>
|
187 |
+
ing s</w>
|
188 |
+
h ol
|
189 |
+
v er</w>
|
190 |
+
m e</w>
|
191 |
+
w e</w>
|
192 |
+
s o</w>
|
193 |
+
re e</w>
|
194 |
+
m y</w>
|
195 |
+
u p
|
196 |
+
k e</w>
|
197 |
+
i d
|
198 |
+
at ed</w>
|
199 |
+
us e</w>
|
200 |
+
m ent</w>
|
201 |
+
' s</w>
|
202 |
+
es t</w>
|
203 |
+
a r</w>
|
204 |
+
P ress</w>
|
205 |
+
ou n
|
206 |
+
h o
|
207 |
+
for e</w>
|
208 |
+
f il
|
209 |
+
d ow
|
210 |
+
al l
|
211 |
+
at e</w>
|
212 |
+
t ed</w>
|
213 |
+
p er
|
214 |
+
h is</w>
|
215 |
+
er e
|
216 |
+
as e</w>
|
217 |
+
the ir</w>
|
218 |
+
p or
|
219 |
+
I C
|
220 |
+
th ere</w>
|
221 |
+
t o
|
222 |
+
is h</w>
|
223 |
+
2 00
|
224 |
+
r ou
|
225 |
+
m e
|
226 |
+
ec om
|
227 |
+
h i
|
228 |
+
as t</w>
|
229 |
+
wor k</w>
|
230 |
+
w as</w>
|
231 |
+
sit es</w>
|
232 |
+
f t
|
233 |
+
u m
|
234 |
+
in e</w>
|
235 |
+
a ti
|
236 |
+
ri bu
|
237 |
+
or e</w>
|
238 |
+
g l
|
239 |
+
c at</w>
|
240 |
+
a ble</w>
|
241 |
+
IC E
|
242 |
+
ICE cat</w>
|
243 |
+
g i
|
244 |
+
am e</w>
|
245 |
+
ac c
|
246 |
+
u d
|
247 |
+
st r
|
248 |
+
s o
|
249 |
+
pl e</w>
|
250 |
+
mer ce</w>
|
251 |
+
k s</w>
|
252 |
+
g o
|
253 |
+
ev en</w>
|
254 |
+
c re
|
255 |
+
y st
|
256 |
+
us t</w>
|
257 |
+
or s</w>
|
258 |
+
ic e</w>
|
259 |
+
h as</w>
|
260 |
+
ecom merce</w>
|
261 |
+
c i
|
262 |
+
no w</w>
|
263 |
+
a v
|
264 |
+
m ents</w>
|
265 |
+
a d</w>
|
266 |
+
us ing</w>
|
267 |
+
s t</w>
|
268 |
+
man y</w>
|
269 |
+
ma y</w>
|
270 |
+
k ing</w>
|
271 |
+
ev er
|
272 |
+
ere fore</w>
|
273 |
+
di st
|
274 |
+
y e</w>
|
275 |
+
u t
|
276 |
+
ti me</w>
|
277 |
+
s e
|
278 |
+
re n
|
279 |
+
os e</w>
|
280 |
+
o ther</w>
|
281 |
+
m ore</w>
|
282 |
+
e st
|
283 |
+
s er
|
284 |
+
s el
|
285 |
+
re c
|
286 |
+
p h
|
287 |
+
lo c
|
288 |
+
l ic
|
289 |
+
in ce</w>
|
290 |
+
en s
|
291 |
+
bu t</w>
|
292 |
+
ar y</w>
|
293 |
+
an t</w>
|
294 |
+
G od</w>
|
295 |
+
s yst
|
296 |
+
s om
|
297 |
+
l e
|
298 |
+
f ree</w>
|
299 |
+
dist ribu
|
300 |
+
an s
|
301 |
+
a g
|
302 |
+
W ord
|
303 |
+
p ur
|
304 |
+
en t
|
305 |
+
d o
|
306 |
+
ar t
|
307 |
+
al so</w>
|
308 |
+
w e
|
309 |
+
v i
|
310 |
+
s a
|
311 |
+
ri g
|
312 |
+
ne w</w>
|
313 |
+
l and</w>
|
314 |
+
b o
|
315 |
+
w ere</w>
|
316 |
+
u c
|
317 |
+
n ing</w>
|
318 |
+
m ig
|
319 |
+
i c</w>
|
320 |
+
f ir
|
321 |
+
es e</w>
|
322 |
+
em s</w>
|
323 |
+
e l</w>
|
324 |
+
d o</w>
|
325 |
+
b r
|
326 |
+
as ed</w>
|
327 |
+
ab out</w>
|
328 |
+
E n
|
329 |
+
th ings</w>
|
330 |
+
lic ens
|
331 |
+
it s</w>
|
332 |
+
i m
|
333 |
+
g r
|
334 |
+
dat a</w>
|
335 |
+
y e
|
336 |
+
up on</w>
|
337 |
+
s ti
|
338 |
+
or d</w>
|
339 |
+
in s</w>
|
340 |
+
con t
|
341 |
+
w i
|
342 |
+
us ed</w>
|
343 |
+
si on</w>
|
344 |
+
p os
|
345 |
+
ou nd</w>
|
346 |
+
l a
|
347 |
+
f e
|
348 |
+
es s
|
349 |
+
com m
|
350 |
+
L ord</w>
|
351 |
+
1 9
|
352 |
+
the m</w>
|
353 |
+
th ese</w>
|
354 |
+
on ly</w>
|
355 |
+
is h
|
356 |
+
in cl
|
357 |
+
et c</w>
|
358 |
+
el s</w>
|
359 |
+
el l</w>
|
360 |
+
c ol
|
361 |
+
c o
|
362 |
+
ac h</w>
|
363 |
+
a m</w>
|
364 |
+
a il
|
365 |
+
u l</w>
|
366 |
+
th ou</w>
|
367 |
+
ou r
|
368 |
+
n lo
|
369 |
+
in to</w>
|
370 |
+
i es</w>
|
371 |
+
hi m</w>
|
372 |
+
dow nlo
|
373 |
+
di z</w>
|
374 |
+
d er
|
375 |
+
al ly</w>
|
376 |
+
ac e</w>
|
377 |
+
Word Press</w>
|
378 |
+
som e</w>
|
379 |
+
s ince</w>
|
380 |
+
re m
|
381 |
+
pe o
|
382 |
+
peo ple</w>
|
383 |
+
pa in</w>
|
384 |
+
os t</w>
|
385 |
+
on s</w>
|
386 |
+
n o</w>
|
387 |
+
i ma
|
388 |
+
ho w</w>
|
389 |
+
for ma
|
390 |
+
en d
|
391 |
+
ad ing</w>
|
392 |
+
a re
|
393 |
+
S pain</w>
|
394 |
+
O p
|
395 |
+
u s</w>
|
396 |
+
por t</w>
|
397 |
+
ou s
|
398 |
+
in ter
|
399 |
+
ha d</w>
|
400 |
+
h ere</w>
|
401 |
+
en ti
|
402 |
+
be en</w>
|
403 |
+
ay s</w>
|
404 |
+
ur e</w>
|
405 |
+
t e
|
406 |
+
sh ould</w>
|
407 |
+
ser v
|
408 |
+
p re
|
409 |
+
l ay
|
410 |
+
g re
|
411 |
+
ff er
|
412 |
+
b ased</w>
|
413 |
+
ap art
|
414 |
+
a diz</w>
|
415 |
+
C h
|
416 |
+
C adiz</w>
|
417 |
+
w ould</w>
|
418 |
+
w are</w>
|
419 |
+
ver y</w>
|
420 |
+
u p</w>
|
421 |
+
syst ems</w>
|
422 |
+
o st
|
423 |
+
loc ated</w>
|
424 |
+
incl ud
|
425 |
+
hol d</w>
|
426 |
+
gl ish</w>
|
427 |
+
forma tion</w>
|
428 |
+
f in
|
429 |
+
en d</w>
|
430 |
+
d ev
|
431 |
+
ar k
|
432 |
+
Q u
|
433 |
+
Op en</w>
|
434 |
+
En glish</w>
|
435 |
+
wh o</w>
|
436 |
+
u ro
|
437 |
+
t ing</w>
|
438 |
+
su p
|
439 |
+
o re
|
440 |
+
n ess</w>
|
441 |
+
in formation</w>
|
442 |
+
g et</w>
|
443 |
+
f i
|
444 |
+
ec t</w>
|
445 |
+
b ec
|
446 |
+
ar d</w>
|
447 |
+
an ds</w>
|
448 |
+
an ce</w>
|
449 |
+
E uro
|
450 |
+
u e</w>
|
451 |
+
ord er</w>
|
452 |
+
id ay</w>
|
453 |
+
ic tion
|
454 |
+
ft ware</w>
|
455 |
+
f ul</w>
|
456 |
+
d is
|
457 |
+
at h</w>
|
458 |
+
a tions</w>
|
459 |
+
L u
|
460 |
+
wh en</w>
|
461 |
+
w ay</w>
|
462 |
+
t e</w>
|
463 |
+
sh e
|
464 |
+
pur ch
|
465 |
+
on g</w>
|
466 |
+
m ust</w>
|
467 |
+
fir st</w>
|
468 |
+
fil e</w>
|
469 |
+
em b
|
470 |
+
e p
|
471 |
+
e di
|
472 |
+
an g
|
473 |
+
ye a</w>
|
474 |
+
t ors</w>
|
475 |
+
st ati
|
476 |
+
stati sti
|
477 |
+
re s</w>
|
478 |
+
purch ase</w>
|
479 |
+
m ost</w>
|
480 |
+
m en</w>
|
481 |
+
m an</w>
|
482 |
+
l a</w>
|
483 |
+
it e</w>
|
484 |
+
i l</w>
|
485 |
+
h erefore</w>
|
486 |
+
fil es</w>
|
487 |
+
f t</w>
|
488 |
+
f a
|
489 |
+
an c
|
490 |
+
I n
|
491 |
+
w ell</w>
|
492 |
+
ti c
|
493 |
+
s ec
|
494 |
+
par is
|
495 |
+
p res
|
496 |
+
o ff
|
497 |
+
l in
|
498 |
+
ima ge</w>
|
499 |
+
iction ary</w>
|
500 |
+
i z
|
501 |
+
h op
|
502 |
+
h el
|
503 |
+
h e
|
504 |
+
g h</w>
|
505 |
+
f l
|
506 |
+
e d
|
507 |
+
com paris
|
508 |
+
a use</w>
|
509 |
+
P S
|
510 |
+
A S
|
511 |
+
v al
|
512 |
+
statisti c</w>
|
513 |
+
so ftware</w>
|
514 |
+
she et</w>
|
515 |
+
o k</w>
|
516 |
+
o g
|
517 |
+
m is
|
518 |
+
j o
|
519 |
+
hop s</w>
|
520 |
+
hol iday</w>
|
521 |
+
h ear
|
522 |
+
go od</w>
|
523 |
+
g o</w>
|
524 |
+
f e</w>
|
525 |
+
es hops</w>
|
526 |
+
en ce</w>
|
527 |
+
e i
|
528 |
+
downlo ading</w>
|
529 |
+
distribu tors</w>
|
530 |
+
di ffer
|
531 |
+
d ay</w>
|
532 |
+
comparis on</w>
|
533 |
+
an y</w>
|
534 |
+
am il
|
535 |
+
a ge</w>
|
536 |
+
a f
|
537 |
+
P s</w>
|
538 |
+
P H
|
539 |
+
N A</w>
|
540 |
+
AS Ps</w>
|
541 |
+
6 8
|
542 |
+
v ing</w>
|
543 |
+
th y</w>
|
544 |
+
su ch</w>
|
545 |
+
pu bl
|
546 |
+
ord ing</w>
|
547 |
+
l ine</w>
|
548 |
+
i d</w>
|
549 |
+
gre at</w>
|
550 |
+
for m
|
551 |
+
f ul
|
552 |
+
ever y</w>
|
553 |
+
el y</w>
|
554 |
+
d et
|
555 |
+
d es
|
556 |
+
ch o
|
557 |
+
c oun
|
558 |
+
c ity</w>
|
559 |
+
be hold</w>
|
560 |
+
all ed</w>
|
561 |
+
W herefore</w>
|
562 |
+
PH P</w>
|
563 |
+
P r
|
564 |
+
wor ld</w>
|
565 |
+
wi th
|
566 |
+
wh at</w>
|
567 |
+
w r
|
568 |
+
w at
|
569 |
+
tion al</w>
|
570 |
+
si m
|
571 |
+
ren t</w>
|
572 |
+
p r
|
573 |
+
ord s</w>
|
574 |
+
o b
|
575 |
+
no w
|
576 |
+
mig ht</w>
|
577 |
+
m u
|
578 |
+
f amil
|
579 |
+
e as
|
580 |
+
d ing</w>
|
581 |
+
bec ause</w>
|
582 |
+
ark X
|
583 |
+
arkX Press</w>
|
584 |
+
acc ording</w>
|
585 |
+
a u
|
586 |
+
Qu arkXPress</w>
|
587 |
+
M edi
|
588 |
+
C om
|
589 |
+
0 0</w>
|
590 |
+
w s</w>
|
591 |
+
us ers</w>
|
592 |
+
ti es</w>
|
593 |
+
th ing</w>
|
594 |
+
se e</w>
|
595 |
+
p ri
|
596 |
+
o m</w>
|
597 |
+
o c
|
598 |
+
l l</w>
|
599 |
+
k e
|
600 |
+
ic es</w>
|
601 |
+
em ent</w>
|
602 |
+
ec i
|
603 |
+
e p</w>
|
604 |
+
e m</w>
|
605 |
+
d uc
|
606 |
+
d er</w>
|
607 |
+
ar i
|
608 |
+
am p
|
609 |
+
af ter</w>
|
610 |
+
Medi a</w>
|
611 |
+
' t</w>
|
612 |
+
ver sion</w>
|
613 |
+
v es</w>
|
614 |
+
u res</w>
|
615 |
+
u m</w>
|
616 |
+
ta r</w>
|
617 |
+
rig ht</w>
|
618 |
+
rig h
|
619 |
+
par t
|
620 |
+
ow n</w>
|
621 |
+
or y</w>
|
622 |
+
o ver</w>
|
623 |
+
o s</w>
|
624 |
+
o k
|
625 |
+
mu ch</w>
|
626 |
+
k now
|
627 |
+
in st
|
628 |
+
ig h
|
629 |
+
g en
|
630 |
+
ex c
|
631 |
+
differ ent</w>
|
632 |
+
d en</w>
|
633 |
+
ap p
|
634 |
+
ans a</w>
|
635 |
+
al lo
|
636 |
+
S tar</w>
|
637 |
+
Lu f
|
638 |
+
L NA</w>
|
639 |
+
D LNA</w>
|
640 |
+
1 9</w>
|
641 |
+
y p
|
642 |
+
w ords</w>
|
643 |
+
v is
|
644 |
+
v en</w>
|
645 |
+
u r</w>
|
646 |
+
th ansa</w>
|
647 |
+
si d
|
648 |
+
sel f</w>
|
649 |
+
re n</w>
|
650 |
+
pu ter</w>
|
651 |
+
pl o
|
652 |
+
p ow
|
653 |
+
ot h</w>
|
654 |
+
n i
|
655 |
+
licens e</w>
|
656 |
+
li ke</w>
|
657 |
+
l ear
|
658 |
+
k now</w>
|
659 |
+
in ut
|
660 |
+
il e</w>
|
661 |
+
f ore
|
662 |
+
et s</w>
|
663 |
+
emb er</w>
|
664 |
+
d ec
|
665 |
+
cont ent</w>
|
666 |
+
com e</w>
|
667 |
+
c alled</w>
|
668 |
+
av ail
|
669 |
+
ar ound</w>
|
670 |
+
an d
|
671 |
+
O ff
|
672 |
+
Luf thansa</w>
|
673 |
+
F or
|
674 |
+
A l
|
675 |
+
w o</w>
|
676 |
+
up dat
|
677 |
+
u t</w>
|
678 |
+
u g
|
679 |
+
ti ve</w>
|
680 |
+
ta ke</w>
|
681 |
+
str uc
|
682 |
+
sid enti
|
683 |
+
s et</w>
|
684 |
+
s e</w>
|
685 |
+
s ame</w>
|
686 |
+
rec ei
|
687 |
+
re ad
|
688 |
+
pro duc
|
689 |
+
pl ay
|
690 |
+
p dat
|
691 |
+
ou s</w>
|
692 |
+
o l</w>
|
693 |
+
n al</w>
|
694 |
+
m at
|
695 |
+
ish ed</w>
|
696 |
+
ir it</w>
|
697 |
+
in ed</w>
|
698 |
+
i um</w>
|
699 |
+
h ot
|
700 |
+
g in
|
701 |
+
g ht</w>
|
702 |
+
f un
|
703 |
+
com pl
|
704 |
+
c ur
|
705 |
+
avail able</w>
|
706 |
+
a ir
|
707 |
+
W in
|
708 |
+
U pdat
|
709 |
+
wor ks</w>
|
710 |
+
with out</w>
|
711 |
+
un g</w>
|
712 |
+
tr ans
|
713 |
+
th ose</w>
|
714 |
+
th an</w>
|
715 |
+
sp on
|
716 |
+
sp eci
|
717 |
+
pro c
|
718 |
+
pa ge</w>
|
719 |
+
on al</w>
|
720 |
+
o ds</w>
|
721 |
+
ma de</w>
|
722 |
+
m es</w>
|
723 |
+
includ ed</w>
|
724 |
+
in i
|
725 |
+
ig n</w>
|
726 |
+
fe at
|
727 |
+
el l
|
728 |
+
ec ts</w>
|
729 |
+
ear s</w>
|
730 |
+
e w</w>
|
731 |
+
e Star</w>
|
732 |
+
dow s</w>
|
733 |
+
be fore</w>
|
734 |
+
b et
|
735 |
+
at or</w>
|
736 |
+
an s</w>
|
737 |
+
al s</w>
|
738 |
+
Win dows</w>
|
739 |
+
Updat eStar</w>
|
740 |
+
F ra
|
741 |
+
ä sidenti
|
742 |
+
äsidenti n</w>
|
743 |
+
ä ft
|
744 |
+
äft s
|
745 |
+
äfts ord
|
746 |
+
äftsord n
|
747 |
+
äftsordn ung</w>
|
748 |
+
z ur</w>
|
749 |
+
v id
|
750 |
+
um b
|
751 |
+
u plo
|
752 |
+
th rou
|
753 |
+
t yp
|
754 |
+
t wo</w>
|
755 |
+
spon s
|
756 |
+
si ble</w>
|
757 |
+
s m
|
758 |
+
rem ium</w>
|
759 |
+
re p
|
760 |
+
re gi
|
761 |
+
r e</w>
|
762 |
+
pow er</w>
|
763 |
+
per s
|
764 |
+
p an
|
765 |
+
or ing</w>
|
766 |
+
op en</w>
|
767 |
+
o w</w>
|
768 |
+
n ec
|
769 |
+
mig al</w>
|
770 |
+
is t</w>
|
771 |
+
ha ving</w>
|
772 |
+
h ath</w>
|
773 |
+
gi ven</w>
|
774 |
+
ev er</w>
|
775 |
+
et h</w>
|
776 |
+
es ch
|
777 |
+
esch äftsordnung</w>
|
778 |
+
en ter</w>
|
779 |
+
e a
|
780 |
+
con ta
|
781 |
+
com man
|
782 |
+
ch il
|
783 |
+
c or
|
784 |
+
c ap
|
785 |
+
b oth</w>
|
786 |
+
ati ve</w>
|
787 |
+
apart ments</w>
|
788 |
+
apart ment</w>
|
789 |
+
ad a</w>
|
790 |
+
S er
|
791 |
+
Pr äsidentin</w>
|
792 |
+
PS D</w>
|
793 |
+
H ot
|
794 |
+
G eschäftsordnung</w>
|
795 |
+
Fra u</w>
|
796 |
+
For migal</w>
|
797 |
+
C al
|
798 |
+
2 .
|
799 |
+
1 1</w>
|
800 |
+
y ears</w>
|
801 |
+
wh erefore</w>
|
802 |
+
u st
|
803 |
+
throu gh</w>
|
804 |
+
th en</w>
|
805 |
+
t l
|
806 |
+
t en</w>
|
807 |
+
sh al
|
808 |
+
shal t</w>
|
809 |
+
s ou
|
810 |
+
res t</w>
|
811 |
+
recei ve</w>
|
812 |
+
r u
|
813 |
+
ot ter
|
814 |
+
mer ci
|
815 |
+
ma ke</w>
|
816 |
+
m s</w>
|
817 |
+
m o
|
818 |
+
la w</w>
|
819 |
+
k et</w>
|
820 |
+
j ust</w>
|
821 |
+
ic k</w>
|
822 |
+
g rou
|
823 |
+
fun c
|
824 |
+
fore ver</w>
|
825 |
+
fin d</w>
|
826 |
+
f ace</w>
|
827 |
+
ear ch</w>
|
828 |
+
e ds</w>
|
829 |
+
e al
|
830 |
+
distribu tion</w>
|
831 |
+
d ays</w>
|
832 |
+
comman d
|
833 |
+
chil d
|
834 |
+
br ands</w>
|
835 |
+
bl ess
|
836 |
+
be gin
|
837 |
+
am ong</w>
|
838 |
+
am es</w>
|
839 |
+
ac t</w>
|
840 |
+
a in</w>
|
841 |
+
a bl
|
842 |
+
T h
|
843 |
+
P remium</w>
|
844 |
+
D e
|
845 |
+
wat ers</w>
|
846 |
+
v o
|
847 |
+
u es</w>
|
848 |
+
ti v
|
849 |
+
t y</w>
|
850 |
+
t ur
|
851 |
+
sup port</w>
|
852 |
+
spons oring</w>
|
853 |
+
r on
|
854 |
+
r an
|
855 |
+
qu i
|
856 |
+
pl ug
|
857 |
+
par t</w>
|
858 |
+
p as
|
859 |
+
otter y</w>
|
860 |
+
n or</w>
|
861 |
+
n er</w>
|
862 |
+
n ed</w>
|
863 |
+
m ine</w>
|
864 |
+
l ast</w>
|
865 |
+
it ed</w>
|
866 |
+
inut e</w>
|
867 |
+
in d
|
868 |
+
il li
|
869 |
+
ic ation</w>
|
870 |
+
gen er
|
871 |
+
g es</w>
|
872 |
+
g e
|
873 |
+
g al</w>
|
874 |
+
famil y</w>
|
875 |
+
f ol
|
876 |
+
f f</w>
|
877 |
+
er y</w>
|
878 |
+
er nal</w>
|
879 |
+
el i
|
880 |
+
d ra
|
881 |
+
cho ose</w>
|
882 |
+
child ren</w>
|
883 |
+
c at
|
884 |
+
be ach</w>
|
885 |
+
as es</w>
|
886 |
+
Off ers</w>
|
887 |
+
M inute</w>
|
888 |
+
L e
|
889 |
+
L ast</w>
|
890 |
+
G ods</w>
|
891 |
+
G er
|
892 |
+
D ictionary</w>
|
893 |
+
Cal a</w>
|
894 |
+
B o
|
895 |
+
6 3
|
896 |
+
1 5</w>
|
897 |
+
wr it
|
898 |
+
wh ile</w>
|
899 |
+
w ar
|
900 |
+
val ue</w>
|
901 |
+
v ed</w>
|
902 |
+
v ari
|
903 |
+
u al</w>
|
904 |
+
tr an
|
905 |
+
to ol</w>
|
906 |
+
t ri
|
907 |
+
t en
|
908 |
+
st ing</w>
|
909 |
+
s ed</w>
|
910 |
+
s ay</w>
|
911 |
+
re d</w>
|
912 |
+
pl e
|
913 |
+
on g
|
914 |
+
ol d</w>
|
915 |
+
n ers</w>
|
916 |
+
n a
|
917 |
+
merci al</w>
|
918 |
+
me di
|
919 |
+
m on
|
920 |
+
lo ok</w>
|
921 |
+
l et</w>
|
922 |
+
j ada</w>
|
923 |
+
ic i
|
924 |
+
hel p</w>
|
925 |
+
feat ures</w>
|
926 |
+
en tr
|
927 |
+
en c
|
928 |
+
eas y</w>
|
929 |
+
ear th</w>
|
930 |
+
d on</w>
|
931 |
+
con nec
|
932 |
+
ch ar
|
933 |
+
c ould</w>
|
934 |
+
be ing</w>
|
935 |
+
b ac
|
936 |
+
ar k</w>
|
937 |
+
amp ;</w>
|
938 |
+
a in
|
939 |
+
P y
|
940 |
+
H ost
|
941 |
+
A n
|
942 |
+
2 0</w>
|
943 |
+
& amp;</w>
|
944 |
+
ye ar</w>
|
945 |
+
w ing</w>
|
946 |
+
w ant</w>
|
947 |
+
w a
|
948 |
+
v ers</w>
|
949 |
+
us er</w>
|
950 |
+
ur ing</w>
|
951 |
+
updat es</w>
|
952 |
+
ti mes</w>
|
953 |
+
t re
|
954 |
+
t ly</w>
|
955 |
+
syst em</w>
|
956 |
+
sp ea
|
957 |
+
sit e</w>
|
958 |
+
sim pl
|
959 |
+
sa id</w>
|
960 |
+
s k
|
961 |
+
s et
|
962 |
+
re v
|
963 |
+
re l
|
964 |
+
re f
|
965 |
+
pu t</w>
|
966 |
+
pro g
|
967 |
+
pl ace</w>
|
968 |
+
pe an</w>
|
969 |
+
p ho
|
970 |
+
pho to</w>
|
971 |
+
p at
|
972 |
+
oun t</w>
|
973 |
+
ot e</w>
|
974 |
+
or t</w>
|
975 |
+
og y</w>
|
976 |
+
ne y</w>
|
977 |
+
ne es</w>
|
978 |
+
ne eds</w>
|
979 |
+
ne ed</w>
|
980 |
+
n umb
|
981 |
+
n ame</w>
|
982 |
+
lay ers</w>
|
983 |
+
l l
|
984 |
+
k en</w>
|
985 |
+
ic al</w>
|
986 |
+
i a</w>
|
987 |
+
ful l</w>
|
988 |
+
fi ed</w>
|
989 |
+
fe w</w>
|
990 |
+
et y</w>
|
991 |
+
est s</w>
|
992 |
+
es si
|
993 |
+
dow n</w>
|
994 |
+
do m</w>
|
995 |
+
det ail
|
996 |
+
dat ab
|
997 |
+
d ictionary</w>
|
998 |
+
con f
|
999 |
+
com mercial</w>
|
1000 |
+
c a</w>
|
1001 |
+
b re
|
subword/tests/data/corpus.bpe.ref.en
ADDED
The diff for this file is too large to render.
See raw diff
|
|
subword/tests/data/corpus.en
ADDED
The diff for this file is too large to render.
See raw diff
|
|
subword/tests/test_bpe.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
from __future__ import unicode_literals
|
5 |
+
import unittest
|
6 |
+
import codecs
|
7 |
+
|
8 |
+
import os,sys,inspect
|
9 |
+
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
|
10 |
+
parentdir = os.path.dirname(currentdir)
|
11 |
+
sys.path.insert(0,parentdir)
|
12 |
+
|
13 |
+
from learn_bpe import learn_bpe
|
14 |
+
from apply_bpe import BPE
|
15 |
+
|
16 |
+
|
17 |
+
class TestBPELearnMethod(unittest.TestCase):
    """Regression test: learn_bpe() on data/corpus.en must reproduce data/bpe.ref."""

    def test_learn_bpe(self):
        """Learn 1000 merge operations and compare the output file with the reference."""
        datadir = os.path.join(currentdir, 'data')
        outpath = os.path.join(datadir, 'bpe.out')

        # Context managers close both handles even when learn_bpe() raises.
        with codecs.open(os.path.join(datadir, 'corpus.en'), encoding='utf-8') as infile:
            with codecs.open(outpath, 'w', encoding='utf-8') as outfile:
                learn_bpe(infile, outfile, 1000)

        # Read both files explicitly as UTF-8: the reference contains
        # non-ASCII merges, so the platform default encoding is not safe.
        with codecs.open(outpath, encoding='utf-8') as outlines:
            produced = list(outlines)
        with codecs.open(os.path.join(datadir, 'bpe.ref'), encoding='utf-8') as reflines:
            expected = list(reflines)

        for line, line2 in zip(produced, expected):
            self.assertEqual(line, line2)

        # zip() stops at the shorter sequence, so a truncated output file
        # would otherwise go unnoticed.
        self.assertEqual(len(produced), len(expected))
|
34 |
+
|
35 |
+
class TestBPESegmentMethod(unittest.TestCase):
    """Checks BPE.process_line() using the merge table stored in data/bpe.ref."""

    def setUp(self):
        """Build the BPE object and open the corpus plus its reference segmentation."""
        with codecs.open(os.path.join(currentdir,'data','bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        # Cleanups are registered right after each open: tearDown() is not
        # run when setUp() raises, so a failing second open would otherwise
        # leak the first handle.
        self.infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8')
        self.addCleanup(self.infile.close)
        self.reffile = codecs.open(os.path.join(currentdir,'data','corpus.bpe.ref.en'), encoding='utf-8')
        self.addCleanup(self.reffile.close)

    def test_apply_bpe(self):
        """Every corpus line must be segmented exactly like the reference line."""
        lines = list(self.infile)
        refs = list(self.reffile)

        for line, ref in zip(lines, refs):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

        # zip() truncates to the shorter file; make a length mismatch fail.
        self.assertEqual(len(lines), len(refs))

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace"""

        orig = ' iron cement \n'
        exp = ' ir@@ on c@@ ement \n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as normal character, not word boundary"""

        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):
        """A line holding only a newline is returned unchanged."""

        orig = '\n'
        exp = '\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
|
81 |
+
|
82 |
+
if __name__ == '__main__':
|
83 |
+
unittest.main()
|
subword/tests/test_glossaries.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import unittest
|
5 |
+
import mock
|
6 |
+
|
7 |
+
import os,sys,inspect
|
8 |
+
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
|
9 |
+
parentdir = os.path.dirname(currentdir)
|
10 |
+
sys.path.insert(0,parentdir)
|
11 |
+
|
12 |
+
from apply_bpe import isolate_glossary, BPE
|
13 |
+
|
14 |
+
class TestIsolateGlossaryFunction(unittest.TestCase):
    """Exercises isolate_glossary() with the single glossary entry 'like'."""

    def setUp(self):
        self.glossary = 'like'

    def _run_test_case(self, test_case):
        """Call isolate_glossary() on test_case[0] and expect test_case[1]."""
        source, wanted = test_case
        self.assertEqual(isolate_glossary(source, self.glossary), wanted)

    def test_empty_string(self):
        self._run_test_case(('', ['']))

    def test_no_glossary(self):
        self._run_test_case(('word', ['word']))

    def test_isolated_glossary(self):
        self._run_test_case(('like', ['like']))

    def test_word_one_side(self):
        self._run_test_case(('likeword', ['like', 'word']))

    def test_words_both_sides(self):
        self._run_test_case(('wordlikeword', ['word', 'like', 'word']))

    def test_back_to_back_glossary(self):
        self._run_test_case(('likelike', ['like', 'like']))

    def test_multiple_glossaries(self):
        self._run_test_case(('wordlikewordlike', ['word', 'like', 'word', 'like']))
|
65 |
+
|
66 |
+
class TestBPEIsolateGlossariesMethod(unittest.TestCase):
    """Exercises BPE._isolate_glossaries() with several literal glossary entries."""

    def setUp(self):
        # A MagicMock stands in for the codes file handed to BPE; only
        # readline() is given an explicit return value.
        codes = mock.MagicMock()
        codes.readline.return_value = 'something'
        self.bpe = BPE(codes, glossaries=['like', 'Manuel', 'USA'])

    def _run_test_case(self, test_case):
        """Call _isolate_glossaries() on test_case[0] and expect test_case[1]."""
        source, wanted = test_case
        self.assertEqual(self.bpe._isolate_glossaries(source), wanted)

    def test_multiple_glossaries(self):
        self._run_test_case((
            'wordlikeUSAwordManuelManuelwordUSA',
            ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA'],
        ))
|
85 |
+
|
86 |
+
class TestRegexIsolateGlossaries(unittest.TestCase):
    """Exercises BPE._isolate_glossaries() with regular-expression glossary entries."""

    def setUp(self):

        # A MagicMock stands in for the codes file handed to BPE.
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        # Raw strings: "\w" and "\d" are invalid escape sequences in ordinary
        # string literals and trigger warnings on newer Python versions. The
        # runtime values are unchanged.
        glossaries = [r"<country>\w*</country>", r"<name>\w*</name>", r"\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        """Call _isolate_glossaries() on test_case[0] and expect test_case[1]."""
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_regex_glossaries(self):
        orig = 'wordlike<country>USA</country>word10001word<name>Manuel</name>word<country>USA</country>'
        exp = ['wordlike', '<country>USA</country>', 'word', '10001', 'word', '<name>Manuel</name>', 'word', '<country>USA</country>']
        test_case = (orig, exp)
        self._run_test_case(test_case)
|
105 |
+
|
106 |
+
def encode_mock(segment, x2, x3, x4, x5, x6, x7, glosses, dropout):
    """Replacement for apply_bpe.encode used while testing BPE.segment().

    A segment matched by the compiled pattern ``glosses`` is returned whole
    as a one-element tuple; any other segment is cut into two halves, the
    first being the shorter one when the length is odd. The remaining
    parameters are accepted but ignored.
    """
    if glosses.match(segment):
        return (segment,)
    middle = len(segment) // 2
    return (segment[:middle], segment[middle:])
|
112 |
+
|
113 |
+
class TestBPESegmentMethod(unittest.TestCase):
    """Exercises BPE.segment() with apply_bpe.encode replaced by encode_mock."""

    def setUp(self):
        # A MagicMock stands in for the codes file handed to BPE.
        codes = mock.MagicMock()
        codes.readline.return_value = 'something'
        self.bpe = BPE(codes, glossaries=['like', 'Manuel', 'USA'])

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        """Segment test_case[0] and expect test_case[1].

        ``encode_function`` is the patched object injected by mock.patch;
        it is not used directly here.
        """
        source, wanted = test_case
        self.assertEqual(self.bpe.segment(source), wanted)

    def test_multiple_glossaries(self):
        self._run_test_case((
            'wordlikeword likeManuelword',
            'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd',
        ))
|
135 |
+
|
136 |
+
if __name__ == '__main__':
|
137 |
+
unittest.main()
|