abhaskumarsinha commited on
Commit
70d3cae
1 Parent(s): 73f96f1

Upload 21 files

Browse files
subword/.ipynb_checkpoints/encoding-checkpoint.ipynb ADDED
@@ -0,0 +1,700 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "id": "9644db35",
7
+ "metadata": {
8
+ "scrolled": true
9
+ },
10
+ "outputs": [
11
+ {
12
+ "name": "stderr",
13
+ "output_type": "stream",
14
+ "text": [
15
+ "\n",
16
+ " 0%| | 0/20000 [00:00<?, ?it/s]\n",
17
+ " 0%| | 1/20000 [00:00<38:40, 8.62it/s]\n",
18
+ " 0%| | 2/20000 [00:00<1:31:59, 3.62it/s]\n",
19
+ " 0%| | 3/20000 [00:00<1:21:11, 4.11it/s]\n",
20
+ " 0%| | 4/20000 [00:01<1:48:20, 3.08it/s]\n",
21
+ " 0%| | 6/20000 [00:01<1:03:27, 5.25it/s]\n",
22
+ " 0%| | 7/20000 [00:01<1:12:17, 4.61it/s]\n",
23
+ " 0%| | 8/20000 [00:01<1:10:13, 4.74it/s]\n",
24
+ " 0%| | 10/20000 [00:02<1:09:39, 4.78it/s]\n",
25
+ " 0%| | 13/20000 [00:02<42:59, 7.75it/s] \n",
26
+ " 0%| | 16/20000 [00:02<30:25, 10.95it/s]\n",
27
+ " 0%| | 19/20000 [00:02<24:50, 13.41it/s]\n",
28
+ " 0%| | 21/20000 [00:03<36:58, 9.01it/s]\n",
29
+ " 0%| | 23/20000 [00:03<35:20, 9.42it/s]\n",
30
+ " 0%| | 25/20000 [00:03<31:30, 10.56it/s]\n",
31
+ " 0%| | 27/20000 [00:03<33:12, 10.03it/s]\n",
32
+ " 0%| | 29/20000 [00:03<33:30, 9.93it/s]\n",
33
+ " 0%| | 31/20000 [00:03<35:06, 9.48it/s]\n",
34
+ " 0%| | 33/20000 [00:04<37:03, 8.98it/s]\n",
35
+ " 0%| | 37/20000 [00:04<26:59, 12.32it/s]\n",
36
+ " 0%| | 39/20000 [00:04<26:54, 12.37it/s]\n",
37
+ " 0%| | 42/20000 [00:04<22:51, 14.55it/s]\n",
38
+ " 0%| | 46/20000 [00:04<19:15, 17.27it/s]\n",
39
+ " 0%| | 48/20000 [00:05<22:34, 14.73it/s]\n",
40
+ " 0%| | 50/20000 [00:05<23:39, 14.06it/s]\n",
41
+ " 0%| | 52/20000 [00:05<23:30, 14.14it/s]\n",
42
+ " 0%| | 55/20000 [00:05<20:12, 16.45it/s]\n",
43
+ " 0%| | 58/20000 [00:05<19:09, 17.35it/s]\n",
44
+ " 0%| | 60/20000 [00:05<19:25, 17.11it/s]\n",
45
+ " 0%| | 63/20000 [00:06<28:45, 11.56it/s]\n",
46
+ " 0%| | 70/20000 [00:06<16:14, 20.45it/s]\n",
47
+ " 0%| | 75/20000 [00:06<14:15, 23.28it/s]\n",
48
+ " 0%| | 78/20000 [00:06<14:28, 22.94it/s]\n",
49
+ " 0%| | 81/20000 [00:06<16:06, 20.62it/s]\n",
50
+ " 0%| | 85/20000 [00:06<15:13, 21.81it/s]\n",
51
+ " 0%| | 88/20000 [00:07<15:39, 21.20it/s]\n",
52
+ " 0%| | 92/20000 [00:07<13:28, 24.61it/s]\n",
53
+ " 0%| | 95/20000 [00:07<13:32, 24.50it/s]\n",
54
+ " 0%| | 99/20000 [00:07<12:06, 27.40it/s]\n",
55
+ " 1%| | 102/20000 [00:07<12:55, 25.65it/s]\n",
56
+ " 1%| | 105/20000 [00:07<12:46, 25.95it/s]\n",
57
+ " 1%| | 112/20000 [00:07<12:02, 27.54it/s]\n",
58
+ " 1%| | 118/20000 [00:08<10:02, 33.00it/s]\n",
59
+ " 1%| | 122/20000 [00:08<10:12, 32.46it/s]\n",
60
+ " 1%| | 127/20000 [00:08<10:07, 32.73it/s]\n",
61
+ " 1%| | 138/20000 [00:08<06:56, 47.66it/s]\n",
62
+ " 1%| | 144/20000 [00:08<06:47, 48.73it/s]\n",
63
+ " 1%| | 150/20000 [00:08<07:00, 47.21it/s]\n",
64
+ " 1%| | 156/20000 [00:08<06:44, 49.01it/s]\n",
65
+ " 1%| | 162/20000 [00:08<06:47, 48.71it/s]\n",
66
+ " 1%| | 169/20000 [00:09<06:38, 49.81it/s]\n",
67
+ " 1%| | 176/20000 [00:09<06:28, 51.03it/s]\n",
68
+ " 1%| | 184/20000 [00:09<05:58, 55.22it/s]\n",
69
+ " 1%| | 190/20000 [00:09<08:05, 40.78it/s]\n",
70
+ " 1%| | 197/20000 [00:09<07:30, 43.91it/s]\n",
71
+ " 1%|1 | 202/20000 [00:09<07:53, 41.79it/s]\n",
72
+ " 1%|1 | 210/20000 [00:10<06:51, 48.15it/s]\n",
73
+ " 1%|1 | 220/20000 [00:10<05:35, 59.01it/s]\n",
74
+ " 1%|1 | 229/20000 [00:10<05:05, 64.72it/s]\n",
75
+ " 1%|1 | 236/20000 [00:10<05:37, 58.47it/s]\n",
76
+ " 1%|1 | 244/20000 [00:10<05:22, 61.18it/s]\n",
77
+ " 1%|1 | 251/20000 [00:10<05:31, 59.65it/s]\n",
78
+ " 1%|1 | 259/20000 [00:10<05:11, 63.33it/s]\n",
79
+ " 1%|1 | 266/20000 [00:10<05:25, 60.70it/s]\n",
80
+ " 1%|1 | 273/20000 [00:10<05:31, 59.42it/s]\n",
81
+ " 1%|1 | 282/20000 [00:11<04:57, 66.26it/s]\n",
82
+ " 1%|1 | 289/20000 [00:11<05:00, 65.52it/s]\n",
83
+ " 1%|1 | 296/20000 [00:11<05:10, 63.47it/s]\n",
84
+ " 2%|1 | 303/20000 [00:11<07:06, 46.23it/s]\n",
85
+ " 2%|1 | 313/20000 [00:11<05:41, 57.60it/s]\n",
86
+ " 2%|1 | 324/20000 [00:11<04:54, 66.87it/s]\n",
87
+ " 2%|1 | 335/20000 [00:11<04:16, 76.56it/s]\n",
88
+ " 2%|1 | 346/20000 [00:11<03:50, 85.09it/s]\n",
89
+ " 2%|1 | 357/20000 [00:12<03:38, 90.06it/s]\n",
90
+ " 2%|1 | 367/20000 [00:12<03:46, 86.85it/s]\n",
91
+ " 2%|1 | 377/20000 [00:12<03:41, 88.70it/s]\n",
92
+ " 2%|1 | 387/20000 [00:12<03:43, 87.95it/s]\n",
93
+ " 2%|1 | 396/20000 [00:12<03:49, 85.48it/s]\n",
94
+ " 2%|2 | 405/20000 [00:12<04:04, 80.15it/s]\n",
95
+ " 2%|2 | 416/20000 [00:12<03:42, 87.82it/s]\n",
96
+ " 2%|2 | 429/20000 [00:12<03:19, 98.03it/s]\n",
97
+ " 2%|2 | 439/20000 [00:13<03:35, 90.74it/s]\n",
98
+ " 2%|2 | 450/20000 [00:13<03:24, 95.65it/s]\n",
99
+ " 2%|2 | 462/20000 [00:13<03:13, 100.98it/s]\n",
100
+ " 2%|2 | 473/20000 [00:13<03:16, 99.60it/s] \n",
101
+ " 2%|2 | 484/20000 [00:13<03:28, 93.64it/s]\n",
102
+ " 2%|2 | 494/20000 [00:13<03:30, 92.86it/s]\n",
103
+ " 3%|2 | 504/20000 [00:13<04:34, 70.99it/s]\n",
104
+ " 3%|2 | 520/20000 [00:13<03:34, 90.96it/s]\n",
105
+ " 3%|2 | 534/20000 [00:14<03:12, 101.38it/s]\n",
106
+ " 3%|2 | 547/20000 [00:14<03:01, 107.03it/s]\n",
107
+ " 3%|2 | 559/20000 [00:14<03:02, 106.25it/s]\n",
108
+ " 3%|2 | 571/20000 [00:14<03:14, 99.81it/s] \n",
109
+ " 3%|2 | 582/20000 [00:14<03:13, 100.11it/s]\n",
110
+ " 3%|2 | 595/20000 [00:14<03:03, 105.49it/s]\n",
111
+ " 3%|3 | 606/20000 [00:14<03:07, 103.63it/s]\n",
112
+ " 3%|3 | 625/20000 [00:14<02:33, 126.08it/s]\n",
113
+ " 3%|3 | 643/20000 [00:14<02:17, 140.29it/s]\n",
114
+ " 3%|3 | 658/20000 [00:15<02:23, 135.01it/s]\n",
115
+ " 3%|3 | 672/20000 [00:15<02:32, 126.59it/s]\n",
116
+ " 3%|3 | 685/20000 [00:15<02:42, 119.19it/s]\n",
117
+ " 3%|3 | 698/20000 [00:15<02:46, 116.22it/s]\n",
118
+ " 4%|3 | 710/20000 [00:15<02:49, 113.91it/s]\n",
119
+ " 4%|3 | 727/20000 [00:15<02:31, 127.58it/s]\n",
120
+ " 4%|3 | 744/20000 [00:15<02:18, 139.24it/s]\n",
121
+ " 4%|3 | 759/20000 [00:15<03:10, 101.19it/s]\n",
122
+ " 4%|3 | 771/20000 [00:16<03:03, 104.67it/s]\n",
123
+ " 4%|3 | 783/20000 [00:16<03:02, 105.07it/s]\n",
124
+ " 4%|3 | 795/20000 [00:16<03:14, 98.92it/s] \n",
125
+ " 4%|4 | 807/20000 [00:16<03:06, 102.82it/s]\n",
126
+ " 4%|4 | 822/20000 [00:16<02:50, 112.34it/s]\n",
127
+ " 4%|4 | 834/20000 [00:16<02:55, 109.45it/s]\n",
128
+ " 4%|4 | 847/20000 [00:16<02:47, 114.28it/s]\n",
129
+ " 4%|4 | 860/20000 [00:16<02:42, 117.94it/s]\n",
130
+ " 4%|4 | 873/20000 [00:16<02:46, 114.58it/s]\n",
131
+ " 4%|4 | 885/20000 [00:17<02:58, 106.97it/s]\n",
132
+ " 4%|4 | 896/20000 [00:17<03:07, 102.08it/s]\n",
133
+ " 5%|4 | 908/20000 [00:17<03:01, 105.42it/s]\n",
134
+ " 5%|4 | 924/20000 [00:17<02:42, 117.67it/s]\n",
135
+ " 5%|4 | 940/20000 [00:17<02:28, 128.26it/s]\n",
136
+ " 5%|4 | 954/20000 [00:17<02:24, 131.54it/s]\n",
137
+ " 5%|4 | 968/20000 [00:17<02:34, 123.37it/s]\n",
138
+ " 5%|4 | 982/20000 [00:17<02:31, 125.85it/s]\n",
139
+ " 5%|4 | 995/20000 [00:18<02:39, 119.06it/s]\n",
140
+ " 5%|5 | 1008/20000 [00:18<03:38, 86.92it/s]\n",
141
+ " 5%|5 | 1024/20000 [00:18<03:04, 102.72it/s]\n",
142
+ " 5%|5 | 1041/20000 [00:18<02:40, 118.03it/s]\n",
143
+ " 5%|5 | 1055/20000 [00:18<02:34, 122.63it/s]\n",
144
+ " 5%|5 | 1069/20000 [00:18<02:31, 124.89it/s]\n",
145
+ " 5%|5 | 1083/20000 [00:18<02:34, 122.68it/s]\n",
146
+ " 5%|5 | 1096/20000 [00:18<02:39, 118.60it/s]\n",
147
+ " 6%|5 | 1110/20000 [00:19<02:32, 123.65it/s]\n",
148
+ " 6%|5 | 1127/20000 [00:19<02:18, 136.02it/s]\n",
149
+ " 6%|5 | 1145/20000 [00:19<02:08, 146.74it/s]\n",
150
+ " 6%|5 | 1161/20000 [00:19<02:06, 148.82it/s]\n",
151
+ " 6%|5 | 1177/20000 [00:19<02:04, 151.15it/s]\n",
152
+ " 6%|5 | 1193/20000 [00:19<02:09, 145.70it/s]\n",
153
+ " 6%|6 | 1208/20000 [00:19<02:10, 144.47it/s]\n",
154
+ " 6%|6 | 1227/20000 [00:19<02:00, 156.04it/s]\n",
155
+ " 6%|6 | 1244/20000 [00:19<01:57, 159.13it/s]\n",
156
+ " 6%|6 | 1261/20000 [00:19<02:01, 154.24it/s]\n",
157
+ " 6%|6 | 1277/20000 [00:20<02:09, 145.11it/s]\n",
158
+ " 6%|6 | 1292/20000 [00:20<02:10, 143.32it/s]\n",
159
+ " 7%|6 | 1307/20000 [00:20<03:02, 102.65it/s]\n",
160
+ " 7%|6 | 1330/20000 [00:20<02:23, 130.14it/s]\n",
161
+ " 7%|6 | 1348/20000 [00:20<02:13, 139.97it/s]\n",
162
+ " 7%|6 | 1368/20000 [00:20<02:02, 152.70it/s]\n",
163
+ " 7%|6 | 1385/20000 [00:20<02:00, 153.99it/s]\n",
164
+ " 7%|7 | 1402/20000 [00:21<02:07, 146.16it/s]\n",
165
+ " 7%|7 | 1423/20000 [00:21<01:55, 161.53it/s]\n",
166
+ " 7%|7 | 1441/20000 [00:21<01:52, 165.17it/s]\n",
167
+ " 7%|7 | 1459/20000 [00:21<01:55, 160.82it/s]\n",
168
+ " 7%|7 | 1476/20000 [00:21<02:03, 149.82it/s]\n",
169
+ " 7%|7 | 1492/20000 [00:21<02:08, 143.79it/s]\n",
170
+ " 8%|7 | 1507/20000 [00:21<02:10, 142.06it/s]\n",
171
+ " 8%|7 | 1530/20000 [00:21<01:52, 164.72it/s]\n",
172
+ " 8%|7 | 1548/20000 [00:21<01:50, 167.09it/s]\n",
173
+ " 8%|7 | 1565/20000 [00:22<01:49, 167.90it/s]\n",
174
+ " 8%|7 | 1582/20000 [00:22<01:53, 161.57it/s]\n",
175
+ " 8%|7 | 1599/20000 [00:22<01:56, 158.15it/s]\n",
176
+ " 8%|8 | 1617/20000 [00:22<01:51, 164.25it/s]\n",
177
+ " 8%|8 | 1637/20000 [00:22<01:45, 174.45it/s]\n",
178
+ " 8%|8 | 1657/20000 [00:22<01:41, 181.32it/s]\n",
179
+ " 8%|8 | 1676/20000 [00:22<01:40, 182.25it/s]\n",
180
+ " 8%|8 | 1695/20000 [00:22<01:46, 171.94it/s]\n",
181
+ " 9%|8 | 1718/20000 [00:22<01:38, 186.12it/s]\n",
182
+ " 9%|8 | 1739/20000 [00:22<01:34, 192.48it/s]\n",
183
+ " 9%|8 | 1759/20000 [00:23<02:13, 136.76it/s]\n",
184
+ " 9%|8 | 1777/20000 [00:23<02:04, 145.80it/s]\n",
185
+ " 9%|8 | 1794/20000 [00:23<02:04, 146.68it/s]\n",
186
+ " 9%|9 | 1814/20000 [00:23<01:53, 159.63it/s]\n",
187
+ " 9%|9 | 1836/20000 [00:23<01:43, 175.04it/s]\n",
188
+ " 9%|9 | 1856/20000 [00:23<01:41, 179.30it/s]\n",
189
+ " 9%|9 | 1875/20000 [00:23<01:42, 176.01it/s]\n",
190
+ " 9%|9 | 1894/20000 [00:23<01:45, 171.34it/s]\n",
191
+ " 10%|9 | 1915/20000 [00:24<01:39, 180.93it/s]\n",
192
+ " 10%|9 | 1937/20000 [00:24<01:34, 190.79it/s]\n",
193
+ " 10%|9 | 1957/20000 [00:24<01:35, 189.63it/s]\n",
194
+ " 10%|9 | 1977/20000 [00:24<01:36, 186.73it/s]\n",
195
+ " 10%|9 | 1996/20000 [00:24<01:42, 175.72it/s]\n",
196
+ " 10%|# | 2018/20000 [00:24<01:35, 187.87it/s]\n",
197
+ " 10%|# | 2046/20000 [00:24<01:24, 212.03it/s]\n",
198
+ " 10%|# | 2068/20000 [00:24<01:27, 204.39it/s]\n",
199
+ " 10%|# | 2089/20000 [00:24<01:31, 195.56it/s]\n",
200
+ " 11%|# | 2109/20000 [00:25<01:33, 192.02it/s]\n",
201
+ " 11%|# | 2140/20000 [00:25<01:19, 224.10it/s]\n",
202
+ " 11%|# | 2165/20000 [00:25<01:17, 230.78it/s]\n",
203
+ " 11%|# | 2189/20000 [00:25<01:18, 225.64it/s]\n",
204
+ " 11%|#1 | 2212/20000 [00:25<01:24, 210.15it/s]\n",
205
+ " 11%|#1 | 2236/20000 [00:25<01:21, 217.71it/s]\n",
206
+ " 11%|#1 | 2259/20000 [00:25<01:22, 215.12it/s]\n",
207
+ " 11%|#1 | 2281/20000 [00:25<01:24, 208.87it/s]\n",
208
+ " 12%|#1 | 2303/20000 [00:25<01:35, 185.14it/s]\n",
209
+ " 12%|#1 | 2333/20000 [00:26<01:22, 213.67it/s]\n",
210
+ " 12%|#1 | 2357/20000 [00:26<01:19, 220.73it/s]\n",
211
+ " 12%|#1 | 2380/20000 [00:26<01:21, 214.95it/s]\n",
212
+ " 12%|#2 | 2402/20000 [00:26<02:03, 142.71it/s]\n",
213
+ " 12%|#2 | 2432/20000 [00:26<01:40, 174.08it/s]\n",
214
+ " 12%|#2 | 2459/20000 [00:26<01:29, 195.81it/s]\n",
215
+ " 12%|#2 | 2482/20000 [00:26<01:28, 198.82it/s]\n",
216
+ " 13%|#2 | 2505/20000 [00:27<01:29, 195.33it/s]\n",
217
+ " 13%|#2 | 2538/20000 [00:27<01:16, 228.52it/s]\n",
218
+ " 13%|#2 | 2566/20000 [00:27<01:11, 242.22it/s]\n",
219
+ " 13%|#2 | 2592/20000 [00:27<01:15, 230.01it/s]\n",
220
+ " 13%|#3 | 2620/20000 [00:27<01:11, 243.40it/s]\n",
221
+ " 13%|#3 | 2651/20000 [00:27<01:06, 261.84it/s]\n",
222
+ " 13%|#3 | 2678/20000 [00:27<01:06, 260.46it/s]\n",
223
+ " 14%|#3 | 2705/20000 [00:27<01:08, 252.37it/s]\n",
224
+ " 14%|#3 | 2740/20000 [00:27<01:02, 278.24it/s]\n",
225
+ " 14%|#3 | 2769/20000 [00:27<01:05, 264.95it/s]\n",
226
+ " 14%|#3 | 2796/20000 [00:28<01:09, 247.16it/s]\n",
227
+ " 14%|#4 | 2828/20000 [00:28<01:04, 264.60it/s]\n",
228
+ " 14%|#4 | 2855/20000 [00:28<01:05, 260.34it/s]\n",
229
+ " 14%|#4 | 2882/20000 [00:28<01:09, 247.20it/s]\n",
230
+ " 15%|#4 | 2908/20000 [00:28<01:12, 236.53it/s]\n",
231
+ " 15%|#4 | 2952/20000 [00:28<00:58, 291.10it/s]\n",
232
+ " 15%|#4 | 2982/20000 [00:28<01:03, 266.27it/s]\n",
233
+ " 15%|#5 | 3010/20000 [00:28<01:03, 267.07it/s]\n",
234
+ " 15%|#5 | 3039/20000 [00:29<01:02, 270.37it/s]\n",
235
+ " 15%|#5 | 3068/20000 [00:29<01:01, 273.53it/s]\n",
236
+ " 15%|#5 | 3096/20000 [00:29<01:04, 263.45it/s]\n",
237
+ " 16%|#5 | 3129/20000 [00:29<00:59, 281.96it/s]\n",
238
+ " 16%|#5 | 3160/20000 [00:29<00:58, 287.48it/s]\n",
239
+ " 16%|#5 | 3190/20000 [00:29<01:00, 279.05it/s]\n",
240
+ " 16%|#6 | 3226/20000 [00:29<00:55, 301.05it/s]\n",
241
+ " 16%|#6 | 3257/20000 [00:29<00:55, 303.61it/s]\n",
242
+ " 16%|#6 | 3288/20000 [00:29<00:56, 293.52it/s]\n",
243
+ " 17%|#6 | 3318/20000 [00:29<00:56, 293.68it/s]\n",
244
+ " 17%|#6 | 3357/20000 [00:30<00:52, 318.68it/s]\n",
245
+ " 17%|#6 | 3390/20000 [00:30<00:58, 284.80it/s]\n",
246
+ " 17%|#7 | 3420/20000 [00:30<01:21, 204.06it/s]\n",
247
+ " 17%|#7 | 3459/20000 [00:30<01:08, 242.62it/s]\n",
248
+ " 17%|#7 | 3491/20000 [00:30<01:03, 260.00it/s]\n",
249
+ " 18%|#7 | 3535/20000 [00:30<00:54, 304.04it/s]\n",
250
+ " 18%|#7 | 3573/20000 [00:30<00:50, 323.92it/s]\n",
251
+ " 18%|#8 | 3608/20000 [00:31<00:55, 296.34it/s]\n",
252
+ " 18%|#8 | 3653/20000 [00:31<00:48, 336.01it/s]\n",
253
+ " 18%|#8 | 3689/20000 [00:31<00:49, 329.16it/s]\n",
254
+ " 19%|#8 | 3733/20000 [00:31<00:45, 358.11it/s]\n",
255
+ " 19%|#8 | 3771/20000 [00:31<00:44, 361.17it/s]\n",
256
+ " 19%|#9 | 3809/20000 [00:31<00:47, 342.31it/s]\n",
257
+ " 19%|#9 | 3861/20000 [00:31<00:41, 390.94it/s]\n",
258
+ " 20%|#9 | 3902/20000 [00:31<00:42, 378.22it/s]\n",
259
+ " 20%|#9 | 3968/20000 [00:31<00:35, 455.02it/s]\n",
260
+ " 20%|## | 4015/20000 [00:32<00:37, 427.77it/s]\n",
261
+ " 20%|## | 4066/20000 [00:32<00:35, 449.03it/s]\n",
262
+ " 21%|## | 4112/20000 [00:32<00:39, 404.45it/s]\n",
263
+ " 21%|## | 4174/20000 [00:32<00:34, 458.89it/s]\n",
264
+ " 21%|##1 | 4222/20000 [00:32<00:35, 442.90it/s]\n",
265
+ " 21%|##1 | 4271/20000 [00:32<00:34, 454.41it/s]\n",
266
+ " 22%|##1 | 4329/20000 [00:32<00:32, 489.36it/s]\n",
267
+ " 22%|##1 | 4387/20000 [00:32<00:30, 515.14it/s]\n",
268
+ " 22%|##2 | 4447/20000 [00:32<00:28, 538.10it/s]\n",
269
+ " 23%|##2 | 4502/20000 [00:33<00:32, 478.73it/s]\n",
270
+ " 23%|##2 | 4563/20000 [00:33<00:30, 512.67it/s]\n",
271
+ " 23%|##3 | 4616/20000 [00:33<00:30, 496.81it/s]\n",
272
+ " 23%|##3 | 4677/20000 [00:33<00:29, 527.98it/s]\n",
273
+ " 24%|##3 | 4733/20000 [00:33<00:28, 537.01it/s]\n",
274
+ " 24%|##3 | 4788/20000 [00:33<00:28, 534.59it/s]\n",
275
+ " 24%|##4 | 4864/20000 [00:33<00:25, 599.65it/s]\n",
276
+ " 25%|##4 | 4925/20000 [00:33<00:25, 595.70it/s]\n",
277
+ " 25%|##4 | 4994/20000 [00:33<00:24, 617.81it/s]\n",
278
+ " 25%|##5 | 5079/20000 [00:33<00:21, 683.71it/s]\n",
279
+ " 26%|##5 | 5148/20000 [00:34<00:35, 419.97it/s]\n",
280
+ " 26%|##6 | 5203/20000 [00:34<00:33, 446.58it/s]\n",
281
+ " 26%|##6 | 5289/20000 [00:34<00:27, 538.90it/s]\n",
282
+ " 27%|##6 | 5377/20000 [00:34<00:23, 622.07it/s]\n",
283
+ " 27%|##7 | 5471/20000 [00:34<00:20, 703.42it/s]\n",
284
+ " 28%|##7 | 5549/20000 [00:36<01:35, 150.73it/s]\n",
285
+ " 28%|##8 | 5606/20000 [00:36<01:37, 147.12it/s]\n",
286
+ " 28%|##8 | 5650/20000 [00:36<01:34, 151.57it/s]\n",
287
+ " 28%|##8 | 5686/20000 [00:37<01:33, 153.50it/s]\n",
288
+ " 29%|##8 | 5716/20000 [00:37<01:32, 154.45it/s]\n",
289
+ " 29%|##8 | 5742/20000 [00:37<01:29, 158.75it/s]\n",
290
+ " 29%|##8 | 5766/20000 [00:37<01:28, 160.05it/s]\n",
291
+ " 29%|##8 | 5788/20000 [00:37<01:29, 159.44it/s]\n",
292
+ " 29%|##9 | 5808/20000 [00:37<01:29, 158.22it/s]\n",
293
+ " 29%|##9 | 5827/20000 [00:37<01:27, 162.78it/s]\n",
294
+ " 29%|##9 | 5846/20000 [00:38<01:25, 165.07it/s]\n",
295
+ " 29%|##9 | 5864/20000 [00:38<01:25, 164.71it/s]\n",
296
+ " 29%|##9 | 5882/20000 [00:38<01:26, 162.88it/s]\n",
297
+ " 29%|##9 | 5899/20000 [00:38<01:30, 155.66it/s]\n",
298
+ " 30%|##9 | 5916/20000 [00:38<01:29, 158.09it/s]\n",
299
+ " 30%|##9 | 5935/20000 [00:38<01:24, 166.09it/s]\n",
300
+ " 30%|##9 | 5954/20000 [00:38<01:22, 169.84it/s]\n",
301
+ " 30%|##9 | 5972/20000 [00:38<01:21, 171.23it/s]\n",
302
+ " 30%|##9 | 5990/20000 [00:38<01:22, 170.81it/s]\n",
303
+ " 30%|### | 6008/20000 [00:39<01:23, 167.70it/s]\n",
304
+ " 30%|### | 6027/20000 [00:39<01:20, 173.01it/s]\n",
305
+ " 30%|### | 6046/20000 [00:39<01:19, 175.35it/s]\n",
306
+ " 30%|### | 6064/20000 [00:39<01:20, 172.23it/s]\n",
307
+ " 30%|### | 6082/20000 [00:39<01:21, 170.55it/s]\n",
308
+ " 30%|### | 6100/20000 [00:39<01:23, 167.05it/s]\n",
309
+ " 31%|### | 6118/20000 [00:39<01:21, 170.70it/s]\n",
310
+ " 31%|### | 6138/20000 [00:39<01:17, 178.09it/s]\n",
311
+ " 31%|### | 6157/20000 [00:39<01:16, 179.96it/s]\n",
312
+ " 31%|### | 6176/20000 [00:39<01:18, 177.21it/s]\n",
313
+ " 31%|### | 6194/20000 [00:40<01:18, 174.99it/s]\n",
314
+ " 31%|###1 | 6212/20000 [00:40<01:19, 173.44it/s]\n",
315
+ " 31%|###1 | 6232/20000 [00:40<01:16, 180.04it/s]\n",
316
+ " 31%|###1 | 6251/20000 [00:40<01:16, 179.80it/s]\n",
317
+ " 31%|###1 | 6270/20000 [00:40<01:19, 172.28it/s]\n",
318
+ " 31%|###1 | 6288/20000 [00:40<01:20, 170.14it/s]\n",
319
+ " 32%|###1 | 6306/20000 [00:40<01:22, 165.92it/s]\n",
320
+ " 32%|###1 | 6327/20000 [00:40<01:16, 178.17it/s]\n",
321
+ " 32%|###1 | 6347/20000 [00:40<01:14, 183.85it/s]\n",
322
+ " 32%|###1 | 6366/20000 [00:41<01:14, 182.46it/s]\n",
323
+ " 32%|###1 | 6385/20000 [00:41<01:17, 175.53it/s]\n",
324
+ " 32%|###2 | 6403/20000 [00:41<01:21, 166.92it/s]\n",
325
+ " 32%|###2 | 6423/20000 [00:41<01:17, 174.57it/s]\n",
326
+ " 32%|###2 | 6443/20000 [00:41<01:15, 179.69it/s]\n",
327
+ " 32%|###2 | 6462/20000 [00:41<01:16, 178.06it/s]\n",
328
+ " 32%|###2 | 6480/20000 [00:41<01:17, 174.17it/s]\n",
329
+ " 32%|###2 | 6498/20000 [00:41<01:19, 170.04it/s]\n",
330
+ " 33%|###2 | 6517/20000 [00:41<01:16, 175.15it/s]\n",
331
+ " 33%|###2 | 6538/20000 [00:42<01:13, 184.06it/s]\n",
332
+ " 33%|###2 | 6558/20000 [00:42<01:11, 187.58it/s]\n",
333
+ " 33%|###2 | 6577/20000 [00:42<01:12, 183.99it/s]\n",
334
+ " 33%|###2 | 6596/20000 [00:42<01:14, 180.51it/s]\n",
335
+ " 33%|###3 | 6615/20000 [00:42<01:14, 180.64it/s]\n",
336
+ " 33%|###3 | 6636/20000 [00:42<01:11, 187.45it/s]\n",
337
+ " 33%|###3 | 6656/20000 [00:42<01:10, 189.43it/s]\n",
338
+ " 33%|###3 | 6675/20000 [00:42<01:11, 185.29it/s]\n",
339
+ " 33%|###3 | 6694/20000 [00:42<01:14, 177.91it/s]\n",
340
+ " 34%|###3 | 6712/20000 [00:42<01:15, 176.02it/s]\n",
341
+ " 34%|###3 | 6733/20000 [00:43<01:11, 185.68it/s]\n",
342
+ " 34%|###3 | 6752/20000 [00:43<01:10, 186.91it/s]\n",
343
+ " 34%|###3 | 6771/20000 [00:43<01:12, 183.53it/s]\n",
344
+ " 34%|###3 | 6790/20000 [00:43<01:15, 175.73it/s]\n",
345
+ " 34%|###4 | 6808/20000 [00:43<01:17, 170.68it/s]\n",
346
+ " 34%|###4 | 6828/20000 [00:43<01:13, 178.87it/s]\n",
347
+ " 34%|###4 | 6849/20000 [00:43<01:10, 186.16it/s]\n",
348
+ " 34%|###4 | 6868/20000 [00:43<01:10, 187.26it/s]\n",
349
+ " 34%|###4 | 6887/20000 [00:43<01:10, 185.36it/s]\n",
350
+ " 35%|###4 | 6906/20000 [00:44<01:12, 180.40it/s]\n",
351
+ " 35%|###4 | 6929/20000 [00:44<01:07, 193.50it/s]\n",
352
+ " 35%|###4 | 6950/20000 [00:44<01:05, 198.27it/s]\n",
353
+ " 35%|###4 | 6970/20000 [00:44<01:06, 197.04it/s]\n",
354
+ " 35%|###4 | 6990/20000 [00:44<01:08, 190.65it/s]\n",
355
+ " 35%|###5 | 7010/20000 [00:44<01:10, 184.33it/s]\n",
356
+ " 35%|###5 | 7029/20000 [00:44<01:10, 183.34it/s]\n",
357
+ " 35%|###5 | 7049/20000 [00:44<01:08, 188.06it/s]\n",
358
+ " 35%|###5 | 7068/20000 [00:44<01:08, 188.62it/s]\n",
359
+ " 35%|###5 | 7087/20000 [00:44<01:09, 184.71it/s]\n",
360
+ " 36%|###5 | 7106/20000 [00:45<01:11, 179.97it/s]\n",
361
+ " 36%|###5 | 7129/20000 [00:45<01:06, 192.60it/s]\n",
362
+ " 36%|###5 | 7151/20000 [00:45<01:04, 198.21it/s]\n",
363
+ " 36%|###5 | 7172/20000 [00:45<01:04, 198.74it/s]\n",
364
+ " 36%|###5 | 7192/20000 [00:45<01:04, 197.39it/s]\n",
365
+ " 36%|###6 | 7212/20000 [00:45<01:05, 195.87it/s]\n",
366
+ " 36%|###6 | 7235/20000 [00:45<01:02, 205.80it/s]\n",
367
+ " 36%|###6 | 7256/20000 [00:45<01:01, 205.81it/s]\n",
368
+ " 36%|###6 | 7277/20000 [00:45<01:03, 201.69it/s]\n",
369
+ " 36%|###6 | 7298/20000 [00:46<01:05, 193.96it/s]\n",
370
+ " 37%|###6 | 7320/20000 [00:46<01:03, 200.19it/s]\n",
371
+ " 37%|###6 | 7343/20000 [00:46<01:00, 208.13it/s]\n",
372
+ " 37%|###6 | 7364/20000 [00:46<01:00, 208.67it/s]\n",
373
+ " 37%|###6 | 7385/20000 [00:46<01:01, 204.27it/s]\n",
374
+ " 37%|###7 | 7406/20000 [00:46<01:03, 199.56it/s]\n",
375
+ " 37%|###7 | 7429/20000 [00:46<01:00, 207.10it/s]\n",
376
+ " 37%|###7 | 7450/20000 [00:46<01:00, 207.93it/s]\n",
377
+ " 37%|###7 | 7471/20000 [00:46<01:00, 207.32it/s]\n",
378
+ " 37%|###7 | 7492/20000 [00:46<01:01, 204.50it/s]\n",
379
+ " 38%|###7 | 7513/20000 [00:47<01:00, 205.49it/s]\n",
380
+ " 38%|###7 | 7537/20000 [00:47<00:57, 215.00it/s]\n",
381
+ " 38%|###7 | 7559/20000 [00:47<00:57, 215.82it/s]\n",
382
+ " 38%|###7 | 7581/20000 [00:47<00:58, 210.82it/s]\n",
383
+ " 38%|###8 | 7603/20000 [00:47<01:01, 200.64it/s]\n",
384
+ " 38%|###8 | 7627/20000 [00:47<00:58, 211.11it/s]\n",
385
+ " 38%|###8 | 7650/20000 [00:47<00:57, 215.27it/s]\n",
386
+ " 38%|###8 | 7672/20000 [00:47<00:58, 211.14it/s]\n",
387
+ " 38%|###8 | 7694/20000 [00:47<01:00, 203.74it/s]\n",
388
+ " 39%|###8 | 7716/20000 [00:48<00:59, 206.56it/s]\n",
389
+ " 39%|###8 | 7742/20000 [00:48<00:55, 219.27it/s]\n",
390
+ " 39%|###8 | 7765/20000 [00:48<00:55, 222.34it/s]\n",
391
+ " 39%|###8 | 7788/20000 [00:48<00:55, 220.09it/s]\n",
392
+ " 39%|###9 | 7811/20000 [00:48<00:56, 217.29it/s]\n",
393
+ " 39%|###9 | 7837/20000 [00:48<00:53, 227.59it/s]\n",
394
+ " 39%|###9 | 7860/20000 [00:48<00:53, 225.68it/s]\n",
395
+ " 39%|###9 | 7883/20000 [00:48<00:57, 210.45it/s]\n",
396
+ " 40%|###9 | 7905/20000 [00:48<00:59, 203.50it/s]\n",
397
+ " 40%|###9 | 7931/20000 [00:49<00:55, 218.40it/s]\n",
398
+ " 40%|###9 | 7955/20000 [00:49<00:53, 223.84it/s]\n",
399
+ " 40%|###9 | 7978/20000 [00:49<00:54, 222.42it/s]\n",
400
+ " 40%|#### | 8001/20000 [00:49<00:56, 211.24it/s]\n",
401
+ " 40%|#### | 8028/20000 [00:49<00:52, 226.96it/s]\n",
402
+ " 40%|#### | 8052/20000 [00:49<00:52, 229.36it/s]\n",
403
+ " 40%|#### | 8076/20000 [00:49<00:52, 226.55it/s]\n",
404
+ " 40%|#### | 8099/20000 [00:49<00:54, 217.59it/s]\n",
405
+ " 41%|#### | 8121/20000 [00:50<01:26, 136.56it/s]\n",
406
+ " 41%|#### | 8144/20000 [00:50<01:16, 154.69it/s]\n",
407
+ " 41%|#### | 8165/20000 [00:50<01:11, 165.14it/s]\n",
408
+ " 41%|#### | 8186/20000 [00:50<01:07, 174.19it/s]\n",
409
+ " 41%|####1 | 8206/20000 [00:50<01:05, 179.79it/s]\n",
410
+ " 41%|####1 | 8234/20000 [00:50<00:57, 205.18it/s]\n",
411
+ " 41%|####1 | 8259/20000 [00:50<00:54, 215.64it/s]\n",
412
+ " 41%|####1 | 8282/20000 [00:50<00:53, 219.03it/s]\n",
413
+ " 42%|####1 | 8305/20000 [00:50<00:55, 209.63it/s]\n",
414
+ " 42%|####1 | 8334/20000 [00:51<00:50, 229.98it/s]\n",
415
+ " 42%|####1 | 8359/20000 [00:51<00:49, 234.96it/s]\n",
416
+ " 42%|####1 | 8383/20000 [00:51<00:50, 230.45it/s]\n",
417
+ " 42%|####2 | 8407/20000 [00:51<00:52, 222.38it/s]\n",
418
+ " 42%|####2 | 8436/20000 [00:51<00:48, 240.62it/s]\n",
419
+ " 42%|####2 | 8461/20000 [00:51<00:47, 242.60it/s]\n",
420
+ " 42%|####2 | 8486/20000 [00:51<00:47, 239.88it/s]\n",
421
+ " 43%|####2 | 8511/20000 [00:51<00:48, 236.67it/s]\n",
422
+ " 43%|####2 | 8539/20000 [00:51<00:46, 247.65it/s]\n",
423
+ " 43%|####2 | 8564/20000 [00:51<00:46, 244.78it/s]\n",
424
+ " 43%|####2 | 8589/20000 [00:52<00:48, 236.04it/s]\n",
425
+ " 43%|####3 | 8613/20000 [00:52<00:48, 235.15it/s]\n",
426
+ " 43%|####3 | 8643/20000 [00:52<00:45, 252.29it/s]\n",
427
+ " 43%|####3 | 8669/20000 [00:52<00:45, 248.06it/s]\n",
428
+ " 43%|####3 | 8694/20000 [00:52<00:47, 240.33it/s]\n",
429
+ " 44%|####3 | 8720/20000 [00:52<00:46, 243.84it/s]\n",
430
+ " 44%|####3 | 8748/20000 [00:52<00:44, 254.21it/s]\n",
431
+ " 44%|####3 | 8777/20000 [00:52<00:42, 263.08it/s]\n",
432
+ " 44%|####4 | 8808/20000 [00:52<00:40, 276.78it/s]\n",
433
+ " 44%|####4 | 8856/20000 [00:53<00:33, 336.62it/s]\n",
434
+ " 44%|####4 | 8896/20000 [00:53<00:31, 353.29it/s]\n",
435
+ " 45%|####4 | 8955/20000 [00:53<00:26, 422.10it/s]\n",
436
+ " 45%|####5 | 9001/20000 [00:53<00:25, 425.76it/s]\n",
437
+ " 45%|####5 | 9070/20000 [00:53<00:21, 501.92it/s]\n",
438
+ " 46%|####5 | 9128/20000 [00:53<00:20, 523.43it/s]\n",
439
+ " 46%|####5 | 9183/20000 [00:53<00:20, 529.69it/s]\n",
440
+ " 46%|####6 | 9237/20000 [00:53<00:20, 531.16it/s]\n",
441
+ " 46%|####6 | 9291/20000 [00:53<00:21, 508.14it/s]\n",
442
+ " 47%|####6 | 9346/20000 [00:53<00:20, 520.17it/s]\n",
443
+ " 47%|####6 | 9399/20000 [00:54<00:20, 509.66it/s]\n",
444
+ " 47%|####7 | 9451/20000 [00:54<00:20, 509.69it/s]\n",
445
+ " 48%|####7 | 9503/20000 [00:54<00:21, 494.01it/s]\n",
446
+ " 48%|####7 | 9580/20000 [00:54<00:18, 569.61it/s]\n",
447
+ " 48%|####8 | 9649/20000 [00:54<00:17, 602.61it/s]\n",
448
+ " 49%|####8 | 9710/20000 [00:54<00:17, 597.78it/s]\n",
449
+ " 49%|####8 | 9792/20000 [00:54<00:15, 660.63it/s]\n",
450
+ " 49%|####9 | 9874/20000 [00:54<00:14, 707.37it/s]\n",
451
+ " 50%|####9 | 9946/20000 [00:54<00:14, 698.70it/s]\n",
452
+ " 50%|##### | 10029/20000 [00:54<00:13, 735.01it/s]\n",
453
+ " 51%|##### | 10103/20000 [00:55<00:14, 703.33it/s]\n",
454
+ " 51%|##### | 10187/20000 [00:55<00:13, 742.49it/s]\n",
455
+ " 51%|#####1 | 10275/20000 [00:55<00:12, 782.42it/s]\n",
456
+ " 52%|#####1 | 10372/20000 [00:55<00:11, 837.41it/s]\n",
457
+ " 52%|#####2 | 10463/20000 [00:55<00:11, 858.83it/s]\n",
458
+ " 53%|#####2 | 10550/20000 [00:55<00:10, 862.08it/s]\n",
459
+ " 53%|#####3 | 10640/20000 [00:55<00:10, 873.35it/s]\n",
460
+ " 54%|#####3 | 10728/20000 [00:55<00:10, 857.41it/s]\n",
461
+ " 54%|#####4 | 10815/20000 [00:55<00:10, 858.61it/s]\n",
462
+ " 55%|#####4 | 10902/20000 [00:56<00:10, 861.95it/s]\n",
463
+ " 55%|#####5 | 11034/20000 [00:56<00:08, 997.87it/s]\n",
464
+ " 56%|#####5 | 11179/20000 [00:56<00:07, 1132.48it/s]\n",
465
+ " 56%|#####6 | 11296/20000 [00:56<00:07, 1143.65it/s]\n",
466
+ " 57%|#####7 | 11420/20000 [00:56<00:07, 1172.43it/s]\n",
467
+ " 58%|#####7 | 11579/20000 [00:56<00:06, 1297.33it/s]\n",
468
+ " 59%|#####8 | 11758/20000 [00:56<00:05, 1444.84it/s]\n",
469
+ " 60%|#####9 | 11971/20000 [00:56<00:04, 1650.09it/s]\n",
470
+ " 61%|###### | 12137/20000 [00:58<00:32, 241.50it/s] \n",
471
+ " 61%|######1 | 12256/20000 [00:59<00:34, 221.55it/s]\n",
472
+ " 62%|######1 | 12344/20000 [00:59<00:36, 211.18it/s]\n",
473
+ " 62%|######2 | 12411/20000 [01:00<00:37, 204.44it/s]\n",
474
+ " 62%|######2 | 12464/20000 [01:00<00:37, 201.86it/s]\n",
475
+ " 63%|######2 | 12507/20000 [01:00<00:37, 197.48it/s]\n",
476
+ " 63%|######2 | 12542/20000 [01:00<00:35, 209.76it/s]\n",
477
+ " 63%|######2 | 12576/20000 [01:01<00:33, 220.35it/s]\n",
478
+ " 63%|######3 | 12609/20000 [01:01<00:32, 226.03it/s]\n",
479
+ " 63%|######3 | 12640/20000 [01:01<00:31, 234.04it/s]\n",
480
+ " 63%|######3 | 12670/20000 [01:01<00:29, 246.31it/s]\n",
481
+ " 64%|######3 | 12700/20000 [01:01<00:28, 251.76it/s]\n",
482
+ " 64%|######3 | 12731/20000 [01:01<00:27, 263.76it/s]\n",
483
+ " 64%|######3 | 12761/20000 [01:01<00:26, 272.59it/s]\n",
484
+ " 64%|######3 | 12791/20000 [01:01<00:26, 271.94it/s]\n",
485
+ " 64%|######4 | 12820/20000 [01:01<00:26, 274.53it/s]\n",
486
+ " 64%|######4 | 12850/20000 [01:02<00:25, 281.46it/s]\n",
487
+ " 64%|######4 | 12879/20000 [01:02<00:25, 277.63it/s]\n",
488
+ " 65%|######4 | 12908/20000 [01:02<00:26, 264.03it/s]\n",
489
+ " 65%|######4 | 12939/20000 [01:02<00:25, 276.61it/s]\n",
490
+ " 65%|######4 | 12969/20000 [01:02<00:24, 281.60it/s]\n",
491
+ " 65%|######4 | 12998/20000 [01:02<00:24, 280.78it/s]\n",
492
+ " 65%|######5 | 13027/20000 [01:02<00:25, 275.51it/s]\n",
493
+ " 65%|######5 | 13058/20000 [01:02<00:24, 285.34it/s]\n",
494
+ " 65%|######5 | 13087/20000 [01:02<00:24, 285.03it/s]\n",
495
+ " 66%|######5 | 13117/20000 [01:03<00:23, 287.71it/s]\n",
496
+ " 66%|######5 | 13151/20000 [01:03<00:22, 301.25it/s]\n",
497
+ " 66%|######5 | 13182/20000 [01:03<00:22, 299.42it/s]\n",
498
+ " 66%|######6 | 13213/20000 [01:03<00:23, 288.18it/s]\n",
499
+ " 66%|######6 | 13247/20000 [01:03<00:22, 302.07it/s]\n",
500
+ " 66%|######6 | 13280/20000 [01:03<00:21, 309.23it/s]\n",
501
+ " 67%|######6 | 13312/20000 [01:03<00:21, 306.12it/s]\n",
502
+ " 67%|######6 | 13348/20000 [01:03<00:20, 321.72it/s]\n",
503
+ " 67%|######6 | 13381/20000 [01:03<00:20, 320.39it/s]\n",
504
+ " 67%|######7 | 13414/20000 [01:04<00:35, 183.90it/s]\n",
505
+ " 67%|######7 | 13448/20000 [01:04<00:30, 213.47it/s]\n",
506
+ " 67%|######7 | 13478/20000 [01:04<00:28, 232.06it/s]\n",
507
+ " 68%|######7 | 13508/20000 [01:04<00:26, 246.85it/s]\n",
508
+ " 68%|######7 | 13546/20000 [01:04<00:23, 278.79it/s]\n",
509
+ " 68%|######7 | 13578/20000 [01:04<00:22, 289.60it/s]\n",
510
+ " 68%|######8 | 13610/20000 [01:04<00:21, 290.75it/s]\n",
511
+ " 68%|######8 | 13650/20000 [01:04<00:19, 319.96it/s]\n",
512
+ " 68%|######8 | 13684/20000 [01:05<00:19, 322.87it/s]\n",
513
+ " 69%|######8 | 13718/20000 [01:05<00:19, 324.97it/s]\n",
514
+ " 69%|######8 | 13753/20000 [01:05<00:18, 332.16it/s]\n",
515
+ " 69%|######8 | 13787/20000 [01:05<00:19, 323.16it/s]\n",
516
+ " 69%|######9 | 13820/20000 [01:05<00:19, 317.82it/s]\n",
517
+ " 69%|######9 | 13857/20000 [01:05<00:18, 332.74it/s]\n",
518
+ " 69%|######9 | 13891/20000 [01:05<00:18, 333.86it/s]\n",
519
+ " 70%|######9 | 13927/20000 [01:05<00:17, 340.50it/s]\n",
520
+ " 70%|######9 | 13963/20000 [01:05<00:17, 345.20it/s]\n",
521
+ " 70%|######9 | 13998/20000 [01:05<00:17, 340.60it/s]\n",
522
+ " 70%|####### | 14036/20000 [01:06<00:16, 351.09it/s]\n",
523
+ " 70%|####### | 14073/20000 [01:06<00:16, 356.65it/s]\n",
524
+ " 71%|####### | 14109/20000 [01:06<00:16, 353.45it/s]\n",
525
+ " 71%|####### | 14150/20000 [01:06<00:15, 369.02it/s]\n",
526
+ " 71%|####### | 14187/20000 [01:06<00:15, 368.21it/s]\n",
527
+ " 71%|#######1 | 14227/20000 [01:06<00:15, 375.42it/s]\n",
528
+ " 71%|#######1 | 14265/20000 [01:06<00:16, 345.08it/s]\n",
529
+ " 72%|#######1 | 14301/20000 [01:06<00:16, 347.30it/s]\n",
530
+ " 72%|#######1 | 14349/20000 [01:06<00:14, 383.90it/s]\n",
531
+ " 72%|#######1 | 14388/20000 [01:06<00:14, 376.96it/s]\n",
532
+ " 72%|#######2 | 14430/20000 [01:07<00:14, 389.28it/s]\n",
533
+ " 72%|#######2 | 14471/20000 [01:07<00:13, 395.30it/s]\n",
534
+ " 73%|#######2 | 14511/20000 [01:07<00:14, 389.82it/s]\n",
535
+ " 73%|#######2 | 14554/20000 [01:07<00:13, 401.53it/s]\n",
536
+ " 73%|#######2 | 14595/20000 [01:07<00:14, 378.41it/s]\n",
537
+ " 73%|#######3 | 14643/20000 [01:07<00:13, 405.95it/s]\n",
538
+ " 73%|#######3 | 14687/20000 [01:07<00:12, 415.69it/s]\n",
539
+ " 74%|#######3 | 14730/20000 [01:07<00:12, 418.62it/s]\n",
540
+ " 74%|#######3 | 14774/20000 [01:07<00:12, 422.40it/s]\n",
541
+ " 74%|#######4 | 14817/20000 [01:08<00:12, 418.48it/s]\n",
542
+ " 74%|#######4 | 14868/20000 [01:08<00:11, 443.95it/s]\n",
543
+ " 75%|#######4 | 14913/20000 [01:08<00:11, 444.41it/s]\n",
544
+ " 75%|#######4 | 14962/20000 [01:08<00:11, 457.86it/s]\n",
545
+ " 75%|#######5 | 15008/20000 [01:08<00:11, 438.97it/s]\n",
546
+ " 75%|#######5 | 15067/20000 [01:08<00:10, 481.14it/s]\n",
547
+ " 76%|#######5 | 15116/20000 [01:08<00:10, 483.71it/s]\n",
548
+ " 76%|#######5 | 15173/20000 [01:08<00:09, 509.06it/s]\n",
549
+ " 76%|#######6 | 15227/20000 [01:08<00:09, 518.19it/s]\n",
550
+ " 76%|#######6 | 15285/20000 [01:08<00:08, 534.95it/s]\n",
551
+ " 77%|#######6 | 15351/20000 [01:09<00:08, 570.41it/s]\n",
552
+ " 77%|#######7 | 15409/20000 [01:09<00:08, 569.86it/s]\n",
553
+ " 77%|#######7 | 15477/20000 [01:09<00:07, 602.56it/s]\n",
554
+ " 78%|#######7 | 15538/20000 [01:09<00:07, 602.96it/s]\n",
555
+ " 78%|#######7 | 15599/20000 [01:09<00:07, 585.87it/s]\n",
556
+ " 78%|#######8 | 15658/20000 [01:09<00:07, 581.97it/s]\n",
557
+ " 79%|#######8 | 15722/20000 [01:09<00:07, 598.93it/s]\n",
558
+ " 79%|#######8 | 15799/20000 [01:09<00:06, 647.41it/s]\n",
559
+ " 79%|#######9 | 15877/20000 [01:09<00:06, 684.57it/s]\n",
560
+ " 80%|#######9 | 15957/20000 [01:09<00:05, 718.72it/s]\n",
561
+ " 80%|######## | 16037/20000 [01:10<00:05, 740.70it/s]\n",
562
+ " 81%|######## | 16112/20000 [01:10<00:05, 730.42it/s]\n",
563
+ " 81%|######## | 16195/20000 [01:10<00:05, 757.50it/s]\n",
564
+ " 81%|########1 | 16288/20000 [01:10<00:04, 808.47it/s]\n",
565
+ " 82%|########1 | 16369/20000 [01:10<00:04, 797.07it/s]\n",
566
+ " 82%|########2 | 16467/20000 [01:10<00:04, 850.97it/s]\n",
567
+ " 83%|########2 | 16563/20000 [01:10<00:03, 883.26it/s]\n",
568
+ " 83%|########3 | 16659/20000 [01:10<00:03, 906.02it/s]\n",
569
+ " 84%|########3 | 16767/20000 [01:10<00:03, 957.87it/s]\n",
570
+ " 84%|########4 | 16881/20000 [01:10<00:03, 1012.25it/s]\n",
571
+ " 85%|########4 | 16990/20000 [01:11<00:02, 1035.48it/s]\n",
572
+ " 86%|########5 | 17120/20000 [01:11<00:02, 1114.63it/s]\n",
573
+ " 86%|########6 | 17240/20000 [01:11<00:02, 1136.79it/s]\n",
574
+ " 87%|########6 | 17379/20000 [01:11<00:02, 1212.48it/s]\n",
575
+ " 88%|########7 | 17514/20000 [01:11<00:01, 1249.92it/s]\n",
576
+ " 88%|########8 | 17656/20000 [01:11<00:01, 1300.74it/s]\n",
577
+ " 89%|########9 | 17812/20000 [01:11<00:01, 1378.28it/s]\n",
578
+ " 90%|######### | 18001/20000 [01:11<00:01, 1522.37it/s]\n",
579
+ " 91%|#########1| 18201/20000 [01:11<00:01, 1664.77it/s]\n",
580
+ " 92%|#########2| 18455/20000 [01:11<00:00, 1926.29it/s]\n",
581
+ " 94%|#########3| 18729/20000 [01:13<00:03, 331.40it/s] \n",
582
+ " 94%|#########4| 18869/20000 [01:14<00:04, 279.90it/s]\n",
583
+ " 95%|#########4| 18972/20000 [01:15<00:04, 253.37it/s]\n",
584
+ " 95%|#########5| 19050/20000 [01:15<00:03, 238.36it/s]\n",
585
+ " 96%|#########5| 19110/20000 [01:16<00:03, 223.98it/s]\n",
586
+ " 96%|#########5| 19157/20000 [01:16<00:03, 218.87it/s]\n",
587
+ " 96%|#########5| 19196/20000 [01:16<00:03, 212.50it/s]\n",
588
+ " 96%|#########6| 19229/20000 [01:16<00:03, 208.06it/s]\n",
589
+ " 96%|#########6| 19258/20000 [01:16<00:03, 205.77it/s]\n",
590
+ " 96%|#########6| 19284/20000 [01:17<00:03, 202.04it/s]\n",
591
+ " 97%|#########6| 19308/20000 [01:17<00:03, 197.39it/s]\n",
592
+ " 97%|#########6| 19330/20000 [01:17<00:03, 197.54it/s]\n",
593
+ " 97%|#########6| 19352/20000 [01:17<00:03, 196.16it/s]\n",
594
+ " 97%|#########6| 19373/20000 [01:17<00:03, 194.10it/s]\n",
595
+ " 97%|#########6| 19394/20000 [01:17<00:03, 191.14it/s]\n",
596
+ " 97%|#########7| 19414/20000 [01:17<00:03, 190.06it/s]\n",
597
+ " 97%|#########7| 19434/20000 [01:17<00:02, 192.10it/s]\n",
598
+ " 97%|#########7| 19454/20000 [01:17<00:02, 188.68it/s]\n",
599
+ " 97%|#########7| 19474/20000 [01:18<00:02, 188.67it/s]\n",
600
+ " 97%|#########7| 19493/20000 [01:18<00:02, 188.00it/s]\n",
601
+ " 98%|#########7| 19512/20000 [01:18<00:02, 187.50it/s]\n",
602
+ " 98%|#########7| 19533/20000 [01:18<00:02, 193.36it/s]\n",
603
+ " 98%|#########7| 19553/20000 [01:18<00:02, 194.71it/s]\n",
604
+ " 98%|#########7| 19573/20000 [01:18<00:02, 194.55it/s]\n",
605
+ " 98%|#########7| 19593/20000 [01:18<00:02, 192.76it/s]\n",
606
+ " 98%|#########8| 19613/20000 [01:18<00:02, 190.98it/s]\n",
607
+ " 98%|#########8| 19634/20000 [01:18<00:01, 194.23it/s]\n",
608
+ " 98%|#########8| 19654/20000 [01:18<00:01, 193.65it/s]\n",
609
+ " 98%|#########8| 19674/20000 [01:19<00:01, 192.69it/s]\n",
610
+ " 98%|#########8| 19694/20000 [01:19<00:01, 192.02it/s]\n",
611
+ " 99%|#########8| 19714/20000 [01:19<00:01, 192.65it/s]\n",
612
+ " 99%|#########8| 19736/20000 [01:19<00:01, 198.30it/s]\n",
613
+ " 99%|#########8| 19757/20000 [01:19<00:01, 200.54it/s]\n",
614
+ " 99%|#########8| 19778/20000 [01:19<00:01, 198.65it/s]\n",
615
+ " 99%|#########8| 19798/20000 [01:19<00:01, 197.32it/s]\n",
616
+ " 99%|#########9| 19818/20000 [01:19<00:00, 197.53it/s]\n",
617
+ " 99%|#########9| 19839/20000 [01:19<00:00, 200.59it/s]\n",
618
+ " 99%|#########9| 19860/20000 [01:19<00:00, 196.98it/s]\n",
619
+ " 99%|#########9| 19881/20000 [01:20<00:00, 198.45it/s]\n",
620
+ "100%|#########9| 19901/20000 [01:20<00:00, 193.05it/s]\n",
621
+ "100%|#########9| 19924/20000 [01:20<00:00, 201.34it/s]\n",
622
+ "100%|#########9| 19946/20000 [01:20<00:00, 205.53it/s]\n",
623
+ "100%|#########9| 19967/20000 [01:20<00:00, 205.63it/s]\n",
624
+ "100%|#########9| 19988/20000 [01:20<00:00, 203.92it/s]\n",
625
+ "100%|##########| 20000/20000 [01:20<00:00, 247.89it/s]\n"
626
+ ]
627
+ }
628
+ ],
629
+ "source": [
630
+ "!python learn_bpe.py -s 20000 -i dataset/output.txt -o dataset/codec.txt"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": 12,
636
+ "id": "68a4113a",
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "!apply_bpe.py -i ./dataset/output.txt -o ./dataset/output_dataset.txt -c ./dataset/codec.txt"
641
+ ]
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "execution_count": 13,
646
+ "id": "06254f0d",
647
+ "metadata": {},
648
+ "outputs": [
649
+ {
650
+ "name": "stdout",
651
+ "output_type": "stream",
652
+ "text": [
653
+ "Vocabulary size: 20217\n"
654
+ ]
655
+ }
656
+ ],
657
+ "source": [
658
+ "def count_tokens(file_path):\n",
659
+ " try:\n",
660
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
661
+ " text = file.read()\n",
662
+ " # Split the text into tokens based on spaces\n",
663
+ " tokens = text.split()\n",
664
+ " # Count the vocabulary size (number of unique tokens)\n",
665
+ " vocabulary_size = len(set(tokens))\n",
666
+ " return vocabulary_size\n",
667
+ " except IOError:\n",
668
+ " print(f\"Error: Could not open or read the file '{file_path}'\")\n",
669
+ " return -1\n",
670
+ "\n",
671
+ "# Example usage\n",
672
+ "file_path = './dataset/output_dataset.txt' # Replace with the actual file path\n",
673
+ "vocabulary_size = count_tokens(file_path)\n",
674
+ "if vocabulary_size != -1:\n",
675
+ " print(f\"Vocabulary size: {vocabulary_size}\")\n"
676
+ ]
677
+ }
678
+ ],
679
+ "metadata": {
680
+ "kernelspec": {
681
+ "display_name": "Python 3 (ipykernel)",
682
+ "language": "python",
683
+ "name": "python3"
684
+ },
685
+ "language_info": {
686
+ "codemirror_mode": {
687
+ "name": "ipython",
688
+ "version": 3
689
+ },
690
+ "file_extension": ".py",
691
+ "mimetype": "text/x-python",
692
+ "name": "python",
693
+ "nbconvert_exporter": "python",
694
+ "pygments_lexer": "ipython3",
695
+ "version": "3.9.5"
696
+ }
697
+ },
698
+ "nbformat": 4,
699
+ "nbformat_minor": 5
700
+ }
subword/__init__.py ADDED
File without changes
subword/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (166 Bytes). View file
 
subword/__pycache__/apply_bpe.cpython-39.pyc ADDED
Binary file (13.4 kB). View file
 
subword/apply_bpe.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Author: Rico Sennrich
4
+
5
+ """Use operations learned with learn_bpe.py to encode a new text.
6
+ The text will not be smaller, but use only a fixed vocabulary, with rare words
7
+ encoded as variable-length sequences of subword units.
8
+
9
+ Reference:
10
+ Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units.
11
+ Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
12
+ """
13
+
14
+ from __future__ import unicode_literals, division
15
+
16
+ import sys
17
+ import os
18
+ import inspect
19
+ import codecs
20
+ import io
21
+ import argparse
22
+ import re
23
+ import warnings
24
+ import random
25
+ import tempfile
26
+ from multiprocessing import Pool, cpu_count
27
+
28
+ # hack for python2/3 compatibility
29
+ from io import open
30
+ argparse.open = open
31
+
32
class BPE(object):
    """Apply a learned list of BPE merge operations to text.

    The merge table is read once in the constructor; per-word results are
    memoised in ``self.cache`` (see ``encode``).
    """

    def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):
        """Load merge operations from an open codes file.

        Args:
            codes: seekable text file object holding one merge (two
                space-separated symbols) per line, optionally preceded by a
                ``#version: X.Y`` header line.
            merges: number of leading merge operations to keep; ``-1`` keeps all.
            separator: marker appended to every non-final subword unit.
            vocab: optional collection of allowed subwords, passed on to ``encode``.
            glossaries: optional list of words/regexes that must not be segmented.

        Exits the process with status 1 if a line does not hold exactly two symbols.
        """

        codes.seek(0)
        offset = 1

        # check version information
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            # strip trailing ".0" components so that e.g. "0.2.0" compares equal to (0, 2)
            self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$', '', firstline.split()[-1]).split(".")])
            offset += 1
        else:
            # no header: legacy format, and the first line is already a merge
            self.version = (0, 1)
            codes.seek(0)

        self.bpe_codes = [tuple(item.strip('\r\n ').split(' ')) for (n, item) in enumerate(codes.read().rstrip('\n').split('\n')) if (n < merges or merges == -1)]

        for i, item in enumerate(self.bpe_codes):
            if len(item) != 2:
                sys.stderr.write('Error: invalid line {0} in BPE codes file: {1}\n'.format(i+offset, ' '.join(item)))
                sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n')
                sys.exit(1)

        # some hacking to deal with duplicates (only consider first instance):
        # inserting in reverse order lets the earliest (lowest) index win
        self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])

        # merged symbol -> the pair it was built from (used to undo merges)
        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair, i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        self.glossaries_regex = re.compile('^({})$'.format('|'.join(glossaries))) if glossaries else None

        self.cache = {}

    def process_lines(self, filename, outfile, dropout=0, num_workers=1):
        """Segment every line of ``filename`` and write the result to ``outfile``.

        With ``num_workers > 1`` the input is cut into byte ranges that start on
        line boundaries; each range is segmented by a worker process into its
        own temporary file, and the temporary files are concatenated in order.

        Raises:
            ValueError: if ``num_workers`` is not positive.
            Exception: any exception raised inside a worker is re-raised here
                instead of being silently dropped.
        """

        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        if num_workers == 1:
            _process_lines(self, filename, outfile, dropout, 0, 0)
        elif num_workers > 1:
            with open(filename, encoding="utf-8") as f:
                size = os.fstat(f.fileno()).st_size
                chunk_size = int(size / num_workers)
                # offsets[0] and offsets[num_workers] stay 0; an end offset of 0
                # tells _process_lines to read until end of file
                offsets = [0 for _ in range(num_workers + 1)]
                for i in range(1, num_workers):
                    f.seek(chunk_size * i)
                    pos = f.tell()
                    while True:
                        try:
                            # skip the (possibly partial) current line
                            f.readline()
                            break
                        except UnicodeDecodeError:
                            # landed inside a multi-byte character: back up one byte
                            pos -= 1
                            f.seek(pos)
                    offsets[i] = f.tell()
                    assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"
            res_files = []
            results = []
            pool = Pool(processes=num_workers)
            try:
                for i in range(num_workers):
                    tmp = tempfile.NamedTemporaryFile(delete=False)
                    tmp.close()
                    res_files.append(tmp)
                    results.append(
                        pool.apply_async(_process_lines, (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1])))
                pool.close()
                pool.join()
                # surface worker failures; without this a crashed worker would
                # just leave an empty/partial chunk in the output
                for result in results:
                    result.get()
                for tmp in res_files:
                    with open(tmp.name, encoding="utf-8") as fi:
                        for line in fi:
                            outfile.write(line)
            finally:
                pool.terminate()
                for tmp in res_files:
                    if os.path.exists(tmp.name):
                        os.remove(tmp.name)
        else:
            raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))

    def process_line(self, line, dropout=0):
        """segment line, dealing with leading and trailing whitespace"""

        out = ""

        leading_whitespace = len(line)-len(line.lstrip('\r\n '))
        if leading_whitespace:
            out += line[:leading_whitespace]

        out += self.segment(line, dropout)

        trailing_whitespace = len(line)-len(line.rstrip('\r\n '))
        # an all-whitespace line was already copied in full as leading whitespace
        if trailing_whitespace and trailing_whitespace != len(line):
            out += line[-trailing_whitespace:]

        return out

    def segment(self, sentence, dropout=0):
        """segment single sentence (whitespace-tokenized string) with BPE encoding"""
        segments = self.segment_tokens(sentence.strip('\r\n ').split(' '), dropout)
        return ' '.join(segments)

    def segment_tokens(self, tokens, dropout=0):
        """segment a sequence of tokens with BPE encoding"""
        output = []
        for word in tokens:
            # eliminate double spaces
            if not word:
                continue
            new_word = [out for segment in self._isolate_glossaries(word)
                        for out in encode(segment,
                                          self.bpe_codes,
                                          self.bpe_codes_reverse,
                                          self.vocab,
                                          self.separator,
                                          self.version,
                                          self.cache,
                                          self.glossaries_regex,
                                          dropout)]

            # every subword except the last one of a word carries the separator
            for item in new_word[:-1]:
                output.append(item + self.separator)
            output.append(new_word[-1])

        return output

    def _isolate_glossaries(self, word):
        """Split ``word`` so that every glossary match becomes its own segment."""
        word_segments = [word]
        for gloss in self.glossaries:
            word_segments = [out_segments for segment in word_segments
                             for out_segments in isolate_glossary(segment, gloss)]
        return word_segments
165
+
166
+ def _process_lines(bpe, filename, outfile, dropout, begin, end):
167
+ if isinstance(outfile, str):
168
+ fo = open(outfile, "w", encoding="utf-8")
169
+ else:
170
+ fo = outfile
171
+ with open(filename, encoding="utf-8") as f:
172
+ f.seek(begin)
173
+ line = f.readline()
174
+ while line:
175
+ pos = f.tell()
176
+ assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
177
+ if end > 0 and pos > end:
178
+ break
179
+ fo.write(bpe.process_line(line, dropout))
180
+ line = f.readline()
181
+ if isinstance(outfile, str):
182
+ fo.close()
183
+
184
def create_parser(subparsers=None):
    """Build the command-line parser for applying BPE.

    Args:
        subparsers: optional ``argparse`` sub-parsers action; when given, the
            options are registered on a new ``apply-bpe`` sub-command instead
            of a stand-alone parser.

    Returns:
        The configured parser.
    """

    if subparsers:
        parser = subparsers.add_parser('apply-bpe',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="apply BPE-based word segmentation")
    else:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="apply BPE-based word segmentation")

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    parser.add_argument(
        '--codes', '-c', type=argparse.FileType('r'), metavar='PATH',
        required=True,
        help="File with BPE codes (created by learn_bpe.py).")
    parser.add_argument(
        '--merges', '-m', type=int, default=-1,
        metavar='INT',
        help="Use this many BPE operations (<= number of learned symbols) "+
             "default: Apply all the learned merge operations")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    parser.add_argument(
        '--separator', '-s', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    parser.add_argument(
        '--vocabulary', type=argparse.FileType('r'), default=None,
        metavar="PATH",
        help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.")
    parser.add_argument(
        '--vocabulary-threshold', type=int, default=None,
        metavar="INT",
        help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV")
    parser.add_argument(
        '--dropout', type=float, default=0,
        metavar="P",
        help="Dropout BPE merge operations with probability P (Provilkov et al., 2019). Use this on training data only.")
    parser.add_argument(
        '--glossaries', type=str, nargs='+', default=None,
        metavar="STR",
        help="Glossaries. Words matching any of the words/regex provided in glossaries will not be affected "+
             "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords). "+
             "Can be provided as a list of words/regex after the --glossaries argument. Enclose each regex in quotes.")
    parser.add_argument(
        '--seed', type=int, default=None,
        metavar="S",
        help="Random seed for the random number generators (e.g. for BPE dropout with --dropout).")
    parser.add_argument(
        '--num-workers', type=int, default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")

    return parser
242
+
243
def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries_regex=None, dropout=0):
    """Encode word based on list of BPE merge operations, which are applied consecutively

    Args:
        orig: the word (string) to segment.
        bpe_codes: mapping from symbol pair to its merge priority (lower first).
        bpe_codes_reverse: mapping from merged symbol to its pair; only used
            when ``vocab`` is given.
        vocab: optional vocabulary; if truthy, out-of-vocabulary units are
            split again via ``check_vocab_and_split``.
        separator: subword separator, forwarded to the vocabulary check.
        version: codes-file version, ``(0, 1)`` or ``(0, 2)``.
        cache: dict used to memoise deterministic results; updated in place.
        glossaries_regex: optional compiled regex; matching words are
            returned unsegmented.
        dropout: probability of skipping each candidate merge.

    Returns:
        The sequence of subword units. Single-character input is returned
        unchanged as the original string.

    Raises:
        NotImplementedError: for an unknown ``version``.
    """

    if not dropout and orig in cache:
        return cache[orig]

    if glossaries_regex and glossaries_regex.match(orig):
        cache[orig] = (orig,)
        return (orig,)

    if len(orig) == 1:
        return orig

    if version == (0, 1):
        word = list(orig) + ['</w>']
    elif version == (0, 2): # more consistent handling of word-final segments
        word = list(orig[:-1]) + [orig[-1] + '</w>']
    else:
        raise NotImplementedError

    while len(word) > 1:

        # get list of symbol pairs; optionally apply dropout
        pairs = [(bpe_codes[pair],i,pair) for (i,pair) in enumerate(zip(word, word[1:])) if (not dropout or random.random() > dropout) and pair in bpe_codes]

        if not pairs:
            break

        #get first merge operation in list of BPE codes
        bigram = min(pairs)[2]

        # find start position of all pairs that we want to merge
        positions = [i for (rank,i,pair) in pairs if pair == bigram]

        i = 0
        new_word = []
        bigram = ''.join(bigram)
        for j in positions:
            # merges are invalid if they start before current position. This can happen if there are overlapping pairs: (x x x -> xx x)
            if j < i:
                continue
            new_word.extend(word[i:j]) # all symbols before merged pair
            new_word.append(bigram) # merged pair
            i = j+2 # continue after merged pair
        new_word.extend(word[i:]) # add all symbols until end of word
        word = new_word

    # don't print end-of-word symbols
    if word[-1] == '</w>':
        word = word[:-1]
    elif word[-1].endswith('</w>'):
        word[-1] = word[-1][:-4]

    word = tuple(word)
    if vocab:
        word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator)

    # A dropout segmentation is random; storing it would make later
    # dropout-free calls that share this cache return the random result.
    if not dropout:
        cache[orig] = word
    return word
303
+
304
def recursive_split(segment, bpe_codes, vocab, separator, final=False):
    """Recursively split segment into smaller units (by reversing BPE merges)
    until all units are either in-vocabulary, or cannot be split further.

    Args:
        segment: the subword unit to split.
        bpe_codes: mapping from a merged symbol to the pair it was built from.
        vocab: collection of in-vocabulary units; non-final units are looked
            up with ``separator`` appended.
        separator: marker that non-final units carry in ``vocab``.
        final: True if ``segment`` is the last unit of a word, in which case it
            is looked up with the ``</w>`` end-of-word marker.

    Yields:
        The resulting units, left to right.
    """

    try:
        if final:
            left, right = bpe_codes[segment + '</w>']
            right = right[:-4]  # drop the '</w>' marker again
        else:
            left, right = bpe_codes[segment]
    except KeyError:
        # no merge produced this segment: it cannot be split any further
        #sys.stderr.write('cannot split {0} further.\n'.format(segment))
        yield segment
        return

    if left + separator in vocab:
        yield left
    else:
        for item in recursive_split(left, bpe_codes, vocab, separator, False):
            yield item

    if (final and right in vocab) or (not final and right + separator in vocab):
        yield right
    else:
        for item in recursive_split(right, bpe_codes, vocab, separator, final):
            yield item
330
+
331
def check_vocab_and_split(orig, bpe_codes, vocab, separator):
    """Return the units of a segmented word, re-splitting those that are OOV.

    Non-final units are looked up in ``vocab`` with ``separator`` appended,
    the final unit is looked up as-is. Any unit that is missing is broken up
    by ``recursive_split``, which undoes BPE merges using ``bpe_codes``.
    """

    result = []

    # every unit but the last one is word-internal
    for unit in orig[:-1]:
        if unit + separator in vocab:
            result.append(unit)
            continue
        #sys.stderr.write('OOV: {0}\n'.format(segment))
        result.extend(recursive_split(unit, bpe_codes, vocab, separator, False))

    # the last unit is word-final and carries no separator
    last_unit = orig[-1]
    if last_unit in vocab:
        result.append(last_unit)
    else:
        #sys.stderr.write('OOV: {0}\n'.format(segment))
        result.extend(recursive_split(last_unit, bpe_codes, vocab, separator, True))

    return result
354
+
355
+
356
def read_vocabulary(vocab_file, threshold):
    """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold.

    Args:
        vocab_file: iterable of lines, each holding ``<word> <frequency>``
            separated by a single space.
        threshold: minimum frequency for a word to be kept; ``None`` keeps all.

    Returns:
        The set of words whose frequency is at least ``threshold``.
    """

    vocabulary = set()

    for line in vocab_file:
        entry = line.strip('\r\n ')
        if not entry:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word, freq = entry.split(' ')
        freq = int(freq)
        if threshold is None or freq >= threshold:
            vocabulary.add(word)

    return vocabulary
369
+
370
def isolate_glossary(word, glossary):
    """
    Cut a word into pieces so that each glossary occurrence stands alone.

    ``glossary`` is used as a regular expression. A word that matches it as a
    whole, or does not contain it at all, is returned as a one-element list.

    For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is:
        ['1934', 'USA', 'B', 'USA']
    """
    whole_match = re.match('^'+glossary+'$', word)
    if whole_match or not re.search(glossary, word):
        return [word]

    # the capturing group keeps the matched glossary text in the split result
    parts = re.split(r'({})'.format(glossary), word)
    tail = parts.pop()
    pieces = [part for part in parts if part]  # Remove empty strings in regex group.
    if tail != '':
        pieces.append(tail.strip('\r\n '))
    return pieces
387
+
388
if __name__ == '__main__':

    # Warn when a 'subword_nmt' directory exists next to this script.
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility
    # Force UTF-8 on the standard streams.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)

    parser = create_parser()
    args = parser.parse_args()

    # zero or negative means "use every available CPU"
    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    # read/write files as UTF-8
    # Reopen each file that argparse opened, by name, with an explicit UTF-8 codec.

    args.codes = codecs.open(args.codes.name, encoding='utf-8')
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    if args.vocabulary:
        args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')

    if args.vocabulary:
        vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
    else:
        vocabulary = None

    # Python 2 only: decode byte-string arguments and disable parallel mode.
    if sys.version_info < (3, 0):
        args.separator = args.separator.decode('UTF-8')
        if args.glossaries:
            args.glossaries = [g.decode('UTF-8') for g in args.glossaries]
        if args.num_workers > 1:
            args.num_workers = 1
            warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    if args.seed is not None:
        random.seed(args.seed)

    bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

    # Parallel mode needs a named input file; stdin is processed line by line here.
    if args.input.name == '<stdin>' or args.num_workers == 1:
        if args.num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for line in args.input:
            args.output.write(bpe.process_line(line, args.dropout))
    else:
        bpe.process_lines(args.input.name, args.output, args.dropout, args.num_workers)

    # close files
    args.codes.close()
    if args.input.name != '<stdin>':
        args.input.close()
    if args.output.name != '<stdout>':
        args.output.close()
    if args.vocabulary:
        args.vocabulary.close()
subword/bpe_toy.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Author: Rico Sennrich
4
+
5
+ """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
6
+ Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
7
+ of a text to a configurable number of symbols, with only a small increase in the number of tokens.
8
+ This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets,
9
+ indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py).
10
+
11
+ Reference:
12
+ Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
13
+ Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
14
+ """
15
+
16
+
17
+ import re
18
+ import sys
19
+ import collections
20
+
21
def get_stats(vocab):
    """Count adjacent symbol pairs over the vocabulary.

    ``vocab`` maps a word, written as whitespace-separated symbols, to its
    frequency. Each adjacent pair of symbols in a word contributes that word's
    frequency to the pair's count. Returns a ``defaultdict(int)`` keyed by
    ``(left, right)`` tuples.
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        units = entry.split()
        for left, right in zip(units, units[1:]):
            pair_counts[left, right] += count
    return pair_counts
28
+
29
def merge_vocab(pair, v_in):
    """Merge every occurrence of the symbol pair ``pair`` in the vocabulary.

    Args:
        pair: two-element sequence of symbols to be joined into one symbol.
        v_in: mapping from word (whitespace-separated symbols) to frequency.

    Returns:
        A new dict with the same frequencies, in which each standalone
        occurrence of ``pair[0] pair[1]`` is replaced by their concatenation.
    """
    merged = ''.join(pair)
    bigram_pattern = re.escape(' '.join(pair))
    # the lookarounds make sure only whole symbols are matched
    p = re.compile(r'(?<!\S)' + bigram_pattern + r'(?!\S)')
    # A callable replacement inserts `merged` literally; a plain string would
    # be treated as a template and mis-expand symbols containing backslashes.
    return {p.sub(lambda match: merged, word): freq for word, freq in v_in.items()}
37
+
38
# Toy corpus: each word is spelled as space-separated symbols and mapped to
# its frequency; '</w>' marks the end of a word.
vocab = {
    'l o w</w>': 5,
    'l o w e r</w>': 2,
    'n e w e s t</w>': 6,
    'w i d e s t</w>': 3,
}
num_merges = 15
for i in range(num_merges):
    pairs = get_stats(vocab)
    # nothing left to merge once every word is a single symbol
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    if pairs[best] < 2:
        sys.stderr.write('no pair has frequency > 1. Stopping\n')
        break
    vocab = merge_vocab(best, vocab)
    print(best)
subword/chrF.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Author: Rico Sennrich
4
+
5
+ """Compute chrF3 for machine translation evaluation
6
+
7
+ Reference:
8
+ Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translation, pages 392–395, Lisbon, Portugal.
9
+ """
10
+
11
+ from __future__ import print_function, unicode_literals, division
12
+
13
+ import sys
14
+ import codecs
15
+ import io
16
+ import argparse
17
+
18
+ from collections import defaultdict
19
+
20
+ # hack for python2/3 compatibility
21
+ from io import open
22
+ argparse.open = open
23
+
24
def create_parser():
    """Build the command-line parser for the chrF scorer.

    Returns:
        An ``argparse.ArgumentParser`` with the reference/hypothesis files,
        the beta weight, the maximum n-gram order, whitespace handling and the
        optional precision/recall reports.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="compute chrF (character n-gram F-score) for machine translation evaluation")

    parser.add_argument(
        '--ref', '-r', type=argparse.FileType('r'), required=True,
        metavar='PATH',
        help="Reference file")
    parser.add_argument(
        '--hyp', type=argparse.FileType('r'), metavar='PATH',
        default=sys.stdin,
        help="Hypothesis file (default: stdin).")
    parser.add_argument(
        '--beta', '-b', type=float, default=3,
        metavar='FLOAT',
        help="beta parameter (default: '%(default)s')")
    parser.add_argument(
        '--ngram', '-n', type=int, default=6,
        metavar='INT',
        help="ngram order (default: '%(default)s')")
    parser.add_argument(
        '--space', '-s', action='store_true',
        help="take spaces into account (default: '%(default)s')")
    parser.add_argument(
        '--precision', action='store_true',
        help="report precision (default: '%(default)s')")
    parser.add_argument(
        '--recall', action='store_true',
        help="report recall (default: '%(default)s')")

    return parser
56
+
57
def extract_ngrams(words, max_length=4, spaces=False):
    """Count the character n-grams of a line for orders 1..max_length.

    With ``spaces`` false all whitespace is removed first; otherwise the line
    is only stripped at both ends. The result is a nested ``defaultdict``:
    ``result[k]`` maps each n-gram of ``k + 1`` characters (as a tuple of
    characters) to its number of occurrences.
    """

    text = words.strip() if spaces else ''.join(words.split())
    size = len(text)

    counts = defaultdict(lambda: defaultdict(int))
    for order in range(max_length):
        width = order + 1
        # only start positions that leave room for a full n-gram
        for begin in range(size - width + 1):
            counts[order][tuple(text[begin:begin + width])] += 1
    return counts
71
+
72
+
73
def get_correct(ngrams_ref, ngrams_test, correct, total):
    """Accumulate hypothesis n-gram totals and clipped matches, per order.

    For every n-gram of the hypothesis (``ngrams_test``) its count is added to
    ``total[order]``; if the n-gram also occurs in the reference, the smaller
    of the two counts is added to ``correct[order]``. Both lists are updated
    in place and also returned.
    """

    for order, hyp_counts in ngrams_test.items():
        for gram, hyp_count in hyp_counts.items():
            total[order] += hyp_count
            ref_counts = ngrams_ref[order]
            if gram in ref_counts:
                correct[order] += min(hyp_count, ref_counts[gram])

    return correct, total
82
+
83
+
84
def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0):
    """Compute the F-beta score from per-order n-gram statistics.

    Args:
        correct: matched n-gram counts, indexed by order.
        total_hyp: hypothesis n-gram counts, indexed by order.
        total_ref: reference n-gram counts, indexed by order.
        max_length: number of orders to average over.
        beta: weight of recall relative to precision.
        smooth: constant added to every count.

    Returns:
        A tuple ``(score, precision, recall)`` where precision and recall are
        averaged over the ``max_length`` orders. The score is 0.0 when both
        precision and recall are zero.
    """

    precision = 0
    recall = 0

    for i in range(max_length):
        # orders with no hypothesis or no reference n-grams contribute zero
        if total_hyp[i] + smooth and total_ref[i] + smooth:
            precision += (correct[i] + smooth) / (total_hyp[i] + smooth)
            recall += (correct[i] + smooth) / (total_ref[i] + smooth)

    precision /= max_length
    recall /= max_length

    denominator = (beta**2 * precision) + recall
    if not denominator:
        # no overlap at all: define the score as zero instead of dividing by zero
        return 0.0, precision, recall

    return (1 + beta**2) * (precision*recall) / denominator, precision, recall
98
+
99
def main(args):
    """Score a hypothesis file against a reference file and print the result.

    Reads ``args.ref`` line by line, pairing each line with the next line
    read from ``args.hyp``, accumulates n-gram statistics up to order
    ``args.ngram`` over all pairs and prints the resulting score; precision and
    recall are printed as well when ``args.precision`` / ``args.recall`` are set.
    """

    matched = [0]*args.ngram
    hyp_total = [0]*args.ngram
    ref_total = [0]*args.ngram

    for ref_line in args.ref:
        hyp_line = args.hyp.readline()

        ngrams_ref = extract_ngrams(ref_line, max_length=args.ngram, spaces=args.space)
        ngrams_test = extract_ngrams(hyp_line, max_length=args.ngram, spaces=args.space)

        get_correct(ngrams_ref, ngrams_test, matched, hyp_total)

        # reference totals are gathered here; get_correct only covers the hypothesis
        for order, ref_counts in ngrams_ref.items():
            ref_total[order] += sum(ref_counts.values())

    chrf, precision, recall = f1(matched, hyp_total, ref_total, args.ngram, args.beta)

    print('chrF3: {0:.4f}'.format(chrf))
    if args.precision:
        print('chrPrec: {0:.4f}'.format(precision))
    if args.recall:
        print('chrRec: {0:.4f}'.format(recall))
123
+
124
if __name__ == '__main__':

    # python 2/3 compatibility
    # Force UTF-8 on the standard streams.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)

    # The parser is created after the streams are rewrapped, so the --hyp
    # default (sys.stdin) refers to the UTF-8 wrapper.
    parser = create_parser()
    args = parser.parse_args()

    main(args)
subword/dataset/codec.txt ADDED
The diff for this file is too large to render. See raw diff
 
subword/encoding.ipynb ADDED
@@ -0,0 +1,700 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "id": "9644db35",
7
+ "metadata": {
8
+ "scrolled": true
9
+ },
10
+ "outputs": [
11
+ {
12
+ "name": "stderr",
13
+ "output_type": "stream",
14
+ "text": [
15
+ "\n",
16
+ " 0%| | 0/20000 [00:00<?, ?it/s]\n",
17
+ " 0%| | 1/20000 [00:00<38:40, 8.62it/s]\n",
18
+ " 0%| | 2/20000 [00:00<1:31:59, 3.62it/s]\n",
19
+ " 0%| | 3/20000 [00:00<1:21:11, 4.11it/s]\n",
20
+ " 0%| | 4/20000 [00:01<1:48:20, 3.08it/s]\n",
21
+ " 0%| | 6/20000 [00:01<1:03:27, 5.25it/s]\n",
22
+ " 0%| | 7/20000 [00:01<1:12:17, 4.61it/s]\n",
23
+ " 0%| | 8/20000 [00:01<1:10:13, 4.74it/s]\n",
24
+ " 0%| | 10/20000 [00:02<1:09:39, 4.78it/s]\n",
25
+ " 0%| | 13/20000 [00:02<42:59, 7.75it/s] \n",
26
+ " 0%| | 16/20000 [00:02<30:25, 10.95it/s]\n",
27
+ " 0%| | 19/20000 [00:02<24:50, 13.41it/s]\n",
28
+ " 0%| | 21/20000 [00:03<36:58, 9.01it/s]\n",
29
+ " 0%| | 23/20000 [00:03<35:20, 9.42it/s]\n",
30
+ " 0%| | 25/20000 [00:03<31:30, 10.56it/s]\n",
31
+ " 0%| | 27/20000 [00:03<33:12, 10.03it/s]\n",
32
+ " 0%| | 29/20000 [00:03<33:30, 9.93it/s]\n",
33
+ " 0%| | 31/20000 [00:03<35:06, 9.48it/s]\n",
34
+ " 0%| | 33/20000 [00:04<37:03, 8.98it/s]\n",
35
+ " 0%| | 37/20000 [00:04<26:59, 12.32it/s]\n",
36
+ " 0%| | 39/20000 [00:04<26:54, 12.37it/s]\n",
37
+ " 0%| | 42/20000 [00:04<22:51, 14.55it/s]\n",
38
+ " 0%| | 46/20000 [00:04<19:15, 17.27it/s]\n",
39
+ " 0%| | 48/20000 [00:05<22:34, 14.73it/s]\n",
40
+ " 0%| | 50/20000 [00:05<23:39, 14.06it/s]\n",
41
+ " 0%| | 52/20000 [00:05<23:30, 14.14it/s]\n",
42
+ " 0%| | 55/20000 [00:05<20:12, 16.45it/s]\n",
43
+ " 0%| | 58/20000 [00:05<19:09, 17.35it/s]\n",
44
+ " 0%| | 60/20000 [00:05<19:25, 17.11it/s]\n",
45
+ " 0%| | 63/20000 [00:06<28:45, 11.56it/s]\n",
46
+ " 0%| | 70/20000 [00:06<16:14, 20.45it/s]\n",
47
+ " 0%| | 75/20000 [00:06<14:15, 23.28it/s]\n",
48
+ " 0%| | 78/20000 [00:06<14:28, 22.94it/s]\n",
49
+ " 0%| | 81/20000 [00:06<16:06, 20.62it/s]\n",
50
+ " 0%| | 85/20000 [00:06<15:13, 21.81it/s]\n",
51
+ " 0%| | 88/20000 [00:07<15:39, 21.20it/s]\n",
52
+ " 0%| | 92/20000 [00:07<13:28, 24.61it/s]\n",
53
+ " 0%| | 95/20000 [00:07<13:32, 24.50it/s]\n",
54
+ " 0%| | 99/20000 [00:07<12:06, 27.40it/s]\n",
55
+ " 1%| | 102/20000 [00:07<12:55, 25.65it/s]\n",
56
+ " 1%| | 105/20000 [00:07<12:46, 25.95it/s]\n",
57
+ " 1%| | 112/20000 [00:07<12:02, 27.54it/s]\n",
58
+ " 1%| | 118/20000 [00:08<10:02, 33.00it/s]\n",
59
+ " 1%| | 122/20000 [00:08<10:12, 32.46it/s]\n",
60
+ " 1%| | 127/20000 [00:08<10:07, 32.73it/s]\n",
61
+ " 1%| | 138/20000 [00:08<06:56, 47.66it/s]\n",
62
+ " 1%| | 144/20000 [00:08<06:47, 48.73it/s]\n",
63
+ " 1%| | 150/20000 [00:08<07:00, 47.21it/s]\n",
64
+ " 1%| | 156/20000 [00:08<06:44, 49.01it/s]\n",
65
+ " 1%| | 162/20000 [00:08<06:47, 48.71it/s]\n",
66
+ " 1%| | 169/20000 [00:09<06:38, 49.81it/s]\n",
67
+ " 1%| | 176/20000 [00:09<06:28, 51.03it/s]\n",
68
+ " 1%| | 184/20000 [00:09<05:58, 55.22it/s]\n",
69
+ " 1%| | 190/20000 [00:09<08:05, 40.78it/s]\n",
70
+ " 1%| | 197/20000 [00:09<07:30, 43.91it/s]\n",
71
+ " 1%|1 | 202/20000 [00:09<07:53, 41.79it/s]\n",
72
+ " 1%|1 | 210/20000 [00:10<06:51, 48.15it/s]\n",
73
+ " 1%|1 | 220/20000 [00:10<05:35, 59.01it/s]\n",
74
+ " 1%|1 | 229/20000 [00:10<05:05, 64.72it/s]\n",
75
+ " 1%|1 | 236/20000 [00:10<05:37, 58.47it/s]\n",
76
+ " 1%|1 | 244/20000 [00:10<05:22, 61.18it/s]\n",
77
+ " 1%|1 | 251/20000 [00:10<05:31, 59.65it/s]\n",
78
+ " 1%|1 | 259/20000 [00:10<05:11, 63.33it/s]\n",
79
+ " 1%|1 | 266/20000 [00:10<05:25, 60.70it/s]\n",
80
+ " 1%|1 | 273/20000 [00:10<05:31, 59.42it/s]\n",
81
+ " 1%|1 | 282/20000 [00:11<04:57, 66.26it/s]\n",
82
+ " 1%|1 | 289/20000 [00:11<05:00, 65.52it/s]\n",
83
+ " 1%|1 | 296/20000 [00:11<05:10, 63.47it/s]\n",
84
+ " 2%|1 | 303/20000 [00:11<07:06, 46.23it/s]\n",
85
+ " 2%|1 | 313/20000 [00:11<05:41, 57.60it/s]\n",
86
+ " 2%|1 | 324/20000 [00:11<04:54, 66.87it/s]\n",
87
+ " 2%|1 | 335/20000 [00:11<04:16, 76.56it/s]\n",
88
+ " 2%|1 | 346/20000 [00:11<03:50, 85.09it/s]\n",
89
+ " 2%|1 | 357/20000 [00:12<03:38, 90.06it/s]\n",
90
+ " 2%|1 | 367/20000 [00:12<03:46, 86.85it/s]\n",
91
+ " 2%|1 | 377/20000 [00:12<03:41, 88.70it/s]\n",
92
+ " 2%|1 | 387/20000 [00:12<03:43, 87.95it/s]\n",
93
+ " 2%|1 | 396/20000 [00:12<03:49, 85.48it/s]\n",
94
+ " 2%|2 | 405/20000 [00:12<04:04, 80.15it/s]\n",
95
+ " 2%|2 | 416/20000 [00:12<03:42, 87.82it/s]\n",
96
+ " 2%|2 | 429/20000 [00:12<03:19, 98.03it/s]\n",
97
+ " 2%|2 | 439/20000 [00:13<03:35, 90.74it/s]\n",
98
+ " 2%|2 | 450/20000 [00:13<03:24, 95.65it/s]\n",
99
+ " 2%|2 | 462/20000 [00:13<03:13, 100.98it/s]\n",
100
+ " 2%|2 | 473/20000 [00:13<03:16, 99.60it/s] \n",
101
+ " 2%|2 | 484/20000 [00:13<03:28, 93.64it/s]\n",
102
+ " 2%|2 | 494/20000 [00:13<03:30, 92.86it/s]\n",
103
+ " 3%|2 | 504/20000 [00:13<04:34, 70.99it/s]\n",
104
+ " 3%|2 | 520/20000 [00:13<03:34, 90.96it/s]\n",
105
+ " 3%|2 | 534/20000 [00:14<03:12, 101.38it/s]\n",
106
+ " 3%|2 | 547/20000 [00:14<03:01, 107.03it/s]\n",
107
+ " 3%|2 | 559/20000 [00:14<03:02, 106.25it/s]\n",
108
+ " 3%|2 | 571/20000 [00:14<03:14, 99.81it/s] \n",
109
+ " 3%|2 | 582/20000 [00:14<03:13, 100.11it/s]\n",
110
+ " 3%|2 | 595/20000 [00:14<03:03, 105.49it/s]\n",
111
+ " 3%|3 | 606/20000 [00:14<03:07, 103.63it/s]\n",
112
+ " 3%|3 | 625/20000 [00:14<02:33, 126.08it/s]\n",
113
+ " 3%|3 | 643/20000 [00:14<02:17, 140.29it/s]\n",
114
+ " 3%|3 | 658/20000 [00:15<02:23, 135.01it/s]\n",
115
+ " 3%|3 | 672/20000 [00:15<02:32, 126.59it/s]\n",
116
+ " 3%|3 | 685/20000 [00:15<02:42, 119.19it/s]\n",
117
+ " 3%|3 | 698/20000 [00:15<02:46, 116.22it/s]\n",
118
+ " 4%|3 | 710/20000 [00:15<02:49, 113.91it/s]\n",
119
+ " 4%|3 | 727/20000 [00:15<02:31, 127.58it/s]\n",
120
+ " 4%|3 | 744/20000 [00:15<02:18, 139.24it/s]\n",
121
+ " 4%|3 | 759/20000 [00:15<03:10, 101.19it/s]\n",
122
+ " 4%|3 | 771/20000 [00:16<03:03, 104.67it/s]\n",
123
+ " 4%|3 | 783/20000 [00:16<03:02, 105.07it/s]\n",
124
+ " 4%|3 | 795/20000 [00:16<03:14, 98.92it/s] \n",
125
+ " 4%|4 | 807/20000 [00:16<03:06, 102.82it/s]\n",
126
+ " 4%|4 | 822/20000 [00:16<02:50, 112.34it/s]\n",
127
+ " 4%|4 | 834/20000 [00:16<02:55, 109.45it/s]\n",
128
+ " 4%|4 | 847/20000 [00:16<02:47, 114.28it/s]\n",
129
+ " 4%|4 | 860/20000 [00:16<02:42, 117.94it/s]\n",
130
+ " 4%|4 | 873/20000 [00:16<02:46, 114.58it/s]\n",
131
+ " 4%|4 | 885/20000 [00:17<02:58, 106.97it/s]\n",
132
+ " 4%|4 | 896/20000 [00:17<03:07, 102.08it/s]\n",
133
+ " 5%|4 | 908/20000 [00:17<03:01, 105.42it/s]\n",
134
+ " 5%|4 | 924/20000 [00:17<02:42, 117.67it/s]\n",
135
+ " 5%|4 | 940/20000 [00:17<02:28, 128.26it/s]\n",
136
+ " 5%|4 | 954/20000 [00:17<02:24, 131.54it/s]\n",
137
+ " 5%|4 | 968/20000 [00:17<02:34, 123.37it/s]\n",
138
+ " 5%|4 | 982/20000 [00:17<02:31, 125.85it/s]\n",
139
+ " 5%|4 | 995/20000 [00:18<02:39, 119.06it/s]\n",
140
+ " 5%|5 | 1008/20000 [00:18<03:38, 86.92it/s]\n",
141
+ " 5%|5 | 1024/20000 [00:18<03:04, 102.72it/s]\n",
142
+ " 5%|5 | 1041/20000 [00:18<02:40, 118.03it/s]\n",
143
+ " 5%|5 | 1055/20000 [00:18<02:34, 122.63it/s]\n",
144
+ " 5%|5 | 1069/20000 [00:18<02:31, 124.89it/s]\n",
145
+ " 5%|5 | 1083/20000 [00:18<02:34, 122.68it/s]\n",
146
+ " 5%|5 | 1096/20000 [00:18<02:39, 118.60it/s]\n",
147
+ " 6%|5 | 1110/20000 [00:19<02:32, 123.65it/s]\n",
148
+ " 6%|5 | 1127/20000 [00:19<02:18, 136.02it/s]\n",
149
+ " 6%|5 | 1145/20000 [00:19<02:08, 146.74it/s]\n",
150
+ " 6%|5 | 1161/20000 [00:19<02:06, 148.82it/s]\n",
151
+ " 6%|5 | 1177/20000 [00:19<02:04, 151.15it/s]\n",
152
+ " 6%|5 | 1193/20000 [00:19<02:09, 145.70it/s]\n",
153
+ " 6%|6 | 1208/20000 [00:19<02:10, 144.47it/s]\n",
154
+ " 6%|6 | 1227/20000 [00:19<02:00, 156.04it/s]\n",
155
+ " 6%|6 | 1244/20000 [00:19<01:57, 159.13it/s]\n",
156
+ " 6%|6 | 1261/20000 [00:19<02:01, 154.24it/s]\n",
157
+ " 6%|6 | 1277/20000 [00:20<02:09, 145.11it/s]\n",
158
+ " 6%|6 | 1292/20000 [00:20<02:10, 143.32it/s]\n",
159
+ " 7%|6 | 1307/20000 [00:20<03:02, 102.65it/s]\n",
160
+ " 7%|6 | 1330/20000 [00:20<02:23, 130.14it/s]\n",
161
+ " 7%|6 | 1348/20000 [00:20<02:13, 139.97it/s]\n",
162
+ " 7%|6 | 1368/20000 [00:20<02:02, 152.70it/s]\n",
163
+ " 7%|6 | 1385/20000 [00:20<02:00, 153.99it/s]\n",
164
+ " 7%|7 | 1402/20000 [00:21<02:07, 146.16it/s]\n",
165
+ " 7%|7 | 1423/20000 [00:21<01:55, 161.53it/s]\n",
166
+ " 7%|7 | 1441/20000 [00:21<01:52, 165.17it/s]\n",
167
+ " 7%|7 | 1459/20000 [00:21<01:55, 160.82it/s]\n",
168
+ " 7%|7 | 1476/20000 [00:21<02:03, 149.82it/s]\n",
169
+ " 7%|7 | 1492/20000 [00:21<02:08, 143.79it/s]\n",
170
+ " 8%|7 | 1507/20000 [00:21<02:10, 142.06it/s]\n",
171
+ " 8%|7 | 1530/20000 [00:21<01:52, 164.72it/s]\n",
172
+ " 8%|7 | 1548/20000 [00:21<01:50, 167.09it/s]\n",
173
+ " 8%|7 | 1565/20000 [00:22<01:49, 167.90it/s]\n",
174
+ " 8%|7 | 1582/20000 [00:22<01:53, 161.57it/s]\n",
175
+ " 8%|7 | 1599/20000 [00:22<01:56, 158.15it/s]\n",
176
+ " 8%|8 | 1617/20000 [00:22<01:51, 164.25it/s]\n",
177
+ " 8%|8 | 1637/20000 [00:22<01:45, 174.45it/s]\n",
178
+ " 8%|8 | 1657/20000 [00:22<01:41, 181.32it/s]\n",
179
+ " 8%|8 | 1676/20000 [00:22<01:40, 182.25it/s]\n",
180
+ " 8%|8 | 1695/20000 [00:22<01:46, 171.94it/s]\n",
181
+ " 9%|8 | 1718/20000 [00:22<01:38, 186.12it/s]\n",
182
+ " 9%|8 | 1739/20000 [00:22<01:34, 192.48it/s]\n",
183
+ " 9%|8 | 1759/20000 [00:23<02:13, 136.76it/s]\n",
184
+ " 9%|8 | 1777/20000 [00:23<02:04, 145.80it/s]\n",
185
+ " 9%|8 | 1794/20000 [00:23<02:04, 146.68it/s]\n",
186
+ " 9%|9 | 1814/20000 [00:23<01:53, 159.63it/s]\n",
187
+ " 9%|9 | 1836/20000 [00:23<01:43, 175.04it/s]\n",
188
+ " 9%|9 | 1856/20000 [00:23<01:41, 179.30it/s]\n",
189
+ " 9%|9 | 1875/20000 [00:23<01:42, 176.01it/s]\n",
190
+ " 9%|9 | 1894/20000 [00:23<01:45, 171.34it/s]\n",
191
+ " 10%|9 | 1915/20000 [00:24<01:39, 180.93it/s]\n",
192
+ " 10%|9 | 1937/20000 [00:24<01:34, 190.79it/s]\n",
193
+ " 10%|9 | 1957/20000 [00:24<01:35, 189.63it/s]\n",
194
+ " 10%|9 | 1977/20000 [00:24<01:36, 186.73it/s]\n",
195
+ " 10%|9 | 1996/20000 [00:24<01:42, 175.72it/s]\n",
196
+ " 10%|# | 2018/20000 [00:24<01:35, 187.87it/s]\n",
197
+ " 10%|# | 2046/20000 [00:24<01:24, 212.03it/s]\n",
198
+ " 10%|# | 2068/20000 [00:24<01:27, 204.39it/s]\n",
199
+ " 10%|# | 2089/20000 [00:24<01:31, 195.56it/s]\n",
200
+ " 11%|# | 2109/20000 [00:25<01:33, 192.02it/s]\n",
201
+ " 11%|# | 2140/20000 [00:25<01:19, 224.10it/s]\n",
202
+ " 11%|# | 2165/20000 [00:25<01:17, 230.78it/s]\n",
203
+ " 11%|# | 2189/20000 [00:25<01:18, 225.64it/s]\n",
204
+ " 11%|#1 | 2212/20000 [00:25<01:24, 210.15it/s]\n",
205
+ " 11%|#1 | 2236/20000 [00:25<01:21, 217.71it/s]\n",
206
+ " 11%|#1 | 2259/20000 [00:25<01:22, 215.12it/s]\n",
207
+ " 11%|#1 | 2281/20000 [00:25<01:24, 208.87it/s]\n",
208
+ " 12%|#1 | 2303/20000 [00:25<01:35, 185.14it/s]\n",
209
+ " 12%|#1 | 2333/20000 [00:26<01:22, 213.67it/s]\n",
210
+ " 12%|#1 | 2357/20000 [00:26<01:19, 220.73it/s]\n",
211
+ " 12%|#1 | 2380/20000 [00:26<01:21, 214.95it/s]\n",
212
+ " 12%|#2 | 2402/20000 [00:26<02:03, 142.71it/s]\n",
213
+ " 12%|#2 | 2432/20000 [00:26<01:40, 174.08it/s]\n",
214
+ " 12%|#2 | 2459/20000 [00:26<01:29, 195.81it/s]\n",
215
+ " 12%|#2 | 2482/20000 [00:26<01:28, 198.82it/s]\n",
216
+ " 13%|#2 | 2505/20000 [00:27<01:29, 195.33it/s]\n",
217
+ " 13%|#2 | 2538/20000 [00:27<01:16, 228.52it/s]\n",
218
+ " 13%|#2 | 2566/20000 [00:27<01:11, 242.22it/s]\n",
219
+ " 13%|#2 | 2592/20000 [00:27<01:15, 230.01it/s]\n",
220
+ " 13%|#3 | 2620/20000 [00:27<01:11, 243.40it/s]\n",
221
+ " 13%|#3 | 2651/20000 [00:27<01:06, 261.84it/s]\n",
222
+ " 13%|#3 | 2678/20000 [00:27<01:06, 260.46it/s]\n",
223
+ " 14%|#3 | 2705/20000 [00:27<01:08, 252.37it/s]\n",
224
+ " 14%|#3 | 2740/20000 [00:27<01:02, 278.24it/s]\n",
225
+ " 14%|#3 | 2769/20000 [00:27<01:05, 264.95it/s]\n",
226
+ " 14%|#3 | 2796/20000 [00:28<01:09, 247.16it/s]\n",
227
+ " 14%|#4 | 2828/20000 [00:28<01:04, 264.60it/s]\n",
228
+ " 14%|#4 | 2855/20000 [00:28<01:05, 260.34it/s]\n",
229
+ " 14%|#4 | 2882/20000 [00:28<01:09, 247.20it/s]\n",
230
+ " 15%|#4 | 2908/20000 [00:28<01:12, 236.53it/s]\n",
231
+ " 15%|#4 | 2952/20000 [00:28<00:58, 291.10it/s]\n",
232
+ " 15%|#4 | 2982/20000 [00:28<01:03, 266.27it/s]\n",
233
+ " 15%|#5 | 3010/20000 [00:28<01:03, 267.07it/s]\n",
234
+ " 15%|#5 | 3039/20000 [00:29<01:02, 270.37it/s]\n",
235
+ " 15%|#5 | 3068/20000 [00:29<01:01, 273.53it/s]\n",
236
+ " 15%|#5 | 3096/20000 [00:29<01:04, 263.45it/s]\n",
237
+ " 16%|#5 | 3129/20000 [00:29<00:59, 281.96it/s]\n",
238
+ " 16%|#5 | 3160/20000 [00:29<00:58, 287.48it/s]\n",
239
+ " 16%|#5 | 3190/20000 [00:29<01:00, 279.05it/s]\n",
240
+ " 16%|#6 | 3226/20000 [00:29<00:55, 301.05it/s]\n",
241
+ " 16%|#6 | 3257/20000 [00:29<00:55, 303.61it/s]\n",
242
+ " 16%|#6 | 3288/20000 [00:29<00:56, 293.52it/s]\n",
243
+ " 17%|#6 | 3318/20000 [00:29<00:56, 293.68it/s]\n",
244
+ " 17%|#6 | 3357/20000 [00:30<00:52, 318.68it/s]\n",
245
+ " 17%|#6 | 3390/20000 [00:30<00:58, 284.80it/s]\n",
246
+ " 17%|#7 | 3420/20000 [00:30<01:21, 204.06it/s]\n",
247
+ " 17%|#7 | 3459/20000 [00:30<01:08, 242.62it/s]\n",
248
+ " 17%|#7 | 3491/20000 [00:30<01:03, 260.00it/s]\n",
249
+ " 18%|#7 | 3535/20000 [00:30<00:54, 304.04it/s]\n",
250
+ " 18%|#7 | 3573/20000 [00:30<00:50, 323.92it/s]\n",
251
+ " 18%|#8 | 3608/20000 [00:31<00:55, 296.34it/s]\n",
252
+ " 18%|#8 | 3653/20000 [00:31<00:48, 336.01it/s]\n",
253
+ " 18%|#8 | 3689/20000 [00:31<00:49, 329.16it/s]\n",
254
+ " 19%|#8 | 3733/20000 [00:31<00:45, 358.11it/s]\n",
255
+ " 19%|#8 | 3771/20000 [00:31<00:44, 361.17it/s]\n",
256
+ " 19%|#9 | 3809/20000 [00:31<00:47, 342.31it/s]\n",
257
+ " 19%|#9 | 3861/20000 [00:31<00:41, 390.94it/s]\n",
258
+ " 20%|#9 | 3902/20000 [00:31<00:42, 378.22it/s]\n",
259
+ " 20%|#9 | 3968/20000 [00:31<00:35, 455.02it/s]\n",
260
+ " 20%|## | 4015/20000 [00:32<00:37, 427.77it/s]\n",
261
+ " 20%|## | 4066/20000 [00:32<00:35, 449.03it/s]\n",
262
+ " 21%|## | 4112/20000 [00:32<00:39, 404.45it/s]\n",
263
+ " 21%|## | 4174/20000 [00:32<00:34, 458.89it/s]\n",
264
+ " 21%|##1 | 4222/20000 [00:32<00:35, 442.90it/s]\n",
265
+ " 21%|##1 | 4271/20000 [00:32<00:34, 454.41it/s]\n",
266
+ " 22%|##1 | 4329/20000 [00:32<00:32, 489.36it/s]\n",
267
+ " 22%|##1 | 4387/20000 [00:32<00:30, 515.14it/s]\n",
268
+ " 22%|##2 | 4447/20000 [00:32<00:28, 538.10it/s]\n",
269
+ " 23%|##2 | 4502/20000 [00:33<00:32, 478.73it/s]\n",
270
+ " 23%|##2 | 4563/20000 [00:33<00:30, 512.67it/s]\n",
271
+ " 23%|##3 | 4616/20000 [00:33<00:30, 496.81it/s]\n",
272
+ " 23%|##3 | 4677/20000 [00:33<00:29, 527.98it/s]\n",
273
+ " 24%|##3 | 4733/20000 [00:33<00:28, 537.01it/s]\n",
274
+ " 24%|##3 | 4788/20000 [00:33<00:28, 534.59it/s]\n",
275
+ " 24%|##4 | 4864/20000 [00:33<00:25, 599.65it/s]\n",
276
+ " 25%|##4 | 4925/20000 [00:33<00:25, 595.70it/s]\n",
277
+ " 25%|##4 | 4994/20000 [00:33<00:24, 617.81it/s]\n",
278
+ " 25%|##5 | 5079/20000 [00:33<00:21, 683.71it/s]\n",
279
+ " 26%|##5 | 5148/20000 [00:34<00:35, 419.97it/s]\n",
280
+ " 26%|##6 | 5203/20000 [00:34<00:33, 446.58it/s]\n",
281
+ " 26%|##6 | 5289/20000 [00:34<00:27, 538.90it/s]\n",
282
+ " 27%|##6 | 5377/20000 [00:34<00:23, 622.07it/s]\n",
283
+ " 27%|##7 | 5471/20000 [00:34<00:20, 703.42it/s]\n",
284
+ " 28%|##7 | 5549/20000 [00:36<01:35, 150.73it/s]\n",
285
+ " 28%|##8 | 5606/20000 [00:36<01:37, 147.12it/s]\n",
286
+ " 28%|##8 | 5650/20000 [00:36<01:34, 151.57it/s]\n",
287
+ " 28%|##8 | 5686/20000 [00:37<01:33, 153.50it/s]\n",
288
+ " 29%|##8 | 5716/20000 [00:37<01:32, 154.45it/s]\n",
289
+ " 29%|##8 | 5742/20000 [00:37<01:29, 158.75it/s]\n",
290
+ " 29%|##8 | 5766/20000 [00:37<01:28, 160.05it/s]\n",
291
+ " 29%|##8 | 5788/20000 [00:37<01:29, 159.44it/s]\n",
292
+ " 29%|##9 | 5808/20000 [00:37<01:29, 158.22it/s]\n",
293
+ " 29%|##9 | 5827/20000 [00:37<01:27, 162.78it/s]\n",
294
+ " 29%|##9 | 5846/20000 [00:38<01:25, 165.07it/s]\n",
295
+ " 29%|##9 | 5864/20000 [00:38<01:25, 164.71it/s]\n",
296
+ " 29%|##9 | 5882/20000 [00:38<01:26, 162.88it/s]\n",
297
+ " 29%|##9 | 5899/20000 [00:38<01:30, 155.66it/s]\n",
298
+ " 30%|##9 | 5916/20000 [00:38<01:29, 158.09it/s]\n",
299
+ " 30%|##9 | 5935/20000 [00:38<01:24, 166.09it/s]\n",
300
+ " 30%|##9 | 5954/20000 [00:38<01:22, 169.84it/s]\n",
301
+ " 30%|##9 | 5972/20000 [00:38<01:21, 171.23it/s]\n",
302
+ " 30%|##9 | 5990/20000 [00:38<01:22, 170.81it/s]\n",
303
+ " 30%|### | 6008/20000 [00:39<01:23, 167.70it/s]\n",
304
+ " 30%|### | 6027/20000 [00:39<01:20, 173.01it/s]\n",
305
+ " 30%|### | 6046/20000 [00:39<01:19, 175.35it/s]\n",
306
+ " 30%|### | 6064/20000 [00:39<01:20, 172.23it/s]\n",
307
+ " 30%|### | 6082/20000 [00:39<01:21, 170.55it/s]\n",
308
+ " 30%|### | 6100/20000 [00:39<01:23, 167.05it/s]\n",
309
+ " 31%|### | 6118/20000 [00:39<01:21, 170.70it/s]\n",
310
+ " 31%|### | 6138/20000 [00:39<01:17, 178.09it/s]\n",
311
+ " 31%|### | 6157/20000 [00:39<01:16, 179.96it/s]\n",
312
+ " 31%|### | 6176/20000 [00:39<01:18, 177.21it/s]\n",
313
+ " 31%|### | 6194/20000 [00:40<01:18, 174.99it/s]\n",
314
+ " 31%|###1 | 6212/20000 [00:40<01:19, 173.44it/s]\n",
315
+ " 31%|###1 | 6232/20000 [00:40<01:16, 180.04it/s]\n",
316
+ " 31%|###1 | 6251/20000 [00:40<01:16, 179.80it/s]\n",
317
+ " 31%|###1 | 6270/20000 [00:40<01:19, 172.28it/s]\n",
318
+ " 31%|###1 | 6288/20000 [00:40<01:20, 170.14it/s]\n",
319
+ " 32%|###1 | 6306/20000 [00:40<01:22, 165.92it/s]\n",
320
+ " 32%|###1 | 6327/20000 [00:40<01:16, 178.17it/s]\n",
321
+ " 32%|###1 | 6347/20000 [00:40<01:14, 183.85it/s]\n",
322
+ " 32%|###1 | 6366/20000 [00:41<01:14, 182.46it/s]\n",
323
+ " 32%|###1 | 6385/20000 [00:41<01:17, 175.53it/s]\n",
324
+ " 32%|###2 | 6403/20000 [00:41<01:21, 166.92it/s]\n",
325
+ " 32%|###2 | 6423/20000 [00:41<01:17, 174.57it/s]\n",
326
+ " 32%|###2 | 6443/20000 [00:41<01:15, 179.69it/s]\n",
327
+ " 32%|###2 | 6462/20000 [00:41<01:16, 178.06it/s]\n",
328
+ " 32%|###2 | 6480/20000 [00:41<01:17, 174.17it/s]\n",
329
+ " 32%|###2 | 6498/20000 [00:41<01:19, 170.04it/s]\n",
330
+ " 33%|###2 | 6517/20000 [00:41<01:16, 175.15it/s]\n",
331
+ " 33%|###2 | 6538/20000 [00:42<01:13, 184.06it/s]\n",
332
+ " 33%|###2 | 6558/20000 [00:42<01:11, 187.58it/s]\n",
333
+ " 33%|###2 | 6577/20000 [00:42<01:12, 183.99it/s]\n",
334
+ " 33%|###2 | 6596/20000 [00:42<01:14, 180.51it/s]\n",
335
+ " 33%|###3 | 6615/20000 [00:42<01:14, 180.64it/s]\n",
336
+ " 33%|###3 | 6636/20000 [00:42<01:11, 187.45it/s]\n",
337
+ " 33%|###3 | 6656/20000 [00:42<01:10, 189.43it/s]\n",
338
+ " 33%|###3 | 6675/20000 [00:42<01:11, 185.29it/s]\n",
339
+ " 33%|###3 | 6694/20000 [00:42<01:14, 177.91it/s]\n",
340
+ " 34%|###3 | 6712/20000 [00:42<01:15, 176.02it/s]\n",
341
+ " 34%|###3 | 6733/20000 [00:43<01:11, 185.68it/s]\n",
342
+ " 34%|###3 | 6752/20000 [00:43<01:10, 186.91it/s]\n",
343
+ " 34%|###3 | 6771/20000 [00:43<01:12, 183.53it/s]\n",
344
+ " 34%|###3 | 6790/20000 [00:43<01:15, 175.73it/s]\n",
345
+ " 34%|###4 | 6808/20000 [00:43<01:17, 170.68it/s]\n",
346
+ " 34%|###4 | 6828/20000 [00:43<01:13, 178.87it/s]\n",
347
+ " 34%|###4 | 6849/20000 [00:43<01:10, 186.16it/s]\n",
348
+ " 34%|###4 | 6868/20000 [00:43<01:10, 187.26it/s]\n",
349
+ " 34%|###4 | 6887/20000 [00:43<01:10, 185.36it/s]\n",
350
+ " 35%|###4 | 6906/20000 [00:44<01:12, 180.40it/s]\n",
351
+ " 35%|###4 | 6929/20000 [00:44<01:07, 193.50it/s]\n",
352
+ " 35%|###4 | 6950/20000 [00:44<01:05, 198.27it/s]\n",
353
+ " 35%|###4 | 6970/20000 [00:44<01:06, 197.04it/s]\n",
354
+ " 35%|###4 | 6990/20000 [00:44<01:08, 190.65it/s]\n",
355
+ " 35%|###5 | 7010/20000 [00:44<01:10, 184.33it/s]\n",
356
+ " 35%|###5 | 7029/20000 [00:44<01:10, 183.34it/s]\n",
357
+ " 35%|###5 | 7049/20000 [00:44<01:08, 188.06it/s]\n",
358
+ " 35%|###5 | 7068/20000 [00:44<01:08, 188.62it/s]\n",
359
+ " 35%|###5 | 7087/20000 [00:44<01:09, 184.71it/s]\n",
360
+ " 36%|###5 | 7106/20000 [00:45<01:11, 179.97it/s]\n",
361
+ " 36%|###5 | 7129/20000 [00:45<01:06, 192.60it/s]\n",
362
+ " 36%|###5 | 7151/20000 [00:45<01:04, 198.21it/s]\n",
363
+ " 36%|###5 | 7172/20000 [00:45<01:04, 198.74it/s]\n",
364
+ " 36%|###5 | 7192/20000 [00:45<01:04, 197.39it/s]\n",
365
+ " 36%|###6 | 7212/20000 [00:45<01:05, 195.87it/s]\n",
366
+ " 36%|###6 | 7235/20000 [00:45<01:02, 205.80it/s]\n",
367
+ " 36%|###6 | 7256/20000 [00:45<01:01, 205.81it/s]\n",
368
+ " 36%|###6 | 7277/20000 [00:45<01:03, 201.69it/s]\n",
369
+ " 36%|###6 | 7298/20000 [00:46<01:05, 193.96it/s]\n",
370
+ " 37%|###6 | 7320/20000 [00:46<01:03, 200.19it/s]\n",
371
+ " 37%|###6 | 7343/20000 [00:46<01:00, 208.13it/s]\n",
372
+ " 37%|###6 | 7364/20000 [00:46<01:00, 208.67it/s]\n",
373
+ " 37%|###6 | 7385/20000 [00:46<01:01, 204.27it/s]\n",
374
+ " 37%|###7 | 7406/20000 [00:46<01:03, 199.56it/s]\n",
375
+ " 37%|###7 | 7429/20000 [00:46<01:00, 207.10it/s]\n",
376
+ " 37%|###7 | 7450/20000 [00:46<01:00, 207.93it/s]\n",
377
+ " 37%|###7 | 7471/20000 [00:46<01:00, 207.32it/s]\n",
378
+ " 37%|###7 | 7492/20000 [00:46<01:01, 204.50it/s]\n",
379
+ " 38%|###7 | 7513/20000 [00:47<01:00, 205.49it/s]\n",
380
+ " 38%|###7 | 7537/20000 [00:47<00:57, 215.00it/s]\n",
381
+ " 38%|###7 | 7559/20000 [00:47<00:57, 215.82it/s]\n",
382
+ " 38%|###7 | 7581/20000 [00:47<00:58, 210.82it/s]\n",
383
+ " 38%|###8 | 7603/20000 [00:47<01:01, 200.64it/s]\n",
384
+ " 38%|###8 | 7627/20000 [00:47<00:58, 211.11it/s]\n",
385
+ " 38%|###8 | 7650/20000 [00:47<00:57, 215.27it/s]\n",
386
+ " 38%|###8 | 7672/20000 [00:47<00:58, 211.14it/s]\n",
387
+ " 38%|###8 | 7694/20000 [00:47<01:00, 203.74it/s]\n",
388
+ " 39%|###8 | 7716/20000 [00:48<00:59, 206.56it/s]\n",
389
+ " 39%|###8 | 7742/20000 [00:48<00:55, 219.27it/s]\n",
390
+ " 39%|###8 | 7765/20000 [00:48<00:55, 222.34it/s]\n",
391
+ " 39%|###8 | 7788/20000 [00:48<00:55, 220.09it/s]\n",
392
+ " 39%|###9 | 7811/20000 [00:48<00:56, 217.29it/s]\n",
393
+ " 39%|###9 | 7837/20000 [00:48<00:53, 227.59it/s]\n",
394
+ " 39%|###9 | 7860/20000 [00:48<00:53, 225.68it/s]\n",
395
+ " 39%|###9 | 7883/20000 [00:48<00:57, 210.45it/s]\n",
396
+ " 40%|###9 | 7905/20000 [00:48<00:59, 203.50it/s]\n",
397
+ " 40%|###9 | 7931/20000 [00:49<00:55, 218.40it/s]\n",
398
+ " 40%|###9 | 7955/20000 [00:49<00:53, 223.84it/s]\n",
399
+ " 40%|###9 | 7978/20000 [00:49<00:54, 222.42it/s]\n",
400
+ " 40%|#### | 8001/20000 [00:49<00:56, 211.24it/s]\n",
401
+ " 40%|#### | 8028/20000 [00:49<00:52, 226.96it/s]\n",
402
+ " 40%|#### | 8052/20000 [00:49<00:52, 229.36it/s]\n",
403
+ " 40%|#### | 8076/20000 [00:49<00:52, 226.55it/s]\n",
404
+ " 40%|#### | 8099/20000 [00:49<00:54, 217.59it/s]\n",
405
+ " 41%|#### | 8121/20000 [00:50<01:26, 136.56it/s]\n",
406
+ " 41%|#### | 8144/20000 [00:50<01:16, 154.69it/s]\n",
407
+ " 41%|#### | 8165/20000 [00:50<01:11, 165.14it/s]\n",
408
+ " 41%|#### | 8186/20000 [00:50<01:07, 174.19it/s]\n",
409
+ " 41%|####1 | 8206/20000 [00:50<01:05, 179.79it/s]\n",
410
+ " 41%|####1 | 8234/20000 [00:50<00:57, 205.18it/s]\n",
411
+ " 41%|####1 | 8259/20000 [00:50<00:54, 215.64it/s]\n",
412
+ " 41%|####1 | 8282/20000 [00:50<00:53, 219.03it/s]\n",
413
+ " 42%|####1 | 8305/20000 [00:50<00:55, 209.63it/s]\n",
414
+ " 42%|####1 | 8334/20000 [00:51<00:50, 229.98it/s]\n",
415
+ " 42%|####1 | 8359/20000 [00:51<00:49, 234.96it/s]\n",
416
+ " 42%|####1 | 8383/20000 [00:51<00:50, 230.45it/s]\n",
417
+ " 42%|####2 | 8407/20000 [00:51<00:52, 222.38it/s]\n",
418
+ " 42%|####2 | 8436/20000 [00:51<00:48, 240.62it/s]\n",
419
+ " 42%|####2 | 8461/20000 [00:51<00:47, 242.60it/s]\n",
420
+ " 42%|####2 | 8486/20000 [00:51<00:47, 239.88it/s]\n",
421
+ " 43%|####2 | 8511/20000 [00:51<00:48, 236.67it/s]\n",
422
+ " 43%|####2 | 8539/20000 [00:51<00:46, 247.65it/s]\n",
423
+ " 43%|####2 | 8564/20000 [00:51<00:46, 244.78it/s]\n",
424
+ " 43%|####2 | 8589/20000 [00:52<00:48, 236.04it/s]\n",
425
+ " 43%|####3 | 8613/20000 [00:52<00:48, 235.15it/s]\n",
426
+ " 43%|####3 | 8643/20000 [00:52<00:45, 252.29it/s]\n",
427
+ " 43%|####3 | 8669/20000 [00:52<00:45, 248.06it/s]\n",
428
+ " 43%|####3 | 8694/20000 [00:52<00:47, 240.33it/s]\n",
429
+ " 44%|####3 | 8720/20000 [00:52<00:46, 243.84it/s]\n",
430
+ " 44%|####3 | 8748/20000 [00:52<00:44, 254.21it/s]\n",
431
+ " 44%|####3 | 8777/20000 [00:52<00:42, 263.08it/s]\n",
432
+ " 44%|####4 | 8808/20000 [00:52<00:40, 276.78it/s]\n",
433
+ " 44%|####4 | 8856/20000 [00:53<00:33, 336.62it/s]\n",
434
+ " 44%|####4 | 8896/20000 [00:53<00:31, 353.29it/s]\n",
435
+ " 45%|####4 | 8955/20000 [00:53<00:26, 422.10it/s]\n",
436
+ " 45%|####5 | 9001/20000 [00:53<00:25, 425.76it/s]\n",
437
+ " 45%|####5 | 9070/20000 [00:53<00:21, 501.92it/s]\n",
438
+ " 46%|####5 | 9128/20000 [00:53<00:20, 523.43it/s]\n",
439
+ " 46%|####5 | 9183/20000 [00:53<00:20, 529.69it/s]\n",
440
+ " 46%|####6 | 9237/20000 [00:53<00:20, 531.16it/s]\n",
441
+ " 46%|####6 | 9291/20000 [00:53<00:21, 508.14it/s]\n",
442
+ " 47%|####6 | 9346/20000 [00:53<00:20, 520.17it/s]\n",
443
+ " 47%|####6 | 9399/20000 [00:54<00:20, 509.66it/s]\n",
444
+ " 47%|####7 | 9451/20000 [00:54<00:20, 509.69it/s]\n",
445
+ " 48%|####7 | 9503/20000 [00:54<00:21, 494.01it/s]\n",
446
+ " 48%|####7 | 9580/20000 [00:54<00:18, 569.61it/s]\n",
447
+ " 48%|####8 | 9649/20000 [00:54<00:17, 602.61it/s]\n",
448
+ " 49%|####8 | 9710/20000 [00:54<00:17, 597.78it/s]\n",
449
+ " 49%|####8 | 9792/20000 [00:54<00:15, 660.63it/s]\n",
450
+ " 49%|####9 | 9874/20000 [00:54<00:14, 707.37it/s]\n",
451
+ " 50%|####9 | 9946/20000 [00:54<00:14, 698.70it/s]\n",
452
+ " 50%|##### | 10029/20000 [00:54<00:13, 735.01it/s]\n",
453
+ " 51%|##### | 10103/20000 [00:55<00:14, 703.33it/s]\n",
454
+ " 51%|##### | 10187/20000 [00:55<00:13, 742.49it/s]\n",
455
+ " 51%|#####1 | 10275/20000 [00:55<00:12, 782.42it/s]\n",
456
+ " 52%|#####1 | 10372/20000 [00:55<00:11, 837.41it/s]\n",
457
+ " 52%|#####2 | 10463/20000 [00:55<00:11, 858.83it/s]\n",
458
+ " 53%|#####2 | 10550/20000 [00:55<00:10, 862.08it/s]\n",
459
+ " 53%|#####3 | 10640/20000 [00:55<00:10, 873.35it/s]\n",
460
+ " 54%|#####3 | 10728/20000 [00:55<00:10, 857.41it/s]\n",
461
+ " 54%|#####4 | 10815/20000 [00:55<00:10, 858.61it/s]\n",
462
+ " 55%|#####4 | 10902/20000 [00:56<00:10, 861.95it/s]\n",
463
+ " 55%|#####5 | 11034/20000 [00:56<00:08, 997.87it/s]\n",
464
+ " 56%|#####5 | 11179/20000 [00:56<00:07, 1132.48it/s]\n",
465
+ " 56%|#####6 | 11296/20000 [00:56<00:07, 1143.65it/s]\n",
466
+ " 57%|#####7 | 11420/20000 [00:56<00:07, 1172.43it/s]\n",
467
+ " 58%|#####7 | 11579/20000 [00:56<00:06, 1297.33it/s]\n",
468
+ " 59%|#####8 | 11758/20000 [00:56<00:05, 1444.84it/s]\n",
469
+ " 60%|#####9 | 11971/20000 [00:56<00:04, 1650.09it/s]\n",
470
+ " 61%|###### | 12137/20000 [00:58<00:32, 241.50it/s] \n",
471
+ " 61%|######1 | 12256/20000 [00:59<00:34, 221.55it/s]\n",
472
+ " 62%|######1 | 12344/20000 [00:59<00:36, 211.18it/s]\n",
473
+ " 62%|######2 | 12411/20000 [01:00<00:37, 204.44it/s]\n",
474
+ " 62%|######2 | 12464/20000 [01:00<00:37, 201.86it/s]\n",
475
+ " 63%|######2 | 12507/20000 [01:00<00:37, 197.48it/s]\n",
476
+ " 63%|######2 | 12542/20000 [01:00<00:35, 209.76it/s]\n",
477
+ " 63%|######2 | 12576/20000 [01:01<00:33, 220.35it/s]\n",
478
+ " 63%|######3 | 12609/20000 [01:01<00:32, 226.03it/s]\n",
479
+ " 63%|######3 | 12640/20000 [01:01<00:31, 234.04it/s]\n",
480
+ " 63%|######3 | 12670/20000 [01:01<00:29, 246.31it/s]\n",
481
+ " 64%|######3 | 12700/20000 [01:01<00:28, 251.76it/s]\n",
482
+ " 64%|######3 | 12731/20000 [01:01<00:27, 263.76it/s]\n",
483
+ " 64%|######3 | 12761/20000 [01:01<00:26, 272.59it/s]\n",
484
+ " 64%|######3 | 12791/20000 [01:01<00:26, 271.94it/s]\n",
485
+ " 64%|######4 | 12820/20000 [01:01<00:26, 274.53it/s]\n",
486
+ " 64%|######4 | 12850/20000 [01:02<00:25, 281.46it/s]\n",
487
+ " 64%|######4 | 12879/20000 [01:02<00:25, 277.63it/s]\n",
488
+ " 65%|######4 | 12908/20000 [01:02<00:26, 264.03it/s]\n",
489
+ " 65%|######4 | 12939/20000 [01:02<00:25, 276.61it/s]\n",
490
+ " 65%|######4 | 12969/20000 [01:02<00:24, 281.60it/s]\n",
491
+ " 65%|######4 | 12998/20000 [01:02<00:24, 280.78it/s]\n",
492
+ " 65%|######5 | 13027/20000 [01:02<00:25, 275.51it/s]\n",
493
+ " 65%|######5 | 13058/20000 [01:02<00:24, 285.34it/s]\n",
494
+ " 65%|######5 | 13087/20000 [01:02<00:24, 285.03it/s]\n",
495
+ " 66%|######5 | 13117/20000 [01:03<00:23, 287.71it/s]\n",
496
+ " 66%|######5 | 13151/20000 [01:03<00:22, 301.25it/s]\n",
497
+ " 66%|######5 | 13182/20000 [01:03<00:22, 299.42it/s]\n",
498
+ " 66%|######6 | 13213/20000 [01:03<00:23, 288.18it/s]\n",
499
+ " 66%|######6 | 13247/20000 [01:03<00:22, 302.07it/s]\n",
500
+ " 66%|######6 | 13280/20000 [01:03<00:21, 309.23it/s]\n",
501
+ " 67%|######6 | 13312/20000 [01:03<00:21, 306.12it/s]\n",
502
+ " 67%|######6 | 13348/20000 [01:03<00:20, 321.72it/s]\n",
503
+ " 67%|######6 | 13381/20000 [01:03<00:20, 320.39it/s]\n",
504
+ " 67%|######7 | 13414/20000 [01:04<00:35, 183.90it/s]\n",
505
+ " 67%|######7 | 13448/20000 [01:04<00:30, 213.47it/s]\n",
506
+ " 67%|######7 | 13478/20000 [01:04<00:28, 232.06it/s]\n",
507
+ " 68%|######7 | 13508/20000 [01:04<00:26, 246.85it/s]\n",
508
+ " 68%|######7 | 13546/20000 [01:04<00:23, 278.79it/s]\n",
509
+ " 68%|######7 | 13578/20000 [01:04<00:22, 289.60it/s]\n",
510
+ " 68%|######8 | 13610/20000 [01:04<00:21, 290.75it/s]\n",
511
+ " 68%|######8 | 13650/20000 [01:04<00:19, 319.96it/s]\n",
512
+ " 68%|######8 | 13684/20000 [01:05<00:19, 322.87it/s]\n",
513
+ " 69%|######8 | 13718/20000 [01:05<00:19, 324.97it/s]\n",
514
+ " 69%|######8 | 13753/20000 [01:05<00:18, 332.16it/s]\n",
515
+ " 69%|######8 | 13787/20000 [01:05<00:19, 323.16it/s]\n",
516
+ " 69%|######9 | 13820/20000 [01:05<00:19, 317.82it/s]\n",
517
+ " 69%|######9 | 13857/20000 [01:05<00:18, 332.74it/s]\n",
518
+ " 69%|######9 | 13891/20000 [01:05<00:18, 333.86it/s]\n",
519
+ " 70%|######9 | 13927/20000 [01:05<00:17, 340.50it/s]\n",
520
+ " 70%|######9 | 13963/20000 [01:05<00:17, 345.20it/s]\n",
521
+ " 70%|######9 | 13998/20000 [01:05<00:17, 340.60it/s]\n",
522
+ " 70%|####### | 14036/20000 [01:06<00:16, 351.09it/s]\n",
523
+ " 70%|####### | 14073/20000 [01:06<00:16, 356.65it/s]\n",
524
+ " 71%|####### | 14109/20000 [01:06<00:16, 353.45it/s]\n",
525
+ " 71%|####### | 14150/20000 [01:06<00:15, 369.02it/s]\n",
526
+ " 71%|####### | 14187/20000 [01:06<00:15, 368.21it/s]\n",
527
+ " 71%|#######1 | 14227/20000 [01:06<00:15, 375.42it/s]\n",
528
+ " 71%|#######1 | 14265/20000 [01:06<00:16, 345.08it/s]\n",
529
+ " 72%|#######1 | 14301/20000 [01:06<00:16, 347.30it/s]\n",
530
+ " 72%|#######1 | 14349/20000 [01:06<00:14, 383.90it/s]\n",
531
+ " 72%|#######1 | 14388/20000 [01:06<00:14, 376.96it/s]\n",
532
+ " 72%|#######2 | 14430/20000 [01:07<00:14, 389.28it/s]\n",
533
+ " 72%|#######2 | 14471/20000 [01:07<00:13, 395.30it/s]\n",
534
+ " 73%|#######2 | 14511/20000 [01:07<00:14, 389.82it/s]\n",
535
+ " 73%|#######2 | 14554/20000 [01:07<00:13, 401.53it/s]\n",
536
+ " 73%|#######2 | 14595/20000 [01:07<00:14, 378.41it/s]\n",
537
+ " 73%|#######3 | 14643/20000 [01:07<00:13, 405.95it/s]\n",
538
+ " 73%|#######3 | 14687/20000 [01:07<00:12, 415.69it/s]\n",
539
+ " 74%|#######3 | 14730/20000 [01:07<00:12, 418.62it/s]\n",
540
+ " 74%|#######3 | 14774/20000 [01:07<00:12, 422.40it/s]\n",
541
+ " 74%|#######4 | 14817/20000 [01:08<00:12, 418.48it/s]\n",
542
+ " 74%|#######4 | 14868/20000 [01:08<00:11, 443.95it/s]\n",
543
+ " 75%|#######4 | 14913/20000 [01:08<00:11, 444.41it/s]\n",
544
+ " 75%|#######4 | 14962/20000 [01:08<00:11, 457.86it/s]\n",
545
+ " 75%|#######5 | 15008/20000 [01:08<00:11, 438.97it/s]\n",
546
+ " 75%|#######5 | 15067/20000 [01:08<00:10, 481.14it/s]\n",
547
+ " 76%|#######5 | 15116/20000 [01:08<00:10, 483.71it/s]\n",
548
+ " 76%|#######5 | 15173/20000 [01:08<00:09, 509.06it/s]\n",
549
+ " 76%|#######6 | 15227/20000 [01:08<00:09, 518.19it/s]\n",
550
+ " 76%|#######6 | 15285/20000 [01:08<00:08, 534.95it/s]\n",
551
+ " 77%|#######6 | 15351/20000 [01:09<00:08, 570.41it/s]\n",
552
+ " 77%|#######7 | 15409/20000 [01:09<00:08, 569.86it/s]\n",
553
+ " 77%|#######7 | 15477/20000 [01:09<00:07, 602.56it/s]\n",
554
+ " 78%|#######7 | 15538/20000 [01:09<00:07, 602.96it/s]\n",
555
+ " 78%|#######7 | 15599/20000 [01:09<00:07, 585.87it/s]\n",
556
+ " 78%|#######8 | 15658/20000 [01:09<00:07, 581.97it/s]\n",
557
+ " 79%|#######8 | 15722/20000 [01:09<00:07, 598.93it/s]\n",
558
+ " 79%|#######8 | 15799/20000 [01:09<00:06, 647.41it/s]\n",
559
+ " 79%|#######9 | 15877/20000 [01:09<00:06, 684.57it/s]\n",
560
+ " 80%|#######9 | 15957/20000 [01:09<00:05, 718.72it/s]\n",
561
+ " 80%|######## | 16037/20000 [01:10<00:05, 740.70it/s]\n",
562
+ " 81%|######## | 16112/20000 [01:10<00:05, 730.42it/s]\n",
563
+ " 81%|######## | 16195/20000 [01:10<00:05, 757.50it/s]\n",
564
+ " 81%|########1 | 16288/20000 [01:10<00:04, 808.47it/s]\n",
565
+ " 82%|########1 | 16369/20000 [01:10<00:04, 797.07it/s]\n",
566
+ " 82%|########2 | 16467/20000 [01:10<00:04, 850.97it/s]\n",
567
+ " 83%|########2 | 16563/20000 [01:10<00:03, 883.26it/s]\n",
568
+ " 83%|########3 | 16659/20000 [01:10<00:03, 906.02it/s]\n",
569
+ " 84%|########3 | 16767/20000 [01:10<00:03, 957.87it/s]\n",
570
+ " 84%|########4 | 16881/20000 [01:10<00:03, 1012.25it/s]\n",
571
+ " 85%|########4 | 16990/20000 [01:11<00:02, 1035.48it/s]\n",
572
+ " 86%|########5 | 17120/20000 [01:11<00:02, 1114.63it/s]\n",
573
+ " 86%|########6 | 17240/20000 [01:11<00:02, 1136.79it/s]\n",
574
+ " 87%|########6 | 17379/20000 [01:11<00:02, 1212.48it/s]\n",
575
+ " 88%|########7 | 17514/20000 [01:11<00:01, 1249.92it/s]\n",
576
+ " 88%|########8 | 17656/20000 [01:11<00:01, 1300.74it/s]\n",
577
+ " 89%|########9 | 17812/20000 [01:11<00:01, 1378.28it/s]\n",
578
+ " 90%|######### | 18001/20000 [01:11<00:01, 1522.37it/s]\n",
579
+ " 91%|#########1| 18201/20000 [01:11<00:01, 1664.77it/s]\n",
580
+ " 92%|#########2| 18455/20000 [01:11<00:00, 1926.29it/s]\n",
581
+ " 94%|#########3| 18729/20000 [01:13<00:03, 331.40it/s] \n",
582
+ " 94%|#########4| 18869/20000 [01:14<00:04, 279.90it/s]\n",
583
+ " 95%|#########4| 18972/20000 [01:15<00:04, 253.37it/s]\n",
584
+ " 95%|#########5| 19050/20000 [01:15<00:03, 238.36it/s]\n",
585
+ " 96%|#########5| 19110/20000 [01:16<00:03, 223.98it/s]\n",
586
+ " 96%|#########5| 19157/20000 [01:16<00:03, 218.87it/s]\n",
587
+ " 96%|#########5| 19196/20000 [01:16<00:03, 212.50it/s]\n",
588
+ " 96%|#########6| 19229/20000 [01:16<00:03, 208.06it/s]\n",
589
+ " 96%|#########6| 19258/20000 [01:16<00:03, 205.77it/s]\n",
590
+ " 96%|#########6| 19284/20000 [01:17<00:03, 202.04it/s]\n",
591
+ " 97%|#########6| 19308/20000 [01:17<00:03, 197.39it/s]\n",
592
+ " 97%|#########6| 19330/20000 [01:17<00:03, 197.54it/s]\n",
593
+ " 97%|#########6| 19352/20000 [01:17<00:03, 196.16it/s]\n",
594
+ " 97%|#########6| 19373/20000 [01:17<00:03, 194.10it/s]\n",
595
+ " 97%|#########6| 19394/20000 [01:17<00:03, 191.14it/s]\n",
596
+ " 97%|#########7| 19414/20000 [01:17<00:03, 190.06it/s]\n",
597
+ " 97%|#########7| 19434/20000 [01:17<00:02, 192.10it/s]\n",
598
+ " 97%|#########7| 19454/20000 [01:17<00:02, 188.68it/s]\n",
599
+ " 97%|#########7| 19474/20000 [01:18<00:02, 188.67it/s]\n",
600
+ " 97%|#########7| 19493/20000 [01:18<00:02, 188.00it/s]\n",
601
+ " 98%|#########7| 19512/20000 [01:18<00:02, 187.50it/s]\n",
602
+ " 98%|#########7| 19533/20000 [01:18<00:02, 193.36it/s]\n",
603
+ " 98%|#########7| 19553/20000 [01:18<00:02, 194.71it/s]\n",
604
+ " 98%|#########7| 19573/20000 [01:18<00:02, 194.55it/s]\n",
605
+ " 98%|#########7| 19593/20000 [01:18<00:02, 192.76it/s]\n",
606
+ " 98%|#########8| 19613/20000 [01:18<00:02, 190.98it/s]\n",
607
+ " 98%|#########8| 19634/20000 [01:18<00:01, 194.23it/s]\n",
608
+ " 98%|#########8| 19654/20000 [01:18<00:01, 193.65it/s]\n",
609
+ " 98%|#########8| 19674/20000 [01:19<00:01, 192.69it/s]\n",
610
+ " 98%|#########8| 19694/20000 [01:19<00:01, 192.02it/s]\n",
611
+ " 99%|#########8| 19714/20000 [01:19<00:01, 192.65it/s]\n",
612
+ " 99%|#########8| 19736/20000 [01:19<00:01, 198.30it/s]\n",
613
+ " 99%|#########8| 19757/20000 [01:19<00:01, 200.54it/s]\n",
614
+ " 99%|#########8| 19778/20000 [01:19<00:01, 198.65it/s]\n",
615
+ " 99%|#########8| 19798/20000 [01:19<00:01, 197.32it/s]\n",
616
+ " 99%|#########9| 19818/20000 [01:19<00:00, 197.53it/s]\n",
617
+ " 99%|#########9| 19839/20000 [01:19<00:00, 200.59it/s]\n",
618
+ " 99%|#########9| 19860/20000 [01:19<00:00, 196.98it/s]\n",
619
+ " 99%|#########9| 19881/20000 [01:20<00:00, 198.45it/s]\n",
620
+ "100%|#########9| 19901/20000 [01:20<00:00, 193.05it/s]\n",
621
+ "100%|#########9| 19924/20000 [01:20<00:00, 201.34it/s]\n",
622
+ "100%|#########9| 19946/20000 [01:20<00:00, 205.53it/s]\n",
623
+ "100%|#########9| 19967/20000 [01:20<00:00, 205.63it/s]\n",
624
+ "100%|#########9| 19988/20000 [01:20<00:00, 203.92it/s]\n",
625
+ "100%|##########| 20000/20000 [01:20<00:00, 247.89it/s]\n"
626
+ ]
627
+ }
628
+ ],
629
+ "source": [
630
+ "!python learn_bpe.py -s 20000 -i dataset/output.txt -o dataset/codec.txt"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": 12,
636
+ "id": "68a4113a",
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "!apply_bpe.py -i ./dataset/output.txt -o ./dataset/output_dataset.txt -c ./dataset/codec.txt"
641
+ ]
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "execution_count": 13,
646
+ "id": "06254f0d",
647
+ "metadata": {},
648
+ "outputs": [
649
+ {
650
+ "name": "stdout",
651
+ "output_type": "stream",
652
+ "text": [
653
+ "Vocabulary size: 20217\n"
654
+ ]
655
+ }
656
+ ],
657
+ "source": [
658
+ "def count_tokens(file_path):\n",
659
+ " try:\n",
660
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
661
+ " text = file.read()\n",
662
+ " # Split the text into tokens based on spaces\n",
663
+ " tokens = text.split()\n",
664
+ " # Count the vocabulary size (number of unique tokens)\n",
665
+ " vocabulary_size = len(set(tokens))\n",
666
+ " return vocabulary_size\n",
667
+ " except IOError:\n",
668
+ " print(f\"Error: Could not open or read the file '{file_path}'\")\n",
669
+ " return -1\n",
670
+ "\n",
671
+ "# Example usage\n",
672
+ "file_path = './dataset/output_dataset.txt' # Replace with the actual file path\n",
673
+ "vocabulary_size = count_tokens(file_path)\n",
674
+ "if vocabulary_size != -1:\n",
675
+ " print(f\"Vocabulary size: {vocabulary_size}\")\n"
676
+ ]
677
+ }
678
+ ],
679
+ "metadata": {
680
+ "kernelspec": {
681
+ "display_name": "Python 3 (ipykernel)",
682
+ "language": "python",
683
+ "name": "python3"
684
+ },
685
+ "language_info": {
686
+ "codemirror_mode": {
687
+ "name": "ipython",
688
+ "version": 3
689
+ },
690
+ "file_extension": ".py",
691
+ "mimetype": "text/x-python",
692
+ "name": "python",
693
+ "nbconvert_exporter": "python",
694
+ "pygments_lexer": "ipython3",
695
+ "version": "3.9.5"
696
+ }
697
+ },
698
+ "nbformat": 4,
699
+ "nbformat_minor": 5
700
+ }
subword/get_vocab.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/env python
2
+ from __future__ import print_function
3
+
4
+ import os
5
+ import sys
6
+ import inspect
7
+ import warnings
8
+ import argparse
9
+ import codecs
10
+
11
+ from collections import Counter
12
+
13
+ # hack for python2/3 compatibility
14
+ from io import open
15
+ argparse.open = open
16
+
17
def create_parser(subparsers=None):
    """Build the command-line parser of the vocabulary extractor.

    If *subparsers* is truthy, the parser is registered on it under the
    ``get-vocab`` name; otherwise a standalone ``ArgumentParser`` is
    created.  Both variants accept ``--input/-i`` and ``--output/-o``,
    which default to standard input and standard output.

    Returns the configured parser.
    """
    settings = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Generates vocabulary")
    if subparsers:
        parser = subparsers.add_parser('get-vocab', **settings)
    else:
        parser = argparse.ArgumentParser(**settings)

    # (long flag, short flag, open mode, default stream, help text)
    stream_options = (
        ('--input', '-i', 'r', sys.stdin,
         "Input file (default: standard input)."),
        ('--output', '-o', 'w', sys.stdout,
         "Output file (default: standard output)"),
    )
    for long_flag, short_flag, mode, fallback, description in stream_options:
        parser.add_argument(
            long_flag, short_flag, type=argparse.FileType(mode),
            default=fallback, metavar='PATH', help=description)

    return parser
39
+
40
def get_vocab(train_file, vocab_file):
    """Count space-separated words and write them, most frequent first.

    Each line of *train_file* has leading/trailing ``'\\r'``, ``'\\n'`` and
    spaces stripped and is then split on single spaces; empty pieces are
    ignored.  One ``"<word> <count>"`` line per distinct word is written to
    *vocab_file*, ordered by descending count (ties keep first-seen order).
    """
    counts = Counter()
    for line in train_file:
        counts.update(token for token in line.strip('\r\n ').split(' ') if token)

    # Ascending sort on the negated count is a stable descending sort.
    ranked = sorted(counts.items(), key=lambda entry: -entry[1])
    for token, count in ranked:
        vocab_file.write(token + " " + str(count) + "\n")
51
+
52
if __name__ == "__main__":

    # Emit a deprecation warning when a 'subword_nmt' directory exists next
    # to this script (i.e. the script is run from its legacy location).
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: force UTF-8 on the standard streams
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    # read/write files as UTF-8
    # argparse.FileType has already opened these paths; close those handles
    # before reopening with an explicit encoding so they are not leaked.
    if args.input.name != '<stdin>':
        input_path = args.input.name
        args.input.close()
        args.input = codecs.open(input_path, encoding='utf-8')
    if args.output.name != '<stdout>':
        output_path = args.output.name
        args.output.close()
        args.output = codecs.open(output_path, 'w', encoding='utf-8')

    try:
        get_vocab(args.input, args.output)
    finally:
        # close files, even when get_vocab() raised
        if args.input.name != '<stdin>':
            args.input.close()
        if args.output.name != '<stdout>':
            args.output.close()
subword/learn_bpe.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Author: Rico Sennrich
4
+
5
+ """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
6
+ Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
7
+ of a text to a configurable number of symbols, with only a small increase in the number of tokens.
8
+
9
+ Reference:
10
+ Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
11
+ Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
12
+ """
13
+
14
+ from __future__ import unicode_literals
15
+
16
+ import os
17
+ import sys
18
+ import inspect
19
+ import codecs
20
+ import re
21
+ import copy
22
+ import argparse
23
+ import warnings
24
+ import tempfile
25
+ from multiprocessing import Pool, cpu_count
26
+ from collections import defaultdict, Counter
27
+
28
+ try:
29
+ from tqdm import tqdm
30
+ except ImportError:
31
+ def tqdm(iterator, *args, **kwargs):
32
+ return iterator
33
+
34
+ # hack for python2/3 compatibility
35
+ from io import open
36
+ argparse.open = open
37
+
38
def create_parser(subparsers=None):
    """Create the command-line parser for BPE learning.

    When *subparsers* is truthy, the parser is registered on it as the
    ``learn-bpe`` sub-command; otherwise a standalone ``ArgumentParser`` is
    built.  The same options are attached in both cases.

    Returns the configured parser.
    """
    parser_kwargs = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")
    if subparsers:
        parser = subparsers.add_parser('learn-bpe', **parser_kwargs)
    else:
        parser = argparse.ArgumentParser(**parser_kwargs)

    add = parser.add_argument

    # input / output streams
    add('--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")
    add('--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file for BPE codes (default: standard output)")

    # learning parameters
    add('--symbols', '-s', type=int, default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")
    add('--min-frequency', type=int, default=2, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    add('--dict-input', action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    add('--total-symbols', '-t', action="store_true",
        help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")

    # runtime behaviour
    add('--num-workers', type=int, default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")
    add('--verbose', '-v', action="store_true",
        help="verbose mode.")

    return parser
77
+
78
def get_vocabulary(fobj, is_dict=False, num_workers=1):
    """Read text and return dictionary that encodes vocabulary

    Args:
        fobj: iterable of text lines.  In parallel mode (``num_workers > 1``)
            it must also expose ``name``, the path each worker re-opens.
        is_dict: if true, every line must be a ``"<word> <count>"`` pair
            separated by exactly one space; counts of repeated words are
            summed.  Otherwise lines are split on single spaces and each
            non-empty piece counts once.
        num_workers: number of worker processes.  With 1 (or when the input
            is standard input) the text is counted in-process.

    Returns:
        A ``Counter`` mapping each word to its frequency.

    Raises:
        ValueError: if *num_workers* is not a positive number (non-dict,
            non-stdin input only), or if a dictionary count is not an integer.
        SystemExit: exit status 1 when a dictionary line cannot be split
            into exactly two fields, or when parallel mode is requested
            under Python 2.
    """
    vocab = Counter()
    if is_dict:
        for i, line in enumerate(fobj):
            try:
                word, count = line.strip('\r\n ').split(' ')
            except ValueError:
                # Only a wrong field count is a "bad line"; a bare except
                # here would also swallow KeyboardInterrupt/SystemExit.
                print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                sys.exit(1)
            vocab[word] += int(count)
    elif num_workers == 1 or fobj.name == '<stdin>':
        if num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for line in fobj:
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
    elif num_workers > 1:

        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        # Split the file into num_workers chunks whose boundaries fall on
        # line starts.  offsets[0] and offsets[num_workers] stay 0; the
        # worker treats an end offset of 0 as "read to end of file".
        with open(fobj.name, encoding="utf8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = int(size / num_workers)
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                while True:
                    try:
                        line = f.readline()
                        break
                    except UnicodeDecodeError:
                        # landed inside a multi-byte character: step back
                        pos -= 1
                        f.seek(pos)
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"

        vocab_files = []
        results = []
        pool = Pool(processes=num_workers)
        for i in range(num_workers):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            vocab_files.append(tmp)
            results.append(pool.apply_async(_get_vocabulary, (fobj.name, tmp.name, offsets[i], offsets[i + 1])))
        pool.close()
        pool.join()
        import pickle
        try:
            for i in range(num_workers):
                # get() re-raises whatever the worker raised, instead of
                # failing later while unpickling an empty temp file.
                results[i].get()
                with open(vocab_files[i].name, 'rb') as f:
                    # the pickles were written by our own workers (trusted)
                    vocab += pickle.load(f)
        finally:
            # remove every temp file, also when a worker failed
            for tmp in vocab_files:
                if os.path.exists(tmp.name):
                    os.remove(tmp.name)
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
    return vocab
137
+
138
def _get_vocabulary(infile, outfile, begin, end):
    """Count the words of one chunk of *infile* and pickle the result.

    Reading starts at offset *begin*.  A line is counted only if the file
    position after reading it does not exceed *end*; an *end* of 0 (or
    less) disables that limit, so the rest of the file is read.  Words are
    the non-empty pieces obtained by splitting each stripped line on single
    spaces.  The resulting ``Counter`` is pickled to *outfile*.
    """
    import pickle
    counts = Counter()
    with open(infile, encoding="utf8") as reader:
        reader.seek(begin)
        # readline() returns '' at end of file, which ends the iteration.
        for line in iter(reader.readline, ''):
            offset = reader.tell()
            assert 0 <= offset < 1e20, "Bad new line separator, e.g. '\\r'"
            if end > 0 and offset > end:
                break
            counts.update(tok for tok in line.strip('\r\n ').split(' ') if tok)
    with open(outfile, 'wb') as writer:
        pickle.dump(counts, writer)
155
+
156
def update_pair_statistics(pair, changed, stats, indices):
    """Minimally update the indices and frequency of symbol pairs

    if we merge a pair of symbols, only pairs that overlap with occurrences
    of this pair are affected, and need to be updated.

    Args:
        pair: the merged pair, a 2-tuple ``(first, second)`` of symbols.
        changed: iterable of ``(j, word, old_word, freq)`` tuples, the shape
            ``replace_pair`` returns: word index, symbol sequence after the
            merge, symbol sequence before the merge, and word frequency.
        stats: mapping from symbol pair to frequency; updated in place.
        indices: mapping from symbol pair to ``{word index: count}``;
            updated in place.

    Returns nothing; ``stats`` and ``indices`` are mutated.
    """
    # the merged pair itself no longer occurs anywhere
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    new_pair = first+second
    for j, word, old_word, freq in changed:

        # Pass 1 (old word): remove the pairs destroyed by the merge.
        # find all instances of pair, and update frequency/indices around it
        i = 0
        while True:
            # find first symbol
            try:
                i = old_word.index(first, i)
            except ValueError:
                break
            # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2])
            if i < len(old_word)-1 and old_word[i+1] == second:
                # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B"
                if i:
                    prev = old_word[i-1:i+1]
                    stats[prev] -= freq
                    indices[prev][j] -= 1
                if i < len(old_word)-2:
                    # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B".
                    # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block
                    if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second:
                        nex = old_word[i+1:i+3]
                        stats[nex] -= freq
                        indices[nex][j] -= 1
                # skip past both symbols of the occurrence just handled
                i += 2
            else:
                i += 1

        # Pass 2 (new word): add the pairs created around each merged symbol.
        i = 0
        while True:
            try:
                # find new pair
                i = word.index(new_pair, i)
            except ValueError:
                break
            # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC"
            if i:
                prev = word[i-1:i+1]
                stats[prev] += freq
                indices[prev][j] += 1
            # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B"
            # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block
            if i < len(word)-1 and word[i+1] != new_pair:
                nex = word[i:i+2]
                stats[nex] += freq
                indices[nex][j] += 1
            i += 1
213
+
214
+
215
def get_pair_statistics(vocab):
    """Tally every adjacent symbol pair and record which words contain it.

    *vocab* is a sequence of ``(word, freq)`` entries, where ``word`` is a
    non-empty sequence of symbols.

    Returns ``(stats, indices)``: ``stats`` maps a pair to its total
    frequency (each occurrence weighted by the word's ``freq``), and
    ``indices`` maps a pair to ``{position of the word in vocab: number of
    occurrences in that word}``.
    """
    pair_freq = defaultdict(int)
    pair_words = defaultdict(lambda: defaultdict(int))

    for word_idx, (symbols, count) in enumerate(vocab):
        left = symbols[0]
        for right in symbols[1:]:
            bigram = (left, right)
            pair_freq[bigram] += count
            pair_words[bigram][word_idx] += 1
            left = right

    return pair_freq, pair_words
232
+
233
+
234
def replace_pair(pair, vocab, indices):
    """Merge every occurrence of the symbol pair ('A', 'B') into 'AB'.

    Only the words listed in ``indices[pair]`` with a count of at least 1
    are rewritten.  ``vocab`` (a list of ``(word, freq)`` entries) is
    updated in place.

    Returns a list of ``(j, new_word, old_word, freq)`` tuples, one per
    rewritten word, where ``j`` is the word's position in ``vocab``.
    """
    first, second = pair
    merged = ''.join(pair)
    # the pair must be delimited by whitespace or the string boundaries
    pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)')

    occurrences = indices[pair]
    if sys.version_info < (3, 0):
        entries = occurrences.iteritems()
    else:
        entries = occurrences.items()

    changes = []
    for j, count in entries:
        if count >= 1:
            old_word, freq = vocab[j]
            # A callable replacement inserts the merged symbol literally,
            # so backslashes in it need no escaping.
            joined = pattern.sub(lambda match: merged, ' '.join(old_word))
            new_word = tuple(joined.split(' '))

            vocab[j] = (new_word, freq)
            changes.append((j, new_word, old_word, freq))

    return changes
257
+
258
def prune_stats(stats, big_stats, threshold):
    """Drop low-frequency pairs from ``stats`` so that max() stays fast.

    Every pair whose frequency is below *threshold* is removed from
    ``stats`` and its value is carried over to ``big_stats``: a negative
    value is added to the entry already stored there, any other value
    overwrites it.  ``big_stats`` therefore keeps the full statistics for
    the moment a pruned pair has to be looked at again.

    Both mappings are modified in place; nothing is returned.
    """
    # Collect first: entries cannot be deleted while iterating the dict.
    below = [(pair, count) for pair, count in stats.items() if count < threshold]
    for pair, count in below:
        del stats[pair]
        if count >= 0:
            big_stats[pair] = count
        else:
            big_stats[pair] += count
272
+
273
+
274
def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False, total_symbols=False, num_workers=1):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.

    Args:
        infile: text (or, with ``is_dict``, word-count lines) handed to
            ``get_vocabulary``.
        outfile: writable text stream; receives a version header followed by
            one ``"<first> <second>"`` line per learned merge.
        num_symbols: maximum number of merge operations to learn.
        min_frequency: stop as soon as the best pair is rarer than this.
        verbose: if true, report every merge on standard error.
        is_dict: interpret ``infile`` as a word-count dictionary.
        total_symbols: if true, reduce ``num_symbols`` by the number of
            distinct word-internal and word-final characters.
        num_workers: forwarded to ``get_vocabulary``.
    """

    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')

    vocab = get_vocabulary(infile, is_dict, num_workers)
    # represent each word as a tuple of characters, the last one carrying
    # the end-of-word marker
    vocab = {tuple(word[:-1]) + (word[-1] + '</w>',): freq for (word, freq) in vocab.items()}
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)

    if total_symbols:
        uniq_char_internal = set()
        uniq_char_final = set()
        for word in vocab:
            for char in word[:-1]:
                uniq_char_internal.add(char)
            uniq_char_final.add(word[-1])
        sys.stderr.write('Number of word-internal characters: {0}\n'.format(len(uniq_char_internal)))
        sys.stderr.write('Number of word-final characters: {0}\n'.format(len(uniq_char_final)))
        sys.stderr.write('Reducing number of merge operations by {0}\n'.format(len(uniq_char_internal) + len(uniq_char_final)))
        num_symbols -= len(uniq_char_internal) + len(uniq_char_final)

    if not stats:
        # Empty input, or only one-symbol words: there is no pair to merge,
        # and max() below would raise ValueError on the empty statistics.
        sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
        return

    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        # prune periodically to keep the working statistics small
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
328
+
329
+
330
if __name__ == '__main__':

    # Emit a deprecation warning when a 'subword_nmt' directory exists next
    # to this script (i.e. the script is run from its legacy location).
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: force UTF-8 on the standard streams
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    # a non-positive worker count means "use every available CPU"
    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    if sys.version_info < (3, 0) and args.num_workers > 1:
        args.num_workers = 1
        warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    # read/write files as UTF-8
    # argparse.FileType has already opened these paths; close those handles
    # before reopening with an explicit encoding so they are not leaked.
    if args.input.name != '<stdin>':
        input_path = args.input.name
        args.input.close()
        args.input = codecs.open(input_path, encoding='utf-8')
    if args.output.name != '<stdout>':
        output_path = args.output.name
        args.output.close()
        args.output = codecs.open(output_path, 'w', encoding='utf-8')

    try:
        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols, num_workers=args.num_workers)
    finally:
        # close files, even when learn_bpe() raised
        if args.input.name != '<stdin>':
            args.input.close()
        if args.output.name != '<stdout>':
            args.output.close()
subword/learn_joint_bpe_and_vocab.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Author: Rico Sennrich
4
+
5
+ """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
6
+ This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus),
7
+ applies the learned operation to each and (optionally) returns the resulting vocabulary of each text.
8
+ The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text.
9
+
10
+ Reference:
11
+ Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
12
+ Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
13
+ """
14
+
15
+ from __future__ import unicode_literals
16
+
17
+ import sys
18
+ import os
19
+ import inspect
20
+ import codecs
21
+ import argparse
22
+ import tempfile
23
+ import warnings
24
+ from collections import Counter
25
+ from multiprocessing import cpu_count
26
+
27
+ #hack to get imports working if running this as a script, or within a package
28
+ if __name__ == '__main__':
29
+ import learn_bpe
30
+ import apply_bpe
31
+ else:
32
+ from . import learn_bpe
33
+ from . import apply_bpe
34
+
35
+ # hack for python2/3 compatibility
36
+ from io import open
37
+ argparse.open = open
38
+
39
def create_parser(subparsers=None):
    """Create the command-line parser for joint BPE and vocabulary learning.

    When *subparsers* is truthy, the parser is registered on it as the
    ``learn-joint-bpe-and-vocab`` sub-command; otherwise a standalone
    ``ArgumentParser`` is built.  ``--input``, ``--output`` and
    ``--write-vocabulary`` are mandatory.

    Returns the configured parser.
    """
    parser_kwargs = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")
    if subparsers:
        parser = subparsers.add_parser('learn-joint-bpe-and-vocab', **parser_kwargs)
    else:
        parser = argparse.ArgumentParser(**parser_kwargs)

    add = parser.add_argument

    add('--input', '-i', type=argparse.FileType('r'), required=True, nargs='+',
        metavar='PATH',
        help="Input texts (multiple allowed).")
    add('--output', '-o', type=argparse.FileType('w'), required=True,
        metavar='PATH',
        help="Output file for BPE codes.")
    add('--symbols', '-s', type=int, default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")
    add('--separator', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    # stored on the namespace as ``vocab``
    add('--write-vocabulary', type=argparse.FileType('w'), required=True, nargs='+', default=None,
        metavar='PATH', dest='vocab',
        help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py')
    add('--min-frequency', type=int, default=2, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    add('--total-symbols', '-t', action="store_true",
        help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")
    add('--num-workers', type=int, default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")
    add('--verbose', '-v', action="store_true",
        help="verbose mode.")

    return parser
82
+
83
def learn_joint_bpe_and_vocab(args):
    """Learn BPE codes on all inputs jointly, then write one vocabulary per input.

    *args* is a namespace providing ``input``, ``vocab`` and ``output``
    (objects exposing ``name``), plus ``symbols``, ``min_frequency``,
    ``verbose``, ``total_symbols``, ``separator`` and ``num_workers``.

    Steps: the word counts of all inputs are summed, BPE codes are learned
    on the combined counts and written to ``args.output.name``, the codes
    are applied to every input through a temporary file, and the word
    counts of each segmented text are written to the matching vocabulary
    file, most frequent first.

    Exits with status 1 if the numbers of input and vocabulary files differ.
    """
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(handle.name, encoding='UTF-8') for handle in args.input]
    args.vocab = [codecs.open(handle.name, 'w', encoding='UTF-8') for handle in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for corpus in args.input:
        full_vocab += learn_bpe.get_vocabulary(corpus, num_workers=args.num_workers)
        corpus.seek(0)

    # serialise as "word count" lines, the dictionary format learn_bpe reads
    vocab_list = []
    for key, freq in full_vocab.items():
        vocab_list.append('{0} {1}'.format(key, freq))

    # learn BPE on combined vocabulary
    codes_path = args.output.name
    with codecs.open(codes_path, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)

    # NOTE(review): apply_bpe.BPE is defined outside this file; presumably it
    # loads the merge operations from the codes file -- confirm.
    with codecs.open(codes_path, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        segmented_path = tmp.name

        tmpout = codecs.open(segmented_path, 'w', encoding='UTF-8')
        train_file.seek(0)
        bpe.process_lines(train_file.name, tmpout, num_workers=args.num_workers)
        tmpout.close()

        tmpin = codecs.open(segmented_path, encoding='UTF-8')
        vocab = learn_bpe.get_vocabulary(tmpin, num_workers=args.num_workers)
        tmpin.close()
        os.remove(segmented_path)

        ranked = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
        for key, freq in ranked:
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()
        vocab_file.close()
130
+
131
+
132
if __name__ == '__main__':

    # Emit a deprecation warning when a 'subword_nmt' directory exists next
    # to this script (i.e. the script is run from its legacy location).
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: force UTF-8 on the standard streams
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    # a non-positive worker count means "use every available CPU"
    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    if sys.version_info < (3, 0):
        args.separator = args.separator.decode('UTF-8')
        if args.num_workers > 1:
            args.num_workers = 1
            warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    # No assert on the file counts here: learn_joint_bpe_and_vocab() checks
    # them itself and exits with a readable error message, whereas an assert
    # would pre-empt that message and is stripped under ``python -O``.
    learn_joint_bpe_and_vocab(args)
subword/segment_char_ngrams.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Author: Rico Sennrich
4
+
5
+ from __future__ import unicode_literals, division
6
+
7
+ import sys
8
+ import codecs
9
+ import argparse
10
+
11
+ # hack for python2/3 compatibility
12
+ from io import open
13
+ argparse.open = open
14
+
15
def create_parser(subparsers=None):
    """Build the command-line parser for character n-gram segmentation.

    Args:
        subparsers: optional object returned by
            ``ArgumentParser.add_subparsers()``. When given (and truthy) the
            parser is registered on it as the 'segment-char-ngrams'
            sub-command; otherwise a standalone ``ArgumentParser`` is created.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    description = "segment rare words into character n-grams"

    if subparsers:
        parser = subparsers.add_parser(
            'segment-char-ngrams',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)
    else:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=description)

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    parser.add_argument(
        '--vocab', type=argparse.FileType('r'), metavar='PATH',
        required=True,
        help="Vocabulary file.")
    # The three help strings below used to end in an unbalanced ')'.
    parser.add_argument(
        '--shortlist', type=int, metavar='INT', default=0,
        help="do not segment INT most frequent words in vocabulary (default: '%(default)s').")
    parser.add_argument(
        '-n', type=int, metavar='INT', default=2,
        help="segment rare words into character n-grams of size INT (default: '%(default)s').")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    parser.add_argument(
        '--separator', '-s', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")

    return parser
49
+
50
def segment_char_ngrams(args):
    """Segment every word outside the shortlist into character n-grams.

    Reads ``args.input`` line by line, splits each line on whitespace and
    writes the result to ``args.output``. Every emitted token is followed by
    a single space and every input line ends with a newline.

    Args:
        args: namespace providing
            ``vocab``     - iterable of lines; only lines with exactly two
                            whitespace-separated fields are used, and the
                            first field is taken as the word. The position of
                            a line among those valid lines is its rank.
            ``input``     - iterable of text lines to segment.
            ``output``    - object with a ``write`` method.
            ``shortlist`` - number of top-ranked vocabulary words that are
                            copied unsegmented.
            ``n``         - n-gram size; must be at least 1.
            ``separator`` - marker appended to every non-final n-gram.

    Raises:
        ValueError: if ``args.n`` is smaller than 1 (the segmentation loop
            could never advance through a word).
    """
    if args.n < 1:
        raise ValueError("n-gram size must be at least 1, got {0}".format(args.n))

    # Rank words by their position among the well-formed vocabulary lines;
    # a repeated word keeps the rank of its last occurrence.
    ranked = [fields[0]
              for fields in (line.split() for line in args.vocab)
              if len(fields) == 2]
    vocab = dict((word, rank) for rank, word in enumerate(ranked))

    # Non-final n-grams end with the separator and are space-delimited.
    glue = args.separator + ' '

    for line in args.input:
        for word in line.split():
            # Ranks start at 0, so exactly `shortlist` words satisfy
            # rank < shortlist (the previous `>` comparison exempted one
            # word too many, including with the default shortlist of 0).
            if word in vocab and vocab[word] < args.shortlist:
                args.output.write(word + ' ')
                continue
            pieces = [word[start:start + args.n]
                      for start in range(0, len(word), args.n)]
            args.output.write(glue.join(pieces) + ' ')
        args.output.write('\n')
68
+
69
+
70
# Script entry point: set up UTF-8 standard streams, parse the command line,
# reopen the named files as UTF-8 and run the segmentation.
if __name__ == '__main__':

    # python 2/3 compatibility
    # Python 2 wraps the standard streams directly, Python 3 wraps their
    # underlying byte buffers.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    # Only on Python 2: decode the separator from UTF-8.
    if sys.version_info < (3, 0):
        args.separator = args.separator.decode('UTF-8')

    # read/write files as UTF-8
    # The handles created by argparse are replaced by codecs handles opened on
    # the same file name; handles named '<stdin>' / '<stdout>' are kept as is.
    args.vocab = codecs.open(args.vocab.name, encoding='utf-8')
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

    segment_char_ngrams(args)
subword/subword_nmt.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import io
5
+ import sys
6
+ import codecs
7
+ import argparse
8
+
9
+ from .learn_bpe import learn_bpe
10
+ from .apply_bpe import BPE, read_vocabulary
11
+ from .get_vocab import get_vocab
12
+ from .learn_joint_bpe_and_vocab import learn_joint_bpe_and_vocab
13
+
14
+ from .learn_bpe import create_parser as create_learn_bpe_parser
15
+ from .apply_bpe import create_parser as create_apply_bpe_parser
16
+ from .get_vocab import create_parser as create_get_vocab_parser
17
+ from .learn_joint_bpe_and_vocab import create_parser as create_learn_joint_bpe_and_vocab_parser
18
+
19
+ # hack for python2/3 compatibility
20
+ argparse.open = io.open
21
+
22
def main():
    """Parse the command line and run the selected subword-nmt sub-command.

    Registers the 'learn-bpe', 'apply-bpe', 'get-vocab' and
    'learn-joint-bpe-and-vocab' sub-parsers and dispatches on the parsed
    ``command``. For the first three commands, named input/output files are
    reopened through ``codecs`` as UTF-8; handles whose name is '<stdin>' or
    '<stdout>' are used unchanged.

    Raises:
        Exception: if the parsed command is not one of the four sub-commands
            (for example when none was given).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation ")
    subparsers = parser.add_subparsers(dest='command',
                                       help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    # Each factory registers its sub-command on `subparsers`; the returned
    # parser objects are not needed here.
    create_learn_bpe_parser(subparsers)
    create_apply_bpe_parser(subparsers)
    create_get_vocab_parser(subparsers)
    create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose,
                  is_dict=args.dict_input, total_symbols=args.total_symbols)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

        # The vocabulary filter is optional.
        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None

        if sys.version_info < (3, 0):
            args.separator = args.separator.decode('UTF-8')
            if args.glossaries:
                args.glossaries = [g.decode('UTF-8') for g in args.glossaries]

        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

        for line in args.input:
            args.output.write(bpe.process_line(line, args.dropout))

    elif args.command == 'get-vocab':
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        # Decode the separator BEFORE running the workflow; decoding it after
        # the call (as was done previously) had no effect on the result.
        if sys.version_info < (3, 0):
            args.separator = args.separator.decode('UTF-8')
        learn_joint_bpe_and_vocab(args)
    else:
        raise Exception('Invalid command provided')
87
+
88
+
89
# python 2/3 compatibility
# Runs at module level (on import): rebinds sys.stderr / sys.stdout / sys.stdin
# to UTF-8 codec wrappers. Python 2 wraps the stream objects themselves,
# Python 3 wraps their underlying byte buffers.
if sys.version_info < (3, 0):
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
else:
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
subword/tests/__init__.py ADDED
File without changes
subword/tests/data/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ bpe.out
subword/tests/data/bpe.ref ADDED
@@ -0,0 +1,1001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2
2
+ t h
3
+ th e</w>
4
+ i n
5
+ a n
6
+ e r
7
+ r e
8
+ o r
9
+ t i
10
+ a r
11
+ an d</w>
12
+ e n
13
+ o f</w>
14
+ o u
15
+ o n
16
+ t o</w>
17
+ o n</w>
18
+ i s</w>
19
+ e d</w>
20
+ in g</w>
21
+ a l
22
+ i n</w>
23
+ e r</w>
24
+ i t
25
+ s t
26
+ e s</w>
27
+ a t
28
+ o r</w>
29
+ a t</w>
30
+ r o
31
+ i c
32
+ o m
33
+ e s
34
+ i l
35
+ e n</w>
36
+ o u</w>
37
+ a s
38
+ a s</w>
39
+ e l
40
+ u s
41
+ a n</w>
42
+ e c
43
+ i s
44
+ o s
45
+ a c
46
+ ti on</w>
47
+ y ou</w>
48
+ o t
49
+ f or</w>
50
+ w h
51
+ i t</w>
52
+ a l</w>
53
+ v e</w>
54
+ p l
55
+ a p
56
+ s h
57
+ o l
58
+ d i
59
+ th e
60
+ q u
61
+ th at</w>
62
+ e t
63
+ m a
64
+ ar e</w>
65
+ al l</w>
66
+ th is</w>
67
+ c om
68
+ c h
69
+ r i
70
+ u n
71
+ en t</w>
72
+ b e</w>
73
+ b l
74
+ n o
75
+ a m
76
+ e v
77
+ c e</w>
78
+ @ -
79
+ @- @</w>
80
+ f or
81
+ s i
82
+ u r
83
+ l o
84
+ it h</w>
85
+ er s</w>
86
+ t s</w>
87
+ ou r</w>
88
+ w ith</w>
89
+ re s
90
+ h a
91
+ p ro
92
+ qu ot
93
+ quot ;</w>
94
+ & quot;</w>
95
+ e m
96
+ ti on
97
+ a d
98
+ l y</w>
99
+ e t</w>
100
+ b e
101
+ or d
102
+ c on
103
+ er e</w>
104
+ i g
105
+ n e
106
+ a y</w>
107
+ ro m</w>
108
+ f rom</w>
109
+ b u
110
+ n d</w>
111
+ ap os
112
+ & apos
113
+ o w
114
+ i r
115
+ w or
116
+ b y</w>
117
+ a tion</w>
118
+ o p
119
+ &apos ;
120
+ f f
121
+ t r
122
+ l i
123
+ s u
124
+ y our</w>
125
+ no t</w>
126
+ the y</w>
127
+ ic h</w>
128
+ s p
129
+ c an</w>
130
+ ou t</w>
131
+ e x
132
+ e ar
133
+ l d</w>
134
+ d e
135
+ v er
136
+ t a
137
+ g e</w>
138
+ wh ich</w>
139
+ d s</w>
140
+ bl e</w>
141
+ p ar
142
+ on e</w>
143
+ a y
144
+ w il
145
+ in g
146
+ d at
147
+ t er</w>
148
+ t er
149
+ ha ve</w>
150
+ sh all</w>
151
+ tion s</w>
152
+ m an
153
+ it y</w>
154
+ d e</w>
155
+ wil l</w>
156
+ p a
157
+ o d</w>
158
+ & #
159
+ th er</w>
160
+ c l
161
+ . .
162
+ .. .</w>
163
+ u l
164
+ es s</w>
165
+ 0 0
166
+ i f</w>
167
+ a b
168
+ h e</w>
169
+ ou ld</w>
170
+ i r</w>
171
+ c h</w>
172
+ t h</w>
173
+ r a
174
+ m er
175
+ 1 2
176
+ p u
177
+ A nd</w>
178
+ un to</w>
179
+ s it
180
+ res s</w>
181
+ p e
182
+ h t</w>
183
+ en ts</w>
184
+ 4 ;</w>
185
+ 12 4;</w>
186
+ &# 124;</w>
187
+ ing s</w>
188
+ h ol
189
+ v er</w>
190
+ m e</w>
191
+ w e</w>
192
+ s o</w>
193
+ re e</w>
194
+ m y</w>
195
+ u p
196
+ k e</w>
197
+ i d
198
+ at ed</w>
199
+ us e</w>
200
+ m ent</w>
201
+ &apos; s</w>
202
+ es t</w>
203
+ a r</w>
204
+ P ress</w>
205
+ ou n
206
+ h o
207
+ for e</w>
208
+ f il
209
+ d ow
210
+ al l
211
+ at e</w>
212
+ t ed</w>
213
+ p er
214
+ h is</w>
215
+ er e
216
+ as e</w>
217
+ the ir</w>
218
+ p or
219
+ I C
220
+ th ere</w>
221
+ t o
222
+ is h</w>
223
+ 2 00
224
+ r ou
225
+ m e
226
+ ec om
227
+ h i
228
+ as t</w>
229
+ wor k</w>
230
+ w as</w>
231
+ sit es</w>
232
+ f t
233
+ u m
234
+ in e</w>
235
+ a ti
236
+ ri bu
237
+ or e</w>
238
+ g l
239
+ c at</w>
240
+ a ble</w>
241
+ IC E
242
+ ICE cat</w>
243
+ g i
244
+ am e</w>
245
+ ac c
246
+ u d
247
+ st r
248
+ s o
249
+ pl e</w>
250
+ mer ce</w>
251
+ k s</w>
252
+ g o
253
+ ev en</w>
254
+ c re
255
+ y st
256
+ us t</w>
257
+ or s</w>
258
+ ic e</w>
259
+ h as</w>
260
+ ecom merce</w>
261
+ c i
262
+ no w</w>
263
+ a v
264
+ m ents</w>
265
+ a d</w>
266
+ us ing</w>
267
+ s t</w>
268
+ man y</w>
269
+ ma y</w>
270
+ k ing</w>
271
+ ev er
272
+ ere fore</w>
273
+ di st
274
+ y e</w>
275
+ u t
276
+ ti me</w>
277
+ s e
278
+ re n
279
+ os e</w>
280
+ o ther</w>
281
+ m ore</w>
282
+ e st
283
+ s er
284
+ s el
285
+ re c
286
+ p h
287
+ lo c
288
+ l ic
289
+ in ce</w>
290
+ en s
291
+ bu t</w>
292
+ ar y</w>
293
+ an t</w>
294
+ G od</w>
295
+ s yst
296
+ s om
297
+ l e
298
+ f ree</w>
299
+ dist ribu
300
+ an s
301
+ a g
302
+ W ord
303
+ p ur
304
+ en t
305
+ d o
306
+ ar t
307
+ al so</w>
308
+ w e
309
+ v i
310
+ s a
311
+ ri g
312
+ ne w</w>
313
+ l and</w>
314
+ b o
315
+ w ere</w>
316
+ u c
317
+ n ing</w>
318
+ m ig
319
+ i c</w>
320
+ f ir
321
+ es e</w>
322
+ em s</w>
323
+ e l</w>
324
+ d o</w>
325
+ b r
326
+ as ed</w>
327
+ ab out</w>
328
+ E n
329
+ th ings</w>
330
+ lic ens
331
+ it s</w>
332
+ i m
333
+ g r
334
+ dat a</w>
335
+ y e
336
+ up on</w>
337
+ s ti
338
+ or d</w>
339
+ in s</w>
340
+ con t
341
+ w i
342
+ us ed</w>
343
+ si on</w>
344
+ p os
345
+ ou nd</w>
346
+ l a
347
+ f e
348
+ es s
349
+ com m
350
+ L ord</w>
351
+ 1 9
352
+ the m</w>
353
+ th ese</w>
354
+ on ly</w>
355
+ is h
356
+ in cl
357
+ et c</w>
358
+ el s</w>
359
+ el l</w>
360
+ c ol
361
+ c o
362
+ ac h</w>
363
+ a m</w>
364
+ a il
365
+ u l</w>
366
+ th ou</w>
367
+ ou r
368
+ n lo
369
+ in to</w>
370
+ i es</w>
371
+ hi m</w>
372
+ dow nlo
373
+ di z</w>
374
+ d er
375
+ al ly</w>
376
+ ac e</w>
377
+ Word Press</w>
378
+ som e</w>
379
+ s ince</w>
380
+ re m
381
+ pe o
382
+ peo ple</w>
383
+ pa in</w>
384
+ os t</w>
385
+ on s</w>
386
+ n o</w>
387
+ i ma
388
+ ho w</w>
389
+ for ma
390
+ en d
391
+ ad ing</w>
392
+ a re
393
+ S pain</w>
394
+ O p
395
+ u s</w>
396
+ por t</w>
397
+ ou s
398
+ in ter
399
+ ha d</w>
400
+ h ere</w>
401
+ en ti
402
+ be en</w>
403
+ ay s</w>
404
+ ur e</w>
405
+ t e
406
+ sh ould</w>
407
+ ser v
408
+ p re
409
+ l ay
410
+ g re
411
+ ff er
412
+ b ased</w>
413
+ ap art
414
+ a diz</w>
415
+ C h
416
+ C adiz</w>
417
+ w ould</w>
418
+ w are</w>
419
+ ver y</w>
420
+ u p</w>
421
+ syst ems</w>
422
+ o st
423
+ loc ated</w>
424
+ incl ud
425
+ hol d</w>
426
+ gl ish</w>
427
+ forma tion</w>
428
+ f in
429
+ en d</w>
430
+ d ev
431
+ ar k
432
+ Q u
433
+ Op en</w>
434
+ En glish</w>
435
+ wh o</w>
436
+ u ro
437
+ t ing</w>
438
+ su p
439
+ o re
440
+ n ess</w>
441
+ in formation</w>
442
+ g et</w>
443
+ f i
444
+ ec t</w>
445
+ b ec
446
+ ar d</w>
447
+ an ds</w>
448
+ an ce</w>
449
+ E uro
450
+ u e</w>
451
+ ord er</w>
452
+ id ay</w>
453
+ ic tion
454
+ ft ware</w>
455
+ f ul</w>
456
+ d is
457
+ at h</w>
458
+ a tions</w>
459
+ L u
460
+ wh en</w>
461
+ w ay</w>
462
+ t e</w>
463
+ sh e
464
+ pur ch
465
+ on g</w>
466
+ m ust</w>
467
+ fir st</w>
468
+ fil e</w>
469
+ em b
470
+ e p
471
+ e di
472
+ an g
473
+ ye a</w>
474
+ t ors</w>
475
+ st ati
476
+ stati sti
477
+ re s</w>
478
+ purch ase</w>
479
+ m ost</w>
480
+ m en</w>
481
+ m an</w>
482
+ l a</w>
483
+ it e</w>
484
+ i l</w>
485
+ h erefore</w>
486
+ fil es</w>
487
+ f t</w>
488
+ f a
489
+ an c
490
+ I n
491
+ w ell</w>
492
+ ti c
493
+ s ec
494
+ par is
495
+ p res
496
+ o ff
497
+ l in
498
+ ima ge</w>
499
+ iction ary</w>
500
+ i z
501
+ h op
502
+ h el
503
+ h e
504
+ g h</w>
505
+ f l
506
+ e d
507
+ com paris
508
+ a use</w>
509
+ P S
510
+ A S
511
+ v al
512
+ statisti c</w>
513
+ so ftware</w>
514
+ she et</w>
515
+ o k</w>
516
+ o g
517
+ m is
518
+ j o
519
+ hop s</w>
520
+ hol iday</w>
521
+ h ear
522
+ go od</w>
523
+ g o</w>
524
+ f e</w>
525
+ es hops</w>
526
+ en ce</w>
527
+ e i
528
+ downlo ading</w>
529
+ distribu tors</w>
530
+ di ffer
531
+ d ay</w>
532
+ comparis on</w>
533
+ an y</w>
534
+ am il
535
+ a ge</w>
536
+ a f
537
+ P s</w>
538
+ P H
539
+ N A</w>
540
+ AS Ps</w>
541
+ 6 8
542
+ v ing</w>
543
+ th y</w>
544
+ su ch</w>
545
+ pu bl
546
+ ord ing</w>
547
+ l ine</w>
548
+ i d</w>
549
+ gre at</w>
550
+ for m
551
+ f ul
552
+ ever y</w>
553
+ el y</w>
554
+ d et
555
+ d es
556
+ ch o
557
+ c oun
558
+ c ity</w>
559
+ be hold</w>
560
+ all ed</w>
561
+ W herefore</w>
562
+ PH P</w>
563
+ P r
564
+ wor ld</w>
565
+ wi th
566
+ wh at</w>
567
+ w r
568
+ w at
569
+ tion al</w>
570
+ si m
571
+ ren t</w>
572
+ p r
573
+ ord s</w>
574
+ o b
575
+ no w
576
+ mig ht</w>
577
+ m u
578
+ f amil
579
+ e as
580
+ d ing</w>
581
+ bec ause</w>
582
+ ark X
583
+ arkX Press</w>
584
+ acc ording</w>
585
+ a u
586
+ Qu arkXPress</w>
587
+ M edi
588
+ C om
589
+ 0 0</w>
590
+ w s</w>
591
+ us ers</w>
592
+ ti es</w>
593
+ th ing</w>
594
+ se e</w>
595
+ p ri
596
+ o m</w>
597
+ o c
598
+ l l</w>
599
+ k e
600
+ ic es</w>
601
+ em ent</w>
602
+ ec i
603
+ e p</w>
604
+ e m</w>
605
+ d uc
606
+ d er</w>
607
+ ar i
608
+ am p
609
+ af ter</w>
610
+ Medi a</w>
611
+ &apos; t</w>
612
+ ver sion</w>
613
+ v es</w>
614
+ u res</w>
615
+ u m</w>
616
+ ta r</w>
617
+ rig ht</w>
618
+ rig h
619
+ par t
620
+ ow n</w>
621
+ or y</w>
622
+ o ver</w>
623
+ o s</w>
624
+ o k
625
+ mu ch</w>
626
+ k now
627
+ in st
628
+ ig h
629
+ g en
630
+ ex c
631
+ differ ent</w>
632
+ d en</w>
633
+ ap p
634
+ ans a</w>
635
+ al lo
636
+ S tar</w>
637
+ Lu f
638
+ L NA</w>
639
+ D LNA</w>
640
+ 1 9</w>
641
+ y p
642
+ w ords</w>
643
+ v is
644
+ v en</w>
645
+ u r</w>
646
+ th ansa</w>
647
+ si d
648
+ sel f</w>
649
+ re n</w>
650
+ pu ter</w>
651
+ pl o
652
+ p ow
653
+ ot h</w>
654
+ n i
655
+ licens e</w>
656
+ li ke</w>
657
+ l ear
658
+ k now</w>
659
+ in ut
660
+ il e</w>
661
+ f ore
662
+ et s</w>
663
+ emb er</w>
664
+ d ec
665
+ cont ent</w>
666
+ com e</w>
667
+ c alled</w>
668
+ av ail
669
+ ar ound</w>
670
+ an d
671
+ O ff
672
+ Luf thansa</w>
673
+ F or
674
+ A l
675
+ w o</w>
676
+ up dat
677
+ u t</w>
678
+ u g
679
+ ti ve</w>
680
+ ta ke</w>
681
+ str uc
682
+ sid enti
683
+ s et</w>
684
+ s e</w>
685
+ s ame</w>
686
+ rec ei
687
+ re ad
688
+ pro duc
689
+ pl ay
690
+ p dat
691
+ ou s</w>
692
+ o l</w>
693
+ n al</w>
694
+ m at
695
+ ish ed</w>
696
+ ir it</w>
697
+ in ed</w>
698
+ i um</w>
699
+ h ot
700
+ g in
701
+ g ht</w>
702
+ f un
703
+ com pl
704
+ c ur
705
+ avail able</w>
706
+ a ir
707
+ W in
708
+ U pdat
709
+ wor ks</w>
710
+ with out</w>
711
+ un g</w>
712
+ tr ans
713
+ th ose</w>
714
+ th an</w>
715
+ sp on
716
+ sp eci
717
+ pro c
718
+ pa ge</w>
719
+ on al</w>
720
+ o ds</w>
721
+ ma de</w>
722
+ m es</w>
723
+ includ ed</w>
724
+ in i
725
+ ig n</w>
726
+ fe at
727
+ el l
728
+ ec ts</w>
729
+ ear s</w>
730
+ e w</w>
731
+ e Star</w>
732
+ dow s</w>
733
+ be fore</w>
734
+ b et
735
+ at or</w>
736
+ an s</w>
737
+ al s</w>
738
+ Win dows</w>
739
+ Updat eStar</w>
740
+ F ra
741
+ ä sidenti
742
+ äsidenti n</w>
743
+ ä ft
744
+ äft s
745
+ äfts ord
746
+ äftsord n
747
+ äftsordn ung</w>
748
+ z ur</w>
749
+ v id
750
+ um b
751
+ u plo
752
+ th rou
753
+ t yp
754
+ t wo</w>
755
+ spon s
756
+ si ble</w>
757
+ s m
758
+ rem ium</w>
759
+ re p
760
+ re gi
761
+ r e</w>
762
+ pow er</w>
763
+ per s
764
+ p an
765
+ or ing</w>
766
+ op en</w>
767
+ o w</w>
768
+ n ec
769
+ mig al</w>
770
+ is t</w>
771
+ ha ving</w>
772
+ h ath</w>
773
+ gi ven</w>
774
+ ev er</w>
775
+ et h</w>
776
+ es ch
777
+ esch äftsordnung</w>
778
+ en ter</w>
779
+ e a
780
+ con ta
781
+ com man
782
+ ch il
783
+ c or
784
+ c ap
785
+ b oth</w>
786
+ ati ve</w>
787
+ apart ments</w>
788
+ apart ment</w>
789
+ ad a</w>
790
+ S er
791
+ Pr äsidentin</w>
792
+ PS D</w>
793
+ H ot
794
+ G eschäftsordnung</w>
795
+ Fra u</w>
796
+ For migal</w>
797
+ C al
798
+ 2 .
799
+ 1 1</w>
800
+ y ears</w>
801
+ wh erefore</w>
802
+ u st
803
+ throu gh</w>
804
+ th en</w>
805
+ t l
806
+ t en</w>
807
+ sh al
808
+ shal t</w>
809
+ s ou
810
+ res t</w>
811
+ recei ve</w>
812
+ r u
813
+ ot ter
814
+ mer ci
815
+ ma ke</w>
816
+ m s</w>
817
+ m o
818
+ la w</w>
819
+ k et</w>
820
+ j ust</w>
821
+ ic k</w>
822
+ g rou
823
+ fun c
824
+ fore ver</w>
825
+ fin d</w>
826
+ f ace</w>
827
+ ear ch</w>
828
+ e ds</w>
829
+ e al
830
+ distribu tion</w>
831
+ d ays</w>
832
+ comman d
833
+ chil d
834
+ br ands</w>
835
+ bl ess
836
+ be gin
837
+ am ong</w>
838
+ am es</w>
839
+ ac t</w>
840
+ a in</w>
841
+ a bl
842
+ T h
843
+ P remium</w>
844
+ D e
845
+ wat ers</w>
846
+ v o
847
+ u es</w>
848
+ ti v
849
+ t y</w>
850
+ t ur
851
+ sup port</w>
852
+ spons oring</w>
853
+ r on
854
+ r an
855
+ qu i
856
+ pl ug
857
+ par t</w>
858
+ p as
859
+ otter y</w>
860
+ n or</w>
861
+ n er</w>
862
+ n ed</w>
863
+ m ine</w>
864
+ l ast</w>
865
+ it ed</w>
866
+ inut e</w>
867
+ in d
868
+ il li
869
+ ic ation</w>
870
+ gen er
871
+ g es</w>
872
+ g e
873
+ g al</w>
874
+ famil y</w>
875
+ f ol
876
+ f f</w>
877
+ er y</w>
878
+ er nal</w>
879
+ el i
880
+ d ra
881
+ cho ose</w>
882
+ child ren</w>
883
+ c at
884
+ be ach</w>
885
+ as es</w>
886
+ Off ers</w>
887
+ M inute</w>
888
+ L e
889
+ L ast</w>
890
+ G ods</w>
891
+ G er
892
+ D ictionary</w>
893
+ Cal a</w>
894
+ B o
895
+ 6 3
896
+ 1 5</w>
897
+ wr it
898
+ wh ile</w>
899
+ w ar
900
+ val ue</w>
901
+ v ed</w>
902
+ v ari
903
+ u al</w>
904
+ tr an
905
+ to ol</w>
906
+ t ri
907
+ t en
908
+ st ing</w>
909
+ s ed</w>
910
+ s ay</w>
911
+ re d</w>
912
+ pl e
913
+ on g
914
+ ol d</w>
915
+ n ers</w>
916
+ n a
917
+ merci al</w>
918
+ me di
919
+ m on
920
+ lo ok</w>
921
+ l et</w>
922
+ j ada</w>
923
+ ic i
924
+ hel p</w>
925
+ feat ures</w>
926
+ en tr
927
+ en c
928
+ eas y</w>
929
+ ear th</w>
930
+ d on</w>
931
+ con nec
932
+ ch ar
933
+ c ould</w>
934
+ be ing</w>
935
+ b ac
936
+ ar k</w>
937
+ amp ;</w>
938
+ a in
939
+ P y
940
+ H ost
941
+ A n
942
+ 2 0</w>
943
+ & amp;</w>
944
+ ye ar</w>
945
+ w ing</w>
946
+ w ant</w>
947
+ w a
948
+ v ers</w>
949
+ us er</w>
950
+ ur ing</w>
951
+ updat es</w>
952
+ ti mes</w>
953
+ t re
954
+ t ly</w>
955
+ syst em</w>
956
+ sp ea
957
+ sit e</w>
958
+ sim pl
959
+ sa id</w>
960
+ s k
961
+ s et
962
+ re v
963
+ re l
964
+ re f
965
+ pu t</w>
966
+ pro g
967
+ pl ace</w>
968
+ pe an</w>
969
+ p ho
970
+ pho to</w>
971
+ p at
972
+ oun t</w>
973
+ ot e</w>
974
+ or t</w>
975
+ og y</w>
976
+ ne y</w>
977
+ ne es</w>
978
+ ne eds</w>
979
+ ne ed</w>
980
+ n umb
981
+ n ame</w>
982
+ lay ers</w>
983
+ l l
984
+ k en</w>
985
+ ic al</w>
986
+ i a</w>
987
+ ful l</w>
988
+ fi ed</w>
989
+ fe w</w>
990
+ et y</w>
991
+ est s</w>
992
+ es si
993
+ dow n</w>
994
+ do m</w>
995
+ det ail
996
+ dat ab
997
+ d ictionary</w>
998
+ con f
999
+ com mercial</w>
1000
+ c a</w>
1001
+ b re
subword/tests/data/corpus.bpe.ref.en ADDED
The diff for this file is too large to render. See raw diff
 
subword/tests/data/corpus.en ADDED
The diff for this file is too large to render. See raw diff
 
subword/tests/test_bpe.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from __future__ import unicode_literals
5
+ import unittest
6
+ import codecs
7
+
8
+ import os,sys,inspect
9
+ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
10
+ parentdir = os.path.dirname(currentdir)
11
+ sys.path.insert(0,parentdir)
12
+
13
+ from learn_bpe import learn_bpe
14
+ from apply_bpe import BPE
15
+
16
+
17
class TestBPELearnMethod(unittest.TestCase):
    """Regression test for learn_bpe() against the stored reference output."""

    def test_learn_bpe(self):
        """Learning on data/corpus.en with a symbol count of 1000 reproduces data/bpe.ref."""
        data_dir = os.path.join(currentdir, 'data')
        out_path = os.path.join(data_dir, 'bpe.out')

        # Context managers close the files even if learn_bpe raises.
        with codecs.open(os.path.join(data_dir, 'corpus.en'), encoding='utf-8') as infile:
            with codecs.open(out_path, 'w', encoding='utf-8') as outfile:
                learn_bpe(infile, outfile, 1000)

        # Read both files explicitly as UTF-8 (the reference contains
        # non-ASCII symbols) instead of relying on the locale encoding.
        with codecs.open(out_path, encoding='utf-8') as outlines:
            learned = list(outlines)
        with codecs.open(os.path.join(data_dir, 'bpe.ref'), encoding='utf-8') as reflines:
            expected = list(reflines)

        # zip() stops at the shorter sequence, so a truncated or empty output
        # would pass the line-by-line comparison; check the length first.
        self.assertEqual(len(learned), len(expected))
        for line, line2 in zip(learned, expected):
            self.assertEqual(line, line2)
34
+
35
class TestBPESegmentMethod(unittest.TestCase):
    """Tests for BPE.process_line() using the merge table in data/bpe.ref."""

    def setUp(self):
        # The codes file is closed as soon as the BPE object has been built.
        with codecs.open(os.path.join(currentdir, 'data', 'bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        self.infile = codecs.open(os.path.join(currentdir, 'data', 'corpus.en'), encoding='utf-8')
        self.reffile = codecs.open(os.path.join(currentdir, 'data', 'corpus.bpe.ref.en'), encoding='utf-8')

    def tearDown(self):
        self.infile.close()
        self.reffile.close()

    def test_apply_bpe(self):
        """Every corpus line segments to the corresponding reference line."""
        lines = list(self.infile)
        refs = list(self.reffile)

        # zip() stops at the shorter sequence; make sure no line on either
        # side is silently left out of the comparison.
        self.assertEqual(len(lines), len(refs))
        for line, ref in zip(lines, refs):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace"""

        orig = ' iron cement \n'
        exp = ' ir@@ on c@@ ement \n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as normal character, not word boundary"""

        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):
        """A line holding only a newline is returned unchanged."""

        orig = '\n'
        exp = '\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
81
+
82
# Run the test cases in this file when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
subword/tests/test_glossaries.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import unittest
5
+ import mock
6
+
7
+ import os,sys,inspect
8
+ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
9
+ parentdir = os.path.dirname(currentdir)
10
+ sys.path.insert(0,parentdir)
11
+
12
+ from apply_bpe import isolate_glossary, BPE
13
+
14
class TestIsolateGlossaryFunction(unittest.TestCase):
    """Checks isolate_glossary() with the single glossary entry 'like'."""

    def setUp(self):
        self.glossary = 'like'

    def _run_test_case(self, test_case):
        """Compare isolate_glossary(test_case[0]) with test_case[1]."""
        word, wanted = test_case
        self.assertEqual(isolate_glossary(word, self.glossary), wanted)

    def test_empty_string(self):
        self._run_test_case(('', ['']))

    def test_no_glossary(self):
        self._run_test_case(('word', ['word']))

    def test_isolated_glossary(self):
        self._run_test_case(('like', ['like']))

    def test_word_one_side(self):
        self._run_test_case(('likeword', ['like', 'word']))

    def test_words_both_sides(self):
        self._run_test_case(('wordlikeword', ['word', 'like', 'word']))

    def test_back_to_back_glossary(self):
        self._run_test_case(('likelike', ['like', 'like']))

    def test_multiple_glossaries(self):
        self._run_test_case(('wordlikewordlike', ['word', 'like', 'word', 'like']))
65
+
66
class TestBPEIsolateGlossariesMethod(unittest.TestCase):
    """Checks BPE._isolate_glossaries() with several literal glossary entries."""

    def setUp(self):
        # Stand-in for the codes file; readline() always returns 'something'.
        codes = mock.MagicMock()
        codes.readline.return_value = 'something'
        self.bpe = BPE(codes, glossaries=['like', 'Manuel', 'USA'])

    def _run_test_case(self, test_case):
        """Compare _isolate_glossaries(test_case[0]) with test_case[1]."""
        word, wanted = test_case
        self.assertEqual(self.bpe._isolate_glossaries(word), wanted)

    def test_multiple_glossaries(self):
        self._run_test_case((
            'wordlikeUSAwordManuelManuelwordUSA',
            ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA'],
        ))
85
+
86
class TestRegexIsolateGlossaries(unittest.TestCase):
    """Checks BPE._isolate_glossaries() with regular-expression glossary entries."""

    def setUp(self):
        # Stand-in for the codes file; readline() always returns 'something'.
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        # Raw strings: '\w' and '\d' are not valid escape sequences in an
        # ordinary string literal and trigger a warning on modern Python.
        # The pattern text itself is unchanged.
        glossaries = [r"<country>\w*</country>", r"<name>\w*</name>", r"\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        """Compare _isolate_glossaries(test_case[0]) with test_case[1]."""
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_regex_glossaries(self):
        orig = 'wordlike<country>USA</country>word10001word<name>Manuel</name>word<country>USA</country>'
        exp = ['wordlike', '<country>USA</country>', 'word', '10001', 'word', '<name>Manuel</name>', 'word', '<country>USA</country>']
        test_case = (orig, exp)
        self._run_test_case(test_case)
105
+
106
def encode_mock(segment, x2, x3, x4, x5, x6, x7, glosses, dropout):
    """Stand-in encoder: keep glossary matches whole, halve everything else.

    Returns a 1-tuple holding ``segment`` when ``glosses.match(segment)``
    succeeds; otherwise a 2-tuple with the segment split at
    ``len(segment) // 2``. The remaining parameters are ignored.
    """
    if not glosses.match(segment):
        midpoint = len(segment) // 2
        return (segment[:midpoint], segment[midpoint:])
    return (segment,)
112
+
113
class TestBPESegmentMethod(unittest.TestCase):
    """Checks BPE.segment() with 'apply_bpe.encode' replaced by encode_mock."""

    def setUp(self):
        # Stand-in for the codes file; readline() always returns 'something'.
        codes = mock.MagicMock()
        codes.readline.return_value = 'something'
        self.bpe = BPE(codes, glossaries=['like', 'Manuel', 'USA'])

    def _run_test_case(self, test_case):
        """Segment test_case[0] under the patched encoder and compare with test_case[1]."""
        sentence, wanted = test_case
        with mock.patch('apply_bpe.encode', side_effect=encode_mock):
            segmented = self.bpe.segment(sentence)
            self.assertEqual(segmented, wanted)

    def test_multiple_glossaries(self):
        self._run_test_case((
            'wordlikeword likeManuelword',
            'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd',
        ))
135
+
136
# Run the test cases in this file when it is executed as a script.
if __name__ == '__main__':
    unittest.main()