Spaces:

abhaskumarsinha
/

MinimalGPT-Felis_Catus

Running

App Files Files Community

abhaskumarsinha commited on Jun 12, 2023

Commit

70d3cae

•

1 Parent(s): 73f96f1

Upload 21 files

Browse files

Files changed (21) hide show

subword/.ipynb_checkpoints/encoding-checkpoint.ipynb +700 -0
subword/__init__.py +0 -0
subword/__pycache__/__init__.cpython-39.pyc +0 -0
subword/__pycache__/apply_bpe.cpython-39.pyc +0 -0
subword/apply_bpe.py +457 -0
subword/bpe_toy.py +51 -0
subword/chrF.py +139 -0
subword/dataset/codec.txt +0 -0
subword/encoding.ipynb +700 -0
subword/get_vocab.py +87 -0
subword/learn_bpe.py +372 -0
subword/learn_joint_bpe_and_vocab.py +166 -0
subword/segment_char_ngrams.py +95 -0
subword/subword_nmt.py +97 -0
subword/tests/__init__.py +0 -0
subword/tests/data/.gitignore +1 -0
subword/tests/data/bpe.ref +1001 -0
subword/tests/data/corpus.bpe.ref.en +0 -0
subword/tests/data/corpus.en +0 -0
subword/tests/test_bpe.py +83 -0
subword/tests/test_glossaries.py +137 -0

subword/.ipynb_checkpoints/encoding-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,700 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "9644db35",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "  0%|          | 0/20000 [00:00<?, ?it/s]\n",
+      "  0%|          | 1/20000 [00:00<38:40,  8.62it/s]\n",
+      "  0%|          | 2/20000 [00:00<1:31:59,  3.62it/s]\n",
+      "  0%|          | 3/20000 [00:00<1:21:11,  4.11it/s]\n",
+      "  0%|          | 4/20000 [00:01<1:48:20,  3.08it/s]\n",
+      "  0%|          | 6/20000 [00:01<1:03:27,  5.25it/s]\n",
+      "  0%|          | 7/20000 [00:01<1:12:17,  4.61it/s]\n",
+      "  0%|          | 8/20000 [00:01<1:10:13,  4.74it/s]\n",
+      "  0%|          | 10/20000 [00:02<1:09:39,  4.78it/s]\n",
+      "  0%|          | 13/20000 [00:02<42:59,  7.75it/s]  \n",
+      "  0%|          | 16/20000 [00:02<30:25, 10.95it/s]\n",
+      "  0%|          | 19/20000 [00:02<24:50, 13.41it/s]\n",
+      "  0%|          | 21/20000 [00:03<36:58,  9.01it/s]\n",
+      "  0%|          | 23/20000 [00:03<35:20,  9.42it/s]\n",
+      "  0%|          | 25/20000 [00:03<31:30, 10.56it/s]\n",
+      "  0%|          | 27/20000 [00:03<33:12, 10.03it/s]\n",
+      "  0%|          | 29/20000 [00:03<33:30,  9.93it/s]\n",
+      "  0%|          | 31/20000 [00:03<35:06,  9.48it/s]\n",
+      "  0%|          | 33/20000 [00:04<37:03,  8.98it/s]\n",
+      "  0%|          | 37/20000 [00:04<26:59, 12.32it/s]\n",
+      "  0%|          | 39/20000 [00:04<26:54, 12.37it/s]\n",
+      "  0%|          | 42/20000 [00:04<22:51, 14.55it/s]\n",
+      "  0%|          | 46/20000 [00:04<19:15, 17.27it/s]\n",
+      "  0%|          | 48/20000 [00:05<22:34, 14.73it/s]\n",
+      "  0%|          | 50/20000 [00:05<23:39, 14.06it/s]\n",
+      "  0%|          | 52/20000 [00:05<23:30, 14.14it/s]\n",
+      "  0%|          | 55/20000 [00:05<20:12, 16.45it/s]\n",
+      "  0%|          | 58/20000 [00:05<19:09, 17.35it/s]\n",
+      "  0%|          | 60/20000 [00:05<19:25, 17.11it/s]\n",
+      "  0%|          | 63/20000 [00:06<28:45, 11.56it/s]\n",
+      "  0%|          | 70/20000 [00:06<16:14, 20.45it/s]\n",
+      "  0%|          | 75/20000 [00:06<14:15, 23.28it/s]\n",
+      "  0%|          | 78/20000 [00:06<14:28, 22.94it/s]\n",
+      "  0%|          | 81/20000 [00:06<16:06, 20.62it/s]\n",
+      "  0%|          | 85/20000 [00:06<15:13, 21.81it/s]\n",
+      "  0%|          | 88/20000 [00:07<15:39, 21.20it/s]\n",
+      "  0%|          | 92/20000 [00:07<13:28, 24.61it/s]\n",
+      "  0%|          | 95/20000 [00:07<13:32, 24.50it/s]\n",
+      "  0%|          | 99/20000 [00:07<12:06, 27.40it/s]\n",
+      "  1%|          | 102/20000 [00:07<12:55, 25.65it/s]\n",
+      "  1%|          | 105/20000 [00:07<12:46, 25.95it/s]\n",
+      "  1%|          | 112/20000 [00:07<12:02, 27.54it/s]\n",
+      "  1%|          | 118/20000 [00:08<10:02, 33.00it/s]\n",
+      "  1%|          | 122/20000 [00:08<10:12, 32.46it/s]\n",
+      "  1%|          | 127/20000 [00:08<10:07, 32.73it/s]\n",
+      "  1%|          | 138/20000 [00:08<06:56, 47.66it/s]\n",
+      "  1%|          | 144/20000 [00:08<06:47, 48.73it/s]\n",
+      "  1%|          | 150/20000 [00:08<07:00, 47.21it/s]\n",
+      "  1%|          | 156/20000 [00:08<06:44, 49.01it/s]\n",
+      "  1%|          | 162/20000 [00:08<06:47, 48.71it/s]\n",
+      "  1%|          | 169/20000 [00:09<06:38, 49.81it/s]\n",
+      "  1%|          | 176/20000 [00:09<06:28, 51.03it/s]\n",
+      "  1%|          | 184/20000 [00:09<05:58, 55.22it/s]\n",
+      "  1%|          | 190/20000 [00:09<08:05, 40.78it/s]\n",
+      "  1%|          | 197/20000 [00:09<07:30, 43.91it/s]\n",
+      "  1%|1         | 202/20000 [00:09<07:53, 41.79it/s]\n",
+      "  1%|1         | 210/20000 [00:10<06:51, 48.15it/s]\n",
+      "  1%|1         | 220/20000 [00:10<05:35, 59.01it/s]\n",
+      "  1%|1         | 229/20000 [00:10<05:05, 64.72it/s]\n",
+      "  1%|1         | 236/20000 [00:10<05:37, 58.47it/s]\n",
+      "  1%|1         | 244/20000 [00:10<05:22, 61.18it/s]\n",
+      "  1%|1         | 251/20000 [00:10<05:31, 59.65it/s]\n",
+      "  1%|1         | 259/20000 [00:10<05:11, 63.33it/s]\n",
+      "  1%|1         | 266/20000 [00:10<05:25, 60.70it/s]\n",
+      "  1%|1         | 273/20000 [00:10<05:31, 59.42it/s]\n",
+      "  1%|1         | 282/20000 [00:11<04:57, 66.26it/s]\n",
+      "  1%|1         | 289/20000 [00:11<05:00, 65.52it/s]\n",
+      "  1%|1         | 296/20000 [00:11<05:10, 63.47it/s]\n",
+      "  2%|1         | 303/20000 [00:11<07:06, 46.23it/s]\n",
+      "  2%|1         | 313/20000 [00:11<05:41, 57.60it/s]\n",
+      "  2%|1         | 324/20000 [00:11<04:54, 66.87it/s]\n",
+      "  2%|1         | 335/20000 [00:11<04:16, 76.56it/s]\n",
+      "  2%|1         | 346/20000 [00:11<03:50, 85.09it/s]\n",
+      "  2%|1         | 357/20000 [00:12<03:38, 90.06it/s]\n",
+      "  2%|1         | 367/20000 [00:12<03:46, 86.85it/s]\n",
+      "  2%|1         | 377/20000 [00:12<03:41, 88.70it/s]\n",
+      "  2%|1         | 387/20000 [00:12<03:43, 87.95it/s]\n",
+      "  2%|1         | 396/20000 [00:12<03:49, 85.48it/s]\n",
+      "  2%|2         | 405/20000 [00:12<04:04, 80.15it/s]\n",
+      "  2%|2         | 416/20000 [00:12<03:42, 87.82it/s]\n",
+      "  2%|2         | 429/20000 [00:12<03:19, 98.03it/s]\n",
+      "  2%|2         | 439/20000 [00:13<03:35, 90.74it/s]\n",
+      "  2%|2         | 450/20000 [00:13<03:24, 95.65it/s]\n",
+      "  2%|2         | 462/20000 [00:13<03:13, 100.98it/s]\n",
+      "  2%|2         | 473/20000 [00:13<03:16, 99.60it/s] \n",
+      "  2%|2         | 484/20000 [00:13<03:28, 93.64it/s]\n",
+      "  2%|2         | 494/20000 [00:13<03:30, 92.86it/s]\n",
+      "  3%|2         | 504/20000 [00:13<04:34, 70.99it/s]\n",
+      "  3%|2         | 520/20000 [00:13<03:34, 90.96it/s]\n",
+      "  3%|2         | 534/20000 [00:14<03:12, 101.38it/s]\n",
+      "  3%|2         | 547/20000 [00:14<03:01, 107.03it/s]\n",
+      "  3%|2         | 559/20000 [00:14<03:02, 106.25it/s]\n",
+      "  3%|2         | 571/20000 [00:14<03:14, 99.81it/s] \n",
+      "  3%|2         | 582/20000 [00:14<03:13, 100.11it/s]\n",
+      "  3%|2         | 595/20000 [00:14<03:03, 105.49it/s]\n",
+      "  3%|3         | 606/20000 [00:14<03:07, 103.63it/s]\n",
+      "  3%|3         | 625/20000 [00:14<02:33, 126.08it/s]\n",
+      "  3%|3         | 643/20000 [00:14<02:17, 140.29it/s]\n",
+      "  3%|3         | 658/20000 [00:15<02:23, 135.01it/s]\n",
+      "  3%|3         | 672/20000 [00:15<02:32, 126.59it/s]\n",
+      "  3%|3         | 685/20000 [00:15<02:42, 119.19it/s]\n",
+      "  3%|3         | 698/20000 [00:15<02:46, 116.22it/s]\n",
+      "  4%|3         | 710/20000 [00:15<02:49, 113.91it/s]\n",
+      "  4%|3         | 727/20000 [00:15<02:31, 127.58it/s]\n",
+      "  4%|3         | 744/20000 [00:15<02:18, 139.24it/s]\n",
+      "  4%|3         | 759/20000 [00:15<03:10, 101.19it/s]\n",
+      "  4%|3         | 771/20000 [00:16<03:03, 104.67it/s]\n",
+      "  4%|3         | 783/20000 [00:16<03:02, 105.07it/s]\n",
+      "  4%|3         | 795/20000 [00:16<03:14, 98.92it/s] \n",
+      "  4%|4         | 807/20000 [00:16<03:06, 102.82it/s]\n",
+      "  4%|4         | 822/20000 [00:16<02:50, 112.34it/s]\n",
+      "  4%|4         | 834/20000 [00:16<02:55, 109.45it/s]\n",
+      "  4%|4         | 847/20000 [00:16<02:47, 114.28it/s]\n",
+      "  4%|4         | 860/20000 [00:16<02:42, 117.94it/s]\n",
+      "  4%|4         | 873/20000 [00:16<02:46, 114.58it/s]\n",
+      "  4%|4         | 885/20000 [00:17<02:58, 106.97it/s]\n",
+      "  4%|4         | 896/20000 [00:17<03:07, 102.08it/s]\n",
+      "  5%|4         | 908/20000 [00:17<03:01, 105.42it/s]\n",
+      "  5%|4         | 924/20000 [00:17<02:42, 117.67it/s]\n",
+      "  5%|4         | 940/20000 [00:17<02:28, 128.26it/s]\n",
+      "  5%|4         | 954/20000 [00:17<02:24, 131.54it/s]\n",
+      "  5%|4         | 968/20000 [00:17<02:34, 123.37it/s]\n",
+      "  5%|4         | 982/20000 [00:17<02:31, 125.85it/s]\n",
+      "  5%|4         | 995/20000 [00:18<02:39, 119.06it/s]\n",
+      "  5%|5         | 1008/20000 [00:18<03:38, 86.92it/s]\n",
+      "  5%|5         | 1024/20000 [00:18<03:04, 102.72it/s]\n",
+      "  5%|5         | 1041/20000 [00:18<02:40, 118.03it/s]\n",
+      "  5%|5         | 1055/20000 [00:18<02:34, 122.63it/s]\n",
+      "  5%|5         | 1069/20000 [00:18<02:31, 124.89it/s]\n",
+      "  5%|5         | 1083/20000 [00:18<02:34, 122.68it/s]\n",
+      "  5%|5         | 1096/20000 [00:18<02:39, 118.60it/s]\n",
+      "  6%|5         | 1110/20000 [00:19<02:32, 123.65it/s]\n",
+      "  6%|5         | 1127/20000 [00:19<02:18, 136.02it/s]\n",
+      "  6%|5         | 1145/20000 [00:19<02:08, 146.74it/s]\n",
+      "  6%|5         | 1161/20000 [00:19<02:06, 148.82it/s]\n",
+      "  6%|5         | 1177/20000 [00:19<02:04, 151.15it/s]\n",
+      "  6%|5         | 1193/20000 [00:19<02:09, 145.70it/s]\n",
+      "  6%|6         | 1208/20000 [00:19<02:10, 144.47it/s]\n",
+      "  6%|6         | 1227/20000 [00:19<02:00, 156.04it/s]\n",
+      "  6%|6         | 1244/20000 [00:19<01:57, 159.13it/s]\n",
+      "  6%|6         | 1261/20000 [00:19<02:01, 154.24it/s]\n",
+      "  6%|6         | 1277/20000 [00:20<02:09, 145.11it/s]\n",
+      "  6%|6         | 1292/20000 [00:20<02:10, 143.32it/s]\n",
+      "  7%|6         | 1307/20000 [00:20<03:02, 102.65it/s]\n",
+      "  7%|6         | 1330/20000 [00:20<02:23, 130.14it/s]\n",
+      "  7%|6         | 1348/20000 [00:20<02:13, 139.97it/s]\n",
+      "  7%|6         | 1368/20000 [00:20<02:02, 152.70it/s]\n",
+      "  7%|6         | 1385/20000 [00:20<02:00, 153.99it/s]\n",
+      "  7%|7         | 1402/20000 [00:21<02:07, 146.16it/s]\n",
+      "  7%|7         | 1423/20000 [00:21<01:55, 161.53it/s]\n",
+      "  7%|7         | 1441/20000 [00:21<01:52, 165.17it/s]\n",
+      "  7%|7         | 1459/20000 [00:21<01:55, 160.82it/s]\n",
+      "  7%|7         | 1476/20000 [00:21<02:03, 149.82it/s]\n",
+      "  7%|7         | 1492/20000 [00:21<02:08, 143.79it/s]\n",
+      "  8%|7         | 1507/20000 [00:21<02:10, 142.06it/s]\n",
+      "  8%|7         | 1530/20000 [00:21<01:52, 164.72it/s]\n",
+      "  8%|7         | 1548/20000 [00:21<01:50, 167.09it/s]\n",
+      "  8%|7         | 1565/20000 [00:22<01:49, 167.90it/s]\n",
+      "  8%|7         | 1582/20000 [00:22<01:53, 161.57it/s]\n",
+      "  8%|7         | 1599/20000 [00:22<01:56, 158.15it/s]\n",
+      "  8%|8         | 1617/20000 [00:22<01:51, 164.25it/s]\n",
+      "  8%|8         | 1637/20000 [00:22<01:45, 174.45it/s]\n",
+      "  8%|8         | 1657/20000 [00:22<01:41, 181.32it/s]\n",
+      "  8%|8         | 1676/20000 [00:22<01:40, 182.25it/s]\n",
+      "  8%|8         | 1695/20000 [00:22<01:46, 171.94it/s]\n",
+      "  9%|8         | 1718/20000 [00:22<01:38, 186.12it/s]\n",
+      "  9%|8         | 1739/20000 [00:22<01:34, 192.48it/s]\n",
+      "  9%|8         | 1759/20000 [00:23<02:13, 136.76it/s]\n",
+      "  9%|8         | 1777/20000 [00:23<02:04, 145.80it/s]\n",
+      "  9%|8         | 1794/20000 [00:23<02:04, 146.68it/s]\n",
+      "  9%|9         | 1814/20000 [00:23<01:53, 159.63it/s]\n",
+      "  9%|9         | 1836/20000 [00:23<01:43, 175.04it/s]\n",
+      "  9%|9         | 1856/20000 [00:23<01:41, 179.30it/s]\n",
+      "  9%|9         | 1875/20000 [00:23<01:42, 176.01it/s]\n",
+      "  9%|9         | 1894/20000 [00:23<01:45, 171.34it/s]\n",
+      " 10%|9         | 1915/20000 [00:24<01:39, 180.93it/s]\n",
+      " 10%|9         | 1937/20000 [00:24<01:34, 190.79it/s]\n",
+      " 10%|9         | 1957/20000 [00:24<01:35, 189.63it/s]\n",
+      " 10%|9         | 1977/20000 [00:24<01:36, 186.73it/s]\n",
+      " 10%|9         | 1996/20000 [00:24<01:42, 175.72it/s]\n",
+      " 10%|#         | 2018/20000 [00:24<01:35, 187.87it/s]\n",
+      " 10%|#         | 2046/20000 [00:24<01:24, 212.03it/s]\n",
+      " 10%|#         | 2068/20000 [00:24<01:27, 204.39it/s]\n",
+      " 10%|#         | 2089/20000 [00:24<01:31, 195.56it/s]\n",
+      " 11%|#         | 2109/20000 [00:25<01:33, 192.02it/s]\n",
+      " 11%|#         | 2140/20000 [00:25<01:19, 224.10it/s]\n",
+      " 11%|#         | 2165/20000 [00:25<01:17, 230.78it/s]\n",
+      " 11%|#         | 2189/20000 [00:25<01:18, 225.64it/s]\n",
+      " 11%|#1        | 2212/20000 [00:25<01:24, 210.15it/s]\n",
+      " 11%|#1        | 2236/20000 [00:25<01:21, 217.71it/s]\n",
+      " 11%|#1        | 2259/20000 [00:25<01:22, 215.12it/s]\n",
+      " 11%|#1        | 2281/20000 [00:25<01:24, 208.87it/s]\n",
+      " 12%|#1        | 2303/20000 [00:25<01:35, 185.14it/s]\n",
+      " 12%|#1        | 2333/20000 [00:26<01:22, 213.67it/s]\n",
+      " 12%|#1        | 2357/20000 [00:26<01:19, 220.73it/s]\n",
+      " 12%|#1        | 2380/20000 [00:26<01:21, 214.95it/s]\n",
+      " 12%|#2        | 2402/20000 [00:26<02:03, 142.71it/s]\n",
+      " 12%|#2        | 2432/20000 [00:26<01:40, 174.08it/s]\n",
+      " 12%|#2        | 2459/20000 [00:26<01:29, 195.81it/s]\n",
+      " 12%|#2        | 2482/20000 [00:26<01:28, 198.82it/s]\n",
+      " 13%|#2        | 2505/20000 [00:27<01:29, 195.33it/s]\n",
+      " 13%|#2        | 2538/20000 [00:27<01:16, 228.52it/s]\n",
+      " 13%|#2        | 2566/20000 [00:27<01:11, 242.22it/s]\n",
+      " 13%|#2        | 2592/20000 [00:27<01:15, 230.01it/s]\n",
+      " 13%|#3        | 2620/20000 [00:27<01:11, 243.40it/s]\n",
+      " 13%|#3        | 2651/20000 [00:27<01:06, 261.84it/s]\n",
+      " 13%|#3        | 2678/20000 [00:27<01:06, 260.46it/s]\n",
+      " 14%|#3        | 2705/20000 [00:27<01:08, 252.37it/s]\n",
+      " 14%|#3        | 2740/20000 [00:27<01:02, 278.24it/s]\n",
+      " 14%|#3        | 2769/20000 [00:27<01:05, 264.95it/s]\n",
+      " 14%|#3        | 2796/20000 [00:28<01:09, 247.16it/s]\n",
+      " 14%|#4        | 2828/20000 [00:28<01:04, 264.60it/s]\n",
+      " 14%|#4        | 2855/20000 [00:28<01:05, 260.34it/s]\n",
+      " 14%|#4        | 2882/20000 [00:28<01:09, 247.20it/s]\n",
+      " 15%|#4        | 2908/20000 [00:28<01:12, 236.53it/s]\n",
+      " 15%|#4        | 2952/20000 [00:28<00:58, 291.10it/s]\n",
+      " 15%|#4        | 2982/20000 [00:28<01:03, 266.27it/s]\n",
+      " 15%|#5        | 3010/20000 [00:28<01:03, 267.07it/s]\n",
+      " 15%|#5        | 3039/20000 [00:29<01:02, 270.37it/s]\n",
+      " 15%|#5        | 3068/20000 [00:29<01:01, 273.53it/s]\n",
+      " 15%|#5        | 3096/20000 [00:29<01:04, 263.45it/s]\n",
+      " 16%|#5        | 3129/20000 [00:29<00:59, 281.96it/s]\n",
+      " 16%|#5        | 3160/20000 [00:29<00:58, 287.48it/s]\n",
+      " 16%|#5        | 3190/20000 [00:29<01:00, 279.05it/s]\n",
+      " 16%|#6        | 3226/20000 [00:29<00:55, 301.05it/s]\n",
+      " 16%|#6        | 3257/20000 [00:29<00:55, 303.61it/s]\n",
+      " 16%|#6        | 3288/20000 [00:29<00:56, 293.52it/s]\n",
+      " 17%|#6        | 3318/20000 [00:29<00:56, 293.68it/s]\n",
+      " 17%|#6        | 3357/20000 [00:30<00:52, 318.68it/s]\n",
+      " 17%|#6        | 3390/20000 [00:30<00:58, 284.80it/s]\n",
+      " 17%|#7        | 3420/20000 [00:30<01:21, 204.06it/s]\n",
+      " 17%|#7        | 3459/20000 [00:30<01:08, 242.62it/s]\n",
+      " 17%|#7        | 3491/20000 [00:30<01:03, 260.00it/s]\n",
+      " 18%|#7        | 3535/20000 [00:30<00:54, 304.04it/s]\n",
+      " 18%|#7        | 3573/20000 [00:30<00:50, 323.92it/s]\n",
+      " 18%|#8        | 3608/20000 [00:31<00:55, 296.34it/s]\n",
+      " 18%|#8        | 3653/20000 [00:31<00:48, 336.01it/s]\n",
+      " 18%|#8        | 3689/20000 [00:31<00:49, 329.16it/s]\n",
+      " 19%|#8        | 3733/20000 [00:31<00:45, 358.11it/s]\n",
+      " 19%|#8        | 3771/20000 [00:31<00:44, 361.17it/s]\n",
+      " 19%|#9        | 3809/20000 [00:31<00:47, 342.31it/s]\n",
+      " 19%|#9        | 3861/20000 [00:31<00:41, 390.94it/s]\n",
+      " 20%|#9        | 3902/20000 [00:31<00:42, 378.22it/s]\n",
+      " 20%|#9        | 3968/20000 [00:31<00:35, 455.02it/s]\n",
+      " 20%|##        | 4015/20000 [00:32<00:37, 427.77it/s]\n",
+      " 20%|##        | 4066/20000 [00:32<00:35, 449.03it/s]\n",
+      " 21%|##        | 4112/20000 [00:32<00:39, 404.45it/s]\n",
+      " 21%|##        | 4174/20000 [00:32<00:34, 458.89it/s]\n",
+      " 21%|##1       | 4222/20000 [00:32<00:35, 442.90it/s]\n",
+      " 21%|##1       | 4271/20000 [00:32<00:34, 454.41it/s]\n",
+      " 22%|##1       | 4329/20000 [00:32<00:32, 489.36it/s]\n",
+      " 22%|##1       | 4387/20000 [00:32<00:30, 515.14it/s]\n",
+      " 22%|##2       | 4447/20000 [00:32<00:28, 538.10it/s]\n",
+      " 23%|##2       | 4502/20000 [00:33<00:32, 478.73it/s]\n",
+      " 23%|##2       | 4563/20000 [00:33<00:30, 512.67it/s]\n",
+      " 23%|##3       | 4616/20000 [00:33<00:30, 496.81it/s]\n",
+      " 23%|##3       | 4677/20000 [00:33<00:29, 527.98it/s]\n",
+      " 24%|##3       | 4733/20000 [00:33<00:28, 537.01it/s]\n",
+      " 24%|##3       | 4788/20000 [00:33<00:28, 534.59it/s]\n",
+      " 24%|##4       | 4864/20000 [00:33<00:25, 599.65it/s]\n",
+      " 25%|##4       | 4925/20000 [00:33<00:25, 595.70it/s]\n",
+      " 25%|##4       | 4994/20000 [00:33<00:24, 617.81it/s]\n",
+      " 25%|##5       | 5079/20000 [00:33<00:21, 683.71it/s]\n",
+      " 26%|##5       | 5148/20000 [00:34<00:35, 419.97it/s]\n",
+      " 26%|##6       | 5203/20000 [00:34<00:33, 446.58it/s]\n",
+      " 26%|##6       | 5289/20000 [00:34<00:27, 538.90it/s]\n",
+      " 27%|##6       | 5377/20000 [00:34<00:23, 622.07it/s]\n",
+      " 27%|##7       | 5471/20000 [00:34<00:20, 703.42it/s]\n",
+      " 28%|##7       | 5549/20000 [00:36<01:35, 150.73it/s]\n",
+      " 28%|##8       | 5606/20000 [00:36<01:37, 147.12it/s]\n",
+      " 28%|##8       | 5650/20000 [00:36<01:34, 151.57it/s]\n",
+      " 28%|##8       | 5686/20000 [00:37<01:33, 153.50it/s]\n",
+      " 29%|##8       | 5716/20000 [00:37<01:32, 154.45it/s]\n",
+      " 29%|##8       | 5742/20000 [00:37<01:29, 158.75it/s]\n",
+      " 29%|##8       | 5766/20000 [00:37<01:28, 160.05it/s]\n",
+      " 29%|##8       | 5788/20000 [00:37<01:29, 159.44it/s]\n",
+      " 29%|##9       | 5808/20000 [00:37<01:29, 158.22it/s]\n",
+      " 29%|##9       | 5827/20000 [00:37<01:27, 162.78it/s]\n",
+      " 29%|##9       | 5846/20000 [00:38<01:25, 165.07it/s]\n",
+      " 29%|##9       | 5864/20000 [00:38<01:25, 164.71it/s]\n",
+      " 29%|##9       | 5882/20000 [00:38<01:26, 162.88it/s]\n",
+      " 29%|##9       | 5899/20000 [00:38<01:30, 155.66it/s]\n",
+      " 30%|##9       | 5916/20000 [00:38<01:29, 158.09it/s]\n",
+      " 30%|##9       | 5935/20000 [00:38<01:24, 166.09it/s]\n",
+      " 30%|##9       | 5954/20000 [00:38<01:22, 169.84it/s]\n",
+      " 30%|##9       | 5972/20000 [00:38<01:21, 171.23it/s]\n",
+      " 30%|##9       | 5990/20000 [00:38<01:22, 170.81it/s]\n",
+      " 30%|###       | 6008/20000 [00:39<01:23, 167.70it/s]\n",
+      " 30%|###       | 6027/20000 [00:39<01:20, 173.01it/s]\n",
+      " 30%|###       | 6046/20000 [00:39<01:19, 175.35it/s]\n",
+      " 30%|###       | 6064/20000 [00:39<01:20, 172.23it/s]\n",
+      " 30%|###       | 6082/20000 [00:39<01:21, 170.55it/s]\n",
+      " 30%|###       | 6100/20000 [00:39<01:23, 167.05it/s]\n",
+      " 31%|###       | 6118/20000 [00:39<01:21, 170.70it/s]\n",
+      " 31%|###       | 6138/20000 [00:39<01:17, 178.09it/s]\n",
+      " 31%|###       | 6157/20000 [00:39<01:16, 179.96it/s]\n",
+      " 31%|###       | 6176/20000 [00:39<01:18, 177.21it/s]\n",
+      " 31%|###       | 6194/20000 [00:40<01:18, 174.99it/s]\n",
+      " 31%|###1      | 6212/20000 [00:40<01:19, 173.44it/s]\n",
+      " 31%|###1      | 6232/20000 [00:40<01:16, 180.04it/s]\n",
+      " 31%|###1      | 6251/20000 [00:40<01:16, 179.80it/s]\n",
+      " 31%|###1      | 6270/20000 [00:40<01:19, 172.28it/s]\n",
+      " 31%|###1      | 6288/20000 [00:40<01:20, 170.14it/s]\n",
+      " 32%|###1      | 6306/20000 [00:40<01:22, 165.92it/s]\n",
+      " 32%|###1      | 6327/20000 [00:40<01:16, 178.17it/s]\n",
+      " 32%|###1      | 6347/20000 [00:40<01:14, 183.85it/s]\n",
+      " 32%|###1      | 6366/20000 [00:41<01:14, 182.46it/s]\n",
+      " 32%|###1      | 6385/20000 [00:41<01:17, 175.53it/s]\n",
+      " 32%|###2      | 6403/20000 [00:41<01:21, 166.92it/s]\n",
+      " 32%|###2      | 6423/20000 [00:41<01:17, 174.57it/s]\n",
+      " 32%|###2      | 6443/20000 [00:41<01:15, 179.69it/s]\n",
+      " 32%|###2      | 6462/20000 [00:41<01:16, 178.06it/s]\n",
+      " 32%|###2      | 6480/20000 [00:41<01:17, 174.17it/s]\n",
+      " 32%|###2      | 6498/20000 [00:41<01:19, 170.04it/s]\n",
+      " 33%|###2      | 6517/20000 [00:41<01:16, 175.15it/s]\n",
+      " 33%|###2      | 6538/20000 [00:42<01:13, 184.06it/s]\n",
+      " 33%|###2      | 6558/20000 [00:42<01:11, 187.58it/s]\n",
+      " 33%|###2      | 6577/20000 [00:42<01:12, 183.99it/s]\n",
+      " 33%|###2      | 6596/20000 [00:42<01:14, 180.51it/s]\n",
+      " 33%|###3      | 6615/20000 [00:42<01:14, 180.64it/s]\n",
+      " 33%|###3      | 6636/20000 [00:42<01:11, 187.45it/s]\n",
+      " 33%|###3      | 6656/20000 [00:42<01:10, 189.43it/s]\n",
+      " 33%|###3      | 6675/20000 [00:42<01:11, 185.29it/s]\n",
+      " 33%|###3      | 6694/20000 [00:42<01:14, 177.91it/s]\n",
+      " 34%|###3      | 6712/20000 [00:42<01:15, 176.02it/s]\n",
+      " 34%|###3      | 6733/20000 [00:43<01:11, 185.68it/s]\n",
+      " 34%|###3      | 6752/20000 [00:43<01:10, 186.91it/s]\n",
+      " 34%|###3      | 6771/20000 [00:43<01:12, 183.53it/s]\n",
+      " 34%|###3      | 6790/20000 [00:43<01:15, 175.73it/s]\n",
+      " 34%|###4      | 6808/20000 [00:43<01:17, 170.68it/s]\n",
+      " 34%|###4      | 6828/20000 [00:43<01:13, 178.87it/s]\n",
+      " 34%|###4      | 6849/20000 [00:43<01:10, 186.16it/s]\n",
+      " 34%|###4      | 6868/20000 [00:43<01:10, 187.26it/s]\n",
+      " 34%|###4      | 6887/20000 [00:43<01:10, 185.36it/s]\n",
+      " 35%|###4      | 6906/20000 [00:44<01:12, 180.40it/s]\n",
+      " 35%|###4      | 6929/20000 [00:44<01:07, 193.50it/s]\n",
+      " 35%|###4      | 6950/20000 [00:44<01:05, 198.27it/s]\n",
+      " 35%|###4      | 6970/20000 [00:44<01:06, 197.04it/s]\n",
+      " 35%|###4      | 6990/20000 [00:44<01:08, 190.65it/s]\n",
+      " 35%|###5      | 7010/20000 [00:44<01:10, 184.33it/s]\n",
+      " 35%|###5      | 7029/20000 [00:44<01:10, 183.34it/s]\n",
+      " 35%|###5      | 7049/20000 [00:44<01:08, 188.06it/s]\n",
+      " 35%|###5      | 7068/20000 [00:44<01:08, 188.62it/s]\n",
+      " 35%|###5      | 7087/20000 [00:44<01:09, 184.71it/s]\n",
+      " 36%|###5      | 7106/20000 [00:45<01:11, 179.97it/s]\n",
+      " 36%|###5      | 7129/20000 [00:45<01:06, 192.60it/s]\n",
+      " 36%|###5      | 7151/20000 [00:45<01:04, 198.21it/s]\n",
+      " 36%|###5      | 7172/20000 [00:45<01:04, 198.74it/s]\n",
+      " 36%|###5      | 7192/20000 [00:45<01:04, 197.39it/s]\n",
+      " 36%|###6      | 7212/20000 [00:45<01:05, 195.87it/s]\n",
+      " 36%|###6      | 7235/20000 [00:45<01:02, 205.80it/s]\n",
+      " 36%|###6      | 7256/20000 [00:45<01:01, 205.81it/s]\n",
+      " 36%|###6      | 7277/20000 [00:45<01:03, 201.69it/s]\n",
+      " 36%|###6      | 7298/20000 [00:46<01:05, 193.96it/s]\n",
+      " 37%|###6      | 7320/20000 [00:46<01:03, 200.19it/s]\n",
+      " 37%|###6      | 7343/20000 [00:46<01:00, 208.13it/s]\n",
+      " 37%|###6      | 7364/20000 [00:46<01:00, 208.67it/s]\n",
+      " 37%|###6      | 7385/20000 [00:46<01:01, 204.27it/s]\n",
+      " 37%|###7      | 7406/20000 [00:46<01:03, 199.56it/s]\n",
+      " 37%|###7      | 7429/20000 [00:46<01:00, 207.10it/s]\n",
+      " 37%|###7      | 7450/20000 [00:46<01:00, 207.93it/s]\n",
+      " 37%|###7      | 7471/20000 [00:46<01:00, 207.32it/s]\n",
+      " 37%|###7      | 7492/20000 [00:46<01:01, 204.50it/s]\n",
+      " 38%|###7      | 7513/20000 [00:47<01:00, 205.49it/s]\n",
+      " 38%|###7      | 7537/20000 [00:47<00:57, 215.00it/s]\n",
+      " 38%|###7      | 7559/20000 [00:47<00:57, 215.82it/s]\n",
+      " 38%|###7      | 7581/20000 [00:47<00:58, 210.82it/s]\n",
+      " 38%|###8      | 7603/20000 [00:47<01:01, 200.64it/s]\n",
+      " 38%|###8      | 7627/20000 [00:47<00:58, 211.11it/s]\n",
+      " 38%|###8      | 7650/20000 [00:47<00:57, 215.27it/s]\n",
+      " 38%|###8      | 7672/20000 [00:47<00:58, 211.14it/s]\n",
+      " 38%|###8      | 7694/20000 [00:47<01:00, 203.74it/s]\n",
+      " 39%|###8      | 7716/20000 [00:48<00:59, 206.56it/s]\n",
+      " 39%|###8      | 7742/20000 [00:48<00:55, 219.27it/s]\n",
+      " 39%|###8      | 7765/20000 [00:48<00:55, 222.34it/s]\n",
+      " 39%|###8      | 7788/20000 [00:48<00:55, 220.09it/s]\n",
+      " 39%|###9      | 7811/20000 [00:48<00:56, 217.29it/s]\n",
+      " 39%|###9      | 7837/20000 [00:48<00:53, 227.59it/s]\n",
+      " 39%|###9      | 7860/20000 [00:48<00:53, 225.68it/s]\n",
+      " 39%|###9      | 7883/20000 [00:48<00:57, 210.45it/s]\n",
+      " 40%|###9      | 7905/20000 [00:48<00:59, 203.50it/s]\n",
+      " 40%|###9      | 7931/20000 [00:49<00:55, 218.40it/s]\n",
+      " 40%|###9      | 7955/20000 [00:49<00:53, 223.84it/s]\n",
+      " 40%|###9      | 7978/20000 [00:49<00:54, 222.42it/s]\n",
+      " 40%|####      | 8001/20000 [00:49<00:56, 211.24it/s]\n",
+      " 40%|####      | 8028/20000 [00:49<00:52, 226.96it/s]\n",
+      " 40%|####      | 8052/20000 [00:49<00:52, 229.36it/s]\n",
+      " 40%|####      | 8076/20000 [00:49<00:52, 226.55it/s]\n",
+      " 40%|####      | 8099/20000 [00:49<00:54, 217.59it/s]\n",
+      " 41%|####      | 8121/20000 [00:50<01:26, 136.56it/s]\n",
+      " 41%|####      | 8144/20000 [00:50<01:16, 154.69it/s]\n",
+      " 41%|####      | 8165/20000 [00:50<01:11, 165.14it/s]\n",
+      " 41%|####      | 8186/20000 [00:50<01:07, 174.19it/s]\n",
+      " 41%|####1     | 8206/20000 [00:50<01:05, 179.79it/s]\n",
+      " 41%|####1     | 8234/20000 [00:50<00:57, 205.18it/s]\n",
+      " 41%|####1     | 8259/20000 [00:50<00:54, 215.64it/s]\n",
+      " 41%|####1     | 8282/20000 [00:50<00:53, 219.03it/s]\n",
+      " 42%|####1     | 8305/20000 [00:50<00:55, 209.63it/s]\n",
+      " 42%|####1     | 8334/20000 [00:51<00:50, 229.98it/s]\n",
+      " 42%|####1     | 8359/20000 [00:51<00:49, 234.96it/s]\n",
+      " 42%|####1     | 8383/20000 [00:51<00:50, 230.45it/s]\n",
+      " 42%|####2     | 8407/20000 [00:51<00:52, 222.38it/s]\n",
+      " 42%|####2     | 8436/20000 [00:51<00:48, 240.62it/s]\n",
+      " 42%|####2     | 8461/20000 [00:51<00:47, 242.60it/s]\n",
+      " 42%|####2     | 8486/20000 [00:51<00:47, 239.88it/s]\n",
+      " 43%|####2     | 8511/20000 [00:51<00:48, 236.67it/s]\n",
+      " 43%|####2     | 8539/20000 [00:51<00:46, 247.65it/s]\n",
+      " 43%|####2     | 8564/20000 [00:51<00:46, 244.78it/s]\n",
+      " 43%|####2     | 8589/20000 [00:52<00:48, 236.04it/s]\n",
+      " 43%|####3     | 8613/20000 [00:52<00:48, 235.15it/s]\n",
+      " 43%|####3     | 8643/20000 [00:52<00:45, 252.29it/s]\n",
+      " 43%|####3     | 8669/20000 [00:52<00:45, 248.06it/s]\n",
+      " 43%|####3     | 8694/20000 [00:52<00:47, 240.33it/s]\n",
+      " 44%|####3     | 8720/20000 [00:52<00:46, 243.84it/s]\n",
+      " 44%|####3     | 8748/20000 [00:52<00:44, 254.21it/s]\n",
+      " 44%|####3     | 8777/20000 [00:52<00:42, 263.08it/s]\n",
+      " 44%|####4     | 8808/20000 [00:52<00:40, 276.78it/s]\n",
+      " 44%|####4     | 8856/20000 [00:53<00:33, 336.62it/s]\n",
+      " 44%|####4     | 8896/20000 [00:53<00:31, 353.29it/s]\n",
+      " 45%|####4     | 8955/20000 [00:53<00:26, 422.10it/s]\n",
+      " 45%|####5     | 9001/20000 [00:53<00:25, 425.76it/s]\n",
+      " 45%|####5     | 9070/20000 [00:53<00:21, 501.92it/s]\n",
+      " 46%|####5     | 9128/20000 [00:53<00:20, 523.43it/s]\n",
+      " 46%|####5     | 9183/20000 [00:53<00:20, 529.69it/s]\n",
+      " 46%|####6     | 9237/20000 [00:53<00:20, 531.16it/s]\n",
+      " 46%|####6     | 9291/20000 [00:53<00:21, 508.14it/s]\n",
+      " 47%|####6     | 9346/20000 [00:53<00:20, 520.17it/s]\n",
+      " 47%|####6     | 9399/20000 [00:54<00:20, 509.66it/s]\n",
+      " 47%|####7     | 9451/20000 [00:54<00:20, 509.69it/s]\n",
+      " 48%|####7     | 9503/20000 [00:54<00:21, 494.01it/s]\n",
+      " 48%|####7     | 9580/20000 [00:54<00:18, 569.61it/s]\n",
+      " 48%|####8     | 9649/20000 [00:54<00:17, 602.61it/s]\n",
+      " 49%|####8     | 9710/20000 [00:54<00:17, 597.78it/s]\n",
+      " 49%|####8     | 9792/20000 [00:54<00:15, 660.63it/s]\n",
+      " 49%|####9     | 9874/20000 [00:54<00:14, 707.37it/s]\n",
+      " 50%|####9     | 9946/20000 [00:54<00:14, 698.70it/s]\n",
+      " 50%|#####     | 10029/20000 [00:54<00:13, 735.01it/s]\n",
+      " 51%|#####     | 10103/20000 [00:55<00:14, 703.33it/s]\n",
+      " 51%|#####     | 10187/20000 [00:55<00:13, 742.49it/s]\n",
+      " 51%|#####1    | 10275/20000 [00:55<00:12, 782.42it/s]\n",
+      " 52%|#####1    | 10372/20000 [00:55<00:11, 837.41it/s]\n",
+      " 52%|#####2    | 10463/20000 [00:55<00:11, 858.83it/s]\n",
+      " 53%|#####2    | 10550/20000 [00:55<00:10, 862.08it/s]\n",
+      " 53%|#####3    | 10640/20000 [00:55<00:10, 873.35it/s]\n",
+      " 54%|#####3    | 10728/20000 [00:55<00:10, 857.41it/s]\n",
+      " 54%|#####4    | 10815/20000 [00:55<00:10, 858.61it/s]\n",
+      " 55%|#####4    | 10902/20000 [00:56<00:10, 861.95it/s]\n",
+      " 55%|#####5    | 11034/20000 [00:56<00:08, 997.87it/s]\n",
+      " 56%|#####5    | 11179/20000 [00:56<00:07, 1132.48it/s]\n",
+      " 56%|#####6    | 11296/20000 [00:56<00:07, 1143.65it/s]\n",
+      " 57%|#####7    | 11420/20000 [00:56<00:07, 1172.43it/s]\n",
+      " 58%|#####7    | 11579/20000 [00:56<00:06, 1297.33it/s]\n",
+      " 59%|#####8    | 11758/20000 [00:56<00:05, 1444.84it/s]\n",
+      " 60%|#####9    | 11971/20000 [00:56<00:04, 1650.09it/s]\n",
+      " 61%|######    | 12137/20000 [00:58<00:32, 241.50it/s] \n",
+      " 61%|######1   | 12256/20000 [00:59<00:34, 221.55it/s]\n",
+      " 62%|######1   | 12344/20000 [00:59<00:36, 211.18it/s]\n",
+      " 62%|######2   | 12411/20000 [01:00<00:37, 204.44it/s]\n",
+      " 62%|######2   | 12464/20000 [01:00<00:37, 201.86it/s]\n",
+      " 63%|######2   | 12507/20000 [01:00<00:37, 197.48it/s]\n",
+      " 63%|######2   | 12542/20000 [01:00<00:35, 209.76it/s]\n",
+      " 63%|######2   | 12576/20000 [01:01<00:33, 220.35it/s]\n",
+      " 63%|######3   | 12609/20000 [01:01<00:32, 226.03it/s]\n",
+      " 63%|######3   | 12640/20000 [01:01<00:31, 234.04it/s]\n",
+      " 63%|######3   | 12670/20000 [01:01<00:29, 246.31it/s]\n",
+      " 64%|######3   | 12700/20000 [01:01<00:28, 251.76it/s]\n",
+      " 64%|######3   | 12731/20000 [01:01<00:27, 263.76it/s]\n",
+      " 64%|######3   | 12761/20000 [01:01<00:26, 272.59it/s]\n",
+      " 64%|######3   | 12791/20000 [01:01<00:26, 271.94it/s]\n",
+      " 64%|######4   | 12820/20000 [01:01<00:26, 274.53it/s]\n",
+      " 64%|######4   | 12850/20000 [01:02<00:25, 281.46it/s]\n",
+      " 64%|######4   | 12879/20000 [01:02<00:25, 277.63it/s]\n",
+      " 65%|######4   | 12908/20000 [01:02<00:26, 264.03it/s]\n",
+      " 65%|######4   | 12939/20000 [01:02<00:25, 276.61it/s]\n",
+      " 65%|######4   | 12969/20000 [01:02<00:24, 281.60it/s]\n",
+      " 65%|######4   | 12998/20000 [01:02<00:24, 280.78it/s]\n",
+      " 65%|######5   | 13027/20000 [01:02<00:25, 275.51it/s]\n",
+      " 65%|######5   | 13058/20000 [01:02<00:24, 285.34it/s]\n",
+      " 65%|######5   | 13087/20000 [01:02<00:24, 285.03it/s]\n",
+      " 66%|######5   | 13117/20000 [01:03<00:23, 287.71it/s]\n",
+      " 66%|######5   | 13151/20000 [01:03<00:22, 301.25it/s]\n",
+      " 66%|######5   | 13182/20000 [01:03<00:22, 299.42it/s]\n",
+      " 66%|######6   | 13213/20000 [01:03<00:23, 288.18it/s]\n",
+      " 66%|######6   | 13247/20000 [01:03<00:22, 302.07it/s]\n",
+      " 66%|######6   | 13280/20000 [01:03<00:21, 309.23it/s]\n",
+      " 67%|######6   | 13312/20000 [01:03<00:21, 306.12it/s]\n",
+      " 67%|######6   | 13348/20000 [01:03<00:20, 321.72it/s]\n",
+      " 67%|######6   | 13381/20000 [01:03<00:20, 320.39it/s]\n",
+      " 67%|######7   | 13414/20000 [01:04<00:35, 183.90it/s]\n",
+      " 67%|######7   | 13448/20000 [01:04<00:30, 213.47it/s]\n",
+      " 67%|######7   | 13478/20000 [01:04<00:28, 232.06it/s]\n",
+      " 68%|######7   | 13508/20000 [01:04<00:26, 246.85it/s]\n",
+      " 68%|######7   | 13546/20000 [01:04<00:23, 278.79it/s]\n",
+      " 68%|######7   | 13578/20000 [01:04<00:22, 289.60it/s]\n",
+      " 68%|######8   | 13610/20000 [01:04<00:21, 290.75it/s]\n",
+      " 68%|######8   | 13650/20000 [01:04<00:19, 319.96it/s]\n",
+      " 68%|######8   | 13684/20000 [01:05<00:19, 322.87it/s]\n",
+      " 69%|######8   | 13718/20000 [01:05<00:19, 324.97it/s]\n",
+      " 69%|######8   | 13753/20000 [01:05<00:18, 332.16it/s]\n",
+      " 69%|######8   | 13787/20000 [01:05<00:19, 323.16it/s]\n",
+      " 69%|######9   | 13820/20000 [01:05<00:19, 317.82it/s]\n",
+      " 69%|######9   | 13857/20000 [01:05<00:18, 332.74it/s]\n",
+      " 69%|######9   | 13891/20000 [01:05<00:18, 333.86it/s]\n",
+      " 70%|######9   | 13927/20000 [01:05<00:17, 340.50it/s]\n",
+      " 70%|######9   | 13963/20000 [01:05<00:17, 345.20it/s]\n",
+      " 70%|######9   | 13998/20000 [01:05<00:17, 340.60it/s]\n",
+      " 70%|#######   | 14036/20000 [01:06<00:16, 351.09it/s]\n",
+      " 70%|#######   | 14073/20000 [01:06<00:16, 356.65it/s]\n",
+      " 71%|#######   | 14109/20000 [01:06<00:16, 353.45it/s]\n",
+      " 71%|#######   | 14150/20000 [01:06<00:15, 369.02it/s]\n",
+      " 71%|#######   | 14187/20000 [01:06<00:15, 368.21it/s]\n",
+      " 71%|#######1  | 14227/20000 [01:06<00:15, 375.42it/s]\n",
+      " 71%|#######1  | 14265/20000 [01:06<00:16, 345.08it/s]\n",
+      " 72%|#######1  | 14301/20000 [01:06<00:16, 347.30it/s]\n",
+      " 72%|#######1  | 14349/20000 [01:06<00:14, 383.90it/s]\n",
+      " 72%|#######1  | 14388/20000 [01:06<00:14, 376.96it/s]\n",
+      " 72%|#######2  | 14430/20000 [01:07<00:14, 389.28it/s]\n",
+      " 72%|#######2  | 14471/20000 [01:07<00:13, 395.30it/s]\n",
+      " 73%|#######2  | 14511/20000 [01:07<00:14, 389.82it/s]\n",
+      " 73%|#######2  | 14554/20000 [01:07<00:13, 401.53it/s]\n",
+      " 73%|#######2  | 14595/20000 [01:07<00:14, 378.41it/s]\n",
+      " 73%|#######3  | 14643/20000 [01:07<00:13, 405.95it/s]\n",
+      " 73%|#######3  | 14687/20000 [01:07<00:12, 415.69it/s]\n",
+      " 74%|#######3  | 14730/20000 [01:07<00:12, 418.62it/s]\n",
+      " 74%|#######3  | 14774/20000 [01:07<00:12, 422.40it/s]\n",
+      " 74%|#######4  | 14817/20000 [01:08<00:12, 418.48it/s]\n",
+      " 74%|#######4  | 14868/20000 [01:08<00:11, 443.95it/s]\n",
+      " 75%|#######4  | 14913/20000 [01:08<00:11, 444.41it/s]\n",
+      " 75%|#######4  | 14962/20000 [01:08<00:11, 457.86it/s]\n",
+      " 75%|#######5  | 15008/20000 [01:08<00:11, 438.97it/s]\n",
+      " 75%|#######5  | 15067/20000 [01:08<00:10, 481.14it/s]\n",
+      " 76%|#######5  | 15116/20000 [01:08<00:10, 483.71it/s]\n",
+      " 76%|#######5  | 15173/20000 [01:08<00:09, 509.06it/s]\n",
+      " 76%|#######6  | 15227/20000 [01:08<00:09, 518.19it/s]\n",
+      " 76%|#######6  | 15285/20000 [01:08<00:08, 534.95it/s]\n",
+      " 77%|#######6  | 15351/20000 [01:09<00:08, 570.41it/s]\n",
+      " 77%|#######7  | 15409/20000 [01:09<00:08, 569.86it/s]\n",
+      " 77%|#######7  | 15477/20000 [01:09<00:07, 602.56it/s]\n",
+      " 78%|#######7  | 15538/20000 [01:09<00:07, 602.96it/s]\n",
+      " 78%|#######7  | 15599/20000 [01:09<00:07, 585.87it/s]\n",
+      " 78%|#######8  | 15658/20000 [01:09<00:07, 581.97it/s]\n",
+      " 79%|#######8  | 15722/20000 [01:09<00:07, 598.93it/s]\n",
+      " 79%|#######8  | 15799/20000 [01:09<00:06, 647.41it/s]\n",
+      " 79%|#######9  | 15877/20000 [01:09<00:06, 684.57it/s]\n",
+      " 80%|#######9  | 15957/20000 [01:09<00:05, 718.72it/s]\n",
+      " 80%|########  | 16037/20000 [01:10<00:05, 740.70it/s]\n",
+      " 81%|########  | 16112/20000 [01:10<00:05, 730.42it/s]\n",
+      " 81%|########  | 16195/20000 [01:10<00:05, 757.50it/s]\n",
+      " 81%|########1 | 16288/20000 [01:10<00:04, 808.47it/s]\n",
+      " 82%|########1 | 16369/20000 [01:10<00:04, 797.07it/s]\n",
+      " 82%|########2 | 16467/20000 [01:10<00:04, 850.97it/s]\n",
+      " 83%|########2 | 16563/20000 [01:10<00:03, 883.26it/s]\n",
+      " 83%|########3 | 16659/20000 [01:10<00:03, 906.02it/s]\n",
+      " 84%|########3 | 16767/20000 [01:10<00:03, 957.87it/s]\n",
+      " 84%|########4 | 16881/20000 [01:10<00:03, 1012.25it/s]\n",
+      " 85%|########4 | 16990/20000 [01:11<00:02, 1035.48it/s]\n",
+      " 86%|########5 | 17120/20000 [01:11<00:02, 1114.63it/s]\n",
+      " 86%|########6 | 17240/20000 [01:11<00:02, 1136.79it/s]\n",
+      " 87%|########6 | 17379/20000 [01:11<00:02, 1212.48it/s]\n",
+      " 88%|########7 | 17514/20000 [01:11<00:01, 1249.92it/s]\n",
+      " 88%|########8 | 17656/20000 [01:11<00:01, 1300.74it/s]\n",
+      " 89%|########9 | 17812/20000 [01:11<00:01, 1378.28it/s]\n",
+      " 90%|######### | 18001/20000 [01:11<00:01, 1522.37it/s]\n",
+      " 91%|#########1| 18201/20000 [01:11<00:01, 1664.77it/s]\n",
+      " 92%|#########2| 18455/20000 [01:11<00:00, 1926.29it/s]\n",
+      " 94%|#########3| 18729/20000 [01:13<00:03, 331.40it/s] \n",
+      " 94%|#########4| 18869/20000 [01:14<00:04, 279.90it/s]\n",
+      " 95%|#########4| 18972/20000 [01:15<00:04, 253.37it/s]\n",
+      " 95%|#########5| 19050/20000 [01:15<00:03, 238.36it/s]\n",
+      " 96%|#########5| 19110/20000 [01:16<00:03, 223.98it/s]\n",
+      " 96%|#########5| 19157/20000 [01:16<00:03, 218.87it/s]\n",
+      " 96%|#########5| 19196/20000 [01:16<00:03, 212.50it/s]\n",
+      " 96%|#########6| 19229/20000 [01:16<00:03, 208.06it/s]\n",
+      " 96%|#########6| 19258/20000 [01:16<00:03, 205.77it/s]\n",
+      " 96%|#########6| 19284/20000 [01:17<00:03, 202.04it/s]\n",
+      " 97%|#########6| 19308/20000 [01:17<00:03, 197.39it/s]\n",
+      " 97%|#########6| 19330/20000 [01:17<00:03, 197.54it/s]\n",
+      " 97%|#########6| 19352/20000 [01:17<00:03, 196.16it/s]\n",
+      " 97%|#########6| 19373/20000 [01:17<00:03, 194.10it/s]\n",
+      " 97%|#########6| 19394/20000 [01:17<00:03, 191.14it/s]\n",
+      " 97%|#########7| 19414/20000 [01:17<00:03, 190.06it/s]\n",
+      " 97%|#########7| 19434/20000 [01:17<00:02, 192.10it/s]\n",
+      " 97%|#########7| 19454/20000 [01:17<00:02, 188.68it/s]\n",
+      " 97%|#########7| 19474/20000 [01:18<00:02, 188.67it/s]\n",
+      " 97%|#########7| 19493/20000 [01:18<00:02, 188.00it/s]\n",
+      " 98%|#########7| 19512/20000 [01:18<00:02, 187.50it/s]\n",
+      " 98%|#########7| 19533/20000 [01:18<00:02, 193.36it/s]\n",
+      " 98%|#########7| 19553/20000 [01:18<00:02, 194.71it/s]\n",
+      " 98%|#########7| 19573/20000 [01:18<00:02, 194.55it/s]\n",
+      " 98%|#########7| 19593/20000 [01:18<00:02, 192.76it/s]\n",
+      " 98%|#########8| 19613/20000 [01:18<00:02, 190.98it/s]\n",
+      " 98%|#########8| 19634/20000 [01:18<00:01, 194.23it/s]\n",
+      " 98%|#########8| 19654/20000 [01:18<00:01, 193.65it/s]\n",
+      " 98%|#########8| 19674/20000 [01:19<00:01, 192.69it/s]\n",
+      " 98%|#########8| 19694/20000 [01:19<00:01, 192.02it/s]\n",
+      " 99%|#########8| 19714/20000 [01:19<00:01, 192.65it/s]\n",
+      " 99%|#########8| 19736/20000 [01:19<00:01, 198.30it/s]\n",
+      " 99%|#########8| 19757/20000 [01:19<00:01, 200.54it/s]\n",
+      " 99%|#########8| 19778/20000 [01:19<00:01, 198.65it/s]\n",
+      " 99%|#########8| 19798/20000 [01:19<00:01, 197.32it/s]\n",
+      " 99%|#########9| 19818/20000 [01:19<00:00, 197.53it/s]\n",
+      " 99%|#########9| 19839/20000 [01:19<00:00, 200.59it/s]\n",
+      " 99%|#########9| 19860/20000 [01:19<00:00, 196.98it/s]\n",
+      " 99%|#########9| 19881/20000 [01:20<00:00, 198.45it/s]\n",
+      "100%|#########9| 19901/20000 [01:20<00:00, 193.05it/s]\n",
+      "100%|#########9| 19924/20000 [01:20<00:00, 201.34it/s]\n",
+      "100%|#########9| 19946/20000 [01:20<00:00, 205.53it/s]\n",
+      "100%|#########9| 19967/20000 [01:20<00:00, 205.63it/s]\n",
+      "100%|#########9| 19988/20000 [01:20<00:00, 203.92it/s]\n",
+      "100%|##########| 20000/20000 [01:20<00:00, 247.89it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python learn_bpe.py -s 20000 -i dataset/output.txt -o dataset/codec.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "68a4113a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!apply_bpe.py -i ./dataset/output.txt -o ./dataset/output_dataset.txt -c ./dataset/codec.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "06254f0d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vocabulary size: 20217\n"
+     ]
+    }
+   ],
+   "source": [
+    "def count_tokens(file_path):\n",
+    "    try:\n",
+    "        with open(file_path, 'r', encoding='utf-8') as file:\n",
+    "            text = file.read()\n",
+    "            # Split the text into tokens based on spaces\n",
+    "            tokens = text.split()\n",
+    "            # Count the vocabulary size (number of unique tokens)\n",
+    "            vocabulary_size = len(set(tokens))\n",
+    "            return vocabulary_size\n",
+    "    except IOError:\n",
+    "        print(f\"Error: Could not open or read the file '{file_path}'\")\n",
+    "        return -1\n",
+    "\n",
+    "# Example usage\n",
+    "file_path = './dataset/output_dataset.txt'  # Replace with the actual file path\n",
+    "vocabulary_size = count_tokens(file_path)\n",
+    "if vocabulary_size != -1:\n",
+    "    print(f\"Vocabulary size: {vocabulary_size}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

subword/__init__.py ADDED Viewed

File without changes

subword/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (166 Bytes). View file

subword/__pycache__/apply_bpe.cpython-39.pyc ADDED Viewed

Binary file (13.4 kB). View file

subword/apply_bpe.py ADDED Viewed

	@@ -0,0 +1,457 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+"""Use operations learned with learn_bpe.py to encode a new text.
+The text will not be smaller, but use only a fixed vocabulary, with rare words
+encoded as variable-length sequences of subword units.
+Reference:
+Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
+Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
+"""
+from __future__ import unicode_literals, division
+import sys
+import os
+import inspect
+import codecs
+import io
+import argparse
+import re
+import warnings
+import random
+import tempfile
+from multiprocessing import Pool, cpu_count
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
class BPE(object):
    """Segment text into subword units using learned BPE merge operations.

    Attributes set by ``__init__``:
      version           -- version tuple parsed from a '#version:' header,
                           or (0, 1) when the codes file has no header
      bpe_codes         -- dict mapping a symbol pair to its rank (line index)
      bpe_codes_reverse -- dict mapping a merged symbol back to its pair
      separator         -- suffix appended to every non-final subword unit
      vocab             -- optional vocabulary passed through to ``encode``
      glossaries        -- list of glossary patterns (empty list if none)
      glossaries_regex  -- compiled '^(a|b|...)$' pattern, or None
      cache             -- dict shared with ``encode`` for memoization
    """

    def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):
        """Read merge operations from the seekable text stream `codes`.

        `merges` limits how many operations are kept (-1 keeps all of them).
        Exits the process with status 1 if a line does not hold exactly two
        space-separated units.
        """
        codes.seek(0)
        offset = 1

        # check version information
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$', '', firstline.split()[-1]).split(".")])
            offset += 1
        else:
            # no header: rewind so the first line is parsed as a merge operation
            self.version = (0, 1)
            codes.seek(0)

        self.bpe_codes = [tuple(item.strip('\r\n ').split(' '))
                          for (n, item) in enumerate(codes.read().rstrip('\n').split('\n'))
                          if (n < merges or merges == -1)]

        for i, item in enumerate(self.bpe_codes):
            if len(item) != 2:
                sys.stderr.write('Error: invalid line {0} in BPE codes file: {1}\n'.format(i+offset, ' '.join(item)))
                sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n')
                sys.exit(1)

        # some hacking to deal with duplicates (only consider first instance):
        # iterating in reverse lets the lowest index overwrite later ones
        self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])

        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair, i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        self.glossaries_regex = re.compile('^({})$'.format('|'.join(glossaries))) if glossaries else None

        self.cache = {}

    def process_lines(self, filename, outfile, dropout=0, num_workers=1):
        """Segment every line of `filename` and write the result to `outfile`.

        With ``num_workers > 1`` the input is split into byte ranges that are
        processed by a ``multiprocessing.Pool``; each worker writes to its own
        temporary file and the pieces are concatenated into `outfile` in order.

        Raises ValueError if `num_workers` is not positive, and re-raises any
        exception that occurred inside a worker process. On Python 2 a message
        is printed and the process exits with status 1.
        """
        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        if num_workers == 1:
            _process_lines(self, filename, outfile, dropout, 0, 0)
        elif num_workers > 1:
            with open(filename, encoding="utf-8") as f:
                size = os.fstat(f.fileno()).st_size
                chunk_size = int(size / num_workers)
                # offsets[i] is where worker i starts; the trailing 0 means
                # "no upper bound" for the last worker (see _process_lines)
                offsets = [0 for _ in range(num_workers + 1)]
                for i in range(1, num_workers):
                    f.seek(chunk_size * i)
                    pos = f.tell()
                    while True:
                        try:
                            # consume the rest of the current line so the
                            # chunk boundary falls on a line start
                            f.readline()
                            break
                        except UnicodeDecodeError:
                            # landed inside a multi-byte character: step back
                            pos -= 1
                            f.seek(pos)
                    offsets[i] = f.tell()
                    assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"

            res_files = []
            results = []
            pool = Pool(processes=num_workers)
            try:
                for i in range(num_workers):
                    tmp = tempfile.NamedTemporaryFile(delete=False)
                    tmp.close()
                    res_files.append(tmp)
                    results.append(pool.apply_async(
                        _process_lines,
                        (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1])))
                pool.close()
                pool.join()
                # apply_async stores worker exceptions in the result object;
                # get() re-raises them instead of leaving truncated output
                for result in results:
                    result.get()
                for tmp in res_files:
                    with open(tmp.name, encoding="utf-8") as fi:
                        for line in fi:
                            outfile.write(line)
            finally:
                pool.terminate()
                # always clean up the per-worker temporary files
                for tmp in res_files:
                    if os.path.exists(tmp.name):
                        os.remove(tmp.name)
        else:
            raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))

    def process_line(self, line, dropout=0):
        """segment line, dealing with leading and trailing whitespace"""

        out = ""

        leading_whitespace = len(line)-len(line.lstrip('\r\n '))
        if leading_whitespace:
            out += line[:leading_whitespace]

        out += self.segment(line, dropout)

        trailing_whitespace = len(line)-len(line.rstrip('\r\n '))
        # a whitespace-only line was already copied in full as leading whitespace
        if trailing_whitespace and trailing_whitespace != len(line):
            out += line[-trailing_whitespace:]

        return out

    def segment(self, sentence, dropout=0):
        """segment single sentence (whitespace-tokenized string) with BPE encoding"""
        segments = self.segment_tokens(sentence.strip('\r\n ').split(' '), dropout)
        return ' '.join(segments)

    def segment_tokens(self, tokens, dropout=0):
        """segment a sequence of tokens with BPE encoding

        Returns a flat list of subword strings; every unit except the last one
        of each word carries `self.separator` as a suffix.
        """
        output = []
        for word in tokens:
            # eliminate double spaces
            if not word:
                continue
            new_word = [out for segment in self._isolate_glossaries(word)
                        for out in encode(segment,
                                          self.bpe_codes,
                                          self.bpe_codes_reverse,
                                          self.vocab,
                                          self.separator,
                                          self.version,
                                          self.cache,
                                          self.glossaries_regex,
                                          dropout)]

            for item in new_word[:-1]:
                output.append(item + self.separator)
            output.append(new_word[-1])

        return output

    def _isolate_glossaries(self, word):
        """Split `word` so that every glossary match becomes its own segment."""
        word_segments = [word]
        for gloss in self.glossaries:
            word_segments = [out_segments for segment in word_segments
                                 for out_segments in isolate_glossary(segment, gloss)]
        return word_segments
def _process_lines(bpe, filename, outfile, dropout, begin, end):
    """Segment the lines of `filename` between two offsets and write them out.

    bpe      -- object providing ``process_line(line, dropout)``
    filename -- path of the UTF-8 input file
    outfile  -- either a path (opened and closed here) or an already open
                writable object (left open for the caller)
    begin    -- position passed to ``seek`` before reading starts
    end      -- stop once the position after a line exceeds this value;
                a value of 0 (or less) means read until end of file
    """
    owns_output = isinstance(outfile, str)
    fo = open(outfile, "w", encoding="utf-8") if owns_output else outfile
    try:
        with open(filename, encoding="utf-8") as f:
            f.seek(begin)
            line = f.readline()
            while line:
                pos = f.tell()
                assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
                # the line just read ends past this worker's range: it
                # belongs to the next chunk
                if end > 0 and pos > end:
                    break
                fo.write(bpe.process_line(line, dropout))
                line = f.readline()
    finally:
        # close the output even if reading or segmentation fails
        if owns_output:
            fo.close()
def create_parser(subparsers=None):
    """Build the command-line parser for applying BPE.

    When `subparsers` (the object returned by ``add_subparsers()``) is given,
    the parser is registered there as the 'apply-bpe' sub-command; otherwise a
    stand-alone ``argparse.ArgumentParser`` is created. Returns the parser.
    """

    if subparsers:
        parser = subparsers.add_parser('apply-bpe',
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="apply BPE-based word segmentation")
    else:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="apply BPE-based word segmentation")

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    parser.add_argument(
        '--codes', '-c', type=argparse.FileType('r'), metavar='PATH',
        required=True,
        help="File with BPE codes (created by learn_bpe.py).")
    parser.add_argument(
        '--merges', '-m', type=int, default=-1,
        metavar='INT',
        help="Use this many BPE operations (<= number of learned symbols) "+
             "default: Apply all the learned merge operations")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    parser.add_argument(
        '--separator', '-s', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    parser.add_argument(
        '--vocabulary', type=argparse.FileType('r'), default=None,
        metavar="PATH",
        help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.")
    parser.add_argument(
        '--vocabulary-threshold', type=int, default=None,
        metavar="INT",
        help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV")
    parser.add_argument(
        '--dropout', type=float, default=0,
        metavar="P",
        help="Dropout BPE merge operations with probability P (Provilkov et al., 2019). Use this on training data only.")
    parser.add_argument(
        '--glossaries', type=str, nargs='+', default=None,
        metavar="STR",
        help="Glossaries. Words matching any of the words/regex provided in glossaries will not be affected "+
             "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords). "+
             "Can be provided as a list of words/regex after the --glossaries argument. Enclose each regex in quotes.")
    parser.add_argument(
        '--seed', type=int, default=None,
        metavar="S",
        help="Random seed for the random number generators (e.g. for BPE dropout with --dropout).")
    parser.add_argument(
        '--num-workers', type=int, default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")

    return parser
def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries_regex=None, dropout=0):
    """Encode word based on list of BPE merge operations, which are applied consecutively

    orig              -- the word to segment
    bpe_codes         -- dict mapping a symbol pair to its rank; lower ranks merge first
    bpe_codes_reverse -- dict mapping a merged symbol to its pair (used with `vocab`)
    vocab             -- if truthy, the result is passed through check_vocab_and_split
    separator         -- forwarded to check_vocab_and_split
    version           -- (0, 1) or (0, 2); selects the end-of-word handling
    cache             -- dict used to memoize results per word
    glossaries_regex  -- compiled pattern; matching words are returned unsplit
    dropout           -- probability of skipping a candidate merge

    Returns the sequence of subword units without separators or end-of-word
    markers. A one-character word is returned as-is (the string itself).
    Raises NotImplementedError for any other `version`.
    """

    if not dropout and orig in cache:
        return cache[orig]

    if glossaries_regex and glossaries_regex.match(orig):
        cache[orig] = (orig,)
        return (orig,)

    if len(orig) == 1:
        return orig

    if version == (0, 1):
        word = list(orig) + ['</w>']
    elif version == (0, 2): # more consistent handling of word-final segments
        word = list(orig[:-1]) + [orig[-1] + '</w>']
    else:
        raise NotImplementedError

    while len(word) > 1:

        # get list of symbol pairs; optionally apply dropout
        pairs = [(bpe_codes[pair],i,pair) for (i,pair) in enumerate(zip(word, word[1:])) if (not dropout or random.random() > dropout) and pair in bpe_codes]

        if not pairs:
            break

        #get first merge operation in list of BPE codes
        bigram = min(pairs)[2]

        # find start position of all pairs that we want to merge
        positions = [i for (rank,i,pair) in pairs if pair == bigram]

        i = 0
        new_word = []
        bigram = ''.join(bigram)
        for j in positions:
            # merges are invalid if they start before current position. This can happen if there are overlapping pairs: (x x x -> xx x)
            if j < i:
                continue
            new_word.extend(word[i:j]) # all symbols before merged pair
            new_word.append(bigram) # merged pair
            i = j+2 # continue after merged pair
        new_word.extend(word[i:]) # add all symbols until end of word
        word = new_word

    # don't print end-of-word symbols
    if word[-1] == '</w>':
        word = word[:-1]
    elif word[-1].endswith('</w>'):
        word[-1] = word[-1][:-4]

    word = tuple(word)
    if vocab:
        word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator)

    # A segmentation produced under dropout is random; storing it would make
    # later dropout-free calls return it from the cache.
    if not dropout:
        cache[orig] = word
    return word
def recursive_split(segment, bpe_codes, vocab, separator, final=False):
    """Recursively split segment into smaller units (by reversing BPE merges)
    until all units are either in-vocabulary, or cannot be split further.

    segment   -- the subword unit to split
    bpe_codes -- dict mapping a merged symbol to the pair it was built from
    vocab     -- container of known units; non-final units are looked up
                 with `separator` appended
    final     -- True if `segment` is the last unit of its word, in which
                 case the merge is looked up with the '</w>' marker appended

    Yields the resulting units in order.
    """

    try:
        if final:
            left, right = bpe_codes[segment + '</w>']
            # drop the end-of-word marker from the right-hand part
            right = right[:-4]
        else:
            left, right = bpe_codes[segment]
    except KeyError:
        # no merge produced this segment, so it cannot be split further
        #sys.stderr.write('cannot split {0} further.\n'.format(segment))
        yield segment
        return

    if left + separator in vocab:
        yield left
    else:
        for item in recursive_split(left, bpe_codes, vocab, separator, False):
            yield item

    if (final and right in vocab) or (not final and right + separator in vocab):
        yield right
    else:
        for item in recursive_split(right, bpe_codes, vocab, separator, final):
            yield item
def check_vocab_and_split(orig, bpe_codes, vocab, separator):
    """Return the units of `orig` with every out-of-vocabulary unit broken up.

    Non-final units are looked up in `vocab` with `separator` appended, the
    final unit is looked up as-is. Units that are missing are expanded with
    recursive_split, which reverses BPE merges. The result is a list.
    """
    pieces = []

    # every unit except the last carries the separator in the vocabulary
    for unit in orig[:-1]:
        if unit + separator not in vocab:
            #sys.stderr.write('OOV: {0}\n'.format(unit))
            pieces.extend(recursive_split(unit, bpe_codes, vocab, separator, False))
        else:
            pieces.append(unit)

    # the word-final unit is stored without separator
    last = orig[-1]
    if last not in vocab:
        #sys.stderr.write('OOV: {0}\n'.format(last))
        pieces.extend(recursive_split(last, bpe_codes, vocab, separator, True))
    else:
        pieces.append(last)

    return pieces
def read_vocabulary(vocab_file, threshold):
    """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold.

    vocab_file -- iterable of lines, each holding a word and an integer
                  frequency separated by a single space
    threshold  -- minimum frequency for a word to be kept; None keeps all

    Returns the set of retained words. Blank lines are skipped.
    """

    vocabulary = set()

    for line in vocab_file:
        entry = line.strip('\r\n ')
        if not entry:
            # tolerate empty lines (e.g. a trailing newline at end of file)
            continue
        word, freq = entry.split(' ')
        freq = int(freq)
        if threshold is None or freq >= threshold:
            vocabulary.add(word)

    return vocabulary
def isolate_glossary(word, glossary):
    """
    Cut `word` into pieces so that each occurrence of `glossary` stands alone.

    `glossary` is used as a regular expression. The word is returned unchanged
    (as a one-element list) when it matches the glossary as a whole or does not
    contain it at all. Example: glossary 'USA' and word '1934USABUSA' give
        ['1934', 'USA', 'B', 'USA']
    """
    # regex equivalent of (if word == glossary or glossary not in word)
    whole_match = re.match('^'+glossary+'$', word)
    if whole_match or not re.search(glossary, word):
        return [word]

    # the capture group makes re.split keep the glossary matches themselves
    parts = re.split(r'({})'.format(glossary), word)
    tail = parts.pop()
    kept = [part for part in parts if part]  # Remove empty strings in regex group.
    if tail != '':
        kept.append(tail.strip('\r\n '))
    return kept
if __name__ == '__main__':
    # Command-line entry point: parse arguments, load the BPE codes (and the
    # optional vocabulary), then segment the input line by line or in parallel.

    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)

    parser = create_parser()
    args = parser.parse_args()

    if args.num_workers <= 0:
        args.num_workers = cpu_count()

    # read/write files as UTF-8
    # argparse has already opened these paths; close its handles before
    # reopening them with an explicit encoding so they are not leaked
    codes_path = args.codes.name
    args.codes.close()
    args.codes = codecs.open(codes_path, encoding='utf-8')
    if args.input.name != '<stdin>':
        input_path = args.input.name
        args.input.close()
        args.input = codecs.open(input_path, encoding='utf-8')
    if args.output.name != '<stdout>':
        output_path = args.output.name
        args.output.close()
        args.output = codecs.open(output_path, 'w', encoding='utf-8')
    if args.vocabulary:
        vocab_path = args.vocabulary.name
        args.vocabulary.close()
        args.vocabulary = codecs.open(vocab_path, encoding='utf-8')
        vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
    else:
        vocabulary = None

    if sys.version_info < (3, 0):
        args.separator = args.separator.decode('UTF-8')
        if args.glossaries:
            args.glossaries = [g.decode('UTF-8') for g in args.glossaries]
        if args.num_workers > 1:
            args.num_workers = 1
            warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")

    if args.seed is not None:
        random.seed(args.seed)

    bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

    if args.input.name == '<stdin>' or args.num_workers == 1:
        if args.num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for line in args.input:
            args.output.write(bpe.process_line(line, args.dropout))
    else:
        # workers reopen the input by name; results are merged into args.output
        bpe.process_lines(args.input.name, args.output, args.dropout, args.num_workers)

    # close files
    args.codes.close()
    if args.input.name != '<stdin>':
        args.input.close()
    if args.output.name != '<stdout>':
        args.output.close()
    if args.vocabulary:
        args.vocabulary.close()

subword/bpe_toy.py ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
+Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
+of a text to a configurable number of symbols, with only a small increase in the number of tokens.
+This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets,
+indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py).
+Reference:
+Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
+Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
+"""
+import re
+import sys
+import collections
def get_stats(vocab):
  """Count adjacent symbol pairs over all words, weighted by word frequency.

  `vocab` maps a space-separated symbol sequence to its frequency; the result
  maps each (left, right) symbol pair to the summed frequency.
  """
  pair_counts = collections.defaultdict(int)
  for entry, count in vocab.items():
    units = entry.split()
    # walk over neighbouring symbols
    for left, right in zip(units, units[1:]):
      pair_counts[left, right] += count
  return pair_counts
def merge_vocab(pair, v_in):
  """Return a copy of `v_in` in which every occurrence of `pair` is merged.

  Keys are space-separated symbol sequences; the two symbols of `pair` are
  joined into one symbol wherever they appear next to each other as whole
  symbols. Frequencies are carried over unchanged.
  """
  merged = ''.join(pair)
  escaped = re.escape(' '.join(pair))
  # the lookarounds ensure the match is delimited by whitespace or string ends
  matcher = re.compile(r'(?<!\S)' + escaped + r'(?!\S)')
  return {matcher.sub(merged, entry): count for entry, count in v_in.items()}
# Toy corpus: each key is a word split into symbols, each value its frequency.
vocab = {
  'l o w</w>': 5,
  'l o w e r</w>': 2,
  'n e w e s t</w>': 6,
  'w i d e s t</w>': 3,
}
num_merges = 15

# Repeatedly merge the most frequent adjacent symbol pair and print it.
for i in range(num_merges):
  pairs = get_stats(vocab)
  if not pairs:
    # nothing left to merge
    break
  best = max(pairs, key=pairs.get)
  if pairs[best] < 2:
    sys.stderr.write('no pair has frequency > 1. Stopping\n')
    break
  vocab = merge_vocab(best, vocab)
  print(best)

subword/chrF.py ADDED Viewed

	@@ -0,0 +1,139 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+"""Compute chrF3 for machine translation evaluation
+Reference:
+Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translation, pages 392–395, Lisbon, Portugal.
+"""
+from __future__ import print_function, unicode_literals, division
+import sys
+import codecs
+import io
+import argparse
+from collections import defaultdict
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
def create_parser():
    """Build the command-line parser for the chrF scoring script.

    Returns:
        An ``argparse.ArgumentParser`` accepting:
          --ref/-r     reference file (required, opened for reading)
          --hyp        hypothesis file (defaults to standard input)
          --beta/-b    float beta parameter (default 3)
          --ngram/-n   int n-gram order (default 6)
          --space/-s   flag: take spaces into account
          --precision  flag: report precision
          --recall     flag: report recall
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Describe this script; the previous text was copied from learn_bpe.
        description="compute chrF (character n-gram F-score) for machine translation evaluation")
    parser.add_argument(
        '--ref', '-r', type=argparse.FileType('r'), required=True,
        metavar='PATH',
        help="Reference file")
    parser.add_argument(
        '--hyp', type=argparse.FileType('r'), metavar='PATH',
        default=sys.stdin,
        help="Hypothesis file (default: stdin).")
    parser.add_argument(
        '--beta', '-b', type=float, default=3,
        metavar='FLOAT',
        help="beta parameter (default: '%(default)s')")
    parser.add_argument(
        '--ngram', '-n', type=int, default=6,
        metavar='INT',
        help="ngram order (default: '%(default)s')")
    parser.add_argument(
        '--space', '-s', action='store_true',
        help="take spaces into account (default: '%(default)s')")
    parser.add_argument(
        '--precision', action='store_true',
        help="report precision (default: '%(default)s')")
    parser.add_argument(
        '--recall', action='store_true',
        help="report recall (default: '%(default)s')")
    return parser
def extract_ngrams(words, max_length=4, spaces=False):
    """Count the character n-grams of a line for orders 1..max_length.

    Args:
        words: the input line (a string).
        max_length: highest n-gram order to extract.
        spaces: if false, all whitespace is removed before counting; if true,
            only leading/trailing whitespace is stripped.

    Returns:
        A nested defaultdict ``result[order - 1][ngram] -> count`` where
        ``ngram`` is a tuple of characters. Orders longer than the text get
        no entry.
    """
    text = words.strip() if spaces else ''.join(words.split())
    counts = defaultdict(lambda: defaultdict(int))
    text_len = len(text)
    for order in range(1, max_length + 1):
        # An empty range (order > text_len) leaves this order without an entry.
        for begin in range(text_len - order + 1):
            counts[order - 1][tuple(text[begin:begin + order])] += 1
    return counts
def get_correct(ngrams_ref, ngrams_test, correct, total):
    """Accumulate matched and total hypothesis n-gram counts per order.

    Args:
        ngrams_ref: reference counts, ``ngrams_ref[rank][ngram] -> count``.
        ngrams_test: hypothesis counts in the same layout.
        correct: list indexed by rank; incremented in place by the clipped
            match count ``min(hypothesis count, reference count)``.
        total: list indexed by rank; incremented in place by the hypothesis
            count of every n-gram.

    Returns:
        The same ``(correct, total)`` list objects that were passed in.
    """
    for rank, test_counts in ngrams_test.items():
        # .get() avoids inserting empty entries into a defaultdict reference
        # (and a KeyError on a plain dict) for orders the reference lacks.
        ref_counts = ngrams_ref.get(rank, {})
        for chain, count in test_counts.items():
            total[rank] += count
            if chain in ref_counts:
                correct[rank] += min(count, ref_counts[chain])
    return correct, total
def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0):
    """Compute the F-beta score from per-order n-gram counts.

    Precision and recall are computed per n-gram order and averaged over
    ``max_length`` orders; orders whose (smoothed) hypothesis or reference
    total is zero contribute nothing to either average.

    Args:
        correct: matched n-gram count per order.
        total_hyp: hypothesis n-gram count per order.
        total_ref: reference n-gram count per order.
        max_length: number of orders to average over.
        beta: weight of recall relative to precision.
        smooth: constant added to every count.

    Returns:
        A tuple ``(f_score, precision, recall)``. The F-score is 0.0 when its
        denominator is zero (precision and recall both zero).
    """
    precision = 0
    recall = 0
    for i in range(max_length):
        hyp_count = total_hyp[i] + smooth
        ref_count = total_ref[i] + smooth
        if hyp_count and ref_count:
            matched = correct[i] + smooth
            precision += matched / hyp_count
            recall += matched / ref_count
    precision /= max_length
    recall /= max_length
    denominator = (beta**2 * precision) + recall
    if not denominator:
        # No overlap at all (or empty input): avoid ZeroDivisionError and
        # report a score of zero.
        return 0.0, precision, recall
    return (1 + beta**2) * (precision * recall) / denominator, precision, recall
def main(args):
    """Score a hypothesis stream against a reference stream and print chrF.

    Reads ``args.ref`` line by line, pairing each line with the next line
    read from ``args.hyp``, accumulates n-gram statistics up to order
    ``args.ngram`` and prints the resulting score. Precision and recall are
    printed as well when ``args.precision`` / ``args.recall`` are set.
    """
    order = args.ngram
    correct = [0] * order
    total = [0] * order
    total_ref = [0] * order
    for ref_line in args.ref:
        hyp_line = args.hyp.readline()
        ngrams_ref = extract_ngrams(ref_line, max_length=order, spaces=args.space)
        ngrams_test = extract_ngrams(hyp_line, max_length=order, spaces=args.space)
        get_correct(ngrams_ref, ngrams_test, correct, total)
        # Reference totals per order, needed for recall.
        for rank, chains in ngrams_ref.items():
            total_ref[rank] += sum(chains.values())
    chrf, precision, recall = f1(correct, total, total_ref, order, args.beta)
    print('chrF3: {0:.4f}'.format(chrf))
    if args.precision:
        print('chrPrec: {0:.4f}'.format(precision))
    if args.recall:
        print('chrRec: {0:.4f}'.format(recall))
if __name__ == '__main__':
    # Force UTF-8 on the standard streams under both Python 2 and Python 3.
    if sys.version_info >= (3, 0):
        # Python 3: re-wrap the underlying binary buffers.
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)
    else:
        # Python 2: wrap the byte streams with codec readers/writers.
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    # Parse the command line only after the streams are wrapped.
    main(create_parser().parse_args())

subword/dataset/codec.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

subword/encoding.ipynb ADDED Viewed

	@@ -0,0 +1,700 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "9644db35",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "  0%|          | 0/20000 [00:00<?, ?it/s]\n",
+      "  0%|          | 1/20000 [00:00<38:40,  8.62it/s]\n",
+      "  0%|          | 2/20000 [00:00<1:31:59,  3.62it/s]\n",
+      "  0%|          | 3/20000 [00:00<1:21:11,  4.11it/s]\n",
+      "  0%|          | 4/20000 [00:01<1:48:20,  3.08it/s]\n",
+      "  0%|          | 6/20000 [00:01<1:03:27,  5.25it/s]\n",
+      "  0%|          | 7/20000 [00:01<1:12:17,  4.61it/s]\n",
+      "  0%|          | 8/20000 [00:01<1:10:13,  4.74it/s]\n",
+      "  0%|          | 10/20000 [00:02<1:09:39,  4.78it/s]\n",
+      "  0%|          | 13/20000 [00:02<42:59,  7.75it/s]  \n",
+      "  0%|          | 16/20000 [00:02<30:25, 10.95it/s]\n",
+      "  0%|          | 19/20000 [00:02<24:50, 13.41it/s]\n",
+      "  0%|          | 21/20000 [00:03<36:58,  9.01it/s]\n",
+      "  0%|          | 23/20000 [00:03<35:20,  9.42it/s]\n",
+      "  0%|          | 25/20000 [00:03<31:30, 10.56it/s]\n",
+      "  0%|          | 27/20000 [00:03<33:12, 10.03it/s]\n",
+      "  0%|          | 29/20000 [00:03<33:30,  9.93it/s]\n",
+      "  0%|          | 31/20000 [00:03<35:06,  9.48it/s]\n",
+      "  0%|          | 33/20000 [00:04<37:03,  8.98it/s]\n",
+      "  0%|          | 37/20000 [00:04<26:59, 12.32it/s]\n",
+      "  0%|          | 39/20000 [00:04<26:54, 12.37it/s]\n",
+      "  0%|          | 42/20000 [00:04<22:51, 14.55it/s]\n",
+      "  0%|          | 46/20000 [00:04<19:15, 17.27it/s]\n",
+      "  0%|          | 48/20000 [00:05<22:34, 14.73it/s]\n",
+      "  0%|          | 50/20000 [00:05<23:39, 14.06it/s]\n",
+      "  0%|          | 52/20000 [00:05<23:30, 14.14it/s]\n",
+      "  0%|          | 55/20000 [00:05<20:12, 16.45it/s]\n",
+      "  0%|          | 58/20000 [00:05<19:09, 17.35it/s]\n",
+      "  0%|          | 60/20000 [00:05<19:25, 17.11it/s]\n",
+      "  0%|          | 63/20000 [00:06<28:45, 11.56it/s]\n",
+      "  0%|          | 70/20000 [00:06<16:14, 20.45it/s]\n",
+      "  0%|          | 75/20000 [00:06<14:15, 23.28it/s]\n",
+      "  0%|          | 78/20000 [00:06<14:28, 22.94it/s]\n",
+      "  0%|          | 81/20000 [00:06<16:06, 20.62it/s]\n",
+      "  0%|          | 85/20000 [00:06<15:13, 21.81it/s]\n",
+      "  0%|          | 88/20000 [00:07<15:39, 21.20it/s]\n",
+      "  0%|          | 92/20000 [00:07<13:28, 24.61it/s]\n",
+      "  0%|          | 95/20000 [00:07<13:32, 24.50it/s]\n",
+      "  0%|          | 99/20000 [00:07<12:06, 27.40it/s]\n",
+      "  1%|          | 102/20000 [00:07<12:55, 25.65it/s]\n",
+      "  1%|          | 105/20000 [00:07<12:46, 25.95it/s]\n",
+      "  1%|          | 112/20000 [00:07<12:02, 27.54it/s]\n",
+      "  1%|          | 118/20000 [00:08<10:02, 33.00it/s]\n",
+      "  1%|          | 122/20000 [00:08<10:12, 32.46it/s]\n",
+      "  1%|          | 127/20000 [00:08<10:07, 32.73it/s]\n",
+      "  1%|          | 138/20000 [00:08<06:56, 47.66it/s]\n",
+      "  1%|          | 144/20000 [00:08<06:47, 48.73it/s]\n",
+      "  1%|          | 150/20000 [00:08<07:00, 47.21it/s]\n",
+      "  1%|          | 156/20000 [00:08<06:44, 49.01it/s]\n",
+      "  1%|          | 162/20000 [00:08<06:47, 48.71it/s]\n",
+      "  1%|          | 169/20000 [00:09<06:38, 49.81it/s]\n",
+      "  1%|          | 176/20000 [00:09<06:28, 51.03it/s]\n",
+      "  1%|          | 184/20000 [00:09<05:58, 55.22it/s]\n",
+      "  1%|          | 190/20000 [00:09<08:05, 40.78it/s]\n",
+      "  1%|          | 197/20000 [00:09<07:30, 43.91it/s]\n",
+      "  1%|1         | 202/20000 [00:09<07:53, 41.79it/s]\n",
+      "  1%|1         | 210/20000 [00:10<06:51, 48.15it/s]\n",
+      "  1%|1         | 220/20000 [00:10<05:35, 59.01it/s]\n",
+      "  1%|1         | 229/20000 [00:10<05:05, 64.72it/s]\n",
+      "  1%|1         | 236/20000 [00:10<05:37, 58.47it/s]\n",
+      "  1%|1         | 244/20000 [00:10<05:22, 61.18it/s]\n",
+      "  1%|1         | 251/20000 [00:10<05:31, 59.65it/s]\n",
+      "  1%|1         | 259/20000 [00:10<05:11, 63.33it/s]\n",
+      "  1%|1         | 266/20000 [00:10<05:25, 60.70it/s]\n",
+      "  1%|1         | 273/20000 [00:10<05:31, 59.42it/s]\n",
+      "  1%|1         | 282/20000 [00:11<04:57, 66.26it/s]\n",
+      "  1%|1         | 289/20000 [00:11<05:00, 65.52it/s]\n",
+      "  1%|1         | 296/20000 [00:11<05:10, 63.47it/s]\n",
+      "  2%|1         | 303/20000 [00:11<07:06, 46.23it/s]\n",
+      "  2%|1         | 313/20000 [00:11<05:41, 57.60it/s]\n",
+      "  2%|1         | 324/20000 [00:11<04:54, 66.87it/s]\n",
+      "  2%|1         | 335/20000 [00:11<04:16, 76.56it/s]\n",
+      "  2%|1         | 346/20000 [00:11<03:50, 85.09it/s]\n",
+      "  2%|1         | 357/20000 [00:12<03:38, 90.06it/s]\n",
+      "  2%|1         | 367/20000 [00:12<03:46, 86.85it/s]\n",
+      "  2%|1         | 377/20000 [00:12<03:41, 88.70it/s]\n",
+      "  2%|1         | 387/20000 [00:12<03:43, 87.95it/s]\n",
+      "  2%|1         | 396/20000 [00:12<03:49, 85.48it/s]\n",
+      "  2%|2         | 405/20000 [00:12<04:04, 80.15it/s]\n",
+      "  2%|2         | 416/20000 [00:12<03:42, 87.82it/s]\n",
+      "  2%|2         | 429/20000 [00:12<03:19, 98.03it/s]\n",
+      "  2%|2         | 439/20000 [00:13<03:35, 90.74it/s]\n",
+      "  2%|2         | 450/20000 [00:13<03:24, 95.65it/s]\n",
+      "  2%|2         | 462/20000 [00:13<03:13, 100.98it/s]\n",
+      "  2%|2         | 473/20000 [00:13<03:16, 99.60it/s] \n",
+      "  2%|2         | 484/20000 [00:13<03:28, 93.64it/s]\n",
+      "  2%|2         | 494/20000 [00:13<03:30, 92.86it/s]\n",
+      "  3%|2         | 504/20000 [00:13<04:34, 70.99it/s]\n",
+      "  3%|2         | 520/20000 [00:13<03:34, 90.96it/s]\n",
+      "  3%|2         | 534/20000 [00:14<03:12, 101.38it/s]\n",
+      "  3%|2         | 547/20000 [00:14<03:01, 107.03it/s]\n",
+      "  3%|2         | 559/20000 [00:14<03:02, 106.25it/s]\n",
+      "  3%|2         | 571/20000 [00:14<03:14, 99.81it/s] \n",
+      "  3%|2         | 582/20000 [00:14<03:13, 100.11it/s]\n",
+      "  3%|2         | 595/20000 [00:14<03:03, 105.49it/s]\n",
+      "  3%|3         | 606/20000 [00:14<03:07, 103.63it/s]\n",
+      "  3%|3         | 625/20000 [00:14<02:33, 126.08it/s]\n",
+      "  3%|3         | 643/20000 [00:14<02:17, 140.29it/s]\n",
+      "  3%|3         | 658/20000 [00:15<02:23, 135.01it/s]\n",
+      "  3%|3         | 672/20000 [00:15<02:32, 126.59it/s]\n",
+      "  3%|3         | 685/20000 [00:15<02:42, 119.19it/s]\n",
+      "  3%|3         | 698/20000 [00:15<02:46, 116.22it/s]\n",
+      "  4%|3         | 710/20000 [00:15<02:49, 113.91it/s]\n",
+      "  4%|3         | 727/20000 [00:15<02:31, 127.58it/s]\n",
+      "  4%|3         | 744/20000 [00:15<02:18, 139.24it/s]\n",
+      "  4%|3         | 759/20000 [00:15<03:10, 101.19it/s]\n",
+      "  4%|3         | 771/20000 [00:16<03:03, 104.67it/s]\n",
+      "  4%|3         | 783/20000 [00:16<03:02, 105.07it/s]\n",
+      "  4%|3         | 795/20000 [00:16<03:14, 98.92it/s] \n",
+      "  4%|4         | 807/20000 [00:16<03:06, 102.82it/s]\n",
+      "  4%|4         | 822/20000 [00:16<02:50, 112.34it/s]\n",
+      "  4%|4         | 834/20000 [00:16<02:55, 109.45it/s]\n",
+      "  4%|4         | 847/20000 [00:16<02:47, 114.28it/s]\n",
+      "  4%|4         | 860/20000 [00:16<02:42, 117.94it/s]\n",
+      "  4%|4         | 873/20000 [00:16<02:46, 114.58it/s]\n",
+      "  4%|4         | 885/20000 [00:17<02:58, 106.97it/s]\n",
+      "  4%|4         | 896/20000 [00:17<03:07, 102.08it/s]\n",
+      "  5%|4         | 908/20000 [00:17<03:01, 105.42it/s]\n",
+      "  5%|4         | 924/20000 [00:17<02:42, 117.67it/s]\n",
+      "  5%|4         | 940/20000 [00:17<02:28, 128.26it/s]\n",
+      "  5%|4         | 954/20000 [00:17<02:24, 131.54it/s]\n",
+      "  5%|4         | 968/20000 [00:17<02:34, 123.37it/s]\n",
+      "  5%|4         | 982/20000 [00:17<02:31, 125.85it/s]\n",
+      "  5%|4         | 995/20000 [00:18<02:39, 119.06it/s]\n",
+      "  5%|5         | 1008/20000 [00:18<03:38, 86.92it/s]\n",
+      "  5%|5         | 1024/20000 [00:18<03:04, 102.72it/s]\n",
+      "  5%|5         | 1041/20000 [00:18<02:40, 118.03it/s]\n",
+      "  5%|5         | 1055/20000 [00:18<02:34, 122.63it/s]\n",
+      "  5%|5         | 1069/20000 [00:18<02:31, 124.89it/s]\n",
+      "  5%|5         | 1083/20000 [00:18<02:34, 122.68it/s]\n",
+      "  5%|5         | 1096/20000 [00:18<02:39, 118.60it/s]\n",
+      "  6%|5         | 1110/20000 [00:19<02:32, 123.65it/s]\n",
+      "  6%|5         | 1127/20000 [00:19<02:18, 136.02it/s]\n",
+      "  6%|5         | 1145/20000 [00:19<02:08, 146.74it/s]\n",
+      "  6%|5         | 1161/20000 [00:19<02:06, 148.82it/s]\n",
+      "  6%|5         | 1177/20000 [00:19<02:04, 151.15it/s]\n",
+      "  6%|5         | 1193/20000 [00:19<02:09, 145.70it/s]\n",
+      "  6%|6         | 1208/20000 [00:19<02:10, 144.47it/s]\n",
+      "  6%|6         | 1227/20000 [00:19<02:00, 156.04it/s]\n",
+      "  6%|6         | 1244/20000 [00:19<01:57, 159.13it/s]\n",
+      "  6%|6         | 1261/20000 [00:19<02:01, 154.24it/s]\n",
+      "  6%|6         | 1277/20000 [00:20<02:09, 145.11it/s]\n",
+      "  6%|6         | 1292/20000 [00:20<02:10, 143.32it/s]\n",
+      "  7%|6         | 1307/20000 [00:20<03:02, 102.65it/s]\n",
+      "  7%|6         | 1330/20000 [00:20<02:23, 130.14it/s]\n",
+      "  7%|6         | 1348/20000 [00:20<02:13, 139.97it/s]\n",
+      "  7%|6         | 1368/20000 [00:20<02:02, 152.70it/s]\n",
+      "  7%|6         | 1385/20000 [00:20<02:00, 153.99it/s]\n",
+      "  7%|7         | 1402/20000 [00:21<02:07, 146.16it/s]\n",
+      "  7%|7         | 1423/20000 [00:21<01:55, 161.53it/s]\n",
+      "  7%|7         | 1441/20000 [00:21<01:52, 165.17it/s]\n",
+      "  7%|7         | 1459/20000 [00:21<01:55, 160.82it/s]\n",
+      "  7%|7         | 1476/20000 [00:21<02:03, 149.82it/s]\n",
+      "  7%|7         | 1492/20000 [00:21<02:08, 143.79it/s]\n",
+      "  8%|7         | 1507/20000 [00:21<02:10, 142.06it/s]\n",
+      "  8%|7         | 1530/20000 [00:21<01:52, 164.72it/s]\n",
+      "  8%|7         | 1548/20000 [00:21<01:50, 167.09it/s]\n",
+      "  8%|7         | 1565/20000 [00:22<01:49, 167.90it/s]\n",
+      "  8%|7         | 1582/20000 [00:22<01:53, 161.57it/s]\n",
+      "  8%|7         | 1599/20000 [00:22<01:56, 158.15it/s]\n",
+      "  8%|8         | 1617/20000 [00:22<01:51, 164.25it/s]\n",
+      "  8%|8         | 1637/20000 [00:22<01:45, 174.45it/s]\n",
+      "  8%|8         | 1657/20000 [00:22<01:41, 181.32it/s]\n",
+      "  8%|8         | 1676/20000 [00:22<01:40, 182.25it/s]\n",
+      "  8%|8         | 1695/20000 [00:22<01:46, 171.94it/s]\n",
+      "  9%|8         | 1718/20000 [00:22<01:38, 186.12it/s]\n",
+      "  9%|8         | 1739/20000 [00:22<01:34, 192.48it/s]\n",
+      "  9%|8         | 1759/20000 [00:23<02:13, 136.76it/s]\n",
+      "  9%|8         | 1777/20000 [00:23<02:04, 145.80it/s]\n",
+      "  9%|8         | 1794/20000 [00:23<02:04, 146.68it/s]\n",
+      "  9%|9         | 1814/20000 [00:23<01:53, 159.63it/s]\n",
+      "  9%|9         | 1836/20000 [00:23<01:43, 175.04it/s]\n",
+      "  9%|9         | 1856/20000 [00:23<01:41, 179.30it/s]\n",
+      "  9%|9         | 1875/20000 [00:23<01:42, 176.01it/s]\n",
+      "  9%|9         | 1894/20000 [00:23<01:45, 171.34it/s]\n",
+      " 10%|9         | 1915/20000 [00:24<01:39, 180.93it/s]\n",
+      " 10%|9         | 1937/20000 [00:24<01:34, 190.79it/s]\n",
+      " 10%|9         | 1957/20000 [00:24<01:35, 189.63it/s]\n",
+      " 10%|9         | 1977/20000 [00:24<01:36, 186.73it/s]\n",
+      " 10%|9         | 1996/20000 [00:24<01:42, 175.72it/s]\n",
+      " 10%|#         | 2018/20000 [00:24<01:35, 187.87it/s]\n",
+      " 10%|#         | 2046/20000 [00:24<01:24, 212.03it/s]\n",
+      " 10%|#         | 2068/20000 [00:24<01:27, 204.39it/s]\n",
+      " 10%|#         | 2089/20000 [00:24<01:31, 195.56it/s]\n",
+      " 11%|#         | 2109/20000 [00:25<01:33, 192.02it/s]\n",
+      " 11%|#         | 2140/20000 [00:25<01:19, 224.10it/s]\n",
+      " 11%|#         | 2165/20000 [00:25<01:17, 230.78it/s]\n",
+      " 11%|#         | 2189/20000 [00:25<01:18, 225.64it/s]\n",
+      " 11%|#1        | 2212/20000 [00:25<01:24, 210.15it/s]\n",
+      " 11%|#1        | 2236/20000 [00:25<01:21, 217.71it/s]\n",
+      " 11%|#1        | 2259/20000 [00:25<01:22, 215.12it/s]\n",
+      " 11%|#1        | 2281/20000 [00:25<01:24, 208.87it/s]\n",
+      " 12%|#1        | 2303/20000 [00:25<01:35, 185.14it/s]\n",
+      " 12%|#1        | 2333/20000 [00:26<01:22, 213.67it/s]\n",
+      " 12%|#1        | 2357/20000 [00:26<01:19, 220.73it/s]\n",
+      " 12%|#1        | 2380/20000 [00:26<01:21, 214.95it/s]\n",
+      " 12%|#2        | 2402/20000 [00:26<02:03, 142.71it/s]\n",
+      " 12%|#2        | 2432/20000 [00:26<01:40, 174.08it/s]\n",
+      " 12%|#2        | 2459/20000 [00:26<01:29, 195.81it/s]\n",
+      " 12%|#2        | 2482/20000 [00:26<01:28, 198.82it/s]\n",
+      " 13%|#2        | 2505/20000 [00:27<01:29, 195.33it/s]\n",
+      " 13%|#2        | 2538/20000 [00:27<01:16, 228.52it/s]\n",
+      " 13%|#2        | 2566/20000 [00:27<01:11, 242.22it/s]\n",
+      " 13%|#2        | 2592/20000 [00:27<01:15, 230.01it/s]\n",
+      " 13%|#3        | 2620/20000 [00:27<01:11, 243.40it/s]\n",
+      " 13%|#3        | 2651/20000 [00:27<01:06, 261.84it/s]\n",
+      " 13%|#3        | 2678/20000 [00:27<01:06, 260.46it/s]\n",
+      " 14%|#3        | 2705/20000 [00:27<01:08, 252.37it/s]\n",
+      " 14%|#3        | 2740/20000 [00:27<01:02, 278.24it/s]\n",
+      " 14%|#3        | 2769/20000 [00:27<01:05, 264.95it/s]\n",
+      " 14%|#3        | 2796/20000 [00:28<01:09, 247.16it/s]\n",
+      " 14%|#4        | 2828/20000 [00:28<01:04, 264.60it/s]\n",
+      " 14%|#4        | 2855/20000 [00:28<01:05, 260.34it/s]\n",
+      " 14%|#4        | 2882/20000 [00:28<01:09, 247.20it/s]\n",
+      " 15%|#4        | 2908/20000 [00:28<01:12, 236.53it/s]\n",
+      " 15%|#4        | 2952/20000 [00:28<00:58, 291.10it/s]\n",
+      " 15%|#4        | 2982/20000 [00:28<01:03, 266.27it/s]\n",
+      " 15%|#5        | 3010/20000 [00:28<01:03, 267.07it/s]\n",
+      " 15%|#5        | 3039/20000 [00:29<01:02, 270.37it/s]\n",
+      " 15%|#5        | 3068/20000 [00:29<01:01, 273.53it/s]\n",
+      " 15%|#5        | 3096/20000 [00:29<01:04, 263.45it/s]\n",
+      " 16%|#5        | 3129/20000 [00:29<00:59, 281.96it/s]\n",
+      " 16%|#5        | 3160/20000 [00:29<00:58, 287.48it/s]\n",
+      " 16%|#5        | 3190/20000 [00:29<01:00, 279.05it/s]\n",
+      " 16%|#6        | 3226/20000 [00:29<00:55, 301.05it/s]\n",
+      " 16%|#6        | 3257/20000 [00:29<00:55, 303.61it/s]\n",
+      " 16%|#6        | 3288/20000 [00:29<00:56, 293.52it/s]\n",
+      " 17%|#6        | 3318/20000 [00:29<00:56, 293.68it/s]\n",
+      " 17%|#6        | 3357/20000 [00:30<00:52, 318.68it/s]\n",
+      " 17%|#6        | 3390/20000 [00:30<00:58, 284.80it/s]\n",
+      " 17%|#7        | 3420/20000 [00:30<01:21, 204.06it/s]\n",
+      " 17%|#7        | 3459/20000 [00:30<01:08, 242.62it/s]\n",
+      " 17%|#7        | 3491/20000 [00:30<01:03, 260.00it/s]\n",
+      " 18%|#7        | 3535/20000 [00:30<00:54, 304.04it/s]\n",
+      " 18%|#7        | 3573/20000 [00:30<00:50, 323.92it/s]\n",
+      " 18%|#8        | 3608/20000 [00:31<00:55, 296.34it/s]\n",
+      " 18%|#8        | 3653/20000 [00:31<00:48, 336.01it/s]\n",
+      " 18%|#8        | 3689/20000 [00:31<00:49, 329.16it/s]\n",
+      " 19%|#8        | 3733/20000 [00:31<00:45, 358.11it/s]\n",
+      " 19%|#8        | 3771/20000 [00:31<00:44, 361.17it/s]\n",
+      " 19%|#9        | 3809/20000 [00:31<00:47, 342.31it/s]\n",
+      " 19%|#9        | 3861/20000 [00:31<00:41, 390.94it/s]\n",
+      " 20%|#9        | 3902/20000 [00:31<00:42, 378.22it/s]\n",
+      " 20%|#9        | 3968/20000 [00:31<00:35, 455.02it/s]\n",
+      " 20%|##        | 4015/20000 [00:32<00:37, 427.77it/s]\n",
+      " 20%|##        | 4066/20000 [00:32<00:35, 449.03it/s]\n",
+      " 21%|##        | 4112/20000 [00:32<00:39, 404.45it/s]\n",
+      " 21%|##        | 4174/20000 [00:32<00:34, 458.89it/s]\n",
+      " 21%|##1       | 4222/20000 [00:32<00:35, 442.90it/s]\n",
+      " 21%|##1       | 4271/20000 [00:32<00:34, 454.41it/s]\n",
+      " 22%|##1       | 4329/20000 [00:32<00:32, 489.36it/s]\n",
+      " 22%|##1       | 4387/20000 [00:32<00:30, 515.14it/s]\n",
+      " 22%|##2       | 4447/20000 [00:32<00:28, 538.10it/s]\n",
+      " 23%|##2       | 4502/20000 [00:33<00:32, 478.73it/s]\n",
+      " 23%|##2       | 4563/20000 [00:33<00:30, 512.67it/s]\n",
+      " 23%|##3       | 4616/20000 [00:33<00:30, 496.81it/s]\n",
+      " 23%|##3       | 4677/20000 [00:33<00:29, 527.98it/s]\n",
+      " 24%|##3       | 4733/20000 [00:33<00:28, 537.01it/s]\n",
+      " 24%|##3       | 4788/20000 [00:33<00:28, 534.59it/s]\n",
+      " 24%|##4       | 4864/20000 [00:33<00:25, 599.65it/s]\n",
+      " 25%|##4       | 4925/20000 [00:33<00:25, 595.70it/s]\n",
+      " 25%|##4       | 4994/20000 [00:33<00:24, 617.81it/s]\n",
+      " 25%|##5       | 5079/20000 [00:33<00:21, 683.71it/s]\n",
+      " 26%|##5       | 5148/20000 [00:34<00:35, 419.97it/s]\n",
+      " 26%|##6       | 5203/20000 [00:34<00:33, 446.58it/s]\n",
+      " 26%|##6       | 5289/20000 [00:34<00:27, 538.90it/s]\n",
+      " 27%|##6       | 5377/20000 [00:34<00:23, 622.07it/s]\n",
+      " 27%|##7       | 5471/20000 [00:34<00:20, 703.42it/s]\n",
+      " 28%|##7       | 5549/20000 [00:36<01:35, 150.73it/s]\n",
+      " 28%|##8       | 5606/20000 [00:36<01:37, 147.12it/s]\n",
+      " 28%|##8       | 5650/20000 [00:36<01:34, 151.57it/s]\n",
+      " 28%|##8       | 5686/20000 [00:37<01:33, 153.50it/s]\n",
+      " 29%|##8       | 5716/20000 [00:37<01:32, 154.45it/s]\n",
+      " 29%|##8       | 5742/20000 [00:37<01:29, 158.75it/s]\n",
+      " 29%|##8       | 5766/20000 [00:37<01:28, 160.05it/s]\n",
+      " 29%|##8       | 5788/20000 [00:37<01:29, 159.44it/s]\n",
+      " 29%|##9       | 5808/20000 [00:37<01:29, 158.22it/s]\n",
+      " 29%|##9       | 5827/20000 [00:37<01:27, 162.78it/s]\n",
+      " 29%|##9       | 5846/20000 [00:38<01:25, 165.07it/s]\n",
+      " 29%|##9       | 5864/20000 [00:38<01:25, 164.71it/s]\n",
+      " 29%|##9       | 5882/20000 [00:38<01:26, 162.88it/s]\n",
+      " 29%|##9       | 5899/20000 [00:38<01:30, 155.66it/s]\n",
+      " 30%|##9       | 5916/20000 [00:38<01:29, 158.09it/s]\n",
+      " 30%|##9       | 5935/20000 [00:38<01:24, 166.09it/s]\n",
+      " 30%|##9       | 5954/20000 [00:38<01:22, 169.84it/s]\n",
+      " 30%|##9       | 5972/20000 [00:38<01:21, 171.23it/s]\n",
+      " 30%|##9       | 5990/20000 [00:38<01:22, 170.81it/s]\n",
+      " 30%|###       | 6008/20000 [00:39<01:23, 167.70it/s]\n",
+      " 30%|###       | 6027/20000 [00:39<01:20, 173.01it/s]\n",
+      " 30%|###       | 6046/20000 [00:39<01:19, 175.35it/s]\n",
+      " 30%|###       | 6064/20000 [00:39<01:20, 172.23it/s]\n",
+      " 30%|###       | 6082/20000 [00:39<01:21, 170.55it/s]\n",
+      " 30%|###       | 6100/20000 [00:39<01:23, 167.05it/s]\n",
+      " 31%|###       | 6118/20000 [00:39<01:21, 170.70it/s]\n",
+      " 31%|###       | 6138/20000 [00:39<01:17, 178.09it/s]\n",
+      " 31%|###       | 6157/20000 [00:39<01:16, 179.96it/s]\n",
+      " 31%|###       | 6176/20000 [00:39<01:18, 177.21it/s]\n",
+      " 31%|###       | 6194/20000 [00:40<01:18, 174.99it/s]\n",
+      " 31%|###1      | 6212/20000 [00:40<01:19, 173.44it/s]\n",
+      " 31%|###1      | 6232/20000 [00:40<01:16, 180.04it/s]\n",
+      " 31%|###1      | 6251/20000 [00:40<01:16, 179.80it/s]\n",
+      " 31%|###1      | 6270/20000 [00:40<01:19, 172.28it/s]\n",
+      " 31%|###1      | 6288/20000 [00:40<01:20, 170.14it/s]\n",
+      " 32%|###1      | 6306/20000 [00:40<01:22, 165.92it/s]\n",
+      " 32%|###1      | 6327/20000 [00:40<01:16, 178.17it/s]\n",
+      " 32%|###1      | 6347/20000 [00:40<01:14, 183.85it/s]\n",
+      " 32%|###1      | 6366/20000 [00:41<01:14, 182.46it/s]\n",
+      " 32%|###1      | 6385/20000 [00:41<01:17, 175.53it/s]\n",
+      " 32%|###2      | 6403/20000 [00:41<01:21, 166.92it/s]\n",
+      " 32%|###2      | 6423/20000 [00:41<01:17, 174.57it/s]\n",
+      " 32%|###2      | 6443/20000 [00:41<01:15, 179.69it/s]\n",
+      " 32%|###2      | 6462/20000 [00:41<01:16, 178.06it/s]\n",
+      " 32%|###2      | 6480/20000 [00:41<01:17, 174.17it/s]\n",
+      " 32%|###2      | 6498/20000 [00:41<01:19, 170.04it/s]\n",
+      " 33%|###2      | 6517/20000 [00:41<01:16, 175.15it/s]\n",
+      " 33%|###2      | 6538/20000 [00:42<01:13, 184.06it/s]\n",
+      " 33%|###2      | 6558/20000 [00:42<01:11, 187.58it/s]\n",
+      " 33%|###2      | 6577/20000 [00:42<01:12, 183.99it/s]\n",
+      " 33%|###2      | 6596/20000 [00:42<01:14, 180.51it/s]\n",
+      " 33%|###3      | 6615/20000 [00:42<01:14, 180.64it/s]\n",
+      " 33%|###3      | 6636/20000 [00:42<01:11, 187.45it/s]\n",
+      " 33%|###3      | 6656/20000 [00:42<01:10, 189.43it/s]\n",
+      " 33%|###3      | 6675/20000 [00:42<01:11, 185.29it/s]\n",
+      " 33%|###3      | 6694/20000 [00:42<01:14, 177.91it/s]\n",
+      " 34%|###3      | 6712/20000 [00:42<01:15, 176.02it/s]\n",
+      " 34%|###3      | 6733/20000 [00:43<01:11, 185.68it/s]\n",
+      " 34%|###3      | 6752/20000 [00:43<01:10, 186.91it/s]\n",
+      " 34%|###3      | 6771/20000 [00:43<01:12, 183.53it/s]\n",
+      " 34%|###3      | 6790/20000 [00:43<01:15, 175.73it/s]\n",
+      " 34%|###4      | 6808/20000 [00:43<01:17, 170.68it/s]\n",
+      " 34%|###4      | 6828/20000 [00:43<01:13, 178.87it/s]\n",
+      " 34%|###4      | 6849/20000 [00:43<01:10, 186.16it/s]\n",
+      " 34%|###4      | 6868/20000 [00:43<01:10, 187.26it/s]\n",
+      " 34%|###4      | 6887/20000 [00:43<01:10, 185.36it/s]\n",
+      " 35%|###4      | 6906/20000 [00:44<01:12, 180.40it/s]\n",
+      " 35%|###4      | 6929/20000 [00:44<01:07, 193.50it/s]\n",
+      " 35%|###4      | 6950/20000 [00:44<01:05, 198.27it/s]\n",
+      " 35%|###4      | 6970/20000 [00:44<01:06, 197.04it/s]\n",
+      " 35%|###4      | 6990/20000 [00:44<01:08, 190.65it/s]\n",
+      " 35%|###5      | 7010/20000 [00:44<01:10, 184.33it/s]\n",
+      " 35%|###5      | 7029/20000 [00:44<01:10, 183.34it/s]\n",
+      " 35%|###5      | 7049/20000 [00:44<01:08, 188.06it/s]\n",
+      " 35%|###5      | 7068/20000 [00:44<01:08, 188.62it/s]\n",
+      " 35%|###5      | 7087/20000 [00:44<01:09, 184.71it/s]\n",
+      " 36%|###5      | 7106/20000 [00:45<01:11, 179.97it/s]\n",
+      " 36%|###5      | 7129/20000 [00:45<01:06, 192.60it/s]\n",
+      " 36%|###5      | 7151/20000 [00:45<01:04, 198.21it/s]\n",
+      " 36%|###5      | 7172/20000 [00:45<01:04, 198.74it/s]\n",
+      " 36%|###5      | 7192/20000 [00:45<01:04, 197.39it/s]\n",
+      " 36%|###6      | 7212/20000 [00:45<01:05, 195.87it/s]\n",
+      " 36%|###6      | 7235/20000 [00:45<01:02, 205.80it/s]\n",
+      " 36%|###6      | 7256/20000 [00:45<01:01, 205.81it/s]\n",
+      " 36%|###6      | 7277/20000 [00:45<01:03, 201.69it/s]\n",
+      " 36%|###6      | 7298/20000 [00:46<01:05, 193.96it/s]\n",
+      " 37%|###6      | 7320/20000 [00:46<01:03, 200.19it/s]\n",
+      " 37%|###6      | 7343/20000 [00:46<01:00, 208.13it/s]\n",
+      " 37%|###6      | 7364/20000 [00:46<01:00, 208.67it/s]\n",
+      " 37%|###6      | 7385/20000 [00:46<01:01, 204.27it/s]\n",
+      " 37%|###7      | 7406/20000 [00:46<01:03, 199.56it/s]\n",
+      " 37%|###7      | 7429/20000 [00:46<01:00, 207.10it/s]\n",
+      " 37%|###7      | 7450/20000 [00:46<01:00, 207.93it/s]\n",
+      " 37%|###7      | 7471/20000 [00:46<01:00, 207.32it/s]\n",
+      " 37%|###7      | 7492/20000 [00:46<01:01, 204.50it/s]\n",
+      " 38%|###7      | 7513/20000 [00:47<01:00, 205.49it/s]\n",
+      " 38%|###7      | 7537/20000 [00:47<00:57, 215.00it/s]\n",
+      " 38%|###7      | 7559/20000 [00:47<00:57, 215.82it/s]\n",
+      " 38%|###7      | 7581/20000 [00:47<00:58, 210.82it/s]\n",
+      " 38%|###8      | 7603/20000 [00:47<01:01, 200.64it/s]\n",
+      " 38%|###8      | 7627/20000 [00:47<00:58, 211.11it/s]\n",
+      " 38%|###8      | 7650/20000 [00:47<00:57, 215.27it/s]\n",
+      " 38%|###8      | 7672/20000 [00:47<00:58, 211.14it/s]\n",
+      " 38%|###8      | 7694/20000 [00:47<01:00, 203.74it/s]\n",
+      " 39%|###8      | 7716/20000 [00:48<00:59, 206.56it/s]\n",
+      " 39%|###8      | 7742/20000 [00:48<00:55, 219.27it/s]\n",
+      " 39%|###8      | 7765/20000 [00:48<00:55, 222.34it/s]\n",
+      " 39%|###8      | 7788/20000 [00:48<00:55, 220.09it/s]\n",
+      " 39%|###9      | 7811/20000 [00:48<00:56, 217.29it/s]\n",
+      " 39%|###9      | 7837/20000 [00:48<00:53, 227.59it/s]\n",
+      " 39%|###9      | 7860/20000 [00:48<00:53, 225.68it/s]\n",
+      " 39%|###9      | 7883/20000 [00:48<00:57, 210.45it/s]\n",
+      " 40%|###9      | 7905/20000 [00:48<00:59, 203.50it/s]\n",
+      " 40%|###9      | 7931/20000 [00:49<00:55, 218.40it/s]\n",
+      " 40%|###9      | 7955/20000 [00:49<00:53, 223.84it/s]\n",
+      " 40%|###9      | 7978/20000 [00:49<00:54, 222.42it/s]\n",
+      " 40%|####      | 8001/20000 [00:49<00:56, 211.24it/s]\n",
+      " 40%|####      | 8028/20000 [00:49<00:52, 226.96it/s]\n",
+      " 40%|####      | 8052/20000 [00:49<00:52, 229.36it/s]\n",
+      " 40%|####      | 8076/20000 [00:49<00:52, 226.55it/s]\n",
+      " 40%|####      | 8099/20000 [00:49<00:54, 217.59it/s]\n",
+      " 41%|####      | 8121/20000 [00:50<01:26, 136.56it/s]\n",
+      " 41%|####      | 8144/20000 [00:50<01:16, 154.69it/s]\n",
+      " 41%|####      | 8165/20000 [00:50<01:11, 165.14it/s]\n",
+      " 41%|####      | 8186/20000 [00:50<01:07, 174.19it/s]\n",
+      " 41%|####1     | 8206/20000 [00:50<01:05, 179.79it/s]\n",
+      " 41%|####1     | 8234/20000 [00:50<00:57, 205.18it/s]\n",
+      " 41%|####1     | 8259/20000 [00:50<00:54, 215.64it/s]\n",
+      " 41%|####1     | 8282/20000 [00:50<00:53, 219.03it/s]\n",
+      " 42%|####1     | 8305/20000 [00:50<00:55, 209.63it/s]\n",
+      " 42%|####1     | 8334/20000 [00:51<00:50, 229.98it/s]\n",
+      " 42%|####1     | 8359/20000 [00:51<00:49, 234.96it/s]\n",
+      " 42%|####1     | 8383/20000 [00:51<00:50, 230.45it/s]\n",
+      " 42%|####2     | 8407/20000 [00:51<00:52, 222.38it/s]\n",
+      " 42%|####2     | 8436/20000 [00:51<00:48, 240.62it/s]\n",
+      " 42%|####2     | 8461/20000 [00:51<00:47, 242.60it/s]\n",
+      " 42%|####2     | 8486/20000 [00:51<00:47, 239.88it/s]\n",
+      " 43%|####2     | 8511/20000 [00:51<00:48, 236.67it/s]\n",
+      " 43%|####2     | 8539/20000 [00:51<00:46, 247.65it/s]\n",
+      " 43%|####2     | 8564/20000 [00:51<00:46, 244.78it/s]\n",
+      " 43%|####2     | 8589/20000 [00:52<00:48, 236.04it/s]\n",
+      " 43%|####3     | 8613/20000 [00:52<00:48, 235.15it/s]\n",
+      " 43%|####3     | 8643/20000 [00:52<00:45, 252.29it/s]\n",
+      " 43%|####3     | 8669/20000 [00:52<00:45, 248.06it/s]\n",
+      " 43%|####3     | 8694/20000 [00:52<00:47, 240.33it/s]\n",
+      " 44%|####3     | 8720/20000 [00:52<00:46, 243.84it/s]\n",
+      " 44%|####3     | 8748/20000 [00:52<00:44, 254.21it/s]\n",
+      " 44%|####3     | 8777/20000 [00:52<00:42, 263.08it/s]\n",
+      " 44%|####4     | 8808/20000 [00:52<00:40, 276.78it/s]\n",
+      " 44%|####4     | 8856/20000 [00:53<00:33, 336.62it/s]\n",
+      " 44%|####4     | 8896/20000 [00:53<00:31, 353.29it/s]\n",
+      " 45%|####4     | 8955/20000 [00:53<00:26, 422.10it/s]\n",
+      " 45%|####5     | 9001/20000 [00:53<00:25, 425.76it/s]\n",
+      " 45%|####5     | 9070/20000 [00:53<00:21, 501.92it/s]\n",
+      " 46%|####5     | 9128/20000 [00:53<00:20, 523.43it/s]\n",
+      " 46%|####5     | 9183/20000 [00:53<00:20, 529.69it/s]\n",
+      " 46%|####6     | 9237/20000 [00:53<00:20, 531.16it/s]\n",
+      " 46%|####6     | 9291/20000 [00:53<00:21, 508.14it/s]\n",
+      " 47%|####6     | 9346/20000 [00:53<00:20, 520.17it/s]\n",
+      " 47%|####6     | 9399/20000 [00:54<00:20, 509.66it/s]\n",
+      " 47%|####7     | 9451/20000 [00:54<00:20, 509.69it/s]\n",
+      " 48%|####7     | 9503/20000 [00:54<00:21, 494.01it/s]\n",
+      " 48%|####7     | 9580/20000 [00:54<00:18, 569.61it/s]\n",
+      " 48%|####8     | 9649/20000 [00:54<00:17, 602.61it/s]\n",
+      " 49%|####8     | 9710/20000 [00:54<00:17, 597.78it/s]\n",
+      " 49%|####8     | 9792/20000 [00:54<00:15, 660.63it/s]\n",
+      " 49%|####9     | 9874/20000 [00:54<00:14, 707.37it/s]\n",
+      " 50%|####9     | 9946/20000 [00:54<00:14, 698.70it/s]\n",
+      " 50%|#####     | 10029/20000 [00:54<00:13, 735.01it/s]\n",
+      " 51%|#####     | 10103/20000 [00:55<00:14, 703.33it/s]\n",
+      " 51%|#####     | 10187/20000 [00:55<00:13, 742.49it/s]\n",
+      " 51%|#####1    | 10275/20000 [00:55<00:12, 782.42it/s]\n",
+      " 52%|#####1    | 10372/20000 [00:55<00:11, 837.41it/s]\n",
+      " 52%|#####2    | 10463/20000 [00:55<00:11, 858.83it/s]\n",
+      " 53%|#####2    | 10550/20000 [00:55<00:10, 862.08it/s]\n",
+      " 53%|#####3    | 10640/20000 [00:55<00:10, 873.35it/s]\n",
+      " 54%|#####3    | 10728/20000 [00:55<00:10, 857.41it/s]\n",
+      " 54%|#####4    | 10815/20000 [00:55<00:10, 858.61it/s]\n",
+      " 55%|#####4    | 10902/20000 [00:56<00:10, 861.95it/s]\n",
+      " 55%|#####5    | 11034/20000 [00:56<00:08, 997.87it/s]\n",
+      " 56%|#####5    | 11179/20000 [00:56<00:07, 1132.48it/s]\n",
+      " 56%|#####6    | 11296/20000 [00:56<00:07, 1143.65it/s]\n",
+      " 57%|#####7    | 11420/20000 [00:56<00:07, 1172.43it/s]\n",
+      " 58%|#####7    | 11579/20000 [00:56<00:06, 1297.33it/s]\n",
+      " 59%|#####8    | 11758/20000 [00:56<00:05, 1444.84it/s]\n",
+      " 60%|#####9    | 11971/20000 [00:56<00:04, 1650.09it/s]\n",
+      " 61%|######    | 12137/20000 [00:58<00:32, 241.50it/s] \n",
+      " 61%|######1   | 12256/20000 [00:59<00:34, 221.55it/s]\n",
+      " 62%|######1   | 12344/20000 [00:59<00:36, 211.18it/s]\n",
+      " 62%|######2   | 12411/20000 [01:00<00:37, 204.44it/s]\n",
+      " 62%|######2   | 12464/20000 [01:00<00:37, 201.86it/s]\n",
+      " 63%|######2   | 12507/20000 [01:00<00:37, 197.48it/s]\n",
+      " 63%|######2   | 12542/20000 [01:00<00:35, 209.76it/s]\n",
+      " 63%|######2   | 12576/20000 [01:01<00:33, 220.35it/s]\n",
+      " 63%|######3   | 12609/20000 [01:01<00:32, 226.03it/s]\n",
+      " 63%|######3   | 12640/20000 [01:01<00:31, 234.04it/s]\n",
+      " 63%|######3   | 12670/20000 [01:01<00:29, 246.31it/s]\n",
+      " 64%|######3   | 12700/20000 [01:01<00:28, 251.76it/s]\n",
+      " 64%|######3   | 12731/20000 [01:01<00:27, 263.76it/s]\n",
+      " 64%|######3   | 12761/20000 [01:01<00:26, 272.59it/s]\n",
+      " 64%|######3   | 12791/20000 [01:01<00:26, 271.94it/s]\n",
+      " 64%|######4   | 12820/20000 [01:01<00:26, 274.53it/s]\n",
+      " 64%|######4   | 12850/20000 [01:02<00:25, 281.46it/s]\n",
+      " 64%|######4   | 12879/20000 [01:02<00:25, 277.63it/s]\n",
+      " 65%|######4   | 12908/20000 [01:02<00:26, 264.03it/s]\n",
+      " 65%|######4   | 12939/20000 [01:02<00:25, 276.61it/s]\n",
+      " 65%|######4   | 12969/20000 [01:02<00:24, 281.60it/s]\n",
+      " 65%|######4   | 12998/20000 [01:02<00:24, 280.78it/s]\n",
+      " 65%|######5   | 13027/20000 [01:02<00:25, 275.51it/s]\n",
+      " 65%|######5   | 13058/20000 [01:02<00:24, 285.34it/s]\n",
+      " 65%|######5   | 13087/20000 [01:02<00:24, 285.03it/s]\n",
+      " 66%|######5   | 13117/20000 [01:03<00:23, 287.71it/s]\n",
+      " 66%|######5   | 13151/20000 [01:03<00:22, 301.25it/s]\n",
+      " 66%|######5   | 13182/20000 [01:03<00:22, 299.42it/s]\n",
+      " 66%|######6   | 13213/20000 [01:03<00:23, 288.18it/s]\n",
+      " 66%|######6   | 13247/20000 [01:03<00:22, 302.07it/s]\n",
+      " 66%|######6   | 13280/20000 [01:03<00:21, 309.23it/s]\n",
+      " 67%|######6   | 13312/20000 [01:03<00:21, 306.12it/s]\n",
+      " 67%|######6   | 13348/20000 [01:03<00:20, 321.72it/s]\n",
+      " 67%|######6   | 13381/20000 [01:03<00:20, 320.39it/s]\n",
+      " 67%|######7   | 13414/20000 [01:04<00:35, 183.90it/s]\n",
+      " 67%|######7   | 13448/20000 [01:04<00:30, 213.47it/s]\n",
+      " 67%|######7   | 13478/20000 [01:04<00:28, 232.06it/s]\n",
+      " 68%|######7   | 13508/20000 [01:04<00:26, 246.85it/s]\n",
+      " 68%|######7   | 13546/20000 [01:04<00:23, 278.79it/s]\n",
+      " 68%|######7   | 13578/20000 [01:04<00:22, 289.60it/s]\n",
+      " 68%|######8   | 13610/20000 [01:04<00:21, 290.75it/s]\n",
+      " 68%|######8   | 13650/20000 [01:04<00:19, 319.96it/s]\n",
+      " 68%|######8   | 13684/20000 [01:05<00:19, 322.87it/s]\n",
+      " 69%|######8   | 13718/20000 [01:05<00:19, 324.97it/s]\n",
+      " 69%|######8   | 13753/20000 [01:05<00:18, 332.16it/s]\n",
+      " 69%|######8   | 13787/20000 [01:05<00:19, 323.16it/s]\n",
+      " 69%|######9   | 13820/20000 [01:05<00:19, 317.82it/s]\n",
+      " 69%|######9   | 13857/20000 [01:05<00:18, 332.74it/s]\n",
+      " 69%|######9   | 13891/20000 [01:05<00:18, 333.86it/s]\n",
+      " 70%|######9   | 13927/20000 [01:05<00:17, 340.50it/s]\n",
+      " 70%|######9   | 13963/20000 [01:05<00:17, 345.20it/s]\n",
+      " 70%|######9   | 13998/20000 [01:05<00:17, 340.60it/s]\n",
+      " 70%|#######   | 14036/20000 [01:06<00:16, 351.09it/s]\n",
+      " 70%|#######   | 14073/20000 [01:06<00:16, 356.65it/s]\n",
+      " 71%|#######   | 14109/20000 [01:06<00:16, 353.45it/s]\n",
+      " 71%|#######   | 14150/20000 [01:06<00:15, 369.02it/s]\n",
+      " 71%|#######   | 14187/20000 [01:06<00:15, 368.21it/s]\n",
+      " 71%|#######1  | 14227/20000 [01:06<00:15, 375.42it/s]\n",
+      " 71%|#######1  | 14265/20000 [01:06<00:16, 345.08it/s]\n",
+      " 72%|#######1  | 14301/20000 [01:06<00:16, 347.30it/s]\n",
+      " 72%|#######1  | 14349/20000 [01:06<00:14, 383.90it/s]\n",
+      " 72%|#######1  | 14388/20000 [01:06<00:14, 376.96it/s]\n",
+      " 72%|#######2  | 14430/20000 [01:07<00:14, 389.28it/s]\n",
+      " 72%|#######2  | 14471/20000 [01:07<00:13, 395.30it/s]\n",
+      " 73%|#######2  | 14511/20000 [01:07<00:14, 389.82it/s]\n",
+      " 73%|#######2  | 14554/20000 [01:07<00:13, 401.53it/s]\n",
+      " 73%|#######2  | 14595/20000 [01:07<00:14, 378.41it/s]\n",
+      " 73%|#######3  | 14643/20000 [01:07<00:13, 405.95it/s]\n",
+      " 73%|#######3  | 14687/20000 [01:07<00:12, 415.69it/s]\n",
+      " 74%|#######3  | 14730/20000 [01:07<00:12, 418.62it/s]\n",
+      " 74%|#######3  | 14774/20000 [01:07<00:12, 422.40it/s]\n",
+      " 74%|#######4  | 14817/20000 [01:08<00:12, 418.48it/s]\n",
+      " 74%|#######4  | 14868/20000 [01:08<00:11, 443.95it/s]\n",
+      " 75%|#######4  | 14913/20000 [01:08<00:11, 444.41it/s]\n",
+      " 75%|#######4  | 14962/20000 [01:08<00:11, 457.86it/s]\n",
+      " 75%|#######5  | 15008/20000 [01:08<00:11, 438.97it/s]\n",
+      " 75%|#######5  | 15067/20000 [01:08<00:10, 481.14it/s]\n",
+      " 76%|#######5  | 15116/20000 [01:08<00:10, 483.71it/s]\n",
+      " 76%|#######5  | 15173/20000 [01:08<00:09, 509.06it/s]\n",
+      " 76%|#######6  | 15227/20000 [01:08<00:09, 518.19it/s]\n",
+      " 76%|#######6  | 15285/20000 [01:08<00:08, 534.95it/s]\n",
+      " 77%|#######6  | 15351/20000 [01:09<00:08, 570.41it/s]\n",
+      " 77%|#######7  | 15409/20000 [01:09<00:08, 569.86it/s]\n",
+      " 77%|#######7  | 15477/20000 [01:09<00:07, 602.56it/s]\n",
+      " 78%|#######7  | 15538/20000 [01:09<00:07, 602.96it/s]\n",
+      " 78%|#######7  | 15599/20000 [01:09<00:07, 585.87it/s]\n",
+      " 78%|#######8  | 15658/20000 [01:09<00:07, 581.97it/s]\n",
+      " 79%|#######8  | 15722/20000 [01:09<00:07, 598.93it/s]\n",
+      " 79%|#######8  | 15799/20000 [01:09<00:06, 647.41it/s]\n",
+      " 79%|#######9  | 15877/20000 [01:09<00:06, 684.57it/s]\n",
+      " 80%|#######9  | 15957/20000 [01:09<00:05, 718.72it/s]\n",
+      " 80%|########  | 16037/20000 [01:10<00:05, 740.70it/s]\n",
+      " 81%|########  | 16112/20000 [01:10<00:05, 730.42it/s]\n",
+      " 81%|########  | 16195/20000 [01:10<00:05, 757.50it/s]\n",
+      " 81%|########1 | 16288/20000 [01:10<00:04, 808.47it/s]\n",
+      " 82%|########1 | 16369/20000 [01:10<00:04, 797.07it/s]\n",
+      " 82%|########2 | 16467/20000 [01:10<00:04, 850.97it/s]\n",
+      " 83%|########2 | 16563/20000 [01:10<00:03, 883.26it/s]\n",
+      " 83%|########3 | 16659/20000 [01:10<00:03, 906.02it/s]\n",
+      " 84%|########3 | 16767/20000 [01:10<00:03, 957.87it/s]\n",
+      " 84%|########4 | 16881/20000 [01:10<00:03, 1012.25it/s]\n",
+      " 85%|########4 | 16990/20000 [01:11<00:02, 1035.48it/s]\n",
+      " 86%|########5 | 17120/20000 [01:11<00:02, 1114.63it/s]\n",
+      " 86%|########6 | 17240/20000 [01:11<00:02, 1136.79it/s]\n",
+      " 87%|########6 | 17379/20000 [01:11<00:02, 1212.48it/s]\n",
+      " 88%|########7 | 17514/20000 [01:11<00:01, 1249.92it/s]\n",
+      " 88%|########8 | 17656/20000 [01:11<00:01, 1300.74it/s]\n",
+      " 89%|########9 | 17812/20000 [01:11<00:01, 1378.28it/s]\n",
+      " 90%|######### | 18001/20000 [01:11<00:01, 1522.37it/s]\n",
+      " 91%|#########1| 18201/20000 [01:11<00:01, 1664.77it/s]\n",
+      " 92%|#########2| 18455/20000 [01:11<00:00, 1926.29it/s]\n",
+      " 94%|#########3| 18729/20000 [01:13<00:03, 331.40it/s] \n",
+      " 94%|#########4| 18869/20000 [01:14<00:04, 279.90it/s]\n",
+      " 95%|#########4| 18972/20000 [01:15<00:04, 253.37it/s]\n",
+      " 95%|#########5| 19050/20000 [01:15<00:03, 238.36it/s]\n",
+      " 96%|#########5| 19110/20000 [01:16<00:03, 223.98it/s]\n",
+      " 96%|#########5| 19157/20000 [01:16<00:03, 218.87it/s]\n",
+      " 96%|#########5| 19196/20000 [01:16<00:03, 212.50it/s]\n",
+      " 96%|#########6| 19229/20000 [01:16<00:03, 208.06it/s]\n",
+      " 96%|#########6| 19258/20000 [01:16<00:03, 205.77it/s]\n",
+      " 96%|#########6| 19284/20000 [01:17<00:03, 202.04it/s]\n",
+      " 97%|#########6| 19308/20000 [01:17<00:03, 197.39it/s]\n",
+      " 97%|#########6| 19330/20000 [01:17<00:03, 197.54it/s]\n",
+      " 97%|#########6| 19352/20000 [01:17<00:03, 196.16it/s]\n",
+      " 97%|#########6| 19373/20000 [01:17<00:03, 194.10it/s]\n",
+      " 97%|#########6| 19394/20000 [01:17<00:03, 191.14it/s]\n",
+      " 97%|#########7| 19414/20000 [01:17<00:03, 190.06it/s]\n",
+      " 97%|#########7| 19434/20000 [01:17<00:02, 192.10it/s]\n",
+      " 97%|#########7| 19454/20000 [01:17<00:02, 188.68it/s]\n",
+      " 97%|#########7| 19474/20000 [01:18<00:02, 188.67it/s]\n",
+      " 97%|#########7| 19493/20000 [01:18<00:02, 188.00it/s]\n",
+      " 98%|#########7| 19512/20000 [01:18<00:02, 187.50it/s]\n",
+      " 98%|#########7| 19533/20000 [01:18<00:02, 193.36it/s]\n",
+      " 98%|#########7| 19553/20000 [01:18<00:02, 194.71it/s]\n",
+      " 98%|#########7| 19573/20000 [01:18<00:02, 194.55it/s]\n",
+      " 98%|#########7| 19593/20000 [01:18<00:02, 192.76it/s]\n",
+      " 98%|#########8| 19613/20000 [01:18<00:02, 190.98it/s]\n",
+      " 98%|#########8| 19634/20000 [01:18<00:01, 194.23it/s]\n",
+      " 98%|#########8| 19654/20000 [01:18<00:01, 193.65it/s]\n",
+      " 98%|#########8| 19674/20000 [01:19<00:01, 192.69it/s]\n",
+      " 98%|#########8| 19694/20000 [01:19<00:01, 192.02it/s]\n",
+      " 99%|#########8| 19714/20000 [01:19<00:01, 192.65it/s]\n",
+      " 99%|#########8| 19736/20000 [01:19<00:01, 198.30it/s]\n",
+      " 99%|#########8| 19757/20000 [01:19<00:01, 200.54it/s]\n",
+      " 99%|#########8| 19778/20000 [01:19<00:01, 198.65it/s]\n",
+      " 99%|#########8| 19798/20000 [01:19<00:01, 197.32it/s]\n",
+      " 99%|#########9| 19818/20000 [01:19<00:00, 197.53it/s]\n",
+      " 99%|#########9| 19839/20000 [01:19<00:00, 200.59it/s]\n",
+      " 99%|#########9| 19860/20000 [01:19<00:00, 196.98it/s]\n",
+      " 99%|#########9| 19881/20000 [01:20<00:00, 198.45it/s]\n",
+      "100%|#########9| 19901/20000 [01:20<00:00, 193.05it/s]\n",
+      "100%|#########9| 19924/20000 [01:20<00:00, 201.34it/s]\n",
+      "100%|#########9| 19946/20000 [01:20<00:00, 205.53it/s]\n",
+      "100%|#########9| 19967/20000 [01:20<00:00, 205.63it/s]\n",
+      "100%|#########9| 19988/20000 [01:20<00:00, 203.92it/s]\n",
+      "100%|##########| 20000/20000 [01:20<00:00, 247.89it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python learn_bpe.py -s 20000 -i dataset/output.txt -o dataset/codec.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "68a4113a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!apply_bpe.py -i ./dataset/output.txt -o ./dataset/output_dataset.txt -c ./dataset/codec.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "06254f0d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vocabulary size: 20217\n"
+     ]
+    }
+   ],
+   "source": [
+    "def count_tokens(file_path):\n",
+    "    try:\n",
+    "        with open(file_path, 'r', encoding='utf-8') as file:\n",
+    "            text = file.read()\n",
+    "            # Split the text into tokens based on spaces\n",
+    "            tokens = text.split()\n",
+    "            # Count the vocabulary size (number of unique tokens)\n",
+    "            vocabulary_size = len(set(tokens))\n",
+    "            return vocabulary_size\n",
+    "    except IOError:\n",
+    "        print(f\"Error: Could not open or read the file '{file_path}'\")\n",
+    "        return -1\n",
+    "\n",
+    "# Example usage\n",
+    "file_path = './dataset/output_dataset.txt'  # Replace with the actual file path\n",
+    "vocabulary_size = count_tokens(file_path)\n",
+    "if vocabulary_size != -1:\n",
+    "    print(f\"Vocabulary size: {vocabulary_size}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

subword/get_vocab.py ADDED Viewed

	@@ -0,0 +1,87 @@

+#! /usr/bin/env python
+from __future__ import print_function
+import os
+import sys
+import inspect
+import warnings
+import argparse
+import codecs
+from collections import Counter
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
def create_parser(subparsers=None):
    """Build the argument parser for the vocabulary extraction tool.

    If `subparsers` is given (the object returned by
    `ArgumentParser.add_subparsers()`), the parser is registered on it as the
    'get-vocab' sub-command; otherwise a standalone parser is created.

    Returns the parser, with the `--input/-i` and `--output/-o` options defined.
    """
    # Settings shared by the sub-command and the standalone parser.
    common = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Generates vocabulary")
    if subparsers:
        parser = subparsers.add_parser('get-vocab', **common)
    else:
        parser = argparse.ArgumentParser(**common)

    add = parser.add_argument
    # Both streams default to the process's standard streams.
    add('--input', '-i',
        type=argparse.FileType('r'),
        default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    add('--output', '-o',
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    return parser
def get_vocab(train_file, vocab_file):
    """Write the word frequencies of `train_file` to `vocab_file`.

    `train_file` is iterated line by line; each line is stripped of
    surrounding '\\r', '\\n' and spaces and split on single spaces, and empty
    fields are ignored. One "word count" line is written per distinct word,
    most frequent first; words with equal counts keep first-seen order.
    """
    counts = Counter()
    for line in train_file:
        counts.update(tok for tok in line.strip('\r\n ').split(' ') if tok)
    # most_common() is a stable descending sort on the count.
    for token, freq in counts.most_common():
        vocab_file.write(token + " " + str(freq) + "\n")
if __name__ == "__main__":
    # Warn when a 'subword_nmt' package directory sits next to this script,
    # i.e. the script is being run from its legacy location.
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )
    # python 2/3 compatibility: make the standard streams speak UTF-8
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
    parser = create_parser()
    args = parser.parse_args()
    # read/write files as UTF-8. argparse has already opened the paths with
    # the platform default encoding; close those handles before reopening so
    # they are not leaked.
    if args.input.name != '<stdin>':
        args.input.close()
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output.close()
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    try:
        get_vocab(args.input, args.output)
    finally:
        # close files even if counting fails part-way through
        if args.input.name != '<stdin>':
            args.input.close()
        if args.output.name != '<stdout>':
            args.output.close()

subword/learn_bpe.py ADDED Viewed

	@@ -0,0 +1,372 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
+Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
+of a text to a configurable number of symbols, with only a small increase in the number of tokens.
+Reference:
+Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
+Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
+"""
+from __future__ import unicode_literals
+import os
+import sys
+import inspect
+import codecs
+import re
+import copy
+import argparse
+import warnings
+import tempfile
+from multiprocessing import Pool, cpu_count
+from collections import defaultdict, Counter
+try:
+    from tqdm import tqdm
+except ImportError:
+    def tqdm(iterator, *args, **kwargs):
+        return iterator
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
def create_parser(subparsers=None):
    """Build the argument parser for BPE learning.

    If `subparsers` is given (the object returned by
    `ArgumentParser.add_subparsers()`), the parser is registered on it as the
    'learn-bpe' sub-command; otherwise a standalone parser is created.

    Returns the parser with the input/output, symbol-count, frequency,
    dictionary-input, total-symbols, worker-count and verbosity options defined.
    """
    # Settings shared by the sub-command and the standalone parser.
    common = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")
    if subparsers:
        parser = subparsers.add_parser('learn-bpe', **common)
    else:
        parser = argparse.ArgumentParser(**common)

    add = parser.add_argument

    # Streams (default to the process's standard streams).
    add('--input', '-i',
        type=argparse.FileType('r'),
        default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")
    add('--output', '-o',
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar='PATH',
        help="Output file for BPE codes (default: standard output)")

    # Learning parameters.
    add('--symbols', '-s',
        type=int,
        default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")
    add('--min-frequency',
        type=int,
        default=2,
        metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    add('--dict-input',
        action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    add('--total-symbols', '-t',
        action="store_true",
        help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")

    # Execution settings.
    add('--num-workers',
        type=int,
        default=1,
        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")
    add('--verbose', '-v',
        action="store_true",
        help="verbose mode.")
    return parser
def get_vocabulary(fobj, is_dict=False, num_workers=1):
    """Read text and return a Counter mapping each word to its frequency.

    Args:
        fobj: iterable of text lines. In parallel mode it must also expose
            `.name`, the path that the worker processes reopen.
        is_dict: if true, every line is a "word count" pair instead of raw
            text, and the counts are summed per word.
        num_workers: number of processes used to count words. Values above 1
            split the file into byte ranges handled by `_get_vocabulary`;
            standard input is always processed serially.

    Returns:
        collections.Counter of word -> frequency.

    Raises:
        SystemExit: a dictionary line does not split into exactly two fields,
            or parallel mode is requested under Python 2.
        ValueError: `num_workers` is not positive (or a dictionary count is
            not an integer).
    """
    vocab = Counter()
    if is_dict:
        for i, line in enumerate(fobj):
            try:
                word, count = line.strip('\r\n ').split(' ')
            except ValueError:
                # only a wrong field count is a malformed line; anything else
                # (e.g. KeyboardInterrupt) must propagate untouched
                print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                sys.exit(1)
            vocab[word] += int(count)
    elif num_workers == 1 or fobj.name == '<stdin>':
        if num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for line in fobj:
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
    elif num_workers > 1:
        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)
        with open(fobj.name, encoding="utf8") as f:
            size = os.fstat(f.fileno()).st_size
            # integer division: float division loses precision on huge files
            chunk_size = size // num_workers
            # offsets[i] is where worker i starts; the final 0 means "to EOF"
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                # the seek may land inside a multi-byte character: step back
                # until the rest of the line decodes
                while True:
                    try:
                        line = f.readline()
                        break
                    except UnicodeDecodeError:
                        pos -= 1
                        f.seek(pos)
                # chunk boundaries are aligned to the start of the next line
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"
        vocab_paths = []
        results = []
        try:
            pool = Pool(processes=num_workers)
            for i in range(num_workers):
                # each worker pickles its partial Counter into its own temp file
                tmp = tempfile.NamedTemporaryFile(delete=False)
                tmp.close()
                vocab_paths.append(tmp.name)
                results.append(pool.apply_async(_get_vocabulary, (fobj.name, tmp.name, offsets[i], offsets[i + 1])))
            pool.close()
            pool.join()
            import pickle
            for result, path in zip(results, vocab_paths):
                # re-raise a worker failure here instead of hitting an
                # unrelated EOFError when unpickling its empty temp file
                result.get()
                # the pickles were written by our own workers, not external data
                with open(path, 'rb') as f:
                    vocab += pickle.load(f)
        finally:
            # never leave temp files behind, even when a worker failed
            for path in vocab_paths:
                if os.path.exists(path):
                    os.remove(path)
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
    return vocab
def _get_vocabulary(infile, outfile, begin, end):
    """Count the words of one slice of `infile` and pickle the result.

    Reading starts at file position `begin`. A line is counted only while the
    position reached after reading it does not exceed `end`; an `end` of 0
    (or less) means "read to the end of the file". The resulting Counter is
    pickled into `outfile`.
    """
    import pickle
    counts = Counter()
    bounded = end > 0
    with open(infile, encoding="utf8") as src:
        src.seek(begin)
        # readline() returns '' only at end of file
        for line in iter(src.readline, ''):
            pos = src.tell()
            assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
            if bounded and pos > end:
                break
            counts.update(tok for tok in line.strip('\r\n ').split(' ') if tok)
    with open(outfile, 'wb') as sink:
        pickle.dump(counts, sink)
def update_pair_statistics(pair, changed, stats, indices):
    """Incrementally refresh pair frequencies and indices after a merge.

    Merging `pair` only affects the pairs that overlap one of its occurrences,
    so only the words listed in `changed` are revisited. Each entry of
    `changed` is `(word_id, new_word, old_word, freq)`, where the words are
    symbol tuples before and after the merge.

    `stats` (pair -> frequency) and `indices` (pair -> {word_id: count}) are
    updated in place; the entries for `pair` itself are reset.
    """
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    left_sym, right_sym = pair
    merged = left_sym + right_sym

    def adjust(bigram, word_id, freq_delta, count_delta):
        # apply one correction to both bookkeeping structures
        stats[bigram] += freq_delta
        indices[bigram][word_id] += count_delta

    for word_id, new_word, prev_word, freq in changed:
        # Pass 1: walk the pre-merge word and retire the pairs that bordered
        # each occurrence of `pair`.
        last = len(prev_word) - 1
        pos = 0
        while True:
            try:
                pos = prev_word.index(left_sym, pos)
            except ValueError:
                break
            if pos >= last or prev_word[pos + 1] != right_sym:
                # left symbol not followed by the right one: not an occurrence
                pos += 1
                continue
            # for "A B C" with "B C" merged, "A B" disappears
            if pos:
                adjust(prev_word[pos - 1:pos + 1], word_id, -freq, -1)
            if pos < last - 1:
                # for "A B C B" with "B C" merged, "C B" disappears -- unless
                # the word continues "B C B C": then the next occurrence
                # retires "C B" as its own left neighbour
                followed_by_pair = (prev_word[pos + 2] == left_sym
                                    and pos < last - 2
                                    and prev_word[pos + 3] == right_sym)
                if not followed_by_pair:
                    adjust(prev_word[pos + 1:pos + 3], word_id, -freq, -1)
            pos += 2

        # Pass 2: walk the post-merge word and register the pairs that now
        # border each merged symbol.
        tail = len(new_word) - 1
        pos = 0
        while True:
            try:
                pos = new_word.index(merged, pos)
            except ValueError:
                break
            # for "A BC D", "A BC" appears
            if pos:
                adjust(new_word[pos - 1:pos + 1], word_id, freq, 1)
            # for "A BC B", "BC B" appears; "BC BC" is skipped here because
            # the following merged symbol counts it as its left neighbour
            if pos < tail and new_word[pos + 1] != merged:
                adjust(new_word[pos:pos + 2], word_id, freq, 1)
            pos += 1
def get_pair_statistics(vocab):
    """Count every adjacent symbol pair and record which words contain it.

    `vocab` is a sequence of `(word, freq)` entries, each word being a
    non-empty sequence of symbols.

    Returns `(stats, indices)`: `stats` maps a pair to its total frequency
    (weighted by word frequency) and `indices` maps a pair to
    `{position of the word in vocab: occurrences of the pair in that word}`.
    """
    # frequency of each symbol pair
    pair_freqs = defaultdict(int)
    # pair -> words (by position in vocab) in which it occurs
    pair_index = defaultdict(lambda: defaultdict(int))
    for word_id, entry in enumerate(vocab):
        symbols, count = entry
        left = symbols[0]
        for right in symbols[1:]:
            bigram = (left, right)
            pair_freqs[bigram] += count
            pair_index[bigram][word_id] += 1
            left = right
    return pair_freqs, pair_index
def replace_pair(pair, vocab, indices):
    """Merge every occurrence of the symbol pair ('A', 'B') into 'AB'.

    Only the words listed in `indices[pair]` with a positive count are
    rewritten; `vocab` (a list of `(word, freq)` entries) is updated in place.

    Returns a list of `(word_id, new_word, old_word, freq)` tuples, one per
    rewritten word.
    """
    first, second = pair
    merged = ''.join(pair)
    # match "A B" only as two whole, space-delimited symbols
    matcher = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)')
    occurrences = indices[pair]
    # dict.iteritems only exists on Python 2; fall back to items() otherwise
    entries = getattr(occurrences, 'iteritems', occurrences.items)()

    changes = []
    for j, count in entries:
        if count < 1:
            continue
        old_word, freq = vocab[j]
        # a callable replacement is inserted verbatim, so backslashes in the
        # merged symbol need no escaping
        joined = matcher.sub(lambda _match: merged, ' '.join(old_word))
        new_word = tuple(joined.split(' '))
        vocab[j] = (new_word, freq)
        changes.append((j, new_word, old_word, freq))
    return changes
def prune_stats(stats, big_stats, threshold):
    """Drop low-frequency pairs from `stats` so that max() stays cheap.

    The frequency of a symbol pair never increases, so pruning is generally
    safe (until the most frequent remaining pair is less frequent than a pair
    that was pruned earlier). `big_stats` keeps the full statistics for when
    pruned items have to be consulted again.

    Every pair whose frequency is below `threshold` is removed from `stats`.
    A negative frequency is added onto the value held in `big_stats`; any
    other frequency overwrites it.
    """
    # snapshot first: entries are deleted from `stats` while processing
    doomed = [(key, count) for key, count in stats.items() if count < threshold]
    for key, count in doomed:
        del stats[key]
        big_stats[key] = big_stats[key] + count if count < 0 else count
def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False, total_symbols=False, num_workers=1):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.

    Args:
        infile: text (or word-count dictionary) source, passed to `get_vocabulary`.
        outfile: writable text stream; receives a version header followed by
            one "left right" merge operation per line.
        num_symbols: maximum number of merge operations to learn.
        min_frequency: stop as soon as the best pair is rarer than this.
        verbose: if true, report every merge on stderr.
        is_dict: if true, `infile` holds "word count" lines.
        total_symbols: if true, subtract the number of distinct characters
            from `num_symbols`.
        num_workers: number of processes used to read the vocabulary.
    """
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    vocab = get_vocabulary(infile, is_dict, num_workers)
    # represent each word as a tuple of symbols, the last one carrying '</w>'
    vocab = {tuple(word[:-1]) + (word[-1] + '</w>',): count for (word, count) in vocab.items()}
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    if total_symbols:
        uniq_char_internal = set()
        uniq_char_final = set()
        for word in vocab:
            for char in word[:-1]:
                uniq_char_internal.add(char)
            uniq_char_final.add(word[-1])
        sys.stderr.write('Number of word-internal characters: {0}\n'.format(len(uniq_char_internal)))
        sys.stderr.write('Number of word-final characters: {0}\n'.format(len(uniq_char_final)))
        sys.stderr.write('Reducing number of merge operations by {0}\n'.format(len(uniq_char_internal) + len(uniq_char_final)))
        num_symbols -= len(uniq_char_internal) + len(uniq_char_final)
    # an empty corpus, or one made only of single-character words, yields no
    # pairs at all; max() below would raise ValueError on the empty dict
    if not stats:
        sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
        return
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm(range(num_symbols)):
        if stats:
            # ties on frequency are broken by the pair itself, for determinism
            most_frequent = max(stats, key=lambda x: (stats[x], x))
        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)
        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break
        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        # the merged pair no longer exists as two separate symbols
        stats[most_frequent] = 0
        # prune periodically to keep max() over `stats` cheap
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
if __name__ == '__main__':
    # Warn when a 'subword_nmt' package directory sits next to this script,
    # i.e. the script is being run from its legacy location.
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )
    # python 2/3 compatibility: make the standard streams speak UTF-8
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
    parser = create_parser()
    args = parser.parse_args()
    # a non-positive worker count means "use every available core"
    if args.num_workers <= 0:
        args.num_workers = cpu_count()
    if sys.version_info < (3, 0) and args.num_workers > 1:
        args.num_workers = 1
        warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")
    # read/write files as UTF-8. argparse has already opened the paths with
    # the platform default encoding; close those handles before reopening so
    # they are not leaked.
    if args.input.name != '<stdin>':
        args.input.close()
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output.close()
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    try:
        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols, num_workers=args.num_workers)
    finally:
        # close files even if learning fails part-way through
        if args.input.name != '<stdin>':
            args.input.close()
        if args.output.name != '<stdout>':
            args.output.close()

subword/learn_joint_bpe_and_vocab.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
+This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus),
+applies the learned operation to each and (optionally) returns the resulting vocabulary of each text.
+The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text.
+Reference:
+Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
+Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
+"""
+from __future__ import unicode_literals
+import sys
+import os
+import inspect
+import codecs
+import argparse
+import tempfile
+import warnings
+from collections import Counter
+from multiprocessing import cpu_count
+#hack to get imports working if running this as a script, or within a package
+if __name__ == '__main__':
+    import learn_bpe
+    import apply_bpe
+else:
+    from . import learn_bpe
+    from . import apply_bpe
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+def create_parser(subparsers=None):
+    if subparsers:
+        parser = subparsers.add_parser('learn-joint-bpe-and-vocab',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description="learn BPE-based word segmentation")
+    else:
+        parser = argparse.ArgumentParser(
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description="learn BPE-based word segmentation")
+    parser.add_argument(
+        '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+',
+        metavar='PATH',
+        help="Input texts (multiple allowed).")
+    parser.add_argument(
+        '--output', '-o', type=argparse.FileType('w'), required=True,
+        metavar='PATH',
+        help="Output file for BPE codes.")
+    parser.add_argument(
+        '--symbols', '-s', type=int, default=10000,
+        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")
+    parser.add_argument(
+        '--separator', type=str, default='@@', metavar='STR',
+        help="Separator between non-final subword units (default: '%(default)s')")
+    parser.add_argument(
+        '--write-vocabulary', type=argparse.FileType('w'), required=True, nargs = '+', default=None,
+        metavar='PATH', dest='vocab',
+        help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py')
+    parser.add_argument(
+        '--min-frequency', type=int, default=2, metavar='FREQ',
+        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
+    parser.add_argument(
+        '--total-symbols', '-t', action="store_true",
+        help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")
+    parser.add_argument(
+        '--num-workers', type=int, default=1,
+        help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)")
+    parser.add_argument(
+        '--verbose', '-v', action="store_true",
+        help="verbose mode.")
+    return parser
+def learn_joint_bpe_and_vocab(args):
+    if args.vocab and len(args.input) != len(args.vocab):
+        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
+        sys.exit(1)
+    # read/write files as UTF-8
+    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
+    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]
+    # get combined vocabulary of all input texts
+    full_vocab = Counter()
+    for f in args.input:
+        full_vocab += learn_bpe.get_vocabulary(f, num_workers=args.num_workers)
+        f.seek(0)
+    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]
+    # learn BPE on combined vocabulary
+    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
+        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)
+    with codecs.open(args.output.name, encoding='UTF-8') as codes:
+        bpe = apply_bpe.BPE(codes, separator=args.separator)
+    # apply BPE to each training corpus and get vocabulary
+    for train_file, vocab_file in zip(args.input, args.vocab):
+        tmp = tempfile.NamedTemporaryFile(delete=False)
+        tmp.close()
+        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')
+        train_file.seek(0)
+        bpe.process_lines(train_file.name, tmpout, num_workers=args.num_workers)
+        tmpout.close()
+        tmpin = codecs.open(tmp.name, encoding='UTF-8')
+        vocab = learn_bpe.get_vocabulary(tmpin, num_workers=args.num_workers)
+        tmpin.close()
+        os.remove(tmp.name)
+        for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True):
+            vocab_file.write("{0} {1}\n".format(key, freq))
+        train_file.close()
+        vocab_file.close()
+if __name__ == '__main__':
+    # Script entry point: parse the command line and run the joint BPE/vocabulary workflow.
+    # If a 'subword_nmt' directory exists next to this file, warn that the script has moved there.
+    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+    newdir = os.path.join(currentdir, 'subword_nmt')
+    if os.path.isdir(newdir):
+        warnings.warn(
+            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
+            DeprecationWarning
+        )
+    # python 2/3 compatibility
+    # Replace the standard streams with UTF-8 codec wrappers (Python 3 wraps the underlying byte buffers).
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+    else:
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
+    parser = create_parser()
+    args = parser.parse_args()
+    # A non-positive worker count selects one worker per CPU.
+    if args.num_workers <= 0:
+        args.num_workers = cpu_count()
+    # Python 2 only: decode the separator to unicode and force a single worker.
+    if sys.version_info < (3, 0):
+        args.separator = args.separator.decode('UTF-8')
+        if args.num_workers > 1:
+            args.num_workers = 1
+            warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.")
+    # NOTE(review): asserts are stripped under `python -O`; learn_joint_bpe_and_vocab()
+    # performs its own length check and exits with an error message.
+    assert(len(args.input) == len(args.vocab))
+    learn_joint_bpe_and_vocab(args)

subword/segment_char_ngrams.py ADDED Viewed

	@@ -0,0 +1,95 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+from __future__ import unicode_literals, division
+import sys
+import codecs
+import argparse
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+def create_parser(subparsers=None):
+    if subparsers:
+        parser = subparsers.add_parser('segment-char-ngrams',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description="segment rare words into character n-grams")
+    else:
+        parser = argparse.ArgumentParser(
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            description="segment rare words into character n-grams")
+    parser.add_argument(
+        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+        metavar='PATH',
+        help="Input file (default: standard input).")
+    parser.add_argument(
+        '--vocab', type=argparse.FileType('r'), metavar='PATH',
+        required=True,
+        help="Vocabulary file.")
+    parser.add_argument(
+        '--shortlist', type=int, metavar='INT', default=0,
+        help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).")
+    parser.add_argument(
+        '-n', type=int, metavar='INT', default=2,
+        help="segment rare words into character n-grams of size INT (default: '%(default)s')).")
+    parser.add_argument(
+        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
+        metavar='PATH',
+        help="Output file (default: standard output)")
+    parser.add_argument(
+        '--separator', '-s', type=str, default='@@', metavar='STR',
+        help="Separator between non-final subword units (default: '%(default)s'))")
+    return parser
+def segment_char_ngrams(args):
+    vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2]
+    vocab = dict((y,x) for (x,y) in enumerate(vocab))
+    for line in args.input:
+      for word in line.split():
+        if word not in vocab or vocab[word] > args.shortlist:
+          i = 0
+          while i*args.n < len(word):
+            args.output.write(word[i*args.n:i*args.n+args.n])
+            i += 1
+            if i*args.n < len(word):
+              args.output.write(args.separator)
+            args.output.write(' ')
+        else:
+          args.output.write(word + ' ')
+      args.output.write('\n')
+if __name__ == '__main__':
+    # python 2/3 compatibility
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+    else:
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
+    parser = create_parser()
+    args = parser.parse_args()
+    if sys.version_info < (3, 0):
+        args.separator = args.separator.decode('UTF-8')
+    # read/write files as UTF-8
+    args.vocab = codecs.open(args.vocab.name, encoding='utf-8')
+    if args.input.name != '<stdin>':
+        args.input = codecs.open(args.input.name, encoding='utf-8')
+    if args.output.name != '<stdout>':
+        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
+    segment_char_ngrams(args)

subword/subword_nmt.py ADDED Viewed

	@@ -0,0 +1,97 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import io
+import sys
+import codecs
+import argparse
+from .learn_bpe import learn_bpe
+from .apply_bpe import BPE, read_vocabulary
+from .get_vocab import get_vocab
+from .learn_joint_bpe_and_vocab import learn_joint_bpe_and_vocab
+from .learn_bpe import create_parser as create_learn_bpe_parser
+from .apply_bpe import create_parser as create_apply_bpe_parser
+from .get_vocab import create_parser as create_get_vocab_parser
+from .learn_joint_bpe_and_vocab import create_parser as create_learn_joint_bpe_and_vocab_parser
+# hack for python2/3 compatibility
+argparse.open = io.open
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter,
+        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation ")
+    subparsers = parser.add_subparsers(dest='command',
+                                       help="""command to run. Run one of the commands with '-h' for more info.
+learn-bpe: learn BPE merge operations on input text.
+apply-bpe: apply given BPE operations to input text.
+get-vocab: extract vocabulary and word frequencies from input text.
+learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")
+    learn_bpe_parser = create_learn_bpe_parser(subparsers)
+    apply_bpe_parser = create_apply_bpe_parser(subparsers)
+    get_vocab_parser = create_get_vocab_parser(subparsers)
+    learn_joint_bpe_and_vocab_parser = create_learn_joint_bpe_and_vocab_parser(subparsers)
+    args = parser.parse_args()
+    if args.command == 'learn-bpe':
+        # read/write files as UTF-8
+        if args.input.name != '<stdin>':
+            args.input = codecs.open(args.input.name, encoding='utf-8')
+        if args.output.name != '<stdout>':
+            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
+        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose,
+                  is_dict=args.dict_input, total_symbols=args.total_symbols)
+    elif args.command == 'apply-bpe':
+        # read/write files as UTF-8
+        args.codes = codecs.open(args.codes.name, encoding='utf-8')
+        if args.input.name != '<stdin>':
+            args.input = codecs.open(args.input.name, encoding='utf-8')
+        if args.output.name != '<stdout>':
+            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
+        if args.vocabulary:
+            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')
+        if args.vocabulary:
+            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
+        else:
+            vocabulary = None
+        if sys.version_info < (3, 0):
+            args.separator = args.separator.decode('UTF-8')
+            if args.glossaries:
+                args.glossaries = [g.decode('UTF-8') for g in args.glossaries]
+        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)
+        for line in args.input:
+            args.output.write(bpe.process_line(line, args.dropout))
+    elif args.command == 'get-vocab':
+        if args.input.name != '<stdin>':
+            args.input = codecs.open(args.input.name, encoding='utf-8')
+        if args.output.name != '<stdout>':
+            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
+        get_vocab(args.input, args.output)
+    elif args.command == 'learn-joint-bpe-and-vocab':
+        learn_joint_bpe_and_vocab(args)
+        if sys.version_info < (3, 0):
+            args.separator = args.separator.decode('UTF-8')
+    else:
+        raise Exception('Invalid command provided')
+# python 2/3 compatibility
+# This runs when the module is imported (it is not inside main() or a __main__ guard):
+# the process-wide standard streams are replaced with UTF-8 codec wrappers.
+# Python 2 wraps the stream objects themselves, Python 3 wraps their byte buffers.
+if sys.version_info < (3, 0):
+    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+else:
+    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
+    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
+    sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

subword/tests/__init__.py ADDED Viewed

File without changes

subword/tests/data/.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ bpe.out

subword/tests/data/bpe.ref ADDED Viewed

	@@ -0,0 +1,1001 @@

+#version: 0.2
+t h
+th e</w>
+i n
+a n
+e r
+r e
+o r
+t i
+a r
+an d</w>
+e n
+o f</w>
+o u
+o n
+t o</w>
+o n</w>
+i s</w>
+e d</w>
+in g</w>
+a l
+i n</w>
+e r</w>
+i t
+s t
+e s</w>
+a t
+o r</w>
+a t</w>
+r o
+i c
+o m
+e s
+i l
+e n</w>
+o u</w>
+a s
+a s</w>
+e l
+u s
+a n</w>
+e c
+i s
+o s
+a c
+ti on</w>
+y ou</w>
+o t
+f or</w>
+w h
+i t</w>
+a l</w>
+v e</w>
+p l
+a p
+s h
+o l
+d i
+th e
+q u
+th at</w>
+e t
+m a
+ar e</w>
+al l</w>
+th is</w>
+c om
+c h
+r i
+u n
+en t</w>
+b e</w>
+b l
+n o
+a m
+e v
+c e</w>
+@ -
+@- @</w>
+f or
+s i
+u r
+l o
+it h</w>
+er s</w>
+t s</w>
+ou r</w>
+w ith</w>
+re s
+h a
+p ro
+qu ot
+quot ;</w>
+& quot;</w>
+e m
+ti on
+a d
+l y</w>
+e t</w>
+b e
+or d
+c on
+er e</w>
+i g
+n e
+a y</w>
+ro m</w>
+f rom</w>
+b u
+n d</w>
+ap os
+& apos
+o w
+i r
+w or
+b y</w>
+a tion</w>
+o p
+&apos ;
+f f
+t r
+l i
+s u
+y our</w>
+no t</w>
+the y</w>
+ic h</w>
+s p
+c an</w>
+ou t</w>
+e x
+e ar
+l d</w>
+d e
+v er
+t a
+g e</w>
+wh ich</w>
+d s</w>
+bl e</w>
+p ar
+on e</w>
+a y
+w il
+in g
+d at
+t er</w>
+t er
+ha ve</w>
+sh all</w>
+tion s</w>
+m an
+it y</w>
+d e</w>
+wil l</w>
+p a
+o d</w>
+& #
+th er</w>
+c l
+. .
+.. .</w>
+u l
+es s</w>
+0 0
+i f</w>
+a b
+h e</w>
+ou ld</w>
+i r</w>
+c h</w>
+t h</w>
+r a
+m er
+1 2
+p u
+A nd</w>
+un to</w>
+s it
+res s</w>
+p e
+h t</w>
+en ts</w>
+4 ;</w>
+12 4;</w>
+&# 124;</w>
+ing s</w>
+h ol
+v er</w>
+m e</w>
+w e</w>
+s o</w>
+re e</w>
+m y</w>
+u p
+k e</w>
+i d
+at ed</w>
+us e</w>
+m ent</w>
+&apos; s</w>
+es t</w>
+a r</w>
+P ress</w>
+ou n
+h o
+for e</w>
+f il
+d ow
+al l
+at e</w>
+t ed</w>
+p er
+h is</w>
+er e
+as e</w>
+the ir</w>
+p or
+I C
+th ere</w>
+t o
+is h</w>
+2 00
+r ou
+m e
+ec om
+h i
+as t</w>
+wor k</w>
+w as</w>
+sit es</w>
+f t
+u m
+in e</w>
+a ti
+ri bu
+or e</w>
+g l
+c at</w>
+a ble</w>
+IC E
+ICE cat</w>
+g i
+am e</w>
+ac c
+u d
+st r
+s o
+pl e</w>
+mer ce</w>
+k s</w>
+g o
+ev en</w>
+c re
+y st
+us t</w>
+or s</w>
+ic e</w>
+h as</w>
+ecom merce</w>
+c i
+no w</w>
+a v
+m ents</w>
+a d</w>
+us ing</w>
+s t</w>
+man y</w>
+ma y</w>
+k ing</w>
+ev er
+ere fore</w>
+di st
+y e</w>
+u t
+ti me</w>
+s e
+re n
+os e</w>
+o ther</w>
+m ore</w>
+e st
+s er
+s el
+re c
+p h
+lo c
+l ic
+in ce</w>
+en s
+bu t</w>
+ar y</w>
+an t</w>
+G od</w>
+s yst
+s om
+l e
+f ree</w>
+dist ribu
+an s
+a g
+W ord
+p ur
+en t
+d o
+ar t
+al so</w>
+w e
+v i
+s a
+ri g
+ne w</w>
+l and</w>
+b o
+w ere</w>
+u c
+n ing</w>
+m ig
+i c</w>
+f ir
+es e</w>
+em s</w>
+e l</w>
+d o</w>
+b r
+as ed</w>
+ab out</w>
+E n
+th ings</w>
+lic ens
+it s</w>
+i m
+g r
+dat a</w>
+y e
+up on</w>
+s ti
+or d</w>
+in s</w>
+con t
+w i
+us ed</w>
+si on</w>
+p os
+ou nd</w>
+l a
+f e
+es s
+com m
+L ord</w>
+1 9
+the m</w>
+th ese</w>
+on ly</w>
+is h
+in cl
+et c</w>
+el s</w>
+el l</w>
+c ol
+c o
+ac h</w>
+a m</w>
+a il
+u l</w>
+th ou</w>
+ou r
+n lo
+in to</w>
+i es</w>
+hi m</w>
+dow nlo
+di z</w>
+d er
+al ly</w>
+ac e</w>
+Word Press</w>
+som e</w>
+s ince</w>
+re m
+pe o
+peo ple</w>
+pa in</w>
+os t</w>
+on s</w>
+n o</w>
+i ma
+ho w</w>
+for ma
+en d
+ad ing</w>
+a re
+S pain</w>
+O p
+u s</w>
+por t</w>
+ou s
+in ter
+ha d</w>
+h ere</w>
+en ti
+be en</w>
+ay s</w>
+ur e</w>
+t e
+sh ould</w>
+ser v
+p re
+l ay
+g re
+ff er
+b ased</w>
+ap art
+a diz</w>
+C h
+C adiz</w>
+w ould</w>
+w are</w>
+ver y</w>
+u p</w>
+syst ems</w>
+o st
+loc ated</w>
+incl ud
+hol d</w>
+gl ish</w>
+forma tion</w>
+f in
+en d</w>
+d ev
+ar k
+Q u
+Op en</w>
+En glish</w>
+wh o</w>
+u ro
+t ing</w>
+su p
+o re
+n ess</w>
+in formation</w>
+g et</w>
+f i
+ec t</w>
+b ec
+ar d</w>
+an ds</w>
+an ce</w>
+E uro
+u e</w>
+ord er</w>
+id ay</w>
+ic tion
+ft ware</w>
+f ul</w>
+d is
+at h</w>
+a tions</w>
+L u
+wh en</w>
+w ay</w>
+t e</w>
+sh e
+pur ch
+on g</w>
+m ust</w>
+fir st</w>
+fil e</w>
+em b
+e p
+e di
+an g
+ye a</w>
+t ors</w>
+st ati
+stati sti
+re s</w>
+purch ase</w>
+m ost</w>
+m en</w>
+m an</w>
+l a</w>
+it e</w>
+i l</w>
+h erefore</w>
+fil es</w>
+f t</w>
+f a
+an c
+I n
+w ell</w>
+ti c
+s ec
+par is
+p res
+o ff
+l in
+ima ge</w>
+iction ary</w>
+i z
+h op
+h el
+h e
+g h</w>
+f l
+e d
+com paris
+a use</w>
+P S
+A S
+v al
+statisti c</w>
+so ftware</w>
+she et</w>
+o k</w>
+o g
+m is
+j o
+hop s</w>
+hol iday</w>
+h ear
+go od</w>
+g o</w>
+f e</w>
+es hops</w>
+en ce</w>
+e i
+downlo ading</w>
+distribu tors</w>
+di ffer
+d ay</w>
+comparis on</w>
+an y</w>
+am il
+a ge</w>
+a f
+P s</w>
+P H
+N A</w>
+AS Ps</w>
+6 8
+v ing</w>
+th y</w>
+su ch</w>
+pu bl
+ord ing</w>
+l ine</w>
+i d</w>
+gre at</w>
+for m
+f ul
+ever y</w>
+el y</w>
+d et
+d es
+ch o
+c oun
+c ity</w>
+be hold</w>
+all ed</w>
+W herefore</w>
+PH P</w>
+P r
+wor ld</w>
+wi th
+wh at</w>
+w r
+w at
+tion al</w>
+si m
+ren t</w>
+p r
+ord s</w>
+o b
+no w
+mig ht</w>
+m u
+f amil
+e as
+d ing</w>
+bec ause</w>
+ark X
+arkX Press</w>
+acc ording</w>
+a u
+Qu arkXPress</w>
+M edi
+C om
+0 0</w>
+w s</w>
+us ers</w>
+ti es</w>
+th ing</w>
+se e</w>
+p ri
+o m</w>
+o c
+l l</w>
+k e
+ic es</w>
+em ent</w>
+ec i
+e p</w>
+e m</w>
+d uc
+d er</w>
+ar i
+am p
+af ter</w>
+Medi a</w>
+&apos; t</w>
+ver sion</w>
+v es</w>
+u res</w>
+u m</w>
+ta r</w>
+rig ht</w>
+rig h
+par t
+ow n</w>
+or y</w>
+o ver</w>
+o s</w>
+o k
+mu ch</w>
+k now
+in st
+ig h
+g en
+ex c
+differ ent</w>
+d en</w>
+ap p
+ans a</w>
+al lo
+S tar</w>
+Lu f
+L NA</w>
+D LNA</w>
+1 9</w>
+y p
+w ords</w>
+v is
+v en</w>
+u r</w>
+th ansa</w>
+si d
+sel f</w>
+re n</w>
+pu ter</w>
+pl o
+p ow
+ot h</w>
+n i
+licens e</w>
+li ke</w>
+l ear
+k now</w>
+in ut
+il e</w>
+f ore
+et s</w>
+emb er</w>
+d ec
+cont ent</w>
+com e</w>
+c alled</w>
+av ail
+ar ound</w>
+an d
+O ff
+Luf thansa</w>
+F or
+A l
+w o</w>
+up dat
+u t</w>
+u g
+ti ve</w>
+ta ke</w>
+str uc
+sid enti
+s et</w>
+s e</w>
+s ame</w>
+rec ei
+re ad
+pro duc
+pl ay
+p dat
+ou s</w>
+o l</w>
+n al</w>
+m at
+ish ed</w>
+ir it</w>
+in ed</w>
+i um</w>
+h ot
+g in
+g ht</w>
+f un
+com pl
+c ur
+avail able</w>
+a ir
+W in
+U pdat
+wor ks</w>
+with out</w>
+un g</w>
+tr ans
+th ose</w>
+th an</w>
+sp on
+sp eci
+pro c
+pa ge</w>
+on al</w>
+o ds</w>
+ma de</w>
+m es</w>
+includ ed</w>
+in i
+ig n</w>
+fe at
+el l
+ec ts</w>
+ear s</w>
+e w</w>
+e Star</w>
+dow s</w>
+be fore</w>
+b et
+at or</w>
+an s</w>
+al s</w>
+Win dows</w>
+Updat eStar</w>
+F ra
+ä sidenti
+äsidenti n</w>
+ä ft
+äft s
+äfts ord
+äftsord n
+äftsordn ung</w>
+z ur</w>
+v id
+um b
+u plo
+th rou
+t yp
+t wo</w>
+spon s
+si ble</w>
+s m
+rem ium</w>
+re p
+re gi
+r e</w>
+pow er</w>
+per s
+p an
+or ing</w>
+op en</w>
+o w</w>
+n ec
+mig al</w>
+is t</w>
+ha ving</w>
+h ath</w>
+gi ven</w>
+ev er</w>
+et h</w>
+es ch
+esch äftsordnung</w>
+en ter</w>
+e a
+con ta
+com man
+ch il
+c or
+c ap
+b oth</w>
+ati ve</w>
+apart ments</w>
+apart ment</w>
+ad a</w>
+S er
+Pr äsidentin</w>
+PS D</w>
+H ot
+G eschäftsordnung</w>
+Fra u</w>
+For migal</w>
+C al
+2 .
+1 1</w>
+y ears</w>
+wh erefore</w>
+u st
+throu gh</w>
+th en</w>
+t l
+t en</w>
+sh al
+shal t</w>
+s ou
+res t</w>
+recei ve</w>
+r u
+ot ter
+mer ci
+ma ke</w>
+m s</w>
+m o
+la w</w>
+k et</w>
+j ust</w>
+ic k</w>
+g rou
+fun c
+fore ver</w>
+fin d</w>
+f ace</w>
+ear ch</w>
+e ds</w>
+e al
+distribu tion</w>
+d ays</w>
+comman d
+chil d
+br ands</w>
+bl ess
+be gin
+am ong</w>
+am es</w>
+ac t</w>
+a in</w>
+a bl
+T h
+P remium</w>
+D e
+wat ers</w>
+v o
+u es</w>
+ti v
+t y</w>
+t ur
+sup port</w>
+spons oring</w>
+r on
+r an
+qu i
+pl ug
+par t</w>
+p as
+otter y</w>
+n or</w>
+n er</w>
+n ed</w>
+m ine</w>
+l ast</w>
+it ed</w>
+inut e</w>
+in d
+il li
+ic ation</w>
+gen er
+g es</w>
+g e
+g al</w>
+famil y</w>
+f ol
+f f</w>
+er y</w>
+er nal</w>
+el i
+d ra
+cho ose</w>
+child ren</w>
+c at
+be ach</w>
+as es</w>
+Off ers</w>
+M inute</w>
+L e
+L ast</w>
+G ods</w>
+G er
+D ictionary</w>
+Cal a</w>
+B o
+6 3
+1 5</w>
+wr it
+wh ile</w>
+w ar
+val ue</w>
+v ed</w>
+v ari
+u al</w>
+tr an
+to ol</w>
+t ri
+t en
+st ing</w>
+s ed</w>
+s ay</w>
+re d</w>
+pl e
+on g
+ol d</w>
+n ers</w>
+n a
+merci al</w>
+me di
+m on
+lo ok</w>
+l et</w>
+j ada</w>
+ic i
+hel p</w>
+feat ures</w>
+en tr
+en c
+eas y</w>
+ear th</w>
+d on</w>
+con nec
+ch ar
+c ould</w>
+be ing</w>
+b ac
+ar k</w>
+amp ;</w>
+a in
+P y
+H ost
+A n
+2 0</w>
+& amp;</w>
+ye ar</w>
+w ing</w>
+w ant</w>
+w a
+v ers</w>
+us er</w>
+ur ing</w>
+updat es</w>
+ti mes</w>
+t re
+t ly</w>
+syst em</w>
+sp ea
+sit e</w>
+sim pl
+sa id</w>
+s k
+s et
+re v
+re l
+re f
+pu t</w>
+pro g
+pl ace</w>
+pe an</w>
+p ho
+pho to</w>
+p at
+oun t</w>
+ot e</w>
+or t</w>
+og y</w>
+ne y</w>
+ne es</w>
+ne eds</w>
+ne ed</w>
+n umb
+n ame</w>
+lay ers</w>
+l l
+k en</w>
+ic al</w>
+i a</w>
+ful l</w>
+fi ed</w>
+fe w</w>
+et y</w>
+est s</w>
+es si
+dow n</w>
+do m</w>
+det ail
+dat ab
+d ictionary</w>
+con f
+com mercial</w>
+c a</w>
+b re

subword/tests/data/corpus.bpe.ref.en ADDED Viewed

The diff for this file is too large to render. See raw diff

subword/tests/data/corpus.en ADDED Viewed

The diff for this file is too large to render. See raw diff

subword/tests/test_bpe.py ADDED Viewed

	@@ -0,0 +1,83 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+import unittest
+import codecs
+import os,sys,inspect
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0,parentdir)
+from learn_bpe import learn_bpe
+from apply_bpe import BPE
+class TestBPELearnMethod(unittest.TestCase):
+    def test_learn_bpe(self):
+        infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8')
+        outfile = codecs.open(os.path.join(currentdir,'data','bpe.out'), 'w', encoding='utf-8')
+        learn_bpe(infile, outfile, 1000)
+        infile.close()
+        outfile.close()
+        outlines = open(os.path.join(currentdir,'data','bpe.out'))
+        reflines = open(os.path.join(currentdir,'data','bpe.ref'))
+        for line, line2 in zip(outlines, reflines):
+            self.assertEqual(line, line2)
+        outlines.close()
+        reflines.close()
+class TestBPESegmentMethod(unittest.TestCase):
+    def setUp(self):
+        with codecs.open(os.path.join(currentdir,'data','bpe.ref'), encoding='utf-8') as bpefile:
+            self.bpe = BPE(bpefile)
+        self.infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8')
+        self.reffile = codecs.open(os.path.join(currentdir,'data','corpus.bpe.ref.en'), encoding='utf-8')
+    def tearDown(self):
+        self.infile.close()
+        self.reffile.close()
+    def test_apply_bpe(self):
+        for line, ref in zip(self.infile, self.reffile):
+            out = self.bpe.process_line(line)
+            self.assertEqual(out, ref)
+    def test_trailing_whitespace(self):
+        """BPE.proces_line() preserves leading and trailing whitespace"""
+        orig = '  iron cement  \n'
+        exp = '  ir@@ on c@@ ement  \n'
+        out = self.bpe.process_line(orig)
+        self.assertEqual(out, exp)
+    def test_utf8_whitespace(self):
+        """UTF-8 whitespace is treated as normal character, not word boundary"""
+        orig = 'iron\xa0cement\n'
+        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'
+        out = self.bpe.process_line(orig)
+        self.assertEqual(out, exp)
+    def test_empty_line(self):
+        orig = '\n'
+        exp = '\n'
+        out = self.bpe.process_line(orig)
+        self.assertEqual(out, exp)
+if __name__ == '__main__':
+    unittest.main()

subword/tests/test_glossaries.py ADDED Viewed

	@@ -0,0 +1,137 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import unittest
+import mock
+import os,sys,inspect
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0,parentdir)
+from apply_bpe import isolate_glossary, BPE
+class TestIsolateGlossaryFunction(unittest.TestCase):
+    def setUp(self):
+        self.glossary = 'like'
+    def _run_test_case(self, test_case):
+        orig, expected = test_case
+        out = isolate_glossary(orig, self.glossary)
+        self.assertEqual(out, expected)
+    def test_empty_string(self):
+        orig = ''
+        exp = ['']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+    def test_no_glossary(self):
+        orig = 'word'
+        exp = ['word']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+    def test_isolated_glossary(self):
+        orig = 'like'
+        exp = ['like']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+    def test_word_one_side(self):
+        orig = 'likeword'
+        exp = ['like', 'word']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+    def test_words_both_sides(self):
+        orig = 'wordlikeword'
+        exp = ['word', 'like', 'word']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+    def test_back_to_back_glossary(self):
+        orig = 'likelike'
+        exp = ['like', 'like']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+    def test_multiple_glossaries(self):
+        orig = 'wordlikewordlike'
+        exp = ['word', 'like', 'word', 'like']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+class TestBPEIsolateGlossariesMethod(unittest.TestCase):
+    def setUp(self):
+        amock = mock.MagicMock()
+        amock.readline.return_value = 'something'
+        glossaries = ['like', 'Manuel', 'USA']
+        self.bpe = BPE(amock, glossaries=glossaries)
+    def _run_test_case(self, test_case):
+        orig, expected = test_case
+        out = self.bpe._isolate_glossaries(orig)
+        self.assertEqual(out, expected)
+    def test_multiple_glossaries(self):
+        orig = 'wordlikeUSAwordManuelManuelwordUSA'
+        exp = ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+class TestRegexIsolateGlossaries(unittest.TestCase):
+    def setUp(self):
+        amock = mock.MagicMock()
+        amock.readline.return_value = 'something'
+        glossaries = ["<country>\w*</country>", "<name>\w*</name>", "\d+"]
+        self.bpe = BPE(amock, glossaries=glossaries)
+    def _run_test_case(self, test_case):
+        orig, expected = test_case
+        out = self.bpe._isolate_glossaries(orig)
+        self.assertEqual(out, expected)
+    def test_regex_glossaries(self):
+        orig = 'wordlike<country>USA</country>word10001word<name>Manuel</name>word<country>USA</country>'
+        exp = ['wordlike', '<country>USA</country>', 'word', '10001', 'word', '<name>Manuel</name>', 'word', '<country>USA</country>']
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+def encode_mock(segment, x2, x3, x4, x5, x6, x7, glosses, dropout):
+    if glosses.match(segment):
+        return (segment,)
+    else:
+        l = len(segment)
+        return (segment[:l//2], segment[l//2:])
+class TestBPESegmentMethod(unittest.TestCase):
+    """Checks BPE.segment() with glossaries while apply_bpe.encode is replaced by encode_mock."""
+    def setUp(self):
+        # a MagicMock takes the place of the codes file; readline() always yields 'something'
+        amock = mock.MagicMock()
+        amock.readline.return_value = 'something'
+        glossaries = ['like', 'Manuel', 'USA']
+        self.bpe = BPE(amock, glossaries=glossaries)
+    # mock.patch hands the patch object to the decorated function as an extra
+    # trailing argument, received here as `encode_function` (unused in the body)
+    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
+    def _run_test_case(self, test_case, encode_function):
+        # test_case is an (input text, expected segmented text) pair
+        orig, expected = test_case
+        out = self.bpe.segment(orig)
+        self.assertEqual(out, expected)
+    def test_multiple_glossaries(self):
+        # in the expected text the glossary entries stay whole; every other piece is halved by encode_mock
+        orig = 'wordlikeword likeManuelword'
+        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
+        test_case = (orig, exp)
+        self._run_test_case(test_case)
+if __name__ == '__main__':
+    unittest.main()