yizhangliu committed
Commit fd18910
1 Parent(s): 3d24b44

Upload 3 files

Files changed (3)
  1. encoder.json +0 -0
  2. encoder.py +120 -0
  3. vocab.bpe +0 -0
encoder.json ADDED
The diff for this file is too large to render.
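For orientation: encoder.json is a flat JSON object mapping each byte-level BPE token string to its integer id (the GPT-2 vocabulary). An illustrative excerpt, not the literal file contents:

    {"!": 0, "\"": 1, "#": 2, ..., "<|endoftext|>": 50256}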
 
encoder.py ADDED
@@ -0,0 +1,120 @@
+# This file includes code which was modified from https://github.com/openai/gpt-2
+
+import json
+import regex as re  # third-party `regex` package: needed for the \p{L}/\p{N} classes below
+from functools import lru_cache
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """Map every byte (0..255) to a printable unicode character.
+
+    Printable Latin-1 bytes map to themselves; the remaining bytes are
+    shifted to unused code points above 255, so BPE never has to handle
+    raw control or whitespace bytes directly.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2 ** 8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2 ** 8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return the set of adjacent symbol pairs in `word` (a tuple of symbols)."""
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class Encoder:
+    def __init__(self, encoder, bpe_merges, errors="replace"):
+        self.encoder = encoder  # token string -> integer id
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # error mode for UTF-8 decoding in decode()
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))  # pair -> merge priority
+        self.cache = {}
+        # GPT-2 pre-tokenizer: contractions, runs of letters, runs of digits,
+        # runs of other symbols, and whitespace.
+        self.pat = re.compile(
+            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+        )
+
+    def bpe(self, token):
+        """Apply the learned merges to one pre-token; return the merged symbols joined by spaces."""
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+        if not pairs:
+            return token
+
+        while True:
+            # Merge the pair with the lowest rank (highest priority) present in the word.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:  # `first` does not occur at or after position i
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            word = tuple(new_word)
+            if len(word) == 1:
+                break
+            pairs = get_pairs(word)
+
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        """Encode a string into a list of integer token ids."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            # Represent the token's UTF-8 bytes as printable characters, then BPE-merge them.
+            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        """Decode a list of token ids back into a string."""
+        text = "".join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+
+def get_encoder():
+    """Load encoder.json and vocab.bpe from the current directory and build an Encoder."""
+    with open("encoder.json", "r") as f:
+        encoder = json.load(f)
+    with open("vocab.bpe", "r", encoding="utf-8") as f:
+        bpe_data = f.read()
+    # Skip the "#version" header line and the trailing empty line.
+    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
+    return Encoder(encoder=encoder, bpe_merges=bpe_merges)
+
+
+# encoder = get_encoder()
+# print('encoded is ', encoder.encode('hello 👋 world 🌍 This is a long string to test whether or not the emoji issue was fixed!'))
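For reference, a minimal round-trip sketch (assumes encoder.json and vocab.bpe sit in the working directory, and that the third-party regex package is installed):

    from encoder import get_encoder

    enc = get_encoder()                   # loads encoder.json and vocab.bpe from the CWD
    ids = enc.encode("hello 👋 world 🌍")  # list of integer token ids
    assert enc.decode(ids) == "hello 👋 world 🌍"  # byte-level BPE round-trips exactly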
vocab.bpe ADDED
The diff for this file is too large to render.
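For orientation: vocab.bpe is a plain-text list of BPE merge rules, one space-separated symbol pair per line in merge-priority order, preceded by a version header; that header (and the trailing empty line) is why get_encoder() above slices split("\n")[1:-1]. An illustrative excerpt, not the literal file contents:

    #version: 0.2
    Ġ t
    Ġ a
    ...

(Ġ is the printable stand-in that bytes_to_unicode() assigns to the space byte.)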