Adapted from https://github.com/ThAIKeras/bert for the HuggingFace/Transformers library.

## Pre-tokenization

You must run the original ThaiTokenizer to have your tokenization match that of the original model. If you skip this step, you will not do much better than mBERT or random chance!

```bash
pip install pythainlp six sentencepiece==0.0.9
git clone https://github.com/ThAIKeras/bert
# download the .vocab and .model files from the ThAIKeras readme
```
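The snippets below expect those two files, `th.wiki.bpe.op25000.vocab` and `th.wiki.bpe.op25000.model`, to sit in your working directory. A quick optional check, assuming the files were saved under their original names:

```python
import os

# optional: confirm the SentencePiece/BPE files referenced below are present
# (filenames assume they were saved as downloaded from the ThAIKeras readme links)
for filename in ('th.wiki.bpe.op25000.vocab', 'th.wiki.bpe.op25000.model'):
    assert os.path.exists(filename), f'missing {filename} - download it per the ThAIKeras readme'
```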
Then set up the ThaiTokenizer class - this is modified slightly to remove a TensorFlow dependency. You will likely need to run it from the directory containing the cloned `bert` repo so that `from bert.bpe_helper import BPE` resolves.

```python
import collections
import unicodedata
import six


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")


def load_vocab(vocab_file):
    """Loads a vocabulary file (one token per line) into an ordered token -> id dict."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r") as reader:
        while True:
            token = reader.readline()
            if token.split():
                token = token.split()[0]  # to support SentencePiece vocab file
            token = convert_to_unicode(token)
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


#####

from bert.bpe_helper import BPE  # from the cloned ThAIKeras/bert repo
import sentencepiece as spm


def convert_by_vocab(vocab, items):
    """Maps a sequence of tokens (or ids) using the given vocab dict."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


class ThaiTokenizer(object):
    """Tokenizes Thai texts."""

    def __init__(self, vocab_file, spm_file):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        self.bpe = BPE(vocab_file)
        self.s = spm.SentencePieceProcessor()
        self.s.Load(spm_file)

    def tokenize(self, text):
        # tokenize with both BPE and SentencePiece, keep whichever segmentation is shorter
        bpe_tokens = self.bpe.encode(text).split(' ')
        spm_tokens = self.s.EncodeAsPieces(text)

        tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens

        split_tokens = []

        for token in tokens:
            new_token = token

            # if an underscore-prefixed token is out of vocabulary,
            # emit the prefix separately and retry the remainder
            if token.startswith('_') and not token in self.vocab:
                split_tokens.append('_')
                new_token = token[1:]

            if not new_token in self.vocab:
                split_tokens.append('<unk>')
            else:
                split_tokens.append(new_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
```
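The `convert_tokens_to_ids` and `convert_ids_to_tokens` helpers are not exercised in the example below, so here is a minimal sketch of how they behave - it assumes the `.vocab` and `.model` files are in the working directory and constructs the tokenizer the same way as the next snippet:

```python
# Minimal sketch (not from the original README): map tokens to vocabulary ids and back.
tokenizer = ThaiTokenizer(vocab_file='th.wiki.bpe.op25000.vocab', spm_file='th.wiki.bpe.op25000.model')

tokens = tokenizer.tokenize("สวัสดีครับ")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)
# round-trips as long as every produced token (including '<unk>') is in the vocab
print(tokenizer.convert_ids_to_tokens(ids))
```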
Then pre-tokenize your own text:

```python
from pythainlp import sent_tokenize

tokenizer = ThaiTokenizer(vocab_file='th.wiki.bpe.op25000.vocab', spm_file='th.wiki.bpe.op25000.model')

og_text = "กรุงเทพมหานคร..."
split_sentences = ' '.join(sent_tokenize(og_text))
split_words = ' '.join(tokenizer.tokenize(split_sentences))

split_words
# > "▁ร้าน อาหาร ใหญ่มาก กก กก กก ▁ <unk> เลี้ยว..."
```
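After pre-tokenization, pass the space-joined string to the Transformers tokenizer and model from this repo as usual. A minimal sketch, assuming the model is published under the `monsoon-nlp/bert-base-thai` ID (substitute this repository's actual name) and that PyTorch is installed:

```python
from transformers import AutoModel, AutoTokenizer

# assumed repo ID - replace with this repository's actual model name if it differs
model_name = 'monsoon-nlp/bert-base-thai'
hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# feed the pre-tokenized, space-separated text produced by ThaiTokenizer above
inputs = hf_tokenizer(split_words, return_tensors='pt')
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```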
Original README follows:

---

Google's [**BERT**](https://github.com/google-research/bert) is currently the state-of-the-art method of pre-training text representations which additionally provides multilingual models. ~~Unfortunately, Thai is the only one in 103 languages that is excluded due to difficulties in word segmentation.~~