tspersian committed
Commit d786ff1
1 Parent(s): 4128ba5
Files changed (6)
  1. .gitignore +1 -0
  2. README.md +8 -2
  3. __init__.py +1 -1
  4. base.py +6 -6
  5. mana_tokenizer.py +2 -1
  6. test.py +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
README.md CHANGED
@@ -11,7 +11,7 @@ language:
  The Mana Tokenizer is a custom-trained BPE tokenizer designed for Persian text. It is trained on a combination of huge Persian corpus. The tokenizer is built using the BPE with high character coverage to handle diverse Persian text.

  ## Quick Start
- 
+ You can encode/decode your data using Mana Tokenizer like this:
  ```python
  from mana_tokenizer import ManaTokenizer
  tokenizer = ManaTokenizer()
@@ -19,11 +19,17 @@ text = "سلام من یک متن تست برای تست این تست هستم.
  print(tokenizer.encode(text))
  print(tokenizer.decode(tokenizer.encode(text)))
  ```
+ output should be:
+ ```
+ [216, 179, 217, 132, 216, 167, 217, 133, 32, 217, 133, 217, 134, 32, 219, 140, 218, 169, 32, 217, 133, 216, 170, 217, 134, 32, 216, 170, 216, 179, 216, 170, 32, 216, 168, 216, 177, 216, 167, 219, 140, 32, 216, 170, 216, 179, 216, 170, 32, 216, 167, 219, 140, 217, 134, 32, 216, 170, 216, 179, 216, 170, 32, 217, 135, 216, 179, 216, 170, 217, 133, 46]
+ سلام من یک متن تست برای تست این تست هستم.
+ ```

- You can also add special tokens
+ You can also add special tokens:
  ```python
  tokenizer.register_special_tokens({"</s>": 100269})
  ```
+ 
  Batch encode:
  ```python
  tokenizer.batch_encode(["یک متن طولانی"])
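A note on the sample output added above: the id list coincides with the UTF-8 byte sequence of the input sentence (216, 179 is "س", 217, 132 is "ل", and so on), which is what a byte-level BPE tokenizer emits when none of its learned merges apply to a piece of text. A quick, tokenizer-independent way to check this (hypothetical snippet, not part of the repository):

```python
# Sanity check (hypothetical): the ids shown in the README's sample output
# line up with the raw UTF-8 bytes of the text, as ids below 256 do in
# byte-level BPE tokenizers.
text = "سلام من یک متن تست برای تست این تست هستم."
byte_ids = list(text.encode("utf-8"))
print(byte_ids)                          # [216, 179, 217, 132, ...]
print(bytes(byte_ids).decode("utf-8"))   # round-trips back to the original sentence
```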
__init__.py CHANGED
@@ -1,3 +1,3 @@
  from .base import Tokenizer
  from .mana_tokenizer import ManaTokenizer
- import helper
+ from .helper import _process_string_scalar, render_token, merge
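helper.py itself is not touched by this commit, so the definitions behind these re-exports are not visible here. Judging only from the call sites in base.py below, a minimal sketch of two of them could look like the following; the bodies are assumptions in the style of common byte-level BPE utilities, not the repository's actual code:

```python
import unicodedata

def render_token(t: bytes) -> str:
    # Sketch (assumed): make a vocab entry printable by escaping control characters,
    # matching its use in save-vocab lines like f.write(f"[{s}] {idx}\n").
    s = t.decode("utf-8", errors="replace")
    return "".join(
        ch if unicodedata.category(ch)[0] != "C" else f"\\u{ord(ch):04x}"
        for ch in s
    )

def merge(ids: list, pair: tuple, idx: int, length: int) -> int:
    # Sketch (assumed): base.py calls `len_chunk = merge(chunk, pair, idx, len_chunk)`,
    # so this version rewrites every occurrence of `pair` within ids[:length] to the
    # new token id `idx` in place and returns the new logical length.
    write = read = 0
    while read < length:
        if read + 1 < length and ids[read] == pair[0] and ids[read + 1] == pair[1]:
            ids[write] = idx
            read += 2
        else:
            ids[write] = ids[read]
            read += 1
        write += 1
    return write
```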
base.py CHANGED
@@ -9,7 +9,7 @@ import os
  import regex as re
  import csv
  import time
- import helper
+ from mana_tokenizer.helper import _process_string_scalar, render_token, merge

  class Tokenizer:
  """Base class for Tokenizers"""
@@ -97,7 +97,7 @@ class Tokenizer:
  batch_size = len(item) // (self._cpus*2) or 1
  batches = [item[i:i + batch_size] for i in range(0, len(item), batch_size)]
  print(f'Processing {len(batches)} batches of size {batch_size}')
- results = Parallel(n_jobs=self._cpus)(delayed(helper._process_string_scalar)(batch, self.compiled_pattern) for batch in batches)
+ results = Parallel(n_jobs=self._cpus)(delayed(_process_string_scalar)(batch, self.compiled_pattern) for batch in batches)
  for result in results: # Aggregate results into one Counter
  ids.update(result)
  elif isinstance(item, IterableDataset):
@@ -171,12 +171,12 @@ class Tokenizer:
  inverted_merges = {idx: pair for pair, idx in self.merges.items()}
  with open(vocab_file, "w", encoding="utf-8") as f: # Ensure this is also utf-8
  for idx, token in self.vocab.items():
- s = helper.render_token(token)
+ s = render_token(token)
  # find the children of this token, if any
  if idx in inverted_merges:
  idx0, idx1 = inverted_merges[idx]
- s0 = helper.render_token(self.vocab[idx0])
- s1 = helper.render_token(self.vocab[idx1])
+ s0 = render_token(self.vocab[idx0])
+ s1 = render_token(self.vocab[idx1])
  f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
  else:
  f.write(f"[{s}] {idx}\n")
@@ -237,7 +237,7 @@ class Tokenizer:
  break # nothing else can be merged
  # otherwise let's merge the best pair (lowest merge index)
  idx = self.merges[pair]
- len_chunk = helper.merge(chunk, pair, idx, len_chunk)
+ len_chunk = merge(chunk, pair, idx, len_chunk)
  return chunk # list of ints

  def encode_ordinary(self, text):
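The hunk around line 100 only swaps `helper._process_string_scalar` for the directly imported name; the surrounding joblib pattern is unchanged. For reference, this is the general shape of that pattern, with a stand-in worker instead of the repository's `_process_string_scalar` (which is not shown in this diff):

```python
from collections import Counter
from joblib import Parallel, delayed

def count_chunks(batch):
    # Stand-in worker (hypothetical): count whitespace-separated chunks in a batch of
    # strings; the real _process_string_scalar presumably does regex pre-tokenization
    # with self.compiled_pattern instead.
    counts = Counter()
    for text in batch:
        counts.update(text.split())
    return counts

if __name__ == "__main__":
    data = ["یک متن", "یک متن دیگر", "متن"]
    batch_size = 2
    batches = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
    # Parallel returns one Counter per batch; base.py folds them into a single Counter
    # the same way.
    results = Parallel(n_jobs=2)(delayed(count_chunks)(b) for b in batches)
    ids = Counter()
    for result in results:
        ids.update(result)
    print(ids)
```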
mana_tokenizer.py CHANGED
@@ -1,4 +1,5 @@
- from .base import Tokenizer, get_stats, merge_batch_get_stats
+ from .base import Tokenizer
+ from .helper import get_stats, merge_batch_get_stats
  from heapq import nlargest
  import time

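As with base.py, the change here is only where `get_stats` and `merge_batch_get_stats` are imported from; their definitions are not part of this diff. The sketch below shows the conventional pair-counting that a `get_stats` helper of this kind usually performs and how it combines with the `nlargest` import already in the file; treat it as an illustration, not the repository's code:

```python
from collections import Counter
from heapq import nlargest

def get_stats(ids, counts=None):
    # Sketch (assumed): count occurrences of each adjacent id pair in a sequence,
    # the statistic a BPE training loop ranks before choosing merges.
    counts = Counter() if counts is None else counts
    for a, b in zip(ids, ids[1:]):
        counts[(a, b)] += 1
    return counts

# Toy usage: pick the most frequent pairs, e.g. with heapq.nlargest.
ids = [216, 179, 217, 132, 216, 179, 217, 132]
stats = get_stats(ids)
print(nlargest(2, stats.items(), key=lambda kv: kv[1]))
```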
test.py ADDED
@@ -0,0 +1,5 @@
+ from .mana_tokenizer import ManaTokenizer
+ tokenizer = ManaTokenizer()
+ text = "سلام من یک متن تست برای تست این تست هستم."
+ print(tokenizer.encode(text))
+ print(tokenizer.decode(tokenizer.encode(text)))
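Because test.py uses a package-relative import, it is presumably meant to be run as a module from outside the package (e.g. `python -m mana_tokenizer.test`, assuming the package directory is named `mana_tokenizer`); running it directly as a script would raise an ImportError. A standalone variant mirroring the README's Quick Start would look like this sketch:

```python
# Hypothetical standalone variant: same checks as the committed test.py, but using
# the package-level import from the README so it can be run directly with `python`.
from mana_tokenizer import ManaTokenizer

tokenizer = ManaTokenizer()
text = "سلام من یک متن تست برای تست این تست هستم."
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))
```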