xu-song committed
Commit
7c73423
1 Parent(s): 9d1b27e
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +37 -35
  2. README.2.md +0 -136
  3. README.md +17 -15
  4. app.py +39 -24
  5. character_app.py +83 -79
  6. character_util.py +216 -216
  7. compression_app.py +187 -130
  8. compression_util.py +320 -302
  9. css/style.css +62 -59
  10. playground_app.py +233 -264
  11. playground_util.py +181 -181
  12. requirements.txt +11 -10
  13. stats/character_stats.json +0 -0
  14. stats/compression_rate.json +0 -0
  15. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ar.diff.json +3 -0
  16. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.de.diff.json +3 -0
  17. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.en.diff.json +3 -0
  18. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.es.diff.json +3 -0
  19. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fa.diff.json +3 -0
  20. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fr.diff.json +3 -0
  21. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ja.diff.json +3 -0
  22. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ko.diff.json +3 -0
  23. stats/compression_rate/ClassCat.gpt2-base-french @ cc100.zh-Hans.diff.json +3 -0
  24. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ar.diff.json +3 -0
  25. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.de.diff.json +3 -0
  26. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.en.diff.json +3 -0
  27. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.es.diff.json +3 -0
  28. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fa.diff.json +3 -0
  29. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fr.diff.json +3 -0
  30. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ja.diff.json +3 -0
  31. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ko.diff.json +3 -0
  32. stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.zh-Hans.diff.json +3 -0
  33. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ar.diff.json +3 -0
  34. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.de.diff.json +3 -0
  35. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.en.diff.json +3 -0
  36. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.es.diff.json +3 -0
  37. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fa.diff.json +3 -0
  38. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fr.diff.json +3 -0
  39. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ja.diff.json +3 -0
  40. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ko.diff.json +3 -0
  41. stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.zh-Hans.diff.json +3 -0
  42. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ar.diff.json +3 -0
  43. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.de.diff.json +3 -0
  44. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.en.diff.json +3 -0
  45. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.es.diff.json +3 -0
  46. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fa.diff.json +3 -0
  47. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fr.diff.json +3 -0
  48. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ja.diff.json +3 -0
  49. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ko.diff.json +3 -0
  50. stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.zh-Hans.diff.json +3 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ stats/iter_vocab/*.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ stats/compression_rate/*.json filter=lfs diff=lfs merge=lfs -text
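The two added rules route the new statistics files through Git LFS, which is why every `stats/compression_rate/... .diff.json` entry in the file list above shows up as a three-line LFS pointer (`+3 -0`). As a rough sanity check, the sketch below matches a few of this commit's paths against the new patterns; note that `fnmatch` only approximates gitattributes/gitignore glob semantics, and the paths are taken from the file list above.

```python
# Rough check (sketch, not part of this commit): which paths fall under the two
# newly added LFS rules. fnmatch only approximates gitattributes pattern matching.
from fnmatch import fnmatch

lfs_patterns = ["stats/iter_vocab/*.jsonl", "stats/compression_rate/*.json"]
paths = [
    "stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ar.diff.json",
    "stats/character_stats.json",
    "compression_util.py",
]
for path in paths:
    print(path, "->", any(fnmatch(path, pattern) for pattern in lfs_patterns))
# only the stats/compression_rate/... path matches -> True; the other two -> False
```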
README.2.md DELETED
@@ -1,136 +0,0 @@
1
-
2
- https://arxiv.org/abs/2308.16692 SpeechTokenizer
3
-
4
- 对于OpenAI的模型而言,英文的Token效率是中文的8-12倍,
5
- 之前三百字中文以上时Turbo 3.5 16k就会出现逻辑颠倒问题,提示词换成英文后该问题没有出现过。
6
-
7
- ## 词典构建
8
-
9
- bert词典
10
- gpt词典
11
- gpt-neox词典
12
-
13
- ## encode
14
-
15
-
16
- ## decode
17
-
18
- bert词典有个特殊字符 #
19
-
20
- gpt-neox词典呢?
21
- - _开头表示空格或句首
22
-
23
-
24
- ## 关于分词粒度
25
-
26
-
27
- ## ss
28
-
29
-
30
-
31
- bert-chinese vocab_size: 21128
32
- bert-en
33
- clue
34
- glm
35
- chatglm
36
- bloom
37
-
38
-
39
- ## 最小词典
40
-
41
- mobilenet
42
-
43
-
44
- ## ss
45
-
46
-
47
- ## bert
48
-
49
- ```
50
- [PAD]
51
- ...
52
- [unused99]
53
- [UNK]
54
- [CLS]
55
- [SEP]
56
- [MASK]
57
- <S>
58
- <T>
59
- !
60
- ...
61
-
62
- big
63
- ##ut
64
- ftp
65
- carol
66
- ##vi
67
- ```
68
-
69
-
70
- ## @@
71
-
72
- https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
73
-
74
- ```
75
- "he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"
76
- ```
77
-
78
- 跟BERT类似,只不过BERT是词后缀,这里是词前缀。
79
-
80
- 这种应该是 https://github.com/rsennrich/subword-nmt
81
-
82
-
83
- ## GPT2
84
-
85
- 词典见:https://huggingface.co/gpt2/raw/main/vocab.json
86
-
87
-
88
- ```
89
- ['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
90
- ```
91
- 跟BERT不同,BERT用特殊符号表示 “连接”,GPT2用特殊符号表示 “空格”。
92
-
93
- 详见 gpt2/README.md
94
-
95
- - 功能符号: `<|endoftext|>` 表示换行。tab? 空格?
96
- - 很多数字独立编码,几乎上千个。
97
-
98
- - 类似的还有:moss
99
-
100
-
101
- ### Ġ是什么
102
-
103
- It's a feature of byte-level BPE(an encoded space character).
104
- Ġ 表示空格,有的版本用Ä代替Ġ。
105
-
106
-
107
- ```sh
108
- What's up with the tokenizer?
109
- # BPE后
110
- ['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
111
- # 经过vocab.json编码后
112
- [ 2061, 338, 510, 351, 262, 11241, 7509, 30]
113
- # 经过dict.txt编码后(fairseq特有)
114
- [ 其他数字 ]
115
- ```
116
- <>
117
- 疑问:up会加Ġ,为什么what不加Ġ,因为有个pre
118
-
119
- - https://github.com/pytorch/fairseq/issues/1716
120
- - https://github.com/huggingface/transformers/issues/1083
121
-
122
-
123
- ## 空格、tab、换行
124
-
125
-
126
-
127
-
128
-
129
- ## reversible and lossless
130
-
131
- It's reversible and lossless, so you can convert tokens back into the original text
132
-
133
-
134
- ## diff
135
-
136
-
 
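The deleted README.2.md above walks through how GPT-2's byte-level BPE marks word-initial spaces with `Ġ`. Here is a minimal sketch (not part of this commit) that reproduces its example with the Hugging Face `transformers` library:

```python
# Minimal sketch: reproduce the byte-level BPE example from the deleted README.2.md.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

text = "What's up with the tokenizer?"
tokens = tokenizer.tokenize(text)
ids = tokenizer.encode(text)

# Expected tokens: ['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
# 'Ġ' is the byte-level BPE marker for a leading space, not a character of the text.
print(tokens)
print(ids)                     # e.g. [2061, 338, 510, 351, 262, 11241, 7509, 30]
print(tokenizer.decode(ids))   # round-trips to the original text (lossless)
```

The exact round-trip through `decode` is the lossless property that the new compression leaderboard below checks for every tokenizer.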
README.md CHANGED
@@ -1,15 +1,17 @@
1
- ---
2
- title: Tokenizer Arena
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.31.4
8
- app_file: app.py
9
- pinned: false
10
- datasets:
11
- - cc100
12
- ---
13
-
14
-
15
- Please visit our GitHub repo for more information: https://github.com/xu-song/tokenizer-arena
1
+ ---
2
+ title: Tokenizer Arena
3
+ emoji:
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.32.2
8
+ app_file: app.py
9
+ pinned: false
10
+ datasets:
11
+ - cc100
12
+ ---
13
+
14
+
15
+
16
+
17
+ Please visit our GitHub repo for more information: https://github.com/xu-song/tokenizer-arena
app.py CHANGED
@@ -1,24 +1,39 @@
1
- import os
2
- from playground_app import demo as playground_tab
3
- from compression_app import demo as compression_tab
4
- from character_app import demo as character_tab
5
- from patcher.gr_interface import TabbedInterface
6
- from huggingface_hub import login
7
-
8
- auth_token = os.environ.get('HF_TOKEN', None)
9
- if auth_token:
10
- login(token=auth_token)
11
-
12
-
13
- # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,。
14
- demo = TabbedInterface(
15
- [playground_tab, compression_tab, character_tab],
16
- [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"],
17
- title='<div align="center">Tokenizer Arena ⚔️</div>',
18
- css="css/style.css"
19
- )
20
-
21
- demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
22
-
23
- if __name__ == "__main__":
24
- demo.queue(max_size=1024, default_concurrency_limit=80).launch()
1
+ """Gradio app to showcase the LLM tokenization."""
2
+
3
+ import os
4
+ import gradio as gr
5
+ from huggingface_hub import login
6
+ from playground_app import demo as playground_tab
7
+ from compression_app import demo as compression_tab
8
+ from character_app import demo as character_tab
9
+
10
+ auth_token = os.environ.get('HF_TOKEN', None)
11
+ if auth_token:
12
+ login(token=auth_token)
13
+
14
+
15
+ title = '<div align="center">Tokenizer Arena ⚔️</div>'
16
+ interface_list = [playground_tab, compression_tab, character_tab]
17
+ tab_names = [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"]
18
+
19
+ with gr.Blocks(css="css/style.css", js="js/onload.js") as demo:
20
+ gr.HTML(
21
+ f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
22
+ )
23
+ with gr.Tabs():
24
+ for interface, tab_name in zip(interface_list, tab_names):
25
+ with gr.Tab(label=tab_name):
26
+ interface.render()
27
+
28
+ model_name = gr.Textbox(
29
+ placeholder="🔍 Add tokenizer from Hugging Face (e.g. Xenova/gpt-4o) and press ENTER...",
30
+ show_label=False,
31
+ )
32
+
33
+ model_name.submit()
34
+
35
+ # demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
36
+
37
+ if __name__ == "__main__":
38
+ demo.launch()
39
+ # demo.queue(max_size=1024, default_concurrency_limit=80).launch()
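The rewritten app.py drops the custom `patcher.gr_interface.TabbedInterface` in favor of plain `gr.Blocks` with `gr.Tabs`, rendering each sub-app inside its own tab. Below is a minimal, self-contained sketch of that pattern; the two sub-demos are placeholders, not the real playground/compression/character apps:

```python
# Minimal sketch of the gr.Blocks + gr.Tabs pattern used by the new app.py.
# The sub-demos below are placeholders for playground_app / compression_app / character_app.
import gradio as gr

with gr.Blocks() as sub_demo_a:
    gr.Markdown("Tab A content")

with gr.Blocks() as sub_demo_b:
    gr.Markdown("Tab B content")

with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center'>Tokenizer Arena ⚔️</h1>")
    with gr.Tabs():
        for sub_demo, name in zip([sub_demo_a, sub_demo_b], ["Tab A", "Tab B"]):
            with gr.Tab(label=name):
                sub_demo.render()  # embed an existing Blocks into this tab

if __name__ == "__main__":
    demo.launch()
```

Keeping each tab as an independent `Blocks` object is what lets `playground_app`, `compression_app`, and `character_app` still be launched on their own.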
character_app.py CHANGED
@@ -1,79 +1,83 @@
1
- import gradio as gr
2
- from character_util import get_character_table, default_columns
3
-
4
- all_columns = [
5
- ("digit", "digit"),
6
- ("space", "space"),
7
- ("lang-chinese", 'zh'),
8
- ("lang-korea", 'ko'),
9
- ("lang-japanese", 'ja'),
10
- # ("byte", "byte"),
11
- # ("oov", "oov")
12
- ]
13
-
14
-
15
- # columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]
16
-
17
- abbr2name = {column[1]: column[0].split('-')[-1] for column in all_columns}
18
-
19
-
20
- def get_column_info(columns):
21
- markdown = ""
22
- for column in columns:
23
- markdown += f"- `num({column})`: num of tokens containing {abbr2name[column]} characters\n" \
24
- f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
25
- return markdown
26
-
27
-
28
- with gr.Blocks() as demo:
29
- gr.Markdown("## 🛠️ Setting") # ⚙
30
- with gr.Accordion("Please select the type of character you want to count.", open=True):
31
- # file size 💽 🖴, tokens 🧮
32
- with gr.Row():
33
- with gr.Column():
34
- columns = gr.Checkboxgroup(
35
- all_columns,
36
- value=default_columns,
37
- label="character type",
38
- # info=""
39
- )
40
- gr.Markdown(
41
- "To count other types of characters, you can modify [character_util.py]"
42
- "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
43
- )
44
- column_info = gr.Markdown(
45
- get_column_info(default_columns)
46
- )
47
-
48
- gr.Markdown("## 📊 Character Statistics")
49
- search_bar = gr.Textbox(
50
- placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
51
- show_label=False,
52
- elem_id="search-bar",
53
- )
54
- compress_rate_table = gr.Dataframe(datatype="html", wrap=True)
55
-
56
- search_bar.submit(
57
- get_character_table,
58
- inputs=[search_bar, columns],
59
- outputs=compress_rate_table
60
- )
61
- columns.change(
62
- get_character_table,
63
- inputs=[search_bar, columns],
64
- outputs=compress_rate_table
65
- )
66
- columns.change(
67
- get_column_info,
68
- inputs=[columns],
69
- outputs=column_info
70
- )
71
-
72
- demo.load(
73
- get_character_table,
74
- inputs=[search_bar, columns],
75
- outputs=compress_rate_table
76
- )
77
-
78
- if __name__ == "__main__":
79
- demo.launch()
1
+ """
2
+ ##
3
+ """
4
+
5
+ import gradio as gr
6
+ from character_util import get_character_table, default_columns
7
+
8
+ all_columns = [
9
+ ("digit", "digit"),
10
+ ("space", "space"),
11
+ ("lang-chinese", 'zh'),
12
+ ("lang-korea", 'ko'),
13
+ ("lang-japanese", 'ja'),
14
+ # ("byte", "byte"),
15
+ # ("oov", "oov")
16
+ ]
17
+
18
+
19
+ # columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]
20
+
21
+ abbr2name = {column[1]: column[0].split('-')[-1] for column in all_columns}
22
+
23
+
24
+ def get_column_info(columns):
25
+ markdown = ""
26
+ for column in columns:
27
+ markdown += f"- `num({column})`: num of tokens containing {abbr2name[column]} characters\n" \
28
+ f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
29
+ return markdown
30
+
31
+
32
+ with gr.Blocks() as demo:
33
+ gr.Markdown("## 🛠️ Setting") # ⚙
34
+ with gr.Accordion("Please select the type of character you want to count.", open=True):
35
+ # file size 💽 🖴, tokens 🧮
36
+ with gr.Row():
37
+ with gr.Column():
38
+ columns = gr.Checkboxgroup(
39
+ all_columns,
40
+ value=default_columns,
41
+ label="character type",
42
+ # info=""
43
+ )
44
+ gr.Markdown(
45
+ "To count other types of characters, you can modify [lang_util.py]"
46
+ "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/utils/lang_util.py). "
47
+ )
48
+ column_info = gr.Markdown(
49
+ get_column_info(default_columns)
50
+ )
51
+
52
+ gr.Markdown("## 📊 Character Statistics")
53
+ search_bar = gr.Textbox(
54
+ placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
55
+ show_label=False,
56
+ elem_id="search-bar",
57
+ )
58
+ compress_rate_table = gr.Dataframe(datatype="html", wrap=True)
59
+
60
+ search_bar.submit(
61
+ get_character_table,
62
+ inputs=[search_bar, columns],
63
+ outputs=compress_rate_table
64
+ )
65
+ columns.change(
66
+ get_character_table,
67
+ inputs=[search_bar, columns],
68
+ outputs=compress_rate_table
69
+ )
70
+ columns.change(
71
+ get_column_info,
72
+ inputs=[columns],
73
+ outputs=column_info
74
+ )
75
+
76
+ demo.load(
77
+ get_character_table,
78
+ inputs=[search_bar, columns],
79
+ outputs=compress_rate_table
80
+ )
81
+
82
+ if __name__ == "__main__":
83
+ demo.launch()
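character_app.py now points readers at `utils/lang_util.py`, and character_util.py switches to its `detect_language_by_unicode` helper. That file is not part of this diff, so the sketch below only illustrates the general shape of a Unicode-range based check; the ranges and return format are assumptions, not the Space's actual implementation:

```python
# Hypothetical sketch of a Unicode-range language check, in the spirit of
# utils/lang_util.detect_language_by_unicode (that file is not shown in this diff).
language_ranges = {
    ("lang-chinese", "zh"): [(0x4E00, 0x9FFF)],                        # CJK Unified Ideographs
    ("lang-korea", "ko"): [(0xAC00, 0xD7A3), (0x1100, 0x11FF)],        # Hangul syllables / jamo
    ("lang-japanese", "ja-kana"): [(0x3040, 0x309F), (0x30A0, 0x30FF)],  # hiragana / katakana
}

def detect_language_by_unicode(text: str):
    """Return the (name, abbreviation) pairs whose Unicode ranges occur in `text`."""
    hits = []
    for lang, ranges in language_ranges.items():
        if any(lo <= ord(ch) <= hi for ch in text for lo, hi in ranges):
            hits.append(lang)
    return hits

print(detect_language_by_unicode("你好, world"))  # [('lang-chinese', 'zh')]
print(detect_language_by_unicode("トークン"))      # [('lang-japanese', 'ja-kana')]
```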
character_util.py CHANGED
@@ -1,216 +1,216 @@
1
- """
2
- TODO:
3
- 1. add more language
4
- 2. check space count of bert
5
- 3. add token_impl
6
- 4.
7
- """
8
- import os
9
- import json
10
- import numpy as np
11
- import pandas as pd
12
- from collections import Counter, defaultdict
13
- from vocab import tokenizer_factory
14
- from typing import Optional, Union, Literal
15
- from utils.log_util import logger
16
- from utils.text_util import contains_digit, get_space_count
17
- from utils.lang_util import detect_language, language_ranges
18
-
19
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
20
-
21
- default_columns = ["digit", "zh"]
22
-
23
- def _to_unicode(text):
24
- return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
25
-
26
-
27
- def _get_coding_length(tokenizer, vocab, filter=None):
28
- """
29
- oov character may be tokenized into more than one token.
30
- """
31
- all_length = []
32
- for word in vocab:
33
- if len(word) > 1:
34
- continue
35
- if filter is not None and filter(word):
36
- continue
37
- try:
38
- tokens = tokenizer.encode(word)
39
- except Exception as e:
40
- print(e)
41
-
42
- all_length.append(len(tokens))
43
- # if len(tokens.ids) > 1:
44
- # if len(tokens) > 3:
45
- # print(word, tokens)
46
-
47
- dist_length = Counter(all_length)
48
- mean_length = round(sum(all_length) / len(all_length), 2)
49
- return dist_length, mean_length
50
-
51
-
52
- cache = {}
53
-
54
-
55
- def _dist(token_lens):
56
- """
57
- :param token_lens:
58
- :return: min,median,max of token_lens
59
- """
60
- if not token_lens:
61
- return "-"
62
- return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
63
-
64
-
65
- def iter_vocab(
66
- tokenizer_name: str,
67
- from_cache: bool = True,
68
- cache_dir: str = "stats",
69
- ) -> Union[pd.DataFrame, dict]:
70
- """
71
- :param tokenizer_name:
72
- :param from_cache:
73
- :param cache_dir:
74
- :return:
75
- """
76
- tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
77
-
78
- cache_dir = os.path.join(CURRENT_DIR, cache_dir)
79
- os.makedirs(cache_dir, exist_ok=True)
80
-
81
- # load from cache
82
- cache_path = os.path.join(cache_dir, "character_stats.json")
83
- if not cache and os.path.exists(cache_path):
84
- with open(cache_path, "r", encoding="utf-8") as f_tmp:
85
- cache.update(json.load(f_tmp))
86
- if from_cache and tokenizer_name in cache:
87
- # logger.info(f"load {tokenizer_config.name_or_path} from cache")
88
- return cache[tokenizer_name]
89
-
90
- tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
91
-
92
- tokens_by_lang = {lang[1]: [] for lang in language_ranges.keys()}
93
- digit_tokens = []
94
- space_tokens = []
95
- byte_tokens = []
96
-
97
- buffer = []
98
- for token_id in range(tokenizer.vocab_size):
99
- # for token_id in tokenizer.get_vocab():
100
- # for token_id in range(len(tokenizer)):
101
- decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
102
- token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
103
- tags = []
104
- if token is None: # 有些词典有空的id(不连续)
105
- continue
106
- if isinstance(token, bytes):
107
- token = token.decode("utf-8", errors="ignore")
108
-
109
- if hasattr(tokenizer, "sp_model"): # 基于 sentencepiece 包
110
- if tokenizer.sp_model.is_byte(token_id):
111
- tags.append("is_byte")
112
- byte_tokens.append(token)
113
-
114
- language_tags = detect_language(decode_str)
115
- for language in language_tags:
116
- tokens_by_lang[language[1]].append(decode_str)
117
-
118
- if contains_digit(decode_str):
119
- tags.append("digit")
120
- digit_tokens.append(decode_str)
121
-
122
- space_count = get_space_count(decode_str)
123
- if space_count > 0:
124
- space_tokens.append(decode_str)
125
-
126
- buffer.append(json.dumps(
127
- {
128
- "id": token_id,
129
- "token": token,
130
- "token_decode": decode_str,
131
- "token_dumps": json.dumps(token),
132
- "token_unicode": _to_unicode(token),
133
- "token_len": len(decode_str),
134
- },
135
- ensure_ascii=False) + "\n")
136
-
137
- result = {
138
- "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
139
- "organization": tokenizer_config.org,
140
- # "impl": str(tokenizer.__class__),
141
- # "vocab_size-": tokenizer.vocab_size, # vocab_size_without_added_token
142
- "vocab_size": len(tokenizer),
143
-
144
- # "中文汉字编码长度均值": mean_length, # 不用统计,因为字典包含中文字符多,一般就意味着 中文汉字编码长度短。
145
- # "中文汉字编码长度分布": json.dumps(dist_length),
146
-
147
- "num(digit)": len(digit_tokens),
148
- "len(digit)": _dist([len(token) for token in digit_tokens]),
149
- "num(space)": len(space_tokens),
150
- "len(space)": _dist([len(token) for token in space_tokens]),
151
-
152
- # "num(byte)": len(byte_tokens)
153
- }
154
-
155
- for lang, tokens in tokens_by_lang.items():
156
- result[f"num({lang})"] = len(tokens)
157
- result["len(" + lang + ")"] = _dist([len(token) for token in tokens])
158
-
159
- out_path = os.path.join(cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl")
160
- with open(out_path, "w", encoding="utf-8") as f_out:
161
- for line in buffer:
162
- f_out.write(line)
163
- len_before = len(cache)
164
- cache[tokenizer_name] = result
165
- len_after = len(cache)
166
- logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
167
- with open(cache_path, "w", encoding="utf-8") as f_out:
168
- f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
169
- return result
170
-
171
-
172
- def to_dataframe(stats, columns):
173
- table = []
174
- for stat in stats.values():
175
- filtered_stat = {}
176
- for k, v in stat.items():
177
- if not k.startswith("num") and not k.startswith("len"):
178
- filtered_stat[k] = v
179
- if any(column in k for column in columns):
180
- k = k.replace("ja-kana", "kana")
181
- filtered_stat[k] = v
182
- table.append(filtered_stat)
183
- df = pd.DataFrame(table)
184
- return df
185
-
186
-
187
- def get_character_table(
188
- tokenizer_filter: Optional[str] = None,
189
- columns: Optional[list] = None,
190
- return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
191
- ) -> Union[pd.DataFrame, dict]:
192
- """
193
- """
194
- logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
195
- stats = {}
196
- if columns is None:
197
- columns = default_columns
198
- if tokenizer_filter is not None:
199
- tokenizer_names = [tokenizer_config.name_or_path for tokenizer_config in tokenizer_factory.all_tokenizer_configs
200
- if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()]
201
- else:
202
- tokenizer_names = tokenizer_factory.all_tokenizer_names
203
-
204
- for tokenizer_name in tokenizer_names:
205
- stat = iter_vocab(tokenizer_name)
206
- stats[tokenizer_name] = stat
207
-
208
- if return_type == "dataframe":
209
- stats = to_dataframe(stats, columns)
210
- return stats
211
-
212
-
213
- if __name__ == "__main__":
214
- # aa = get_character_table(tokenizer_filter="baichuan")
215
- df = get_character_table()
216
- logger.info(f"\n{df.to_markdown(index=False)}")
 
1
+ """
2
+ TODO:
3
+ 1. add more language
4
+ 2. check space count of bert
5
+ 3. add token_impl
6
+ 4.
7
+ """
8
+ import os
9
+ import json
10
+ import numpy as np
11
+ import pandas as pd
12
+ from collections import Counter, defaultdict
13
+ from vocab import tokenizer_factory
14
+ from typing import Optional, Union, Literal
15
+ from utils.log_util import logger
16
+ from utils.text_util import contains_digit, get_space_count
17
+ from utils.lang_util import detect_language_by_unicode, language_ranges
18
+
19
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
20
+
21
+ default_columns = ["digit", "zh"]
22
+
23
+ def _to_unicode(text):
24
+ return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
25
+
26
+
27
+ def _get_coding_length(tokenizer, vocab, filter=None):
28
+ """
29
+ oov character may be tokenized into more than one token.
30
+ """
31
+ all_length = []
32
+ for word in vocab:
33
+ if len(word) > 1:
34
+ continue
35
+ if filter is not None and filter(word):
36
+ continue
37
+ try:
38
+ tokens = tokenizer.encode(word)
39
+ except Exception as e:
40
+ print(e)
41
+
42
+ all_length.append(len(tokens))
43
+ # if len(tokens.ids) > 1:
44
+ # if len(tokens) > 3:
45
+ # print(word, tokens)
46
+
47
+ dist_length = Counter(all_length)
48
+ mean_length = round(sum(all_length) / len(all_length), 2)
49
+ return dist_length, mean_length
50
+
51
+
52
+ cache = {}
53
+
54
+
55
+ def _dist(token_lens):
56
+ """
57
+ :param token_lens:
58
+ :return: min,median,max of token_lens
59
+ """
60
+ if not token_lens:
61
+ return "-"
62
+ return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
63
+
64
+
65
+ def iter_vocab(
66
+ tokenizer_name: str,
67
+ from_cache: bool = True,
68
+ cache_dir: str = "stats",
69
+ ) -> Union[pd.DataFrame, dict]:
70
+ """
71
+ :param tokenizer_name:
72
+ :param from_cache:
73
+ :param cache_dir:
74
+ :return:
75
+ """
76
+ tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
77
+
78
+ cache_dir = os.path.join(CURRENT_DIR, cache_dir)
79
+ os.makedirs(cache_dir, exist_ok=True)
80
+
81
+ # load from cache
82
+ cache_path = os.path.join(cache_dir, "character_stats.json")
83
+ if not cache and os.path.exists(cache_path):
84
+ with open(cache_path, "r", encoding="utf-8") as f_tmp:
85
+ cache.update(json.load(f_tmp))
86
+ if from_cache and tokenizer_name in cache:
87
+ # logger.info(f"load {tokenizer_config.name_or_path} from cache")
88
+ return cache[tokenizer_name]
89
+
90
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
91
+
92
+ tokens_by_lang = {lang[1]: [] for lang in language_ranges.keys()}
93
+ digit_tokens = []
94
+ space_tokens = []
95
+ byte_tokens = []
96
+
97
+ buffer = []
98
+ for token_id in range(tokenizer.vocab_size):
99
+ # for token_id in tokenizer.get_vocab():
100
+ # for token_id in range(len(tokenizer)):
101
+ decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
102
+ token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
103
+ tags = []
104
+ if token is None: # 有些词典有空的id(不连续)
105
+ continue
106
+ if isinstance(token, bytes):
107
+ token = token.decode("utf-8", errors="ignore")
108
+
109
+ if hasattr(tokenizer, "sp_model"): # 基于 sentencepiece 包
110
+ if tokenizer.sp_model.is_byte(token_id):
111
+ tags.append("is_byte")
112
+ byte_tokens.append(token)
113
+
114
+ language_tags = detect_language_by_unicode(decode_str)
115
+ for language in language_tags:
116
+ tokens_by_lang[language[1]].append(decode_str)
117
+
118
+ if contains_digit(decode_str):
119
+ tags.append("digit")
120
+ digit_tokens.append(decode_str)
121
+
122
+ space_count = get_space_count(decode_str)
123
+ if space_count > 0:
124
+ space_tokens.append(decode_str)
125
+
126
+ buffer.append(json.dumps(
127
+ {
128
+ "id": token_id,
129
+ "token": token,
130
+ "token_decode": decode_str,
131
+ "token_dumps": json.dumps(token),
132
+ "token_unicode": _to_unicode(token),
133
+ "token_len": len(decode_str),
134
+ },
135
+ ensure_ascii=False) + "\n")
136
+
137
+ result = {
138
+ "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
139
+ "organization": tokenizer_config.org,
140
+ # "impl": str(tokenizer.__class__),
141
+ # "vocab_size-": tokenizer.vocab_size, # vocab_size_without_added_token
142
+ "vocab_size": len(tokenizer),
143
+
144
+ # "中文汉字编码长度均值": mean_length, # 不用统计,因为字典包含中文字符多,一般就意味着 中文汉字编码长度短。
145
+ # "中文汉字编码长度分布": json.dumps(dist_length),
146
+
147
+ "num(digit)": len(digit_tokens),
148
+ "len(digit)": _dist([len(token) for token in digit_tokens]),
149
+ "num(space)": len(space_tokens),
150
+ "len(space)": _dist([len(token) for token in space_tokens]),
151
+
152
+ # "num(byte)": len(byte_tokens)
153
+ }
154
+
155
+ for lang, tokens in tokens_by_lang.items():
156
+ result[f"num({lang})"] = len(tokens)
157
+ result["len(" + lang + ")"] = _dist([len(token) for token in tokens])
158
+
159
+ out_path = os.path.join(cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl")
160
+ with open(out_path, "w", encoding="utf-8") as f_out:
161
+ for line in buffer:
162
+ f_out.write(line)
163
+ len_before = len(cache)
164
+ cache[tokenizer_name] = result
165
+ len_after = len(cache)
166
+ logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
167
+ with open(cache_path, "w", encoding="utf-8") as f_out:
168
+ f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
169
+ return result
170
+
171
+
172
+ def to_dataframe(stats, columns):
173
+ table = []
174
+ for stat in stats.values():
175
+ filtered_stat = {}
176
+ for k, v in stat.items():
177
+ if not k.startswith("num") and not k.startswith("len"):
178
+ filtered_stat[k] = v
179
+ if any(column in k for column in columns):
180
+ k = k.replace("ja-kana", "kana")
181
+ filtered_stat[k] = v
182
+ table.append(filtered_stat)
183
+ df = pd.DataFrame(table)
184
+ return df
185
+
186
+
187
+ def get_character_table(
188
+ tokenizer_filter: Optional[str] = None,
189
+ columns: Optional[list] = None,
190
+ return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
191
+ ) -> Union[pd.DataFrame, dict]:
192
+ """
193
+ """
194
+ logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
195
+ stats = {}
196
+ if columns is None:
197
+ columns = default_columns
198
+ if tokenizer_filter is not None:
199
+ tokenizer_names = [tokenizer_config.name_or_path for tokenizer_config in tokenizer_factory.all_tokenizer_configs
200
+ if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()]
201
+ else:
202
+ tokenizer_names = tokenizer_factory.all_tokenizer_names
203
+
204
+ for tokenizer_name in tokenizer_names:
205
+ stat = iter_vocab(tokenizer_name)
206
+ stats[tokenizer_name] = stat
207
+
208
+ if return_type == "dataframe":
209
+ stats = to_dataframe(stats, columns)
210
+ return stats
211
+
212
+
213
+ if __name__ == "__main__":
214
+ # aa = get_character_table(tokenizer_filter="baichuan")
215
+ df = get_character_table()
216
+ logger.info(f"\n{df.to_markdown(index=False)}")
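`iter_vocab` above walks every token id, decodes it, buckets the decoded string by character class (digit, space, per-language), and reports each bucket as `num(x)` plus a `min,median,max` length summary. A stripped-down sketch of that loop for a single Hugging Face tokenizer; the `utils.text_util` helpers are approximated inline and the checkpoint name is only an example:

```python
# Minimal sketch of the iter_vocab() loop: classify every vocabulary entry of one
# tokenizer and summarize token lengths as "min,median,max" (as len(digit)/len(zh) do).
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def dist(lengths):
    return "-" if not lengths else f"{min(lengths)},{round(np.median(lengths))},{max(lengths)}"

digit_tokens, space_tokens = [], []
for token_id in range(tokenizer.vocab_size):
    decoded = tokenizer.decode([token_id])
    if any(ch.isdigit() for ch in decoded):      # ~ utils.text_util.contains_digit
        digit_tokens.append(decoded)
    if any(ch.isspace() for ch in decoded):      # ~ utils.text_util.get_space_count > 0
        space_tokens.append(decoded)

print("vocab_size:", len(tokenizer))
print("num(digit):", len(digit_tokens), "len(digit):", dist([len(t) for t in digit_tokens]))
print("num(space):", len(space_tokens), "len(space):", dist([len(t) for t in space_tokens]))
```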
compression_app.py CHANGED
@@ -1,130 +1,187 @@
1
- """
2
- TODO:
3
- - 统计 tokenizer_impl
4
- - 统计 OOV
5
- - 统计 reversal
6
- - 增加 math,code
7
- """
8
-
9
- import gradio as gr
10
- from compression_util import get_compression_leaderboard, common_corpuses
11
-
12
-
13
- with gr.Blocks() as demo:
14
- # gr.Markdown("## Convertor")
15
- # with gr.Accordion("Convertor", open=False):
16
- # gr.Markdown("Tokenize {} corpus")
17
- # with gr.Row(elem_classes="no-border"):
18
- # gr.Button("File Size", min_width=50)
19
- # file_size = gr.Textbox(
20
- # show_label=False,
21
- # min_width=50,
22
- # # elem_classes="textbox-as-text"
23
- # )
24
- # gr.Dropdown(
25
- # choices=['MB', 'GB', 'TB'],
26
- # show_label=False,
27
- # min_width=15,
28
- # # elem_classes="textbox-as-text"
29
- # )
30
- # # gr.Markdown('<h2 align="center">≈</h2>')
31
- # # gr.HTML('<h2 style="margin: auto;">≈</h2>')
32
- # gr.Button(
33
- # "≈",
34
- # min_width=10,
35
- # elem_classes="button-white h2-font"
36
- #
37
- # )
38
- #
39
- # gr.Button(
40
- # "Tokens",
41
- # min_width=50
42
- # )
43
- # gr.Textbox(
44
- # show_label=False,
45
- # min_width=50
46
- # )
47
- # gr.Dropdown(
48
- # ['million', 'billion', 'trillion'],
49
- # show_label=False,
50
- # min_width=15,
51
- # elem_classes="button-white"
52
- # )
53
-
54
- gr.Markdown("## 🛠️ Setting") # ⚙
55
- with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
56
- # file size 💽 🖴, tokens 🧮
57
- # Total amount of disk used
58
- with gr.Row():
59
- with gr.Column():
60
- compress_rate_corpus = gr.Dropdown(
61
- common_corpuses, # , "code"
62
- value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
63
- label="corpus",
64
- multiselect=True
65
- # info=""
66
- )
67
-
68
- # unit of file_size: gigabyte terabyte
69
- # unit of token_num: million billion trillion
70
- # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
71
- compress_rate_unit = gr.Radio(
72
- ["b_tokens/g_bytes", "t_tokens/t_bytes"],
73
- value="b_tokens/g_bytes",
74
- label="measure", # evaluation metric
75
- )
76
-
77
- gr.Markdown(
78
- # "Note:\n\n"
79
- "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
80
- "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
81
- "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
82
- # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
83
- # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
84
- "- `char/token` measures how many chars per token on the tokenized corpus.\n"
85
- "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus, 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
86
- "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
87
- )
88
-
89
- gr.Markdown("## 🏆 Compression Rate Leaderboard")
90
- search_bar = gr.Textbox(
91
- placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
92
- show_label=False,
93
- elem_id="search-bar",
94
- )
95
- compress_rate_table = gr.Dataframe(datatype="html")
96
-
97
- # func call
98
- compress_rate_corpus.change(
99
- get_compression_leaderboard,
100
- inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
101
- outputs=compress_rate_table
102
- )
103
- compress_rate_unit.change(
104
- get_compression_leaderboard,
105
- inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
106
- outputs=compress_rate_table
107
- )
108
- # file_size.change(
109
- # get_all_compress_rate,
110
- # outputs=compress_rate_table
111
- # )
112
-
113
- search_bar.submit(
114
- get_compression_leaderboard,
115
- inputs=[
116
- compress_rate_corpus,
117
- compress_rate_unit,
118
- search_bar,
119
- ],
120
- outputs=compress_rate_table
121
- )
122
-
123
- demo.load(
124
- get_compression_leaderboard,
125
- inputs=[compress_rate_corpus, compress_rate_unit],
126
- outputs=compress_rate_table
127
- )
128
-
129
- if __name__ == "__main__":
130
- demo.launch()
1
+ """
2
+ TODO:
3
+ - 统计 tokenizer_impl
4
+ - 统计 OOV
5
+ - 统计 reversal
6
+ - 增加 math,code
7
+
8
+
9
+
10
+ ## balance
11
+
12
+ - 高压缩率 VS vocab_size:
13
+ - 高压缩率,就意味着,编码后的token数少,那么 token长度 就会长,--> vocab_size 就会太大
14
+ - 高压缩率 VS 无损
15
+ - s
16
+ - OOV
17
+ - OOV 多,那么生成的 UNK 可能多(一个char 一个UNK) --> token 数目多 -> 压缩率低
18
+ - OOV 多,那么生成的 UNK 可能少() --> token 数目多 -> 压缩率低
19
+
20
+ """
21
+
22
+ import gradio as gr
23
+ from compression_util import get_compression_leaderboard, common_corpuses
24
+
25
+
26
+ # From the perspective of compression
27
+ # exactly reconstructed from compressed tokens
28
+ docs = """## 📖 What is a good tokenizer?
29
+
30
+ From a compression perspective, a good tokenizer should be lossless and keep a high compression rate (fewer tokens).
31
+ The encoding and decoding process can be formulated as
32
+ ```python
33
+ token_ids = tokenizer.encode(input_text) # compressed tokens
34
+ decoded_text = tokenizer.decode(token_ids) # reconstructed text
35
+ ```
36
+
37
+ - **Lossless** <br>
38
+ Lossless tokenization preserves the exact original text, i.e. `decoded_text = input_text`.
39
+
40
+ - Most lossy tokenizers get many out-of-vocabulary tokens. 👉 Check the [oov of bert-base-uncased](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/google-bert.bert-base-casedcc100.zh-Hans.diff.json).
41
+ - Some other tokenizers have no oov, but can still be lossy due to text normalization. For example, Qwen performs [unicode normalization](https://github.com/huggingface/transformers/blob/v4.42.3/src/transformers/models/qwen2/tokenization_qwen2.py#L338),
42
+ which may bring some [slight difference](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate/Qwen.Qwen1.5-1.8B%20@%20cc100.ja.diff.jsonn) to the reconstructed text.
43
+
44
+ - **Compression Rate** <br>
45
+ There are mainly two types of metric to represent the `input_text`:
46
+ - `byte-level`: the number of bytes in the given text
47
+ - `char-level`: the number of characters in the given text.
48
+
49
+ To evaluate compression rate, simple metrics can be "how many bytes per token" or "how many chars per token". <br>
50
+ In this leaderboard, we adopt the more frequently used metrics: "how many billion tokens per gigabyte of corpus" and "how many chars
51
+ per token", i.e. `b_tokens/g_bytes` and `char/token`.
52
+
53
+
54
+ 💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
55
+ """
56
+
57
+
58
+
59
+ # theme = gr.themes.Monochrome()
60
+ theme = gr.themes.Default()
61
+ # theme.set(accordion_text_weight=600) # 暂不支持
62
+ with gr.Blocks(theme=theme) as demo:
63
+ # gr.Markdown("## Convertor")
64
+ # with gr.Accordion("Convertor", open=False):
65
+ # gr.Markdown("Tokenize {} corpus")
66
+ # with gr.Row(elem_classes="no-border"):
67
+ # gr.Button("File Size", min_width=50)
68
+ # file_size = gr.Textbox(
69
+ # show_label=False,
70
+ # min_width=50,
71
+ # # elem_classes="textbox-as-text"
72
+ # )
73
+ # gr.Dropdown(
74
+ # choices=['MB', 'GB', 'TB'],
75
+ # show_label=False,
76
+ # min_width=15,
77
+ # # elem_classes="textbox-as-text"
78
+ # )
79
+ # # gr.Markdown('<h2 align="center">≈</h2>')
80
+ # # gr.HTML('<h2 style="margin: auto;">≈</h2>')
81
+ # gr.Button(
82
+ # "",
83
+ # min_width=10,
84
+ # elem_classes="button-white h2-font"
85
+ #
86
+ # )
87
+ #
88
+ # gr.Button(
89
+ # "Tokens",
90
+ # min_width=50
91
+ # )
92
+ # gr.Textbox(
93
+ # show_label=False,
94
+ # min_width=50
95
+ # )
96
+ # gr.Dropdown(
97
+ # ['million', 'billion', 'trillion'],
98
+ # show_label=False,
99
+ # min_width=15,
100
+ # elem_classes="button-white"
101
+ # )
102
+
103
+
104
+
105
+ gr.Markdown(docs)
106
+ gr.Markdown("## 🛠️ Setting") # ⚙
107
+ gr.Markdown("We perform tokenization on different corpora and calculate the compression rate."
108
+ "")
109
+ with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
110
+ # file size 💽 🖴, tokens 🧮
111
+ # Total amount of disk used
112
+ with gr.Row():
113
+ with gr.Column():
114
+ compress_rate_corpus = gr.Dropdown(
115
+ common_corpuses, # , "code"
116
+ value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
117
+ label="corpus",
118
+ multiselect=True
119
+ # info=""
120
+ )
121
+
122
+ # unit of file_size: gigabyte terabyte
123
+ # unit of token_num: million billion trillion
124
+ # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
125
+ compress_rate_unit = gr.Radio(
126
+ ["b_tokens/g_bytes", "t_tokens/t_bytes"],
127
+ value="b_tokens/g_bytes",
128
+ label="measure", # evaluation metric
129
+ )
130
+
131
+ gr.Markdown(
132
+ # "Note:\n\n explanation"
133
+ # "Supported languages are (20): arabic (ar), bulgarian (bg), german (de), modern greek (el), english (en), spanish (es), french (fr), hindi (hi), italian (it), japanese (ja), dutch (nl), polish (pl), portuguese (pt), russian (ru), swahili (sw), thai (th), turkish (tr), urdu (ur), vietnamese (vi), and chinese (zh)."
134
+ # " arabic (ar), english (en), spanish (es), french (fr), italian (it), japanese (ja), portuguese (pt), russian (ru), and chinese (zh)."
135
+ "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/statmt/cc100) corpus.\n"
136
+ "- measure\n"
137
+ " - `b_tokens/g_bytes` measures how many billion tokens per gigabyte of corpus.\n"
138
+ " - `t_tokens/t_bytes` measures how many trillion tokens per terabyte of corpus.\n"
139
+ # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
140
+ # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
141
+ " - `char/token` measures how many chars per token on the tokenized corpus.\n"
142
+ " - `oov_ratio`: out-of-vocabulary ratio on the selected corpus, 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
143
+ "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
144
+ )
145
+
146
+ gr.Markdown("## 🏆 Compression Rate Leaderboard")
147
+ search_bar = gr.Textbox(
148
+ placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
149
+ show_label=False,
150
+ elem_id="search-bar",
151
+ )
152
+ compress_rate_table = gr.Dataframe(datatype="html")
153
+
154
+ # func call
155
+ compress_rate_corpus.change(
156
+ get_compression_leaderboard,
157
+ inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
158
+ outputs=compress_rate_table
159
+ )
160
+ compress_rate_unit.change(
161
+ get_compression_leaderboard,
162
+ inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
163
+ outputs=compress_rate_table
164
+ )
165
+ # file_size.change(
166
+ # get_all_compress_rate,
167
+ # outputs=compress_rate_table
168
+ # )
169
+
170
+ search_bar.submit(
171
+ get_compression_leaderboard,
172
+ inputs=[
173
+ compress_rate_corpus,
174
+ compress_rate_unit,
175
+ search_bar,
176
+ ],
177
+ outputs=compress_rate_table
178
+ )
179
+
180
+ demo.load(
181
+ get_compression_leaderboard,
182
+ inputs=[compress_rate_corpus, compress_rate_unit],
183
+ outputs=compress_rate_table
184
+ )
185
+
186
+ if __name__ == "__main__":
187
+ demo.launch()
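The leaderboard's headline numbers come from `compression_util.tokenize_corpus`: `b_tokens/g_bytes` (billion tokens per gigabyte of UTF-8 text), `char/token`, and a lossless round-trip check `decode(encode(text)) == text`. A minimal sketch of those three measurements for one tokenizer on a toy sample; the checkpoint and the two sentences are placeholders, and the real script streams cc100 subsets via `datasets.load_dataset`:

```python
# Minimal sketch of the leaderboard metrics: b_tokens/g_bytes, char/token, and the
# lossless round-trip check. Checkpoint and sample texts are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
texts = ["Hello world!", "你好,世界。"]

n_bytes = n_chars = n_tokens = 0
lossless = True
for text in texts:
    ids = tokenizer.encode(text, add_special_tokens=False)
    n_bytes += len(text.encode("utf-8"))
    n_chars += len(text)
    n_tokens += len(ids)
    lossless &= (tokenizer.decode(ids) == text)

b_tokens_per_g_bytes = (n_tokens / 1e9) / (n_bytes / 1024 ** 3)
print("b_tokens/g_bytes:", round(b_tokens_per_g_bytes, 3))
print("char/token:", round(n_chars / n_tokens, 3))
print("lossless:", lossless)
```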
compression_util.py CHANGED
@@ -1,302 +1,320 @@
1
- """
2
-
3
- ## TODO
4
- code:
5
- math:
6
- whitespace:
7
-
8
- """
9
-
10
- import json
11
- import os
12
- import sys
13
- import pandas as pd
14
- from datasets import load_dataset
15
- from utils.log_util import logger
16
- from vocab import tokenizer_factory, TokenizerConfig
17
- from typing import List, Optional, Union, Literal
18
-
19
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
20
-
21
- common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
22
-
23
- common_corpuses = sorted(["cc100/en", "cc100/zh-Hans", "cc100/es", "cc100/fr", "cc100/de", "cc100/ko",
24
- "cc100/fa", "cc100/ar", "cc100/ja"])
25
-
26
- VALID_CODES_CC100 = [
27
- "am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
28
- "el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gn", "gu",
29
- "ha", "he", "hi", "hi_rom", "hr", "ht", "hu", "hy", "id", "ig", "is", "it", "ja", "jv", "ka",
30
- "kk", "km", "kn", "ko", "ku", "ky", "la", "lg", "li", "ln", "lo", "lt", "lv", "mg", "mk", "ml",
31
- "mn", "mr", "ms", "my", "my_zaw", "ne", "nl", "no", "ns", "om", "or", "pa", "pl", "ps", "pt",
32
- "qu", "rm", "ro", "ru", "sa", "si", "sc", "sd", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv",
33
- "sw", "ta", "ta_rom", "te", "te_rom", "th", "tl", "tn", "tr", "ug", "uk", "ur", "ur_rom", "uz",
34
- "vi", "wo", "xh", "yi", "yo", "zh-Hans", "zh-Hant", "zu",
35
- ]
36
-
37
-
38
- # code: https://huggingface.co/datasets/codeparrot/github-code-clean python java c sql html
39
- # math:
40
-
41
- def get_n_bytes_of_string(string_text):
42
- n_bytes = len(string_text.encode("utf-8"))
43
- return n_bytes
44
-
45
-
46
- def unit_convertor(stat, unit):
47
- n_tokens = stat["_n_tokens"]
48
- n_chars = stat["_n_chars"]
49
- n_bytes = stat["_n_bytes"]
50
-
51
- if n_tokens is None:
52
- return None
53
-
54
- n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
55
- n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
56
- n_bytes_in_mb = n_bytes / (1024 * 1024)
57
- n_bytes_in_gb = n_bytes_in_mb / 1024
58
- n_bytes_in_tb = n_bytes_in_gb / 1024
59
- # n_chars_in_billion = n_chars / (1000 * 1000 * 1000)
60
-
61
- if unit == "n_tokens/n_bytes":
62
- value = n_tokens / n_bytes
63
- elif unit in ["char/token", "chars_per_token"]: # 重要:平均一个token包含多少个字符。
64
- value = n_chars / n_tokens
65
- elif unit in ["token/char", "tokens_per_char"]: # 一个中文汉字需要几个token
66
- value = n_tokens / n_chars
67
- elif unit == "g_bytes/b_tokens":
68
- value = n_bytes_in_gb / n_tokens_in_billion
69
- elif unit == "b_tokens/g_bytes":
70
- value = n_tokens_in_billion / n_bytes_in_gb
71
- elif unit == "t_bytes/t_tokens": # 重要:
72
- value = n_bytes_in_tb / n_tokens_in_trillion
73
- elif unit == "t_tokens/t_bytes":
74
- value = n_tokens_in_trillion / n_bytes_in_tb
75
- else:
76
- raise "measure not support"
77
- return round(value, 3)
78
-
79
-
80
- def _merge_stats_by_corpus(stats_by_corpus, oov_threshold=0.3):
81
- """
82
- """
83
- all_stats = list(stats_by_corpus.values())
84
- assert len(set([stats["tokenizer"] for stats in all_stats])) == 1
85
- reversible = all(stat['reversible'] for stat in all_stats)
86
- is_support = all(stat['oov_ratio'] < oov_threshold for stat in all_stats)
87
-
88
- merged_stats = {
89
- "tokenizer": all_stats[0]["tokenizer"],
90
- "organization": all_stats[0]["organization"],
91
- "vocab_size": all_stats[0]["vocab_size"],
92
- "_n_bytes": 0,
93
- "_n_tokens": 0 if is_support else None,
94
- "_n_chars": 0,
95
- "_n_oov_chars": 0,
96
- "reversible": True,
97
- }
98
- for stats in all_stats:
99
- merged_stats["_n_bytes"] += stats["_n_bytes"]
100
- merged_stats["_n_chars"] += stats["_n_chars"]
101
- if is_support: # The number of tokens cannot be accurately counted, when there are too many UNKs.
102
- merged_stats["_n_tokens"] += stats["_n_tokens"]
103
- merged_stats["_n_oov_chars"] += stats["_n_oov_chars"]
104
- merged_stats["reversible"] &= stats['reversible']
105
-
106
- merged_stats.update({
107
- "oov_ratio": float("%.4g" % (stats["_n_oov_chars"] / stats["_n_chars"])),
108
- "reversible": reversible
109
- })
110
- return merged_stats
111
-
112
-
113
- def to_dataframe(stats, units=None):
114
- if units is None:
115
- units = common_units
116
- elif not isinstance(units, list):
117
- units = [units]
118
- table = []
119
-
120
- for stat in stats.values():
121
- columns = {k: v for k, v in stat.items() if not k.startswith("_")}
122
- for unit in units:
123
- if unit not in stat:
124
- columns[unit] = unit_convertor(stat, unit)
125
- else:
126
- logger.error(f"unit {unit} not support")
127
- table.append(columns)
128
- df = pd.DataFrame(table)
129
- return df
130
-
131
-
132
- cache = {}
133
-
134
-
135
- def tokenize_corpus(
136
- tokenizer_name: str,
137
- corpuses: List[str],
138
- cache_dir: str = "stats"
139
- ) -> dict:
140
- """
141
- :param tokenizer_name:
142
- :param corpuses:
143
- :param cache_dir:
144
- :return:
145
- """
146
-
147
- def _char_based_oov(src_text, decode_text):
148
- oov_chars = []
149
- for char in src_text:
150
- if char not in decode_text:
151
- oov_chars.append(char)
152
-
153
- n_oov_chars = len(oov_chars)
154
- oov_charset = list(dict.fromkeys(oov_chars))
155
- return n_oov_chars, oov_charset
156
-
157
- def _tokenize(tokenizer, datasets, detail_path=None):
158
- """
159
- :param tokenizer:
160
- :param datasets:
161
- :param detail_path:
162
- :return:
163
- """
164
- n_bytes = 0
165
- n_tokens = 0
166
- n_chars = 0
167
- n_oov_chars = 0
168
- diff_details = []
169
- oov_charset = set()
170
- unk_token_id = None
171
- if hasattr(tokenizer, "unk_token"):
172
- unk_token_id = tokenizer.unk_token_id
173
- for dataset in datasets:
174
- for item in dataset:
175
- text = item["text"]
176
- n_bytes += get_n_bytes_of_string(text)
177
- n_chars += len(text)
178
- ids = tokenizer.encode(text, add_special_tokens=False)
179
-
180
- # detect oov
181
- decode_text = tokenizer.decode(ids)
182
- decode_text_without_unk = tokenizer.decode([token_id for token_id in ids if token_id != unk_token_id])
183
- if decode_text != text:
184
- _n_oov_chars, _oov_charset = _char_based_oov(text, decode_text_without_unk)
185
- diff_details.append(
186
- {
187
- "text": text,
188
- "decode_text": decode_text,
189
- "decode_text_without_unk": decode_text_without_unk,
190
- "n_oov_chars": _n_oov_chars,
191
- 'oov_ratio': _n_oov_chars / len(text),
192
- 'oov_charset': json.dumps(_oov_charset, ensure_ascii=False),
193
- }
194
- )
195
- n_oov_chars += _n_oov_chars
196
- oov_charset.update(_oov_charset)
197
- n_tokens += len(ids)
198
- stat = {
199
- "_n_bytes": n_bytes,
200
- "_n_tokens": n_tokens,
201
- "_n_chars": n_chars,
202
- "_n_oov_chars": n_oov_chars,
203
- "oov_ratio": n_oov_chars / n_chars,
204
- '_oov_charset': json.dumps(list(oov_charset), ensure_ascii=False),
205
- "reversible": len(diff_details) == 0
206
- }
207
-
208
- if detail_path and diff_details:
209
- logger.info(f"saving tokenization detail to '{detail_path}'")
210
- with open(detail_path, "w", encoding="utf-8") as f:
211
- f.write(json.dumps(diff_details, ensure_ascii=False, indent=2))
212
- # print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
213
- # f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
214
- # f" unk_ratio: {unk_count / len(encoding):.4f}; oov: []")
215
- # for diff_detail in diff_details:
216
- # # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
217
- # # f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
218
- # f.write(f"text= {json.dumps(text[i:], ensure_ascii=False)}, \n"
219
- # f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
220
- return stat
221
-
222
- # load from cache
223
- cache_id = f"{tokenizer_name} @ {'.'.join(corpuses)}"
224
- cache_path = os.path.join(cache_dir, "compression_rate.json")
225
- if not cache and os.path.exists(cache_path):
226
- with open(cache_path, "r", encoding="utf-8") as f_tmp:
227
- cache.update(json.load(f_tmp))
228
- if cache_id in cache:
229
- # logger.info(f"loading {cache_id} from in-memory cache")
230
- return cache[cache_id]
231
-
232
- # tokenize corpus
233
- tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
234
- datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100/", ""), split="train") for corpus in corpuses]
235
-
236
- stat = {
237
- "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
238
- "organization": tokenizer_factory.get_tokenizer_config(tokenizer_name).org,
239
- "vocab_size": len(tokenizer),
240
- }
241
- tokenize_detail_dir = os.path.join(cache_dir, "compression_rate")
242
- os.makedirs(tokenize_detail_dir, exist_ok=True)
243
- tokenize_detail_path = os.path.join(tokenize_detail_dir, cache_id.replace("/", ".") + ".diff.json")
244
- stat.update(_tokenize(tokenizer, datasets, detail_path=tokenize_detail_path))
245
- # add basic info
246
-
247
- # save to cache
248
- len_before = len(cache)
249
- cache[cache_id] = stat
250
- len_after = len(cache)
251
- logger.info(f"saving '{cache_id}' to memory and file cache '{cache_path}': {len_before}->{len_after}")
252
- with open(cache_path, "w", encoding="utf-8") as f_tmp:
253
- json.dump(cache, f_tmp, ensure_ascii=False, indent=2)
254
- return stat
255
-
256
-
257
- def get_compression_leaderboard(
258
- corpuses: List[str] = ['cc100/en'],
259
- unit: str = "b_tokens/g_bytes",
260
- tokenizer_filter: Optional[str] = None,
261
- return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
262
- ) -> Union[pd.DataFrame, dict]:
263
- """
264
- """
265
- logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
266
- stats = {}
267
- if tokenizer_filter is not None:
268
- tokenizer_names = [tokenizer_name for tokenizer_name in tokenizer_factory.all_tokenizer_names
269
- if tokenizer_filter.lower() in tokenizer_name.lower()]
270
- else:
271
- tokenizer_names = tokenizer_factory.all_tokenizer_names
272
- for tokenizer_name in tokenizer_names:
273
- stats_by_corpus = {}
274
- for corpus in corpuses:
275
- stats_by_corpus[corpus] = tokenize_corpus(tokenizer_name, [corpus])
276
- stats[tokenizer_name] = _merge_stats_by_corpus(stats_by_corpus)
277
-
278
- if return_type == "dataframe":
279
- token_number_unit, file_size_unit = unit.split("/")
280
- reverse_unit = f"{file_size_unit}/{token_number_unit}"
281
- stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
282
- stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
283
- stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'}) # ⬇
284
- return stats
285
-
286
-
287
- def main():
288
- if len(sys.argv) == 3:
289
- tokenizer_filter = [sys.argv[1]]
290
- corpuses = [sys.argv[2]]
291
- else:
292
- tokenizer_filter = None
293
- corpuses = common_corpuses
294
- # tokenizer_filter = "openai"
295
- # corpuses = ["cc100/en", "cc100/zh-Hans"]
296
- df = get_compression_leaderboard(corpuses, tokenizer_filter=tokenizer_filter)
297
- # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
298
- logger.info(f"\n{df.to_markdown(index=False)}")
299
-
300
-
301
- if __name__ == "__main__":
302
- main()
1
+ """
2
+ ## more statistics
3
+ code:
4
+ math:
5
+ digit:
6
+ whitespace:
7
+ top_oov: most frequent oov chars
8
+ ranking: thumb_up thumb_down
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import sys
14
+ from difflib import SequenceMatcher
15
+ import pandas as pd
16
+ from datasets import load_dataset
17
+ from utils.log_util import logger
18
+ from vocab import tokenizer_factory, TokenizerConfig
19
+ from typing import List, Optional, Union, Literal
20
+
21
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
22
+
23
+ common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
24
+
25
+ common_corpuses = sorted(["cc100/en", "cc100/zh-Hans", "cc100/es", "cc100/fr", "cc100/de", "cc100/ko",
26
+ "cc100/fa", "cc100/ar", "cc100/ja"])
27
+
28
+ VALID_CODES_CC100 = [
29
+ "am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
30
+ "el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gn", "gu",
31
+ "ha", "he", "hi", "hi_rom", "hr", "ht", "hu", "hy", "id", "ig", "is", "it", "ja", "jv", "ka",
32
+ "kk", "km", "kn", "ko", "ku", "ky", "la", "lg", "li", "ln", "lo", "lt", "lv", "mg", "mk", "ml",
33
+ "mn", "mr", "ms", "my", "my_zaw", "ne", "nl", "no", "ns", "om", "or", "pa", "pl", "ps", "pt",
34
+ "qu", "rm", "ro", "ru", "sa", "si", "sc", "sd", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv",
35
+ "sw", "ta", "ta_rom", "te", "te_rom", "th", "tl", "tn", "tr", "ug", "uk", "ur", "ur_rom", "uz",
36
+ "vi", "wo", "xh", "yi", "yo", "zh-Hans", "zh-Hant", "zu",
37
+ ]
38
+
39
+
40
+ # code: https://huggingface.co/datasets/codeparrot/github-code-clean python java c sql html
41
+ # math:
42
+
43
+ def get_n_bytes_of_string(string_text):
44
+ n_bytes = len(string_text.encode("utf-8"))
45
+ return n_bytes
46
+
47
+
48
+ def unit_convertor(stat, unit):
49
+ n_tokens = stat["_n_tokens"]
50
+ n_chars = stat["_n_chars"]
51
+ n_bytes = stat["_n_bytes"]
52
+
53
+ if n_tokens is None:
54
+ return None
55
+
56
+ n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
57
+ n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
58
+ n_bytes_in_mb = n_bytes / (1024 * 1024)
59
+ n_bytes_in_gb = n_bytes_in_mb / 1024
60
+ n_bytes_in_tb = n_bytes_in_gb / 1024
61
+ # n_chars_in_billion = n_chars / (1000 * 1000 * 1000)
62
+
63
+ if unit == "n_tokens/n_bytes":
64
+ value = n_tokens / n_bytes
65
+ elif unit in ["char/token", "chars_per_token"]: # 重要:平均一个token包含多少个字符。
66
+ value = n_chars / n_tokens
67
+ elif unit in ["token/char", "tokens_per_char"]: # 一个中文汉字需要几个token?
68
+ value = n_tokens / n_chars
69
+ elif unit == "g_bytes/b_tokens":
70
+ value = n_bytes_in_gb / n_tokens_in_billion
71
+ elif unit == "b_tokens/g_bytes":
72
+ value = n_tokens_in_billion / n_bytes_in_gb
73
+ elif unit == "t_bytes/t_tokens": # 重要:
74
+ value = n_bytes_in_tb / n_tokens_in_trillion
75
+ elif unit == "t_tokens/t_bytes":
76
+ value = n_tokens_in_trillion / n_bytes_in_tb
77
+ else:
78
+ raise "measure not support"
79
+ return round(value, 3)
80
+
81
+
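For orientation, a minimal usage sketch of `unit_convertor`; the counts below are invented and only illustrate the expected `_n_*` keys:

    stat = {"_n_tokens": 1_200_000, "_n_chars": 3_600_000, "_n_bytes": 5_000_000}
    print(unit_convertor(stat, "char/token"))        # average characters per token -> 3.0
    print(unit_convertor(stat, "b_tokens/g_bytes"))  # billions of tokens per GB of raw text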
82
+ def _merge_stats_by_corpus(stats_by_corpus, oov_threshold=0.3):
83
+ """
84
+ """
85
+ all_stats = list(stats_by_corpus.values())
86
+ assert len(set([stats["tokenizer"] for stats in all_stats])) == 1
87
+ lossless = all(stat['lossless'] for stat in all_stats)
88
+ is_support = all(stat['oov_ratio'] < oov_threshold for stat in all_stats)
89
+
90
+ merged_stats = {
91
+ "tokenizer": all_stats[0]["tokenizer"],
92
+ "organization": all_stats[0]["organization"],
93
+ "vocab_size": all_stats[0]["vocab_size"],
94
+ "_n_bytes": 0,
95
+ "_n_tokens": 0 if is_support else None,
96
+ "_n_chars": 0,
97
+ "_n_oov_chars": 0,
98
+ "lossless": True,
99
+ }
100
+ for stats in all_stats:
101
+ merged_stats["_n_bytes"] += stats["_n_bytes"]
102
+ merged_stats["_n_chars"] += stats["_n_chars"]
103
+ if is_support: # The number of tokens cannot be counted accurately when there are too many UNKs.
104
+ merged_stats["_n_tokens"] += stats["_n_tokens"]
105
+ merged_stats["_n_oov_chars"] += stats["_n_oov_chars"]
106
+ merged_stats["lossless"] &= stats['lossless']
107
+
108
+ merged_stats.update({
109
+ "oov_ratio": float("%.4g" % (stats["_n_oov_chars"] / stats["_n_chars"])),
110
+ "lossless": lossless
111
+ })
112
+ return merged_stats
113
+
114
+
115
+ def to_dataframe(stats, units=None):
116
+ if units is None:
117
+ units = common_units
118
+ elif not isinstance(units, list):
119
+ units = [units]
120
+ table = []
121
+
122
+ for stat in stats.values():
123
+ columns = {k: v for k, v in stat.items() if not k.startswith("_")}
124
+ for unit in units:
125
+ if unit not in stat:
126
+ columns[unit] = unit_convertor(stat, unit)
127
+ else:
128
+ logger.error(f"unit {unit} not support")
129
+ table.append(columns)
130
+ df = pd.DataFrame(table)
131
+ return df
132
+
133
+
134
+ cache = {}
135
+
136
+
137
+ def tokenize_corpus(
138
+ tokenizer_name: str,
139
+ corpuses: List[str],
140
+ cache_dir: str = "stats"
141
+ ) -> dict:
142
+ """
143
+ :param tokenizer_name: name of a tokenizer registered in tokenizer_factory
145
+ :param corpuses: corpus identifiers, e.g. ["cc100/en", "cc100/zh-Hans"]
146
+ :param cache_dir: directory for the stats cache (compression_rate.json) and per-corpus diff files
147
+ :return: stat dict with byte/char/token counts, oov_ratio and lossless flags
147
+ """
148
+
149
+ def _assert_oov(tokenizer, oov_candidate):
150
+
151
+ tokenizer.encode(oov_candidate, add_special_tokens=False) # unused placeholder for a future OOV check
152
+
153
+ def _char_based_oov(src_text, decoded_text, tokenizer):
154
+ oov_charset = [] # keep the order in src_text
155
+ decoded_charset = set(decoded_text)
156
+ for char in dict.fromkeys(src_text):
157
+ if char not in decoded_charset \
158
+ and char != tokenizer.decode(tokenizer.encode(char, add_special_tokens=False)):
159
+ oov_charset.append(char)
160
+
161
+ n_oov_chars = sum([1 for char in src_text if char in oov_charset])
162
+ return n_oov_chars, oov_charset
163
+
164
+ def _diff_path(src_text, decoded_text):
165
+ s = SequenceMatcher(a=src_text, b=decoded_text)
166
+ changes = []
167
+ for tag, i1, i2, j1, j2 in s.get_opcodes():
168
+ if tag != "equal":
169
+ changes.append('{:7} text[{}:{}] --> decoded_text[{}:{}] {!r:>8} --> {!r}'.format(
170
+ tag, i1, i2, j1, j2, src_text[i1:i2], decoded_text[j1:j2]))
171
+ return changes
172
+
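A toy illustration of the `SequenceMatcher`-based diff used by `_diff_path`; the strings are made up and the exact opcode boundaries may differ:

    from difflib import SequenceMatcher
    s = SequenceMatcher(a="abc def", b="abc  def")
    print([op for op in s.get_opcodes() if op[0] != "equal"])
    # e.g. [('insert', 4, 4, 4, 5)] -> decoding introduced an extra space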
173
+ def _tokenize(tokenizer, datasets, detail_path=None):
174
+ """
175
+ :param tokenizer:
176
+ :param datasets:
177
+ :param detail_path:
178
+ :return:
179
+ """
180
+ n_bytes = 0
181
+ n_tokens = 0
182
+ n_chars = 0
183
+ n_oov_chars = 0
184
+ diff_details = []
185
+ oov_charset = set()
186
+ unk_token_id = None
187
+ if hasattr(tokenizer, "unk_token"):
188
+ unk_token_id = tokenizer.unk_token_id
189
+ for dataset in datasets:
190
+ for item in dataset:
191
+ text = item["text"]
192
+ n_bytes += get_n_bytes_of_string(text)
193
+ n_chars += len(text)
194
+ ids = tokenizer.encode(text, add_special_tokens=False)
195
+
196
+ # detect oov
197
+ decoded_text = tokenizer.decode(ids)
198
+ decoded_text_without_unk = tokenizer.decode([token_id for token_id in ids if token_id != unk_token_id])
199
+ if decoded_text != text:
200
+ _n_oov_chars, _oov_charset = _char_based_oov(text, decoded_text_without_unk, tokenizer)
201
+ diffs = _diff_path(text, decoded_text)
202
+ diff_details.append(
203
+ {
204
+ "text": text,
205
+ "decoded_text": decoded_text,
206
+ "diff": diffs,
207
+ "n_oov_chars": _n_oov_chars,
208
+ 'oov_ratio': _n_oov_chars / len(text),
209
+ 'oov_charset': json.dumps(_oov_charset, ensure_ascii=False),
210
+ }
211
+ )
212
+ n_oov_chars += _n_oov_chars
213
+ oov_charset.update(_oov_charset)
214
+ n_tokens += len(ids)
215
+ stat = {
216
+ "_n_bytes": n_bytes,
217
+ "_n_tokens": n_tokens,
218
+ "_n_chars": n_chars,
219
+ "_n_oov_chars": n_oov_chars,
220
+ "oov_ratio": n_oov_chars / n_chars,
221
+ '_oov_charset': json.dumps(list(oov_charset), ensure_ascii=False),
222
+ "lossless": len(diff_details) == 0
223
+ }
224
+
225
+ if detail_path and diff_details:
226
+ logger.info(f"saving tokenization detail to '{detail_path}'")
227
+ with open(detail_path, "w", encoding="utf-8") as f:
228
+ f.write(json.dumps(diff_details, ensure_ascii=False, indent=2))
229
+ # print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
230
+ # f"lossless: false; unk_token: {get_unk(tokenizer_config)},"
231
+ # f" unk_ratio: {unk_count / len(encoding):.4f}; oov: []")
232
+ # for diff_detail in diff_details:
233
+ # # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
234
+ # # f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
235
+ # f.write(f"text= {json.dumps(text[i:], ensure_ascii=False)}, \n"
236
+ # f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
237
+ return stat
238
+
239
+ # load from cache
240
+ cache_id = f"{tokenizer_name} @ {'.'.join(corpuses)}"
241
+ cache_path = os.path.join(cache_dir, "compression_rate.json")
242
+ if not cache and os.path.exists(cache_path):
243
+ with open(cache_path, "r", encoding="utf-8") as f_tmp:
244
+ cache.update(json.load(f_tmp))
245
+ if cache_id in cache:
246
+ # logger.info(f"loading {cache_id} from in-memory cache")
247
+ return cache[cache_id]
248
+
249
+ # tokenize corpus
250
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
251
+ datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100/", ""), split="train") for corpus in corpuses]
252
+
253
+ stat = {
254
+ "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
255
+ "organization": tokenizer_factory.get_tokenizer_config(tokenizer_name).org,
256
+ "vocab_size": len(tokenizer),
257
+ }
258
+ tokenize_detail_dir = os.path.join(cache_dir, "compression_rate")
259
+ os.makedirs(tokenize_detail_dir, exist_ok=True)
260
+ tokenize_detail_path = os.path.join(tokenize_detail_dir, cache_id.replace("/", ".") + ".diff.json")
261
+ stat.update(_tokenize(tokenizer, datasets, detail_path=tokenize_detail_path))
262
+ # add basic info
263
+
264
+ # save to cache
265
+ len_before = len(cache)
266
+ cache[cache_id] = stat
267
+ len_after = len(cache)
268
+ logger.info(f"saving '{cache_id}' to memory and file cache '{cache_path}': {len_before}->{len_after}")
269
+ with open(cache_path, "w", encoding="utf-8") as f_tmp:
270
+ json.dump(cache, f_tmp, ensure_ascii=False, indent=2)
271
+ return stat
272
+
273
+
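A hedged usage sketch of `tokenize_corpus`; the tokenizer name is only an example and must be registered in `tokenizer_factory`, and results are read from and written to the stats cache described above:

    stat = tokenize_corpus("openai/gpt-4", ["cc100/en"])
    print(stat["oov_ratio"], stat["lossless"], stat["_n_tokens"])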
274
+ def get_compression_leaderboard(
275
+ corpuses: List[str] = ['cc100/en'],
276
+ unit: str = "b_tokens/g_bytes",
277
+ tokenizer_filter: Optional[str] = None,
278
+ return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
279
+ ) -> Union[pd.DataFrame, dict]:
280
+ """
281
+ """
282
+ logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
283
+ stats = {}
284
+ if tokenizer_filter is not None:
285
+ tokenizer_names = [tokenizer_name for tokenizer_name in tokenizer_factory.all_tokenizer_names
286
+ if tokenizer_filter.lower() in tokenizer_name.lower()]
287
+ else:
288
+ tokenizer_names = tokenizer_factory.all_tokenizer_names
289
+ for tokenizer_name in tokenizer_names:
290
+ stats_by_corpus = {}
291
+ for corpus in corpuses:
292
+ stats_by_corpus[corpus] = tokenize_corpus(tokenizer_name, [corpus])
293
+ stats[tokenizer_name] = _merge_stats_by_corpus(stats_by_corpus)
294
+
295
+ if return_type == "dataframe":
296
+ token_number_unit, file_size_unit = unit.split("/")
297
+ reverse_unit = f"{file_size_unit}/{token_number_unit}"
298
+ stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
299
+ stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
300
+ stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'}) # ⬇
301
+ return stats
302
+
303
+
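Calling the leaderboard helper directly looks roughly like this; corpus names follow the cc100/<lang> convention defined in `common_corpuses`:

    df = get_compression_leaderboard(["cc100/en", "cc100/zh-Hans"], unit="b_tokens/g_bytes")
    print(df.head().to_markdown(index=False))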
304
+ def main():
305
+ if len(sys.argv) == 3:
306
+ tokenizer_filter = sys.argv[1] # substring filter, e.g. "openai"
307
+ corpuses = [sys.argv[2]]
308
+ else:
309
+ tokenizer_filter, corpuses = None, common_corpuses
310
+ # tokenizer_filter, corpuses = "openai", ["cc100/en", "cc100/zh-Hans"]
311
+ # tokenizer_filter, corpuses = "Qwen/Qwen1.5-14B", ["cc100/de"]
312
+ # tokenizer_filter, corpuses = "Qwen/Qwen1.5-14B", ["cc100/ja"] # oov 特别多
313
+ # tokenizer_filter, corpuses = "google-bert/bert-base-uncased", ["cc100/ja", "cc100/zh-Hans"] # oov 特别多
314
+ df = get_compression_leaderboard(corpuses, tokenizer_filter=tokenizer_filter)
315
+ # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
316
+ logger.info(f"\n{df.to_markdown(index=False)}")
317
+
318
+
319
+ if __name__ == "__main__":
320
+ main()
css/style.css CHANGED
@@ -1,59 +1,62 @@
1
-
2
- /* 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673 */
3
- .space-show {
4
- white-space: pre-wrap;
5
- }
6
-
7
- .cell-wrap {
8
- white-space: pre-wrap;
9
- }
10
-
11
-
12
- /* white button */
13
- .button-as-text {
14
- background: #fff;
15
- border-color: #fff;
16
- }
17
-
18
- .textbox-as-text {
19
- border-style: hidden;
20
- background: #fff;
21
- border-color: #fff;
22
- }
23
-
24
-
25
- .h2-font {
26
- font-size: 30px;
27
- }
28
-
29
- .no-border {
30
- border: 0px none;
31
- }
32
-
33
- /* 隐藏legend */
34
- .category-legend {
35
- display: none !important;
36
- }
37
-
38
- .statistics {
39
- min-width: min(50px, 100%) !important;
40
- }
41
-
42
- .statistics textarea {
43
- min-width: min(50px, 100%) !important;
44
- font-size: 20px !important;
45
- font-weight: 600 !important;
46
- text-align: center !important;
47
- border: none !important;
48
- }
49
-
50
- .statistics label {
51
- text-align: center !important;
52
- }
53
-
54
- /* align-self: flex-end; */
55
- .example-style {
56
- max-width: 150px;
57
- align-self: self-end;
58
- }
59
-
 
 
 
 
1
+ /* hide legend of HighlightText; has been integrated in gradio 4.32.0 with `show_inline_category`
2
+ .category-legend {
3
+ display: none !important;
4
+ }
5
+ */
6
+
7
+ /* show spaces in HighlightText: https://blog.csdn.net/liuxiao723846/article/details/118994673
8
+ TODO: integrate in gradio with `show_single_whitespace=True` or `strip_token=False`
9
+ */
10
+ .space-show {
11
+ white-space: pre-wrap;
12
+ }
13
+
14
+ .cell-wrap {
15
+ white-space: pre-wrap;
16
+ }
17
+
18
+
19
+ /* white button */
20
+ .button-as-text {
21
+ background: #fff;
22
+ border-color: #fff;
23
+ }
24
+
25
+ .textbox-as-text {
26
+ border-style: hidden;
27
+ background: #fff;
28
+ border-color: #fff;
29
+ }
30
+
31
+
32
+ .h2-font {
33
+ font-size: 30px;
34
+ }
35
+
36
+ .no-border {
37
+ border: 0px none;
38
+ }
39
+
40
+
41
+ .statistics {
42
+ min-width: min(50px, 100%) !important;
43
+ }
44
+
45
+ .statistics textarea {
46
+ min-width: min(50px, 100%) !important;
47
+ font-size: 20px !important;
48
+ font-weight: 600 !important;
49
+ text-align: center !important;
50
+ border: none !important;
51
+ }
52
+
53
+ .statistics label {
54
+ text-align: center !important;
55
+ }
56
+
57
+ /* align-self: flex-end; */
58
+ .example-style {
59
+ max-width: 150px;
60
+ align-self: self-end;
61
+ }
62
+
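These classes are attached from the Python side through gradio's `elem_classes`; a minimal sketch, assuming the stylesheet is passed to the Blocks constructor (the exact wiring in this repo may differ):

    import gradio as gr

    with gr.Blocks(css=open("css/style.css").read()) as demo:
        gr.Highlightedtext(elem_classes="space-show")    # keep whitespace visible in highlighted tokens
        gr.TextArea(lines=1, elem_classes="statistics")  # centered, borderless stat boxes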
playground_app.py CHANGED
@@ -1,264 +1,233 @@
1
- # coding=utf-8
2
- # author: xusong
3
- # time: 2022/8/23 16:06
4
-
5
- """
6
- ## TODO:
7
- - i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
8
- - iter_vocab warmup
9
- - 开关
10
- - add_special_token 开关
11
- - theme 开关 light/dark
12
- - token_id/tokens/bytes 开关
13
- - 中文字词统计,是否要包括 _ G 等字符
14
- - 评测
15
- - OOV评测
16
- - 通过 javascript 添加 hover_text
17
- - 英文 utf-8编码
18
- - 词典支持下载,借用image下载的标签,
19
- - baichuan的单字数量怎么两万多个?
20
- - qwen: ValueError: Unclosed image token
21
- - 路径修改为全path meta-llama/Llama-2-13b-hf
22
-
23
- plots
24
-
25
- table
26
-
27
- ## related demo
28
- - [](http://text-processing.com/demo/tokenize/)
29
- - [gpt-tokenizer](https://gpt-tokenizer.dev/)
30
- - [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
31
- - [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
32
-
33
- ## 可视化
34
-
35
- [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
36
- """
37
-
38
- import gradio as gr
39
- from vocab import tokenizer_factory
40
- from playground_examples import example_types, example_fn
41
- from playground_util import tokenize, tokenize_pair, basic_count, get_overlap_token_size, on_load
42
-
43
-
44
-
45
-
46
- get_window_url_params = """
47
- function(url_params) {
48
- const params = new URLSearchParams(window.location.search);
49
- url_params = JSON.stringify(Object.fromEntries(params));
50
- return url_params;
51
- }
52
- """
53
-
54
- all_tokenizer_name = [(config.name_display, config.name_or_path) for config in tokenizer_factory.all_tokenizer_configs]
55
-
56
- with gr.Blocks() as demo:
57
- # links: https://www.coderstool.com/utf8-encoding-decoding
58
- # 功能:输入文本,进行分词
59
- # 分词器:常见的分词器有集中,
60
- # 背景:方便分词、看词粒度、对比
61
-
62
- with gr.Row():
63
- gr.Markdown("## Input Text")
64
- dropdown_examples = gr.Dropdown(
65
- example_types,
66
- value="Examples",
67
- type="index",
68
- allow_custom_value=True,
69
- show_label=False,
70
- container=False,
71
- scale=0,
72
- elem_classes="example-style"
73
- )
74
- user_input = gr.Textbox(
75
- # value=default_user_input,
76
- label="Input Text",
77
- lines=5,
78
- show_label=False,
79
- )
80
- gr.Markdown("## Tokenization")
81
-
82
- # compress rate setting TODO: 将 这个模块调整到下面
83
- # with gr.Accordion("Compress Rate Setting", open=True):
84
- # gr.Markdown(
85
- # "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
86
- # with gr.Row():
87
- # compress_rate_corpus = gr.CheckboxGroup(
88
- # common_corpuses, # , "code"
89
- # value=["cc100-en", "cc100-zh-Hans"],
90
- # label="corpus",
91
- # # info=""
92
- # )
93
- # compress_rate_unit = gr.Radio(
94
- # common_units,
95
- # value="b_tokens/g_bytes",
96
- # label="unit",
97
- # )
98
- # TODO: Token Setting
99
- # with gr.Accordion("Token Filter Setting", open=False):
100
- # gr.Markdown(
101
- # "Get total number of tokens which contain the following character)")
102
- # gr.Radio(
103
- # ["zh-Hans", "", "number", "space"],
104
- # value="zh",
105
- # )
106
-
107
- with gr.Row():
108
- with gr.Column(scale=6):
109
- with gr.Group():
110
- tokenizer_name_1 = gr.Dropdown(
111
- all_tokenizer_name,
112
- label="Tokenizer 1",
113
- )
114
- with gr.Group():
115
- with gr.Row():
116
- organization_1 = gr.TextArea(
117
- label="Organization",
118
- lines=1,
119
- elem_classes="statistics",
120
- )
121
- stats_vocab_size_1 = gr.TextArea(
122
- label="Vocab Size",
123
- lines=1,
124
- elem_classes="statistics"
125
- )
126
- # stats_zh_token_size_1 = gr.TextArea(
127
- # label="ZH char/word",
128
- # lines=1,
129
- # elem_classes="statistics",
130
- # )
131
- # stats_compress_rate_1 = gr.TextArea(
132
- # label="Compress Rate",
133
- # lines=1,
134
- # elem_classes="statistics",
135
- # )
136
- stats_overlap_token_size_1 = gr.TextArea(
137
- # value=default_stats_overlap_token_size,
138
- label="Overlap Tokens",
139
- lines=1,
140
- elem_classes="statistics"
141
- )
142
- # stats_3 = gr.TextArea(
143
- # label="Compress Rate",
144
- # lines=1,
145
- # elem_classes="statistics"
146
- # )
147
- # https://www.onlinewebfonts.com/icon/418591
148
- gr.Image("images/VS.svg", scale=1, show_label=False,
149
- show_download_button=False, container=False,
150
- show_share_button=False)
151
- with gr.Column(scale=6):
152
- with gr.Group():
153
- tokenizer_name_2 = gr.Dropdown(
154
- all_tokenizer_name,
155
- label="Tokenizer 2",
156
- )
157
- with gr.Group():
158
- with gr.Row():
159
- organization_2 = gr.TextArea(
160
- label="Organization",
161
- lines=1,
162
- elem_classes="statistics",
163
- )
164
- stats_vocab_size_2 = gr.TextArea(
165
- label="Vocab Size",
166
- lines=1,
167
- elem_classes="statistics"
168
- )
169
- # stats_zh_token_size_2 = gr.TextArea(
170
- # label="ZH char/word", # 中文字/词
171
- # lines=1,
172
- # elem_classes="statistics",
173
- # )
174
- # stats_compress_rate_2 = gr.TextArea(
175
- # label="Compress Rate",
176
- # lines=1,
177
- # elem_classes="statistics"
178
- # )
179
- stats_filtered_token_2 = gr.TextArea(
180
- label="filtered tokens",
181
- lines=1,
182
- elem_classes="statistics",
183
- visible=False
184
- )
185
- stats_overlap_token_size_2 = gr.TextArea(
186
- label="Overlap Tokens",
187
- lines=1,
188
- elem_classes="statistics"
189
- )
190
-
191
- # TODO: 图 表 压缩率
192
- with gr.Row():
193
- # dynamic change label
194
- with gr.Column():
195
- output_text_1 = gr.Highlightedtext(
196
- show_legend=True,
197
- elem_classes="space-show"
198
- )
199
- with gr.Column():
200
- output_text_2 = gr.Highlightedtext(
201
- show_legend=True,
202
- elem_classes="space-show"
203
- )
204
-
205
- with gr.Row():
206
- output_table_1 = gr.Dataframe()
207
- output_table_2 = gr.Dataframe()
208
-
209
- # setting
210
- # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
211
- # [stats_compress_rate_1, stats_compress_rate_2])
212
-
213
- tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
214
- [output_text_1, output_table_1])
215
- tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, organization_1])
216
- tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
217
- [stats_overlap_token_size_1, stats_overlap_token_size_2])
218
- # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
219
- # [stats_compress_rate_1])
220
-
221
- # TODO: every=3
222
- user_input.change(tokenize_pair,
223
- [user_input, tokenizer_name_1, tokenizer_name_2],
224
- [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
225
-
226
- tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
227
- [output_text_2, output_table_2])
228
- tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, organization_2])
229
- tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
230
- [stats_overlap_token_size_1, stats_overlap_token_size_2])
231
- # tokenizer_type_2.change(get_compress_rate,
232
- # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
233
- # [stats_compress_rate_2])
234
- #
235
- # compress_rate_unit.change(get_compress_rate,
236
- # [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
237
- # [stats_compress_rate_1])
238
- # compress_rate_unit.change(get_compress_rate,
239
- # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
240
- # [stats_compress_rate_2])
241
- # compress_rate_corpus.change(get_compress_rate,
242
- # [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
243
- # [stats_compress_rate_1])
244
- # compress_rate_corpus.change(get_compress_rate,
245
- # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
246
- # [stats_compress_rate_2])
247
-
248
- dropdown_examples.change(
249
- example_fn,
250
- dropdown_examples,
251
- [user_input, tokenizer_name_1, tokenizer_name_2]
252
- )
253
-
254
- demo.load(
255
- fn=on_load,
256
- inputs=[user_input], # 这里只需要传个空object即可。
257
- outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
258
- js=get_window_url_params
259
- )
260
-
261
- if __name__ == "__main__":
262
- # demo.queue(max_size=20).launch()
263
- demo.launch()
264
- # demo.launch(share=True)
 
1
+ # coding=utf-8
2
+ # author: xusong
3
+ # time: 2022/8/23 16:06
4
+
5
+ import gradio as gr
6
+ from vocab import tokenizer_factory
7
+ from playground_examples import example_types, example_fn
8
+ from playground_util import tokenize, tokenize_pair, basic_count, get_overlap_token_size, on_load
9
+
10
+
11
+
12
+
13
+ get_window_url_params = """
14
+ function(url_params) {
15
+ const params = new URLSearchParams(window.location.search);
16
+ url_params = JSON.stringify(Object.fromEntries(params));
17
+ return url_params;
18
+ }
19
+ """
20
+
21
+ all_tokenizer_name = [(config.name_display, config.name_or_path) for config in tokenizer_factory.all_tokenizer_configs]
22
+
23
+ with gr.Blocks() as demo:
24
+ # links: https://www.coderstool.com/utf8-encoding-decoding
25
+ # Function: tokenize the input text
26
+ # Tokenizers: several common tokenizers are supported
27
+ # Background: makes it easy to tokenize, inspect token granularity, and compare tokenizers
28
+
29
+ with gr.Row():
30
+ gr.Markdown("## Input Text")
31
+ dropdown_examples = gr.Dropdown(
32
+ example_types,
33
+ value="Examples",
34
+ type="index",
35
+ allow_custom_value=True,
36
+ show_label=False,
37
+ container=False,
38
+ scale=0,
39
+ elem_classes="example-style"
40
+ )
41
+ user_input = gr.Textbox(
42
+ # value=default_user_input,
43
+ label="Input Text",
44
+ lines=5,
45
+ show_label=False,
46
+ )
47
+ gr.Markdown("## Tokenization")
48
+
49
+ # compress rate setting TODO: move this block further down
50
+ # with gr.Accordion("Compress Rate Setting", open=True):
51
+ # gr.Markdown(
52
+ # "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
53
+ # with gr.Row():
54
+ # compress_rate_corpus = gr.CheckboxGroup(
55
+ # common_corpuses, # , "code"
56
+ # value=["cc100-en", "cc100-zh-Hans"],
57
+ # label="corpus",
58
+ # # info=""
59
+ # )
60
+ # compress_rate_unit = gr.Radio(
61
+ # common_units,
62
+ # value="b_tokens/g_bytes",
63
+ # label="unit",
64
+ # )
65
+ # TODO: Token Setting
66
+ # with gr.Accordion("Token Filter Setting", open=False):
67
+ # gr.Markdown(
68
+ # "Get total number of tokens which contain the following character)")
69
+ # gr.Radio(
70
+ # ["zh-Hans", "", "number", "space"],
71
+ # value="zh",
72
+ # )
73
+
74
+ with gr.Row():
75
+ with gr.Column(scale=6):
76
+ with gr.Group():
77
+ tokenizer_name_1 = gr.Dropdown(
78
+ all_tokenizer_name,
79
+ label="Tokenizer 1",
80
+ )
81
+ with gr.Group():
82
+ with gr.Row():
83
+ organization_1 = gr.TextArea(
84
+ label="Organization",
85
+ lines=1,
86
+ elem_classes="statistics",
87
+ )
88
+ stats_vocab_size_1 = gr.TextArea(
89
+ label="Vocab Size",
90
+ lines=1,
91
+ elem_classes="statistics"
92
+ )
93
+ # stats_zh_token_size_1 = gr.TextArea(
94
+ # label="ZH char/word",
95
+ # lines=1,
96
+ # elem_classes="statistics",
97
+ # )
98
+ # stats_compress_rate_1 = gr.TextArea(
99
+ # label="Compress Rate",
100
+ # lines=1,
101
+ # elem_classes="statistics",
102
+ # )
103
+ stats_overlap_token_size_1 = gr.TextArea(
104
+ # value=default_stats_overlap_token_size,
105
+ label="Overlap Tokens",
106
+ lines=1,
107
+ elem_classes="statistics"
108
+ )
109
+ # stats_3 = gr.TextArea(
110
+ # label="Compress Rate",
111
+ # lines=1,
112
+ # elem_classes="statistics"
113
+ # )
114
+ # https://www.onlinewebfonts.com/icon/418591
115
+ gr.Image("images/VS.svg", scale=1, show_label=False,
116
+ show_download_button=False, container=False,
117
+ show_share_button=False)
118
+ with gr.Column(scale=6):
119
+ with gr.Group():
120
+ tokenizer_name_2 = gr.Dropdown(
121
+ all_tokenizer_name,
122
+ label="Tokenizer 2",
123
+ )
124
+ with gr.Group():
125
+ with gr.Row():
126
+ organization_2 = gr.TextArea(
127
+ label="Organization",
128
+ lines=1,
129
+ elem_classes="statistics",
130
+ )
131
+ stats_vocab_size_2 = gr.TextArea(
132
+ label="Vocab Size",
133
+ lines=1,
134
+ elem_classes="statistics"
135
+ )
136
+ # stats_zh_token_size_2 = gr.TextArea(
137
+ # label="ZH char/word", # 中文字/词
138
+ # lines=1,
139
+ # elem_classes="statistics",
140
+ # )
141
+ # stats_compress_rate_2 = gr.TextArea(
142
+ # label="Compress Rate",
143
+ # lines=1,
144
+ # elem_classes="statistics"
145
+ # )
146
+ stats_filtered_token_2 = gr.TextArea(
147
+ label="filtered tokens",
148
+ lines=1,
149
+ elem_classes="statistics",
150
+ visible=False
151
+ )
152
+ stats_overlap_token_size_2 = gr.TextArea(
153
+ label="Overlap Tokens",
154
+ lines=1,
155
+ elem_classes="statistics"
156
+ )
157
+
158
+ # TODO: charts, tables, compression rate
159
+ with gr.Row():
160
+ # dynamic change label
161
+ with gr.Column():
162
+ output_text_1 = gr.Highlightedtext(
163
+ show_legend=False,
164
+ show_inline_category=False,
165
+ elem_classes="space-show"
166
+ )
167
+ with gr.Column():
168
+ output_text_2 = gr.Highlightedtext(
169
+ show_legend=False,
170
+ show_inline_category=False,
171
+ elem_classes="space-show"
172
+ )
173
+
174
+ with gr.Row():
175
+ output_table_1 = gr.Dataframe()
176
+ output_table_2 = gr.Dataframe()
177
+
178
+ # setting
179
+ # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
180
+ # [stats_compress_rate_1, stats_compress_rate_2])
181
+
182
+ tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
183
+ [output_text_1, output_table_1])
184
+ tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, organization_1])
185
+ tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
186
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
187
+ # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
188
+ # [stats_compress_rate_1])
189
+
190
+ # TODO: every=3
191
+ user_input.change(tokenize_pair,
192
+ [user_input, tokenizer_name_1, tokenizer_name_2],
193
+ [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
194
+
195
+ tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
196
+ [output_text_2, output_table_2])
197
+ tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, organization_2])
198
+ tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
199
+ [stats_overlap_token_size_1, stats_overlap_token_size_2])
200
+ # tokenizer_type_2.change(get_compress_rate,
201
+ # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
202
+ # [stats_compress_rate_2])
203
+ #
204
+ # compress_rate_unit.change(get_compress_rate,
205
+ # [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
206
+ # [stats_compress_rate_1])
207
+ # compress_rate_unit.change(get_compress_rate,
208
+ # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
209
+ # [stats_compress_rate_2])
210
+ # compress_rate_corpus.change(get_compress_rate,
211
+ # [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
212
+ # [stats_compress_rate_1])
213
+ # compress_rate_corpus.change(get_compress_rate,
214
+ # [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
215
+ # [stats_compress_rate_2])
216
+
217
+ dropdown_examples.change(
218
+ example_fn,
219
+ dropdown_examples,
220
+ [user_input, tokenizer_name_1, tokenizer_name_2]
221
+ )
222
+
223
+ demo.load(
224
+ fn=on_load,
225
+ inputs=[user_input], # just passing an empty object is enough here.
226
+ outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
227
+ js=get_window_url_params
228
+ )
229
+
230
+ if __name__ == "__main__":
231
+ # demo.queue(max_size=20).launch()
232
+ demo.launch()
233
+ # demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
playground_util.py CHANGED
@@ -1,181 +1,181 @@
1
- import gradio as gr
2
- import json
3
- import copy
4
- import pandas as pd
5
- from vocab import tokenizer_factory
6
- from character_util import iter_vocab
7
- from utils.log_util import logger
8
- from functools import lru_cache
9
-
10
- default_user_input = """\
11
- Replace this text in the input field to see how tokenization works.
12
- Buenos días!
13
- 华为发布Mate60手机。
14
- ラグビーワールドカップ2023フランス"""
15
- # default_tokenizer_name_1 = "Meta/llama3"
16
- default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
17
- default_tokenizer_name_2 = "openai/gpt-4"
18
-
19
-
20
- @lru_cache
21
- def _tokenize(
22
- text: str,
23
- tokenizer_name: str,
24
- color_num: int = 5,
25
- add_special_token: bool = False
26
- ):
27
- logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
28
- pos_tokens = []
29
- tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
30
- if add_special_token:
31
- encoding = tokenizer.encode(text, add_special_tokens=True)
32
- else:
33
- encoding = tokenizer.encode(text, add_special_tokens=False)
34
-
35
- table = []
36
-
37
- for idx, token_id in enumerate(encoding):
38
- decode_text = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
39
- pos_tokens.extend([(decode_text, str(idx % color_num))])
40
-
41
- # token "Byte": # 这是 utf-8编码吧?
42
- token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
43
- if isinstance(token, bytes):
44
- try:
45
- token_str = token.decode("utf-8")
46
- except:
47
- token_str = token.decode("utf-8", errors="ignore")
48
- logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
49
- {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
50
- ensure_ascii=False))
51
-
52
- token_bytes = token
53
- # json_dumps = json.dumps(token_str)
54
- elif isinstance(token, str):
55
- token_str = token
56
- token_bytes = bytes(token_str, "utf-8")
57
- # json_dumps = json.dumps(token_str)
58
- else:
59
- logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
60
- {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
61
- token_str = token
62
- token_bytes = token
63
- # continue
64
-
65
- # ⭐
66
- # TODO: gpt3.5_turbo错误: 只有id和text是对的,token和 utf8都是错的。说明 convert_ids_to_tokens 出错了。
67
- table.append(
68
- {"TokenID": token_id,
69
- "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
70
- "Text": decode_text, #
71
- # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
72
- "UTF8 Bytes": str(token_bytes),
73
- # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
74
- }
75
- )
76
-
77
- table_df = pd.DataFrame(table)
78
- logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
79
- return pos_tokens, len(encoding), table_df
80
-
81
-
82
- def tokenize(
83
- text: str,
84
- tokenizer_name: str,
85
- color_num: int = 5,
86
- add_special_token: bool = False
87
- ):
88
- """ tokenize wrapper
89
- As gr.Update would be overwritten after passing to frontend, we apply lru_cache in _tokenize.
90
- """
91
- pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num, add_special_token)
92
- return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df
93
-
94
-
95
- def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
96
- """
97
- input_text.change
98
- """
99
- pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
100
- pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
101
- return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
102
-
103
-
104
- @lru_cache
105
- def basic_count(tokenizer_name):
106
- stats = iter_vocab(tokenizer_name)
107
- return stats['vocab_size'], f'{stats["organization"]}'
108
- # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
109
-
110
-
111
- # def get_compress_rate(tokenizer_name, all_corpus, unit):
112
- # tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
113
- # compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
114
- # compress_rate = unit_convertor(compress_rate_stats, unit)
115
- # return compress_rate
116
-
117
-
118
- @lru_cache
119
- def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
120
- tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_name_1)
121
- tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_name_2)
122
-
123
- vocab_set_1 = tokenizer1.get_vocab().keys()
124
- vocab_set_2 = tokenizer2.get_vocab().keys()
125
-
126
- token1 = next(iter(vocab_set_1))
127
- token2 = next(iter(vocab_set_2))
128
- if type(token1) != type(token2): # bytes str
129
- if isinstance(token1, str):
130
- vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
131
- if isinstance(token2, str):
132
- vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])
133
-
134
- overlap_tokens = vocab_set_1 & vocab_set_2
135
- overlap_token_size = len(overlap_tokens)
136
- logger.info(
137
- f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
138
- return overlap_token_size, overlap_token_size
139
-
140
-
141
- def on_load(url_params, request: gr.Request):
142
- """
143
- onLoad
144
- """
145
- text = None
146
- tokenizer_type_1 = None
147
- tokenizer_type_2 = None
148
- try:
149
- url_params = json.loads(url_params)
150
- except:
151
- url_params = {}
152
- if request:
153
- logger.info(str(request.headers))
154
- client_ip = request.client.host
155
- # local_ip = socket.gethostbyname(socket.gethostbyname(""))
156
- # headers = request.kwargs['headers']
157
- # if headers and 'x-forwarded-for' in headers:
158
- # x_forwarded_for = headers['x-forwarded-for']
159
- # client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
160
- # if "referer" in request.headers: # not work for huggingface-space
161
- # url_params = parse_qs(urlparse(request.headers["referer"]).query)
162
- # url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
163
- tokenizer_type_1 = url_params.get("tokenizer1", default_tokenizer_name_1)
164
- tokenizer_type_2 = url_params.get("tokenizer2", default_tokenizer_name_2)
165
- text = url_params.get("text", default_user_input)
166
- logger.info(f"client_ip: {client_ip}; params: {url_params}")
167
- return text, tokenizer_type_1, tokenizer_type_2
168
-
169
-
170
- # def compress_rate_unit_change(unit):
171
- # return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
172
-
173
-
174
- def test_coding():
175
- bytes1 = b'\xe4\xb8\xad'
176
- print(bytes1) # b'\xe4\xb8\xad'
177
-
178
-
179
- if __name__ == "__main__":
180
- print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
181
- # print(basic_count("internlm_chat_7b"))
 
1
+ import gradio as gr
2
+ import json
3
+ import copy
4
+ import pandas as pd
5
+ from vocab import tokenizer_factory
6
+ from character_util import iter_vocab
7
+ from utils.log_util import logger
8
+ from functools import lru_cache
9
+
10
+ default_user_input = """\
11
+ Replace this text in the input field to see how tokenization works.
12
+ Buenos días!
13
+ 华为发布Mate60手机。
14
+ ラグビーワールドカップ2023フランス"""
15
+ # default_tokenizer_name_1 = "Meta/llama3"
16
+ default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
17
+ default_tokenizer_name_2 = "openai/gpt-4o"
18
+
19
+
20
+ @lru_cache
21
+ def _tokenize(
22
+ text: str,
23
+ tokenizer_name: str,
24
+ color_num: int = 5,
25
+ add_special_token: bool = False
26
+ ):
27
+ logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
28
+ pos_tokens = []
29
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
30
+ if add_special_token:
31
+ encoding = tokenizer.encode(text, add_special_tokens=True)
32
+ else:
33
+ encoding = tokenizer.encode(text, add_special_tokens=False)
34
+
35
+ table = []
36
+
37
+ for idx, token_id in enumerate(encoding):
38
+ decoded_text = tokenizer.decode([token_id]) # special characters all decode to �, i.e. "\ufffd"
39
+ pos_tokens.extend([(decoded_text, str(idx % color_num))])
40
+
41
+ # token "Byte": # 这是 utf-8编码吧?
42
+ token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
43
+ if isinstance(token, bytes):
44
+ try:
45
+ token_str = token.decode("utf-8")
46
+ except:
47
+ token_str = token.decode("utf-8", errors="ignore")
48
+ logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
49
+ {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
50
+ ensure_ascii=False))
51
+
52
+ token_bytes = token
53
+ # json_dumps = json.dumps(token_str)
54
+ elif isinstance(token, str):
55
+ token_str = token
56
+ token_bytes = bytes(token_str, "utf-8")
57
+ # json_dumps = json.dumps(token_str)
58
+ else:
59
+ logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
60
+ {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
61
+ token_str = token
62
+ token_bytes = token
63
+ # continue
64
+
65
+ # ⭐
66
+ # TODO: gpt3.5_turbo bug: only the id and text are correct; token and utf8 are wrong, which means convert_ids_to_tokens fails.
67
+ table.append(
68
+ {"TokenID": token_id,
69
+ "Token": token_str, # utf-8解码后的字符串,为什么���些是 <0xE7>,表示什么?比如llama
70
+ "Text": decoded_text, #
71
+ # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
72
+ "UTF8 Bytes": str(token_bytes),
73
+ # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
74
+ }
75
+ )
76
+
77
+ table_df = pd.DataFrame(table)
78
+ logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
79
+ return pos_tokens, len(encoding), table_df
80
+
81
+
82
+ def tokenize(
83
+ text: str,
84
+ tokenizer_name: str,
85
+ color_num: int = 5,
86
+ add_special_token: bool = False
87
+ ):
88
+ """ tokenize wrapper
89
+ As gr.Update would be overwritten after passing to frontend, we apply lru_cache in _tokenize.
90
+ """
91
+ pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num, add_special_token)
92
+ return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df
93
+
94
+
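For reference, a rough sketch of what the cached helper returns; the tokenizer name is illustrative and must exist in `tokenizer_factory`:

    pos_tokens, num_tokens, table_df = _tokenize("hello world", "openai/gpt-4")
    # pos_tokens: [(decoded_piece, color_index_as_str), ...] fed to the HighlightedText component
    # table_df:   one pandas row per token with TokenID / Token / Text / UTF8 Bytes columns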
95
+ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
96
+ """
97
+ input_text.change
98
+ """
99
+ pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
100
+ pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
101
+ return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
102
+
103
+
104
+ @lru_cache
105
+ def basic_count(tokenizer_name):
106
+ stats = iter_vocab(tokenizer_name)
107
+ return stats['vocab_size'], f'{stats["organization"]}'
108
+ # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
109
+
110
+
111
+ # def get_compress_rate(tokenizer_name, all_corpus, unit):
112
+ # tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
113
+ # compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
114
+ # compress_rate = unit_convertor(compress_rate_stats, unit)
115
+ # return compress_rate
116
+
117
+
118
+ @lru_cache
119
+ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
120
+ tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_name_1)
121
+ tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_name_2)
122
+
123
+ vocab_set_1 = tokenizer1.get_vocab().keys()
124
+ vocab_set_2 = tokenizer2.get_vocab().keys()
125
+
126
+ token1 = next(iter(vocab_set_1))
127
+ token2 = next(iter(vocab_set_2))
128
+ if type(token1) != type(token2): # bytes str
129
+ if isinstance(token1, str):
130
+ vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
131
+ if isinstance(token2, str):
132
+ vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])
133
+
134
+ overlap_tokens = vocab_set_1 & vocab_set_2
135
+ overlap_token_size = len(overlap_tokens)
136
+ logger.info(
137
+ f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
138
+ return overlap_token_size, overlap_token_size
139
+
140
+
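The bytes-vs-str normalization above matters because tiktoken-style vocabs key on bytes while most Hugging Face vocabs key on str; a toy illustration with invented vocabularies:

    v1 = {"hello", "world"}                     # str-keyed vocab
    v2 = {b"hello", b"foo"}                     # bytes-keyed vocab
    v1_bytes = {t.encode("utf-8") for t in v1}  # normalize to bytes before intersecting
    print(len(v1_bytes & v2))                   # 1 overlapping token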
141
+ def on_load(url_params, request: gr.Request):
142
+ """
143
+ onLoad
144
+ """
145
+ text = None
146
+ tokenizer_type_1 = None
147
+ tokenizer_type_2 = None
148
+ try:
149
+ url_params = json.loads(url_params)
150
+ except:
151
+ url_params = {}
152
+ if request:
153
+ logger.info(str(request.headers))
154
+ client_ip = request.client.host
155
+ # local_ip = socket.gethostbyname(socket.gethostbyname(""))
156
+ # headers = request.kwargs['headers']
157
+ # if headers and 'x-forwarded-for' in headers:
158
+ # x_forwarded_for = headers['x-forwarded-for']
159
+ # client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
160
+ # if "referer" in request.headers: # not work for huggingface-space
161
+ # url_params = parse_qs(urlparse(request.headers["referer"]).query)
162
+ # url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
163
+ tokenizer_type_1 = url_params.get("tokenizer1", default_tokenizer_name_1)
164
+ tokenizer_type_2 = url_params.get("tokenizer2", default_tokenizer_name_2)
165
+ text = url_params.get("text", default_user_input)
166
+ logger.info(f"client_ip: {client_ip}; params: {url_params}")
167
+ return text, tokenizer_type_1, tokenizer_type_2
168
+
169
+
170
+ # def compress_rate_unit_change(unit):
171
+ # return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
172
+
173
+
174
+ def test_coding():
175
+ bytes1 = b'\xe4\xb8\xad'
176
+ print(bytes1) # b'\xe4\xb8\xad'
177
+
178
+
179
+ if __name__ == "__main__":
180
+ print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
181
+ # print(basic_count("internlm_chat_7b"))
requirements.txt CHANGED
@@ -1,11 +1,12 @@
1
- transformers
2
- sentencepiece
3
- tiktoken
4
- icetk
5
- torch
6
- nltk
7
- boto3
8
- protobuf==4.25.3
9
- ai2-olmo==0.2.4
10
- ipadic
 
11
  fugashi
 
1
+ gradio>=4.32.0
2
+ transformers
3
+ sentencepiece
4
+ tiktoken
5
+ icetk
6
+ torch
7
+ nltk
8
+ boto3
9
+ protobuf==4.25.3
10
+ ai2-olmo==0.2.4
11
+ ipadic
12
  fugashi
stats/character_stats.json CHANGED
The diff for this file is too large to render. See raw diff
 
stats/compression_rate.json CHANGED
The diff for this file is too large to render. See raw diff
 
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ar.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfb1c2be8bf13e5989a95b5f401f92aaad6cadde8ecc704ebaf9b9578bb359a2
3
+ size 2145294
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.de.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:554a365ce0da76ae5d93642b496bb1bc3d8d78c1112523545a2219f7fe213a91
3
+ size 10978507
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.en.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c349b2602379affd0aa388d75addece67a14d0afaaf5b4980c90e9cc875e8e
3
+ size 5261108
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.es.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e829c4c544a9e8d98701b3d3bf1e3593b63e59ab5ba244c1ab376f6002fbd0f9
3
+ size 6853004
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fa.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908327a56262f721590d9479faa579156ba8bd155242262943797be697bc2655
3
+ size 1058478
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fr.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f02e17dfe25c4c1526c8adee812a7141d92ccbd3b1160e7c73fc325d9fbfe4e
3
+ size 6385085
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ja.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0abf3a91ddeeaa12d4732eaf1b4ff2a207b3d85fc54a079b4ac853696d831148
3
+ size 2529096
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ko.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd64f035328b88bb4389ee820bb6d2bed510e0e4259cc4f38a0f573d2c003c2
3
+ size 2491144
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.zh-Hans.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7781b5bc9b2c3f45764842bf551a6e039ddef4f1bafd85ce12446834a26dd241
3
+ size 10841058
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ar.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfb1c2be8bf13e5989a95b5f401f92aaad6cadde8ecc704ebaf9b9578bb359a2
3
+ size 2145294
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.de.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:554a365ce0da76ae5d93642b496bb1bc3d8d78c1112523545a2219f7fe213a91
3
+ size 10978507
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.en.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c349b2602379affd0aa388d75addece67a14d0afaaf5b4980c90e9cc875e8e
3
+ size 5261108
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.es.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e829c4c544a9e8d98701b3d3bf1e3593b63e59ab5ba244c1ab376f6002fbd0f9
3
+ size 6853004
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fa.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908327a56262f721590d9479faa579156ba8bd155242262943797be697bc2655
3
+ size 1058478
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fr.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f02e17dfe25c4c1526c8adee812a7141d92ccbd3b1160e7c73fc325d9fbfe4e
3
+ size 6385085
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ja.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0abf3a91ddeeaa12d4732eaf1b4ff2a207b3d85fc54a079b4ac853696d831148
3
+ size 2529096
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ko.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd64f035328b88bb4389ee820bb6d2bed510e0e4259cc4f38a0f573d2c003c2
3
+ size 2491144
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.zh-Hans.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7781b5bc9b2c3f45764842bf551a6e039ddef4f1bafd85ce12446834a26dd241
3
+ size 10841058
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ar.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b56af2e07e0c6ae80ed6c212d92a11eaad7dc654c187c7471738ba3c830a588
3
+ size 20780798
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.de.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00492605965dd0637b79fe80e3d2428065cba551a9a7198bd7a0b505ce85d81b
3
+ size 2751629
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.en.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e823bebc4f8f42e03b8e621baa23b07072a4199eb0fd293e92d11c96003f3433
3
+ size 163424
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.es.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3003793b062ae28b5b4f202b8f0d9f725e46f024acc38f7f9ef08e8b3381fc0
3
+ size 2030664
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fa.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c07f75c1eb80e59bab44b7b6ced9aec1404dbf56a5abd85779846c83974a7de
3
+ size 18041636
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fr.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:341e747d07dd8276b90de8c7d725a45e10d39084bc819ffd54cab6460ddcba63
3
+ size 3129632
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ja.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b46c604a75d0288f253c3439a2a7333c38e900ebb42ba39dd1c2ecbe4229f304
3
+ size 6425383
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ko.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeab167e9f566512c3065d362e720f1930bd51ca5b9c14c207a252fa9380e7fa
3
+ size 15893128
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.zh-Hans.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e851ffd1f4f9bf8949cb0e77cc15ea65223fe4a54ac5a13ec9e43c27a550388f
3
+ size 10563259
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ar.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b56af2e07e0c6ae80ed6c212d92a11eaad7dc654c187c7471738ba3c830a588
3
+ size 20780798
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.de.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00492605965dd0637b79fe80e3d2428065cba551a9a7198bd7a0b505ce85d81b
3
+ size 2751629
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.en.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e823bebc4f8f42e03b8e621baa23b07072a4199eb0fd293e92d11c96003f3433
3
+ size 163424
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.es.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3003793b062ae28b5b4f202b8f0d9f725e46f024acc38f7f9ef08e8b3381fc0
3
+ size 2030664
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fa.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c07f75c1eb80e59bab44b7b6ced9aec1404dbf56a5abd85779846c83974a7de
3
+ size 18041636
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fr.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:341e747d07dd8276b90de8c7d725a45e10d39084bc819ffd54cab6460ddcba63
3
+ size 3129632
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ja.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b46c604a75d0288f253c3439a2a7333c38e900ebb42ba39dd1c2ecbe4229f304
3
+ size 6425383
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ko.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeab167e9f566512c3065d362e720f1930bd51ca5b9c14c207a252fa9380e7fa
3
+ size 15893128
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.zh-Hans.diff.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e851ffd1f4f9bf8949cb0e77cc15ea65223fe4a54ac5a13ec9e43c27a550388f
3
+ size 10563259