Spaces:
Running
Running
add doc
Browse files- compression_app.py +10 -6
- compression_util.py +6 -3
compression_app.py
CHANGED
@@ -43,12 +43,12 @@ Lossless tokenization preserves the exact original text, i.e. `decoded_text = in
|
|
43 |
|
44 |
- **Compression Rate** <br>
|
45 |
There are mainly two types of metric to represent the `input_text`:
|
46 |
-
- `
|
47 |
-
- `
|
48 |
|
49 |
-
To evaluate compression rate, simple metrics can be "how many
|
50 |
-
In this leaderboard, we adopt more frequently used metric: "how many
|
51 |
-
per
|
52 |
💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
|
53 |
"""
|
54 |
|
@@ -141,7 +141,11 @@ with gr.Blocks(theme=theme) as demo:
|
|
141 |
"You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
|
142 |
)
|
143 |
|
144 |
-
gr.Markdown("## 🏆 Compression Rate Leaderboard"
|
|
|
|
|
|
|
|
|
145 |
search_bar = gr.Textbox(
|
146 |
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
|
147 |
show_label=False,
|
|
|
43 |
|
44 |
- **Compression Rate** <br>
|
45 |
There are mainly two types of metric to represent the `input_text`:
|
46 |
+
- `char-level`: the number of characters in the given text
|
47 |
+
- `byte-level`: the number of bytes in the given text.
|
48 |
|
49 |
+
To evaluate compression rate, simple metrics can be "how many chars per token" or "how many bytes per token". <br>
|
50 |
+
In this leaderboard, we adopt the more frequently used metrics: "how many chars per token" and
|
51 |
+
"how many billion tokens per gigabytes corpus", i.e. `char/token` and `b_tokens/g_bytes`.
|
52 |
💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
|
53 |
"""
|
54 |
|
|
|
141 |
"You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
|
142 |
)
|
143 |
|
144 |
+
gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
|
145 |
+
"The leaderboard aims to evaluate tokenizer performance on different languages.\n"
|
146 |
+
"Lower `oov_ratio` refers to fewer out-of-vocabulary tokens.\n"
|
147 |
+
"Higher `char/token` means fewer words are segmented into subwords."
|
148 |
+
)
|
149 |
search_bar = gr.Textbox(
|
150 |
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
|
151 |
show_label=False,
|
compression_util.py
CHANGED
@@ -295,9 +295,12 @@ def get_compression_leaderboard(
|
|
295 |
if return_type == "dataframe":
|
296 |
token_number_unit, file_size_unit = unit.split("/")
|
297 |
reverse_unit = f"{file_size_unit}/{token_number_unit}"
|
298 |
-
stats = to_dataframe(stats, [
|
299 |
-
stats = stats.sort_values(["oov_ratio",
|
300 |
-
|
|
|
|
|
|
|
301 |
return stats
|
302 |
|
303 |
|
|
|
295 |
if return_type == "dataframe":
|
296 |
token_number_unit, file_size_unit = unit.split("/")
|
297 |
reverse_unit = f"{file_size_unit}/{token_number_unit}"
|
298 |
+
stats = to_dataframe(stats, ["char/token", unit, reverse_unit])
|
299 |
+
stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
|
300 |
+
|
301 |
+
# stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
|
302 |
+
|
303 |
+
stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={"char/token": ' ⬇️char/token'}) #
|
304 |
return stats
|
305 |
|
306 |
|