import html
import logging
import os

import gradio as gr
from aranizer import (
    aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k,
    aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
)
from huggingface_hub import login
from transformers import AutoTokenizer

# Retrieve your Hugging Face token from the environment variable
HF_TOKEN = os.getenv('HF_TOKEN')

if HF_TOKEN:
    HF_TOKEN = HF_TOKEN.strip()  # Remove any leading or trailing whitespace/newlines
    login(token=HF_TOKEN)

# Pre-load the public Hub tokenizers once at startup
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# Meta-Llama-3 is gated on the Hub; loading it fails without an accepted
# license agreement and a valid token, so fall back gracefully if unavailable
try:
    meta_llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
except Exception as e:
    meta_llama_tokenizer = None
    logging.warning(f"Could not load meta-llama/Meta-Llama-3-8B tokenizer: {e}")

# Tokenizer names offered in the UI dropdown
tokenizer_options = [
    "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
    "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b",
    "aubmindlab/bert-base-arabertv2"
]

if meta_llama_tokenizer:
    tokenizer_options.append("meta-llama/Meta-Llama-3-8B")

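# Map each name to a zero-argument loader: the aranizer modules expose
# get_tokenizer(), and the pre-loaded transformers tokenizers are wrapped in
# lambdas so every entry shares the same call convention.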
tokenizers = {
    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
    "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer
}

if meta_llama_tokenizer:
    tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer

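# Tokenize, encode, and decode the input with the selected tokenizer, then
# render all three views as styled HTML for the Gradio output component.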
def compare_tokenizers(tokenizer_index, text):
    tokenizer_name = tokenizer_options[tokenizer_index]
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)

    # Some tokenizers return bytes tokens; decode them so they display correctly
    tokens_display = [token.decode('utf-8', errors='replace') if isinstance(token, bytes) else token for token in tokens]

    # Render the results as HTML, escaping token and decoded text so that
    # markup-like characters in the input cannot break the page
    tokens_html = "".join([
        f"<span style='background-color:#eeeeee; color: #333333; padding:4px; margin:2px; border-radius:3px; border:1px solid #cccccc;'>{html.escape(token)}</span>"
        for token in tokens_display
    ])
    encoded_html = "".join([
        f"<span style='background-color:#e0e0e0; color: #000000; padding:4px; margin:2px; border-radius:3px; border:1px solid #aaaaaa;'>{token_id}</span>"
        for token_id in encoded_output
    ])
    decoded_html = f"<div style='background-color:#f5f5f5; color: #444444; padding:10px; border-radius:3px; border:1px solid #999999;'>{html.escape(decoded_text)}</div>"

    results_html = f"""
    <div style='font-family: Arial, sans-serif;'>
        <h3 style='color: #2e7d32;'>Tokenizer: {tokenizer_name}</h3>
        <p><strong>Tokens:</strong> {tokens_html}</p>
        <p><strong>Encoded:</strong> {encoded_html}</p>
        <p><strong>Decoded:</strong> {decoded_html}</p>
    </div>
    """
    return results_html

# Define the Gradio interface components with a dropdown for model selection
inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer", type="index"),
    gr.Textbox(lines=2, placeholder="اكتب النص هنا...", label="Input Text")
]

outputs_component = gr.HTML(label="Results")

# Setting up the interface
iface = gr.Interface(
    fn=compare_tokenizers, 
    inputs=inputs_component, 
    outputs=outputs_component, 
    title="Arabic Tokenizer Arena",
    live=True
)

# Launching the Gradio app
iface.launch()
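
# Note: launch() starts a local server and blocks; when this file runs as a
# Hugging Face Space, the platform serves the app automatically.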