Spaces:
Runtime error
Runtime error
File size: 4,433 Bytes
5357bd8 61a3b2f bee3802 2b73d7a 0a9a631 8d000e9 176f915 71c7fc4 0a9a631 ece3f89 7760bbc ece3f89 b9f9278 176f915 7760bbc 2b73d7a b9f9278 2b73d7a 7760bbc 176f915 66b3df6 7760bbc ece3f89 b9f9278 2b73d7a 66b3df6 ab010ed 176f915 27c0e8b 71c7fc4 91eabe4 00fb4c8 91eabe4 bd02afc 3f8c44e 778be61 bd02afc 778be61 3f8c44e 778be61 bd02afc ece3f89 7760bbc 80ccea0 b613c61 7760bbc 80ccea0 3567a04 bd02afc 6f9d03f 7760bbc ece3f89 9f4f9aa 7760bbc ece3f89 fcbfd45 7760bbc 2b73d7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
from gradio import Interface
import gradio as gr
import aranizer
from aranizer import (
aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k,
aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
)
from transformers import AutoTokenizer, logging
from huggingface_hub import login
import os
# Read the Hugging Face access token from the HF_TOKEN environment variable.
HF_TOKEN = os.getenv('HF_TOKEN')
if HF_TOKEN:
    # Drop leading/trailing whitespace or newlines that may surround the value.
    HF_TOKEN = HF_TOKEN.strip()
# Re-check after stripping: a whitespace-only value is now "" and must not be
# handed to login() as if it were a real token.
if HF_TOKEN:
    login(token=HF_TOKEN)
# Load the additional tokenizers from the Hugging Face Hub via transformers.
# A failure here is not caught and stops the script.
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# The gated tokenizer is optional: on any failure it is recorded as None so
# the rest of the script can leave it out.
try:
    meta_llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
except Exception as e:
    meta_llama_tokenizer = None
    # `logging` is the module imported from transformers, not the stdlib one;
    # it hands out loggers through get_logger() and has no module-level
    # warning() function, so calling logging.warning() here would itself fail.
    logging.get_logger(__name__).warning(
        "Could not load meta-llama/Meta-Llama-3-8B tokenizer: %s", e
    )
# Names offered for selection, in display order. The position of a name in
# this list is the index that compare_tokenizers() uses to look it up.
tokenizer_options = [
    "aranizer_bpe50k",
    "aranizer_bpe64k",
    "aranizer_bpe86k",
    "aranizer_sp32k",
    "aranizer_sp50k",
    "aranizer_sp64k",
    "aranizer_sp86k",
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b",
    "aubmindlab/bert-base-arabertv2",
]

# The gated tokenizer is only listed when it was actually loaded.
if meta_llama_tokenizer:
    tokenizer_options += ["meta-llama/Meta-Llama-3-8B"]
# Registry: tokenizer name -> zero-argument callable returning the tokenizer.
# The aranizer entries store each module's own get_tokenizer function.
tokenizers = {
    name: module.get_tokenizer
    for name, module in (
        ("aranizer_bpe50k", aranizer_bpe50k),
        ("aranizer_bpe64k", aranizer_bpe64k),
        ("aranizer_bpe86k", aranizer_bpe86k),
        ("aranizer_sp32k", aranizer_sp32k),
        ("aranizer_sp50k", aranizer_sp50k),
        ("aranizer_sp64k", aranizer_sp64k),
        ("aranizer_sp86k", aranizer_sp86k),
    )
}

# The transformers tokenizers are already loaded; wrap each one in a lambda so
# every registry value is called the same way.
tokenizers.update({
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
    "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer,
})

# Register the gated tokenizer only when it was actually loaded.
if meta_llama_tokenizer:
    tokenizers.update({"meta-llama/Meta-Llama-3-8B": lambda: meta_llama_tokenizer})
def compare_tokenizers(tokenizer_index, text):
    """Tokenize *text* with the selected tokenizer and render the result as HTML.

    Args:
        tokenizer_index: Position of the chosen tokenizer in
            ``tokenizer_options``, or ``None`` when no tokenizer is selected.
        text: The text to tokenize; ``None`` is treated as an empty string.

    Returns:
        An HTML fragment showing the tokenizer name, the tokens, the encoded
        ids and the decoded text. When ``tokenizer_index`` is ``None`` a short
        hint asking the user to pick a tokenizer is returned instead.
    """
    # Function-scope import keeps the block self-contained.
    from html import escape

    # NOTE(review): the dropdown reports its selection by index and the
    # interface is live, so this presumably runs with None before anything has
    # been picked -- confirm against the Gradio version in use.
    if tokenizer_index is None:
        return (
            "<div style='font-family: Arial, sans-serif;'>"
            "<p>Please select a tokenizer.</p>"
            "</div>"
        )
    if text is None:
        text = ""

    tokenizer_name = tokenizer_options[tokenizer_index]
    tokenizer = tokenizers[tokenizer_name]()

    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)

    # Byte tokens must be *decoded* (bytes has no .encode()); undecodable
    # bytes are replaced instead of raising so the page still renders.
    tokens_display = [
        token.decode('utf-8', errors='replace') if isinstance(token, bytes) else str(token)
        for token in tokens
    ]

    # Everything derived from user input or the tokenizer is escaped before it
    # is embedded in markup; otherwise a token such as "<s>" would be
    # interpreted as an HTML tag instead of being displayed.
    tokens_html = "".join([
        f"<span style='background-color:#eeeeee; color: #333333; padding:4px; margin:2px; border-radius:3px; border:1px solid #cccccc;'>{escape(token)}</span>"
        for token in tokens_display
    ])
    encoded_html = "".join([
        f"<span style='background-color:#e0e0e0; color: #000000; padding:4px; margin:2px; border-radius:3px; border:1px solid #aaaaaa;'>{escape(str(token_id))}</span>"
        for token_id in encoded_output
    ])
    decoded_html = f"<div style='background-color:#f5f5f5; color: #444444; padding:10px; border-radius:3px; border:1px solid #999999;'>{escape(str(decoded_text))}</div>"

    results_html = f"""
    <div style='font-family: Arial, sans-serif;'>
    <h3 style='color: #2e7d32;'>Tokenizer: {escape(str(tokenizer_name))}</h3>
    <p><strong>Tokens:</strong> {tokens_html}</p>
    <p><strong>Encoded:</strong> {encoded_html}</p>
    <p><strong>Decoded:</strong> {decoded_html}</p>
    </div>
    """
    return results_html
# Input widgets: a dropdown that reports the chosen tokenizer by its index in
# tokenizer_options, and a two-line text box for the text to tokenize.
inputs_component = [
    gr.Dropdown(
        choices=tokenizer_options,
        label="Select Tokenizer",
        type="index",
    ),
    gr.Textbox(
        lines=2,
        placeholder="اكتب النص هنا...",
        label="Input Text",
    ),
]

# Output widget: the HTML fragment returned by compare_tokenizers.
outputs_component = gr.HTML(label="Results")

# Connect the widgets to compare_tokenizers; live mode is enabled.
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Arabic Tokenizer Arena",
    live=True,
)

# Start the Gradio app.
iface.launch()
|