File size: 1,790 Bytes
82fa7ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4b7270
 
82fa7ce
b4b7270
 
82fa7ce
 
 
 
 
 
 
 
 
 
 
952d239
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

import gradio as gr
from transformers import AutoTokenizer

# Registry of available tokenizers: maps the human-readable dropdown label
# to the Hugging Face Hub model id used to load it.
tokenizers = {
    label: model_id
    for label, model_id in [
        ("English - BERT (bert-base-uncased)", "bert-base-uncased"),
        ("Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)", "CAMeL-Lab/bert-base-arabic-camelbert-ca"),
        ("Arabic - AraBERT (asafaya/bert-base-arabic)", "asafaya/bert-base-arabic"),
    ]
}

# Cache of already-loaded tokenizers, keyed by dropdown label.
# The original code called AutoTokenizer.from_pretrained on EVERY request,
# re-resolving (and potentially re-downloading) the model files each time;
# memoizing makes repeat calls for the same model effectively free.
_tokenizer_cache = {}


def _get_tokenizer(model_name):
    """Return the tokenizer for dropdown label *model_name*, loading it once."""
    if model_name not in _tokenizer_cache:
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(tokenizers[model_name])
    return _tokenizer_cache[model_name]


def tokenize_text(text, model_name):
    """Tokenize *text* with the tokenizer selected in the UI.

    Parameters:
        text: The sentence to tokenize.
        model_name: A key of the module-level ``tokenizers`` dict
            (the dropdown label chosen by the user).

    Returns:
        A pair of display strings: the token list and the token count.
    """
    tokenizer = _get_tokenizer(model_name)
    tokens = tokenizer.tokenize(text)
    return f"Tokens: {tokens}", f"Number of tokens: {len(tokens)}"

# Input widgets: a dropdown to pick the tokenizer (pre-selected to the
# English BERT entry) and a free-text box for the sentence to tokenize.
model_choice = gr.Dropdown(
    label="Select Tokenizer",
    choices=list(tokenizers.keys()),
    value="English - BERT (bert-base-uncased)",
)
text_input = gr.Textbox(label="Enter a sentence to tokenize")



# Pre-filled example rows: each pairs a sample sentence with the tokenizer
# label that matches its language.
_english_example = ["The quick brown fox jumps over the lazy dog.", "English - BERT (bert-base-uncased)"]
_arabic_example = ["القمر جميل في السماء.", "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)"]
examples = [_english_example, _arabic_example]

# Set up the Gradio interface wiring the tokenizer function to the widgets.
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[text_input, model_choice],
    outputs=[gr.Textbox(label="Tokens"), gr.Textbox(label="Number of Tokens")],
    title="Hugging Face Tokenizer Explorer",
    description="Enter a sentence or use one of the example sentences below to see how different tokenizers work.",
    examples=examples,  # rows pre-fill both the text input and the dropdown
    # Gradio documents allow_flagging as the strings "never"/"manual"/"auto";
    # the boolean False is a deprecated legacy spelling that newer Gradio
    # releases warn on or reject. "never" is the documented equivalent.
    allow_flagging="never",
)