import gradio as gr
from transformers import AutoTokenizer

# Map human-readable names to Hugging Face model IDs for English and Arabic tokenizers
# (asafaya/bert-base-arabic is the "Arabic BERT" model, not AraBERT)
tokenizers = {
    "English - BERT (bert-base-uncased)": "bert-base-uncased",
    "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)": "CAMeL-Lab/bert-base-arabic-camelbert-ca",
    "Arabic - Arabic BERT (asafaya/bert-base-arabic)": "asafaya/bert-base-arabic"
}

# Tokenization function
def tokenize_text(text, model_name):
    # Load the selected tokenizer (the download is cached on disk after the first call,
    # but the tokenizer object itself is re-instantiated on every request)
    tokenizer = AutoTokenizer.from_pretrained(tokenizers[model_name])
    # Tokenize the input text
    tokens = tokenizer.tokenize(text)
    # Return the tokens and their count
    return f"Tokens: {tokens}", f"Number of tokens: {len(tokens)}"

# Define Gradio interface components
model_choice = gr.Dropdown(
    choices=list(tokenizers.keys()),
    label="Select Tokenizer",
    value="English - BERT (bert-base-uncased)"
)
text_input = gr.Textbox(label="Enter a sentence to tokenize")

# Predefined example sentences paired with the corresponding tokenizers
examples = [
    ["The quick brown fox jumps over the lazy dog.", "English - BERT (bert-base-uncased)"],  # English sentence with English tokenizer
    ["القمر جميل في السماء.", "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)"]  # Arabic sentence ("The moon is beautiful in the sky.") with Arabic tokenizer
]

# Set up the Gradio interface
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[text_input, model_choice],
    outputs=[gr.Textbox(label="Tokens"), gr.Textbox(label="Number of Tokens")],  # Properly named outputs
    title="Hugging Face Tokenizer Explorer",
    description="Enter a sentence or use one of the example sentences below to see how different tokenizers work.",
    examples=examples,  # Examples fill both the text input and the tokenizer dropdown
    allow_flagging="never",  # allow_flagging expects a string ("never"/"auto"/"manual"), not a boolean
    cache_examples=False  # Disable example caching so examples run the live function
)
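
# A minimal way to run the app locally (an assumption about how this script is
# invoked; Gradio's default launch() serves on http://127.0.0.1:7860, and the
# tokenizer weights download from the Hugging Face Hub on the first request,
# so that request may be slow):
if __name__ == "__main__":
    demo.launch()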