import gradio as gr
from transformers import AutoTokenizer
# Define tokenizers for English and Arabic
tokenizers = {
    "English - BERT (bert-base-uncased)": "bert-base-uncased",
    "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)": "CAMeL-Lab/bert-base-arabic-camelbert-ca",
    "Arabic - Arabic BERT (asafaya/bert-base-arabic)": "asafaya/bert-base-arabic"
}
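
# Optional: AutoTokenizer.from_pretrained resolves the model on every call.
# A minimal memoized loader, sketched here with the standard-library
# functools.lru_cache, would be a drop-in replacement for the
# from_pretrained call inside tokenize_text below:
from functools import lru_cache

@lru_cache(maxsize=None)
def get_tokenizer(model_id):
    # Load each tokenizer at most once per process
    return AutoTokenizer.from_pretrained(model_id)
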
# Tokenization function
def tokenize_text(text, model_name):
    # Load the selected tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizers[model_name])
    # Tokenize the input text
    tokens = tokenizer.tokenize(text)
    # Return the tokens and the token count
    return f"Tokens: {tokens}", f"Number of tokens: {len(tokens)}"
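
# Illustrative check (not part of the app): with bert-base-uncased,
# tokenize_text("Hello world!", "English - BERT (bert-base-uncased)")
# is expected to return
# ("Tokens: ['hello', 'world', '!']", "Number of tokens: 3").
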
# Define Gradio interface components
model_choice = gr.Dropdown(
    choices=list(tokenizers.keys()),
    label="Select Tokenizer",
    value="English - BERT (bert-base-uncased)"
)
text_input = gr.Textbox(label="Enter a sentence to tokenize")
# Predefined example sentences with the corresponding tokenizers
examples = [
    ["The quick brown fox jumps over the lazy dog.", "English - BERT (bert-base-uncased)"],  # English sentence with the English tokenizer
    ["القمر جميل في السماء.", "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)"]  # Arabic sentence ("The moon is beautiful in the sky.") with an Arabic tokenizer
]
# Set up the Gradio interface
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[text_input, model_choice],
    outputs=[gr.Textbox(label="Tokens"), gr.Textbox(label="Number of Tokens")],  # Labeled output boxes
    title="Hugging Face Tokenizer Explorer",
    description="Enter a sentence or use one of the example sentences below to see how different tokenizers work.",
    examples=examples,  # Each example row fills the inputs in order: [text_input, model_choice]
    allow_flagging="never",  # Gradio expects a string ("never"/"manual"/"auto") here, not a bool
    cache_examples=False  # Disable example caching so examples run live
)

# Launch the app so the Space serves the interface
demo.launch()