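# UNESCO Language Translator: a Gradio Space that translates text between the
# FLORES-200 languages using Meta's NLLB-200 (3.3B) model.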
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk

# Sentence-tokenizer data used to split paragraphs into sentences before translation.
nltk.download("punkt_tab")

# Languages offered as sources but excluded as translation targets.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# Use CPU on macOS (no CUDA), CUDA everywhere else.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Map human-readable language names to FLORES-200 codes, sorted by code, and build the dropdown choices.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())
target_languages = [language for language in flores_codes if language not in REMOVED_TARGET_LANGUAGES]

# Load the translation model once at startup and keep it on `device`.
def load_model():
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
    print(f"Model loaded on {device}")
    return model

model = load_model()

def load_tokenizer(src_lang, tgt_lang):
    # The NLLB tokenizer is configured with the FLORES-200 codes of the
    # source and target languages selected in the UI.
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME, src_lang=code_mapping[src_lang], tgt_lang=code_mapping[tgt_lang]
    )
    return tokenizer

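# The Space runs on ZeroGPU ("Running on Zero") and imports `spaces` above, so the
# inference entry point is assumed to carry the `spaces.GPU` decorator (not present
# in the scraped source), which attaches a GPU for the duration of each call.
@spaces.GPU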
def translate(text: str, src_lang: str, tgt_lang: str):
    tokenizer = load_tokenizer(src_lang, tgt_lang)
    # Translate paragraph by paragraph (preserving line breaks), and sentence by
    # sentence within each paragraph to keep inputs short for the model.
    paragraphs = text.split("\n")
    translated_paragraphs = []
    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph)
        translated_sentences = []
        for sentence in sentences:
            input_tokens = (
                tokenizer(sentence, return_tensors="pt")
                .input_ids[0]
                .cpu()
                .numpy()
                .tolist()
            )
            # Force the decoder to start with the target-language token and cap
            # the output length relative to the input.
            translated_chunk = model.generate(
                input_ids=torch.tensor([input_tokens]).to(device),
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang]),
                max_length=len(input_tokens) + 50,
                num_return_sequences=1,
            )
            translated_chunk = tokenizer.decode(
                translated_chunk[0], skip_special_tokens=True
            )
            translated_sentences.append(translated_chunk)
        translated_paragraph = " ".join(translated_sentences)
        translated_paragraphs.append(translated_paragraph)
    return "\n".join(translated_paragraphs)
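
# Example (language names are assumed to match keys in `code_mapping`):
#   translate("Hello world.\nHow are you?", "English", "French")
# returns the French text with the paragraph break preserved.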
description = """
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
"""

with gr.Blocks() as demo:
    gr.Markdown("# UNESCO Language Translator, powered by Meta and Hugging Face")
    gr.Markdown(description)
    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()