"""
File: app.py
Description: Translate text...
Author: Didier Guillevic
Date: 2024-09-07
"""
import spaces
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
import gradio as gr
import langdetect
from deep_translator import GoogleTranslator
from model_spacy import nlp_xx
import model_translation
m2m100 = model_translation.ModelM2M100()
def translate_with_Helsinki(
chunks, src_lang, tgt_lang, input_max_length, output_max_length) -> str:
"""Translate the chunks with the Helsinki model
"""
    if src_lang not in model_translation.src_langs:
        return (
            f"ISSUE: currently no model for language '{src_lang}'. "
            "If the detected language is wrong, please specify the source language."
        )
logger.info(f"LANG: {src_lang}, TEXT: {chunks[0][:50]}...")
    tokenizer, model = model_translation.get_tokenizer_model_for_src_lang(src_lang)
translated_chunks = []
for chunk in chunks:
        # NOTE: The 'fa' (Persian) model has multiple target languages to choose from.
        # We need to specify the desired language among: fra ita por ron spa
        # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fa-itc
        # Prepend the text with >>fra<< in order to translate into French.
if src_lang == 'fa':
chunk = ">>fra<< " + chunk
inputs = tokenizer(
chunk, return_tensors="pt", max_length=input_max_length,
truncation=True, padding="longest").to(model.device)
outputs = model.generate(**inputs, max_length=output_max_length)
translated_chunk = tokenizer.batch_decode(
outputs, skip_special_tokens=True)[0]
#logger.info(f"Text: {chunk}")
#logger.info(f"Translation: {translated_chunk}")
translated_chunks.append(translated_chunk)
return '\n'.join(translated_chunks)
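
# Illustrative helper (an assumption, not part of the original app flow):
# translate_with_Helsinki() expects its input already split into chunks, and
# nothing in this file builds them. Assuming nlp_xx is a spaCy pipeline with
# sentence segmentation enabled, one possible way to produce such chunks is
# to group sentences up to a rough character budget.
def split_into_chunks(text: str, max_chars: int = 1_000) -> list[str]:
    """Split text into sentence-aligned chunks of at most ~max_chars characters."""
    doc = nlp_xx(text)
    chunks, current = [], ""
    for sent in doc.sents:
        sentence = sent.text.strip()
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks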
def translate_text(
text: str,
src_lang: str,
tgt_lang: str
    ) -> tuple[str, str]:
"""Translate the given text into English or French
"""
# src_lang among the supported languages?
# - make sure src_lang is not None
src_lang = src_lang if (src_lang and src_lang != "auto") else langdetect.detect(text)
if src_lang not in model_translation.language_codes.values():
logging.error(f"Language detected {src_lang} not among supported language")
    # tgt_lang: default to 'en' if not set or not among the supported target languages.
if tgt_lang not in model_translation.tgt_language_codes.values():
tgt_lang = 'en'
# translate
translated_text_m2m100 = m2m100.translate(text, src_lang, tgt_lang)
    translated_text_google_translate = GoogleTranslator(
        source='auto', target=tgt_lang).translate(text=text)
return (
translated_text_m2m100,
translated_text_google_translate
)
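
# Optional hardening sketch (an assumption, not wired into translate_text above):
# langdetect.detect() raises LangDetectException on empty or non-linguistic
# input, so a guarded variant with a fallback language could look like this.
def detect_language_or_default(text: str, default: str = "en") -> str:
    """Best-effort language detection that falls back to a default code."""
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        logger.warning("Language detection failed; falling back to '%s'", default)
        return default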
#
# User interface
#
with gr.Blocks() as demo:
gr.Markdown("""
## Text translation v0.0.3
""")
# Input
input_text = gr.Textbox(
lines=5,
placeholder="Enter text to translate",
label="Text to translate",
render=True
)
# Output
output_text_m2m100 = gr.Textbox(
lines=4,
label="Facebook m2m100 (418M)",
render=True
)
output_text_google_translate = gr.Textbox(
lines=4,
label="Google Translate",
render=True
)
# Source and target languages
with gr.Row():
src_lang = gr.Dropdown(
        choices=list(model_translation.language_codes.items()),
value="auto",
label="Source language",
render=True
)
tgt_lang = gr.Dropdown(
        choices=list(model_translation.tgt_language_codes.items()),
value="en",
label="Target language",
render=True
)
# Submit button
translate_btn = gr.Button("Translate")
translate_btn.click(
fn=translate_text,
inputs=[input_text, src_lang, tgt_lang],
outputs=[output_text_m2m100, output_text_google_translate]
)
with gr.Accordion("Documentation", open=False):
gr.Markdown("""
- Models: serving Facebook M2M100 (418M) and Google Translate.
""")
if __name__ == "__main__":
demo.launch()