""" File: app.py
Description: Translate text...
Author: Didier Guillevic
Date: 2024-09-07
"""

# NOTE(review): `spaces` is not referenced below and is deliberately kept as
# the first import; presumably the Hugging Face Spaces runtime needs it loaded
# before the model libraries - confirm before moving or removing it.
import spaces

import logging
from typing import Tuple
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

import gradio as gr
import langdetect
from deep_translator import GoogleTranslator

from model_spacy import nlp_xx
import model_translation

# Single shared M2M100 model instance, created once at import time.
m2m100 = model_translation.ModelM2M100()


def translate_with_Helsinki(
        chunks, src_lang, tgt_lang, input_max_length, output_max_length) -> str:
    """Translate the chunks with the Helsinki model.

    Args:
        chunks: indexable sequence of text chunks; each one is translated
            separately.
        src_lang: source language code; must be listed in
            ``model_translation.src_langs``.
        tgt_lang: kept for interface compatibility; not used by this function.
        input_max_length: maximum token length for the tokenizer (longer
            inputs are truncated).
        output_max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The translated chunks joined with newlines, an empty string when
        there are no chunks, or an "ISSUE: ..." message when no model is
        available for ``src_lang``.
    """
    # NOTE(review): the original code referenced an undefined name
    # `translation`; `model_translation` is the only translation module
    # imported here. Confirm it exposes `src_langs` and
    # `get_tokenizer_model_for_src_lang`.
    if src_lang not in model_translation.src_langs:
        return (
            f"ISSUE: currently no model for language '{src_lang}'. "
            "If wrong language, please specify language."
        )

    # Nothing to translate (also avoids indexing chunks[0] below).
    if not chunks:
        return ''

    logger.info("LANG: %s, TEXT: %s...", src_lang, chunks[0][:50])

    tokenizer, model = model_translation.get_tokenizer_model_for_src_lang(
        src_lang)

    translated_chunks = []
    for chunk in chunks:
        # NOTE: The 'fa' (Persian) model has multiple target languages to choose from.
        # We need to specify the desired language among: fra ita por ron spa
        # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fa-itc
        # Prepend text with >>fra<< in order to translate in French.
        if src_lang == 'fa':
            chunk = ">>fra<< " + chunk

        inputs = tokenizer(
            chunk, return_tensors="pt", max_length=input_max_length,
            truncation=True, padding="longest").to(model.device)
        outputs = model.generate(**inputs, max_length=output_max_length)
        translated_chunk = tokenizer.batch_decode(
            outputs, skip_special_tokens=True)[0]
        translated_chunks.append(translated_chunk)

    return '\n'.join(translated_chunks)


def translate_text(
    text: str,
    src_lang: str,
    tgt_lang: str
) -> Tuple[str, str]:
    """Translate the given text into English or French.

    Args:
        text: the text to translate.
        src_lang: source language code; when empty, None or "auto", the
            language is detected with ``langdetect``.
        tgt_lang: target language code; falls back to 'en' when it is not
            one of ``model_translation.tgt_language_codes`` values.

    Returns:
        A pair ``(m2m100_translation, google_translation)``. Both are empty
        strings when ``text`` is empty or only whitespace.
    """
    # Empty input: nothing to translate, and language detection would fail
    # on a text without any features.
    if not text or not text.strip():
        return ("", "")

    # src_lang among the supported languages?
    # - make sure src_lang is not None
    if not src_lang or src_lang == "auto":
        src_lang = langdetect.detect(text)
    if src_lang not in model_translation.language_codes.values():
        # NOTE(review): only logged; the M2M100 call below is still attempted
        # with the unsupported code, as before. Decide whether to bail out.
        logger.error(
            "Language detected %s not among supported language", src_lang)

    # tgt_lang: make sure it is not None. Default to 'en' if not set.
    if tgt_lang not in model_translation.tgt_language_codes.values():
        tgt_lang = 'en'

    # translate
    translated_text_m2m100 = m2m100.translate(text, src_lang, tgt_lang)
    # Use the same target language as M2M100 (it used to be hard-coded to
    # 'en', which ignored the "Target language" selection in the UI).
    translated_text_google_translate = GoogleTranslator(
        source='auto', target=tgt_lang).translate(text=text)

    return (
        translated_text_m2m100,
        translated_text_google_translate
    )


#
# User interface
#
with gr.Blocks() as demo:

    gr.Markdown("""
    ## Text translation v0.0.3
    """)

    # Input
    input_text = gr.Textbox(
        lines=5,
        placeholder="Enter text to translate",
        label="Text to translate",
        render=True
    )

    # Output
    output_text_m2m100 = gr.Textbox(
        lines=4,
        label="Facebook m2m100 (418M)",
        render=True
    )
    output_text_google_translate = gr.Textbox(
        lines=4,
        label="Google Translate",
        render=True
    )

    # Source and target languages
    with gr.Row():
        src_lang = gr.Dropdown(
            choices=model_translation.language_codes.items(),
            value="auto",
            label="Source language",
            render=True
        )
        tgt_lang = gr.Dropdown(
            choices=model_translation.tgt_language_codes.items(),
            value="en",
            label="Target language",
            render=True
        )

    # Submit button
    translate_btn = gr.Button("Translate")
    translate_btn.click(
        fn=translate_text,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=[output_text_m2m100, output_text_google_translate]
    )

    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - Models: serving Facebook M2M100 (418M) and Google Translate.
        """)

if __name__ == "__main__":
    demo.launch()