Spaces:

anyantudre
/

moore-language-translation-tts-stt

Runtime error

App Files Files Community

anyantudre committed on Apr 8

Commit

e41ca58

•

1 Parent(s): 7f1969e

Upload 5 files

Browse files

Files changed (5) hide show

app.py +56 -0
requirements.txt +8 -0
speech_to_text.py +46 -0
text_to_speech.py +40 -0
translation.py +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import torch
+import scipy
+import gradio as gr
+from transformers import set_seed, pipeline
+from transformers import VitsTokenizer, VitsModel
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+from datasets import load_dataset, Audio
+import speech_to_text, text_to_speech, translation
+language_list = ['mos', 'fra', 'eng']
+demo = gr.Blocks()
+mms_stt = gr.Interface(
+    fn=speech_to_text.transcribe,
+    inputs=[
+        gr.Audio(sources=["microphone", "upload"], type="filepath"),
+        gr.Dropdown(language_list, label="Language")
+    ],
+    outputs="text",
+    title="Speech-to-text"
+)
+mms_tts = gr.Interface(
+    fn=text_to_speech.synthesize_facebook,
+    inputs=[
+        gr.Text(label="Input text"),
+        gr.Dropdown(language_list, label="Language")
+    ],
+    outputs=[
+        gr.Audio(label="Generated Audio", type="numpy")
+    ],
+    title="Text-to-speech"
+)
+mms_translate = gr.Interface(
+    fn=translation.translation,
+    inputs=[
+        gr.Textbox(label="Text", placeholder="Yaa sõama"),
+        gr.Dropdown(label="Source Language", choices=["eng_Latn", "fra_Latn", "mos_Latn"]),
+        gr.Dropdown(label="Target Language", choices=["eng_Latn", "fra_Latn", "mos_Latn"])
+    ],
+    outputs=["text"],
+    examples=[["Building a translation demo with Gradio is so easy!", "eng_Latn", "mos_Latn"]],
+    title="Translation Demo",
+)
+with demo:
+    gr.TabbedInterface(
+        [mms_translate, mms_tts, mms_stt],
+        ["Translation", "Text-to-speech", "Speech-to-text"],
+    )
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+datasets
+librosa
+pycountry
+scipy
+sentencepiece
+transformers
+torch
+gradio

speech_to_text.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import librosa
+import torch
+from transformers import Wav2Vec2ForCTC, AutoProcessor
+from transformers import set_seed
+import time
+def transcribe(fp:str, target_lang:str) -> str:
+    '''
+    For given audio file, transcribe it.
+    Parameters
+    ----------
+    fp: str
+        The file path to the audio file.
+    target_lang:str
+        The ISO-3 code of the target language.
+    Returns
+    ----------
+    transcript:str
+        The transcribed text.
+    '''
+    # Ensure replicability
+    set_seed(555)
+    start_time = time.time()
+    # Load transcription model
+    model_id = "facebook/mms-1b-all"
+    processor = AutoProcessor.from_pretrained(model_id, target_lang=target_lang)
+    model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True)
+    # Process the audio
+    signal, sampling_rate =  librosa.load(fp, sr=16000)
+    inputs = processor(signal, sampling_rate=16_000, return_tensors="pt")
+    # Inference
+    with torch.no_grad():
+        outputs = model(**inputs).logits
+    ids = torch.argmax(outputs, dim=-1)[0]
+    transcript = processor.decode(ids)
+    print("Time elapsed: ", int(time.time() - start_time), " seconds")
+    return transcript

text_to_speech.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import time
+import torch
+from transformers import set_seed
+from transformers import VitsTokenizer, VitsModel
+def synthesize_facebook(s:str, iso3:str) -> str:
+    '''
+    For given text, speak it.
+    Parameters
+    ----------
+    s: str
+        The written text.
+    is03:str
+        The ISO-3 code of the text's language.
+    Returns
+    ----------
+    synth:str
+        The synthesized audio.
+    '''
+    # Ensure replicability
+    set_seed(555)
+    start_time = time.time()
+    # Load synthesizer
+    tokenizer = VitsTokenizer.from_pretrained(f"facebook/mms-tts-{iso3}")
+    model = VitsModel.from_pretrained(f"facebook/mms-tts-{iso3}")
+    inputs = tokenizer(text=s, return_tensors="pt")
+    # Inference
+    with torch.no_grad():
+       outputs = model(**inputs)
+    synth = outputs.waveform[0]
+    print("Time elapsed: ", int(time.time() - start_time), " seconds")
+    return synth.numpy()

translation.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from transformers import pipeline
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+model     = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+def translation(text, src_lang, tgt_lang):
+  trans_pipe = pipeline("translation", model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, max_length=400)
+  return trans_pipe(text)[0]["translation_text"]