Spaces:

datnth1709
/

FantasticFour-S2T-MT-demo

Runtime error

App Files Files Community

datnth1709 committed on Sep 19, 2022

Commit

5c35238

•

1 Parent(s): f4a01a0

add envi translation

Browse files

Files changed (5) hide show

app.py +112 -22
en_speech_01.wav +0 -0
en_speech_02.wav +0 -0
en_speech_03.wav +0 -0
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import gradio as gr
 from transformers import pipeline
 from transformers.file_utils import cached_path, hf_bucket_url
 import os, zipfile
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 from datasets import load_dataset
 import torch
 import kenlm
@@ -12,12 +14,12 @@ from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
 """Vietnamese speech2text"""
 cache_dir = './cache/'
 processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
-model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
 lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
 lm_file = cached_path(lm_file,cache_dir=cache_dir)
 with zipfile.ZipFile(lm_file, 'r') as zip_ref:
     zip_ref.extractall(cache_dir)
-lm_file = cache_dir + 'vi_lm_4grams.bin'\
 def get_decoder_ngram_model(tokenizer, ngram_lm_path):
     vocab_dict = tokenizer.get_vocab()
@@ -56,7 +58,7 @@ def speech_file_to_array_fn(path, max_seconds=10):
     return batch
 # tokenize
-def speech2text(audio):
    # read in sound file
     # load dummy dataset and read soundfiles
     ds = speech_file_to_array_fn(audio.name)
@@ -67,57 +69,145 @@ def speech2text(audio):
           return_tensors="pt"
     ).input_values
     # decode ctc output
-    logits = model(input_values).logits[0]
     pred_ids = torch.argmax(logits, dim=-1)
     greedy_search_output = processor.decode(pred_ids)
     beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
     return beam_search_output
 """Machine translation"""
-model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
-translator = pipeline("translation", model=model_checkpoint)
 def translate_vi2en(Vietnamese):
-    return translator(Vietnamese)[0]['translation_text']
-def inference(audio):
-    vi_text = speech2text(audio)
     en_text = translate_vi2en(vi_text)
     return en_text
 """Gradio demo"""
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
                    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
                    "Nếu như một câu nói có thể khiến em vui."]
 vi_example_voice =[['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Translation: Vietnamese to English"):
             with gr.Row():
                 with gr.Column():
-                    vietnamese = gr.Textbox(label="Vietnamese Text")
-                    translate_to_english = gr.Button(value="Translate To English")
                 with gr.Column():
-                    english1 = gr.Textbox(label="English Text")
-            translate_to_english.click(lambda text: translate_vi2en(text), inputs=vietnamese, outputs=english1)
             gr.Examples(examples=vi_example_text,
-                        inputs=[vietnamese])
-        with gr.TabItem("Speech2text and translation"):
             with gr.Row():
                 with gr.Column():
-                    audio = gr.Audio(source="microphone", label="Input Audio", type="file", streaming=False)
-                    translate_button = gr.Button(value="Translate To English")
                 with gr.Column():
-                    english2 = gr.Textbox(label="English Text")
-            translate_button.click(lambda voice: inference(voice), inputs=audio, outputs=english2)
             gr.Examples(examples=vi_example_voice,
-                        inputs=[audio])
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+import nltk
+import librosa
 from transformers import pipeline
 from transformers.file_utils import cached_path, hf_bucket_url
 import os, zipfile
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Tokenizer
 from datasets import load_dataset
 import torch
 import kenlm
 """Vietnamese speech2text"""
 cache_dir = './cache/'  # directory passed as cache_dir to every download below and used as the unzip target
 processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
+vi_model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
 lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')  # location of the zipped 4-gram LM in the model repo
 lm_file = cached_path(lm_file,cache_dir=cache_dir)  # NOTE(review): hf_bucket_url/cached_path come from transformers.file_utils and look like legacy helpers; confirm the installed transformers version still provides them
 with zipfile.ZipFile(lm_file, 'r') as zip_ref:
     zip_ref.extractall(cache_dir)  # unpack the archive into the cache directory
+lm_file = cache_dir + 'vi_lm_4grams.bin'  # lm_file is rebound to the expected path of the extracted binary LM
 def get_decoder_ngram_model(tokenizer, ngram_lm_path):
     vocab_dict = tokenizer.get_vocab()
     return batch
 # tokenize
+def speech2text_vi(audio):
    # read in sound file
     # load dummy dataset and read soundfiles
     ds = speech_file_to_array_fn(audio.name)
           return_tensors="pt"
     ).input_values
     # decode ctc output
+    logits = vi_model(input_values).logits[0]
     pred_ids = torch.argmax(logits, dim=-1)
     greedy_search_output = processor.decode(pred_ids)
     beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
     return beam_search_output
+"""English speech2text"""
+nltk.download("punkt")  # presumably the data nltk.sent_tokenize needs in correct_casing; runs at import time
+# Loading the model and the tokenizer
+model_name = "facebook/wav2vec2-base-960h"
+eng_tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)  # encodes the waveform and decodes predicted ids in speech2text_en
+eng_model = Wav2Vec2ForCTC.from_pretrained(model_name)  # its logits are argmax-decoded in speech2text_en
+def load_data(input_file):
+    """ Function for resampling to ensure that the speech input is sampled at 16KHz.
+    """
+    # read the file
+    speech, sample_rate = librosa.load(input_file)
+    # make it 1-D
+    if len(speech.shape) > 1:
+        speech = speech[:, 0] + speech[:, 1]
+    # Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
+    if sample_rate != 16000:
+        speech = librosa.resample(speech, sample_rate, 16000)
+    return speech
+def correct_casing(input_sentence):
+    """ This function is for correcting the casing of the generated transcribed text
+    """
+    sentences = nltk.sent_tokenize(input_sentence)
+    return (' '.join([s.replace(s[0], s[0].capitalize(), 1) for s in sentences]))
+def speech2text_en(input_file):
+    """This function generates transcripts for the provided audio input
+    """
+    speech = load_data(input_file)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    return transcription
 """Machine translation"""
+vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"  # vi -> en checkpoint, per its name
+envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"  # en -> vi checkpoint, per its name
+vien_translator = pipeline("translation", model=vien_model_checkpoint)  # called by translate_vi2en
+envi_translator = pipeline("translation", model=envi_model_checkpoint)  # called by translate_en2vi
 def translate_vi2en(Vietnamese):
+    return vien_translator(Vietnamese)[0]['translation_text']
+def translate_en2vi(English):
+    return envi_translator(English)[0]['translation_text']
+""" Inference"""
+def inference_vien(audio):
+    vi_text = speech2text_vi(audio)
     en_text = translate_vi2en(vi_text)
     return en_text
+def inference_envi(audio):
+    en_text = speech2text_en(audio)
+    vi_text = translate_en2vi(en_text)
+    return vi_text
 """Gradio demo"""
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
                    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
                    "Nếu như một câu nói có thể khiến em vui."]  # examples offered for the Vietnamese text box
 vi_example_voice =[['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]  # examples offered for the Vietnamese audio input
+en_example_text = ["According to a study by Statista, the global AI market is set to grow up to 54 percent every single year.",
+                   "As one of the world's greatest cities, Air New Zealand is proud to add the Big Apple to its list of 29 international destinations.",
+                   "And yet, earlier this month, I found myself at Halloween Horror Nights at Universal Orlando Resort, one of the most popular Halloween events in the US among hardcore horror buffs."
+                   ]  # examples offered for the English text box
+en_example_voice =[['en_speech_01.wav'], ['en_speech_02.wav'], ['en_speech_03.wav']]  # examples offered for the English audio input
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Translation: Vietnamese to English"):
             with gr.Row():
                 with gr.Column():
+                    vietnamese_text = gr.Textbox(label="Vietnamese Text")
+                    translate_button_vien_1 = gr.Button(value="Translate To English")
                 with gr.Column():
+                    english_out_1 = gr.Textbox(label="English Text")
+            translate_button_vien_1.click(lambda text: translate_vi2en(text), inputs=vietnamese_text, outputs=english_out_1)
             gr.Examples(examples=vi_example_text,
+                        inputs=[vietnamese_text])
+        with gr.TabItem("Speech2text and Vi-En Translation"):
             with gr.Row():
                 with gr.Column():
+                    vi_audio = gr.Audio(source="microphone", label="Input Audio", type="file", streaming=False)
+                    translate_button_vien_2 = gr.Button(value="Translate To English")
                 with gr.Column():
+                    english_out_2 = gr.Textbox(label="English Text")
+            translate_button_vien_2.click(lambda voice: inference_vien(voice), inputs=vi_audio, outputs=english_out_2)
             gr.Examples(examples=vi_example_voice,
+                        inputs=[vi_audio])
+    with gr.Tabs():
+        with gr.TabItem("Translation: English to Vietnamese"):
+            with gr.Row():
+                with gr.Column():
+                    english_text = gr.Textbox(label="English Text")
+                    translate_button_envi_1 = gr.Button(value="Translate To Vietnamese")
+                with gr.Column():
+                    vietnamese_out_1 = gr.Textbox(label="Vietnamese Text")
+            translate_button_envi_1.click(lambda text: translate_en2vi(text), inputs=english_text, outputs=vietnamese_out_1)
+            gr.Examples(examples=en_example_text,
+                        inputs=[english_text])
+        with gr.TabItem("Speech2text and En-Vi Translation"):
+            with gr.Row():
+                with gr.Column():
+                    en_audio = gr.Audio(source="microphone", label="Input Audio", type="file", streaming=False)
+                    translate_button_envi_2 = gr.Button(value="Translate To English")
+                with gr.Column():
+                    vietnamese_out_2 = gr.Textbox(label="English Text")
+            translate_button_envi_2.click(lambda voice: inference_envi(voice), inputs=en_audio, outputs=vietnamese_out_2)
+            gr.Examples(examples=en_example_voice,
+                        inputs=[en_audio])
 if __name__ == "__main__":
     demo.launch()

en_speech_01.wav ADDED Viewed

Binary file (816 kB). View file

en_speech_02.wav ADDED Viewed

Binary file (238 kB). View file

en_speech_03.wav ADDED Viewed

Binary file (751 kB). View file

requirements.txt CHANGED Viewed

@@ -10,6 +10,8 @@ pyctcdecode
 soundfile
 ffmpeg-python
 gradio
 transformers
 transformers[sentencepiece]
 https://github.com/kpu/kenlm/archive/master.zip

 soundfile
 ffmpeg-python
 gradio
+nltk
+librosa
 transformers
 transformers[sentencepiece]
 https://github.com/kpu/kenlm/archive/master.zip