datnth1709 committed on
Commit 5c35238
1 Parent(s): f4a01a0

add envi translation

Files changed (5):
  1. app.py +112 -22
  2. en_speech_01.wav +0 -0
  3. en_speech_02.wav +0 -0
  4. en_speech_03.wav +0 -0
  5. requirements.txt +2 -0
app.py CHANGED
@@ -1,8 +1,10 @@
 import gradio as gr
+import nltk
+import librosa
 from transformers import pipeline
 from transformers.file_utils import cached_path, hf_bucket_url
 import os, zipfile
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Tokenizer
 from datasets import load_dataset
 import torch
 import kenlm
@@ -12,12 +14,12 @@ from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
 """Vietnamese speech2text"""
 cache_dir = './cache/'
 processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
-model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
+vi_model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
 lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
 lm_file = cached_path(lm_file, cache_dir=cache_dir)
 with zipfile.ZipFile(lm_file, 'r') as zip_ref:
     zip_ref.extractall(cache_dir)
-lm_file = cache_dir + 'vi_lm_4grams.bin'\
+lm_file = cache_dir + 'vi_lm_4grams.bin'
 
 def get_decoder_ngram_model(tokenizer, ngram_lm_path):
     vocab_dict = tokenizer.get_vocab()
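Side note on this hunk (an editor's sketch, not part of the commit): get_decoder_ngram_model assembles a BeamSearchDecoderCTC by hand from the tokenizer vocabulary and the extracted vi_lm_4grams.bin. pyctcdecode ships a convenience constructor that does roughly the same wiring; the sketch below assumes the tokenizer's vocabulary order already matches the CTC head and glosses over the special-token remapping the hand-rolled version performs:

    # Sketch only: roughly equivalent decoder setup via pyctcdecode's helper.
    from pyctcdecode import build_ctcdecoder

    vocab = processor.tokenizer.get_vocab()
    labels = [tok for tok, _ in sorted(vocab.items(), key=lambda kv: kv[1])]
    ngram_lm_model = build_ctcdecoder(labels, kenlm_model_path=lm_file)  # KenLM 4-gram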
@@ -56,7 +58,7 @@ def speech_file_to_array_fn(path, max_seconds=10):
     return batch
 
 # tokenize
-def speech2text(audio):
+def speech2text_vi(audio):
     # read in sound file
     # load dummy dataset and read soundfiles
     ds = speech_file_to_array_fn(audio.name)
@@ -67,57 +69,145 @@ def speech2text(audio):
         return_tensors="pt"
     ).input_values
     # decode ctc output
-    logits = model(input_values).logits[0]
+    logits = vi_model(input_values).logits[0]
     pred_ids = torch.argmax(logits, dim=-1)
     greedy_search_output = processor.decode(pred_ids)
     beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
     return beam_search_output
 
 
+"""English speech2text"""
+nltk.download("punkt")
+# Loading the model and the tokenizer
+model_name = "facebook/wav2vec2-base-960h"
+eng_tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
+eng_model = Wav2Vec2ForCTC.from_pretrained(model_name)
+
+def load_data(input_file):
+    """Resample to ensure the speech input is sampled at 16 kHz."""
+    # read the file
+    speech, sample_rate = librosa.load(input_file)
+    # make it 1-D
+    if len(speech.shape) > 1:
+        speech = speech[:, 0] + speech[:, 1]
+    # resample to 16 kHz, since wav2vec2-base-960h is pretrained and fine-tuned on 16 kHz speech audio
+    if sample_rate != 16000:
+        speech = librosa.resample(speech, sample_rate, 16000)
+    return speech
+
+def correct_casing(input_sentence):
+    """Correct the casing of the generated transcribed text."""
+    sentences = nltk.sent_tokenize(input_sentence)
+    return (' '.join([s.replace(s[0], s[0].capitalize(), 1) for s in sentences]))
+
+
+def speech2text_en(input_file):
+    """Generate a transcript for the provided audio input."""
+    speech = load_data(input_file)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from the predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    return transcription
+
+
 """Machine translation"""
-model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
-translator = pipeline("translation", model=model_checkpoint)
+vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
+envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"
+vien_translator = pipeline("translation", model=vien_model_checkpoint)
+envi_translator = pipeline("translation", model=envi_model_checkpoint)
 
 def translate_vi2en(Vietnamese):
-    return translator(Vietnamese)[0]['translation_text']
+    return vien_translator(Vietnamese)[0]['translation_text']
+
+def translate_en2vi(English):
+    return envi_translator(English)[0]['translation_text']
 
 
-def inference(audio):
-    vi_text = speech2text(audio)
+"""Inference"""
+def inference_vien(audio):
+    vi_text = speech2text_vi(audio)
     en_text = translate_vi2en(vi_text)
     return en_text
 
+def inference_envi(audio):
+    en_text = speech2text_en(audio)
+    vi_text = translate_en2vi(en_text)
+    return vi_text
+
 
 """Gradio demo"""
 
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
                    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
                    "Nếu như một câu nói có thể khiến em vui."]
-
 vi_example_voice =[['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
 
+en_example_text = ["According to a study by Statista, the global AI market is set to grow up to 54 percent every single year.",
+                   "As one of the world's greatest cities, Air New Zealand is proud to add the Big Apple to its list of 29 international destinations.",
+                   "And yet, earlier this month, I found myself at Halloween Horror Nights at Universal Orlando Resort, one of the most popular Halloween events in the US among hardcore horror buffs."
+                   ]
+en_example_voice =[['en_speech_01.wav'], ['en_speech_02.wav'], ['en_speech_03.wav']]
+
+
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Translation: Vietnamese to English"):
             with gr.Row():
                 with gr.Column():
-                    vietnamese = gr.Textbox(label="Vietnamese Text")
-                    translate_to_english = gr.Button(value="Translate To English")
+                    vietnamese_text = gr.Textbox(label="Vietnamese Text")
+                    translate_button_vien_1 = gr.Button(value="Translate To English")
                 with gr.Column():
-                    english1 = gr.Textbox(label="English Text")
-                    translate_to_english.click(lambda text: translate_vi2en(text), inputs=vietnamese, outputs=english1)
+                    english_out_1 = gr.Textbox(label="English Text")
+                    translate_button_vien_1.click(lambda text: translate_vi2en(text), inputs=vietnamese_text, outputs=english_out_1)
             gr.Examples(examples=vi_example_text,
-                        inputs=[vietnamese])
-        with gr.TabItem("Speech2text and translation"):
+                        inputs=[vietnamese_text])
+        with gr.TabItem("Speech2text and Vi-En Translation"):
             with gr.Row():
                 with gr.Column():
-                    audio = gr.Audio(source="microphone", label="Input Audio", type="file", streaming=False)
-                    translate_button = gr.Button(value="Translate To English")
+                    vi_audio = gr.Audio(source="microphone", label="Input Audio", type="file", streaming=False)
+                    translate_button_vien_2 = gr.Button(value="Translate To English")
                 with gr.Column():
-                    english2 = gr.Textbox(label="English Text")
+                    english_out_2 = gr.Textbox(label="English Text")
 
-            translate_button.click(lambda voice: inference(voice), inputs=audio, outputs=english2)
+            translate_button_vien_2.click(lambda voice: inference_vien(voice), inputs=vi_audio, outputs=english_out_2)
             gr.Examples(examples=vi_example_voice,
-                        inputs=[audio])
+                        inputs=[vi_audio])
+
+    with gr.Tabs():
+        with gr.TabItem("Translation: English to Vietnamese"):
+            with gr.Row():
+                with gr.Column():
+                    english_text = gr.Textbox(label="English Text")
+                    translate_button_envi_1 = gr.Button(value="Translate To Vietnamese")
+                with gr.Column():
+                    vietnamese_out_1 = gr.Textbox(label="Vietnamese Text")
+                    translate_button_envi_1.click(lambda text: translate_en2vi(text), inputs=english_text, outputs=vietnamese_out_1)
+            gr.Examples(examples=en_example_text,
+                        inputs=[english_text])
+        with gr.TabItem("Speech2text and En-Vi Translation"):
+            with gr.Row():
+                with gr.Column():
+                    en_audio = gr.Audio(source="microphone", label="Input Audio", type="file", streaming=False)
+                    translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
+                with gr.Column():
+                    vietnamese_out_2 = gr.Textbox(label="Vietnamese Text")
+
+            translate_button_envi_2.click(lambda voice: inference_envi(voice), inputs=en_audio, outputs=vietnamese_out_2)
+            gr.Examples(examples=en_example_voice,
+                        inputs=[en_audio])
 
 if __name__ == "__main__":
     demo.launch()
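A portability note on the new English path (an editor's sketch, not part of the commit): load_data calls librosa.resample(speech, sample_rate, 16000) with positional sample rates, which librosa 0.10 and later reject in favor of keyword arguments, and it downmixes stereo by summing the two channels, which can clip. Assuming a recent librosa:

    # Keyword form required by librosa >= 0.10:
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    # Or let librosa average channels and resample in one step at load time:
    speech, _ = librosa.load(input_file, sr=16000, mono=True)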
 
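Since both checkpoints are ordinary transformers translation pipelines, the new En-Vi direction can be smoke-tested outside Gradio. A minimal sketch, assuming the Hub checkpoints download cleanly:

    # Sketch: exercising the new direction directly.
    from transformers import pipeline

    envi = pipeline("translation", model="datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT")
    print(envi("Hello, how are you?")[0]['translation_text'])  # expect Vietnamese text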
en_speech_01.wav ADDED
Binary file (816 kB).
en_speech_02.wav ADDED
Binary file (238 kB).
en_speech_03.wav ADDED
Binary file (751 kB).
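These recordings feed gr.Examples in the new En-Vi speech tab. Worth flagging while here: gr.Audio(..., type="file") hands the callback a tempfile object (which is why speech2text_vi reads audio.name); later Gradio releases deprecate type="file" in favor of type="filepath", which passes a plain path string. A sketch under that assumption:

    # For newer Gradio releases (assumption: type="file" is deprecated there):
    en_audio = gr.Audio(source="microphone", label="Input Audio", type="filepath")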
 
requirements.txt CHANGED
@@ -10,6 +10,8 @@ pyctcdecode
 soundfile
 ffmpeg-python
 gradio
+nltk
+librosa
 transformers
 transformers[sentencepiece]
 https://github.com/kpu/kenlm/archive/master.zip
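The two additions line up with the new English path in app.py: librosa loads and resamples microphone audio to the 16 kHz expected by facebook/wav2vec2-base-960h, and nltk provides the punkt sentence tokenizer behind correct_casing (fetched at startup via nltk.download("punkt")).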