Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -8,13 +8,13 @@ import tempfile
|
|
8 |
import soundfile as sf
|
9 |
import scipy.io.wavfile as wav
|
10 |
|
11 |
-
from transformers import
|
12 |
from nemo.collections.asr.models import EncDecMultiTaskModel
|
13 |
|
14 |
# Constants
|
15 |
SAMPLE_RATE = 16000 # Hz
|
16 |
|
17 |
-
#
|
18 |
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
|
19 |
decode_cfg = canary_model.cfg.decoding
|
20 |
decode_cfg.beam.beam_size = 1
|
@@ -34,13 +34,13 @@ def gen_text(audio_filepath, action, source_lang, target_lang):
|
|
34 |
converted_audio_filepath = os.path.join(tmpdir, f"{utt_id}.wav")
|
35 |
sf.write(converted_audio_filepath, data, SAMPLE_RATE)
|
36 |
|
37 |
-
# Transcribe audio
|
38 |
duration = len(data) / SAMPLE_RATE
|
39 |
manifest_data = {
|
40 |
"audio_filepath": converted_audio_filepath,
|
41 |
"taskname": action,
|
42 |
"source_lang": source_lang,
|
43 |
-
"target_lang": source_lang if action=="asr" else target_lang,
|
44 |
"pnc": "no",
|
45 |
"answer": "predict",
|
46 |
"duration": str(duration),
|
@@ -50,33 +50,13 @@ def gen_text(audio_filepath, action, source_lang, target_lang):
|
|
50 |
fout.write(json.dumps(manifest_data))
|
51 |
|
52 |
predicted_text = canary_model.transcribe(manifest_filepath)[0]
|
53 |
-
# if duration < 40:
|
54 |
-
# predicted_text = canary_model.transcribe(manifest_filepath)[0]
|
55 |
-
# else:
|
56 |
-
# predicted_text = get_buffered_pred_feat_multitaskAED(
|
57 |
-
# frame_asr,
|
58 |
-
# canary_model.cfg.preprocessor,
|
59 |
-
# model_stride_in_secs,
|
60 |
-
# canary_model.device,
|
61 |
-
# manifest=manifest_filepath,
|
62 |
-
# )[0].text
|
63 |
|
64 |
return predicted_text
|
65 |
|
66 |
# Function to convert text to speech using TTS
|
67 |
def gen_speech(text, lang):
|
68 |
set_seed(555) # Make it deterministic
|
69 |
-
|
70 |
-
case "en":
|
71 |
-
model = "facebook/mms-tts-eng"
|
72 |
-
case "fr":
|
73 |
-
model = "facebook/mms-tts-fra"
|
74 |
-
case "de":
|
75 |
-
model = "facebook/mms-tts-deu"
|
76 |
-
case "es":
|
77 |
-
model = "facebook/mms-tts-spa"
|
78 |
-
case _:
|
79 |
-
model = "facebook/mms-tts"
|
80 |
|
81 |
# load TTS model
|
82 |
tts_model = VitsModel.from_pretrained(model)
|
@@ -86,75 +66,54 @@ def gen_speech(text, lang):
|
|
86 |
with torch.no_grad():
|
87 |
outputs = tts_model(**input_text)
|
88 |
waveform_np = outputs.waveform[0].cpu().numpy()
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
|
|
|
|
102 |
|
103 |
-
|
104 |
# Create Gradio interface
|
105 |
-
|
106 |
-
|
107 |
-
with playground:
|
108 |
-
|
109 |
-
with gr.Row():
|
110 |
-
gr.Markdown("""
|
111 |
-
## Your AI Translate Assistant
|
112 |
-
### Gets input audio from user, transcribe and translate it. Convert back to speech.
|
113 |
-
- category: [Automatic Speech Recognition](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition), model: [nvidia/canary-1b](https://huggingface.co/nvidia/canary-1b)
|
114 |
-
- category: [Text-to-Speech](https://huggingface.co/models?pipeline_tag=text-to-speech), model: [facebook/mms-tts](https://huggingface.co/facebook/mms-tts)
|
115 |
-
""")
|
116 |
-
|
117 |
-
with gr.Row():
|
118 |
-
with gr.Column():
|
119 |
-
source_lang = gr.Dropdown(
|
120 |
-
choices=["en", "de", "es", "fr"], value="en", label="Source Language"
|
121 |
-
)
|
122 |
-
with gr.Column():
|
123 |
-
target_lang = gr.Dropdown(
|
124 |
-
choices=["en", "de", "es", "fr"], value="fr", label="Target Language"
|
125 |
-
)
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
with gr.Column():
|
131 |
-
translated_speech = gr.Audio(type="filepath", label="Generated Speech")
|
132 |
-
|
133 |
-
with gr.Row():
|
134 |
-
with gr.Column():
|
135 |
-
transcipted_text = gr.Textbox(label="Transcription")
|
136 |
-
with gr.Column():
|
137 |
-
translated_text = gr.Textbox(label="Translation")
|
138 |
-
|
139 |
-
with gr.Row():
|
140 |
-
with gr.Column():
|
141 |
-
submit_button = gr.Button(value="Start Process", variant="primary")
|
142 |
-
with gr.Column():
|
143 |
-
clear_button = gr.ClearButton(components=[input_audio, source_lang, target_lang, transcipted_text, translated_text, translated_speech], value="Clear")
|
144 |
|
145 |
with gr.Row():
|
146 |
-
gr.
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import soundfile as sf
|
9 |
import scipy.io.wavfile as wav
|
10 |
|
11 |
+
from transformers import VitsModel, AutoTokenizer, set_seed
|
12 |
from nemo.collections.asr.models import EncDecMultiTaskModel
|
13 |
|
14 |
# Constants
|
15 |
SAMPLE_RATE = 16000 # Hz
|
16 |
|
17 |
+
# Load ASR model
|
18 |
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
|
19 |
decode_cfg = canary_model.cfg.decoding
|
20 |
decode_cfg.beam.beam_size = 1
|
|
|
34 |
converted_audio_filepath = os.path.join(tmpdir, f"{utt_id}.wav")
|
35 |
sf.write(converted_audio_filepath, data, SAMPLE_RATE)
|
36 |
|
37 |
+
# Transcribe or translate audio
|
38 |
duration = len(data) / SAMPLE_RATE
|
39 |
manifest_data = {
|
40 |
"audio_filepath": converted_audio_filepath,
|
41 |
"taskname": action,
|
42 |
"source_lang": source_lang,
|
43 |
+
"target_lang": source_lang if action == "asr" else target_lang,
|
44 |
"pnc": "no",
|
45 |
"answer": "predict",
|
46 |
"duration": str(duration),
|
|
|
50 |
fout.write(json.dumps(manifest_data))
|
51 |
|
52 |
predicted_text = canary_model.transcribe(manifest_filepath)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
return predicted_text
|
55 |
|
56 |
# Function to convert text to speech using TTS
|
57 |
def gen_speech(text, lang):
|
58 |
set_seed(555) # Make it deterministic
|
59 |
+
model = f"facebook/mms-tts-{lang}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
# load TTS model
|
62 |
tts_model = VitsModel.from_pretrained(model)
|
|
|
66 |
with torch.no_grad():
|
67 |
outputs = tts_model(**input_text)
|
68 |
waveform_np = outputs.waveform[0].cpu().numpy()
|
69 |
+
return SAMPLE_RATE, waveform_np
|
70 |
+
|
71 |
+
# Main function for speech-to-speech translation
|
72 |
+
def speech_to_speech_translation(audio_filepath, source_lang, target_lang):
|
73 |
+
translation = gen_text(audio_filepath, "s2t_translation", source_lang, target_lang)
|
74 |
+
sample_rate, synthesized_speech = gen_speech(translation, target_lang)
|
75 |
+
return sample_rate, synthesized_speech
|
76 |
+
|
77 |
+
# Define supported languages
|
78 |
+
LANGUAGES = {
|
79 |
+
"English": "eng",
|
80 |
+
"German": "deu",
|
81 |
+
"Spanish": "spa",
|
82 |
+
"French": "fra"
|
83 |
+
}
|
84 |
|
|
|
85 |
# Create Gradio interface
|
86 |
+
demo = gr.Blocks()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
+
with demo:
|
89 |
+
gr.Markdown("# Multilingual Speech-to-Speech Translation")
|
90 |
+
gr.Markdown("Translate speech from one language to another.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
with gr.Row():
|
93 |
+
source_lang = gr.Dropdown(choices=list(LANGUAGES.keys()), value="English", label="Source Language")
|
94 |
+
target_lang = gr.Dropdown(choices=list(LANGUAGES.keys()), value="French", label="Target Language")
|
95 |
+
|
96 |
+
with gr.Tabs():
|
97 |
+
with gr.TabItem("Microphone"):
|
98 |
+
mic_input = gr.Audio(source="microphone", type="filepath")
|
99 |
+
mic_output = gr.Audio(label="Generated Speech", type="numpy")
|
100 |
+
mic_button = gr.Button("Translate")
|
101 |
+
|
102 |
+
with gr.TabItem("Audio File"):
|
103 |
+
file_input = gr.Audio(source="upload", type="filepath")
|
104 |
+
file_output = gr.Audio(label="Generated Speech", type="numpy")
|
105 |
+
file_button = gr.Button("Translate")
|
106 |
+
|
107 |
+
mic_button.click(
|
108 |
+
speech_to_speech_translation,
|
109 |
+
inputs=[mic_input, source_lang, target_lang],
|
110 |
+
outputs=mic_output
|
111 |
+
)
|
112 |
+
|
113 |
+
file_button.click(
|
114 |
+
speech_to_speech_translation,
|
115 |
+
inputs=[file_input, source_lang, target_lang],
|
116 |
+
outputs=file_output
|
117 |
+
)
|
118 |
+
|
119 |
+
demo.launch()
|