Commit e8eafba • datnth1709 committed
Parent: 607348e

update speech2text module

Files changed:
- .gitignore +1 -4
- app.py +93 -6
- packages.txt +1 -0
- requirements.txt +9 -1
- vi_speech_01.wav +0 -0
- vi_speech_02.wav +0 -0
- vi_speech_03.wav +0 -0
.gitignore
CHANGED
@@ -4,7 +4,4 @@ __pycache__
 .git
 .vs
 .vscode
-.ipynb_checkpoints
-
-# Except this file
-*.pbf
+.ipynb_checkpoints
app.py
CHANGED
@@ -1,15 +1,102 @@
 import gradio as gr
 from transformers import pipeline
 
+import gradio as gr
+from transformers.file_utils import cached_path, hf_bucket_url
+import os, zipfile
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from datasets import load_dataset
+import torch
+import kenlm
+import torchaudio
+from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
+
+"""Vietnamese speech2text"""
+cache_dir = './cache/'
+processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
+model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
+lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
+lm_file = cached_path(lm_file, cache_dir=cache_dir)
+with zipfile.ZipFile(lm_file, 'r') as zip_ref:
+    zip_ref.extractall(cache_dir)
+lm_file = cache_dir + 'vi_lm_4grams.bin'
+
+def get_decoder_ngram_model(tokenizer, ngram_lm_path):
+    vocab_dict = tokenizer.get_vocab()
+    sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
+    vocab = [x[1] for x in sort_vocab][:-2]
+    vocab_list = vocab
+    # convert ctc blank character representation
+    vocab_list[tokenizer.pad_token_id] = ""
+    # replace special characters
+    vocab_list[tokenizer.unk_token_id] = ""
+    # vocab_list[tokenizer.bos_token_id] = ""
+    # vocab_list[tokenizer.eos_token_id] = ""
+    # convert space character representation
+    vocab_list[tokenizer.word_delimiter_token_id] = " "
+    # specify ctc blank char index, since conventionally it is the last entry of the logit matrix
+    alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
+    lm_model = kenlm.Model(ngram_lm_path)
+    decoder = BeamSearchDecoderCTC(alphabet,
+                                   language_model=LanguageModel(lm_model))
+    return decoder
+ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)
+
+# define function to read in sound file
+def speech_file_to_array_fn(path, max_seconds=10):
+    batch = {"file": path}
+    speech_array, sampling_rate = torchaudio.load(batch["file"])
+    if sampling_rate != 16000:
+        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                   new_freq=16000)
+        speech_array = transform(speech_array)
+    speech_array = speech_array[0]
+    if max_seconds > 0:
+        speech_array = speech_array[:max_seconds*16000]
+    batch["speech"] = speech_array.numpy()
+    batch["sampling_rate"] = 16000
+    return batch
+# tokenize
+def speech2text(audio):
+    # read in sound file
+    # load dummy dataset and read soundfiles
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    return beam_search_output
+
+
+
+"""Machine translation"""
 model_checkpoint = "huynguyen208/fantastic4-finetuned-vi-to-en-PhoMT-demo"
 translator = pipeline("translation", model=model_checkpoint)
 
 def translate(Vietnamese):
     return translator(Vietnamese)[0]['translation_text']
 
-
-
-
-
-
-
+def inference(audio):
+    vi_text = speech2text(audio)
+    en_text = translate(vi_text)
+    return en_text
+
+inputs = gr.inputs.Audio(label="Input Audio", type="file")
+outputs = gr.outputs.Textbox(label="Output Text")
+title = "Speech to text and translate Vietnamese to English"
+description = "Gradio demo for a wav2vec2-base-vietnamese-250h and Helsinki-NLP/opus-mt-vi-en"
+examples = [['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
+iface = gr.Interface(inference,
+                     inputs,
+                     outputs,
+                     title=title,
+                     description=description,
+                     examples=examples)
+iface.launch()
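For quick local verification of the new speech-to-text → translation chain without going through Gradio, a minimal sketch is shown below. It is not part of the commit: it reuses the two model checkpoints from app.py but does plain greedy CTC decoding instead of the KenLM beam search, and it assumes the checkpoints download successfully and that the bundled vi_speech_01.wav is in the working directory.

```python
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline

asr_checkpoint = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
processor = Wav2Vec2Processor.from_pretrained(asr_checkpoint)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_checkpoint)
translator = pipeline("translation",
                      model="huynguyen208/fantastic4-finetuned-vi-to-en-PhoMT-demo")

# Load the sample clip and resample to the 16 kHz mono input the acoustic model expects.
speech, sampling_rate = torchaudio.load("vi_speech_01.wav")
if sampling_rate != 16000:
    speech = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(speech)

inputs = processor(speech[0].numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = asr_model(inputs.input_values).logits[0]

# Greedy CTC decode only; app.py additionally rescores with the 4-gram KenLM model.
vi_text = processor.decode(torch.argmax(logits, dim=-1))
print("Vietnamese:", vi_text)
print("English:", translator(vi_text)[0]["translation_text"])
```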
packages.txt
ADDED
@@ -0,0 +1 @@
+libsndfile1
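packages.txt is the file a Hugging Face Space uses to declare Debian packages installed with apt before the app starts; libsndfile1 is presumably listed because soundfile (and torchaudio's soundfile backend) need the system libsndfile library to read the WAV examples.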
requirements.txt
CHANGED
@@ -1,3 +1,11 @@
 torch
+torchaudio
+speechbrain
+pydub
+datasets
+soundfile
+ffmpeg-python
 gradio
-transformers
+transformers
+transformers[sentencepiece]
+https://github.com/kpu/kenlm/archive/master.zip
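Two notes on the new dependency list: kenlm is installed straight from the GitHub archive, presumably because no prebuilt wheel was available; and app.py now also imports pyctcdecode, which does not appear here, so unless another package pulls it in transitively it would also need to be declared.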
vi_speech_01.wav
ADDED
Binary file (120 kB)

vi_speech_02.wav
ADDED
Binary file (49.6 kB)

vi_speech_03.wav
ADDED
Binary file (76.8 kB)