File size: 2,600 Bytes
9c1145c 21728f1 4b7e966 21728f1 4b7e966 21728f1 44d4c43 21728f1 a97a975 21728f1 156ffda 21728f1 44d4c43 2f6b008 21728f1 3b36209 4b7e966 cb4e060 3b36209 21728f1 c5d4c36 7ea5247 3b36209 21728f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
def load_and_fix_data(input_file, model_sampling_rate):
    """Load an audio file as a mono waveform at the model's sampling rate.

    Args:
        input_file: Path of the audio file; passed straight to ``librosa.load``.
        model_sampling_rate: Sampling rate the downstream model expects.

    Returns:
        The loaded waveform, resampled to ``model_sampling_rate`` when the
        rate reported by ``librosa.load`` differs from it.
    """
    speech, sample_rate = librosa.load(input_file)

    if speech.ndim > 1:
        # librosa lays multi-channel audio out as (channels, samples), so the
        # downmix has to reduce over axis 0. The earlier
        # ``speech[:, 0] + speech[:, 1]`` added the first two *samples* of
        # every channel instead. ``to_mono`` averages the channels.
        # NOTE: with the default ``mono=True`` load above this branch is not
        # expected to run; it is kept as a safety net.
        speech = librosa.to_mono(speech)

    if sample_rate != model_sampling_rate:
        # The rates must be passed by keyword: positional use was deprecated
        # in librosa 0.9 and is rejected from 0.10 onwards.
        speech = librosa.resample(
            speech,
            orig_sr=sample_rate,
            target_sr=model_sampling_rate,
        )

    return speech
# Checkpoint shared by the feature extractor and the ASR pipeline.
_ASR_CHECKPOINT = "jonatasgrosman/wav2vec2-xls-r-1b-spanish"

# In the visible code the feature extractor is only consulted for the
# sampling rate that the ASR checkpoint was configured with.
feature_extractor = AutoFeatureExtractor.from_pretrained(_ASR_CHECKPOINT)
sampling_rate = feature_extractor.sampling_rate

# Speech-to-text pipeline used to transcribe the recorded audio.
asr = pipeline("automatic-speech-recognition", model=_ASR_CHECKPOINT)

# Sequence-to-sequence model (and matching tokenizer) used for translation.
model_name = "hackathon-pln-es/t5-small-finetuned-spanish-to-quechua"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Kept in a variable because f-string expressions could not contain a
# backslash before Python 3.12; it is interpolated into the result text.
new_line = "\n"
def predict_and_ctc_lm_decode(input_file):
    """Transcribe an audio file and translate the transcript.

    The audio is loaded at the ASR model's sampling rate, transcribed with
    the ``asr`` pipeline, and the transcript is then fed through the
    sequence-to-sequence ``model`` to produce the translation.

    Args:
        input_file: Path of the audio file to process. A falsy value
            (``None`` or an empty string, e.g. when the form is submitted
            without a recording) is accepted and short-circuits.

    Returns:
        A string containing the transcription and the translation, or a
        short notice when no audio was supplied.
    """
    # Without this guard a missing recording would be passed on to the audio
    # loader and surface to the user as an opaque error.
    if not input_file:
        return "No audio received. Please record an audio clip and try again."

    speech = load_and_fix_data(input_file, sampling_rate)

    # Long recordings are transcribed in 10 s chunks with a 1 s stride.
    transcribed_text = asr(speech, chunk_length_s=10, stride_length_s=1)["text"]

    # ``encoded`` rather than ``input`` so the builtin is not shadowed.
    encoded = tokenizer(transcribed_text, return_tensors="pt")
    generated = model.generate(
        encoded["input_ids"],
        max_length=40,
        num_beams=4,
        early_stopping=True,
    )
    translation = tokenizer.decode(generated[0], skip_special_tokens=True)

    return f"Spanish Audio Transcription: {transcribed_text} {new_line} Quechua Translation: {translation}"
# Introductory text for the demo page; it contains Markdown-style links to the
# two model cards and is passed as the `description` argument of gr.Interface.
description = """ This is a Gradio demo of Spanish Audio Transcriptions to Quechua Translation. To use this, simply provide an audio input (audio recording or via microphone), which will subsequently be transcribed and translated to the Quechua language.
Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish)
Pre-trained model used for translating Spanish audio transcription to the Quechua language: [t5-small-finetuned-spanish-to-quechua](https://huggingface.co/hackathon-pln-es/t5-small-finetuned-spanish-to-quechua)
"""
# Input component: a microphone recording, requested as a file path.
audio_input = gr.inputs.Audio(
    source="microphone",
    type="filepath",
    label="Record your audio",
)

# Output component: a single text box for the combined result string.
text_output = gr.outputs.Textbox()

# Example clips offered in the UI; one single-input row per file.
example_clips = [
    [clip_name]
    for clip_name in ("sunny_day.wav", "travel.wav", "sample_audio.wav")
]

demo = gr.Interface(
    fn=predict_and_ctc_lm_decode,
    inputs=[audio_input],
    outputs=[text_output],
    examples=example_clips,
    title="Spanish-Audio-Transcriptions-to-Quechua-Translation",
    description=description,
    # article="<p><center><img src='........e'></center></p>",
    layout="horizontal",
    theme="huggingface",
)

# Start the app with request queuing and example caching enabled.
demo.launch(enable_queue=True, cache_examples=True)
|