# ales's picture
# Update app.py
# abfa68a
from pprint import pformat
from typing import Optional, Tuple

import gradio as gr
import librosa
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline
# Hub repository id: used both as the source of the language model file
# and as the model path handed to the pipeline.
HF_HUB_URL = 'ales/wav2vec2-cv-be'
# Location of the language model file inside that repository.
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
# Sampling rate (Hz) requested when audio is loaded for recognition: 16kHz.
MODEL_SAMPLING_RATE = 16_000

# Fetch the language model from the HF Hub; the returned value is passed
# to the pipeline as `language_model_fp`.
lm_fp = hf_hub_download(
    filename=LM_HUB_FP,
    repo_id=HF_HUB_URL,
)

# Create the recognition pipeline once, at module load, so that every
# request handled by `main` reuses the same instance.
pipeline = PreTrainedPipeline(
    language_model_fp=lm_fp,
    model_path=HF_HUB_URL,
)
def main(
    recorded_audio_fp: Optional[str],
    uploaded_audio_fp: Optional[str],
) -> Tuple[str, str]:
    """Recognize speech from a recorded or an uploaded audio file.

    The recorded file takes precedence when both files are supplied.

    Args:
        recorded_audio_fp: Path to the microphone recording, or None when
            nothing was recorded.
        uploaded_audio_fp: Path to the uploaded audio file, or None when
            nothing was uploaded.

    Returns:
        A pair of strings. On success: the recognized text and a
        pretty-printed dump of technical details about the request.
        When no file was supplied, or the loaded audio contains no
        samples: an error message in Belarusian and the same message in
        English.
    """
    if recorded_audio_fp is not None:
        audio_fp, used_audiofile = recorded_audio_fp, 'recorded'
    elif uploaded_audio_fp is not None:
        audio_fp, used_audiofile = uploaded_audio_fp, 'uploaded'
    else:
        return (
            'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
            'Error! You have to either record or upload an audiofile.'
        )

    # read audio file (mono, loaded at the model's sampling rate)
    inputs, _ = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)

    # A zero-length signal cannot be recognized, and the min/max statistics
    # below raise ValueError on an empty array, so report it to the user
    # the same way a missing file is reported.
    if inputs.size == 0:
        return (
            'Памылка! Аўдыяфайл пусты.',
            'Error! The audiofile is empty.'
        )

    # recognize speech
    pipeline_res = pipeline(inputs=inputs)
    text = pipeline_res['text'][0]  # unpack batch of size 1

    # add technical information to the output: the pipeline result itself
    # (minus the recognized text) is reused as the report container
    tech_data = pipeline_res
    del tech_data['text']

    tech_data['used_audiofile'] = used_audiofile
    tech_data['recorded_file_present'] = recorded_audio_fp is not None
    tech_data['uploaded_file_present'] = uploaded_audio_fp is not None
    tech_data['audiofile_path'] = audio_fp
    tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
    tech_data['inputs_shape'] = inputs.shape
    tech_data['inputs_max'] = inputs.max().item()
    tech_data['inputs_min'] = inputs.min().item()

    return text, pformat(tech_data)
# Markdown shown with the demo: a link to the model on the Hub followed by
# a page-visits badge image. The empty first and last items reproduce the
# leading and trailing newlines of the text.
article = '\n'.join((
    '',
    'The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)',
    '![Page Visits](https://visitor-badge.glitch.me/badge?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm&left_color=darkgray&right_color=crimson&left_text=Page%20Visits)',
    '',
))
# Two optional audio sources, in the order of `main`'s parameters:
# a microphone recording first, then an uploaded file.
_audio_inputs = [
    gr.inputs.Audio(
        label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
        source='microphone',
        type='filepath',
        optional=True,
    ),
    gr.inputs.Audio(
        label='Альбо загрузіце ўжо запісаны аўдыяфайл сюды',
        source='upload',
        type='filepath',
        optional=True,
    ),
]

# Two text boxes, in the order of `main`'s return values:
# the recognized text, then the technical information.
_text_outputs = [
    gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
    gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя'),
]

# Short description of the demo passed to the interface.
_description = (
    'Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
    'Акустычная мадэль + моўная мадэль.'
)

iface = gr.Interface(
    fn=main,
    inputs=_audio_inputs,
    outputs=_text_outputs,
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=_description,
    article=article,
)

# Start the app with request queueing enabled.
iface.launch(enable_queue=True)