test / app.py
mskov's picture
Update app.py
5625d5f
raw
history blame
2.35 kB
import transformers
from transformers import pipeline
import gradio as gr
import os
import sys
os.system("pip install evaluate")
os.system("pip install datasets")
os.system("pip install spicy")
os.system("pip install soundfile")
os.system("pip install datasets[audio]")
#os.system("pip install numpy==1.21.4")
#os.system("pip install numpy==1.22.1")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio
p = pipeline("automatic-speech-recognition")
task_evaluator = evaluator("automatic-speech-recognition")
#url = {"test" : "https://huggingface.co/datasets/mskov/miso_test/blob/main/test_set.parquet"}
#data = load_dataset("audiofolder", data_dir="mskov/miso_test")
# data = load_dataset("audiofolder", data_files=["datasets/mskov/miso_test/test_set/and.wav","mskov/miso_test/test_set/chew1.wav","mskov/miso_test/test_set/chew3.wav", "mskov/miso_test/test_set/chew3.wav","mskov/miso_test/test_set/chew4.wav","mskov/miso_test/test_set/cough1.wav","mskov/miso_test/test_set/cough2.wav","mskov/miso_test/test_set/cough3.wav","mskov/miso_test/test_set/hi.wav","mskov/miso_test/test_set/knock_knock.wav","mskov/miso_test/test_set/mouth_sounds1.wav","mskov/miso_test/test_set/mouth_sounds2.wav","mskov/miso_test/test_set/no.wav","mskov/miso_test/test_set/not_bad.wav","mskov/miso_test/test_set/oh_i_wish.wav","mskov/miso_test/test_set/pop1.wav","mskov/miso_test/test_set/really.wav","mskov/miso_test/test_set/sigh1.wav","mskov/miso_test/test_set/sigh2.wav","mskov/miso_test/test_set/slurp1.wav","mskov/miso_test/test_set/slurp2.wav","mskov/miso_test/test_set/sneeze1.wav","mskov/miso_test/test_set/sneeze2.wav","mskov/miso_test/test_set/so_i_did_it_again.wav"])
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())
results = task_evaluator.compute(
model_or_pipeline="https://huggingface.co/mskov/whisper_miso",
data=dataset,
input_column="audio",
label_column="audio",
metric="wer",
)
print(results)
def transcribe(audio, state=""):
text = p(audio)["text"]
state += text + " "
return state, state
gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(source="microphone", type="filepath", streaming=True),
"state"
],
outputs=[
"textbox",
"state"
],
live=True).launch()