import os
import subprocess
import sys

import gradio as gr
import transformers
from transformers import pipeline
# Packages installed at start-up, one pip invocation per entry, in this order.
# (numpy pins 1.21.4 / 1.22.1 were tried earlier and are intentionally left out.)
_RUNTIME_PACKAGES = (
    "evaluate",
    "datasets",
    "spicy",  # NOTE(review): possibly a typo for "scipy" - confirm before changing
    "soundfile",
    "datasets[audio]",
    "numba==0.51.2",
)

for _package in _RUNTIME_PACKAGES:
    # Run pip through the current interpreter so the package lands in the
    # environment this script is executing in, and pass an argument list so
    # no shell gets to interpret characters such as the brackets in
    # "datasets[audio]".
    _completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", _package],
        check=False,
    )
    if _completed.returncode != 0:
        # Installation stays best-effort (the script keeps going), but a
        # failure is now reported instead of being silently ignored.
        print(
            f"warning: 'pip install {_package}' exited with status "
            f"{_completed.returncode}",
            file=sys.stderr,
        )
# These imports stay below the install step on purpose: `evaluate` and
# `datasets` are installed at run time by the pip commands above.
from evaluate import evaluator
from datasets import load_dataset, Audio

# Default speech-recognition pipeline; `transcribe` uses it for the live demo.
p = pipeline("automatic-speech-recognition")
task_evaluator = evaluator("automatic-speech-recognition")

# Earlier experiments loaded the clips with the "audiofolder" builder; the
# hub dataset is used instead.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())

results = task_evaluator.compute(
    # A model is referenced by its hub repository id ("namespace/name"),
    # not by the web URL of its model page.
    model_or_pipeline="mskov/whisper_esc50",
    data=dataset,
    input_column="audio",
    # NOTE(review): the label column is the same column as the input. WER
    # compares predicted text with reference text, so this presumably should
    # name the dataset's transcription column - confirm against the dataset
    # schema before trusting the reported score.
    label_column="audio",
    metric="wer",
)
print(results)
def transcribe(audio, state=""):
    """Append the transcription of one audio chunk to the running transcript.

    Args:
        audio: The audio chunk to transcribe, handed to the module-level
            pipeline ``p`` (the interface is configured to deliver a file
            path). ``None`` means no audio was captured for this call.
        state: Transcript accumulated by previous calls. ``None`` is treated
            as an empty transcript.

    Returns:
        A pair ``(state, state)`` holding the updated transcript twice: one
        copy for the text output and one for the state carried to the next
        call.
    """
    # A missing state must not break the string concatenation below.
    state = state or ""
    # A streaming input can fire without any audio (for example when the
    # recording is stopped); leave the transcript as it is in that case
    # instead of passing None to the pipeline.
    if audio is None:
        return state, state
    text = p(audio)["text"]
    state += text + " "
    return state, state
# Live demo: microphone audio is streamed to `transcribe` as file paths.
microphone = gr.Audio(source="microphone", type="filepath", streaming=True)

# The "state" entries on both sides feed the transcript returned by one call
# back into the next one; the "textbox" output displays it.
demo = gr.Interface(
    fn=transcribe,
    inputs=[microphone, "state"],
    outputs=["textbox", "state"],
    live=True,
)
demo.launch()