import os

# Install runtime dependencies at startup (this Space installs them here rather
# than via requirements.txt)
os.system("pip install transformers==4.27.0")
os.system("pip install evaluate")
os.system("pip install datasets[audio]")
os.system("pip install llvmlite")
os.system("pip install scipy")
os.system("pip install soundfile")
os.system("pip install jiwer")
os.system("pip install numba==0.51.2")

import evaluate
import gradio as gr
from transformers import (
    pipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperFeatureExtractor,
)
from datasets import load_dataset, Audio, disable_caching

# Disable dataset caching so the test split is always loaded fresh
disable_caching()
metric = evaluate.load("wer")
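# Illustrative sanity check of the WER metric on toy strings (not part of the
# evaluation itself): identical prediction and reference should give a WER of 0.0
assert metric.compute(predictions=["hello world"], references=["hello world"]) == 0.0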
# Load the fine-tuned Whisper model, tokenizer, and feature extractor.
# Whisper is a sequence-to-sequence model, so WhisperForConditionalGeneration is
# used rather than a CTC class (this assumes the mskov/whisper_miso repo also
# contains the tokenizer and preprocessor files).
huggingface_token = os.environ["huggingface_token"]
whisper_miso = WhisperForConditionalGeneration.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)

# Initialize the automatic-speech-recognition pipeline with the Whisper model,
# tokenizer, and feature extractor
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,
)
# Load the test split and decode the audio column at Whisper's expected 16 kHz
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
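# Illustrative sanity check (assumes the test split has at least one row and the
# decoded "audio" column cast above): transcribe a single example before running
# the full evaluation
sample = dataset[0]["audio"]
print("Sample transcription:", asr_pipeline(sample)["text"])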
# Run the pipeline over the test set and compute the word error rate (WER).
# The reference transcriptions are assumed to live in a "text" column of
# mskov/miso_test.
results = asr_pipeline(dataset["audio"])
predictions = [result["text"] for result in results]
wer = metric.compute(predictions=predictions, references=dataset["text"])
print("WER:", wer)
# Streaming transcription callback for the Gradio interface: transcribe the
# incoming audio chunk and append it to the running transcript held in `state`
def transcribe(audio, state=""):
    text = asr_pipeline(audio)["text"]
    state += text + " "
    return state, state
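# Live Gradio interface: streaming microphone input plus a state component that
# accumulates the transcript across chunks and echoes it to a textbox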
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()