|
import os
import subprocess
import sys

# Requirements installed at start-up, in this order, before the third-party
# imports below are executed.
_RUNTIME_REQUIREMENTS = (
    "transformers==4.27.0",
    "evaluate",
    "datasets",
    "llvmlite",
    # NOTE(review): the original command installed "spicy", presumably a typo
    # for scipy -- confirm.
    "scipy",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
)


def _pip_install(requirement):
    """Install one requirement into the interpreter running this script.

    pip is invoked as ``sys.executable -m pip`` with an argument list, so the
    package lands in the environment that will import it and specifiers such
    as ``datasets[audio]`` are never interpreted by a shell.

    Installation stays best-effort, as it was with ``os.system``: a failure
    is reported on stderr but does not stop the script.

    Args:
        requirement: A pip requirement specifier, e.g. ``"numba==0.51.2"``.

    Returns:
        pip's exit status (0 on success).
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: 'pip install {requirement}' exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode


for _requirement in _RUNTIME_REQUIREMENTS:
    _pip_install(_requirement)

# These imports must come after the installs above, hence the E402 waivers.
import evaluate  # noqa: E402
from datasets import Audio, disable_caching, load_dataset  # noqa: E402
from evaluate import evaluator  # noqa: E402
from transformers import (  # noqa: E402
    AutoConfig,
    AutoModelForCTC,
    WhisperModel,
    WhisperTokenizer,
    pipeline,
)

# `set_caching_enabled` is deprecated in favour of `disable_caching` and may
# not be exported by newer `datasets` releases; keep the name available when
# it exists, but rely on `disable_caching()` to actually turn caching off.
try:
    from datasets import set_caching_enabled  # noqa: E402
except ImportError:
    set_caching_enabled = None

disable_caching()
|
|
|
|
|
MODEL_ID = "mskov/whisper_miso"
DATASET_ID = "mskov/miso_test"

# NOTE(review): the name of the transcript column in the test dataset is not
# visible from this script; these are common names tried in order -- confirm
# against the dataset card.
_REFERENCE_COLUMN_CANDIDATES = ("sentence", "text", "transcription", "transcript")


def _find_reference_column(column_names):
    """Return the first known transcript column present in *column_names*.

    Args:
        column_names: Iterable of column names of the evaluation dataset.

    Returns:
        The matching column name.

    Raises:
        ValueError: If none of the candidate names is present.
    """
    available = list(column_names)
    for candidate in _REFERENCE_COLUMN_CANDIDATES:
        if candidate in available:
            return candidate
    raise ValueError(
        "No reference transcript column found; looked for "
        f"{_REFERENCE_COLUMN_CANDIDATES} in {available}"
    )


# Word error rate metric from the `evaluate` hub.
metric = evaluate.load("wer")

# Access token for the model repository; a missing variable raises KeyError.
huggingface_token = os.environ["huggingface_token"]

# Let the pipeline resolve model, tokenizer and feature extractor from the
# repository id. The original loaded both "model" and "tokenizer" through
# AutoModelForCTC, so the tokenizer was actually a second model, and no
# feature extractor was supplied for the pre-built model object.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_ID,
    use_auth_token=huggingface_token,
)

# Kept for backward compatibility with code that refers to these names.
whisper_miso = asr_pipeline.model
miso_tokenizer = asr_pipeline.tokenizer

# Decode audio at the sampling rate the model's feature extractor expects so
# the pipeline does not have to resample each clip itself.
dataset = load_dataset(DATASET_ID, split="test").cast_column(
    "audio",
    Audio(sampling_rate=asr_pipeline.feature_extractor.sampling_rate),
)

# Transcribe clip by clip; each pipeline call returns a dict with a "text" key.
results = [
    asr_pipeline(
        {
            "raw": sample["audio"]["array"],
            "sampling_rate": sample["audio"]["sampling_rate"],
        }
    )
    for sample in dataset
]
predictions = [result["text"] for result in results]

# WER compares predicted strings with reference transcripts, not audio data.
references = dataset[_find_reference_column(dataset.column_names)]

wer = metric.compute(predictions=predictions, references=references)
print(wer)
|
|
|
|
|
|
|
def transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Args:
        audio: Audio input accepted by ``asr_pipeline`` (the interface below
            passes a file path), or ``None`` when no audio was captured.
        state: Transcript accumulated so far; ``None`` is treated as empty.

    Returns:
        A ``(transcript, transcript)`` tuple: the same updated transcript is
        returned twice, once for the textbox output and once for the state.
    """
    if state is None:
        state = ""
    # Nothing to transcribe: return the transcript unchanged instead of
    # handing None to the pipeline.
    if audio is None:
        return state, state
    # The original called an undefined name `p`; the module-level pipeline
    # is `asr_pipeline`.
    text = asr_pipeline(audio)["text"]
    state += text + " "
    return state, state
|
|
|
# The script uses `gr` but never imported it; import it here so the interface
# definition below can run.
import gradio as gr

# Live microphone demo: each streamed audio chunk is passed to `transcribe`
# together with the session state, and the returned transcript is shown in a
# textbox and stored back into the state.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()
|
|