import os

# Runtime dependency installs (quick-and-dirty, as in a hosted Space)
os.system("pip install transformers==4.27.0")
os.system("pip install evaluate")
os.system("pip install datasets[audio]")
os.system("pip install llvmlite")
os.system("pip install scipy")
os.system("pip install soundfile")
# os.system("pip install numpy==1.21.4")
# os.system("pip install numpy==1.22.1")
os.system("pip install numba==0.51.2")
os.system("pip install gradio")

import evaluate
import gradio as gr
from datasets import load_dataset, Audio, disable_caching
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor

disable_caching()

metric = evaluate.load("wer")

# Load the fine-tuned Whisper model and its processor (tokenizer + feature extractor).
# WhisperForConditionalGeneration (not the bare WhisperModel) is required here:
# the ASR pipeline needs the language-modeling head to generate transcriptions.
huggingface_token = os.environ["huggingface_token"]
whisper_miso = WhisperForConditionalGeneration.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_processor = WhisperProcessor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# Initialize the automatic-speech-recognition pipeline with the Whisper model,
# tokenizer, and feature extractor
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_processor.tokenizer,
    feature_extractor=miso_processor.feature_extractor,
)

# Load the test split and decode the audio column
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())

# Transcribe each example and score the predictions against the reference
# transcripts with the WER metric loaded above.
# NOTE: this assumes the dataset stores its reference text in a "text" column;
# adjust the column name to match mskov/miso_test if it differs.
predictions = [asr_pipeline(sample["audio"])["text"] for sample in dataset]
references = dataset["text"]
wer = metric.compute(predictions=predictions, references=references)
print(wer)

def transcribe(audio, state=""):
    # Append each streamed chunk's transcription to the running state
    text = asr_pipeline(audio)["text"]
    state += text + " "
    return state, state

gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()
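
# The original script also imports `evaluator` from evaluate without using it.
# As an alternative to the manual loop above, the task evaluator can run the
# pipeline over the dataset and compute WER in one call. A minimal sketch,
# assuming the reference transcripts live in a "text" column (the actual
# column name in mskov/miso_test may differ):
#
# from evaluate import evaluator
#
# task_evaluator = evaluator("automatic-speech-recognition")
# eval_results = task_evaluator.compute(
#     model_or_pipeline=asr_pipeline,
#     data=dataset,
#     input_column="audio",   # column holding the decoded audio
#     label_column="text",    # assumption: adjust to the dataset's transcript column
#     metric="wer",
# )
# print(eval_results)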