"""Evaluate the ``mskov/whisper_miso`` checkpoint and serve a live ASR demo.

Running this script:

1. installs its Python dependencies at start-up (best effort),
2. loads the ``mskov/whisper_miso`` model and tokenizer using the access token
   read from the ``huggingface_token`` environment variable,
3. computes the word error rate (``wer``) on the ``test`` split of the
   ``mskov/miso_test`` dataset and prints the result,
4. launches a Gradio interface that transcribes streamed microphone audio
   with the default ``automatic-speech-recognition`` pipeline.
"""

import os
import subprocess
import sys


def _pip_install(requirement):
    """Install one pip requirement into the interpreter running this script.

    The exit status is deliberately ignored so that a failed install does not
    abort start-up, matching the fire-and-forget behaviour of the previous
    ``os.system("pip install ...")`` calls.
    """
    # List form (no shell): characters such as ``>`` or ``[`` in a requirement
    # specifier are passed to pip verbatim instead of being interpreted by the
    # shell as redirections or glob patterns.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )


# Installed one by one, in this order, before any third-party import so the
# pinned versions are the ones that actually get loaded.
_REQUIREMENTS = (
    "transformers==4.27.0",
    "evaluate",
    "evaluate[evaluator]",
    "datasets",
    "llvmlite",
    # Was "spicy>=1.7.1" inside a shell string: the shell treated ">" as a
    # redirect, so an unpinned "spicy" was installed and a stray file named
    # "=1.7.1" was written. The 1.7.1 pin matches scipy, which is presumably
    # what was intended.
    "scipy>=1.7.1",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    # "numpy==1.21.4",
    # "numpy==1.22.1",
    "numba==0.51.2",
)

for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# These imports must stay below the installs above.
import evaluate
import gradio as gr  # previously missing although gr.Interface / gr.Audio are used
from datasets import Audio, disable_caching, load_dataset
from evaluate import evaluator
from transformers import AutoModelForCTC, WhisperModel, WhisperTokenizer, pipeline

# ``set_caching_enabled(False)`` was the deprecated spelling of the same
# switch; ``disable_caching()`` alone is sufficient.
disable_caching()

# Default ASR pipeline; this is what the live demo below uses.
p = pipeline("automatic-speech-recognition")

# config = AutoConfig.from_pretrained('whisper-small')
# Raises KeyError when the variable is not set.
huggingface_token = os.environ["huggingface_token"]

# NOTE(review): WhisperModel is the bare model without a generation head, and
# no feature extractor is handed to the evaluator below. The ASR evaluator
# probably needs WhisperForConditionalGeneration plus a feature extractor
# (or a ready-made pipeline) -- confirm before relying on the score.
whisper_miso = WhisperModel.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_tokenizer = WhisperTokenizer.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
# miso_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", use_auth_token=huggingface_token)

task_evaluator = evaluator("automatic-speech-recognition")

# Earlier revisions loaded the individual .wav clips of mskov/miso_test
# through the "audiofolder" builder; the hub dataset's "test" split is used
# instead.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())

results = task_evaluator.compute(
    model_or_pipeline=whisper_miso,
    # model_or_pipeline="mskov/whisper-small.en",
    data=dataset,
    tokenizer=miso_tokenizer,
    input_column="audio",
    # NOTE(review): the label column is the same as the input column. WER
    # compares predicted text against reference text, so this should most
    # likely name the dataset's transcript column -- confirm its name.
    label_column="audio",
    # device=None,
    strategy="simple",
    metric="wer",
)
print(results)


def transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Args:
        audio: Audio input forwarded unchanged to the ASR pipeline ``p``; the
            interface below is configured with ``type="filepath"``.
        state: Transcript accumulated by previous calls.

    Returns:
        A ``(transcript, transcript)`` pair: the first element feeds the
        textbox output, the second is stored as the new state.
    """
    text = p(audio)["text"]
    state += text + " "
    # Was ``returnstate, state``: a NameError at call time and no return value.
    return state, state


gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()