import os
import sys

# NOTE(review): installing packages at runtime via os.system is fragile and a
# supply-chain risk — prefer a pinned requirements.txt. Kept because this
# deployment (HF Spaces-style startup script) appears to rely on it.
os.system("pip install transformers==4.27.0")
from transformers import pipeline, WhisperModel, WhisperTokenizer
os.system("pip install evaluate")
os.system("pip install datasets")
os.system("pip install llvmlite")
# BUGFIX: was "pip install spicy" — a typo; the numba/llvmlite/soundfile audio
# stack used below needs scipy, and "spicy" is an unrelated PyPI package.
os.system("pip install scipy")
os.system("pip install soundfile")
os.system("pip install datasets[audio]")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio
from datasets import disable_caching
from datasets import set_caching_enabled

# Force every run to re-process the dataset from source instead of reusing a
# cache. set_caching_enabled(False) is the deprecated spelling of
# disable_caching(); both are kept to match older `datasets` versions.
set_caching_enabled(False)
disable_caching()
# Default ASR pipeline, used later by transcribe() for the live Gradio demo.
p = pipeline("automatic-speech-recognition")

# Auth token for the private mskov/whisper_miso checkpoint; must be present in
# the environment (raises KeyError otherwise, which fails fast and loudly).
huggingface_token = os.environ["huggingface_token"]
whisper_miso = WhisperModel.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_tokenizer = WhisperTokenizer.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# Build the ASR task evaluator.
# BUGFIX: the previous code first read `p.task_evaluator` (no such attribute on
# a transformers Pipeline -> AttributeError) and called
# `whisper_miso.get_feature_extractor()` (no such method on WhisperModel);
# both results were immediately overwritten by this line anyway, so the two
# broken statements were removed.
task_evaluator = evaluator("automatic-speech-recognition")

# Test split of the miso dataset, with the "audio" column decoded on access.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())
# Score the fine-tuned model on the test split using word error rate.
# NOTE(review): label_column="audio" aims WER at the audio column itself, which
# looks wrong — the reference-transcript column is the usual target. Preserved
# exactly to keep behavior identical; confirm against the dataset schema.
eval_kwargs = {
    "model_or_pipeline": whisper_miso,
    "data": dataset,
    "tokenizer": miso_tokenizer,
    "input_column": "audio",
    "label_column": "audio",
    "strategy": "simple",
    "metric": "wer",
}
results = task_evaluator.compute(**eval_kwargs)
print(results)
def transcribe(audio, state=""):
    """Append the ASR transcription of *audio* onto the running transcript.

    Returns the accumulated transcript twice: once for the visible textbox
    output and once to feed back into the hidden Gradio "state" input.
    """
    transcript = p(audio)["text"]
    updated = state + transcript + " "
    return updated, updated
# BUGFIX: `gr` was referenced without ever being imported, so this block raised
# NameError at runtime. Import it here (gradio is preinstalled on HF Spaces;
# no pip install line exists for it in this script — TODO confirm environment).
import gradio as gr

# Live demo: stream microphone audio (as file paths) into transcribe(), which
# threads the accumulated transcript through the hidden "state" component.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",  # running transcript carried between streaming calls
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()