import os
import sys
os.system("pip install transformers==4.27.0")
from transformers import pipeline, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor
os.system("pip install evaluate")
#import evaluate
#os.system("pip install evaluate[evaluator]")
os.system("pip install datasets")
# os.system("pip install llvmlite")
# os.system("pip install spicy==1.8.1")
os.system("pip install soundfile")
os.system("pip install jiwer")
os.system("pip install datasets[audio]")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching
import gradio as gr  # needed for gr.Interface / gr.Audio in the live demo below

# Turn off datasets caching so each run re-reads the test set
# (disable_caching replaces the deprecated set_caching_enabled(False))
disable_caching()
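# Off-the-shelf ASR pipeline (default checkpoint); used only by the live Gradio demo at the bottom of the file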
p = pipeline("automatic-speech-recognition")
#config = AutoConfig.from_pretrained('whisper-small')
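# Hugging Face auth token, read from the environment, for loading mskov/whisper_miso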
huggingface_token = os.environ["huggingface_token"]
# The evaluator needs a model with a generation head, so WhisperForConditionalGeneration
# is used here rather than the bare WhisperModel encoder-decoder.
whisper_miso = WhisperForConditionalGeneration.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token, device_map="auto")
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
#miso_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", use_auth_token=huggingface_token)
task_evaluator = evaluator("automatic-speech-recognition")
#url = {"test" : "https://huggingface.co/datasets/mskov/miso_test/blob/main/test_set.parquet"}
#data = load_dataset("audiofolder", data_dir="mskov/miso_test")
# data = load_dataset("audiofolder", data_files=["datasets/mskov/miso_test/test_set/and.wav","mskov/miso_test/test_set/chew1.wav","mskov/miso_test/test_set/chew3.wav", "mskov/miso_test/test_set/chew3.wav","mskov/miso_test/test_set/chew4.wav","mskov/miso_test/test_set/cough1.wav","mskov/miso_test/test_set/cough2.wav","mskov/miso_test/test_set/cough3.wav","mskov/miso_test/test_set/hi.wav","mskov/miso_test/test_set/knock_knock.wav","mskov/miso_test/test_set/mouth_sounds1.wav","mskov/miso_test/test_set/mouth_sounds2.wav","mskov/miso_test/test_set/no.wav","mskov/miso_test/test_set/not_bad.wav","mskov/miso_test/test_set/oh_i_wish.wav","mskov/miso_test/test_set/pop1.wav","mskov/miso_test/test_set/really.wav","mskov/miso_test/test_set/sigh1.wav","mskov/miso_test/test_set/sigh2.wav","mskov/miso_test/test_set/slurp1.wav","mskov/miso_test/test_set/slurp2.wav","mskov/miso_test/test_set/sneeze1.wav","mskov/miso_test/test_set/sneeze2.wav","mskov/miso_test/test_set/so_i_did_it_again.wav"])
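# Load the held-out test split and cast the audio column to the Audio feature so files are decoded on access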
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())
results = task_evaluator.compute(
    model_or_pipeline=whisper_miso,
    #model_or_pipeline="mskov/whisper-small.en",
    data=dataset,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,  # lets the evaluator build an ASR pipeline from the raw model
    input_column="audio",
    label_column="text",  # assumed name of the reference-transcription column; WER needs text labels, not audio
    # device=None,
    strategy="simple",
    metric="wer",
)
print(results)
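# Live microphone demo: each streamed chunk is transcribed and appended to the running transcript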
def transcribe(audio, state=""):
    text = p(audio)["text"]
    state += text + " "
    return state, state
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True).launch()