File size: 3,168 Bytes
de07127 e6f4bc9 5e24192 de07127 89ae3df 45b990b de07127 46553a4 f9bc809 2996449 decc59e 664eb76 7ce5d41 5625d5f 0b5b7f4 664eb76 3826e01 3b57b43 3826e01 973bb39 877c079 14427e6 877c079 14427e6 d7388cd 62b683d 877c079 15fe17a bfbada5 877c079 e4a4e02 74f5766 877c079 973bb39 74f5766 877c079 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import os
import sys

# NOTE(review): installing dependencies at runtime via os.system is fragile
# (exit codes are ignored); a requirements.txt would be preferable on Spaces.
os.system("pip install transformers==4.27.0")
from transformers import pipeline, WhisperModel, WhisperTokenizer, AutoModelForCTC
os.system("pip install evaluate")
import evaluate
os.system("pip install evaluate[evaluator]")
os.system("pip install datasets")
os.system("pip install llvmlite")
# Fix: was "pip install spicy>=1.7.1" — "spicy" is a typo for "scipy", and the
# unquoted ">=" was interpreted by the shell as an output redirection to a file
# named "=1.7.1", so pip only ever saw the bare (wrong) package name. Quoting
# the requirement lets pip receive the full specifier.
os.system("pip install 'scipy>=1.7.1'")
os.system("pip install soundfile")
os.system("pip install jiwer")
os.system("pip install datasets[audio]")
#os.system("pip install numpy==1.21.4")
#os.system("pip install numpy==1.22.1")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio
from datasets import disable_caching
from datasets import set_caching_enabled

# Fix: `gr` is used below (gr.Interface / gr.Audio) but was never imported.
# gradio is preinstalled on HF Spaces — TODO confirm for other environments.
import gradio as gr

# Disable datasets caching so every run re-reads/re-processes the audio.
# NOTE(review): set_caching_enabled() is the deprecated alias of
# disable_caching(); calling both is redundant but harmless.
set_caching_enabled(False)
disable_caching()
# Default ASR pipeline — used by `transcribe` below for live microphone input
# (downloads the library's default ASR checkpoint on first call).
p = pipeline("automatic-speech-recognition")
#config = AutoConfig.from_pretrained('whisper-small')
# Auth token for the private "mskov" repos; raises KeyError if the env var is unset.
huggingface_token = os.environ["huggingface_token"]
# NOTE(review): WhisperModel is the bare encoder/decoder without the generation
# head; ASR evaluation typically needs WhisperForConditionalGeneration —
# confirm that evaluator.compute() accepts this model class.
whisper_miso=WhisperModel.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
#miso_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", use_auth_token=huggingface_token)
# Evaluator that wires the model + dataset + metric together for ASR.
task_evaluator = evaluator("automatic-speech-recognition")
#url = {"test" : "https://huggingface.co/datasets/mskov/miso_test/blob/main/test_set.parquet"}
#data = load_dataset("audiofolder", data_dir="mskov/miso_test")
# data = load_dataset("audiofolder", data_files=["datasets/mskov/miso_test/test_set/and.wav","mskov/miso_test/test_set/chew1.wav","mskov/miso_test/test_set/chew3.wav", "mskov/miso_test/test_set/chew3.wav","mskov/miso_test/test_set/chew4.wav","mskov/miso_test/test_set/cough1.wav","mskov/miso_test/test_set/cough2.wav","mskov/miso_test/test_set/cough3.wav","mskov/miso_test/test_set/hi.wav","mskov/miso_test/test_set/knock_knock.wav","mskov/miso_test/test_set/mouth_sounds1.wav","mskov/miso_test/test_set/mouth_sounds2.wav","mskov/miso_test/test_set/no.wav","mskov/miso_test/test_set/not_bad.wav","mskov/miso_test/test_set/oh_i_wish.wav","mskov/miso_test/test_set/pop1.wav","mskov/miso_test/test_set/really.wav","mskov/miso_test/test_set/sigh1.wav","mskov/miso_test/test_set/sigh2.wav","mskov/miso_test/test_set/slurp1.wav","mskov/miso_test/test_set/slurp2.wav","mskov/miso_test/test_set/sneeze1.wav","mskov/miso_test/test_set/sneeze2.wav","mskov/miso_test/test_set/so_i_did_it_again.wav"])
# Load the test split and decode the "audio" column into array/sampling-rate dicts.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())
# Compute word error rate (WER) of the fine-tuned model over the test split.
results = task_evaluator.compute(
model_or_pipeline=whisper_miso,
#model_or_pipeline="mskov/whisper-small.en",
data=dataset,
tokenizer=miso_tokenizer,
input_column="audio",
# NOTE(review): label_column="audio" makes the references the audio objects
# themselves, not transcripts — WER against audio looks wrong; verify the
# intended label column (e.g. a "text"/"transcription" column).
label_column="audio",
# device=None,
strategy="simple",
metric="wer",
)
print(results)
def transcribe(audio, state=""):
    """Transcribe `audio` with the module-level ASR pipeline `p` and append
    the result to the running transcript.

    Parameters:
        audio: filepath to an audio chunk (Gradio streams these from the mic).
        state: accumulated transcript so far (Gradio state component).

    Returns:
        (new_state, new_state) — one copy for the visible textbox output and
        one to feed back into the state component.
    """
    text = p(audio)["text"]
    state += text + " "
    # Fix: original read `returnstate, state` (missing space after `return`),
    # which is a bare tuple expression referencing the undefined name
    # `returnstate` — a NameError at runtime and the function returned None.
    return state, state
# Live Gradio UI: microphone audio streams into `transcribe`, which keeps
# the accumulated transcript in the `state` component and shows it in a textbox.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=["textbox", "state"],
    live=True,
)
demo.launch()