# test / app.py
# Author: mskov -- last commit "Update app.py" (5892c2d), file size 3.3 kB
import os
import subprocess
import sys


def _pip_install(requirement):
    """Install one requirement into the interpreter that is running this script.

    The install is best-effort: a failing install is reported on stderr but
    does not abort the script, so a single bad pin cannot stop start-up.

    Args:
        requirement: A pip requirement specifier, e.g. ``"torch"`` or
            ``"transformers==4.27.0"``.

    Returns:
        The exit code of the pip process (0 on success).
    """
    # `sys.executable -m pip` guarantees the package lands in this
    # interpreter's environment; a bare `pip` on PATH may belong to another.
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            "warning: pip install failed for",
            requirement,
            "(exit code %d)" % completed.returncode,
            file=sys.stderr,
        )
    return completed.returncode


# Runtime dependencies, installed in this order before anything is imported.
_REQUIREMENTS = (
    "transformers==4.27.0",
    "torch",
    "openai",
    "accelerate",
    "evaluate",
    # "evaluate[evaluator]",  # disabled in the original script
    "datasets",
    # "llvmlite",  # disabled in the original script
    # NOTE(review): "spicy" looks like a typo for "scipy" -- confirm before
    # changing the pin, since it alters what gets installed.
    "spicy==1.8.1",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
)

for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# These imports must come after the installs above, hence not at file top.
from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor  # noqa: E402
from evaluate import evaluator  # noqa: E402
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled  # noqa: E402

# Turn the datasets cache off. NOTE(review): set_caching_enabled(False) appears
# to be the older spelling of disable_caching(); one of the two is likely
# redundant -- confirm against the installed datasets version.
set_caching_enabled(False)
disable_caching()
# Access token used for every download from the model repository below.
# Raises KeyError at start-up when the environment variable is not set.
huggingface_token = os.environ["huggingface_token"]

# Repository that provides the tokenizer, feature extractor and model weights.
_MODEL_ID = "mskov/whisper_miso"

# WhisperModel is the bare encoder-decoder without the language-model head,
# so it cannot generate transcriptions; speech recognition needs the
# conditional-generation variant.
from transformers import WhisperForConditionalGeneration  # noqa: E402

miso_tokenizer = WhisperTokenizer.from_pretrained(
    _MODEL_ID, use_auth_token=huggingface_token
)
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained(
    _MODEL_ID, use_auth_token=huggingface_token
)
whisper_miso = WhisperForConditionalGeneration.from_pretrained(
    _MODEL_ID, use_auth_token=huggingface_token, device_map="auto"
)

# Speech-recognition pipeline called by transcribe() under the name `p`;
# that name was referenced in this file but never defined.
p = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,
)

task_evaluator = evaluator("automatic-speech-recognition")

# Test split of the evaluation set, with the "audio" column cast to Audio().
# (Earlier revisions loaded individual .wav files through the "audiofolder"
# loader instead.)
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())

# Compute the word error rate of the model on the test split.
# NOTE(review): label_column is the same column as input_column ("audio").
# A WER reference is normally a text column; the dataset schema is not
# visible here, so confirm the name of the transcription column.
results = task_evaluator.compute(
    model_or_pipeline=whisper_miso,  # earlier alternative: "mskov/whisper-small.en"
    data=dataset,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,
    input_column="audio",
    label_column="audio",
    strategy="simple",
    metric="wer",
)
print(results)
def transcribe(audio, state=""):
    """Transcribe one audio input and append the text to the running transcript.

    Args:
        audio: Handed unchanged to the module-level callable ``p``. The
            interface in this file passes a file path (``type="filepath"``).
        state: Transcript accumulated by previous calls; starts as ``""``.

    Returns:
        A ``(transcript, transcript)`` tuple: the updated transcript twice,
        once for the "textbox" output and once for the "state" output.
    """
    # NOTE(review): `p` must exist at module level before this runs and must
    # return a mapping with a "text" key (presumably a transformers
    # speech-recognition pipeline -- confirm).
    text = p(audio)["text"]
    state += text + " "
    return state, state
# `gr` is used below; without this import the script fails with NameError.
import gradio as gr  # noqa: E402

# Microphone demo around transcribe(): the "state" input and output carry the
# accumulated transcript from one call to the next, and "textbox" displays it.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()