File size: 1,921 Bytes
de07127 e6f4bc9 13cee50 de07127 46553a4 a8d3864 2996449 664eb76 7ce5d41 5625d5f 0b5b7f4 664eb76 3826e01 3b57b43 3826e01 973bb39 15fe17a 14427e6 13cee50 15fe17a d7388cd 15fe17a 4ef4640 62b683d 15fe17a 62b683d 15fe17a dcf6504 15fe17a e4a4e02 74f5766 973bb39 74f5766 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import os
import subprocess
import sys


def _pip_install(requirement):
    """Install *requirement* with the pip that belongs to this interpreter.

    Running ``python -m pip`` through ``sys.executable`` guarantees the
    package lands in the environment this script is executing in, and
    passing an argument list (no shell) keeps specifiers such as
    ``datasets[audio]`` from being interpreted as shell glob patterns.

    Args:
        requirement: A pip requirement specifier, e.g. ``"numba==0.51.2"``.

    Returns:
        The exit status of the pip process (0 on success).
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        # Installation stays best-effort (the script keeps going), but a
        # failure is now reported instead of being silently discarded.
        print(
            f"warning: pip install {requirement} exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode


# transformers is pinned and imported before the remaining packages are
# installed; that ordering is kept as it was.
_pip_install("transformers==4.27.0")
from transformers import pipeline, WhisperModel, WhisperTokenizer

# Remaining runtime dependencies, installed in their original order.
# NOTE(review): "spicy" looks like a typo for "scipy" -- confirm before
# changing; the requirement is left untouched here.
# Two numpy pins (1.21.4 and 1.22.1) were tried earlier and remain disabled.
for _requirement in (
    "evaluate",
    "datasets",
    "llvmlite",
    "spicy",
    "soundfile",
    "datasets[audio]",
    "numba==0.51.2",
):
    _pip_install(_requirement)

from evaluate import evaluator
from datasets import Audio, disable_caching, load_dataset, set_caching_enabled

# Turn dataset caching off. Both calls are kept from the original script;
# NOTE(review): set_caching_enabled is the older spelling of
# disable_caching in `datasets` -- confirm it still exists in the installed
# version, since the install above is unpinned.
set_caching_enabled(False)
disable_caching()

from transformers import AutoConfig
# `evaluate.load` needs the package itself; the file only imports the
# `evaluator` name from it, so the module is imported here.
import evaluate

# Word-error-rate metric used to score the transcriptions below.
metric = evaluate.load("wer")

# Hub access token; a KeyError is raised if the variable is not set.
huggingface_token = os.environ["huggingface_token"]

# Build the pipeline from the checkpoint id so transformers resolves the
# generation-capable Whisper class, the tokenizer and the feature extractor
# together. A bare `WhisperModel` has no text-generation head, and a pipeline
# handed an already-instantiated model cannot infer a feature extractor.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="mskov/whisper_miso",
    use_auth_token=huggingface_token,
)

# Keep the module-level names the script previously defined.
whisper_miso = asr_pipeline.model
miso_tokenizer = asr_pipeline.tokenizer

# Load the test split and decode audio at the rate the feature extractor
# expects, so the pipeline does not have to resample each clip itself.
dataset = load_dataset("mskov/miso_test", split="test").cast_column(
    "audio",
    Audio(sampling_rate=asr_pipeline.feature_extractor.sampling_rate),
)


def _find_reference_column(column_names):
    """Return the name of the column holding the reference transcripts.

    The dataset schema is not visible from this script, so a few common
    transcript column names are tried in order.
    NOTE(review): confirm the real column name against the dataset card.

    Args:
        column_names: Column names of the loaded dataset.

    Returns:
        The first candidate name present in ``column_names``.

    Raises:
        ValueError: If none of the candidate names is present.
    """
    for candidate in ("sentence", "text", "transcription", "transcript"):
        if candidate in column_names:
            return candidate
    raise ValueError(
        "no transcript column found; available columns: "
        + ", ".join(column_names)
    )


reference_column = _find_reference_column(dataset.column_names)

# Transcribe one example at a time: the pipeline takes the decoded audio
# dict of a single row, not the whole Dataset object.
results = []
references = []
for sample in dataset:
    results.append(asr_pipeline(sample["audio"])["text"])
    references.append(sample[reference_column])

# WER compares predicted strings against reference strings.
wer = metric.compute(predictions=results, references=references)
print(wer)
def transcribe(audio, state=""):
    """Transcribe one audio chunk and append it to the running transcript.

    Args:
        audio: Audio input forwarded to ``asr_pipeline``; the Gradio
            interface in this file supplies it with ``type="filepath"``.
            ``None`` (no audio captured yet) leaves the transcript as is.
        state: Transcript accumulated by earlier calls.

    Returns:
        A 2-tuple containing the updated transcript twice: once for the
        textbox output and once for the state carried to the next call.
    """
    if state is None:
        state = ""
    if audio is None:
        # Nothing to transcribe; hand back the transcript unchanged.
        return state, state
    # `asr_pipeline` is the speech-recognition pipeline built earlier in this
    # file (the previous body referenced an undefined name `p`).
    text = asr_pipeline(audio)["text"]
    state += text + " "
    return state, state
# `gr` is used below but was never imported anywhere in this file.
import gradio as gr

# Live demo: microphone audio is streamed to `transcribe`, and the "state"
# input/output pair carries the accumulated transcript between calls.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()
|