File size: 1,924 Bytes
de07127
 
e6f4bc9
13cee50
de07127
 
46553a4
a8d3864
2996449
664eb76
7ce5d41
5625d5f
 
0b5b7f4
664eb76
3826e01
3b57b43
 
3826e01
973bb39
15fe17a
 
 
13cee50
15fe17a
d7388cd
15fe17a
4ef4640
62b683d
15fe17a
 
 
 
 
 
62b683d
15fe17a
dcf6504
15fe17a
 
 
 
 
 
 
e4a4e02
 
74f5766
 
 
 
973bb39
74f5766
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Evaluation setup for the "mskov/whisper_miso" speech-recognition checkpoint:
# install the runtime dependencies, load the model, transcribe the test split
# of "mskov/miso_test" and print the resulting word error rate (WER).
import os
import subprocess
import sys

# Installed at start-up, in this order, before the third-party imports below.
# NOTE(review): "spicy" is kept unchanged; it is presumably a typo for
# "scipy" -- confirm before renaming it.
_RUNTIME_PACKAGES = (
    "transformers==4.27.0",
    "evaluate",
    "datasets",
    "llvmlite",
    "spicy",
    "soundfile",
    "datasets[audio]",
    "numba==0.51.2",
)


def _pip_install(package):
    """Install *package* into the environment of the running interpreter.

    The install is best-effort: a failure is reported on stderr but does not
    abort the script.
    """
    # `sys.executable -m pip` guarantees the package lands in the interpreter
    # that is executing this script, which a bare `pip` on PATH does not.
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", package], check=False
    )
    if completed.returncode != 0:
        print(f"warning: pip install {package} failed", file=sys.stderr)


for _package in _RUNTIME_PACKAGES:
    _pip_install(_package)

from datasets import Audio, disable_caching, load_dataset  # noqa: E402
from evaluate import evaluator  # noqa: E402,F401
from transformers import (  # noqa: E402,F401
    AutoConfig,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperModel,
    WhisperTokenizer,
    pipeline,
)

disable_caching()


def _word_edit_distance(reference_words, hypothesis_words):
    """Return the Levenshtein distance between two word sequences."""
    previous = list(range(len(hypothesis_words) + 1))
    for i, reference_word in enumerate(reference_words, start=1):
        current = [i]
        for j, hypothesis_word in enumerate(hypothesis_words, start=1):
            substitution = previous[j - 1] + (reference_word != hypothesis_word)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def _word_error_rate(predictions, references):
    """Return the corpus-level WER of *predictions* against *references*.

    Both arguments are equally long sequences of strings, split on
    whitespace. The result is the total number of word edits divided by the
    total number of reference words.

    Raises:
        ValueError: if the sequences differ in length or the references
            contain no words at all.
    """
    if len(predictions) != len(references):
        raise ValueError("predictions and references must have the same length")
    errors = 0
    reference_word_count = 0
    for prediction, reference in zip(predictions, references):
        reference_words = reference.split()
        errors += _word_edit_distance(reference_words, prediction.split())
        reference_word_count += len(reference_words)
    if reference_word_count == 0:
        raise ValueError("references contain no words")
    return errors / reference_word_count


# Load the Whisper model, tokenizer and feature extractor.
# WhisperForConditionalGeneration carries the language-model head needed to
# generate text; the bare WhisperModel cannot produce transcripts.
huggingface_token = os.environ["huggingface_token"]
whisper_miso = WhisperForConditionalGeneration.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_tokenizer = WhisperTokenizer.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
miso_feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# Initialize the automatic-speech-recognition pipeline. The feature extractor
# is passed explicitly because it cannot be inferred from a model object.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_tokenizer,
    feature_extractor=miso_feature_extractor,
)

# Load the dataset, decoding audio at the sampling rate the feature extractor
# expects so that no resampling is needed inside the pipeline.
dataset = load_dataset("mskov/miso_test", split="test").cast_column(
    "audio", Audio(sampling_rate=miso_feature_extractor.sampling_rate)
)

# NOTE(review): the dataset schema is not visible here. This assumes exactly
# one string-typed column holds the reference transcripts -- confirm, or
# hard-code the column name.
_text_columns = [
    name
    for name, feature in dataset.features.items()
    if getattr(feature, "dtype", None) == "string"
]
if len(_text_columns) != 1:
    raise ValueError(
        "expected exactly one text column with reference transcripts in "
        f"mskov/miso_test, found {_text_columns}"
    )

# Compute the evaluation results: transcribe clip by clip, then compare the
# transcripts (not the audio) with the reference texts.
results = []
for _sample in dataset:
    _audio = _sample["audio"]
    results.append(
        asr_pipeline(
            {"raw": _audio["array"], "sampling_rate": _audio["sampling_rate"]}
        )
    )
wer = _word_error_rate(
    [result["text"] for result in results], dataset[_text_columns[0]]
)
print(wer)



def transcribe(audio, state=""):
    """Transcribe one audio input and append the text to the transcript.

    Args:
        audio: Input passed to ``asr_pipeline``. The interface in this file
            is configured with ``type="filepath"``, so this is a file path.
            ``None`` means there is nothing to transcribe.
        state: Transcript accumulated by earlier calls.

    Returns:
        The updated transcript twice, matching the two outputs of the
        interface (the textbox and the carried-over state).
    """
    if audio is None:
        # No audio for this call; leave the transcript untouched.
        return state, state
    text = asr_pipeline(audio)["text"]
    state += text + " "
    return state, state

# Imported here because nothing earlier in the file brings gradio into scope
# under the `gr` name used below.
import gradio as gr

# Live demo: stream microphone audio to `transcribe` as file paths and show
# the running transcript, carrying it between calls through the "state" slot.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()