File size: 3,168 Bytes
de07127
 
e6f4bc9
5e24192
de07127
89ae3df
45b990b
de07127
46553a4
f9bc809
2996449
decc59e
664eb76
7ce5d41
5625d5f
 
0b5b7f4
664eb76
3826e01
3b57b43
 
3826e01
973bb39
877c079
14427e6
877c079
14427e6
d7388cd
62b683d
877c079
 
 
15fe17a
 
bfbada5
 
877c079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4a4e02
 
74f5766
 
 
877c079
973bb39
74f5766
 
 
 
 
 
 
 
 
 
877c079
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os 
import sys 
os.system("pip install transformers==4.27.0")
from transformers import pipeline, WhisperModel, WhisperTokenizer, AutoModelForCTC
os.system("pip install evaluate")
import evaluate
os.system("pip install evaluate[evaluator]")
os.system("pip install datasets")
os.system("pip install llvmlite")
os.system("pip install spicy>=1.7.1")
os.system("pip install soundfile")
os.system("pip install jiwer")
os.system("pip install datasets[audio]")
#os.system("pip install numpy==1.21.4")
#os.system("pip install numpy==1.22.1")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio
from datasets import disable_caching
from datasets import set_caching_enabled
set_caching_enabled(False)
disable_caching()

p = pipeline("automatic-speech-recognition")

#config = AutoConfig.from_pretrained('whisper-small')

huggingface_token = os.environ["huggingface_token"]

whisper_miso=WhisperModel.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
#miso_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", use_auth_token=huggingface_token)


task_evaluator = evaluator("automatic-speech-recognition")

#url = {"test" : "https://huggingface.co/datasets/mskov/miso_test/blob/main/test_set.parquet"}
#data = load_dataset("audiofolder", data_dir="mskov/miso_test")
# data = load_dataset("audiofolder", data_files=["datasets/mskov/miso_test/test_set/and.wav","mskov/miso_test/test_set/chew1.wav","mskov/miso_test/test_set/chew3.wav", "mskov/miso_test/test_set/chew3.wav","mskov/miso_test/test_set/chew4.wav","mskov/miso_test/test_set/cough1.wav","mskov/miso_test/test_set/cough2.wav","mskov/miso_test/test_set/cough3.wav","mskov/miso_test/test_set/hi.wav","mskov/miso_test/test_set/knock_knock.wav","mskov/miso_test/test_set/mouth_sounds1.wav","mskov/miso_test/test_set/mouth_sounds2.wav","mskov/miso_test/test_set/no.wav","mskov/miso_test/test_set/not_bad.wav","mskov/miso_test/test_set/oh_i_wish.wav","mskov/miso_test/test_set/pop1.wav","mskov/miso_test/test_set/really.wav","mskov/miso_test/test_set/sigh1.wav","mskov/miso_test/test_set/sigh2.wav","mskov/miso_test/test_set/slurp1.wav","mskov/miso_test/test_set/slurp2.wav","mskov/miso_test/test_set/sneeze1.wav","mskov/miso_test/test_set/sneeze2.wav","mskov/miso_test/test_set/so_i_did_it_again.wav"])
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())
results = task_evaluator.compute(
    model_or_pipeline=whisper_miso,
    #model_or_pipeline="mskov/whisper-small.en",
    data=dataset,
    tokenizer=miso_tokenizer,
    input_column="audio",
    label_column="audio",
    # device=None,
    strategy="simple",
    metric="wer",
)
print(results)


def transcribe(audio, state=""):
    text = p(audio)["text"]
    state += text + " "
    returnstate, state

gr.Interface(
    fn=transcribe, 
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True), 
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True).launch()