File size: 3,074 Bytes
de07127
 
e6f4bc9
5e24192
de07127
8a965da
 
de07127
8a965da
 
2996449
decc59e
664eb76
5625d5f
0b5b7f4
8a965da
 
3b57b43
3826e01
973bb39
877c079
14427e6
877c079
14427e6
d7388cd
62b683d
8a965da
877c079
 
15fe17a
 
bfbada5
 
877c079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4a4e02
 
74f5766
 
 
877c079
973bb39
74f5766
 
 
 
 
 
 
 
 
 
877c079
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import subprocess
import sys


def _pip_install(requirement):
    """Best-effort install of one pip requirement into the running interpreter.

    The install is launched as ``<this python> -m pip install <requirement>``
    with an argument list (no shell), so the package lands in the environment
    that is executing this script and specifiers such as ``datasets[audio]``
    are never subject to shell globbing.

    Args:
        requirement: A single pip requirement specifier, e.g. ``"jiwer"`` or
            ``"numba==0.51.2"``.

    Returns:
        The pip process exit status (0 on success).
    """
    status = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement]
    ).returncode
    if status != 0:
        # Stay best-effort (the script keeps going), but make the failure
        # visible instead of discarding the exit status.
        print(
            f"warning: pip install {requirement!r} exited with status {status}",
            file=sys.stderr,
        )
    return status


# Packages are installed at start-up, and each import is placed after the
# install that provides it, so the statement order below is significant.
_pip_install("transformers==4.27.0")
from transformers import pipeline, WhisperModel, WhisperTokenizer, AutoModelForCTC

# Previously tried and left disabled: evaluate[evaluator], llvmlite, spicy==1.8.1
for _requirement in (
    "evaluate",
    "datasets",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
):
    _pip_install(_requirement)
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled

# NOTE(review): both calls below appear to turn off the datasets cache;
# `set_caching_enabled` looks like the older spelling of `disable_caching` --
# confirm against the installed datasets version before dropping either.
set_caching_enabled(False)
disable_caching()

# Speech-recognition pipeline with no model named, so the library chooses the
# checkpoint. This is the object `transcribe` calls for the live demo.
p = pipeline("automatic-speech-recognition")

# Hub access token, read from the environment; a missing variable raises
# KeyError here, before any model download is attempted.
huggingface_token = os.environ["huggingface_token"]

# Model and tokenizer both come from the same Hub repository.
# NOTE(review): `WhisperModel` is loaded here rather than a generation-capable
# class -- confirm the evaluator can transcribe with it.
whisper_miso = WhisperModel.from_pretrained(
    "mskov/whisper_miso",
    use_auth_token=huggingface_token,
    device_map="auto",
)
miso_tokenizer = WhisperTokenizer.from_pretrained(
    "mskov/whisper_miso",
    use_auth_token=huggingface_token,
)
# (An alternative tokenizer source that was tried: "openai/whisper-small".)

task_evaluator = evaluator("automatic-speech-recognition")

# Test split of the Hub dataset, with its "audio" column cast to `Audio()`.
# Earlier attempts (now removed) loaded the individual .wav files through the
# "audiofolder" builder instead.
dataset = load_dataset("mskov/miso_test", split="test")
dataset = dataset.cast_column("audio", Audio())

# Word-error-rate evaluation settings.
# NOTE(review): `label_column` names the same column as `input_column`; WER
# presumably needs a reference-transcript column -- verify the dataset schema.
_wer_eval_kwargs = {
    "model_or_pipeline": whisper_miso,
    "data": dataset,
    "tokenizer": miso_tokenizer,
    "input_column": "audio",
    "label_column": "audio",
    "strategy": "simple",
    "metric": "wer",
}
results = task_evaluator.compute(**_wer_eval_kwargs)
print(results)


def transcribe(audio, state=""):
    """Transcribe one audio input and append it to the running transcript.

    Args:
        audio: Audio input forwarded unchanged to the module-level ASR
            pipeline ``p``.
        state: Transcript accumulated by previous calls; defaults to the
            empty string for the first call.

    Returns:
        A 2-tuple ``(state, state)`` holding the updated transcript twice:
        one copy for display and one to be passed back in as ``state`` on
        the next call.
    """
    text = p(audio)["text"]
    state += text + " "
    # Fixed: this line was `returnstate, state`, which evaluated an undefined
    # name (NameError) and never returned anything.
    return state, state

# `gr` is referenced below but was never imported in this script, so building
# the interface raised NameError. Import it here, next to its only use.
import gradio as gr

# Streaming microphone demo: each recorded chunk is handed to `transcribe` as a
# file path together with the "state" value, and the function's two return
# values feed the textbox and the next call's "state" respectively.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()