File size: 1,309 Bytes
de07127
 
e6f4bc9
79f46c8
256b607
2f1b912
5892c2d
de07127
8a965da
 
de07127
8a965da
79f46c8
2996449
decc59e
664eb76
5625d5f
0b5b7f4
8a965da
 
3b57b43
3826e01
973bb39
14427e6
75e29dc
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os 
import sys 
os.system("pip install transformers==4.27.0")
os.system("pip install torch")
os.system("pip install openai")
os.system("pip install accelerate")
from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor
os.system("pip install evaluate")
#import evaluate
#os.system("pip install evaluate[evaluator]")
os.system("pip install datasets")
# os.system("pip install llvmlite")
os.system("pip install spicy==1.8.1")
os.system("pip install soundfile")
os.system("pip install jiwer")
os.system("pip install datasets[audio]")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled

set_caching_enabled(False)
disable_caching()


model = WhisperModel.from_pretrained("mskov/whisper_miso")
feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper_miso")
ds = load_dataset("mskov.ESC50", split="test")
inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
list(last_hidden_state.shape)
print(list(last_hidden_state.shape))