"""Smoke test for the ``mskov/whisper_miso`` Whisper checkpoint.

The script installs its runtime dependencies, loads the checkpoint and its
feature extractor from the Hugging Face Hub, pushes the first example of the
``mskov/ESC50`` test split through the model, and prints the shape of the
decoder's last hidden state.

The Hub access token is read from the ``huggingface_token`` environment
variable; a ``KeyError`` is raised when it is not set.
"""
import os
import subprocess
import sys

# Packages are installed at start-up, in the same order the script has always
# used. NOTE: "scipy" was previously misspelled as "spicy".
_REQUIREMENTS = (
    "transformers==4.27.0",
    "torch",
    "openai",
    "accelerate",
    "evaluate",
    "datasets",
    "scipy==1.8.1",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
)


def _pip_install(requirement):
    """Install *requirement* with pip into the running interpreter.

    Invoking ``sys.executable -m pip`` targets the interpreter that is running
    this script, and passing an argument list avoids going through a shell.
    A failed install is reported on stderr but does not abort the script,
    matching the previous best-effort ``os.system`` behaviour.
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: 'pip install {requirement}' exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )


for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# Third-party imports must come after the installs above, because the packages
# may not exist in the environment before this point.
import torch  # noqa: E402
from datasets import Audio, disable_caching, load_dataset  # noqa: E402
from evaluate import evaluator  # noqa: E402
from transformers import (  # noqa: E402
    AutoFeatureExtractor,
    WhisperFeatureExtractor,
    WhisperModel,
    WhisperTokenizer,
    pipeline,
)

# ``set_caching_enabled(False)`` used to be called here as well. It is the
# deprecated spelling of ``disable_caching()`` (and, presumably, is no longer
# exported by recent ``datasets`` releases -- TODO confirm), so one call is kept.
disable_caching()

huggingface_token = os.environ["huggingface_token"]

model = WhisperModel.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# Hub repository ids are written "<namespace>/<name>"; the id was previously
# written with a dot ("mskov.ESC50").
ds = load_dataset("mskov/ESC50", split="test")

# NOTE(review): no sampling rate is passed to the feature extractor, so the
# audio is presumably expected to already be at the rate the extractor was
# configured for -- confirm, or resample with ``ds.cast_column("audio", Audio(...))``.
inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
input_features = inputs.input_features

# Seed the decoder with two start-of-sequence tokens.
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

# Inference only: skip building the autograd graph.
with torch.no_grad():
    last_hidden_state = model(
        input_features, decoder_input_ids=decoder_input_ids
    ).last_hidden_state

print(list(last_hidden_state.shape))