"""Smoke test for the ``mskov/whisper_miso`` Whisper checkpoint.

Installs the runtime dependencies, loads the model and its feature
extractor from the Hugging Face Hub, runs one ESC50 test sample through the
model and prints the shape of the resulting last hidden state.

Requires the ``huggingface_token`` environment variable (a KeyError is
raised if it is missing).
"""
import os
import subprocess
import sys


def _pip_install(requirement):
    """Best-effort ``pip install`` of *requirement* into this interpreter.

    Uses ``sys.executable -m pip`` so the package lands in the environment
    that is actually running this script, and passes an argument list so no
    shell is involved. A failed install is reported on stderr but does not
    abort the script, matching the original fire-and-forget behaviour.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if result.returncode != 0:
        print(
            f"warning: pip install {requirement!r} exited with "
            f"status {result.returncode}",
            file=sys.stderr,
        )


# Packages that must be present before transformers/torch can be imported.
for _requirement in (
    "transformers==4.27.0",
    "torch",
    "openai",
    "accelerate",
):
    _pip_install(_requirement)

import torch  # noqa: E402  (import must follow the install above)
from transformers import (  # noqa: E402
    AutoFeatureExtractor,
    WhisperFeatureExtractor,
    WhisperModel,
    WhisperTokenizer,
    pipeline,
)

# Packages needed for dataset loading, audio decoding and evaluation.
for _requirement in (
    "evaluate",
    "datasets",
    "scipy==1.8.1",  # was "spicy==1.8.1": a typo for SciPy
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
):
    _pip_install(_requirement)

from evaluate import evaluator  # noqa: E402
from datasets import (  # noqa: E402
    Audio,
    disable_caching,
    load_dataset,
    set_caching_enabled,
)

# Turn off the datasets cache; both calls are kept from the original script.
set_caching_enabled(False)
disable_caching()

huggingface_token = os.environ["huggingface_token"]

model = WhisperModel.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "mskov/whisper_miso", use_auth_token=huggingface_token
)

# NOTE(review): the original id was "mskov.ESC50"; Hub ids are written
# "namespace/name", so the dot looked like a typo for "/". Confirm the repo.
ds = load_dataset("mskov/ESC50", split="test")
# Resample on decode so the audio matches the rate the extractor expects.
ds = ds.cast_column(
    "audio", Audio(sampling_rate=feature_extractor.sampling_rate)
)

sample = ds[0]["audio"]
inputs = feature_extractor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)
input_features = inputs.input_features

# Two decoder positions, both set to the decoder start token.
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

# Inference only: no gradients are needed.
with torch.no_grad():
    last_hidden_state = model(
        input_features, decoder_input_ids=decoder_input_ids
    ).last_hidden_state

print(list(last_hidden_state.shape))