"""Smoke-test the ``mskov/whisper_miso`` Whisper checkpoint on one sample.

The script installs its runtime extras, loads the model and feature extractor
from the Hugging Face Hub (authenticated with the ``huggingface_token``
environment variable), runs a single forward pass on the first example of the
``mskov/miso_test`` test split, and prints the shape of the resulting
``last_hidden_state``.
"""
import os
import subprocess
import sys

import torch
from transformers import (
    pipeline,
    WhisperModel,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    AutoFeatureExtractor,
    AutoProcessor,
    WhisperConfig,
)

# Install through the running interpreter so the packages land in the same
# environment this script imports from (a bare ``pip`` on PATH may belong to
# a different Python). Passing an argument list also keeps the shell from
# interpreting the brackets in ``datasets[audio]``. ``check=False`` keeps the
# original best-effort behaviour: a failed install does not abort the script.
for requirement in ("jiwer", "datasets[audio]"):
    subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )

# These imports intentionally follow the installs above.
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled

MODEL_ID = "mskov/whisper_miso"
DATASET_ID = "mskov/miso_test"
SAMPLING_RATE = 16000

# NOTE(review): both calls are kept from the original; ``set_caching_enabled``
# appears to be the older spelling of ``disable_caching`` -- confirm against
# the installed ``datasets`` version before dropping either one.
set_caching_enabled(False)
disable_caching()

# Raises KeyError if the variable is not set.
huggingface_token = os.environ["huggingface_token"]

model = WhisperModel.from_pretrained(MODEL_ID, use_auth_token=huggingface_token)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_ID, use_auth_token=huggingface_token
)

ds = load_dataset(DATASET_ID, split="test").cast_column(
    "audio", Audio(sampling_rate=SAMPLING_RATE)
)

# Index the dataset once and reuse the decoded sample.
sample = ds[0]
print(ds, "and at 0 ", sample)

# Pass the sampling rate explicitly so the extractor can check it against its
# own configuration instead of silently assuming the audio matches.
inputs = feature_extractor(
    sample["audio"]["array"],
    sampling_rate=SAMPLING_RATE,
    return_tensors="pt",
)
print("check check")
print(inputs)

input_features = inputs.input_features
# A (1, 2) tensor filled with the decoder start token id, used to prime the
# decoder for this single forward pass.
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

# Only the output shape is inspected, so gradients are not needed.
with torch.no_grad():
    last_hidden_state = model(
        input_features, decoder_input_ids=decoder_input_ids
    ).last_hidden_state

print(list(last_hidden_state.shape))