|
import os
import subprocess
import sys

# Requirements installed at start-up, in the order the script has always
# installed them.
_RUNTIME_REQUIREMENTS = (
    "transformers==4.27.0",
    "numpy==1.23",
    "jiwer",
    "datasets[audio]",
)


def _pip_install(requirement):
    """Install ``requirement`` with the pip of the running interpreter.

    Returns ``True`` when pip exits with status 0.  A failure is reported on
    stderr rather than raised, so a package that is already importable still
    lets the script continue, as it did with the previous ``os.system`` calls.
    """
    # ``sys.executable -m pip`` targets this interpreter's environment, and
    # the argument list bypasses the shell, so ``datasets[audio]`` is never
    # treated as a glob pattern.
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: 'pip install {requirement}' exited with status "
            f"{completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode == 0


# Finish every install before the first third-party import so that no package
# is replaced on disk after it has already been loaded.
for _requirement in _RUNTIME_REQUIREMENTS:
    _pip_install(_requirement)

from transformers import (
    pipeline,
    WhisperModel,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    AutoFeatureExtractor,
    AutoProcessor,
    WhisperConfig,
)
from jiwer import wer
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
import gradio as gr
|
|
|
# Turn the `datasets` cache off through both helpers imported above.
# NOTE(review): the two calls look redundant with each other -- confirm
# against the installed `datasets` version before dropping either one.
set_caching_enabled(False)
disable_caching()

# Raises KeyError at start-up when the variable is not set.
# NOTE(review): in the visible file only the disabled snippets reference this
# token; the pipeline below is built without it.
huggingface_token = os.environ["huggingface_token"]

# Speech-recognition pipeline used by `transcribe`.
_ASR_MODEL_ID = "mskov/whisper-small-esc50"
pipe = pipeline(model=_ASR_MODEL_ID)
print(pipe)
|
# Disabled experiment, kept for reference only (it does not run): load the
# "mskov/whisper-small-miso" model, feature extractor and tokenizer
# individually, authenticating with `huggingface_token`.
#
# model = WhisperModel.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
# feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
# miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
|
# Load the "test" split and cast its "audio" column to
# Audio(sampling_rate=16000).
_raw_test_split = load_dataset("mskov/miso_test", split="test")
dataset = _raw_test_split.cast_column("audio", Audio(sampling_rate=16000))

# Debug dump: the dataset summary, then the first sample's waveform, the
# waveform's type, and the full "audio" entry.
_first_audio = dataset[0]["audio"]
_first_waveform = _first_audio["array"]
print(
    dataset,
    "and at 0[audio][array] ",
    _first_waveform,
    type(_first_waveform),
    "and at audio : ",
    _first_audio,
)
|
|
|
def transcribe(audio):
    """Transcribe one recording with the module-level ``pipe``.

    Args:
        audio: Value handed over by the Gradio audio input; the interface in
            this file configures it with ``type="filepath"``.  ``None`` is
            accepted and means that nothing was recorded.

    Returns:
        The ``"text"`` entry of the pipeline result, or ``""`` when ``audio``
        is ``None``.
    """
    # Gradio passes None when the form is submitted without a recording;
    # skip the pipeline instead of letting it fail on a missing input.
    if audio is None:
        return ""
    return pipe(audio)["text"]
|
|
|
# Microphone input configured to hand `transcribe` a file path.
_microphone_input = gr.Audio(source="microphone", type="filepath")

# Single-function UI: microphone recording in, transcribed text out.
iface = gr.Interface(
    fn=transcribe,
    inputs=_microphone_input,
    outputs="text",
    title="Whisper Small Miso Test",
)

# Start serving the interface.
iface.launch()
|
# ---------------------------------------------------------------------------
# Disabled experiments, kept for reference only.  None of this runs.
# NOTE(review): `features`, `tokenizer` and `torch` are not defined or
# imported anywhere in the visible file, so these snippets would need work
# before being re-enabled.
# ---------------------------------------------------------------------------

# -- Feature extraction / tokenisation of the first sample --
# inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
# print("inputs ::: ", inputs, "and dataset type for good measure: ", type(dataset))
# tempDataset = dataset[0]["audio"]["array"].tostring()
# tokenized_dataset = miso_tokenizer(tempDataset)  # Tokenize the dataset
#
# input_ids = features.input_ids
# attention_mask = features.attention_mask

# -- Word-error-rate evaluation --
# # Evaluate the model
# model.eval()
# with torch.no_grad():
#     outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#
# # Convert predicted token IDs back to text
# predicted_text = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
#
# # Get ground truth labels from the dataset
# labels = dataset["audio"]  # Replace "labels" with the appropriate key in your dataset
#
# # Compute WER
# wer_score = wer(labels, predicted_text)
#
# # Print or return WER score
# print(f"Word Error Rate (WER): {wer_score}")

# -- Print the shape of the model's last hidden state --
# print("check check")
# print(inputs)
# input_features = inputs.input_features
# decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
# last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
# list(last_hidden_state.shape)
# print(list(last_hidden_state.shape))