Spaces:

mskov
/

test

Runtime error

App Files Files Community

test / app.py

mskov

Update app.py

b494685 about 1 year ago

raw

history blame

2.71 kB

	import os
	import sys
	# Dependencies are installed at start-up by shelling out to pip, and each
	# install is placed before the import that needs it, so the statement order
	# in this preamble matters. The exit status of every os.system call is
	# discarded, so a failed install only shows up as an ImportError below.
	os.system("pip install transformers==4.27.0")
	os.system("pip install numpy==1.23")
	from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor, AutoFeatureExtractor, AutoProcessor, WhisperConfig
	os.system("pip install jiwer")
	from jiwer import wer
	os.system("pip install datasets[audio]")
	# NOTE(review): nothing in this file installs `evaluate`; presumably the
	# hosting environment already provides it -- confirm.
	from evaluate import evaluator
	from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
	# NOTE(review): `gradio` is not installed here either -- presumably provided
	# by the hosting environment.
	import gradio as gr

	# Turn off the datasets cache. A single call is enough: disable_caching()
	# replaces the deprecated set_caching_enabled(False) and has the same effect.
	disable_caching()

	# The token is only referenced by the disabled experiments kept in string
	# literals in this file, so a missing secret should not stop the demo from
	# starting; it is None when the variable is unset.
	huggingface_token = os.environ.get("huggingface_token")

	# No task is passed, so the pipeline type is resolved from the model repo.
	pipe = pipeline(model="mskov/whisper-small-esc50")
	print(pipe)
	# Disabled: loading the bare model, feature extractor and tokenizer with the
	# token above. Kept as an unused string literal, so it never runs.
	'''
	model = WhisperModel.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
	feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
	miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
	'''
	# Test split with the "audio" column cast to 16 kHz.
	dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))

	# Debug dump of the first sample. The sample is fetched once and reused,
	# rather than indexing dataset[0] three times for the same values.
	first_audio = dataset[0]["audio"]
	first_array = first_audio["array"]
	print(dataset, "and at 0[audio][array] ", first_array, type(first_array), "and at audio : ", first_audio)

	def transcribe(audio):
	    """Return the transcription of one audio clip.

	    Args:
	        audio: Value handed over by the Gradio audio input; the interface
	            in this file is configured with ``type="filepath"``. ``None``
	            is accepted and means there is nothing to transcribe.

	    Returns:
	        The ``"text"`` entry of the pipeline result, or an empty string
	        when ``audio`` is ``None``.
	    """
	    if audio is None:
	        # Nothing was recorded; skip the model call instead of letting the
	        # pipeline raise on a None input.
	        return ""
	    return pipe(audio)["text"]

	# Microphone input; type="filepath" makes the callback receive a file path.
	mic_input = gr.Audio(source="microphone", type="filepath")

	# Single-input, single-output demo wired to transcribe().
	iface = gr.Interface(
	    fn=transcribe,
	    inputs=mic_input,
	    outputs="text",
	    title="Whisper Small Miso Test",
	)

	iface.launch()
	# Everything below is disabled code kept in bare string literals: the
	# strings are never assigned or used, so none of it runs.
	#
	# Disabled experiment 1: run the feature extractor and tokenizer on the
	# first dataset sample.
	# NOTE(review): `feature_extractor` and `miso_tokenizer` are only created
	# inside another disabled string literal, and `features` is not defined
	# anywhere in the visible file -- these would need fixing before re-enabling.
	'''
	inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
	print("inputs ::: ", inputs, "and dataset type for good measure: ", type(dataset))
	tempDataset = dataset[0]["audio"]["array"].tostring()
	tokenized_dataset = miso_tokenizer(tempDataset) # Tokenize the dataset

	input_ids = features.input_ids
	attention_mask = features.attention_mask
	'''
	# Disabled experiment 2: compute a word error rate with jiwer's `wer`.
	# NOTE(review): relies on `torch`, `tokenizer`, `model`, `input_ids` and
	# `attention_mask`, none of which exist in the live code (`torch` is not
	# imported in the visible file). The body of the `with` statement has lost
	# its indentation, and `labels` is read from the "audio" column, which the
	# inline comment itself marks as a placeholder.
	'''
	# Evaluate the model
	model.eval()
	with torch.no_grad():
	outputs = model(input_ids=input_ids, attention_mask=attention_mask)

	# Convert predicted token IDs back to text
	predicted_text = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)

	# Get ground truth labels from the dataset
	labels = dataset["audio"] # Replace "labels" with the appropriate key in your dataset

	# Compute WER
	wer_score = wer(labels, predicted_text)

	# Print or return WER score
	print(f"Word Error Rate (WER): {wer_score}")
	'''
	# Disabled experiment 3: forward pass through `model` and print the shape of
	# its last hidden state.
	# NOTE(review): depends on `inputs` and `model` from the disabled code above
	# and on `torch`, which is not imported in the visible file.
	'''
	print("check check")
	print(inputs)
	input_features = inputs.input_features
	decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
	last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
	list(last_hidden_state.shape)
	print(list(last_hidden_state.shape))
	'''