Spaces:

adriszmar
/

whisper-large-v3-turbo-vs-base-model

Running on Zero

whisper-large-v3-turbo-vs-base-model / app.py

adrian-saez-martinez

naming base model

6046e53 24 days ago

3.13 kB

	import torch
	import spaces
	import gradio as gr
	from transformers import pipeline
	import concurrent.futures
	import time

	# Checkpoints under comparison: the turbo model and the large-v3 model,
	# which this app refers to as the "base" model.
	MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
	MODEL_NAME_base = "openai/whisper-large-v3"

	# Use CUDA device index 0 when CUDA is available; otherwise fall back to CPU.
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = "cpu"

	# Build one speech-recognition pipeline per checkpoint. Both share the same
	# task, chunk length and device, so only the model differs between them.
	pipe_turbo = pipeline(
	    model=MODEL_NAME_TURBO,
	    task="automatic-speech-recognition",
	    device=device,
	    chunk_length_s=30,
	)

	pipe_base = pipeline(
	    model=MODEL_NAME_base,
	    task="automatic-speech-recognition",
	    device=device,
	    chunk_length_s=30,
	)

	# Function to transcribe audio using the turbo model
	@spaces.GPU
	def transcribe_turbo(audio):
	    """Transcribe ``audio`` with the turbo pipeline and time the call.

	    Args:
	        audio: Input forwarded unchanged to ``pipe_turbo``.

	    Returns:
	        A ``(text, elapsed_seconds)`` tuple: the ``"text"`` entry of the
	        pipeline result and the duration of the pipeline call in seconds.
	    """
	    # perf_counter() is a monotonic, high-resolution clock, so the measured
	    # interval cannot be distorted by system clock adjustments the way a
	    # difference of time.time() readings can.
	    start_time = time.perf_counter()
	    text_turbo = pipe_turbo(audio)["text"]
	    elapsed_time = time.perf_counter() - start_time
	    return text_turbo, elapsed_time

	# Function to transcribe audio using the base model
	@spaces.GPU
	def transcribe_base(audio):
	    """Transcribe ``audio`` with the base pipeline and time the call.

	    Args:
	        audio: Input forwarded unchanged to ``pipe_base``.

	    Returns:
	        A ``(text, elapsed_seconds)`` tuple: the ``"text"`` entry of the
	        pipeline result and the duration of the pipeline call in seconds.
	    """
	    # perf_counter() is a monotonic, high-resolution clock, so the measured
	    # interval cannot be distorted by system clock adjustments the way a
	    # difference of time.time() readings can.
	    start_time = time.perf_counter()
	    text_base = pipe_base(audio)["text"]
	    elapsed_time = time.perf_counter() - start_time
	    return text_base, elapsed_time

	# Function to compare transcriptions and speed
	@spaces.GPU
	def compare_transcriptions(audio):
	    """Transcribe ``audio`` with both models and report text and timing.

	    Args:
	        audio: The submitted audio, or ``None`` when nothing was submitted.

	    Returns:
	        A flat 4-tuple ``(base_text, base_time, turbo_text, turbo_time)``.
	        Each time is a string such as ``"1.23 seconds"``. The order matches
	        the four output components wired to the button's click handler.

	    Raises:
	        gr.Error: If ``audio`` is ``None``.
	    """
	    if audio is None:
	        raise gr.Error("No audio file submitted! Please record an audio before submitting your request.")

	    # Run both transcriptions in parallel
	    # NOTE(review): because the two models run concurrently, each reported
	    # time presumably includes contention with the other model - confirm
	    # this is what the comparison is meant to show.
	    with concurrent.futures.ThreadPoolExecutor() as executor:
	        future_turbo = executor.submit(transcribe_turbo, audio)
	        future_base = executor.submit(transcribe_base, audio)

	        # Get the results; .result() re-raises any exception from the worker.
	        text_turbo, time_turbo = future_turbo.result()
	        text_base, time_base = future_base.result()

	    # Return one value per output component. The handler is wired to four
	    # outputs, so the values must be a flat sequence of four items rather
	    # than two nested (text, time) pairs.
	    return (
	        text_base,
	        f"{time_base:.2f} seconds",
	        text_turbo,
	        f"{time_turbo:.2f} seconds",
	    )

	# Custom stylesheet passed to gr.Blocks(css=...): centers h1 headings and
	# displays them as block elements.
	css = """
	h1 {
	text-align: center;
	display:block;
	}
	"""

	# Gradio Interface
	with gr.Blocks(css=css) as demo:
	    # Title and description
	    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
	    gr.Markdown("This app compares the transcription performance and processing time between openAI Whisper large-v3-turbo and its Base model Whisper large-v3")

	    with gr.Column():
	        # Input area: a microphone recording, delivered to the callback as a
	        # file path, plus the button that starts the comparison.
	        with gr.Row():
	            with gr.Group():
	                audio_input = gr.Audio(sources=["microphone"], type="filepath")
	                transcribe_button = gr.Button("Start transcription", variant="primary")

	        # Result area: one group per model, each with a transcription box
	        # and a processing-time box.
	        # NOTE(review): the inner Row looks redundant; it is kept so the
	        # layout stays as originally written - confirm before removing.
	        with gr.Row():
	            with gr.Row():
	                with gr.Group():
	                    gr.Markdown("### 📝 Base model")
	                    base_output = gr.Textbox(label="Transcription")
	                    base_time = gr.Textbox(label="Processing Time")
	                with gr.Group():
	                    gr.Markdown("### ⚡ Turbo model")
	                    turbo_output = gr.Textbox(label="Transcription")
	                    turbo_time = gr.Textbox(label="Processing Time")

	    # Set up the interaction. The order of `outputs` defines which returned
	    # value of compare_transcriptions lands in which component.
	    transcribe_button.click(
	        fn=compare_transcriptions,
	        inputs=audio_input,
	        outputs=[base_output, base_time, turbo_output, turbo_time],
	    )

	# Launch the demo
	demo.launch()