Spaces: Running on Zero
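The example below is a complete app for a ZeroGPU Space. It records audio from the microphone and transcribes it with openai/whisper-large-v3-turbo and openai/whisper-large-v3 in parallel, showing each model's transcription and processing time side by side. Because the Space runs on ZeroGPU, the GPU-dependent work is wrapped in a function decorated with @spaces.GPU.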
import torch
import spaces
import gradio as gr
from transformers import pipeline
import concurrent.futures
import time
# Load both models
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_BASE = "openai/whisper-large-v3"

device = 0 if torch.cuda.is_available() else "cpu"
# Set up the pipeline for both models
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_base = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_BASE,
    chunk_length_s=30,
    device=device,
)
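# Note: both pipelines are created once at startup. On ZeroGPU hardware the
# GPU itself is attached only while a @spaces.GPU-decorated function is running.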
# Function to transcribe audio using the turbo model
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time

# Function to transcribe audio using the base model
def transcribe_base(audio):
    start_time = time.time()
    text_base = pipe_base(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_base, elapsed_time
# Function to compare transcriptions and speed.
# On ZeroGPU, a GPU is attached only while this @spaces.GPU-decorated
# function is running.
@spaces.GPU
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record audio before submitting your request.")

    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_base = executor.submit(transcribe_base, audio)
        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_base, time_base = future_base.result()

    # Return a flat tuple: one value per output component, in the same order
    # as the outputs list wired up below
    return text_base, f"{time_base:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"
css = """ | |
h1 { | |
text-align: center; | |
display:block; | |
} | |
""" | |
# Gradio Interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
    gr.Markdown(
        "This app compares transcription quality and processing time between "
        "OpenAI's Whisper large-v3-turbo and its base model, Whisper large-v3."
    )
    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")
        with gr.Row():
            with gr.Group():
                gr.Markdown("### 🐢 **Base model**")
                base_output = gr.Textbox(label="Transcription")
                base_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")
    # Set up the interaction: outputs map in order to the values returned by the function
    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[base_output, base_time, turbo_output, turbo_time],
    )

# Launch the demo
demo.launch()
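For reference, here is the ZeroGPU pattern from the app above in isolation. This is a minimal sketch, not part of the demo: the function name is a placeholder, and the optional duration argument is shown only to illustrate the @spaces.GPU API.

import spaces
import torch
from transformers import pipeline

# The model is loaded once at startup, outside any GPU-decorated function.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=0 if torch.cuda.is_available() else "cpu",
)

# On ZeroGPU, a GPU is attached only for the duration of calls to functions
# decorated with @spaces.GPU. A maximum duration in seconds can optionally
# be requested, e.g. @spaces.GPU(duration=120).
@spaces.GPU
def transcribe(audio_path):
    return pipe(audio_path)["text"]

The same code runs unchanged on regular GPU or CPU Spaces, where the decorator has no effect.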