Spaces: Running on Zero
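The demo below loads both openai/whisper-large-v3 and openai/whisper-large-v3-turbo, transcribes the same recording with each model in parallel, and shows the two transcriptions next to their processing times so you can compare quality and speed side by side. Since this is a ZeroGPU Space, the GPU-bound functions are decorated with @spaces.GPU so that a device is allocated only while they run.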
import torch
import spaces
import gradio as gr
from transformers import pipeline
import concurrent.futures
import time

# Load both models
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_STANDARD = "openai/whisper-large-v3"

device = 0 if torch.cuda.is_available() else "cpu"

# Set up the pipeline for both models
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_standard = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_STANDARD,
    chunk_length_s=30,
    device=device,
)
# Function to transcribe audio using the turbo model
@spaces.GPU  # request a ZeroGPU device for the duration of this call
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time
# Function to transcribe audio using the standard model
@spaces.GPU
def transcribe_standard(audio):
    start_time = time.time()
    text_standard = pipe_standard(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_standard, elapsed_time
# Function to compare transcriptions and speed
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record audio before submitting your request.")
    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_standard = executor.submit(transcribe_standard, audio)
        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_standard, time_standard = future_standard.result()
    # Return a flat tuple matching the four output components of the click handler
    return text_standard, f"{time_standard:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"
css = """
h1 {
    text-align: center;
    display: block;
}
"""
# Gradio Interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo ...vs... Whisper large-v3")
    gr.Markdown("This app compares transcription quality and processing time between OpenAI's Whisper large-v3 and Whisper large-v3-turbo models.")
    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")
        with gr.Row():
            with gr.Group():
                gr.Markdown("### 🐢 **Standard model**")
                standard_output = gr.Textbox(label="Transcription")
                standard_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")
    # Set up the interaction
    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[standard_output, standard_time, turbo_output, turbo_time],
    )

# Launch the demo
demo.launch()
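If a single call might outlast ZeroGPU's default allocation window, the spaces package lets you request a longer slot through the decorator's duration argument. A minimal sketch, assuming the same pipe_turbo pipeline as above (the 120-second value and the transcribe_long name are illustrative, not part of this demo):

import spaces

@spaces.GPU(duration=120)  # hold the ZeroGPU allocation for up to 120 seconds
def transcribe_long(audio):
    # Same pipeline call as in transcribe_turbo; only the requested GPU window changes.
    return pipe_turbo(audio)["text"]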