import torch
import gradio as gr
from transformers import pipeline
import concurrent.futures
import time

# Model identifiers for the two Whisper variants being compared
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_STANDARD = "openai/whisper-large-v3"

device = 0 if torch.cuda.is_available() else "cpu"

# Set up an ASR pipeline for each model
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_standard = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_STANDARD,
    chunk_length_s=30,
    device=device,
)

# Transcribe audio with the turbo model, returning the text and elapsed time
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time

# Transcribe audio with the standard model, returning the text and elapsed time
def transcribe_standard(audio):
    start_time = time.time()
    text_standard = pipe_standard(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_standard, elapsed_time

# Run both models on the same audio and compare transcriptions and speed
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record audio before submitting your request.")

    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_standard = executor.submit(transcribe_standard, audio)

        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_standard, time_standard = future_standard.result()

    # Return a flat tuple matching the four Gradio output components
    return text_standard, f"{time_standard:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"

css = """
h1 {
    text-align: center;
    display: block;
}
"""

# Gradio interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo ...vs... Whisper large-v3")
    gr.Markdown(
        "This app compares transcription quality and processing time between "
        "OpenAI's Whisper large-v3 and Whisper large-v3-turbo models."
    )

    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")
        with gr.Row():
            with gr.Group():
                gr.Markdown("### 📝 **Standard model**")
                standard_output = gr.Textbox(label="Transcription")
                standard_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")

    # Wire the button to the comparison function
    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[standard_output, standard_time, turbo_output, turbo_time],
    )

# Launch the demo
demo.launch()