import torch
import spaces
import gradio as gr
from transformers import pipeline
import concurrent.futures
import time

# Load both models
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_BASE = "openai/whisper-large-v3"

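# For the transformers pipeline, an integer device index selects that CUDA GPU (0 = first GPU);
# the string "cpu" falls back to CPU inference.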
device = 0 if torch.cuda.is_available() else "cpu"

# Set up the pipeline for both models
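# chunk_length_s=30 enables chunked long-form transcription: audio longer than 30 seconds
# is split into 30-second windows whose transcriptions are merged back together.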
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_base = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_BASE,
    chunk_length_s=30,
    device=device,
)

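# The @spaces.GPU decorator (from the Hugging Face `spaces` package) requests ZeroGPU
# hardware for the duration of each decorated call when the app runs on Spaces.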
# Function to transcribe audio using the turbo model
@spaces.GPU
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time

# Function to transcribe audio using the base model
@spaces.GPU
def transcribe_base(audio):
    start_time = time.time()
    text_base = pipe_base(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_base, elapsed_time

# Function to compare transcriptions and speed
@spaces.GPU
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record audio before submitting your request.")

    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_base = executor.submit(transcribe_base, audio)

        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_base, time_base = future_base.result()

    # Return both transcriptions and processing times as a flat tuple,
    # matching the order of the Gradio outputs list: [base_output, base_time, turbo_output, turbo_time]
    return text_base, f"{time_base:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"

css = """
h1 {
    text-align: center;
    display:block;
}
"""

# Gradio Interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
    gr.Markdown("This app compares the transcription performance and processing time between openAI Whisper large-v3-turbo and the its Base model Whisper large-v3")

    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")

        with gr.Row():
            with gr.Group():
                gr.Markdown("### 📝 **Base model**")
                base_output = gr.Textbox(label="Transcription")
                base_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")

    # Set up the interaction
    transcribe_button.click(fn=compare_transcriptions, inputs=audio_input, outputs=[base_output, base_time, turbo_output, turbo_time])

# Launch the demo
demo.launch()