import concurrent.futures
import time

import gradio as gr
import spaces
import torch
from transformers import pipeline
# Model names for the two Whisper variants being compared
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_BASE = "openai/whisper-large-v3"
device = 0 if torch.cuda.is_available() else "cpu"
# Set up the pipeline for both models
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)
pipe_base = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_BASE,
    chunk_length_s=30,
    device=device,
)
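# Note: the @spaces.GPU decorator below comes from the `spaces` package; on a
# ZeroGPU Space it requests a GPU allocation for the duration of each decorated call.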
# Function to transcribe audio using the turbo model
@spaces.GPU
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time
# Function to transcribe audio using the base model
@spaces.GPU
def transcribe_base(audio):
    start_time = time.time()
    text_base = pipe_base(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_base, elapsed_time
# Function to compare transcriptions and speed
@spaces.GPU
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record audio before submitting your request.")

    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_base = executor.submit(transcribe_base, audio)

        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_base, time_base = future_base.result()

    # Return both transcriptions and processing times, one value per output component
    return text_base, f"{time_base:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"
css = """
h1 {
    text-align: center;
    display: block;
}
"""
# Gradio Interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
    gr.Markdown(
        "This app compares the transcription output and processing time of "
        "OpenAI's Whisper large-v3-turbo against its base model, Whisper large-v3."
    )
    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")
        with gr.Row():
            with gr.Group():
                gr.Markdown("### 📝 **Base model**")
                base_output = gr.Textbox(label="Transcription")
                base_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")
    # Set up the interaction
    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[base_output, base_time, turbo_output, turbo_time],
    )
# Launch the demo
demo.launch()