import torch
import spaces
import gradio as gr
from transformers import pipeline
import concurrent.futures
import time

# Load both models
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_BASE = "openai/whisper-large-v3"

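# For the transformers pipeline, an integer device index selects that CUDA GPU (0 = first GPU);
# the string "cpu" falls back to CPU inference.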
device = 0 if torch.cuda.is_available() else "cpu"

# Set up the pipeline for both models
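# chunk_length_s=30 enables chunked long-form transcription: audio longer than 30 seconds
# is split into 30-second windows whose transcriptions are merged back together.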
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_base = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_BASE,
    chunk_length_s=30,
    device=device,
)

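# The @spaces.GPU decorator (from the Hugging Face `spaces` package) requests ZeroGPU
# hardware for the duration of each decorated call when the app runs on Spaces.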
# Function to transcribe audio using the turbo model
@spaces.GPU
def transcribe_turbo(audio):
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time

# Function to transcribe audio using the base model
@spaces.GPU
def transcribe_base(audio):
    start_time = time.time()
    text_base = pipe_base(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_base, elapsed_time

# Function to compare transcriptions and speed
@spaces.GPU
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record audio before submitting your request.")

    # Run both transcriptions in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_base = executor.submit(transcribe_base, audio)

        # Get the results
        text_turbo, time_turbo = future_turbo.result()
        text_base, time_base = future_base.result()

    # Return both transcriptions and processing times as a flat tuple,
    # matching the order of the Gradio outputs list: [base_output, base_time, turbo_output, turbo_time]
    return text_base, f"{time_base:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"

css = """
h1 {
    text-align: center;
    display:block;
}
"""

# Gradio Interface
with gr.Blocks(css=css) as demo:
    # Title and description
    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
    gr.Markdown("This app compares the transcription performance and processing time between openAI Whisper large-v3-turbo and the its Base model Whisper large-v3")

    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")

        with gr.Row():
            with gr.Group():
                gr.Markdown("### 📝 **Base model**")
                base_output = gr.Textbox(label="Transcription")
                base_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")

    # Set up the interaction
    transcribe_button.click(fn=compare_transcriptions, inputs=audio_input, outputs=[base_output, base_time, turbo_output, turbo_time])

# Launch the demo
demo.launch()