import os
import time
import uuid
import tempfile
import subprocess

import scipy.io.wavfile
import spaces
import torch
import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline

# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips the slow
# CUDA compile step. os.environ is merged in so pip still sees PATH and the rest of
# the environment (a bare one-entry dict would wipe it).
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32  # fp16 is not supported on CPU
MODEL_NAME = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="flash_attention_2",
)
model.to(device)
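
# flash_attention_2 requires a CUDA device plus the flash-attn package installed
# above; on CPU-only machines one would drop attn_implementation or fall back to
# "sdpa".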

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)
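
# chunk_length_s=10 enables chunked long-form decoding: the audio is split into
# 10-second windows so each streaming callback returns quickly. Besides file paths,
# the pipeline also accepts in-memory audio, e.g. (a sketch, with `samples` a
# float32 numpy array from a hypothetical recording):
#   text = pipe({"sampling_rate": sample_rate, "raw": samples})["text"]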

@spaces.GPU
def transcribe(inputs, previous_transcription):
    start_time = time.time()
    try:
        # Gradio streams microphone audio as a (sample_rate, numpy_array) tuple;
        # write it to a temporary wav file that the pipeline can read from disk.
        filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        transcription = pipe(filename)["text"]
        os.remove(filename)  # avoid accumulating temp files across stream chunks
        previous_transcription += transcription

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        return previous_transcription, "Error"

@spaces.GPU
def translate_and_transcribe(inputs, previous_transcription, target_language):
    start_time = time.time()
    try:
        filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        # Note: Whisper's "translate" task always outputs English; the `language`
        # kwarg tells the model which language the input audio is in.
        translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language})["text"]
        os.remove(filename)

        previous_transcription += translation

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during translation: {e}")
        return previous_transcription, "Error"

def clear():
    return ""

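# Microphone tab: gr.Audio(streaming=True) delivers a fresh (sample_rate, samples)
# chunk every `stream_every` seconds; transcribe() appends each chunk's text to the
# running transcription shown in the textbox.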
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: the first transcription takes about 5 seconds; after that, results stream smoothly.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")

        input_audio_microphone.stream(
            transcribe,
            [input_audio_microphone, output],
            [output, latency_textbox],
            time_limit=45,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear, outputs=[output])

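# File tab: one-shot transcription of an uploaded clip, reusing the same
# transcribe() handler as the streaming tab.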
with gr.Blocks() as file:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe an uploaded audio file. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: the first transcription takes about 5 seconds; after that, results return quickly.")
        with gr.Row():
            input_audio_file = gr.Audio(sources="upload", type="numpy")
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            submit_button = gr.Button("Submit")
            clear_button = gr.Button("Clear Output")

        submit_button.click(transcribe, [input_audio_file, output], [output, latency_textbox], concurrency_limit=None)
        clear_button.click(clear, outputs=[output])

# with gr.Blocks() as translate:
#     with gr.Column():
#         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and translate audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: the first transcription takes about 5 seconds; after that, results stream smoothly.")
#         with gr.Row():
#             input_audio_microphone = gr.Audio(streaming=True)
#             output = gr.Textbox(label="Transcription and Translation", value="")
#             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
#             target_language_dropdown = gr.Dropdown(
#                 choices=["english", "french", "hindi", "spanish", "russian"],
#                 label="Target Language",
#                 value="spanish",  # default must be one of the choices above
#             )
#         with gr.Row():
#             clear_button = gr.Button("Clear Output")

#         input_audio_microphone.stream(
#             translate_and_transcribe, 
#             [input_audio_microphone, output, target_language_dropdown], 
#             [output, latency_textbox], 
#             time_limit=45, 
#             stream_every=2, 
#             concurrency_limit=None
#         )
#         clear_button.click(clear, outputs=[output])

with gr.Blocks() as demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

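# launch() serves the app locally (and on Hugging Face Spaces). For a public link
# from a local machine, one could pass share=True: demo.launch(share=True).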
demo.launch()