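# Realtime Whisper Large V3 Turbo demo for Hugging Face Spaces (ZeroGPU).
# Streams microphone audio (or a file upload) through a 🤗 Transformers ASR
# pipeline and shows the running transcription plus per-chunk latency in Gradio.
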
import spaces
import torch
import gradio as gr
import tempfile
import os
import uuid
import scipy.io.wavfile
import time
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
import subprocess

# Install FlashAttention at startup. The env var skips compiling the CUDA
# kernels at install time (no GPU is visible during the build step on ZeroGPU
# Spaces); os.environ is merged in so pip still sees PATH and friends.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16  # half precision; only suitable for the CUDA path
MODEL_NAME = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="flash_attention_2",
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,  # chunked long-form decoding over 10-second windows
    torch_dtype=torch_dtype,
    device=device,
)
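
# The pipeline can be called with a path to an audio file (as done below) or
# with raw arrays; e.g. a quick sanity check with a hypothetical local file:
#   print(pipe("sample.wav")["text"])
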
@spaces.GPU
def transcribe(inputs, previous_transcription):
    start_time = time.time()
    try:
        # Gradio delivers audio as a (sample_rate, numpy array) tuple; write it
        # to a temporary wav file for the pipeline, then delete it afterwards.
        filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        transcription = pipe(filename)["text"]
        os.remove(filename)
        previous_transcription += transcription

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        return previous_transcription, "Error"

@spaces.GPU
def translate_and_transcribe(inputs, previous_transcription, target_language):
    start_time = time.time()
    try:
        filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        # Note: Whisper's built-in "translate" task always targets English;
        # the `language` kwarg declares the language of the source audio.
        translation = pipe(
            filename,
            generate_kwargs={"task": "translate", "language": target_language},
        )["text"]
        os.remove(filename)
        previous_transcription += translation

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during translation and transcription: {e}")
        return previous_transcription, "Error"

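# Transformers accepts the Whisper language as a full name ("spanish"), an
# ISO code ("es"), or a language token ("<|es|>"); the disabled translation
# tab further down passes full names.
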
def clear():
    return ""

with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo \nTranscribe audio in realtime from the microphone. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\nNote: the first chunk takes about 5 seconds to appear; after that, transcription streams smoothly.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        # Re-run transcription on every 2 s of streamed audio, up to 45 s per stream.
        input_audio_microphone.stream(
            transcribe,
            [input_audio_microphone, output],
            [output, latency_textbox],
            time_limit=45,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear, outputs=[output])

with gr.Blocks() as file:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo \nTranscribe audio from an uploaded file. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\nNote: the first result takes about 5 seconds while the GPU spins up.")
        with gr.Row():
            input_audio_upload = gr.Audio(sources=["upload"], type="numpy")
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            submit_button = gr.Button("Submit")
            clear_button = gr.Button("Clear Output")
        submit_button.click(transcribe, [input_audio_upload, output], [output, latency_textbox], concurrency_limit=None)
        clear_button.click(clear, outputs=[output])

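# The translation tab below is kept for reference but is not wired into the
# TabbedInterface. Since Whisper itself can only translate into English, a
# free-choice target language would need a separate translation model.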
# with gr.Blocks() as translate:
#     with gr.Column():
#         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation) \nTranscribe and translate audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
#         with gr.Row():
#             input_audio_microphone = gr.Audio(streaming=True)
#             output = gr.Textbox(label="Transcription and Translation", value="")
#             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
#             target_language_dropdown = gr.Dropdown(
#                 choices=["english", "french", "hindi", "spanish", "russian"],
#                 label="Target Language",
#                 value="spanish",  # was "<|es|>", which is not one of the choices
#             )
#         with gr.Row():
#             clear_button = gr.Button("Clear Output")
#         input_audio_microphone.stream(
#             translate_and_transcribe,
#             [input_audio_microphone, output, target_language_dropdown],
#             [output, latency_textbox],
#             time_limit=45,
#             stream_every=2,
#             concurrency_limit=None,
#         )
#         clear_button.click(clear, outputs=[output])

with gr.Blocks() as demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

demo.launch()