akjedidtz committed on
Commit
8def4ef
1 Parent(s): f2b31f5

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -97
app.py DELETED
@@ -1,97 +0,0 @@
1
-
2
-
3
- import spaces
4
- import torch
5
- import gradio as gr
6
- import tempfile
7
- import os
8
- import uuid
9
- import scipy.io.wavfile
10
- import time
11
- import numpy as np
12
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
13
- import subprocess
14
# Install flash-attn at startup. The child process must inherit the current
# environment (PATH, HOME, virtualenv variables): passing a bare dict as
# `env` would replace the environment entirely, so the extra variable is
# merged into a copy of os.environ instead. An argument list is used rather
# than a shell string so no shell parsing is involved.
subprocess.run(
    ["pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # best-effort, as before: a failed install does not abort startup
)
19
-
20
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only selected together with the GPU; on the CPU path the
# model is loaded in float32 instead of being forced into float16.
torch_dtype = torch.float16 if device == "cuda" else torch.float32
MODEL_NAME = "openai/whisper-large-v3-turbo"

# FlashAttention-2 is requested only on the GPU path; the CPU path uses
# PyTorch's scaled-dot-product attention implementation.
attn_implementation = "flash_attention_2" if device == "cuda" else "sdpa"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation=attn_implementation,
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

# Speech-recognition pipeline shared by the request handlers below it in the
# file; audio is processed in 10-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)
41
-
42
@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe one audio chunk and append the text to the running transcript.

    Args:
        inputs: A ``(sample_rate, audio_data)`` pair; it is written to a WAV
            file with ``scipy.io.wavfile.write`` before being passed to the
            pipeline.
        previous_transcription: Transcript accumulated so far; the new text
            is concatenated onto it.

    Returns:
        A ``(transcript, latency)`` tuple. ``latency`` is the elapsed time in
        seconds formatted with two decimals, or the string ``"Error"`` when
        anything fails, in which case the transcript is returned unchanged.
    """
    start_time = time.time()
    # Write the chunk into the system temp directory under a unique name so
    # concurrent calls do not collide and the working directory stays clean.
    filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
    try:
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        transcription = pipe(filename)["text"]
        previous_transcription += transcription

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        # UI boundary: report the failure instead of breaking the stream.
        print(f"Error during Transcription: {e}")
        return previous_transcription, "Error"
    finally:
        # Remove the per-chunk file; without this every streamed chunk would
        # leave a WAV file behind.
        if os.path.exists(filename):
            os.remove(filename)
59
-
60
@spaces.GPU
def translate_and_transcribe(inputs, previous_transcription, target_language):
    """Run the pipeline's translate task on one audio chunk and append the text.

    Args:
        inputs: A ``(sample_rate, audio_data)`` pair; it is written to a WAV
            file with ``scipy.io.wavfile.write`` before being passed to the
            pipeline.
        previous_transcription: Text accumulated so far; the new text is
            concatenated onto it.
        target_language: Forwarded as the ``language`` generation argument.
            NOTE(review): Whisper's translate task is documented as producing
            English, with ``language`` naming the spoken language -- confirm
            that this parameter is really meant as a target language.

    Returns:
        A ``(text, latency)`` tuple. ``latency`` is the elapsed time in
        seconds formatted with two decimals, or the string ``"Error"`` when
        anything fails, in which case the text is returned unchanged.
    """
    start_time = time.time()
    # Write the chunk into the system temp directory under a unique name so
    # concurrent calls do not collide and the working directory stays clean.
    filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
    try:
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        translation = pipe(
            filename,
            generate_kwargs={"task": "translate", "language": target_language},
        )["text"]

        previous_transcription += translation

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        # UI boundary: report the failure instead of breaking the stream.
        print(f"Error during Translation and Transcription: {e}")
        return previous_transcription, "Error"
    finally:
        # Remove the per-chunk file; without this every call would leave a
        # WAV file behind.
        if os.path.exists(filename):
            os.remove(filename)
78
-
79
def clear():
    """Return an empty string, the value used to reset the output textbox."""
    return str()
81
-
82
# Streaming microphone UI: audio chunks are sent to `transcribe`, which
# updates the transcript textbox and the latency readout.
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")

    # The current transcript is passed back in as an input so each chunk's
    # text is appended to what is already displayed.
    input_audio_microphone.stream(
        transcribe,
        [input_audio_microphone, output],
        [output, latency_textbox],
        time_limit=45,
        stream_every=2,
        concurrency_limit=None,
    )
    clear_button.click(clear, outputs=[output])

# `demo` was never defined, so the original `demo.launch()` raised NameError
# at startup. Bind it to the Blocks app; `microphone` stays available too.
demo = microphone
demo.launch()