eusholli committed on
Commit 8c3129f
1 parent: 6f82df3

initial push

Files changed (4)
  1. .gitignore +81 -0
  2. README.md +2 -4
  3. app.py +199 -0
  4. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1,81 @@
+# Python build
+.eggs/
+gradio.egg-info
+dist/
+*.pyc
+__pycache__/
+*.py[cod]
+*$py.class
+build/
+__tmp/*
+*.pyi
+py.typed
+
+# JS build
+gradio/templates/*
+gradio/node/*
+gradio/_frontend_code/*
+js/gradio-preview/test/*
+
+# Secrets
+.env
+
+# Gradio run artifacts
+*.db
+*.sqlite3
+gradio/launches.json
+flagged/
+gradio_cached_examples/
+tmp.zip
+
+# Tests
+.coverage
+coverage.xml
+test.txt
+**/snapshots/**/*.png
+playwright-report/
+
+# Demos
+demo/tmp.zip
+demo/files/*.avi
+demo/files/*.mp4
+demo/all_demos/demos/*
+demo/all_demos/requirements.txt
+demo/*/config.json
+demo/annotatedimage_component/*.png
+demo/fake_diffusion_with_gif/*.gif
+
+# Etc
+.idea/*
+.DS_Store
+*.bak
+workspace.code-workspace
+*.h5
+
+# dev containers
+.pnpm-store/
+
+# log files
+.pnpm-debug.log
+
+# Local virtualenv for devs
+venv*
+
+# FRP
+gradio/frpc_*
+.vercel
+
+# js
+node_modules
+public/build/
+test-results
+client/js/test.js
+.config/test.py
+
+# storybook
+storybook-static
+build-storybook.log
+js/storybook/theme.css
+
+# playwright
+.config/playwright/.cache
README.md CHANGED
@@ -1,13 +1,11 @@
 ---
 title: Whisper Any Model
 emoji: 📉
-colorFrom: indigo
-colorTo: green
+colorFrom: pink
+colorTo: yellow
 sdk: gradio
-sdk_version: 4.22.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,199 @@
+import torch
+import gradio as gr
+import yt_dlp as youtube_dl
+from transformers import pipeline
+from transformers.pipelines.audio_utils import ffmpeg_read
+
+import tempfile
+import os
+import time
+
+# Available model sizes
+MODEL_CHOICES = ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]
+
+current_choice = "tiny"
+DEFAULT_MODEL_NAME = f"openai/whisper-{current_choice}"
+BATCH_SIZE = 8
+FILE_LIMIT_MB = 1000
+YT_LENGTH_LIMIT_S = 3600  # limit YouTube videos to 1 hour
+
+device = 0 if torch.cuda.is_available() else "cpu"
+
+# Initialize the pipeline with the default model
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=DEFAULT_MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
+
+
+def transcribe(model_size, inputs, task):
+    if inputs is None:
+        raise gr.Error(
+            "No audio file submitted! Please upload or record an audio file before submitting your request."
+        )
+
+    global current_choice
+    global pipe
+
+    current_choice = model_size
+
+    MODEL_NAME = f"openai/whisper-{model_size}"
+    if (
+        pipe.model.name_or_path != MODEL_NAME
+    ):  # Reload the pipeline if the model has changed
+        pipe = pipeline(
+            task="automatic-speech-recognition",
+            model=MODEL_NAME,
+            chunk_length_s=30,
+            device=device,
+        )
+
+    text = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps=True,
+    )["text"]
+    return text
+
+
+def _return_yt_html_embed(yt_url):
+    video_id = yt_url.split("?v=")[-1]
+    HTML_str = (
+        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+        " </center>"
+    )
+    return HTML_str
+
+
+def download_yt_audio(yt_url, filename):
+    info_loader = youtube_dl.YoutubeDL()
+
+    try:
+        info = info_loader.extract_info(yt_url, download=False)
+    except youtube_dl.utils.DownloadError as err:
+        raise gr.Error(str(err))
+
+    # duration_string is "S", "M:S", or "H:M:S"; normalize to [h, m, s]
+    file_length = info["duration_string"]
+    file_h_m_s = file_length.split(":")
+    file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
+
+    if len(file_h_m_s) == 1:
+        file_h_m_s.insert(0, 0)
+    if len(file_h_m_s) == 2:
+        file_h_m_s.insert(0, 0)
+    file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
+
+    if file_length_s > YT_LENGTH_LIMIT_S:
+        # %H:%M:%S renders both durations as HH:MM:SS
+        yt_length_limit_hms = time.strftime(
+            "%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S)
+        )
+        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
+        raise gr.Error(
+            f"Maximum YouTube length is {yt_length_limit_hms}, got a {file_length_hms} video."
+        )
+
+    ydl_opts = {
+        "outtmpl": filename,
+        "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+    }
+
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        try:
+            ydl.download([yt_url])
+        except youtube_dl.utils.ExtractorError as err:
+            raise gr.Error(str(err))
+
+
+# model_size must be the first parameter: the Gradio interface passes the
+# Model Size dropdown value first, then the URL and task.
+def yt_transcribe(model_size, yt_url, task, max_filesize=75.0):
+    global pipe
+
+    MODEL_NAME = f"openai/whisper-{model_size}"
+    if pipe.model.name_or_path != MODEL_NAME:  # Reload the pipeline if the model has changed
+        pipe = pipeline(
+            task="automatic-speech-recognition",
+            model=MODEL_NAME,
+            chunk_length_s=30,
+            device=device,
+        )
+
+    html_embed_str = _return_yt_html_embed(yt_url)
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        filepath = os.path.join(tmpdirname, "video.mp4")
+        download_yt_audio(yt_url, filepath)
+        with open(filepath, "rb") as f:
+            inputs = f.read()
+
+    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+
+    text = pipe(
+        inputs,
+        batch_size=BATCH_SIZE,
+        generate_kwargs={"task": task},
+        return_timestamps=True,
+    )["text"]
+
+    return html_embed_str, text
+
+
+demo = gr.Blocks()
+
+mf_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Dropdown(MODEL_CHOICES, label="Model Size", value=current_choice),
+        gr.Audio(sources=["microphone"], type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    theme="default",
+    title="Whisper: Transcribe Audio",
+    description=(
+        "Transcribe long-form microphone or audio inputs with the click of a button! The demo allows selection of any"
+        " of the [OpenAI Whisper model sizes](https://huggingface.co/openai/whisper-large-v3) and uses Transformers to"
+        " transcribe audio files of arbitrary length. All listed checkpoints are multilingual."
+        " Based on https://huggingface.co/spaces/openai/whisper"
+    ),
+    allow_flagging="never",
+)
+
+file_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Dropdown(MODEL_CHOICES, label="Model Size", value=current_choice),
+        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    theme="default",
+    title="Whisper: Transcribe Audio",
+    description=(
+        "Transcribe long-form audio uploads with the click of a button! The demo uses the selected OpenAI Whisper"
+        f" checkpoint (default [{DEFAULT_MODEL_NAME}](https://huggingface.co/{DEFAULT_MODEL_NAME})) and Transformers to"
+        " transcribe audio files of arbitrary length."
+    ),
+    allow_flagging="never",
+)
+
+# Named _interface to avoid shadowing the yt_transcribe function above.
+yt_transcribe_interface = gr.Interface(
+    fn=yt_transcribe,
+    inputs=[
+        gr.Dropdown(MODEL_CHOICES, label="Model Size", value=current_choice),
+        gr.Textbox(
+            lines=1,
+            placeholder="Paste the URL to a YouTube video here",
+            label="YouTube URL",
+        ),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs=["html", "text"],
+    theme="default",
+    title="Whisper: Transcribe Audio",
+    description=(
+        "Transcribe long-form YouTube videos with the click of a button! The demo uses the selected OpenAI Whisper"
+        f" checkpoint (default [{DEFAULT_MODEL_NAME}](https://huggingface.co/{DEFAULT_MODEL_NAME})) and Transformers to"
+        " transcribe video files of arbitrary length."
+    ),
+    allow_flagging="never",
+)
+
+with demo:
+    gr.TabbedInterface(
+        [mf_transcribe, file_transcribe, yt_transcribe_interface],
+        ["Microphone", "Audio file", "YouTube"],
+    )
+
+demo.launch()
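
For reference, the transcription path in app.py boils down to a plain transformers ASR pipeline call. A minimal standalone sketch of the same configuration (assuming the packages from requirements.txt plus ffmpeg are installed; "sample.wav" is a placeholder path, not a file from this repo):

import torch
from transformers import pipeline

# Same setup as app.py: chunked long-form decoding, GPU if available.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
    chunk_length_s=30,
    device=0 if torch.cuda.is_available() else "cpu",
)

result = asr(
    "sample.wav",  # any audio file ffmpeg can decode
    batch_size=8,
    generate_kwargs={"task": "transcribe"},
    return_timestamps=True,
)
print(result["text"])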
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio
+git+https://github.com/huggingface/transformers
+torch
+yt-dlp