Yuekai Zhang committed
Commit f241123
Parent: 7527245

remove app local

Files changed (3):
  1. .gitattributes +2 -0
  2. app.py +3 -3
  3. app_local.py +0 -443
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+test_wavs/fleurs/15029788401146217023.wav filter=lfs diff=lfs merge=lfs -text
+test_wavs/fleurs/7760285811293653093.wav filter=lfs diff=lfs merge=lfs -text
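
The two FLEURS wavs added above are tracked through Git LFS, so they are only pointers until `git lfs pull` fetches the real audio. A quick sanity check (not part of this commit; it assumes the files have been pulled) that they match the 16 kHz assumption made by the app's send_whisper():

import soundfile

for path in [
    "test_wavs/fleurs/15029788401146217023.wav",
    "test_wavs/fleurs/7760285811293653093.wav",
]:
    # soundfile.read returns (samples, sample_rate); the demo asserts 16 kHz.
    waveform, sample_rate = soundfile.read(path)
    print(path, sample_rate, waveform.shape)
    assert sample_rate == 16000, f"expected 16 kHz, got {sample_rate}"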
app.py CHANGED
@@ -41,7 +41,7 @@ def convert_to_wav(in_filename: str) -> str:
     """Convert the input audio file to a wave file"""
     out_filename = in_filename + ".wav"
     if '.mp3' in in_filename:
-        _ = os.system(f"ffmpeg -y -i '{in_filename}' -acodec pcm_s16le -ac 1 -ar 16000 '{out_filename}'|| exit 1")
+        _ = os.system(f"ffmpeg -y -i '{in_filename}' -acodec pcm_s16le -ac 1 -ar 16000 '{out_filename}' || exit 1")
     else:
         _ = os.system(f"ffmpeg -hide_banner -y -i '{in_filename}' -ar 16000 '{out_filename}' || exit 1")
     return out_filename
@@ -289,6 +289,7 @@ with demo:
     language_choices = ["Chinese", "English", "Chinese+English", "Korean", "Japanese", "Arabic", "German", "French", "Russian"]
     server_url_textbox = gr.Textbox(
         label='Triton Inference Server URL',
+        value='10.19.203.82:8001',
         placeholder='e.g. localhost:8001',
         max_lines=1,
     )
@@ -439,5 +440,4 @@ if __name__ == "__main__":
 
     logging.basicConfig(format=formatter, level=logging.INFO)
 
-    #demo.launch(share=True)
-    demo.launch()
+    demo.launch(share=True)
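
The app.py changes are small: the mp3 branch of convert_to_wav gains a space before `|| exit 1` (a readability tweak; the shell parses `||` either way), the server URL textbox gets a default value of 10.19.203.82:8001, and the demo is now launched with share=True. For reference, a hypothetical variant of the same conversion using subprocess.run instead of os.system is sketched below; it is not part of this commit and the function name is made up, but it avoids shell quoting and reports ffmpeg failures as exceptions:

import subprocess

def convert_to_wav_subprocess(in_filename: str) -> str:
    """Convert the input audio to a 16 kHz mono PCM wave file via ffmpeg."""
    out_filename = in_filename + ".wav"
    cmd = [
        "ffmpeg", "-hide_banner", "-y", "-i", in_filename,
        "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000", out_filename,
    ]
    # check=True raises CalledProcessError if ffmpeg exits non-zero.
    subprocess.run(cmd, check=True)
    return out_filename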
 
app_local.py DELETED
@@ -1,443 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
-# 2023 Nvidia. (authors: Yuekai Zhang)
-#
-# See LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# References:
-# https://gradio.app/docs/#dropdown
-# https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
-
-import logging
-import os
-import tempfile
-import time
-from datetime import datetime
-
-import gradio as gr
-import numpy as np
-import urllib.request
-import tritonclient
-import tritonclient.grpc as grpcclient
-from tritonclient.utils import np_to_triton_dtype
-import soundfile
-
-from examples import examples
-
-def convert_to_wav(in_filename: str) -> str:
-    """Convert the input audio file to a wave file"""
-    out_filename = in_filename + ".wav"
-    if '.mp3' in in_filename:
-        _ = os.system(f"ffmpeg -y -i '{in_filename}' -acodec pcm_s16le -ac 1 -ar 16000 '{out_filename}' || exit 1")
-    else:
-        _ = os.system(f"ffmpeg -hide_banner -y -i '{in_filename}' -ar 16000 '{out_filename}' || exit 1")
-    return out_filename
-
-
-def build_html_output(s: str, style: str = "result_item_success"):
-    return f"""
-    <div class='result'>
-        <div class='result_item {style}'>
-          {s}
-        </div>
-    </div>
-    """
-
-def process_url(
-    language: str,
-    repo_id: str,
-    decoding_method: str,
-    whisper_prompt_textbox: str,
-    url: str,
-    server_url_textbox: str,
-):
-    logging.info(f"Processing URL: {url}")
-    with tempfile.NamedTemporaryFile() as f:
-        try:
-            urllib.request.urlretrieve(url, f.name)
-
-            return process(
-                in_filename=f.name,
-                language=language,
-                repo_id=repo_id,
-                decoding_method=decoding_method,
-                whisper_prompt_textbox=whisper_prompt_textbox,
-                server_url=server_url_textbox,
-            )
-        except Exception as e:
-            logging.info(str(e))
-            return "", build_html_output(str(e), "result_item_error")
-
-def process_uploaded_file(
-    language: str,
-    repo_id: str,
-    decoding_method: str,
-    whisper_prompt_textbox: int,
-    in_filename: str,
-    server_url_textbox: str,
-):
-    if in_filename is None or in_filename == "":
-        return "", build_html_output(
-            "Please first upload a file and then click "
-            'the button "submit for recognition"',
-            "result_item_error",
-        )
-
-    logging.info(f"Processing uploaded file: {in_filename}")
-    try:
-        return process(
-            in_filename=in_filename,
-            language=language,
-            repo_id=repo_id,
-            decoding_method=decoding_method,
-            whisper_prompt_textbox=whisper_prompt_textbox,
-            server_url=server_url_textbox,
-        )
-    except Exception as e:
-        logging.info(str(e))
-        return "", build_html_output(str(e), "result_item_error")
-
-
-def process_microphone(
-    language: str,
-    repo_id: str,
-    decoding_method: str,
-    whisper_prompt_textbox: str,
-    in_filename: str,
-    server_url_textbox: str,
-):
-    if in_filename is None or in_filename == "":
-        return "", build_html_output(
-            "Please first click 'Record from microphone', speak, "
-            "click 'Stop recording', and then "
-            "click the button 'submit for recognition'",
-            "result_item_error",
-        )
-
-    logging.info(f"Processing microphone: {in_filename}")
-    try:
-        return process(
-            in_filename=in_filename,
-            language=language,
-            repo_id=repo_id,
-            decoding_method=decoding_method,
-            whisper_prompt_textbox=whisper_prompt_textbox,
-            server_url=server_url_textbox,
-        )
-    except Exception as e:
-        logging.info(str(e))
-        return "", build_html_output(str(e), "result_item_error")
-
-def send_whisper(whisper_prompt, wav_path, model_name, triton_client, protocol_client, padding_duration=10):
-    waveform, sample_rate = soundfile.read(wav_path)
-    assert sample_rate == 16000, f"Only support 16k sample rate, but got {sample_rate}"
-    duration = int(len(waveform) / sample_rate)
-
-    # padding to nearset 10 seconds
-    samples = np.zeros(
-        (
-            1,
-            padding_duration * sample_rate * ((duration // padding_duration) + 1),
-        ),
-        dtype=np.float32,
-    )
-
-    samples[0, : len(waveform)] = waveform
-
-    lengths = np.array([[len(waveform)]], dtype=np.int32)
-
-    inputs = [
-        protocol_client.InferInput(
-            "WAV", samples.shape, np_to_triton_dtype(samples.dtype)
-        ),
-        protocol_client.InferInput(
-            "TEXT_PREFIX", [1, 1], "BYTES"
-        ),
-    ]
-    inputs[0].set_data_from_numpy(samples)
-
-    input_data_numpy = np.array([whisper_prompt], dtype=object)
-    input_data_numpy = input_data_numpy.reshape((1, 1))
-    inputs[1].set_data_from_numpy(input_data_numpy)
-
-    outputs = [protocol_client.InferRequestedOutput("TRANSCRIPTS")]
-    # generate a random sequence id
-    sequence_id = np.random.randint(0, 1000000)
-
-    response = triton_client.infer(
-        model_name, inputs, request_id=str(sequence_id), outputs=outputs
-    )
-
-    decoding_results = response.as_numpy("TRANSCRIPTS")[0]
-    if type(decoding_results) == np.ndarray:
-        decoding_results = b" ".join(decoding_results).decode("utf-8")
-    else:
-        # For wenet
-        decoding_results = decoding_results.decode("utf-8")
-    return decoding_results, duration
-
-def process(
-    language: str,
-    repo_id: str,
-    decoding_method: str,
-    whisper_prompt_textbox: str,
-    in_filename: str,
-    server_url: str,
-):
-    logging.info(f"language: {language}")
-    logging.info(f"repo_id: {repo_id}")
-    logging.info(f"decoding_method: {decoding_method}")
-    logging.info(f"whisper_prompt_textbox: {whisper_prompt_textbox}")
-    logging.info(f"in_filename: {in_filename}")
-
-    model_name = "whisper"
-    triton_client = grpcclient.InferenceServerClient(url=server_url, verbose=False)
-    protocol_client = grpcclient
-
-    filename = convert_to_wav(in_filename)
-
-    now = datetime.now()
-    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
-    logging.info(f"Started at {date_time}")
-
-    start = time.time()
-
-    text, duration = send_whisper(whisper_prompt_textbox, filename, model_name, triton_client, protocol_client)
-
-    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
-    end = time.time()
-
-    #metadata = torchaudio.info(filename)
-    #duration = metadata.num_frames / sample_rate
-    rtf = (end - start) / duration
-
-    logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
-
-    info = f"""
-    Wave duration : {duration: .3f} s <br/>
-    Processing time: {end - start: .3f} s <br/>
-    RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
-    """
-    if rtf > 1:
-        info += (
-            "<br/>We are loading the model for the first run. "
-            "Please run again to measure the real RTF.<br/>"
-        )
-
-    logging.info(info)
-    logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
-
-    return text, build_html_output(info)
-
-
-title = "# Speech Recognition and Translation with Whisper"
-description = """
-This space shows how to do speech recognition and translation with Nvidia **Triton**.
-
-Please visit
-<https://huggingface.co/yuekai/model_repo_whisper_large_v2>
-for triton speech recognition.
-
-The service is running on a GPU based on triton server.
-
-See more information by visiting the following links:
-
-- <https://github.com/triton-inference-server>
-- <https://github.com/yuekaizhang/Triton-ASR-Client/tree/main>
-- <https://github.com/k2-fsa/sherpa/tree/master/triton>
-- <https://github.com/wenet-e2e/wenet/tree/main/runtime/gpu>
-- <https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/triton_gpu>
-
-"""
-
-# css style is copied from
-# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
-css = """
-.result {display:flex;flex-direction:column}
-.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
-.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
-.result_item_error {background-color:#ff7070;color:white;align-self:start}
-"""
-
-
-# def update_model_dropdown(language: str):
-#     if language in language_to_models:
-#         choices = language_to_models[language]
-#         return gr.Dropdown.update(choices=choices, value=choices[0])
-
-#     raise ValueError(f"Unsupported language: {language}")
-
-
-demo = gr.Blocks(css=css)
-
-
-with demo:
-    gr.Markdown(title)
-    language_choices = ["Chinese", "English", "Chinese+English", "Korean", "Japanese", "Arabic", "German", "French", "Russian"]
-    server_url_textbox = gr.Textbox(
-        label='Triton Inference Server URL',
-        value='10.19.203.82:8001',
-        placeholder='e.g. localhost:8001',
-        max_lines=1,
-    )
-
-    whisper_prompt_textbox = gr.Textbox(
-        label='Whisper prompt',
-        placeholder='Whisper prompt e.g. <|startoftranscript|><zh><en><transcribe>',
-        max_lines=1,
-    )
-    language_radio = gr.Radio(
-        label="Language",
-        choices=language_choices,
-        value=language_choices[0],
-    )
-    model_dropdown = gr.Dropdown(
-        choices=["whisper-large-v2"],
-        label="Select a model",
-        value="whisper-large-v2",
-    )
-
-    # language_radio.change(
-    #     update_model_dropdown,
-    #     inputs=language_radio,
-    #     outputs=model_dropdown,
-    # )
-
-    decoding_method_radio = gr.Radio(
-        label="Decoding method",
-        choices=["greedy_search"],
-        value="greedy_search",
-    )
-
-    # whisper_prompt_textbox_slider = gr.Slider(
-    #     minimum=1,
-    #     value=4,
-    #     step=1,
-    #     label="Number of active paths for modified_beam_search",
-    # )
-
-    with gr.Tabs():
-        with gr.TabItem("Upload from disk"):
-            uploaded_file = gr.Audio(
-                source="upload",  # Choose between "microphone", "upload"
-                type="filepath",
-                optional=False,
-                label="Upload from disk",
-            )
-            upload_button = gr.Button("Submit for recognition")
-            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
-            uploaded_html_info = gr.HTML(label="Info")
-
-            gr.Examples(
-                examples=examples,
-                inputs=[
-                    language_radio,
-                    model_dropdown,
-                    decoding_method_radio,
-                    whisper_prompt_textbox,
-                    uploaded_file,
-                ],
-                outputs=[uploaded_output, uploaded_html_info],
-                fn=process_uploaded_file,
-                cache_examples=False,
-            )
-
-        with gr.TabItem("Record from microphone"):
-            microphone = gr.Audio(
-                source="microphone",  # Choose between "microphone", "upload"
-                type="filepath",
-                optional=False,
-                label="Record from microphone",
-            )
-
-            record_button = gr.Button("Submit for recognition")
-            recorded_output = gr.Textbox(label="Recognized speech from recordings")
-            recorded_html_info = gr.HTML(label="Info")
-
-            gr.Examples(
-                examples=examples,
-                inputs=[
-                    language_radio,
-                    model_dropdown,
-                    decoding_method_radio,
-                    whisper_prompt_textbox,
-                    microphone,
-                ],
-                outputs=[recorded_output, recorded_html_info],
-                fn=process_microphone,
-                cache_examples=False,
-            )
-
-        with gr.TabItem("From URL"):
-            url_textbox = gr.Textbox(
-                max_lines=1,
-                placeholder="URL to an audio file",
-                label="URL",
-                interactive=True,
-            )
-
-            url_button = gr.Button("Submit for recognition")
-            url_output = gr.Textbox(label="Recognized speech from URL")
-            url_html_info = gr.HTML(label="Info")
-
-    upload_button.click(
-        process_uploaded_file,
-        inputs=[
-            language_radio,
-            model_dropdown,
-            decoding_method_radio,
-            whisper_prompt_textbox,
-            uploaded_file,
-            server_url_textbox,
-        ],
-        outputs=[uploaded_output, uploaded_html_info],
-    )
-
-    record_button.click(
-        process_microphone,
-        inputs=[
-            language_radio,
-            model_dropdown,
-            decoding_method_radio,
-            whisper_prompt_textbox,
-            microphone,
-            server_url_textbox,
-        ],
-        outputs=[recorded_output, recorded_html_info],
-    )
-
-    url_button.click(
-        process_url,
-        inputs=[
-            language_radio,
-            model_dropdown,
-            decoding_method_radio,
-            whisper_prompt_textbox,
-            url_textbox,
-            server_url_textbox,
-        ],
-        outputs=[url_output, url_html_info],
-    )
-
-    gr.Markdown(description)
-
-
-if __name__ == "__main__":
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-
-    demo.launch(share=True)
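
With app_local.py removed, the only way to reach the Triton server from this repository is through the Gradio UI in app.py, but the request itself is unchanged. A condensed, standalone sketch of the deleted send_whisper() flow is given below for anyone who still wants a command-line client. The tensor names, padding scheme, and model name "whisper" come from the code above; the server address (localhost:8001, matching the textbox placeholder) and the input wav path are illustrative assumptions.

# Condensed sketch of the deleted send_whisper() flow as a standalone client.
import numpy as np
import soundfile
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

triton_client = grpcclient.InferenceServerClient(url="localhost:8001", verbose=False)

waveform, sample_rate = soundfile.read("test_wavs/fleurs/15029788401146217023.wav")
assert sample_rate == 16000, f"Only support 16k sample rate, but got {sample_rate}"

# Pad to the next multiple of 10 seconds, as send_whisper() does.
padding_duration = 10
duration = int(len(waveform) / sample_rate)
samples = np.zeros(
    (1, padding_duration * sample_rate * ((duration // padding_duration) + 1)),
    dtype=np.float32,
)
samples[0, : len(waveform)] = waveform

inputs = [
    grpcclient.InferInput("WAV", samples.shape, np_to_triton_dtype(samples.dtype)),
    grpcclient.InferInput("TEXT_PREFIX", [1, 1], "BYTES"),
]
inputs[0].set_data_from_numpy(samples)
# Empty prompt; the UI passes the "Whisper prompt" textbox content here.
inputs[1].set_data_from_numpy(np.array([""], dtype=object).reshape((1, 1)))

outputs = [grpcclient.InferRequestedOutput("TRANSCRIPTS")]
response = triton_client.infer("whisper", inputs, request_id="1", outputs=outputs)

result = response.as_numpy("TRANSCRIPTS")[0]
if isinstance(result, np.ndarray):
    result = b" ".join(result).decode("utf-8")
else:
    result = result.decode("utf-8")
print(result)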