HoneyTian committed on
Commit
2267fac
1 Parent(s): 85881d8
Files changed (5) hide show
  1. decode.py +122 -0
  2. examples.py +395 -0
  3. main.py +46 -10
  4. models.py +107 -0
  5. requirements.txt +1 -0
decode.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Union, Tuple
4
+
5
+ import numpy as np
6
+ import sherpa
7
+ import sherpa_onnx
8
+ import torch
9
+ import torchaudio
10
+ import wave
11
+
12
+
13
def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
    """
    Read a single-channel, 16-bit PCM wave file as normalized float32 samples.

    :param wave_filename: Path to a wave file. It should be single channel and each sample should be 16-bit.
                          Its sample rate does not need to be 16kHz.
    :return: Return a tuple containing:
             signal: A 1-D array of dtype np.float32 containing the samples, which are
                     divided by 32768 and therefore lie in the range [-1, 1).
             sample_rate: sample rate of the wave file
    :raises AssertionError: If the file is not single channel or not 16-bit.
    """
    with wave.open(wave_filename) as f:
        num_channels = f.getnchannels()
        sample_width = f.getsampwidth()

        # Raise explicitly rather than via `assert` so that the checks are
        # still performed when Python runs with -O (which strips asserts).
        if num_channels != 1:
            raise AssertionError(
                "expected a single channel wave file, but got {} channels".format(num_channels)
            )
        if sample_width != 2:
            raise AssertionError(
                "expected 16-bit samples, but got sample width {} bytes".format(sample_width)
            )

        frames = f.readframes(f.getnframes())
        sample_rate = f.getframerate()

    # WAV PCM data is stored little-endian; spell the byte order out so the
    # decoding does not depend on the byte order of the host.
    samples_int16 = np.frombuffer(frames, dtype="<i2")
    samples_float32 = samples_int16.astype(np.float32) / 32768
    return samples_float32, sample_rate
32
+
33
+
34
def decode_offline_recognizer(recognizer: sherpa.OfflineRecognizer,
                              filename: str,
                              ) -> str:
    """
    Decode a wave file in one pass with a sherpa offline recognizer.

    :param recognizer: recognizer used to create and decode the stream.
    :param filename: path of the wave file handed to the stream.
    :return: the recognized text, stripped of surrounding whitespace and lower-cased.
    """
    stream = recognizer.create_stream()
    stream.accept_wave_file(filename)
    recognizer.decode_stream(stream)

    # the result is read from the stream after decoding.
    return stream.result.text.strip().lower()
44
+
45
+
46
def decode_online_recognizer(recognizer: sherpa.OnlineRecognizer,
                             filename: str,
                             expected_sample_rate: int = 16000,
                             ) -> str:
    """
    Decode an audio file with a sherpa streaming (online) recognizer.

    :param recognizer: recognizer used to create and decode the stream.
    :param filename: path of the audio file, loaded with torchaudio.
    :param expected_sample_rate: sample rate the file must have.
    :return: the recognized text, stripped of surrounding whitespace and lower-cased.
    :raises AssertionError: If the sample rate of the file differs from ``expected_sample_rate``.
    """
    waveform, actual_sample_rate = torchaudio.load(filename)
    if actual_sample_rate != expected_sample_rate:
        raise AssertionError(
            "expected sample rate: {}, but: actually: {}".format(expected_sample_rate, actual_sample_rate)
        )

    # only the first channel is decoded.
    first_channel = waveform[0].contiguous()

    stream = recognizer.create_stream()

    # 0.3 seconds of zeros are fed after the audio itself.
    num_padding_samples = int(expected_sample_rate * 0.3)
    silence = torch.zeros(num_padding_samples, dtype=torch.float32)

    for chunk in (first_channel, silence):
        stream.accept_waveform(expected_sample_rate, chunk)
    stream.input_finished()

    # keep decoding for as long as the recognizer reports the stream ready.
    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)

    return recognizer.get_result(stream).text.strip().lower()
70
+
71
+
72
def decode_offline_recognizer_sherpa_onnx(recognizer: sherpa_onnx.OfflineRecognizer,
                                          filename: str,
                                          ) -> str:
    """
    Decode a wave file in one pass with a sherpa-onnx offline recognizer.

    :param recognizer: recognizer used to create and decode the stream.
    :param filename: path of the wave file; it is read with ``read_wave``.
    :return: the recognized text, stripped of surrounding whitespace and lower-cased.
    """
    s = recognizer.create_stream()
    samples, sample_rate = read_wave(filename)
    s.accept_waveform(sample_rate, samples)
    recognizer.decode_stream(s)

    # strip as well as lower-case, so that the result is normalized the same
    # way as the text returned by the other decode_* functions in this file.
    return s.result.text.strip().lower()
81
+
82
+
83
def decode_online_recognizer_sherpa_onnx(recognizer: sherpa_onnx.OnlineRecognizer,
                                         filename: str,
                                         ) -> str:
    """
    Decode a wave file with a sherpa-onnx streaming (online) recognizer.

    :param recognizer: recognizer used to create and decode the stream.
    :param filename: path of the wave file; it is read with ``read_wave``.
    :return: the result reported by the recognizer, lower-cased.
    """
    stream = recognizer.create_stream()

    waveform, sample_rate = read_wave(filename)
    stream.accept_waveform(sample_rate, waveform)

    # 0.3 seconds of zeros are fed after the audio itself.
    silence = np.zeros(int(0.3 * sample_rate), dtype=np.float32)
    stream.accept_waveform(sample_rate, silence)
    stream.input_finished()

    # keep decoding for as long as the recognizer reports the stream ready.
    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)

    result = recognizer.get_result(stream)
    return result.lower()
98
+
99
+
100
def decode_by_recognizer(
    recognizer: Union[
        sherpa.OfflineRecognizer,
        sherpa.OnlineRecognizer,
        sherpa_onnx.OfflineRecognizer,
        sherpa_onnx.OnlineRecognizer,
    ],
    filename: str,
) -> str:
    """
    Decode ``filename`` with the decode function that matches the recognizer class.

    :param recognizer: a sherpa or sherpa-onnx, offline or online recognizer.
    :param filename: path of the audio file to decode.
    :return: the text returned by the selected decode function.
    :raises ValueError: If the recognizer is an instance of none of the four supported classes.
    """
    # checked in this order; the first class the recognizer is an instance of wins.
    decoders = (
        (sherpa.OfflineRecognizer, decode_offline_recognizer),
        (sherpa.OnlineRecognizer, decode_online_recognizer),
        (sherpa_onnx.OfflineRecognizer, decode_offline_recognizer_sherpa_onnx),
        (sherpa_onnx.OnlineRecognizer, decode_online_recognizer_sherpa_onnx),
    )
    for recognizer_class, decode in decoders:
        if isinstance(recognizer, recognizer_class):
            return decode(recognizer, filename)

    raise ValueError(f"Unknown recognizer type {type(recognizer)}")
119
+
120
+
121
+ if __name__ == "__main__":
122
+ pass
examples.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ examples = [
6
+ [
7
+ "Chinese+English",
8
+ "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20",
9
+ "greedy_search",
10
+ 4,
11
+ "Yes",
12
+ "./data/test_wavs/tal_csasr/0.wav",
13
+ ],
14
+ [
15
+ "Chinese+English+Cantonese",
16
+ "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en",
17
+ "greedy_search",
18
+ 4,
19
+ "Yes",
20
+ "./data/test_wavs/cantonese/2.wav",
21
+ ],
22
+ [
23
+ "Cantonese",
24
+ "zrjin/icefall-asr-mdcc-zipformer-2024-03-11",
25
+ "greedy_search",
26
+ 4,
27
+ "Yes",
28
+ "./data/test_wavs/cantonese/1.wav",
29
+ ],
30
+ [
31
+ "English",
32
+ "whisper-base.en",
33
+ "greedy_search",
34
+ 4,
35
+ "Yes",
36
+ "./data/test_wavs/librispeech/1089-134686-0001.wav",
37
+ ],
38
+ [
39
+ "Chinese",
40
+ "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
41
+ "greedy_search",
42
+ 4,
43
+ "Yes",
44
+ "./data/test_wavs/paraformer-zh/四川话.wav",
45
+ ],
46
+ [
47
+ "Russian",
48
+ "alphacep/vosk-model-ru",
49
+ "greedy_search",
50
+ 4,
51
+ "No",
52
+ "./data/test_wavs/russian/russian-i-love-you.wav",
53
+ ],
54
+ [
55
+ "Russian",
56
+ "alphacep/vosk-model-ru",
57
+ "greedy_search",
58
+ 4,
59
+ "No",
60
+ "./data/test_wavs/russian/test.wav",
61
+ ],
62
+ [
63
+ "German",
64
+ "csukuangfj/wav2vec2.0-torchaudio",
65
+ "greedy_search",
66
+ 4,
67
+ "No",
68
+ "./data/test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav",
69
+ ],
70
+ [
71
+ "Arabic",
72
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
73
+ "greedy_search",
74
+ 4,
75
+ "No",
76
+ "./data/test_wavs/arabic/a.wav",
77
+ ],
78
+ [
79
+ "Tibetan",
80
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
81
+ "greedy_search",
82
+ 4,
83
+ "No",
84
+ "./data/test_wavs/tibetan/a_0_cacm-A70_31117.wav",
85
+ ],
86
+ [
87
+ "French",
88
+ "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
89
+ "greedy_search",
90
+ 4,
91
+ "No",
92
+ "./data/test_wavs/french/common_voice_fr_19364697.wav",
93
+ ],
94
+ [
95
+ "Chinese",
96
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
97
+ "greedy_search",
98
+ 4,
99
+ "Yes",
100
+ "./data/test_wavs/alimeeting/R8003_M8001-8004-165.wav",
101
+ ],
102
+ [
103
+ "Chinese",
104
+ "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
105
+ "greedy_search",
106
+ 4,
107
+ "Yes",
108
+ "./data/test_wavs/paraformer-zh/天津话.wav",
109
+ ],
110
+ [
111
+ "Chinese",
112
+ "csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09",
113
+ "greedy_search",
114
+ 4,
115
+ "Yes",
116
+ "./data/test_wavs/paraformer-zh/郑州话.wav",
117
+ ],
118
+ [
119
+ "Chinese",
120
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
121
+ "greedy_search",
122
+ 4,
123
+ "Yes",
124
+ "./data/test_wavs/alimeeting/R8008_M8013-8049-74.wav",
125
+ ],
126
+ [
127
+ "Chinese",
128
+ "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
129
+ "greedy_search",
130
+ 4,
131
+ "Yes",
132
+ "./data/test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav",
133
+ ],
134
+ [
135
+ "English",
136
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
137
+ "greedy_search",
138
+ 4,
139
+ "Yes",
140
+ "./data/test_wavs/tedlium3/DanBarber_2010-219.wav",
141
+ ],
142
+ [
143
+ "English",
144
+ "whisper-base.en",
145
+ "greedy_search",
146
+ 4,
147
+ "Yes",
148
+ "./data/test_wavs/tedlium3/DanielKahneman_2010-157.wav",
149
+ ],
150
+ [
151
+ "English",
152
+ "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
153
+ "greedy_search",
154
+ 4,
155
+ "Yes",
156
+ "./data/test_wavs/tedlium3/RobertGupta_2010U-15.wav",
157
+ ],
158
+ # librispeech
159
+ # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/data/test_wavs
160
+ [
161
+ "English",
162
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
163
+ "greedy_search",
164
+ 4,
165
+ "Yes",
166
+ "./data/test_wavs/librispeech/1089-134686-0001.wav",
167
+ ],
168
+ [
169
+ "English",
170
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
171
+ "greedy_search",
172
+ 4,
173
+ "Yes",
174
+ "./data/test_wavs/librispeech/1221-135766-0001.wav",
175
+ ],
176
+ [
177
+ "English",
178
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
179
+ "greedy_search",
180
+ 4,
181
+ "Yes",
182
+ "./data/test_wavs/librispeech/1221-135766-0002.wav",
183
+ ],
184
+ # gigaspeech
185
+ [
186
+ "English",
187
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
188
+ "greedy_search",
189
+ 4,
190
+ "Yes",
191
+ "./data/test_wavs/gigaspeech/1-minute-audiobook.opus",
192
+ ],
193
+ [
194
+ "English",
195
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
196
+ "greedy_search",
197
+ 4,
198
+ "Yes",
199
+ "./data/test_wavs/gigaspeech/100-seconds-podcast.opus",
200
+ ],
201
+ [
202
+ "English",
203
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
204
+ "greedy_search",
205
+ 4,
206
+ "Yes",
207
+ "./data/test_wavs/gigaspeech/100-seconds-youtube.opus",
208
+ ],
209
+ # wenetspeech
210
+ # https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/data/test_wavs
211
+ [
212
+ "Chinese",
213
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
214
+ "greedy_search",
215
+ 4,
216
+ "Yes",
217
+ "./data/test_wavs/wenetspeech/DEV_T0000000000.opus",
218
+ ],
219
+ [
220
+ "Chinese",
221
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
222
+ "greedy_search",
223
+ 4,
224
+ "Yes",
225
+ "./data/test_wavs/wenetspeech/DEV_T0000000001.opus",
226
+ ],
227
+ [
228
+ "Chinese",
229
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
230
+ "greedy_search",
231
+ 4,
232
+ "Yes",
233
+ "./data/test_wavs/wenetspeech/DEV_T0000000002.opus",
234
+ ],
235
+ # aishell2-A
236
+ # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/data/test_wavs
237
+ [
238
+ "Chinese",
239
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
240
+ "greedy_search",
241
+ 4,
242
+ "Yes",
243
+ "./data/test_wavs/aishell2/ID0012W0030.wav",
244
+ ],
245
+ [
246
+ "Chinese",
247
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
248
+ "greedy_search",
249
+ 4,
250
+ "Yes",
251
+ "./data/test_wavs/aishell2/ID0012W0162.wav",
252
+ ],
253
+ [
254
+ "Chinese",
255
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
256
+ "greedy_search",
257
+ 4,
258
+ "Yes",
259
+ "./data/test_wavs/aishell2/ID0012W0215.wav",
260
+ ],
261
+ # aishell2-B
262
+ # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/data/test_wavs
263
+ [
264
+ "Chinese",
265
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
266
+ "greedy_search",
267
+ 4,
268
+ "Yes",
269
+ "./data/test_wavs/aishell2/ID0012W0030.wav",
270
+ ],
271
+ [
272
+ "Chinese",
273
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
274
+ "greedy_search",
275
+ 4,
276
+ "Yes",
277
+ "./data/test_wavs/aishell2/ID0012W0162.wav",
278
+ ],
279
+ [
280
+ "Chinese",
281
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
282
+ "greedy_search",
283
+ 4,
284
+ "Yes",
285
+ "./data/test_wavs/aishell2/ID0012W0215.wav",
286
+ ],
287
+ # aidatatang_200zh
288
+ # https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/data/test_wavs
289
+ [
290
+ "Chinese",
291
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
292
+ "greedy_search",
293
+ 4,
294
+ "Yes",
295
+ "./data/test_wavs/aidatatang_200zh/T0055G0036S0002.wav",
296
+ ],
297
+ [
298
+ "Chinese",
299
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
300
+ "greedy_search",
301
+ 4,
302
+ "Yes",
303
+ "./data/test_wavs/aidatatang_200zh/T0055G0036S0003.wav",
304
+ ],
305
+ [
306
+ "Chinese",
307
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
308
+ "greedy_search",
309
+ 4,
310
+ "Yes",
311
+ "./data/test_wavs/aidatatang_200zh/T0055G0036S0004.wav",
312
+ ],
313
+ # tal_csasr
314
+ [
315
+ "Chinese+English",
316
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
317
+ "greedy_search",
318
+ 4,
319
+ "Yes",
320
+ "./data/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav",
321
+ ],
322
+ [
323
+ "Chinese+English",
324
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
325
+ "greedy_search",
326
+ 4,
327
+ "Yes",
328
+ "./data/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav",
329
+ ],
330
+ [
331
+ "Chinese+English",
332
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
333
+ "greedy_search",
334
+ 4,
335
+ "Yes",
336
+ "./data/test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav",
337
+ ],
338
+ [
339
+ "Tibetan",
340
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
341
+ "greedy_search",
342
+ 4,
343
+ "No",
344
+ "./data/test_wavs/tibetan/a_0_cacm-A70_31116.wav",
345
+ ],
346
+ [
347
+ "Tibetan",
348
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
349
+ "greedy_search",
350
+ 4,
351
+ "No",
352
+ "./data/test_wavs/tibetan/a_0_cacm-A70_31118.wav",
353
+ ],
354
+ # arabic
355
+ [
356
+ "Arabic",
357
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
358
+ "greedy_search",
359
+ 4,
360
+ "No",
361
+ "./data/test_wavs/arabic/b.wav",
362
+ ],
363
+ [
364
+ "Arabic",
365
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
366
+ "greedy_search",
367
+ 4,
368
+ "No",
369
+ "./data/test_wavs/arabic/c.wav",
370
+ ],
371
+ [
372
+ "German",
373
+ "csukuangfj/wav2vec2.0-torchaudio",
374
+ "greedy_search",
375
+ 4,
376
+ "No",
377
+ "./data/test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav",
378
+ ],
379
+ [
380
+ "French",
381
+ "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
382
+ "greedy_search",
383
+ 4,
384
+ "No",
385
+ "./data/test_wavs/french/common_voice_fr_19738183.wav",
386
+ ],
387
+ [
388
+ "French",
389
+ "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
390
+ "greedy_search",
391
+ 4,
392
+ "No",
393
+ "./data/test_wavs/french/common_voice_fr_27024649.wav",
394
+ ],
395
+ ]
main.py CHANGED
@@ -5,6 +5,7 @@ import argparse
5
  import gradio as gr
6
 
7
  from examples import examples
 
8
  from project_settings import project_path
9
 
10
 
@@ -25,15 +26,36 @@ def get_args():
25
 
26
 
27
  def update_model_dropdown(language: str):
28
- if language in language_to_models:
29
- choices = language_to_models[language]
30
- return gr.Dropdown(
31
- choices=choices,
32
- value=choices[0],
33
- interactive=True,
34
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- raise ValueError(f"Unsupported language: {language}")
 
 
 
 
 
 
 
 
37
 
38
 
39
  def main():
@@ -56,13 +78,27 @@ def main():
56
  label="Select a model",
57
  value=language_to_models[language_choices[0]][0],
58
  )
59
-
60
  language_radio.change(
61
  update_model_dropdown,
62
  inputs=language_radio,
63
  outputs=model_dropdown,
64
  )
65
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  # blocks
68
  with gr.Blocks() as blocks:
 
5
  import gradio as gr
6
 
7
  from examples import examples
8
+ from models import model_map
9
  from project_settings import project_path
10
 
11
 
 
26
 
27
 
28
def update_model_dropdown(language: str):
    """
    Build the model dropdown for the selected language.

    :param language: a key of ``model_map``.
    :return: an interactive ``gr.Dropdown`` whose choices are the ``repo_id`` of every
             model registered for ``language``; the first one is pre-selected.
    :raises ValueError: If ``language`` is not a key of ``model_map``.
    """
    if language not in model_map:
        raise ValueError(f"Unsupported language: {language}")

    choices = [model["repo_id"] for model in model_map[language]]
    return gr.Dropdown(
        choices=choices,
        # a language may be registered with an empty model list; leave the
        # dropdown without a selection instead of failing with an IndexError.
        value=choices[0] if choices else None,
        interactive=True,
    )
39
+
40
+
41
def build_html_output(s: str, style: str = "result_item_success") -> str:
    """
    Wrap ``s`` in the HTML markup used to display a result.

    :param s: content placed inside the inner ``<div>``. It is interpolated as is,
              without any HTML escaping.
    :param style: extra CSS class added to the inner ``<div>`` next to ``result_item``.
    :return: the HTML snippet.
    """
    return f"""
    <div class='result'>
    <div class='result_item {style}'>
    {s}
    </div>
    </div>
    """
49
 
50
+
51
def process_uploaded_file(language: str,
                          repo_id: str,
                          decoding_method: str,
                          num_active_paths: int,
                          add_punctuation: str,
                          in_filename: str,
                          ):
    """
    Placeholder handler for an uploaded file.

    None of the arguments is used yet: the function always returns the same
    dummy result.

    :return: a tuple of the text "Dummy" and the same text wrapped by ``build_html_output``.
    """
    placeholder = "Dummy"
    html = build_html_output(placeholder)
    return placeholder, html
59
 
60
 
61
  def main():
 
78
  label="Select a model",
79
  value=language_to_models[language_choices[0]][0],
80
  )
 
81
  language_radio.change(
82
  update_model_dropdown,
83
  inputs=language_radio,
84
  outputs=model_dropdown,
85
  )
86
+ decoding_method_radio = gr.Radio(
87
+ label="Decoding method",
88
+ choices=["greedy_search", "modified_beam_search"],
89
+ value="greedy_search",
90
+ )
91
+ num_active_paths_slider = gr.Slider(
92
+ minimum=1,
93
+ value=4,
94
+ step=1,
95
+ label="Number of active paths for modified_beam_search",
96
+ )
97
+ punct_radio = gr.Radio(
98
+ label="Whether to add punctuation (Only for Chinese and English)",
99
+ choices=["Yes", "No"],
100
+ value="Yes",
101
+ )
102
 
103
  # blocks
104
  with gr.Blocks() as blocks:
models.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from enum import Enum
4
+ from functools import lru_cache
5
+ import os
6
+
7
+ import huggingface_hub
8
+ import sherpa
9
+
10
+
11
class EnumDecodingMethod(Enum):
    """Decoding method identifiers; the value of each member is its own name as a string."""
    greedy_search = "greedy_search"
    modified_beam_search = "modified_beam_search"
14
+
15
+
16
class EnumRecognizerType(Enum):
    """Recognizer kinds; the value of each member is a dotted ``module.Class`` name."""
    # recognizers of the `sherpa` package
    sherpa_offline_recognizer = "sherpa.OfflineRecognizer"
    sherpa_online_recognizer = "sherpa.OnlineRecognizer"
    # recognizers of the `sherpa_onnx` package
    sherpa_onnx_offline_recognizer = "sherpa_onnx.OfflineRecognizer"
    sherpa_onnx_online_recognizer = "sherpa_onnx.OnlineRecognizer"
21
+
22
+
23
# Maps a language name to the list of models registered for it. Every model is
# a dict with the keys "repo_id", "model_file", "tokens_file" and "subfolder".
model_map = {
    "Chinese": [
        {
            "repo_id": "csukuangfj/wenet-chinese-model",
            "model_file": "final.zip",
            "tokens_file": "units.txt",
            "subfolder": ".",
        }
    ]
}
33
+
34
+
35
def download_model(repo_id: str,
                   nn_model_filename: str,
                   tokens_filename: str,
                   sub_folder: str,
                   local_model_dir: str,
                   ):
    """
    Download the model file and the tokens file of a Hugging Face Hub repository.

    :param repo_id: id of the repository to download from.
    :param nn_model_filename: name of the model file inside the repository.
    :param tokens_filename: name of the tokens file inside the repository.
    :param sub_folder: sub folder of the repository that contains both files.
    :param local_model_dir: passed to ``hf_hub_download`` as ``local_dir``.
    :return: a tuple ``(nn_model_filename, tokens_filename)`` holding the values returned
             by ``hf_hub_download`` for the two files.
    """

    def fetch(filename: str):
        # both files come from the same repository, sub folder and local dir.
        return huggingface_hub.hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            subfolder=sub_folder,
            local_dir=local_model_dir,
        )

    nn_model_path = fetch(nn_model_filename)
    tokens_path = fetch(tokens_filename)
    return nn_model_path, tokens_path
56
+
57
+
58
@lru_cache(maxsize=10)
def load_sherpa_offline_recognizer(nn_model_file: str,
                                   tokens_file: str,
                                   sample_rate: int = 16000,
                                   num_active_paths: int = 2,
                                   decoding_method: EnumDecodingMethod = EnumDecodingMethod.greedy_search,
                                   num_mel_bins: int = 80,
                                   frame_dither: int = 0,
                                   ):
    """
    Create a CPU ``sherpa.OfflineRecognizer``; results are cached per argument combination.

    :param nn_model_file: path of the model file.
    :param tokens_file: path of the tokens file.
    :param sample_rate: sample frequency set on the fbank frame options.
    :param num_active_paths: passed to the recognizer config as ``num_active_paths``.
    :param decoding_method: an ``EnumDecodingMethod`` member, or the equivalent
                            string value (e.g. "greedy_search").
    :param num_mel_bins: number of mel bins set on the fbank mel options.
    :param frame_dither: dither set on the fbank frame options.
    :return: the recognizer.
    :raises ValueError: If ``decoding_method`` is not a valid ``EnumDecodingMethod`` member or value.
    """
    # the recognizer config takes the method name as a plain string, not as an
    # Enum member. Looking the argument up in the Enum accepts both forms and
    # rejects unknown names early.
    decoding_method_name = EnumDecodingMethod(decoding_method).value

    feat_config = sherpa.FeatureConfig()
    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
    feat_config.fbank_opts.mel_opts.num_bins = num_mel_bins
    feat_config.fbank_opts.frame_opts.dither = frame_dither

    config = sherpa.OfflineRecognizerConfig(
        nn_model=nn_model_file,
        tokens=tokens_file,
        use_gpu=False,
        feat_config=feat_config,
        decoding_method=decoding_method_name,
        num_active_paths=num_active_paths,
    )

    recognizer = sherpa.OfflineRecognizer(config)
    return recognizer
83
+
84
+
85
def load_recognizer(
    repo_id: str,
    nn_model_filename: str,
    tokens_filename: str,
    sub_folder: str,
    local_model_dir: str,
    recognizer_type: EnumRecognizerType,
    decoding_method: EnumDecodingMethod = EnumDecodingMethod.greedy_search,
):
    """
    Download the model files unless ``local_model_dir`` already exists.

    NOTE: this is still a stub. No recognizer is created, ``recognizer_type`` and
    ``decoding_method`` are not used, and the function always returns None.

    :param repo_id: forwarded to ``download_model``.
    :param nn_model_filename: forwarded to ``download_model``.
    :param tokens_filename: forwarded to ``download_model``.
    :param sub_folder: forwarded to ``download_model``.
    :param local_model_dir: directory whose existence decides whether to download;
                            also forwarded to ``download_model``.
    :param recognizer_type: currently unused.
    :param decoding_method: currently unused.
    :return: None
    """
    # only the existence of the directory is checked, not of the files in it.
    if os.path.exists(local_model_dir):
        return

    download_model(
        repo_id=repo_id,
        nn_model_filename=nn_model_filename,
        tokens_filename=tokens_filename,
        sub_folder=sub_folder,
        local_model_dir=local_model_dir,
    )
    return
104
+
105
+
106
+ if __name__ == "__main__":
107
+ pass
requirements.txt CHANGED
@@ -3,6 +3,7 @@ torch==1.13.1
3
  torchaudio==0.13.1
4
  librosa==0.8.1
5
  numpy==1.22.0
 
6
 
7
  data/wheels/k2-1.23.4.dev20230130+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
8
  data/wheels/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
 
3
  torchaudio==0.13.1
4
  librosa==0.8.1
5
  numpy==1.22.0
6
+ sherpa-onnx>=1.9.21
7
 
8
  data/wheels/k2-1.23.4.dev20230130+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
9
  data/wheels/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl