cdactvm committed on
Commit
9d7dce1
1 Parent(s): bb2ede1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +675 -49
app.py CHANGED
@@ -1,68 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import warnings
2
- warnings.filterwarnings("ignore")
3
- import os
4
- import re
5
  import gradio as gr
6
- import numpy as np
7
- import torchaudio
8
- import nbimporter
9
  from transformers import pipeline
10
  from transformers import AutoProcessor
11
  from pyctcdecode import build_ctcdecoder
12
  from transformers import Wav2Vec2ProcessorWithLM
13
- from text2int import text_to_int
14
- from isNumber import is_number
15
- from Text2List import text_to_list
16
- from convert2list import convert_to_list
17
- from processDoubles import process_doubles
18
- from replaceWords import replace_words
19
-
20
- # transcriber = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
21
- # processor = AutoProcessor.from_pretrained("cdactvm/w2v-bert-2.0-hindi_v1")
22
-
23
- # vocab_dict = processor.tokenizer.get_vocab()
24
-
25
- # sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
26
- # decoder = build_ctcdecoder(
27
- # labels=list(sorted_vocab_dict.keys()),
28
- # kenlm_model_path="lm.binary",
29
- # )
30
- # processor_with_lm = Wav2Vec2ProcessorWithLM(
31
- # feature_extractor=processor.feature_extractor,
32
- # tokenizer=processor.tokenizer,
33
- # decoder=decoder
34
- # )
35
- # processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
36
 
37
-
38
- def transcribe(audio):
39
- # # Process the audio file
40
- transcript = transcriber(audio)
41
- text_value = transcript['text']
42
- print(text_value)
43
- processd_doubles=process_doubles(text_value)
44
- converted_to_list=convert_to_list(processd_doubles,text_to_list())
45
- replaced_words = replace_words(converted_to_list)
46
- converted_text=text_to_int(replaced_words)
47
- return converted_text
48
-
49
-
50
- # demo = gr.Interface(
51
- # transcribe,
52
- # gr.Audio(sources="microphone", type="filepath"),
53
- # "text",
54
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
58
  demo=gr.Interface(
59
- transcribe,
 
60
  inputs=[
 
 
 
 
 
 
 
61
  gr.Audio(sources=["microphone","upload"], type="filepath"),
 
 
62
  ],
63
  outputs=[
64
  "textbox"
 
65
  ],
 
 
 
66
  title="Automatic Speech Recognition",
67
  description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
68
  ).launch()
 
1
+ # import warnings
2
+ # warnings.filterwarnings("ignore")
3
+ # import os
4
+ # import re
5
+ # import gradio as gr
6
+ # import numpy as np
7
+ # import torchaudio
8
+ # import nbimporter
9
+ # from transformers import pipeline
10
+ # from transformers import AutoProcessor
11
+ # from pyctcdecode import build_ctcdecoder
12
+ # from transformers import Wav2Vec2ProcessorWithLM
13
+ # from text2int import text_to_int
14
+ # from isNumber import is_number
15
+ # from Text2List import text_to_list
16
+ # from convert2list import convert_to_list
17
+ # from processDoubles import process_doubles
18
+ # from replaceWords import replace_words
19
+
20
+ # # transcriber = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
21
+ # # processor = AutoProcessor.from_pretrained("cdactvm/w2v-bert-2.0-hindi_v1")
22
+
23
+ # # vocab_dict = processor.tokenizer.get_vocab()
24
+
25
+ # # sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
26
+ # # decoder = build_ctcdecoder(
27
+ # # labels=list(sorted_vocab_dict.keys()),
28
+ # # kenlm_model_path="lm.binary",
29
+ # # )
30
+ # # processor_with_lm = Wav2Vec2ProcessorWithLM(
31
+ # # feature_extractor=processor.feature_extractor,
32
+ # # tokenizer=processor.tokenizer,
33
+ # # decoder=decoder
34
+ # # )
35
+ # # processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
36
+
37
+
38
+ # def transcribe(audio):
39
+ # # # Process the audio file
40
+ # transcript = transcriber(audio)
41
+ # text_value = transcript['text']
42
+ # print(text_value)
43
+ # processd_doubles=process_doubles(text_value)
44
+ # converted_to_list=convert_to_list(processd_doubles,text_to_list())
45
+ # replaced_words = replace_words(converted_to_list)
46
+ # converted_text=text_to_int(replaced_words)
47
+ # return converted_text
48
+
49
+
50
+ # # demo = gr.Interface(
51
+ # # transcribe,
52
+ # # gr.Audio(sources="microphone", type="filepath"),
53
+ # # "text",
54
+ # # )
55
+
56
+ # # demo.launch()
57
+
58
+ # demo=gr.Interface(
59
+ # transcribe,
60
+ # inputs=[
61
+ # gr.Audio(sources=["microphone","upload"], type="filepath"),
62
+ # ],
63
+ # outputs=[
64
+ # "textbox"
65
+ # ],
66
+ # title="Automatic Speech Recognition",
67
+ # description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
68
+ # ).launch()
69
+
70
  import warnings
 
 
 
71
  import gradio as gr
 
 
 
72
  from transformers import pipeline
73
  from transformers import AutoProcessor
74
  from pyctcdecode import build_ctcdecoder
75
  from transformers import Wav2Vec2ProcessorWithLM
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
import os
import re
#import torchaudio

# Initialize the speech recognition pipeline and transliterator
# Two Odia ASR checkpoints are loaded so the UI can compare them side by side.
# NOTE: all of this runs at import time and downloads models on first start.
odia_model1 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-odia_v1")
odia_model2 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-odia_v2")
# p2 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
# punjaib_modle_30000=pipeline(task="automatic-speech-recognition", model="cdactvm/wav2vec-bert-punjabi-30000-model")
# punjaib_modle_155750=pipeline(task="automatic-speech-recognition", model="cdactvm/wav2vec-bert-punjabi-155750-model")
# punjaib_modle_70000_aug=pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-model-30000-augmented")
#p3 = pipeline(task="automatic-speech-recognition", model="cdactvm/kannada_w2v-bert_model")
#p4 = pipeline(task="automatic-speech-recognition", model="cdactvm/telugu_w2v-bert_model")
#p5 = pipeline(task="automatic-speech-recognition", model="Sajjo/w2v-bert-2.0-bangala-gpu-CV16.0_v2")
#p6 = pipeline(task="automatic-speech-recognition", model="cdactvm/hf-open-assames")
# p7 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-assames")
# Build a KenLM-backed CTC beam-search decoder on top of the Odia v2 vocabulary.
# The vocab is lowercased and sorted by token id so decoder labels line up
# with the model's output logits.
processor = AutoProcessor.from_pretrained("cdactvm/w2v-bert-odia_v2")
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="lm.binary",  # assumes lm.binary ships alongside app.py — TODO confirm
)
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)
processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
#p8 = pipeline("automatic-speech-recognition", model="cdactvm/w2v-assames", tokenizer=processor_with_lm, feature_extractor=processor_with_lm.feature_extractor, decoder=processor_with_lm.decoder)


# NOTE(review): installing a dependency via os.system at import time is fragile
# and runs arbitrary code from the network — consider pinning in requirements.
os.system('git clone https://github.com/irshadbhat/indic-trans.git')
os.system('pip install ./indic-trans/.')

#HF_TOKEN = os.getenv('HF_TOKEN')
#hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "asr_demo")

# Imported after the pip install above, so it must stay below os.system calls.
from indictrans import Transliterator
116
+
117
+ ###########################################
118
+
119
+ # Function to replace incorrectly spelled words
120
def replace_words(sentence):
    """Normalise common ASR misspellings of English/Hindi number words.

    Each (regex, replacement) pair is applied in order with re.sub, so
    earlier rules take precedence (e.g. 'dubalathri' is expanded before
    the generic 'dubal' -> 'double' rule).

    Returns the corrected sentence string.
    """
    # Fix: the original listed (r'\bnn\b', 'one') twice; the second pass was a
    # no-op and has been removed. Order of the remaining rules is preserved.
    replacements = [
        (r'\bjiro\b', 'zero'), (r'\bjero\b', 'zero'),
        (r'\bnn\b', 'one'), (r'\bn\b', 'one'), (r'\bvan\b', 'one'), (r'\bna\b', 'one'), (r'\bek\b', 'one'),
        (r'\btu\b', 'two'), (r'\btoo\b', 'two'), (r'\bdo\b', 'two'),
        (r'\bthiri\b', 'three'), (r'\btiri\b', 'three'), (r'\bdubalathri\b', 'double three'), (r'\btin\b', 'three'),
        (r'\bfor\b', 'four'), (r'\bfore\b', 'four'),
        (r'\bfib\b', 'five'), (r'\bpaanch\b', 'five'),
        (r'\bchha\b', 'six'), (r'\bchhah\b', 'six'), (r'\bchau\b', 'six'),
        (r'\bdublseven\b', 'double seven'), (r'\bsath\b', 'seven'),
        (r'\baath\b', 'eight'),
        (r'\bnau\b', 'nine'),
        (r'\bdas\b', 'ten'),
        (r'\bnineeit\b', 'nine eight'),
        (r'\bfipeit\b', 'five eight'), (r'\bdubal\b', 'double'), (r'\bsevenatu\b', 'seven two'),
    ]
    for pattern, replacement in replacements:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence
139
+
140
+ # Function to process "double" followed by a number
141
def process_doubles(sentence):
    """Expand 'double X' / 'dubal X' into 'X X' (e.g. 'double seven' -> 'seven seven').

    A trailing 'double'/'dubal' with no following word is kept as-is.
    """
    words = sentence.split()
    expanded = []
    pos = 0
    total = len(words)
    while pos < total:
        current = words[pos]
        if current in ("double", "dubal") and pos + 1 < total:
            # Duplicate the following token and drop the marker word itself.
            expanded.extend([words[pos + 1], words[pos + 1]])
            pos += 2
        else:
            # Ordinary word, or a dangling doubler at the end of the sentence.
            expanded.append(current)
            pos += 1
    return ' '.join(expanded)
158
+
159
+ # Function to generate Soundex code for a word
160
def soundex(word):
    """Return a 4-character Soundex code for *word*, or None if it has no letters.

    Variant notes: 'H' and 'W' are skipped entirely (not used as separators),
    and unmapped letters/vowels contribute '0' before the dedup/strip pass.
    """
    letters = ''.join(ch for ch in word.upper() if ch.isalpha())
    if not letters:
        return None
    digit_for = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
        'D': '3', 'T': '3', 'L': '4', 'M': '5', 'N': '5', 'R': '6',
    }
    # First letter is kept verbatim; the rest map to digits ('0' for vowels etc.).
    raw = letters[0]
    for ch in letters[1:]:
        if ch in ('H', 'W'):
            continue
        raw += digit_for.get(ch, '0')
    # Collapse adjacent duplicates, then drop the '0' placeholders and pad.
    collapsed = raw[0]
    for prev, cur in zip(raw, raw[1:]):
        if cur != prev:
            collapsed += cur
    padded = collapsed.replace('0', '') + '000'
    return padded[:4]
177
+
178
+ # Function to convert text to numerical representation
179
def is_number(x):
    """Return True if *x* (string or number) parses as a float.

    Commas are stripped from strings first, so '1,234' counts as numeric.
    Non-parseable values (including None) return False.
    """
    if isinstance(x, str):  # idiom fix: was `type(x) == str`
        x = x.replace(',', '')
    try:
        float(x)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit. TypeError covers non-numeric
        # objects such as None or lists.
        return False
    return True
187
+
188
def text2int(textnum, numwords={}):
    """Convert a Soundex-coded transcript of number words into digit strings.

    *textnum* is a space-separated string whose tokens are Soundex codes
    (e.g. 'O500' for "one", 'T000' for "two"); runs of number codes are
    accumulated and emitted as digits, while non-number tokens pass through
    unchanged. Returns the rebuilt string.

    NOTE: `numwords={}` is a mutable default deliberately reused as a
    one-time cache — it is populated on the first call and shared by all
    subsequent calls (guarded by `if not numwords`).
    """
    # Soundex codes for "zero".."nineteen" (index == value).
    units = ['Z600', 'O500','T000','T600','F600','F100','S220','S150','E300','N500',
    'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
    # Soundex codes for "twenty".."ninety" (index*10 == value; first two unused).
    tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
    # Soundex codes for "hundred", "thousand", "million", "crore"(?) multipliers.
    scales = ['H536', 'T253', 'M450', 'C600']
    # NOTE(review): ordinal_words maps to *strings* (Soundex codes), but the
    # ordinal branch below does `current * scale + increment`, which would
    # raise TypeError (int + str) if one of these words ever matched — confirm
    # whether this path is reachable with real transcripts.
    ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
    'sixth': 'S200','seventh': 'S150','eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
    ordinal_endings = [('ieth', 'y'), ('th', '')]
    if not numwords:
        # Build the token -> (scale, increment) lookup exactly once.
        numwords['and'] = (1, 0)
        for idx, word in enumerate(units): numwords[word] = (1, idx)
        for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    textnum = textnum.replace('-', ' ')

    # Accumulator state for the running number being parsed.
    current = result = 0
    curstring = ''
    onnumber = False    # currently inside a run of number tokens
    lastunit = False    # previous token was a unit (0..19)
    lastscale = False   # previous token was a scale multiplier

    def is_numword(x):
        # True for literal numerics ("42", "1,000") or known Soundex codes.
        if is_number(x):
            return True
        if x in numwords:
            return True
        return False

    def from_numword(x):
        # Map a token to its (scale, increment) pair.
        if is_number(x):
            scale = 0
            increment = int(x.replace(',', ''))
            return scale, increment
        return numwords[x]

    for word in textnum.split():
        if word in ordinal_words:
            # Ordinal branch — see NOTE(review) above about string increments.
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
            lastunit = False
            lastscale = False
        else:
            # Strip ordinal suffixes ("fourth" -> "four", "twentieth" -> "twenty").
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if (not is_numword(word)) or (word == 'and' and not lastscale):
                # Non-number token: flush any accumulated number, then emit it.
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
                lastunit = False
                lastscale = False
            else:
                scale, increment = from_numword(word)
                onnumber = True

                # Two units in a row ("one two") are separate digits, not a sum:
                # flush the previous one without a trailing space.
                if lastunit and (word not in scales):
                    curstring += repr(result + current)
                    result = current = 0

                if scale > 1:
                    # "hundred" after nothing means 1 hundred.
                    current = max(1, current)

                current = current * scale + increment
                if scale > 100:
                    # Large scales close out a group (e.g. "two thousand ...").
                    result += current
                    current = 0

                lastscale = False
                lastunit = False
                if word in scales:
                    lastscale = True
                elif word in units:
                    lastunit = True

    # Flush a trailing number run.
    if onnumber:
        curstring += repr(result + current)

    return curstring
274
+
275
+ # Convert sentence to transcript using Soundex
276
# Convert sentence to transcript using Soundex
def sentence_to_transcript(sentence, word_to_code_map):
    """Encode each word of *sentence* as its Soundex code.

    Newly seen words are added to *word_to_code_map* (mutated in place),
    so the caller can later invert the mapping to decode the transcript.
    """
    codes = []
    for token in sentence.split():
        if token not in word_to_code_map:
            word_to_code_map[token] = soundex(token)
        codes.append(word_to_code_map[token])
    return ' '.join(codes)
287
+
288
+ # Convert transcript back to sentence using mapping
289
# Convert transcript back to sentence using mapping
def transcript_to_sentence(transcript, code_to_word_map):
    """Decode a Soundex transcript back to words.

    Codes missing from *code_to_word_map* (e.g. digit strings produced by
    text2int) pass through unchanged.
    """
    decoded = [code_to_word_map.get(code, code) for code in transcript.split()]
    return ' '.join(decoded)
298
+
299
+ # # Process the audio file
300
+ # transcript = pipe("./odia_recorded/AUD-20240614-WA0004.wav")
301
+ # text_value = transcript['text']
302
+ # sentence = trn.transform(text_value)
303
+ # replaced_words = replace_words(sentence)
304
+ # processed_sentence = process_doubles(replaced_words)
305
+
306
+ # input_sentence_1 = processed_sentence
307
+
308
# Create empty mappings
# word -> Soundex code cache and its inverse, used by the commented-out
# one-shot demo below.
word_to_code_map = {}
code_to_word_map = {}

# Convert sentence to transcript
# transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)

# Convert transcript to numerical representation
# numbers = text2int(transcript_1)

# Create reverse mapping
# NOTE(review): word_to_code_map is still empty here, so this evaluates to {}
# at import time; process_transcription() builds its own local reverse map,
# so these module-level dicts appear to be vestigial — confirm before removal.
code_to_word_map = {v: k for k, v in word_to_code_map.items()}
320
+
321
def process_transcription(input_sentence):
    """Soundex-encode a sentence, turn number codes into digits, decode back to words.

    Uses fresh per-call word<->code maps so repeated calls do not interfere.
    Returns the rebuilt sentence, or an error string if a stage yields None.
    """
    encode_cache = {}
    transcript = sentence_to_transcript(input_sentence, encode_cache)
    if transcript is None:
        return "Error: Transcript conversion returned None"

    numeric = text2int(transcript)
    if numeric is None:
        return "Error: Text to number conversion returned None"

    # Invert the cache so surviving Soundex codes map back to original words.
    decode_cache = {code: word for word, code in encode_cache.items()}
    return transcript_to_sentence(numeric, decode_cache)
336
+
337
+ ###########################################
338
+
339
def transcribe_punjabi_30000(speech):
    """Transcribe Punjabi speech with the 30000-step model; strips '[PAD]' tokens.

    NOTE: `punjaib_modle_30000` is commented out in the setup above — this
    raises NameError until that pipeline is re-enabled.
    """
    text = punjaib_modle_30000(speech)["text"]
    # Fix: None must be checked BEFORE .replace(); the original called
    # text.replace first, which would raise AttributeError on None.
    if text is None:
        return "Error: ASR returned None"
    return text.replace("[PAD]", "")
345
+
346
def transcribe_punjabi_eng_model_30000(speech):
    """Punjabi ASR (30000-step model) -> Roman transliteration -> digit post-processing."""
    trn = Transliterator(source='pan', target='eng', build_lookup=True)
    text = punjaib_modle_30000(speech)["text"]
    # Fix: None-check before .replace(); the original order could raise
    # AttributeError before the guard ever ran.
    if text is None:
        return "Error: ASR returned None"
    text = text.replace("[PAD]", "")
    sentence = trn.transform(text)
    if sentence is None:
        return "Error: Transliteration returned None"
    replaced_words = replace_words(sentence)
    processed_sentence = process_doubles(replaced_words)
    # Fix: removed the unreachable `return sentence` that followed this return.
    return process_transcription(processed_sentence)
359
+
360
def transcribe_punjabi_70000_aug(speech):
    """Transcribe Punjabi speech with the augmented 70000-step model; strips '<s>' tokens.

    NOTE: `punjaib_modle_70000_aug` is commented out in the setup above — this
    raises NameError until that pipeline is re-enabled.
    """
    text = punjaib_modle_70000_aug(speech)["text"]
    # Fix: None must be checked BEFORE .replace() (original order would raise
    # AttributeError on None).
    if text is None:
        return "Error: ASR returned None"
    return text.replace("<s>", "")
366
+
367
def transcribe_punjabi_eng_model_70000_aug(speech):
    """Punjabi ASR (augmented 70000-step model) -> Roman transliteration -> digit post-processing."""
    trn = Transliterator(source='pan', target='eng', build_lookup=True)
    text = punjaib_modle_70000_aug(speech)["text"]
    # Fix: None-check before .replace(); original order could raise AttributeError.
    if text is None:
        return "Error: ASR returned None"
    text = text.replace("<s>", "")
    sentence = trn.transform(text)
    if sentence is None:
        return "Error: Transliteration returned None"
    replaced_words = replace_words(sentence)
    processed_sentence = process_doubles(replaced_words)
    # Fix: removed the unreachable `return sentence` after this return.
    return process_transcription(processed_sentence)
380
+
381
def transcribe_punjabi_155750(speech):
    """Transcribe Punjabi speech with the 155750-step model; strips '[PAD]' tokens.

    NOTE: `punjaib_modle_155750` is commented out in the setup above — this
    raises NameError until that pipeline is re-enabled.
    """
    text = punjaib_modle_155750(speech)["text"]
    # Fix: None must be checked BEFORE .replace() (original order would raise
    # AttributeError on None).
    if text is None:
        return "Error: ASR returned None"
    return text.replace("[PAD]", "")
387
+
388
def transcribe_punjabi_eng_model_155750(speech):
    """Punjabi ASR (155750-step model) -> Roman transliteration -> digit post-processing."""
    trn = Transliterator(source='pan', target='eng', build_lookup=True)
    text = punjaib_modle_155750(speech)["text"]
    # Fix: None-check before .replace(); original order could raise AttributeError.
    if text is None:
        return "Error: ASR returned None"
    text = text.replace("[PAD]", "")
    sentence = trn.transform(text)
    if sentence is None:
        return "Error: Transliteration returned None"
    replaced_words = replace_words(sentence)
    processed_sentence = process_doubles(replaced_words)
    # Fix: removed the unreachable `return sentence` after this return.
    return process_transcription(processed_sentence)
401
+
402
+ ###########################################
403
def transcribe_odiya_model1(speech):
    """Run the first Odia ASR model on *speech* and return the recognized text."""
    recognized = odia_model1(speech)["text"]
    return "Error: ASR returned None" if recognized is None else recognized
408
+
409
def transcribe_odiya_model2(speech):
    """Run the second Odia ASR model on *speech* and return the recognized text."""
    recognized = odia_model2(speech)["text"]
    return "Error: ASR returned None" if recognized is None else recognized
414
+
415
def transcribe_odiya_eng_model1(speech):
    """Odia ASR (model 1) -> Roman transliteration -> number-word post-processing."""
    translit = Transliterator(source='ori', target='eng', build_lookup=True)
    raw = odia_model1(speech)["text"]
    if raw is None:
        return "Error: ASR returned None"
    romanized = translit.transform(raw)
    if romanized is None:
        return "Error: Transliteration returned None"
    cleaned = process_doubles(replace_words(romanized))
    return process_transcription(cleaned)
426
+
427
def transcribe_odiya_eng_model2(speech):
    """Odia ASR (model 2) -> Roman transliteration -> number-word post-processing."""
    translit = Transliterator(source='ori', target='eng', build_lookup=True)
    raw = odia_model2(speech)["text"]
    if raw is None:
        return "Error: ASR returned None"
    romanized = translit.transform(raw)
    if romanized is None:
        return "Error: Transliteration returned None"
    cleaned = process_doubles(replace_words(romanized))
    return process_transcription(cleaned)
438
+
439
+ ########################################
440
########################################
def cleanhtml(raw_html):
    """Strip anything that looks like an HTML/XML tag from *raw_html*."""
    return re.sub(r'<.*?>', '', raw_html)
443
+ #######################################
444
+
445
+ # def transcribe_hindi(speech):
446
+ # text = p2(speech)["text"]
447
+ # if text is None:
448
+ # return "Error: ASR returned None"
449
+ # return text
450
+
451
# Mapping from Devanagari / English-loanword number words to digit strings.
# Insertion order matters: str.replace is applied sequentially, so multi-char
# words like "टेन" must be handled with care if entries are ever reordered.
_HINDI_DIGIT_MAP = {
    "सेवन": "7",
    "जीरो": "0",
    "वन" : "1",
    "टू" : "2",
    "थ्री" : "3",
    "त्री" : "3",
    "फोर" : "4",
    "फाइव": "5",
    "सिक्स": "6",
    "एट": "8",
    "नाइन": "9",
    "टेन": "10",
    "एक": "1",
    "दो": "2",
    "तीन": "3",
    "चार": "4",
    "पांच": "5",
    "पाँच": "5",
    "छह": "6",
    "छः": "6",
    "सात": "7",
    "आठ": "8",
    "नौ": "9",
    "दस": "10"
}

def _merge_adjacent_digits(text):
    """Collapse space-separated digit groups into one number ('1 2 3' -> '123')."""
    merged = []
    for part in text.split(' '):
        if part.isdigit() and merged and merged[-1].isdigit():
            merged[-1] += part
        else:
            merged.append(part)
    return ' '.join(merged)

def transcribe_hindi(speech):
    """Hindi ASR via `p2`, mapping spoken number words to digits.

    NOTE: `p2` is commented out in the setup above — this raises NameError
    until that pipeline is re-enabled.
    """
    text = p2(speech)["text"]
    if text is None:
        return "Error: ASR returned None"

    # Substring replacement (not word-boundary aware) of number words.
    for hindi, num in _HINDI_DIGIT_MAP.items():
        text = text.replace(hindi, num)

    return _merge_adjacent_digits(text)
509
+
510
+ ###########################################################
511
###########################################################
def transcribe_kannada(speech):
    """Run the Kannada ASR pipeline `p3` on *speech* and return its text."""
    recognized = p3(speech)["text"]
    return "Error: ASR returned None" if recognized is None else recognized
516
def transcribe_telugu(speech):
    """Run the Telugu ASR pipeline `p4` on *speech* and return its text."""
    recognized = p4(speech)["text"]
    return "Error: ASR returned None" if recognized is None else recognized
521
+
522
def transcribe_bangala(speech):
    """Run the Bengali ASR pipeline `p5` on *speech* and return its text."""
    recognized = p5(speech)["text"]
    return "Error: ASR returned None" if recognized is None else recognized
527
+
528
def transcribe_assamese_LM(speech):
    """Assamese ASR with LM decoding (`p8`); strips HTML-like tags from the output.

    NOTE: `p8` is commented out in the setup above — this raises NameError
    until that pipeline is re-enabled.
    """
    text = p8(speech)["text"]
    # Fix: None must be checked BEFORE cleanhtml() — re.sub on None raises
    # TypeError, so the original guard could never fire.
    if text is None:
        return "Error: ASR returned None"
    return cleanhtml(text)
534
+
535
def transcribe_assamese_model2(speech):
    """Assamese ASR via `p7`; strips HTML-like tags from the output.

    NOTE: `p7` is commented out in the setup above — this raises NameError
    until that pipeline is re-enabled.
    """
    text = p7(speech)["text"]
    # Fix: None must be checked BEFORE cleanhtml() — re.sub on None raises
    # TypeError, so the original guard could never fire.
    if text is None:
        return "Error: ASR returned None"
    return cleanhtml(text)
541
+
542
def transcribe_ban_eng(speech):
    """Bengali ASR (`p5`) -> Roman transliteration -> number-word post-processing."""
    translit = Transliterator(source='ben', target='eng', build_lookup=True)
    raw = p5(speech)["text"]
    if raw is None:
        return "Error: ASR returned None"
    romanized = translit.transform(raw)
    if romanized is None:
        return "Error: Transliteration returned None"
    cleaned = process_doubles(replace_words(romanized))
    return process_transcription(cleaned)
553
+
554
def transcribe_hin_eng(speech):
    """Hindi ASR (`p2`) -> Roman transliteration -> number-word post-processing."""
    translit = Transliterator(source='hin', target='eng', build_lookup=True)
    raw = p2(speech)["text"]
    if raw is None:
        return "Error: ASR returned None"
    romanized = translit.transform(raw)
    if romanized is None:
        return "Error: Transliteration returned None"
    cleaned = process_doubles(replace_words(romanized))
    return process_transcription(cleaned)
565
+
566
def transcribe_kan_eng(speech):
    """Kannada ASR (`p3`) -> Roman transliteration -> number-word post-processing."""
    translit = Transliterator(source='kan', target='eng', build_lookup=True)
    raw = p3(speech)["text"]
    if raw is None:
        return "Error: ASR returned None"
    romanized = translit.transform(raw)
    if romanized is None:
        return "Error: Transliteration returned None"
    cleaned = process_doubles(replace_words(romanized))
    return process_transcription(cleaned)
577
+
578
def transcribe_tel_eng(speech):
    """Telugu ASR (`p4`) -> Roman transliteration -> number-word post-processing."""
    translit = Transliterator(source='tel', target='eng', build_lookup=True)
    raw = p4(speech)["text"]
    if raw is None:
        return "Error: ASR returned None"
    romanized = translit.transform(raw)
    if romanized is None:
        return "Error: Transliteration returned None"
    cleaned = process_doubles(replace_words(romanized))
    return process_transcription(cleaned)
589
+
590
+
591
def sel_lng(lng, mic=None, file=None):
    """Dispatch recorded/uploaded audio to the transcriber selected by *lng*.

    The microphone recording is preferred over the uploaded file. Returns the
    transcription, or an explicit error message when no audio is supplied or
    the language option is unknown.
    """
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"

    # Lambdas keep the name lookup lazy, matching the original if/elif chain:
    # transcribers whose models are commented out (or never defined, e.g.
    # transcribe_odiya) only raise NameError when actually selected.
    handlers = {
        "Odiya": lambda a: transcribe_odiya(a),
        "Odiya-trans": lambda a: transcribe_odiya_eng(a),
        "Hindi-trans": lambda a: transcribe_hin_eng(a),
        "Hindi": lambda a: transcribe_hindi(a),
        "Kannada-trans": lambda a: transcribe_kan_eng(a),
        "Kannada": lambda a: transcribe_kannada(a),
        "Telugu-trans": lambda a: transcribe_tel_eng(a),
        "Telugu": lambda a: transcribe_telugu(a),
        "Bangala-trans": lambda a: transcribe_ban_eng(a),
        "Bangala": lambda a: transcribe_bangala(a),
        "Assamese-LM": lambda a: transcribe_assamese_LM(a),
        "Assamese-Model2": lambda a: transcribe_assamese_model2(a),
        "Odia_model1": lambda a: transcribe_odiya_model1(a),
        "Odiya_trans_model1": lambda a: transcribe_odiya_eng_model1(a),
        "Odia_model2": lambda a: transcribe_odiya_model2(a),
        "Odia_trans_model2": lambda a: transcribe_odiya_eng_model2(a),
        "Punjabi_Model0": lambda a: transcribe_punjabi_30000(a),
        "Punjabi_Model0_Trans": lambda a: transcribe_punjabi_eng_model_30000(a),
        "Punjabi_Model_aug": lambda a: transcribe_punjabi_70000_aug(a),
        "Punjabi_Model_aug_Trans": lambda a: transcribe_punjabi_eng_model_70000_aug(a),
        "Punjabi_Model1": lambda a: transcribe_punjabi_155750(a),
        "Punjabi_Model1_Trans": lambda a: transcribe_punjabi_eng_model_155750(a),
    }
    handler = handlers.get(lng)
    if handler is None:
        # Fix: the original fell through and implicitly returned None for an
        # unknown selection; return a visible message instead.
        return f"Error: unsupported language option '{lng}'"
    return handler(audio)
643
+
644
+
645
+
646
+
647
+ # Convert transcript back to sentence
648
+ # reconstructed_sentence_1 = transcript_to_sentence(numbers, code_to_word_map)
649
 
650
+ # demo=gr.Interface(
651
+ # fn=sel_lng,
652
+
653
+ # inputs=[
654
+
655
+ # gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans"],value="Hindi",label="Select Language"),
656
+ # gr.Audio(source="microphone", type="filepath"),
657
+ # gr.Audio(source= "upload", type="filepath"),
658
+ # #gr.Audio(sources="upload", type="filepath"),
659
+ # #"state"
660
+ # ],
661
+ # outputs=[
662
+ # "textbox"
663
+ # # #"state"
664
+ # ],
665
+ # title="Automatic Speech Recognition",
666
+ # description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
667
+ # ).launch()
668
 
669
######################################################
# Gradio UI: language dropdown + audio input -> transcribed text.
# Runs at import time; launch() blocks and serves the app.
# NOTE(review): only the four Odia options are currently exposed; they match
# the keys handled by sel_lng above.
demo=gr.Interface(
    fn=sel_lng,

    inputs=[

        #gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans","Kannada","Kannada-trans","Telugu","Telugu-trans","Bangala","Bangala-trans"],value="Hindi",label="Select Language"),
        gr.Dropdown([
            # "Hindi","Hindi-trans",
            "Odia_model1","Odiya_trans_model1","Odia_model2","Odia_trans_model2"],label="Select Language"),
        # "Assamese-LM","Assamese-Model2",
        # "Punjabi_Model1","Punjabi_Model1_Trans","Punjabi_Model_aug","Punjabi_Model_aug_Trans"],value="Hindi",label="Select Language"),
        gr.Audio(sources=["microphone","upload"], type="filepath"),
        #gr.Audio(sources="upload", type="filepath"),
        #"state"
    ],
    outputs=[
        "textbox"
        # #"state"
    ],
    allow_flagging="auto",
    #flagging_options=["Language error", "English transliteration error", "Other"],
    #flagging_callback=hf_writer,
    title="Automatic Speech Recognition",
    description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
).launch()