Update app.py
Browse files
app.py
CHANGED
@@ -13,43 +13,6 @@ text_output = gr.TextArea(label="متن فارسی", type="text")
|
|
13 |
|
14 |
# The processor and the CTC model are both loaded from the same pretrained
# checkpoint identifier.
_CHECKPOINT = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForCTC.from_pretrained(_CHECKPOINT)
|
16 |
-
|
17 |
-
# Characters removed from transcriptions by the normalizer. The list keeps
# its duplicate entries and original order.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";",
    ":", '""', "%", "'", '"', "�",
    "#", "!", "?", "«", "»", "(",
    ")", "؛", ",", "?", ".", "!",
    "-", ";", ":", '"', "“", "%",
    "‘", "�", "–", "…", "_", "”",
    "“", "„",
]
|
22 |
-
# Invisible Unicode formatting characters, each mapped to a plain space.
chars_to_mapping = {
    "\u200c": " ",  # U+200C ZERO WIDTH NON-JOINER
    "\u200d": " ",  # U+200D ZERO WIDTH JOINER
    "\u200e": " ",  # U+200E LEFT-TO-RIGHT MARK
    "\u200f": " ",  # U+200F RIGHT-TO-LEFT MARK
    "\ufeff": " ",  # U+FEFF ZERO WIDTH NO-BREAK SPACE (BOM)
}
|
25 |
-
|
26 |
-
def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in ``text`` with its value.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Dict mapping literal substrings to replacements.

    Returns:
        The string form of ``text`` with all replacements applied in one pass.
    """
    text = str(text)
    # An empty mapping would produce the empty pattern, which matches at every
    # position and makes the lookup below raise KeyError('').
    if not chars_to_mapping:
        return text
    # Keys are escaped so they are matched literally, not as regex syntax.
    pattern = "|".join(map(re.escape, chars_to_mapping))
    return re.sub(pattern, lambda match: chars_to_mapping[match.group()], text)
|
29 |
-
|
30 |
-
def remove_special_characters(text, chars_to_ignore_regex):
    """Delete regex matches from ``text``, lowercase it, and append a space.

    Args:
        text: String to clean.
        chars_to_ignore_regex: Regex pattern whose matches are removed.

    Returns:
        The cleaned, lowercased string followed by one trailing space.
    """
    stripped = re.sub(chars_to_ignore_regex, "", text)
    return f"{stripped.lower()} "
|
33 |
-
|
34 |
-
def normalizer(batch, chars_to_ignore, chars_to_mapping):
    """Normalize the first transcription in ``batch``.

    The text is lowercased and stripped, U+0307 (combining dot above) is
    replaced by a space, ``chars_to_mapping`` substitutions are applied via
    ``multiple_replace``, and every character listed in ``chars_to_ignore``
    is removed via ``remove_special_characters``.

    Args:
        batch: Indexable whose first element is the string to normalize.
        chars_to_ignore: Iterable of strings whose characters are removed.
        chars_to_mapping: Dict of literal substrings to their replacements.

    Returns:
        The normalized string.
    """
    text = batch[0].lower().strip()
    text = text.replace("\u0307", " ").strip()
    text = multiple_replace(text, chars_to_mapping)

    if chars_to_ignore:
        # Escape every entry before building the character class. Unescaped,
        # neighbouring entries such as "!", "-", ";" form the range "!-;",
        # which silently deletes ASCII digits and "$&*+/" as well, and a "]",
        # "^" or "\" entry would corrupt the class.
        escaped = "".join(map(re.escape, chars_to_ignore))
        chars_to_ignore_regex = f"[{escaped}]"
    else:
        # "[]" is not a valid pattern; use one that never matches instead.
        chars_to_ignore_regex = r"(?!)"

    return remove_special_characters(text, chars_to_ignore_regex)
|
44 |
-
|
45 |
-
|
46 |
-
def speech_file_to_array_fn(batch):
    """Load the audio at ``batch["path"]`` and store it resampled to 16 kHz.

    Args:
        batch: Mapping with a ``"path"`` entry naming the audio file.

    Returns:
        The same ``batch`` with the resampled array stored under ``"speech"``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    # The sample rates are passed by keyword: librosa >= 0.10 made them
    # keyword-only, so the positional form raises TypeError there, while the
    # keyword form is accepted by older releases too.
    batch["speech"] = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=16_000,
    )
    return batch
|
53 |
|
54 |
def ASR(audio):
|
55 |
pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
|
@@ -70,12 +33,7 @@ def ASR(audio):
|
|
70 |
with torch.no_grad():
|
71 |
logits = model(input_values,attention_mask).logits
|
72 |
# Decode the transcription
|
73 |
-
#result = normalizer(processor.batch_decode(torch.argmax(logits[0], dim=-1)),chars_to_ignore,chars_to_mapping)
|
74 |
result = processor.decode(torch.argmax(logits[0], dim=-1))
|
75 |
-
# max_items = np.random.randint(0, len(result), 10).tolist()
|
76 |
-
# for i in max_items:
|
77 |
-
# transcription=result[i]
|
78 |
-
# return transcription
|
79 |
return result
|
80 |
iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
|
81 |
iface.launch(share=False)
|
|
|
13 |
|
14 |
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
|
15 |
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def ASR(audio):
|
18 |
pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
|
|
|
33 |
with torch.no_grad():
|
34 |
logits = model(input_values,attention_mask).logits
|
35 |
# Decode the transcription
|
|
|
36 |
result = processor.decode(torch.argmax(logits[0], dim=-1))
|
|
|
|
|
|
|
|
|
37 |
return result
|
38 |
iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
|
39 |
iface.launch(share=False)
|