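# Persian speech-to-text demo: a small Gradio app that transcribes Persian
# audio with the SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1 checkpoint
# (Wav2Vec2 acoustic model + greedy CTC decoding).
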
import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC

# UI components: an audio input and a right-to-left text area for the transcript
# (labels read "Persian speech audio" and "Persian text")
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
text_output = gr.TextArea(label="متن فارسی", text_align="right", rtl=True, type="text")

# Load the processor (feature extractor + tokenizer) and the CTC model once at
# startup; from_pretrained downloads the checkpoint from the Hugging Face Hub
# on first use and caches it locally
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

def ASR(audio):
    # Gradio passes the recording as a filepath; load it directly with torchaudio
    # (no temporary copy needed)
    waveform, sample_rate = torchaudio.load(audio)
    # Downmix multi-channel recordings to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Wav2Vec2 expects 16 kHz input; resample only when needed
    if sample_rate != 16_000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16_000)(waveform)
    # Extract input features (and an attention mask, if this processor returns one)
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
    # Transcribe without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits
    # Greedy CTC decoding: pick the most likely token at each frame, then let
    # the tokenizer collapse repeats and drop blank tokens
    predicted_ids = torch.argmax(logits[0], dim=-1)
    return processor.decode(predicted_ids)

iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
iface.launch(share=False)
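
# A minimal sketch for exercising ASR() outside the web UI, assuming a local
# Persian recording at "sample.wav" (the path is hypothetical):
#
#     print(ASR("sample.wav"))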