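# Persian speech-to-text demo: a small Gradio app that transcribes Persian
# audio with the SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1 checkpoint
# (Wav2Vec2 acoustic model + greedy CTC decoding).
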
import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC

# UI components: an audio input and a right-to-left text area for the transcript
# (labels read "Persian speech audio" and "Persian text")
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
text_output = gr.TextArea(label="متن فارسی", text_align="right", rtl=True, type="text")

# Load the processor (feature extractor + tokenizer) and the CTC model once at
# startup; from_pretrained downloads the checkpoint from the Hugging Face Hub
# on first use and caches it locally
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

def ASR(audio):
    # Gradio passes the recording as a filepath; load it directly with torchaudio
    # (no temporary copy needed)
    waveform, sample_rate = torchaudio.load(audio)
    # Downmix multi-channel recordings to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Wav2Vec2 expects 16 kHz input; resample only when needed
    if sample_rate != 16_000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16_000)(waveform)
    # Extract input features (and an attention mask, if this processor returns one)
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
    # Transcribe without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits
    # Greedy CTC decoding: pick the most likely token at each frame, then let
    # the tokenizer collapse repeats and drop blank tokens
    predicted_ids = torch.argmax(logits[0], dim=-1)
    return processor.decode(predicted_ids)

iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
iface.launch(share=False)
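
# A minimal sketch for exercising ASR() outside the web UI, assuming a local
# Persian recording at "sample.wav" (the path is hypothetical):
#
#     print(ASR("sample.wav"))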