import tempfile ,os
import gradio as gr
from transformers import AutoProcessor, AutoModelForCTC,pipeline
import torch
import numpy as np
import torchaudio


# Processor and CTC model for the Persian Wav2Vec2 checkpoint, fetched from the
# Hugging Face Hub (or the local cache) when the module is imported.
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
# Input widget (label: "Persian speech audio"). The top-level gr.Audio component
# replaces the deprecated gr.inputs.Audio, which newer Gradio releases removed.
# type="filepath" hands the callback a path on disk rather than raw samples.
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
# Output widget (label: "Persian text").
text_output = gr.TextArea(label="متن فارسی", type="text")
_ASR_MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"
_ASR_SAMPLE_RATE = 16000
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it on first use."""
    global _asr_pipeline
    if _asr_pipeline is None:
        # Building the pipeline loads the model weights, so do it once and
        # reuse the instance instead of paying that cost on every request.
        _asr_pipeline = pipeline("automatic-speech-recognition", model=_ASR_MODEL_ID)
    return _asr_pipeline


def ASR(audio):
    """Transcribe a Persian speech recording to text.

    Args:
        audio: Path of the audio file to transcribe.

    Returns:
        The transcription string produced by the speech-recognition
        pipeline, or an empty string when no audio path is given.
    """
    if not audio:
        # Nothing to transcribe; answer with empty text rather than failing
        # while trying to open a missing path.
        return ""

    # Load straight from the given path: no intermediate temporary copy and
    # no file handle left open.
    waveform, sample_rate = torchaudio.load(audio)

    # Down-mix all channels to one, then resample to the 16 kHz rate used for
    # the model input.
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    resampler = torchaudio.transforms.Resample(sample_rate, _ASR_SAMPLE_RATE)
    waveform = resampler(waveform)

    # The pipeline only accepts a 1-D array for raw audio, so drop the
    # leftover channel axis before converting to NumPy.
    samples = waveform.squeeze(0).numpy()

    result = _get_asr_pipeline()(samples)
    # The pipeline returns a dict; hand back just the transcription so the
    # text output shows the sentence rather than the dict's repr.
    return result["text"]
# Wire the transcription callback to the labelled input/output components.
# Using text_output (instead of an anonymous 'text' box) shows the Persian
# label that was configured for the result area.
iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
# share=False: do not request a public share link for the app.
iface.launch(share=False)