"""Gradio demo that transcribes Persian speech with a Wav2Vec2 CTC model."""

import os
import tempfile

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import AutoModelForCTC, AutoProcessor, pipeline

MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"
# Rate the audio is resampled to before it is handed to the pipeline.
TARGET_SAMPLE_RATE = 16000

# Kept as module-level names; ASR() itself goes through the pipeline below.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCTC.from_pretrained(MODEL_ID)

# Built once at import time so the model is not reloaded on every request.
_asr_pipeline = pipeline("automatic-speech-recognition", model=MODEL_ID)

audio_input = gr.inputs.Audio(label="صوت گفتار فارسی", type="filepath")
text_output = gr.TextArea(label="متن فارسی", type="text")


def ASR(audio):
    """Transcribe the audio file at path *audio* and return the recognized text.

    Args:
        audio: Filesystem path of the uploaded/recorded audio, or ``None``
            when the form was submitted without any audio.

    Returns:
        The transcription string, or an empty string when *audio* is ``None``.
    """
    if audio is None:
        return ""

    # torchaudio reads the uploaded file directly; no temporary copy is needed.
    waveform, sample_rate = torchaudio.load(audio)

    # Resample to the target rate: a bare ndarray carries no sampling-rate
    # information, so it must already be at the rate the pipeline assumes.
    resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)

    # Down-mix (channels, frames) to a 1-D mono signal. The pipeline rejects
    # ndarray input that is not one-dimensional, so the channel axis is dropped.
    mono = torch.mean(waveform, dim=0)

    result = _asr_pipeline(mono.numpy())
    # The pipeline returns a dict; the interface output expects the plain text.
    return result["text"]


iface = gr.Interface(fn=ASR, inputs=audio_input, outputs='text')
iface.launch(share=False)