SeyedAli's picture
Update app.py
7c8c991
raw
history blame
1.53 kB
import tempfile ,os
import gradio as gr
from transformers import AutoProcessor, AutoModelForCTC,pipeline
import torch
import numpy as np
import torchaudio
# Eagerly load the processor and CTC model for the Persian wav2vec2 checkpoint.
# NOTE(review): ASR() builds its own pipeline from the same checkpoint id and
# does not reference these two objects; they are kept so the module's
# top-level names stay unchanged. Confirm nothing else needs them before
# removing, since loading them costs start-up time and memory.
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

# Input widget (label: "Persian speech audio"). `gr.Audio` is the supported
# spelling of the deprecated `gr.inputs.Audio`; type="filepath" makes Gradio
# pass the callback a path on disk rather than decoded samples.
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")

# Output widget (label: "Persian text").
text_output = gr.TextArea(label="متن فارسی", type="text")
# Hugging Face Hub id of the checkpoint used for transcription.
_ASR_MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"
# Sampling rate (Hz) the audio is converted to before transcription.
_ASR_SAMPLE_RATE = 16_000
# Lazily-built pipeline, reused across calls so the model is loaded only once.
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it on first use."""
    global _asr_pipeline
    if _asr_pipeline is None:
        _asr_pipeline = pipeline("automatic-speech-recognition", model=_ASR_MODEL_ID)
    return _asr_pipeline


def ASR(audio):
    """Transcribe a speech recording to text.

    Args:
        audio: Path to an audio file readable by ``torchaudio.load``, or a
            falsy value (``None`` / ``""``) when no recording was supplied.

    Returns:
        The transcript as a string; an empty string when ``audio`` is falsy.
    """
    # Nothing was uploaded/recorded: return an empty transcript instead of
    # failing while trying to open a missing path.
    if not audio:
        return ""

    # Read the file directly; copying it to a temporary file first is
    # unnecessary and left the source file handle unclosed.
    waveform, sample_rate = torchaudio.load(audio)

    # Resample only when the file is not already at the target rate.
    if sample_rate != _ASR_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, _ASR_SAMPLE_RATE)
        waveform = resampler(waveform)

    # Average the channels and drop the channel axis: the pipeline only
    # accepts a 1-D single-channel array.
    mono = torch.mean(waveform, dim=0)

    result = _get_asr_pipeline()(
        {"raw": mono.numpy(), "sampling_rate": _ASR_SAMPLE_RATE}
    )
    # The pipeline returns a dict; the text output needs only the transcript.
    return result["text"]
# Build the web UI: the audio widget feeds ASR and the result is shown in a
# plain text output.
iface = gr.Interface(
    fn=ASR,
    inputs=audio_input,
    outputs="text",
)

# Start the app without requesting a public share link.
iface.launch(share=False)