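# app.py (1.87 kB), as shown in the repository file viewer.
# Author: SeyedAli. Latest commit: 8aac9d1 ("Update app.py").
# (The viewer's "raw" / "history" / "blame" links were page chrome, not part of the script.)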
import tempfile ,os
import gradio as gr
from transformers import AutoProcessor, AutoModelForCTC,pipeline
import torch
import numpy as np
import torchaudio
import numpy as np
import re
import string
# Hub checkpoint that provides both the processor and the CTC model.
_CHECKPOINT = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"

# Gradio input component; type="filepath" means the handler receives a path.
audio_input = gr.Audio(
    label="صوت گفتار فارسی",
    type="filepath",
)

# Gradio output component: right-aligned, right-to-left text area.
text_output = gr.TextArea(
    label="متن فارسی",
    text_align="right",
    rtl=True,
    type="text",
)

# Processor and model are created once at module import and reused by ASR().
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForCTC.from_pretrained(_CHECKPOINT)
# Sampling rate the audio is resampled to before it is handed to the processor.
_TARGET_SAMPLE_RATE = 16_000


def ASR(audio):
    """Transcribe a speech recording with the module-level processor and model.

    Args:
        audio: Path to the audio file to transcribe (the Gradio input
            component is configured with ``type="filepath"``). ``None`` or an
            empty string means no recording was supplied.

    Returns:
        The greedy (arg-max) CTC transcription as decoded by ``processor``,
        or ``""`` when no audio was supplied.
    """
    # Nothing to transcribe: return an empty result instead of failing while
    # trying to open a missing path.
    if not audio:
        return ""

    # Read the file directly. The previous version first copied it into a
    # temporary file through an unclosed handle, which added nothing, and it
    # also built an unused `pipeline(...)` on every call.
    waveform, sample_rate = torchaudio.load(audio)

    # Collapse the leading (channel) axis by averaging so a single 1-D signal
    # reaches the processor. Previously only `logits[0]` was decoded, so any
    # additional channel was at best ignored.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample only when the file is not already at the target rate.
    if sample_rate != _TARGET_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, _TARGET_SAMPLE_RATE
        )

    # Run the processor once and forward everything it returns; this avoids
    # the duplicated preprocessing and the hard dependency on an
    # `attention_mask` entry being present.
    inputs = processor(
        waveform.numpy(),
        sampling_rate=_TARGET_SAMPLE_RATE,
        return_tensors="pt",
    )

    # Inference only; no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: most likely token id per frame, then let the processor
    # turn the id sequence into text.
    predicted_ids = torch.argmax(logits[0], dim=-1)
    return processor.decode(predicted_ids)
# Connect the transcription function to the audio input and text output,
# then start the app with share=False.
iface = gr.Interface(
    fn=ASR,
    inputs=audio_input,
    outputs=text_output,
)
iface.launch(share=False)