import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC
# Load the processor and CTC model once at startup
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

# Gradio components (labels are in Persian: "Persian speech audio" / "Persian text")
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
text_output = gr.TextArea(label="متن فارسی", type="text")
def ASR(audio):
    # Gradio passes the recording as a local file path (type="filepath"),
    # so it can be loaded with torchaudio directly
    waveform, sample_rate = torchaudio.load(audio)
    # Resample to the 16 kHz rate the Wav2Vec2 model expects
    if sample_rate != 16_000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
        waveform = resampler(waveform)
    # Preprocess the audio: the processor expects a 1-D NumPy array
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
    # Transcribe the audio
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    # Greedy CTC decoding: pick the most likely token per frame, then let the
    # processor collapse repeats and strip blank tokens
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
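
# A simpler alternative (a sketch, not wired into the app): the original script
# imported `pipeline` and built an unused instance inside ASR. The whole
# load/resample/decode sequence above can be replaced by a single pipeline
# call, which handles 16 kHz resampling and CTC decoding internally:
#
#   from transformers import pipeline
#   pipe = pipeline("automatic-speech-recognition",
#                   model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
#   def ASR(audio):
#       return pipe(audio)["text"]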
iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
iface.launch(share=False)
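
# Quick test without the UI (the file path below is a placeholder):
#   print(ASR("sample.wav"))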