|
import tempfile ,os |
|
import gradio as gr |
|
from transformers import AutoProcessor, AutoModelForCTC,pipeline |
|
import torch |
|
import numpy as np |
|
import torchaudio |
|
|
|
|
|
# Processor and CTC model for the Persian Wav2Vec2 checkpoint, loaded eagerly
# at import time. Nothing in the visible part of this file reads `processor`
# or `model` (ASR builds its own pipeline from the same checkpoint name); they
# are kept so that any code relying on these module-level names keeps working.
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

# Audio input handed to the callback as a path on disk (type="filepath").
# `gr.Audio` is used instead of the deprecated `gr.inputs.Audio` alias, which
# newer Gradio releases no longer provide.
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")

# Text area component intended to display the Persian transcript.
text_output = gr.TextArea(label="متن فارسی", type="text")
|
# Checkpoint name and the sample rate the audio is resampled to before
# it is handed to the recognition pipeline.
_ASR_MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"
_ASR_SAMPLE_RATE = 16000

# Lazily-built pipeline shared by all requests (see _get_asr_pipeline).
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it on first use."""
    global _asr_pipeline
    if _asr_pipeline is None:
        # Building the pipeline loads the model weights, so do it only once
        # instead of on every request.
        _asr_pipeline = pipeline("automatic-speech-recognition", model=_ASR_MODEL_ID)
    return _asr_pipeline


def _load_mono_16k(path):
    """Load the audio file at *path* as a 1-D mono NumPy array at 16 kHz."""
    waveform, sample_rate = torchaudio.load(path)
    # Average over dim 0 (the channel axis in torchaudio's default layout)
    # WITHOUT keepdim, so the result is 1-D: the ASR pipeline rejects raw
    # arrays that are not one-dimensional.
    waveform = torch.mean(waveform, dim=0)
    if sample_rate != _ASR_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, _ASR_SAMPLE_RATE)
        waveform = resampler(waveform)
    return waveform.numpy()


def ASR(audio):
    """Transcribe a Persian speech recording to text.

    Args:
        audio: Path to the audio file to transcribe, or ``None``/empty when
            no audio was provided.

    Returns:
        The transcript as a string; an empty string when no audio was given.
    """
    # Nothing recorded or uploaded: return an empty transcript rather than
    # failing while trying to open a missing path.
    if not audio:
        return ""

    # The file is loaded straight from the given path; copying it into a
    # temporary file first is unnecessary and left a file handle unclosed.
    samples = _load_mono_16k(audio)
    result = _get_asr_pipeline()(samples)
    # The pipeline returns a dict; the transcript is stored under "text".
    return result["text"]
|
# Connect the ASR callback to the audio input and to the Persian-labelled
# text area defined earlier in this file, so its label is actually shown
# (the generic 'text' shortcut left `text_output` unused).
iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)

# Start the web UI; share=False means no public share link is requested.
iface.launch(share=False)