|
import tempfile ,os |
|
import gradio as gr |
|
from transformers import AutoProcessor, AutoModelForCTC,pipeline |
|
import torch |
|
import numpy as np |
|
import torchaudio |
|
|
|
|
|
# Hugging Face Hub checkpoint that supplies both the processor and the CTC model.
MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"

# Loaded once at import time; ASR() reads these module-level objects.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCTC.from_pretrained(MODEL_ID)

# Gradio components. The audio input hands ASR() a file path
# (type="filepath"); the labels are Persian for "Persian speech audio"
# and "Persian text".
audio_input = gr.Audio(
    label="صوت گفتار فارسی",
    type="filepath",
)
text_output = gr.TextArea(
    label="متن فارسی",
    type="text",
)
|
def ASR(audio):
    """Transcribe a Persian speech recording to text.

    Loads the audio file with torchaudio, downmixes it to a single channel,
    resamples it to 16 kHz, runs the module-level Wav2Vec2 CTC ``model`` on
    the features produced by the module-level ``processor``, and greedily
    decodes the highest-scoring token of every frame.

    Args:
        audio: Path to the audio file (the Gradio input component is
            configured with ``type="filepath"``). ``None`` or an empty
            path, as received when nothing was recorded or uploaded,
            yields an empty transcription instead of raising.

    Returns:
        The decoded transcription as a string.
    """
    if not audio:
        return ""

    target_rate = 16_000

    # Read straight from the given path: no temporary copy is needed, and
    # no file handle is left open behind us.
    waveform, sample_rate = torchaudio.load(audio)

    # torchaudio returns (channels, frames). Averaging over the channel axis
    # gives a 1-D mono signal; squeeze() alone would leave stereo input 2-D,
    # which the feature extractor would treat as a batch of two utterances.
    waveform = waveform.mean(dim=0)

    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
        waveform = resampler(waveform)

    # Run the processor once and reuse its outputs.
    inputs = processor(
        waveform.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(
            inputs.input_values,
            # Not every Wav2Vec2 feature extractor emits a mask; pass None
            # in that case rather than failing on a missing attribute.
            attention_mask=inputs.get("attention_mask"),
        ).logits

    # logits are (batch, frames, vocab); argmax keeps the batch axis, so
    # decode as a batch and return its single entry.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
|
# Wire the transcription function to its input and output components, then
# start the web app without requesting a public share link.
iface = gr.Interface(
    fn=ASR,
    inputs=audio_input,
    outputs=text_output,
)
iface.launch(share=False)