import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, AutoProcessor

MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"

# Load the processor and CTC model once at startup, not per request
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCTC.from_pretrained(MODEL_ID)

audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
text_output = gr.TextArea(label="متن فارسی", text_align="right", rtl=True)


def ASR(audio):
    # Gradio already hands us a filepath, so load it directly with torchaudio;
    # no temporary-file copy is needed
    waveform, sample_rate = torchaudio.load(audio)

    # Downmix multi-channel recordings to mono so the model gets a 1-D signal
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the 16 kHz rate Wav2Vec2 expects, but only when necessary
    if sample_rate != 16_000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
        waveform = resampler(waveform)

    # Preprocess: one processor call yields both input values and attention mask
    inputs = processor(
        waveform.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt"
    )

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = model(
            inputs.input_values, attention_mask=inputs.attention_mask
        ).logits

    # Greedy CTC decoding: argmax over each frame, then collapse repeats/blanks
    return processor.decode(torch.argmax(logits[0], dim=-1))


iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
iface.launch(share=False)
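# A minimal local sanity check that bypasses the Gradio UI; "sample.wav" is a
# hypothetical Persian speech file, not shipped with this app:
#
#     print(ASR("sample.wav"))
#
# The original draft also constructed a transformers pipeline; an equivalent
# sketch using that higher-level API, which handles loading, resampling, and
# decoding internally, would be:
#
#     from transformers import pipeline
#     pipe = pipeline("automatic-speech-recognition", model=MODEL_ID)
#     print(pipe("sample.wav")["text"])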