Spaces:

SeyedAli
/

Persian-Speech-Transcription

Running

Update app.py

7c8c991 about 1 year ago

1.53 kB

	import tempfile ,os
	import gradio as gr
	from transformers import AutoProcessor, AutoModelForCTC,pipeline
	import torch
	import numpy as np
	import torchaudio


	# Model artefacts are downloaded/loaded once, when the module starts.
	# NOTE(review): neither `processor` nor `model` is referenced anywhere else in
	# the visible code (ASR() creates its own pipeline from the model id), so these
	# two loads look redundant -- confirm nothing else imports them before removing.
	processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
	model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

	# Input widget: the callback receives the path of the uploaded/recorded file.
	# `gr.inputs.*` is the legacy namespace (deprecated in Gradio 3, removed in
	# Gradio 4); the top-level `gr.Audio` component accepts the same keywords.
	audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")

	# Output widget for the transcription.
	# NOTE(review): the interface below is built with outputs='text', so this
	# component is currently unused -- presumably it was meant to be passed there.
	text_output = gr.TextArea(label="متن فارسی", type="text")
	ASR_MODEL_ID = "SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1"
	ASR_SAMPLE_RATE = 16_000  # rate the audio is resampled to before inference

	# Lazily-created pipeline shared by every request (see _get_asr_pipeline).
	_asr_pipeline = None


	def _get_asr_pipeline():
	    """Return the shared speech-recognition pipeline, creating it on first use."""
	    global _asr_pipeline
	    if _asr_pipeline is None:
	        # Building the pipeline loads the whole model, so do it only once
	        # instead of on every transcription request.
	        _asr_pipeline = pipeline("automatic-speech-recognition", model=ASR_MODEL_ID)
	    return _asr_pipeline


	def ASR(audio):
	    """Transcribe a Persian speech recording.

	    Args:
	        audio: Path of the audio file to transcribe, or ``None`` when no
	            recording was supplied.

	    Returns:
	        The result object produced by the speech-recognition pipeline (per the
	        transformers documentation, a dict whose ``"text"`` entry holds the
	        transcription). An empty transcription is returned for ``None`` input.
	    """
	    if audio is None:
	        # Nothing was uploaded/recorded; avoid failing inside the audio loader.
	        return {"text": ""}

	    # Load straight from the given path: copying the bytes into a temporary
	    # ".wav" file first is unnecessary and mislabels non-WAV uploads.
	    waveform, sample_rate = torchaudio.load(audio)

	    # Resample the audio to 16 kHz.
	    resampler = torchaudio.transforms.Resample(sample_rate, ASR_SAMPLE_RATE)
	    waveform = resampler(waveform)

	    # Average the channels into one. The channel axis is dropped on purpose:
	    # the pipeline only accepts a 1-D array and rejects a (1, n_samples) one.
	    mono = torch.mean(waveform, dim=0)

	    return _get_asr_pipeline()(mono.numpy())
	def _transcription_text(audio):
	    """Run ASR on *audio* and return only the transcription string.

	    ASR() hands back the pipeline's result object; a text output component
	    would render that object's repr, so the ``"text"`` field is extracted here.
	    A plain (non-dict) result is passed through unchanged.
	    """
	    result = ASR(audio)
	    if isinstance(result, dict):
	        return result["text"]
	    return result


	# Single-input / single-output web UI around the transcription function.
	iface = gr.Interface(fn=_transcription_text, inputs=audio_input, outputs='text')
	# share=False: do not create a public share link.
	iface.launch(share=False)