import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC
# Load the processor and CTC model once at startup
processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")

# Gradio components (labels are in Persian: "Persian speech audio" / "Persian text")
audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
text_output = gr.TextArea(label="متن فارسی", type="text")
def ASR(audio):
    # Gradio passes the recording as a local file path (type="filepath"),
    # so it can be loaded with torchaudio directly
    waveform, sample_rate = torchaudio.load(audio)
    # Resample to the 16 kHz rate the Wav2Vec2 model expects
    if sample_rate != 16_000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
        waveform = resampler(waveform)
    # Preprocess the audio: the processor expects a 1-D NumPy array
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt")
    # Transcribe the audio
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    # Greedy CTC decoding: pick the most likely token per frame, then let the
    # processor collapse repeats and strip blank tokens
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
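
# A simpler alternative (a sketch, not wired into the app): the original script
# imported `pipeline` and built an unused instance inside ASR. The whole
# load/resample/decode sequence above can be replaced by a single pipeline
# call, which handles 16 kHz resampling and CTC decoding internally:
#
#   from transformers import pipeline
#   pipe = pipeline("automatic-speech-recognition",
#                   model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
#   def ASR(audio):
#       return pipe(audio)["text"]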
iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
iface.launch(share=False)
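
# Quick test without the UI (the file path below is a placeholder):
#   print(ASR("sample.wav"))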