SeyedAli committed on
Commit
56a4486
1 Parent(s): 7c8c991

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -4
app.py CHANGED
@@ -8,7 +8,7 @@ import torchaudio
8
 
9
  processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
10
  model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
11
- audio_input = gr.inputs.Audio(label="صوت گفتار فارسی", type="filepath")
12
  text_output = gr.TextArea(label="متن فارسی", type="text")
13
  def ASR(audio):
14
  pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
@@ -21,12 +21,10 @@ def ASR(audio):
21
  # Resample the audio to 16kHz
22
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
23
  waveform = resampler(waveform)
24
- # Convert the audio to a single channel
25
- waveform = torch.mean(waveform, dim=0, keepdim=True)
26
  # Convert the PyTorch tensor to a NumPy ndarray
27
  audio_array = waveform.numpy()
28
  #inputs = processor(audio_array, sampling_rate=16_000)
29
  text = pipe(audio_array)
30
  return text
31
- iface = gr.Interface(fn=ASR, inputs=audio_input, outputs='text')
32
  iface.launch(share=False)
 
8
 
9
  processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
10
  model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
11
+ audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
12
  text_output = gr.TextArea(label="متن فارسی", type="text")
13
  def ASR(audio):
14
  pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
 
21
  # Resample the audio to 16kHz
22
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
23
  waveform = resampler(waveform)
 
 
24
  # Convert the PyTorch tensor to a NumPy ndarray
25
  audio_array = waveform.numpy()
26
  #inputs = processor(audio_array, sampling_rate=16_000)
27
  text = pipe(audio_array)
28
  return text
29
+ iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
30
  iface.launch(share=False)