Spaces:

SeyedAli
/

Persian-Speech-Transcription

Running

SeyedAli committed on Sep 21, 2023

Commit

7c8c991

•

1 Parent(s): 66237b4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import torchaudio
 processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
 model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
 audio_input = gr.inputs.Audio(label="صوت گفتار فارسی", type="filepath")
-#text_output = gr.Textbox(label="متن فارسی", type="text")
 def ASR(audio):
    pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
@@ -22,7 +22,7 @@ def ASR(audio):
         resampler = torchaudio.transforms.Resample(sample_rate, 16000)
         waveform = resampler(waveform)
         # Convert the audio to a single channel
-        waveform = torchaudio.functional.downmix_mono(waveform)
         # Convert the PyTorch tensor to a NumPy ndarray
         audio_array = waveform.numpy()
         #inputs = processor(audio_array, sampling_rate=16_000)

 processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
 model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
 audio_input = gr.inputs.Audio(label="صوت گفتار فارسی", type="filepath")
+text_output = gr.TextArea(label="متن فارسی", type="text")
 def ASR(audio):
    pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
         resampler = torchaudio.transforms.Resample(sample_rate, 16000)
         waveform = resampler(waveform)
         # Convert the audio to a single channel
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
         # Convert the PyTorch tensor to a NumPy ndarray
         audio_array = waveform.numpy()
         #inputs = processor(audio_array, sampling_rate=16_000)