Spaces:

SeyedAli
/

Persian-Speech-Transcription

Running

SeyedAli committed on Sep 21, 2023

Commit

b9e5247

•

1 Parent(s): f1cb24f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,14 +19,15 @@ def ASR(audio):
         # Load the audio file using torchaudio
         waveform, sample_rate = torchaudio.load(temp_audio_file.name)
         # Resample the audio to 16kHz
-        # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-        # waveform = resampler(waveform)
         # Convert the PyTorch tensor to a NumPy ndarray
         # Preprocess the audio file
         input_values = processor(waveform.squeeze().numpy(),sampling_rate=16_000, return_tensors="pt").input_values
         # Transcribe the audio file
         with torch.no_grad():
-            logits = model(input_values).logits
         # Decode the transcription
         transcription = processor.decode(torch.argmax(logits, dim=-1))
         return transcription

         # Load the audio file using torchaudio
         waveform, sample_rate = torchaudio.load(temp_audio_file.name)
         # Resample the audio to 16kHz
+        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+        waveform = resampler(waveform)
         # Convert the PyTorch tensor to a NumPy ndarray
         # Preprocess the audio file
         input_values = processor(waveform.squeeze().numpy(),sampling_rate=16_000, return_tensors="pt").input_values
+        attention_mask = processor(waveform.squeeze().numpy(),sampling_rate=16_000, return_tensors="pt").attention_mask
         # Transcribe the audio file
         with torch.no_grad():
+            logits = model(input_values,attention_mask).logits
         # Decode the transcription
         transcription = processor.decode(torch.argmax(logits, dim=-1))
         return transcription