SeyedAli committed on
Commit
56a4486
1 Parent(s): 7c8c991

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -4
app.py CHANGED
@@ -8,7 +8,7 @@ import torchaudio
8
 
9
  processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
10
  model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
11
- audio_input = gr.inputs.Audio(label="صوت گفتار فارسی", type="filepath")
12
  text_output = gr.TextArea(label="متن فارسی", type="text")
13
  def ASR(audio):
14
  pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
@@ -21,12 +21,10 @@ def ASR(audio):
21
  # Resample the audio to 16kHz
22
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
23
  waveform = resampler(waveform)
24
- # Convert the audio to a single channel
25
- waveform = torch.mean(waveform, dim=0, keepdim=True)
26
  # Convert the PyTorch tensor to a NumPy ndarray
27
  audio_array = waveform.numpy()
28
  #inputs = processor(audio_array, sampling_rate=16_000)
29
  text = pipe(audio_array)
30
  return text
31
- iface = gr.Interface(fn=ASR, inputs=audio_input, outputs='text')
32
  iface.launch(share=False)
 
8
 
9
  processor = AutoProcessor.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
10
  model = AutoModelForCTC.from_pretrained("SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
11
+ audio_input = gr.Audio(label="صوت گفتار فارسی", type="filepath")
12
  text_output = gr.TextArea(label="متن فارسی", type="text")
13
  def ASR(audio):
14
  pipe = pipeline("automatic-speech-recognition", model="SeyedAli/Persian-Speech-Transcription-Wav2Vec2-V1")
 
21
  # Resample the audio to 16kHz
22
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
23
  waveform = resampler(waveform)
 
 
24
  # Convert the PyTorch tensor to a NumPy ndarray
25
  audio_array = waveform.numpy()
26
  #inputs = processor(audio_array, sampling_rate=16_000)
27
  text = pipe(audio_array)
28
  return text
29
+ iface = gr.Interface(fn=ASR, inputs=audio_input, outputs=text_output)
30
  iface.launch(share=False)