mohAhmad committed on
Commit
1956df9
1 Parent(s): fc391ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -86,8 +86,18 @@ elif options == "Phoneme Practice":
86
 
87
  if uploaded_audio:
88
  with st.spinner("Analyzing phonemes..."):
89
- audio_input, _ = torchaudio.load(uploaded_audio)
90
- input_values = phoneme_processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
 
 
 
 
 
 
 
 
 
 
91
  logits = phoneme_model(input_values).logits
92
  predicted_ids = torch.argmax(logits, dim=-1)
93
  transcription = phoneme_processor.batch_decode(predicted_ids)
 
86
 
87
  if uploaded_audio:
88
  with st.spinner("Analyzing phonemes..."):
89
+ waveform, sample_rate = torchaudio.load(uploaded_audio)
90
+
91
+ # Ensure mono audio by averaging channels if stereo
92
+ if waveform.shape[0] > 1:
93
+ waveform = waveform.mean(dim=0, keepdim=True)
94
+
95
+ # Resample if needed
96
+ if sample_rate != 16000:
97
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
98
+ waveform = resampler(waveform)
99
+
100
+ input_values = phoneme_processor(waveform.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
101
  logits = phoneme_model(input_values).logits
102
  predicted_ids = torch.argmax(logits, dim=-1)
103
  transcription = phoneme_processor.batch_decode(predicted_ids)