Update app.py
Browse files
app.py
CHANGED
@@ -86,8 +86,18 @@ elif options == "Phoneme Practice":
|
|
86 |
|
87 |
if uploaded_audio:
|
88 |
with st.spinner("Analyzing phonemes..."):
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
logits = phoneme_model(input_values).logits
|
92 |
predicted_ids = torch.argmax(logits, dim=-1)
|
93 |
transcription = phoneme_processor.batch_decode(predicted_ids)
|
|
|
86 |
|
87 |
if uploaded_audio:
|
88 |
with st.spinner("Analyzing phonemes..."):
|
89 |
+
waveform, sample_rate = torchaudio.load(uploaded_audio)
|
90 |
+
|
91 |
+
# Ensure mono audio by averaging channels if stereo
|
92 |
+
if waveform.shape[0] > 1:
|
93 |
+
waveform = waveform.mean(dim=0, keepdim=True)
|
94 |
+
|
95 |
+
# Resample if needed
|
96 |
+
if sample_rate != 16000:
|
97 |
+
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
98 |
+
waveform = resampler(waveform)
|
99 |
+
|
100 |
+
input_values = phoneme_processor(waveform.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
|
101 |
logits = phoneme_model(input_values).logits
|
102 |
predicted_ids = torch.argmax(logits, dim=-1)
|
103 |
transcription = phoneme_processor.batch_decode(predicted_ids)
|