skytnt committed on
Commit
dde297c
•
1 Parent(s): 5e9e468

fix vc bugs

Browse files
Files changed (1) hide show
  1. app.py +7 -2
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
 
3
  os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
4
 
 
5
  import numpy as np
6
  import torch
7
  from torch import no_grad, LongTensor
@@ -34,9 +35,13 @@ def tts_fn(text, speaker_id):
34
 
35
  def vc_fn(original_speaker_id, target_speaker_id, input_audio):
36
  sampling_rate, audio = input_audio
37
- y = torch.FloatTensor(audio.astype(np.float32)) / hps.data.max_wav_value
 
 
 
 
 
38
  y = y.unsqueeze(0)
39
-
40
  spec = spectrogram_torch(y, hps.data.filter_length,
41
  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
42
  center=False)
 
2
 
3
  os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
4
 
5
+ import librosa
6
  import numpy as np
7
  import torch
8
  from torch import no_grad, LongTensor
 
35
 
36
  def vc_fn(original_speaker_id, target_speaker_id, input_audio):
37
  sampling_rate, audio = input_audio
38
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
39
+ if len(audio.shape) > 1:
40
+ audio = librosa.to_mono(audio.transpose(1, 0))
41
+ if sampling_rate != hps.data.sampling_rate:
42
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
43
+ y = torch.FloatTensor(audio)
44
  y = y.unsqueeze(0)
 
45
  spec = spectrogram_torch(y, hps.data.filter_length,
46
  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
47
  center=False)