Update README.md
README.md CHANGED
@@ -1,3 +1,34 @@
---
license: apache-2.0
---

This model was trained on Google's AudioSet (28 GB of data) for 1 million steps. (Two million steps were originally planned, but I am still exploring a better training schedule.)

You can regard it as a pretrained base model, a concept that is common for language models but not yet for vocoders.

How to load and use this model:

```python
import torch
import torchaudio
from scipy.io.wavfile import write
from vocos import Vocos


def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    # Clamp before taking the log so silent frames don't produce -inf.
    return torch.log(torch.clip(x, min=clip_val))


# Build the model from the config in this repo and load the trained weights.
# strict=False: the training checkpoint may contain extra keys (e.g.
# discriminator weights) that the inference model does not use.
checkpoint = torch.load(
    "./vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt",
    map_location="cpu",
)
vocos = Vocos.from_hparams("./config.yaml")
vocos.load_state_dict(checkpoint["state_dict"], strict=False)
vocos.eval()

voice, sr = torchaudio.load("example.wav")  # must be sample_rate=32000
if sr != 32000:
    raise ValueError(f"expected 32 kHz audio, got {sr} Hz")

# Compute the log-mel spectrogram the model expects
# (power=1 yields a magnitude, not power, spectrogram).
mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=32000, n_fft=2048, hop_length=1024, n_mels=128, center=True, power=1,
)(voice)
mel = safe_log(mel)

with torch.no_grad():
    audio = vocos.decode(mel)

write("out.wav", 32000, audio.flatten().numpy())
```
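
If your source audio is not already at 32 kHz, you can resample it before computing the mel spectrogram instead of raising an error. A minimal sketch using torchaudio's built-in resampler (the 32000 Hz target simply matches the model above):

```python
import torchaudio

voice, sr = torchaudio.load("example.wav")
if sr != 32000:
    # Resample to the 32 kHz rate this vocoder was trained on.
    voice = torchaudio.functional.resample(voice, orig_freq=sr, new_freq=32000)
    sr = 32000
```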