Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,900 Bytes
3629250 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import torch
import librosa
import soundfile as sf
import gradio as gr
import torchaudio
import os
from Amphion.models.ns3_codec import FACodecEncoder, FACodecDecoder
# Build the FACodec encoder and decoder, restore their weights from the local
# checkpoint files, and put both modules on the inference device in eval mode.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)

fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

# Deserialize onto the CPU first: without ``map_location`` torch.load tries to
# restore each tensor to the device it was saved from, which fails on a
# CPU-only host if the checkpoint was written from a GPU. The modules are
# moved to the real target device right below.
# NOTE(review): torch.load unpickles the file, so these checkpoints must come
# from a trusted source.
fa_encoder.load_state_dict(
    torch.load("ns3_facodec_encoder.bin", map_location="cpu")
)
fa_decoder.load_state_dict(
    torch.load("ns3_facodec_decoder.bin", map_location="cpu")
)

# Prefer CUDA when available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fa_encoder = fa_encoder.to(device)
fa_decoder = fa_decoder.to(device)

# Inference only: switch both modules to evaluation mode.
fa_encoder.eval()
fa_decoder.eval()
def codec_inference(speech_path):
    """Run a speech file through FACodec and save the reconstruction.

    The file is read with ``librosa.load`` at 16 kHz, encoded by the
    module-level ``fa_encoder``, quantized and decoded by ``fa_decoder``, and
    the reconstructed waveform is written to ``temp/result.wav``.

    Args:
        speech_path: Path of the audio file to process.

    Returns:
        The path of the written WAV file, always ``"temp/result.wav"``.

    Raises:
        gr.Error: If ``speech_path`` is empty or ``None`` (for example, the
            form was submitted without any audio).
    """
    # Fail with a readable message instead of an opaque loader error.
    if not speech_path:
        raise gr.Error("Please upload or record a speech file first.")

    # Used both for loading the input and for writing the output.
    sample_rate = 16000

    with torch.no_grad():
        wav, _ = librosa.load(speech_path, sr=sample_rate)
        # Add two leading singleton dimensions before feeding the encoder.
        wav = torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)

        enc_out = fa_encoder(wav)
        # Only the post-quantization embedding and the speaker embedding are
        # needed for reconstruction; the other outputs are ignored.
        vq_post_emb, _, _, _, spk_embs = fa_decoder(
            enc_out, eval_vq=False, vq=True
        )
        recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)

    # NOTE: every call writes to the same path, so a new result overwrites
    # the previous one.
    os.makedirs("temp", exist_ok=True)
    result_path = "temp/result.wav"
    sf.write(result_path, recon_wav[0, 0].cpu().numpy(), sample_rate)
    return result_path
# Gradio front end: one audio input (file upload or microphone) is handed to
# ``codec_inference`` as a file path, and the returned WAV path is shown in an
# audio player.
demo_inputs = [
    gr.Audio(
        label="Upload the speech file",
        type="filepath",
        sources=["upload", "microphone"],
    ),
]

# Output player; its label is intentionally left empty.
demo_outputs = gr.Audio(label="")

demo = gr.Interface(
    title="NaturalSpeech3 FACodec",
    fn=codec_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|