Spaces:
Running
on
Zero
Running
on
Zero
import torch | |
import librosa | |
import soundfile as sf | |
import gradio as gr | |
import torchaudio | |
import os | |
from Amphion.models.ns3_codec import FACodecEncoder, FACodecDecoder | |
# Build the FACodec encoder/decoder pair. The released checkpoints are loaded
# into these modules below, so the constructor arguments have to match the
# architecture the weights were trained with.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)
fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],  # encoder ratios in reverse order
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location="cpu" makes deserialization independent of the device the
# checkpoint tensors were saved from, so the CPU fallback above keeps working
# on hosts without CUDA. The modules are moved to `device` right afterwards.
# NOTE(review): consider torch.load(..., weights_only=True) if the checkpoints
# are plain state dicts and the installed torch version supports it.
fa_encoder.load_state_dict(
    torch.load("ns3_facodec_encoder.bin", map_location="cpu")
)
fa_decoder.load_state_dict(
    torch.load("ns3_facodec_decoder.bin", map_location="cpu")
)

# Module.to() and Module.eval() both return the module, so they can be chained.
# eval() switches the modules to inference behaviour.
fa_encoder = fa_encoder.to(device).eval()
fa_decoder = fa_decoder.to(device).eval()
def codec_inference(speech_path):
    """Run a speech file through FACodec and save the reconstructed audio.

    The file is loaded at 16 kHz, encoded with ``fa_encoder``, passed through
    ``fa_decoder`` (with ``vq=True``) and re-synthesized with
    ``fa_decoder.inference``.

    Args:
        speech_path: Path of the input audio file, as delivered by the
            ``gr.Audio(type="filepath")`` input component. Empty/``None``
            when the user submitted without providing audio.

    Returns:
        Path of the written WAV file (``"temp/result.wav"``).

    Raises:
        gr.Error: If no input audio was provided.
    """
    # Surface a readable message in the UI instead of an opaque failure
    # inside librosa when the form is submitted without any audio.
    if not speech_path:
        raise gr.Error("Please upload or record a speech file first.")

    # Single source of truth for the rate used to load the input and to
    # write the output.
    sample_rate = 16000

    # The returned sampling rate equals the requested one, so it is discarded.
    wav, _ = librosa.load(speech_path, sr=sample_rate)

    with torch.no_grad():
        # Prepend two singleton dimensions before feeding the encoder.
        wav = torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)
        enc_out = fa_encoder(wav)
        # Only the first and last of the five decoder outputs are needed
        # for reconstruction.
        vq_post_emb, _, _, _, spk_embs = fa_decoder(
            enc_out, eval_vq=False, vq=True
        )
        recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)

    os.makedirs("temp", exist_ok=True)
    # NOTE(review): every request writes to the same file; if requests can
    # run concurrently they may overwrite each other's result — confirm the
    # app's queue/concurrency settings.
    result_path = "temp/result.wav"
    sf.write(result_path, recon_wav[0, 0].cpu().numpy(), sample_rate)
    return result_path
# Markdown rendered below the title of the demo page.
_DEMO_DESCRIPTION = """
## FACodec: Speech Codec with Attribute Factorization used for NaturalSpeech 3
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2403.03100.pdf)
[![demo](https://img.shields.io/badge/FACodec-Demo-red)](https://speechresearch.github.io/naturalspeech3/)
[![model](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-pink)](https://huggingface.co/amphion/naturalspeech3_facodec)
## Overview
FACodec is a core component of the advanced text-to-speech (TTS) model NaturalSpeech 3. FACodec converts complex speech waveform into disentangled subspaces representing speech attributes of content, prosody, timbre, and acoustic details and reconstruct high-quality speech waveform from these attributes. FACodec decomposes complex speech into subspaces representing different attributes, thus simplifying the modeling of speech representation.
Research can use FACodec to develop different modes of TTS models, such as non-autoregressive based discrete diffusion (NaturalSpeech 3) or autoregressive models (like VALL-E).
"""

# Single audio input; type="filepath" means the callback receives a path.
demo_inputs = [
    gr.Audio(
        type="filepath",
        label="Upload the speech file",
        sources=["upload", "microphone"],
    ),
]

# Audio player for the reconstructed speech (intentionally unlabeled).
demo_outputs = gr.Audio(label="")

# Wire the codec round-trip into a simple one-input / one-output interface.
demo = gr.Interface(
    title="NaturalSpeech3 FACodec",
    description=_DEMO_DESCRIPTION,
    fn=codec_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()