In [None]:
%%bash
# Install VALL-E and the libraries it requires.
# The description lives on its own line on purpose: anything that follows
# `%%bash` on the magic line is passed to bash as command-line arguments.
#
# NOTE: this cell runs in a bash subprocess, so the `export PYTHONPATH` below
# does not change the environment of the already-running notebook kernel.
# Restart Jupyter from a shell that has picked up the updated rc file (or add
# the icefall directory to `sys.path`) before running the import cell.

# PyTorch
pip install torch==1.13.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
pip install torchmetrics==0.11.1
# fbank
pip install librosa==0.8.1

# phonemizer pypinyin
# -y: there is no terminal attached to answer the confirmation prompt.
apt-get install -y espeak-ng
## OSX: brew install espeak
pip install phonemizer==3.2.1 pypinyin==0.48.0

# lhotse update to newest version
# https://github.com/lhotse-speech/lhotse/pull/956
# https://github.com/lhotse-speech/lhotse/pull/960
# -y: `pip uninstall` otherwise waits for an interactive confirmation.
pip uninstall -y lhotse
pip install lhotse

# k2
# find the right version in https://huggingface.co/csukuangfj/k2
pip install https://huggingface.co/csukuangfj/k2/resolve/main/cuda/k2-1.23.4.dev20230224+cuda11.6.torch1.13.1-cp310-cp310-linux_x86_64.whl

# icefall
git clone https://github.com/k2-fsa/icefall
cd icefall
pip install -r requirements.txt
export PYTHONPATH=`pwd`/../icefall:$PYTHONPATH
echo "export PYTHONPATH=`pwd`/../icefall:\$PYTHONPATH" >> ~/.zshrc
echo "export PYTHONPATH=`pwd`/../icefall:\$PYTHONPATH" >> ~/.bashrc
cd -
source ~/.zshrc

# valle
git clone https://github.com/lifeiteng/valle.git
cd valle
pip install -e .

# vocos: imported by the inference cell (`from vocos import Vocos`) but not
# installed by any step above.
# TODO: pin to the version the checkpoint was validated with.
pip install vocos

In [1]:
import argparse
import logging
import os
import pathlib
import time
import tempfile
import platform
import webbrowser
import sys
import torch, torchaudio
import random

import numpy as np

from valle.data import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text,
)
from icefall.utils import AttributeDict
from valle.data.collation import get_text_token_collater
from valle.models import get_model

from vocos import Vocos
from encodec.utils import convert_audio
import multiprocessing

# ---------------------------------------------------------------------------
# Runtime / model setup: thread configuration, tokenizers, VALL-E checkpoint,
# EnCodec audio tokenizer and Vocos decoder. Everything defined here is read
# as a global by `infer_from_prompt`.
# ---------------------------------------------------------------------------

# Artifacts expected next to the notebook.
CHECKPOINT_PATH = "./vall-e_ko_v0.pt"
TEXT_TOKENS_PATH = './unique_text_tokens.k2symbols'

thread_count = multiprocessing.cpu_count()

print("Use", thread_count, "cpu cores for computing")

torch.set_num_threads(thread_count)
try:
    torch.set_num_interop_threads(thread_count)
except RuntimeError as err:
    # PyTorch only accepts this call once per process and only before any
    # inter-op parallel work has started, so re-running this cell in a live
    # kernel would otherwise abort here.
    print("Skipping torch.set_num_interop_threads:", err)
# Private torch._C switches that turn off the JIT profiling executor and
# graph-executor optimization.
torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_profiling_mode(False)
torch._C._set_graph_executor_optimize(False)

text_tokenizer = TextTokenizer(language='ko')

# Prefer the first CUDA device when one is available, otherwise run on CPU.
device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")

# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints obtained from a trusted source.
checkpoint = torch.load(CHECKPOINT_PATH, map_location='cpu')
# The whole checkpoint dict is handed to get_model as its configuration --
# presumably it carries the model hyper-parameters next to the "model"
# weights; confirm against valle.models.get_model.
model = get_model(AttributeDict(checkpoint))
missing_keys, unexpected_keys = model.load_state_dict(
    checkpoint["model"], strict=True
)
assert not missing_keys
model.eval()
# Single device transfer (the original cell repeated this call further down).
model.to(device)
text_collater = get_text_token_collater(TEXT_TOKENS_PATH)

# Encodec model
audio_tokenizer = AudioTokenizer(device)

# Vocos decoder
vocos = Vocos.from_pretrained('charactr/vocos-encodec-24khz').to(device)

@torch.no_grad()
def infer_from_prompt(text_prompt, audio_prompt, text, top_k=-100, temperature=1):
    """Synthesize `text` using the voice from an audio prompt.

    Relies on the notebook globals `text_tokenizer`, `text_collater`,
    `audio_tokenizer`, `model`, `vocos` and `device`.

    Args:
        text_prompt: Transcript of what is spoken in `audio_prompt`.
        audio_prompt: Path to the prompt audio file (passed to
            `torchaudio.load`).
        text: Text to synthesize.
        top_k: Forwarded to `model.inference` (default keeps the previously
            hard-coded -100).
        temperature: Forwarded to `model.inference` (default keeps the
            previously hard-coded 1).

    Returns:
        A `(message, (sample_rate, samples))` tuple where `message` is a
        status string, `sample_rate` is 24000 and `samples` is a NumPy array
        holding the decoded audio.

    Raises:
        ValueError: If `audio_prompt` is empty or None.
    """
    # Fail early with a readable message instead of an opaque loader error
    # when no file was supplied (e.g. nothing uploaded in the Gradio form).
    if not audio_prompt:
        raise ValueError("audio_prompt must be the path to an audio file")

    # Tokens for the prompt transcript followed by the text to synthesize.
    full_text = f"{text_prompt} {text}".strip()
    text_tokens, text_tokens_lens = text_collater(
        [tokenize_text(text_tokenizer, text=full_text)]
    )
    # Token length of the prompt transcript alone; handed to the model as
    # `enroll_x_lens` -- presumably so it can separate prompt tokens from the
    # target tokens (confirm against the valle inference code).
    _, enroll_x_lens = text_collater(
        [tokenize_text(text_tokenizer, text=f"{text_prompt}".strip())]
    )
    print('text_loaded')

    # Load the audio prompt, convert it to the tokenizer's sample rate and
    # channel count, and encode it into audio tokens.
    wav_pr, sr = torchaudio.load(audio_prompt)
    wav_pr = convert_audio(
        wav_pr, sr, audio_tokenizer.sample_rate, audio_tokenizer.channels
    )
    encoded_prompt = audio_tokenizer.encode(wav_pr.unsqueeze(0))
    audio_prompts = encoded_prompt[0][0].transpose(2, 1).to(device)
    print('Audio encoded')

    encoded_frames = model.inference(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        audio_prompts,
        enroll_x_lens=enroll_x_lens,
        top_k=top_k,
        temperature=temperature,
    )

    # Decode the generated codes to a waveform with Vocos.
    vocos_features = vocos.codes_to_features(encoded_frames.permute(2, 0, 1))
    # bandwidth_id 2: presumably the 6 kbps entry of Vocos' EnCodec bandwidth
    # table -- confirm against the Vocos documentation.
    samples = vocos.decode(
        vocos_features, bandwidth_id=torch.tensor([2], device=device)
    )
    message = f"synthesized text: {text}"
    # 24000 presumably matches the 'vocos-encodec-24khz' decoder output rate.
    return message, (24000, samples.squeeze(0).cpu().numpy())


  warn(


[2023-09-21 14:36:33,978] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Use 8 cpu cores for computing


# Example

In [2]:
# Fill in the three placeholders below before running this cell; they are
# empty strings as shipped. The results (`message`, `sr`, `data`) are used by
# the next cell.
text_prompt = '' # transcript of what is spoken in the audio prompt
audio_prompt = '' # path to the audio file
text = '' # text to synthesize
message, (sr, data) = infer_from_prompt(text_prompt, audio_prompt, text)

In [None]:
# Print the status message, then render an inline player for the audio
# produced by the example cell (`message`, `data`, `sr`).
from IPython.display import Audio

print(message)
Audio(data, rate=sr)

# Simple Gradio App

In [3]:
!pip install gradio

Defaulting to user installation because normal site-packages is not writeable
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
import gradio as gr

# Minimal web UI around `infer_from_prompt`: prompt transcript, prompt audio
# and target text go in; a status message and the generated audio come out.
# Widget variables carry a `_box` suffix so they do not overwrite the
# `text_prompt` / `audio_prompt` strings defined in the example cell.
app = gr.Blocks(title="VALL-E Korean")
with app:
    with gr.Tab("VALL-E Korean Demo"):
        with gr.Row():
            with gr.Column():
                text_prompt_box = gr.TextArea(label="Input Text",
                                      placeholder="Type text in the audio file (Korean)",)
                # type="filepath": the callback receives a path it can hand to
                # torchaudio.load.
                audio_prompt_box = gr.Audio(label="Input Audio", source='upload', interactive=True, type="filepath")
                text_input = gr.TextArea(label="Output Text",
                                      placeholder="Type text you want to generate (Korean)",)
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio")
                btn = gr.Button("Generate!")
                btn.click(infer_from_prompt,
                          inputs=[text_prompt_box, audio_prompt_box, text_input],
                          outputs=[text_output, audio_output])

# inbrowser=True lets Gradio open the browser tab itself once the server is
# running, instead of opening a hard-coded http://127.0.0.1:7860 before launch
# (the port can differ when 7860 is already taken).
# NOTE: share=True publishes a public URL; anyone holding the link can run
# inference on this machine. Set share=False to keep the demo local.
app.launch(share=True, inbrowser=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://b3512daf295a0b63b1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




text_loaded
Audio encoded
VALL-E EOS [356 -> 899]


