{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "cb5d0890-3f2d-4020-8270-f3a9bb9f63c6", "metadata": {}, "outputs": [], "source": [ "%%bash # install the vall-e and required libraries\n", "# PyTorch\n", "pip install torch==1.13.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116\n", "pip install torchmetrics==0.11.1\n", "# fbank\n", "pip install librosa==0.8.1\n", "\n", "# phonemizer pypinyin\n", "apt-get install espeak-ng\n", "## OSX: brew install espeak\n", "pip install phonemizer==3.2.1 pypinyin==0.48.0\n", "\n", "# lhotse update to newest version\n", "# https://github.com/lhotse-speech/lhotse/pull/956\n", "# https://github.com/lhotse-speech/lhotse/pull/960\n", "pip uninstall lhotse\n", "pip install lhotse\n", "\n", "# k2\n", "# find the right version in https://huggingface.co/csukuangfj/k2\n", "pip install https://huggingface.co/csukuangfj/k2/resolve/main/cuda/k2-1.23.4.dev20230224+cuda11.6.torch1.13.1-cp310-cp310-linux_x86_64.whl\n", "\n", "# icefall\n", "git clone https://github.com/k2-fsa/icefall\n", "cd icefall\n", "pip install -r requirements.txt\n", "export PYTHONPATH=`pwd`/../icefall:$PYTHONPATH\n", "echo \"export PYTHONPATH=`pwd`/../icefall:\\$PYTHONPATH\" >> ~/.zshrc\n", "echo \"export PYTHONPATH=`pwd`/../icefall:\\$PYTHONPATH\" >> ~/.bashrc\n", "cd -\n", "source ~/.zshrc\n", "\n", "# valle\n", "git clone https://github.com/lifeiteng/valle.git\n", "cd valle\n", "pip install -e ." ] }, { "cell_type": "code", "execution_count": 1, "id": "1b8a4af2-5851-4c41-96bb-bda4b259f857", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/dongsun/.local/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/home/dongsun/.local/lib/python3.10/site-packages/torchvision/image.so: undefined symbol: _ZN3c104cuda20CUDACachingAllocator9allocatorE'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. 
Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", " warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-21 14:36:33,978] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "Using 8 CPU cores for computation\n" ] } ], "source": [ "import argparse\n", "import logging\n", "import os\n", "import pathlib\n", "import time\n", "import tempfile\n", "import platform\n", "import webbrowser\n", "import sys\n", "import torch, torchaudio\n", "import random\n", "\n", "import numpy as np\n", "\n", "from valle.data import (\n", "    AudioTokenizer,\n", "    TextTokenizer,\n", "    tokenize_audio,\n", "    tokenize_text,\n", ")\n", "from icefall.utils import AttributeDict\n", "from valle.data.collation import get_text_token_collater\n", "from valle.models import get_model\n", "\n", "from vocos import Vocos\n", "from encodec.utils import convert_audio\n", "import multiprocessing\n", "\n", "thread_count = multiprocessing.cpu_count()\n", "\n", "print(f\"Using {thread_count} CPU cores for computation\")\n", "\n", "torch.set_num_threads(thread_count)\n", "torch.set_num_interop_threads(thread_count)\n", "torch._C._jit_set_profiling_executor(False)\n", "torch._C._jit_set_profiling_mode(False)\n", "torch._C._set_graph_executor_optimize(False)\n", "\n", "text_tokenizer = TextTokenizer(language='ko')\n", "\n", "device = torch.device(\"cpu\")\n", "if torch.cuda.is_available():\n", "    device = torch.device(\"cuda\", 0)\n", "\n", "checkpoint = torch.load(\"./vall-e_ko_v0.pt\", map_location='cpu')\n", "model = get_model(AttributeDict(checkpoint))\n", "missing_keys, unexpected_keys = model.load_state_dict(\n", "    checkpoint[\"model\"], strict=True\n", ")\n", "assert not missing_keys\n", "model.eval()\n", "model.to(device)\n", "text_collater = get_text_token_collater('./unique_text_tokens.k2symbols')\n", "\n", "# EnCodec model\n", "audio_tokenizer = AudioTokenizer(device)\n", "\n", "# Vocos decoder\n", "vocos = Vocos.from_pretrained('charactr/vocos-encodec-24khz').to(device)\n", "\n", "\n", "@torch.no_grad()\n", "def infer_from_prompt(text_prompt, audio_prompt, text):\n", "    # tokenize the prompt transcript and the target text\n", "    text_tokens, text_tokens_lens = text_collater(\n", "        [\n", "            tokenize_text(\n", "                text_tokenizer, text=f\"{text_prompt} {text}\".strip()\n", "            )\n", "        ]\n", "    )\n", "    _, enroll_x_lens = text_collater(\n", "        [\n", "            tokenize_text(\n", "                text_tokenizer, text=f\"{text_prompt}\".strip()\n", "            )\n", "        ]\n", "    )\n", "    print('Text tokenized')\n", "\n", "    # load the audio prompt and encode it with EnCodec\n", "    wav_pr, sr = torchaudio.load(audio_prompt)\n", "    wav_pr = convert_audio(wav_pr, sr, audio_tokenizer.sample_rate, audio_tokenizer.channels)\n", "    audio_prompts = audio_tokenizer.encode(wav_pr.unsqueeze(0))[0][0].transpose(2, 1).to(device)\n", "    print('Audio encoded')\n", "\n", "    # generate codec codes with VALL-E, then decode them to a waveform with Vocos\n", "    encoded_frames = model.inference(\n", "        text_tokens.to(device), text_tokens_lens.to(device),\n", "        audio_prompts, enroll_x_lens=enroll_x_lens,\n", "        top_k=-100, temperature=1)\n", "    vocos_features = vocos.codes_to_features(encoded_frames.permute(2, 0, 1))\n", "    samples = vocos.decode(vocos_features, bandwidth_id=torch.tensor([2], device=device))\n", "    message = f\"synthesized text: {text}\"\n", "    return message, (24000, samples.squeeze(0).cpu().numpy())\n" ] }, { "cell_type": "markdown", "id": "fa6e2e1d-7522-43f0-985c-e731047acd9c", "metadata": {}, "source": [ "# Example" ] },
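{ "cell_type": "markdown", "id": "3f6a2b1c-8d4e-4f0a-9c2b-7e5d1a0b4c3d", "metadata": {}, "source": [ "The next cell is a minimal usage sketch. `text_prompt` is the exact transcript of the enrolment audio, `audio_prompt` is a path to a short (roughly 3-10 s) speech clip, and `text` is the Korean sentence to synthesize. The file name and sentences below are placeholders, not files shipped with this notebook; replace them with your own before running." ] }, { "cell_type": "code", "execution_count": null, "id": "9e8d7c6b-5a4f-4e3d-9c1b-0a9f8e7d6c5b", "metadata": {}, "outputs": [], "source": [ "# Minimal usage sketch: the path and sentences below are placeholders, replace them with real data.\n", "text_prompt = \"안녕하세요, 만나서 반갑습니다.\"  # transcript of the prompt audio (placeholder)\n", "audio_prompt = \"./prompt.wav\"  # path to the prompt audio clip (placeholder)\n", "text = \"오늘 날씨가 정말 좋네요.\"  # sentence to synthesize (placeholder)\n", "\n", "message, (sr, data) = infer_from_prompt(text_prompt, audio_prompt, text)\n", "print(message)\n", "\n", "# Save or play the result, e.g.:\n", "# torchaudio.save(\"synthesized.wav\", torch.from_numpy(data).unsqueeze(0), sr)\n", "# from IPython.display import Audio; Audio(data, rate=sr)" ] }, { "cell_type": "code", "execution_count": 2, "id": "41e40fe5-595e-4f9a-8dd7-dfda52944529", "metadata": {},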
"outputs": [ { "data": { "text/html": [ "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <module> │\n", "│ │\n", "│ 1 text_prompt = '' # text of the audio │\n", "│ 2 audio_prompt = '' # path to the audio file │\n", "│ 3 text = '' # │\n", "│ ❱ 4 message, (sr, data) = infer_from_prompt(text_prompt, audio_prompt, text) │\n", "│ 5 │\n", "│ │\n", "│ /home/dongsun/.local/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27 in │\n", "│ decorate_context │\n", "│ │\n", "│ 24 │ │ @functools.wraps(func) │\n", "│ 25 │ │ def decorate_context(*args, **kwargs): │\n", "│ 26 │ │ │ with self.clone(): │\n", "│ ❱ 27 │ │ │ │ return func(*args, **kwargs) │\n", "│ 28 │ │ return cast(F, decorate_context) │\n", "│ 29 │ │\n", "│ 30 │ def _wrap_generator(self, func): │\n", "│ │\n", "│ in infer_from_prompt │\n", "│ │\n", "│ 64 │ ## text to token │\n", "│ 65 │ text_tokens, text_tokens_lens = text_collater( │\n", "│ 66 │ │ [ │\n", "│ ❱ 67 │ │ │ tokenize_text( │\n", "│ 68 │ │ │ │ text_tokenizer, text=f\"{text_prompt} {text}\".strip() │\n", "│ 69 │ │ │ ) │\n", "│ 70 │ │ ] │\n", "│ │\n", "│ /home/dongsun/vall-e/valle/data/tokenizer.py:178 in tokenize_text │\n", "│ │\n", "│ 175 │\n", "│ 176 def tokenize_text(tokenizer: TextTokenizer, text: str) -> List[str]: │\n", "│ 177 │ phonemes = tokenizer([text.strip()]) │\n", "│ ❱ 178 │ return phonemes[0] # k2symbols │\n", "│ 179 │\n", "│ 180 │\n", "│ 181 def remove_encodec_weight_norm(model): │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "IndexError: list index out of range\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m