{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MARS5 TTS voice-cloning demo\n",
    "\n",
    "Load the `mars5_english` model from the `Camb-ai/mars5-tts` torch.hub repo, clone a voice from a short reference clip, and synthesize new speech.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the packages are installed into this kernel's environment.\n",
    "%pip install --upgrade vocos encodec librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pprint\n",
    "\n",
    "import IPython.display as ipd\n",
    "import librosa\n",
    "import torch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the model and its inference-config class from the torch.hub repo.\n",
    "# trust_repo=True accepts the repo's code without an interactive confirmation prompt.\n",
    "mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reference audio\n",
    "\n",
    "Now that the model is loaded, pick a reference audio clip to clone from. If you want to use deep clone, also specify its transcript."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the example reference audio (pure Python, so no `wget` binary is required).\n",
    "REF_AUDIO_URL = 'https://github.com/Camb-ai/mars5-tts/raw/master/docs/assets/example_ref.wav'\n",
    "REF_AUDIO_PATH = 'example.wav'\n",
    "torch.hub.download_url_to_file(REF_AUDIO_URL, REF_AUDIO_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the clip as mono audio, resampled to the sample rate the model exposes as `mars5.sr`.\n",
    "ref_wav_np, sr = librosa.load(REF_AUDIO_PATH, sr=mars5.sr, mono=True)\n",
    "wav = torch.from_numpy(ref_wav_np)\n",
    "ref_transcript = \"We actually haven't managed to meet demand.\"\n",
    "\n",
    "print(\"Reference audio:\")\n",
    "ipd.display(ipd.Audio(ref_wav_np, rate=mars5.sr))\n",
    "print(f\"Reference transcript: {ref_transcript}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Synthesize speech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "deep_clone = True  # set to False if you don't know the prompt transcript or want fast inference.\n",
    "# Below you can tune other inference settings, like top_k, temperature, top_p, etc...\n",
    "cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100,\n",
    "                   top_k=100, temperature=0.7, freq_penalty=3)\n",
    "\n",
    "ar_codes, wav_out = mars5.tts(\"The quick brown rat.\", wav,\n",
    "                              ref_transcript,\n",
    "                              cfg=cfg)\n",
    "\n",
    "print('Synthesized output audio:')\n",
    "# .cpu() is a no-op for CPU tensors and keeps .numpy() working if the output is on another device.\n",
    "ipd.Audio(wav_out.cpu().numpy(), rate=mars5.sr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect the inference config\n",
    "\n",
    "Print a default inference config to see the settings that are available to tune:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint.pprint(config_class())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Debugging: listen to the coarse codes\n",
    "\n",
    "You can also listen to the vocoded raw coarse codes, for debugging purposes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ar_wav = mars5.vocode(ar_codes.cpu()[:, None])\n",
    "# .cpu() mirrors the handling of ar_codes above; it is a no-op when the tensor is already on the CPU.\n",
    "ipd.Audio(ar_wav.cpu().numpy(), rate=mars5.sr)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}