"""Gradio app serving a so-vits-svc-fork voice-conversion model from the Hugging Face Hub."""
import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc

###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################

# The Hugging Face Hub repo ID
repo_id = "Shashashasha/kvinka"

# If None, uses the latest ckpt in the repo
ckpt_name = None

# If None, uses "kmeans.pt" if it exists in the repo
cluster_model_name = None

# Set the default f0 type to use - use the one it was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"

# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5

# Limit on duration of audio at inference time. Increase it if you can.
# In this parent app, we set the limit with an env var to 30 seconds.
# If you didn't set the env var and you go OOM, try changing 9e9 to <=300ish.
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
###################################################################

# List the repo contents once: both the checkpoint lookup and the cluster
# model lookup below need them, and each listing is a network round trip.
repo_files = list_repo_files(repo_id)

# Figure out the latest generator by taking the highest-numbered one.
# Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
if ckpt_name is None:
    checkpoint_steps = [
        int(Path(name).stem.split("_")[1])
        for name in repo_files
        if name.startswith("G_") and name.endswith(".pth")
    ]
    if not checkpoint_steps:
        # Fail with a readable message instead of an IndexError on an empty list.
        raise RuntimeError(f"No generator checkpoint (G_<step>.pth) found in {repo_id}")
    ckpt_name = f"G_{max(checkpoint_steps)}.pth"

cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in repo_files:
    print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
    cluster_model_path = None
# Without a cluster model there is nothing to blend in, so force the ratio to 0.
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
demucs_model = get_model(DEFAULT_MODEL)


def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
    """Split an audio file into a vocal stem and an instrumental mix with Demucs.

    Args:
        model: Demucs model handed to ``demucs.apply.apply_model``.
        filename: Path of the audio file, loaded with ``librosa.load``.
        sr: Sample rate the file is resampled to on load.
        device: Forwarded to ``apply_model``.
        shifts: Forwarded to ``apply_model``.
        split: Forwarded to ``apply_model``.
        overlap: Forwarded to ``apply_model``.
        jobs: Forwarded to ``apply_model`` as ``num_workers``.

    Returns:
        A ``(vocal_wav, instrumental_wav)`` tuple of numpy arrays. ``vocal_wav``
        is the last separated stem, peak-limited and down-mixed to mono;
        ``instrumental_wav`` is the transposed sum of all the other stems.
    """
    wav, _ = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    if wav.dim() == 1:
        # librosa yields a 1-D array for single-channel input, but the
        # statistics and the `wav[None]` batch below need a (channels, samples)
        # tensor, so copy the single channel across the model's channels.
        # NOTE(review): falls back to 2 channels when the model does not
        # expose `audio_channels` - confirm for non-default models.
        wav = wav.unsqueeze(0).repeat(getattr(model, "audio_channels", 2), 1)

    # Normalize with the statistics of the channel-averaged signal, then undo
    # the normalization on the separated stems.
    ref = wav.mean(0)
    ref_mean, ref_std = ref.mean(), ref.std()
    wav = (wav - ref_mean) / ref_std
    sources = apply_model(
        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
    )[0]
    sources = sources * ref_std + ref_mean

    # We take just the vocals stem. I know the vocals for this model are at index -1
    # If using different model, check model.sources.index('vocals')
    vocal_wav = sources[-1]

    # I did this because it's the same normalization the so-vits model required
    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
    vocal_wav = vocal_wav.numpy()
    vocal_wav = librosa.to_mono(vocal_wav)
    vocal_wav = vocal_wav.T
    instrumental_wav = sources[:-1].sum(0).numpy().T
    return vocal_wav, instrumental_wav


def download_youtube_clip(
    video_identifier,
    start_time,
    end_time,
    output_filename,
    num_attempts=5,
    url_base="https://www.youtube.com/watch?v=",
    quiet=False,
    force=False,
):
    """Download a section of a YouTube video's audio as WAV using ``yt-dlp``.

    Args:
        video_identifier: Video ID appended to ``url_base``.
        start_time: Start of the section, passed to ``--download-sections``.
        end_time: End of the section, passed to ``--download-sections``.
        output_filename: Output path given to ``yt-dlp -o``.
        num_attempts: Maximum number of download attempts (at least one is made).
        url_base: URL prefix the video ID is appended to.
        quiet: If true, pass ``--quiet --no-warnings`` to ``yt-dlp``.
        force: If true, delete an existing output file and download again.

    Returns:
        ``Path(output_filename)`` if the file exists afterwards, otherwise
        ``None`` (all attempts failed, ``yt-dlp`` is not installed, or the
        tool wrote its output somewhere else).
    """
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        output_path.unlink()

    # Build an argv list and run it without a shell so that caller-supplied
    # values (video id, times, file name) can never be interpreted as shell
    # syntax.
    command = ["yt-dlp"]
    if quiet:
        command += ["--quiet", "--no-warnings"]
    command += [
        "-x",
        "--audio-format",
        "wav",
        "-f",
        "bestaudio",
        "-o",
        str(output_filename),
        "--download-sections",
        f"*{start_time}-{end_time}",
        f"{url_base}{video_identifier}",
    ]

    # Bounded retry: always try once, never loop forever on a bad num_attempts.
    for _ in range(max(num_attempts, 1)):
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            continue
        except FileNotFoundError:
            # yt-dlp is not installed; retrying cannot help.
            return None
        else:
            break
    else:
        return None

    return output_path if output_path.exists() else None


def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Convert the audio at path ``audio`` to the voice of ``speaker``.

    The file is loaded at ``model.target_sample`` Hz, truncated to
    ``duration_limit`` seconds, and passed to ``model.infer_silence`` together
    with the remaining keyword arguments.

    Args:
        speaker: Target speaker, forwarded to ``model.infer_silence``.
        audio: Path of the source audio file.
        transpose: Forwarded to ``model.infer_silence``.
        auto_predict_f0: Forwarded to ``model.infer_silence``.
        cluster_infer_ratio: Forwarded to ``model.infer_silence``.
        noise_scale: Forwarded to ``model.infer_silence``.
        f0_method: Forwarded to ``model.infer_silence``.
        db_thresh: Forwarded to ``model.infer_silence``.
        pad_seconds: Forwarded to ``model.infer_silence``.
        chunk_seconds: Forwarded to ``model.infer_silence``.
        absolute_thresh: Forwarded to ``model.infer_silence``.

    Returns:
        A ``(sample_rate, audio)`` tuple with the converted audio.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # The Audio component passes None when nothing was recorded/uploaded;
        # report that in the UI instead of crashing inside librosa.
        raise gr.Error("Please record or upload some audio first.")

    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = model.infer_silence(
        audio.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, audio


# NOTE(review): SPACE_ID is not referenced by the description/article text
# below, and the bare "Duplicate Space" / "Github Repo" lines look like the
# remains of link markup that was lost. Confirm against the deployed Space
# before editing these strings.
SPACE_ID = "nateraw/voice-cloning"
description = f"""
# Attention - This Space may be slow in the shared UI if there is a long queue. To speed it up, you can duplicate and use it with a paid private T4 GPU.
Duplicate Space
#### This app uses models trained with [so-vits-svc-fork](https://github.com/voicepaw/so-vits-svc-fork) to clone a voice. Model currently being used is https://hf.co/{repo_id}. To change the model being served, duplicate the space and update the `repo_id`/other settings in `app.py`. #### Train Your Own: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nateraw/voice-cloning/blob/main/training_so_vits_svc_fork.ipynb)
""".strip()

article = """

Github Repo

""".strip()


def _build_interface(source):
    """Build the voice-cloning ``gr.Interface`` for one audio input source.

    Args:
        source: Value for the ``source`` argument of ``gr.Audio``
            (``"microphone"`` or ``"upload"`` in this file).

    Returns:
        A ``gr.Interface`` wired to ``predict``.
    """
    return gr.Interface(
        predict,
        inputs=[
            gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
            gr.Audio(type="filepath", source=source, label="Source Audio"),
            gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
            gr.Checkbox(False, label="Auto Predict F0"),
            gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
            gr.Dropdown(
                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                value=default_f0_method,
                label="f0 method",
            ),
        ],
        outputs="audio",
        title="Voice Cloning",
        description=description,
        article=article,
    )


# The two tabs are identical except for where the source audio comes from.
interface_mic = _build_interface("microphone")
interface_file = _build_interface("upload")
interface = gr.TabbedInterface(
    [interface_mic, interface_file],
    ["Clone From Mic", "Clone From File"],
)


if __name__ == "__main__":
    interface.launch()