diff --git a/dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py b/dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..135006046929ad3d8c385cec975f5854e881e01d --- /dev/null +++ b/dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py @@ -0,0 +1 @@ +from .api import DreamVoice \ No newline at end of file diff --git a/dreamvoice/.ipynb_checkpoints/api-checkpoint.py b/dreamvoice/.ipynb_checkpoints/api-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c373aa0089b79a2400b0cae867d31f7946e5b7 --- /dev/null +++ b/dreamvoice/.ipynb_checkpoints/api-checkpoint.py @@ -0,0 +1,295 @@ +import os +import requests +import yaml +import torch +import librosa +import numpy as np +import soundfile as sf +from pathlib import Path +from transformers import T5Tokenizer, T5EncoderModel +from tqdm import tqdm +from .src.vc_wrapper import ReDiffVC, DreamVC +from .src.plugin_wrapper import DreamVG +from .src.modules.speaker_encoder.encoder import inference as spk_encoder +from .src.modules.BigVGAN.inference import load_model as load_vocoder +from .src.feats.contentvec_hf import get_content_model, get_content + + +class DreamVoice: + def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16): + # Initial setup + script_dir = Path(__file__).resolve().parent + config_path = script_dir / config + + # Load configuration file + with open(config_path, 'r') as fp: + self.config = yaml.safe_load(fp) + + self.script_dir = script_dir + + # Ensure all checkpoints are downloaded + self._ensure_checkpoints_exist() + + # Initialize attributes + self.device = device + self.sr = self.config['sample_rate'] + + # Load vocoder + vocoder_path = script_dir / self.config['vocoder_path'] + self.hifigan, _ = load_vocoder(vocoder_path, device) + self.hifigan.eval() + + # Load content model + self.content_model = get_content_model().to(device) + + # Load tokenizer and text encoder + lm_path = self.config['lm_path'] + self.tokenizer = T5Tokenizer.from_pretrained(lm_path) + self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + + # Set mode + self.mode = mode + if mode == 'plugin': + self._init_plugin_mode() + elif mode == 'end2end': + self._init_end2end_mode() + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + # chunk inputs to 10s clips + self.chunk_size = chunk_size * 50 + + def _ensure_checkpoints_exist(self): + checkpoints = [ + ('vocoder_path', self.config.get('vocoder_url')), + ('vocoder_config_path', self.config.get('vocoder_config_url')), + ('speaker_path', self.config.get('speaker_url')), + ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')), + ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')), + ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')) + ] + + for path_key, url in checkpoints: + local_path = self._get_local_path(path_key) + if not local_path.exists() and url: + print(f"Downloading {path_key} from {url}") + self._download_file(url, local_path) + + def _get_local_path(self, path_key): + keys = path_key.split('.') + local_path = self.config + for key in keys: + local_path = local_path.get(key, {}) + return self.script_dir / local_path + + def _download_file(self, url, local_path): + try: + # Attempt to send a GET request to the URL + response = requests.get(url, stream=True) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except 
requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered: {e}") + + # Development mode: prompt user for Hugging Face API key + user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.") + self.hf_key = user_input if user_input else None + + # Set headers if an API key is provided + headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {} + + try: + # Attempt to send a GET request with headers in development mode + response = requests.get(url, stream=True, headers=headers) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered in dev mode: {e}") + response = None # Handle response accordingly in your code + + local_path.parent.mkdir(parents=True, exist_ok=True) + + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + t = tqdm(total=total_size, unit='iB', unit_scale=True) + + with open(local_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=block_size): + t.update(len(chunk)) + f.write(chunk) + t.close() + + def _init_plugin_mode(self): + # Initialize ReDiffVC + self.dreamvc = ReDiffVC( + config_path=self.script_dir / self.config['rediffvc']['config_path'], + ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'], + device=self.device + ) + + # Initialize DreamVG + self.dreamvg = DreamVG( + config_path=self.script_dir / self.config['dreamvg']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'], + device=self.device + ) + + # Load speaker encoder + spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device) + self.spk_encoder = spk_encoder + self.spk_embed_cache = None + + def _init_end2end_mode(self): + # Initialize DreamVC + self.dreamvc = DreamVC( + config_path=self.script_dir / self.config['dreamvc']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'], + device=self.device + ) + + def _load_content(self, audio_path): + content_audio, _ = librosa.load(audio_path, sr=16000) + # Calculate the required length to make it a multiple of 16*160 + target_length = ((len(content_audio) + 16*160 - 1) // (16*160)) * (16*160) + # Pad with zeros if necessary + if len(content_audio) < target_length: + content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant') + content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device) + content_clip = get_content(self.content_model, content_audio) + return content_clip + + def load_spk_embed(self, emb_path): + self.spk_embed_cache = torch.load(emb_path, map_location=self.device) + + def save_spk_embed(self, emb_path): + assert self.spk_embed_cache is not None + torch.save(self.spk_embed_cache.cpu(), emb_path) + + def save_audio(self, output_path, audio, sr): + sf.write(output_path, audio, samplerate=sr) + + @torch.no_grad() + def genvc(self, content_audio, prompt, + prompt_guidance_scale=3, prompt_guidance_rescale=0.0, + prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + content_clip = self._load_content(content_audio) + + text_batch = self.tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, text_mask = text_batch.input_ids.to(self.device), \ + 
text_batch.attention_mask.to(self.device) + text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0] + + if self.mode == 'plugin': + spk_embed = self.dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, eta=prompt_eta, + random_seed=prompt_random_seed) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + self.spk_embed_cache = spk_embed + + elif self.mode == 'end2end': + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference([text, text_mask], content_clip, + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, + eta=prompt_eta, random_seed=prompt_random_seed) + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + @torch.no_grad() + def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + assert self.mode == 'plugin' + if speaker_audio is not None: + speaker_audio, _ = librosa.load(speaker_audio, sr=16000) + speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device) + spk_embed = spk_encoder.embed_utterance_batch(speaker_audio) + self.spk_embed_cache = spk_embed + elif use_spk_cache: + assert self.spk_embed_cache is not None + spk_embed = self.spk_embed_cache + else: + raise NotImplementedError + + content_clip = self._load_content(content_audio) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + +if __name__ == '__main__': + dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda') + content_audio = 'test.wav' + speaker_audio = 'speaker.wav' + prompt = 'young female voice, 
sounds young and cute' + gen_audio, sr = dreamvoice.genvc('test.wav', prompt) + dreamvoice.save_audio('debug.wav', gen_audio, sr) \ No newline at end of file diff --git a/dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml b/dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99c2360c57f57b92ca9885db6100c787bc245d02 --- /dev/null +++ b/dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml @@ -0,0 +1,26 @@ +version: 1.0 + +sample_rate: 24000 +vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt' +vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt' +vocoder_config_path: 'ckpts/bigvgan_24k/config.json' +vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json' + +speaker_path: 'ckpts/spk_encoder/pretrained.pt' +speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt' +lm_path: 'google/flan-t5-base' + +dreamvc: + config_path: 'src/configs/diffvc_cross.yaml' + ckpt_path: 'ckpts/dreamvc_cross.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt' + +rediffvc: + config_path: 'src/configs/diffvc_base.yaml' + ckpt_path: 'ckpts/dreamvc_base.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt' + +dreamvg: + config_path: 'src/configs/plugin_cross.yaml' + ckpt_path: 'ckpts/dreamvc_plugin.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt' diff --git a/dreamvoice/__init__.py b/dreamvoice/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..135006046929ad3d8c385cec975f5854e881e01d --- /dev/null +++ b/dreamvoice/__init__.py @@ -0,0 +1 @@ +from .api import DreamVoice \ No newline at end of file diff --git a/dreamvoice/__pycache__/__init__.cpython-310.pyc b/dreamvoice/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7c5475eb77d8de07fe11c9d5d25465f28e31306 Binary files /dev/null and b/dreamvoice/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/__pycache__/__init__.cpython-311.pyc b/dreamvoice/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8526b942e2021f0cd0d020249790f61cdbf79394 Binary files /dev/null and b/dreamvoice/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/__pycache__/api.cpython-310.pyc b/dreamvoice/__pycache__/api.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3be2021bd271cb57cc65b0e990d8d0a074b6b81a Binary files /dev/null and b/dreamvoice/__pycache__/api.cpython-310.pyc differ diff --git a/dreamvoice/__pycache__/api.cpython-311.pyc b/dreamvoice/__pycache__/api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3bb6b037f07a4ed4e99f40d26b2481e300591f2 Binary files /dev/null and b/dreamvoice/__pycache__/api.cpython-311.pyc differ diff --git a/dreamvoice/api.py b/dreamvoice/api.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c373aa0089b79a2400b0cae867d31f7946e5b7 --- /dev/null +++ b/dreamvoice/api.py @@ -0,0 +1,295 @@ +import os +import requests +import yaml +import torch +import librosa +import numpy as np +import soundfile as sf +from pathlib import Path +from transformers import T5Tokenizer, 
T5EncoderModel +from tqdm import tqdm +from .src.vc_wrapper import ReDiffVC, DreamVC +from .src.plugin_wrapper import DreamVG +from .src.modules.speaker_encoder.encoder import inference as spk_encoder +from .src.modules.BigVGAN.inference import load_model as load_vocoder +from .src.feats.contentvec_hf import get_content_model, get_content + + +class DreamVoice: + def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16): + # Initial setup + script_dir = Path(__file__).resolve().parent + config_path = script_dir / config + + # Load configuration file + with open(config_path, 'r') as fp: + self.config = yaml.safe_load(fp) + + self.script_dir = script_dir + + # Ensure all checkpoints are downloaded + self._ensure_checkpoints_exist() + + # Initialize attributes + self.device = device + self.sr = self.config['sample_rate'] + + # Load vocoder + vocoder_path = script_dir / self.config['vocoder_path'] + self.hifigan, _ = load_vocoder(vocoder_path, device) + self.hifigan.eval() + + # Load content model + self.content_model = get_content_model().to(device) + + # Load tokenizer and text encoder + lm_path = self.config['lm_path'] + self.tokenizer = T5Tokenizer.from_pretrained(lm_path) + self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + + # Set mode + self.mode = mode + if mode == 'plugin': + self._init_plugin_mode() + elif mode == 'end2end': + self._init_end2end_mode() + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + # chunk inputs to 10s clips + self.chunk_size = chunk_size * 50 + + def _ensure_checkpoints_exist(self): + checkpoints = [ + ('vocoder_path', self.config.get('vocoder_url')), + ('vocoder_config_path', self.config.get('vocoder_config_url')), + ('speaker_path', self.config.get('speaker_url')), + ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')), + ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')), + ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')) + ] + + for path_key, url in checkpoints: + local_path = self._get_local_path(path_key) + if not local_path.exists() and url: + print(f"Downloading {path_key} from {url}") + self._download_file(url, local_path) + + def _get_local_path(self, path_key): + keys = path_key.split('.') + local_path = self.config + for key in keys: + local_path = local_path.get(key, {}) + return self.script_dir / local_path + + def _download_file(self, url, local_path): + try: + # Attempt to send a GET request to the URL + response = requests.get(url, stream=True) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered: {e}") + + # Development mode: prompt user for Hugging Face API key + user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.") + self.hf_key = user_input if user_input else None + + # Set headers if an API key is provided + headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {} + + try: + # Attempt to send a GET request with headers in development mode + response = requests.get(url, stream=True, headers=headers) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered in dev mode: {e}") + response = None # Handle response accordingly in your code + + 
local_path.parent.mkdir(parents=True, exist_ok=True) + + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + t = tqdm(total=total_size, unit='iB', unit_scale=True) + + with open(local_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=block_size): + t.update(len(chunk)) + f.write(chunk) + t.close() + + def _init_plugin_mode(self): + # Initialize ReDiffVC + self.dreamvc = ReDiffVC( + config_path=self.script_dir / self.config['rediffvc']['config_path'], + ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'], + device=self.device + ) + + # Initialize DreamVG + self.dreamvg = DreamVG( + config_path=self.script_dir / self.config['dreamvg']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'], + device=self.device + ) + + # Load speaker encoder + spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device) + self.spk_encoder = spk_encoder + self.spk_embed_cache = None + + def _init_end2end_mode(self): + # Initialize DreamVC + self.dreamvc = DreamVC( + config_path=self.script_dir / self.config['dreamvc']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'], + device=self.device + ) + + def _load_content(self, audio_path): + content_audio, _ = librosa.load(audio_path, sr=16000) + # Calculate the required length to make it a multiple of 16*160 + target_length = ((len(content_audio) + 16*160 - 1) // (16*160)) * (16*160) + # Pad with zeros if necessary + if len(content_audio) < target_length: + content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant') + content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device) + content_clip = get_content(self.content_model, content_audio) + return content_clip + + def load_spk_embed(self, emb_path): + self.spk_embed_cache = torch.load(emb_path, map_location=self.device) + + def save_spk_embed(self, emb_path): + assert self.spk_embed_cache is not None + torch.save(self.spk_embed_cache.cpu(), emb_path) + + def save_audio(self, output_path, audio, sr): + sf.write(output_path, audio, samplerate=sr) + + @torch.no_grad() + def genvc(self, content_audio, prompt, + prompt_guidance_scale=3, prompt_guidance_rescale=0.0, + prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + content_clip = self._load_content(content_audio) + + text_batch = self.tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, text_mask = text_batch.input_ids.to(self.device), \ + text_batch.attention_mask.to(self.device) + text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0] + + if self.mode == 'plugin': + spk_embed = self.dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, eta=prompt_eta, + random_seed=prompt_random_seed) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + 
gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + self.spk_embed_cache = spk_embed + + elif self.mode == 'end2end': + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference([text, text_mask], content_clip, + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, + eta=prompt_eta, random_seed=prompt_random_seed) + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + @torch.no_grad() + def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + assert self.mode == 'plugin' + if speaker_audio is not None: + speaker_audio, _ = librosa.load(speaker_audio, sr=16000) + speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device) + spk_embed = spk_encoder.embed_utterance_batch(speaker_audio) + self.spk_embed_cache = spk_embed + elif use_spk_cache: + assert self.spk_embed_cache is not None + spk_embed = self.spk_embed_cache + else: + raise NotImplementedError + + content_clip = self._load_content(content_audio) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + +if __name__ == '__main__': + dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda') + content_audio = 'test.wav' + speaker_audio = 'speaker.wav' + prompt = 'young female voice, sounds young and cute' + gen_audio, sr = dreamvoice.genvc('test.wav', prompt) + dreamvoice.save_audio('debug.wav', gen_audio, sr) \ No newline at end of file diff --git a/dreamvoice/ckpts/bigvgan_24k/config.json b/dreamvoice/ckpts/bigvgan_24k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b675d4137fc8072bfb1c436d8f040a5d15e6a1 --- /dev/null +++ b/dreamvoice/ckpts/bigvgan_24k/config.json @@ -0,0 +1,44 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 32, + "learning_rate": 0.0001, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [10,6,4,2], + "upsample_kernel_sizes": [20,12,8,4], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "activation": "snakebeta", + "snake_logscale": true, + + "resolutions": [[1024, 
120, 600], [2048, 240, 1200], [512, 50, 240]], + "mpd_reshapes": [2, 3, 5, 7, 11], + "use_spectral_norm": false, + "discriminator_channel_mult": 1, + + "segment_size": 12000, + "num_mels": 128, + "n_fft": 1920, + "hop_size": 480, + "win_size": 1920, + + "sampling_rate": 24000, + + "fmin": 0, + "fmax": 12000, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt b/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09d182d11b6c54fdc56dc4244677f6a64aee32f --- /dev/null +++ b/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683a7baafedda8ec2fd2409deff61bd58ae66fbf10630550a17fcfed6f728977 +size 58405452 diff --git a/dreamvoice/ckpts/dreamvc_base.pt b/dreamvoice/ckpts/dreamvc_base.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef704709ef4ec22ee7d26c10da40df5a2d3c6fe3 --- /dev/null +++ b/dreamvoice/ckpts/dreamvc_base.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5abe034bf590e2ce0405c66e950dc61f041629731e959cb09e2009688cd1254c +size 300117179 diff --git a/dreamvoice/ckpts/dreamvc_cross.pt b/dreamvoice/ckpts/dreamvc_cross.pt new file mode 100644 index 0000000000000000000000000000000000000000..086c5e76e9fe898d80529c9fb066365d1e8433c4 --- /dev/null +++ b/dreamvoice/ckpts/dreamvc_cross.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b4eb1e62b1bf4e157edc2766b9b4461c0be0f7d98a970d6b087f3797c35920 +size 451974443 diff --git a/dreamvoice/ckpts/dreamvc_plugin.pt b/dreamvoice/ckpts/dreamvc_plugin.pt new file mode 100644 index 0000000000000000000000000000000000000000..7beb2c8aa484bd12d78adf11dfb23a2138c77e06 --- /dev/null +++ b/dreamvoice/ckpts/dreamvc_plugin.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2396f6b96e9057e73e20eee173d7aaded6b5eb70745a9f5282999c0ea9a4d848 +size 104892440 diff --git a/dreamvoice/ckpts/spk_encoder/pretrained.pt b/dreamvoice/ckpts/spk_encoder/pretrained.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cd2e41ea79e4aeb8414c7ed9993c42ab5b0dc28 --- /dev/null +++ b/dreamvoice/ckpts/spk_encoder/pretrained.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e +size 17090379 diff --git a/dreamvoice/dreamvc.yaml b/dreamvoice/dreamvc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99c2360c57f57b92ca9885db6100c787bc245d02 --- /dev/null +++ b/dreamvoice/dreamvc.yaml @@ -0,0 +1,26 @@ +version: 1.0 + +sample_rate: 24000 +vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt' +vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt' +vocoder_config_path: 'ckpts/bigvgan_24k/config.json' +vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json' + +speaker_path: 'ckpts/spk_encoder/pretrained.pt' +speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt' +lm_path: 'google/flan-t5-base' + +dreamvc: + config_path: 'src/configs/diffvc_cross.yaml' + ckpt_path: 'ckpts/dreamvc_cross.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt' + +rediffvc: + 
config_path: 'src/configs/diffvc_base.yaml' + ckpt_path: 'ckpts/dreamvc_base.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt' + +dreamvg: + config_path: 'src/configs/plugin_cross.yaml' + ckpt_path: 'ckpts/dreamvc_plugin.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt' diff --git a/dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3 --- /dev/null +++ b/dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import numpy as np +import soundfile as sf +import pandas as pd +# from feats.hubert_model import get_soft_model, get_hubert_soft_content +from feats.contentvec_hf import get_content_model, get_content +# from modules.speaker_encoder.encoder import inference as spk_encoder +# from pathlib import Path +from tqdm import tqdm +from multiprocessing import Process +import pyworld as pw + + +def resample_save(infolder, audio_path, model, + audio_sr=24000, content_sr=16000, min_length=1.92, + content_resolution=50, + save_path='features'): + if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False: + audio, sr = librosa.load(infolder + audio_path, sr=content_sr) + final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution) + # final_length = final_length / content_sr + + length = max(round(min_length*content_sr), round(final_length)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + + # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = get_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = content.cpu() + os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True) + torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt') + # print(audio_save.shape) + # print(content.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr)) + # print(save_path + '/' + 'audio_16k/' + audio_path) + + audio, sr = librosa.load(infolder + audio_path, sr=audio_sr) + length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + # print(audio_save.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr)) + + +def extract_f0(in_folder, audio_path, save_path): + audio, sr = librosa.load(in_folder + audio_path, sr=None) + assert sr == 16000 + if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False: + # wav = audio + # wav = np.pad(wav, int((1024-320)/2), mode='reflect') + # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr, + # fmin=librosa.note_to_hz('C2'), + # fmax=librosa.note_to_hz('C6')) + + _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000) + f0 = pw.stonemask(audio.astype(np.float64), _f0, t, 
sr)[:-1] + + f0 = np.nan_to_num(f0) + os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True) + # print(save_path + '/' + 'f0/' + audio_path + '.pt') + torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt') + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0_main(in_folder, audio_paths, save_path): + for audio_path in tqdm(audio_paths): + extract_f0(in_folder, audio_path, save_path) + + +if __name__ == '__main__': + df = pd.read_csv('../test_data/vc_meta.csv') + # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda') + model = get_content_model().to('cuda') + # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda") + for i in tqdm(range(len(df))): + row = df.iloc[i] + in_path = row['path'] + resample_save('../test_data/', in_path, model, save_path='../features/') + + in_folder = '../features/audio_16k/' + audio_files = list(df['path']) + save_path = '../features/' + cores = 6 + + subsets = chunks(audio_files, cores) + + for subset in subsets: + t = Process(target=extract_f0_main, args=(in_folder, subset, save_path)) + t.start() \ No newline at end of file diff --git a/dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5 --- /dev/null +++ b/dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py @@ -0,0 +1,76 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.p2e_cross import P2E_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class DreamVG(object): + def __init__(self, + config_path='configs/plugin_cross.yaml', + ckpt_path='../ckpts/dreamvc_plugin.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = P2E_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.spk_shape = config['model']['unet']['in_channels'] + + @torch.no_grad() + def inference(self, text, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + ): + text, text_mask = text + self.model.eval() + + gen_shape = (1, self.spk_shape) + + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, text, text_mask, train_cfg=False) + output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * 
(output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, text, text_mask, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 1/self.scale, self.shift) + # pred = torch.clip(pred, min=0.0, max=0.5) + return pred \ No newline at end of file diff --git a/dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3 --- /dev/null +++ b/dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py @@ -0,0 +1,144 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.model import DiffVC +from .model.model_cross import DiffVC_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class ReDiffVC(object): + def __init__(self, + config_path='configs/diffvc_base.yaml', + ckpt_path='../ckpts/dreamvc_base.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + spk_embed, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred 
= rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + + +class DreamVC(object): + def __init__(self, + config_path='configs/diffvc_cross.yaml', + ckpt_path='../ckpts/dreamvc_cross.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + text, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + text, text_mask = text + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + diff --git a/dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56d7a532c84d2b4f4d4b2182f7ed813efff02fb7 Binary files /dev/null and b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc differ diff --git a/dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22b2da5da072649c7c6e7e6e25ba4b7f8c898f6b Binary files /dev/null and b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc differ diff --git a/dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc 
b/dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ba54c792a3a21ca2aa5e6e12d61ae46be9d98ca Binary files /dev/null and b/dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc differ diff --git a/dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc b/dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de887c0bf0026d809491b43e186c534dfe060130 Binary files /dev/null and b/dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc differ diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23 --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml @@ -0,0 +1,47 @@ +version: 1.0 + +system: "base" + +model: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..582dd72f52a61b1a268e45c0af0d55a2d730e551 --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml @@ -0,0 +1,34 @@ +version: 1.0 + +system: "base" + +diffwrap: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [128, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' \ No newline at end of file diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66 --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml @@ -0,0 +1,45 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + 
"UpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml @@ -0,0 +1,33 @@ +version: 1.0 + +system: "cross" + +diffwrap: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [100, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + \ No newline at end of file diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml @@ -0,0 +1,39 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + + unet: + sample_size: [1, 1] + in_channels: 256 + out_channels: 256 + layers_per_block: 2 + block_out_channels: [256] + down_block_types: + [ + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 0.05 + shift: -0.035 + \ No newline at end of file diff --git a/dreamvoice/src/configs/diffvc_base.yaml b/dreamvoice/src/configs/diffvc_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23 --- /dev/null +++ b/dreamvoice/src/configs/diffvc_base.yaml @@ -0,0 +1,47 @@ +version: 1.0 + +system: "base" + +model: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/diffvc_base_pitch.yaml 
b/dreamvoice/src/configs/diffvc_base_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d8b894cd095accdcb9eab7788e8088d0430eae1 --- /dev/null +++ b/dreamvoice/src/configs/diffvc_base_pitch.yaml @@ -0,0 +1,34 @@ +version: 1.0 + +system: "base" + +diffwrap: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [128, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' \ No newline at end of file diff --git a/dreamvoice/src/configs/diffvc_cross.yaml b/dreamvoice/src/configs/diffvc_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66 --- /dev/null +++ b/dreamvoice/src/configs/diffvc_cross.yaml @@ -0,0 +1,45 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/diffvc_cross_pitch.yaml b/dreamvoice/src/configs/diffvc_cross_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff --- /dev/null +++ b/dreamvoice/src/configs/diffvc_cross_pitch.yaml @@ -0,0 +1,33 @@ +version: 1.0 + +system: "cross" + +diffwrap: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [100, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + \ No newline at end of file diff --git a/dreamvoice/src/configs/plugin_cross.yaml b/dreamvoice/src/configs/plugin_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e --- /dev/null +++ b/dreamvoice/src/configs/plugin_cross.yaml @@ -0,0 +1,39 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + + unet: + sample_size: [1, 1] + in_channels: 256 + out_channels: 256 + layers_per_block: 2 + block_out_channels: [256] + down_block_types: + [ + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + 
rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 0.05 + shift: -0.035 + \ No newline at end of file diff --git a/dreamvoice/src/debug.py b/dreamvoice/src/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/extract_features.py b/dreamvoice/src/extract_features.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3 --- /dev/null +++ b/dreamvoice/src/extract_features.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import numpy as np +import soundfile as sf +import pandas as pd +# from feats.hubert_model import get_soft_model, get_hubert_soft_content +from feats.contentvec_hf import get_content_model, get_content +# from modules.speaker_encoder.encoder import inference as spk_encoder +# from pathlib import Path +from tqdm import tqdm +from multiprocessing import Process +import pyworld as pw + + +def resample_save(infolder, audio_path, model, + audio_sr=24000, content_sr=16000, min_length=1.92, + content_resolution=50, + save_path='features'): + if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False: + audio, sr = librosa.load(infolder + audio_path, sr=content_sr) + final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution) + # final_length = final_length / content_sr + + length = max(round(min_length*content_sr), round(final_length)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + + # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = get_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = content.cpu() + os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True) + torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt') + # print(audio_save.shape) + # print(content.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr)) + # print(save_path + '/' + 'audio_16k/' + audio_path) + + audio, sr = librosa.load(infolder + audio_path, sr=audio_sr) + length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + # print(audio_save.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr)) + + +def extract_f0(in_folder, audio_path, save_path): + audio, sr = librosa.load(in_folder + audio_path, sr=None) + assert sr == 16000 + if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False: + # wav = audio + # wav = np.pad(wav, int((1024-320)/2), mode='reflect') + # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr, + # fmin=librosa.note_to_hz('C2'), + # fmax=librosa.note_to_hz('C6')) + + _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000) + f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1] + + f0 = np.nan_to_num(f0) + os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True) + # 
print(save_path + '/' + 'f0/' + audio_path + '.pt') + torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt') + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0_main(in_folder, audio_paths, save_path): + for audio_path in tqdm(audio_paths): + extract_f0(in_folder, audio_path, save_path) + + +if __name__ == '__main__': + df = pd.read_csv('../test_data/vc_meta.csv') + # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda') + model = get_content_model().to('cuda') + # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda") + for i in tqdm(range(len(df))): + row = df.iloc[i] + in_path = row['path'] + resample_save('../test_data/', in_path, model, save_path='../features/') + + in_folder = '../features/audio_16k/' + audio_files = list(df['path']) + save_path = '../features/' + cores = 6 + + subsets = chunks(audio_files, cores) + + for subset in subsets: + t = Process(target=extract_f0_main, args=(in_folder, subset, save_path)) + t.start() \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30 --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py @@ -0,0 +1,42 @@ +import torch +import librosa +from fairseq import checkpoint_utils +import torch.nn.functional as F + + +def get_model(vec_path): + print("load model(s) from {}".format(vec_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [vec_path], + suffix="", + ) + model = models[0] + model.eval() + return model + + +@torch.no_grad() +def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + feats = wav_16k_tensor + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.to(wav_16k_tensor.device), + "padding_mask": padding_mask.to(wav_16k_tensor.device), + "output_layer": layer + } + logits = hmodel.extract_features(**inputs)[0] + # feats = hmodel.final_proj(logits[0]) + return logits + + +if __name__ == '__main__': + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + model = get_model('../../ckpts/checkpoint_best_legacy_500.pt') + model = model.cuda() + content = get_content(model, torch.tensor([audio])) + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638 --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py @@ -0,0 +1,40 @@ +from transformers import HubertModel +import torch.nn as nn +import torch +import torch.nn.functional as F +import librosa + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + + # The final projection layer is only used for backward compatibility. + # Following https://github.com/auspicious3000/contentvec/issues/6 + # Remove this layer is necessary to achieve the desired outcome. 
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def get_content_model(config='lengyue233/content-vec-best'): + model = HubertModelWithFinalProj.from_pretrained(config) + model.eval() + return model + + +@torch.no_grad() +def get_content(model, wav_16k_tensor, device='cuda'): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + logits = model(wav_16k_tensor)['last_hidden_state'] + return logits + + +if __name__ == '__main__': + model = get_content_model().cuda() + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + audio = torch.tensor([audio]) + content = get_content(model, audio, 'cuda') + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py @@ -0,0 +1,24 @@ +import torch, torchaudio +from .hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1) + # print(wav_16k_tensor.shape) + units = hmodel.units(wav_16k_tensor) + # print(units.shape) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..875c33d953e859609ab401e5d9b153581a8c5215 --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py @@ -0,0 +1,22 @@ +import torch, torchaudio +from hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return model + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device) + units = hmodel.units(wav_16k_tensor) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc b/dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4f38a3fc3fe39f026788685fd465d5899fcb704 Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc differ diff --git 
a/dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc b/dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d775fdddd3d767bc827f7d1261d161fa8b65d553 Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc differ diff --git a/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a3724af7934ca72887aa33c71dad7e4b8e13f6d Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc differ diff --git a/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9c28ffe5034e6ece3ce9635ed61a79f3ac38abb Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc differ diff --git a/dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc b/dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c141a6db71db045064c08c6cf0d3636d7fd46ba5 Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc differ diff --git a/dreamvoice/src/feats/contentvec.py b/dreamvoice/src/feats/contentvec.py new file mode 100644 index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30 --- /dev/null +++ b/dreamvoice/src/feats/contentvec.py @@ -0,0 +1,42 @@ +import torch +import librosa +from fairseq import checkpoint_utils +import torch.nn.functional as F + + +def get_model(vec_path): + print("load model(s) from {}".format(vec_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [vec_path], + suffix="", + ) + model = models[0] + model.eval() + return model + + +@torch.no_grad() +def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + feats = wav_16k_tensor + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.to(wav_16k_tensor.device), + "padding_mask": padding_mask.to(wav_16k_tensor.device), + "output_layer": layer + } + logits = hmodel.extract_features(**inputs)[0] + # feats = hmodel.final_proj(logits[0]) + return logits + + +if __name__ == '__main__': + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + model = get_model('../../ckpts/checkpoint_best_legacy_500.pt') + model = model.cuda() + content = get_content(model, torch.tensor([audio])) + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/contentvec_hf.py b/dreamvoice/src/feats/contentvec_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638 --- /dev/null +++ b/dreamvoice/src/feats/contentvec_hf.py @@ -0,0 +1,40 @@ +from transformers import HubertModel +import torch.nn as nn +import torch +import torch.nn.functional as F +import librosa + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + + # The final projection layer is only used for backward compatibility. 
+ # Following https://github.com/auspicious3000/contentvec/issues/6 + # Remove this layer is necessary to achieve the desired outcome. + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def get_content_model(config='lengyue233/content-vec-best'): + model = HubertModelWithFinalProj.from_pretrained(config) + model.eval() + return model + + +@torch.no_grad() +def get_content(model, wav_16k_tensor, device='cuda'): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + logits = model(wav_16k_tensor)['last_hidden_state'] + return logits + + +if __name__ == '__main__': + model = get_content_model().cuda() + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + audio = torch.tensor([audio]) + content = get_content(model, audio, 'cuda') + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/hubert/.gitignore b/dreamvoice/src/feats/hubert/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0202868f93e8b1be2f925f2ec6b22f3df691e8c3 --- /dev/null +++ b/dreamvoice/src/feats/hubert/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# VSCode project settings +.vscode + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/dreamvoice/src/feats/hubert/LICENSE b/dreamvoice/src/feats/hubert/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e --- /dev/null +++ b/dreamvoice/src/feats/hubert/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Benjamin van Niekerk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/src/feats/hubert/README.md b/dreamvoice/src/feats/hubert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..68602858ed726acd4f99ce9fecca008f3511dc90 --- /dev/null +++ b/dreamvoice/src/feats/hubert/README.md @@ -0,0 +1,161 @@ +# HuBERT + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2111.02392) +[![demo](https://img.shields.io/static/v1?message=Audio%20Samples&logo=Github&labelColor=grey&color=blue&logoColor=white&label=%20&style=flat)](https://bshall.github.io/soft-vc/) +[![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb) + +Training and inference scripts for the HuBERT content encoders in [A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion](https://ieeexplore.ieee.org/abstract/document/9746484). +For more details see [soft-vc](https://github.com/bshall/soft-vc). Audio samples can be found [here](https://bshall.github.io/soft-vc/). Colab demo can be found [here](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb). + +
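This vendored encoder is wrapped for DreamVoice in `dreamvoice/src/feats/hubert_model.py` (added elsewhere in this diff). Below is a minimal sketch of how that wrapper is intended to be called, assuming a local `hubert_soft` checkpoint; the path mirrors the commented-out line in `extract_features.py` and is not shipped with this change:

```python
import torch
import librosa
from dreamvoice.src.feats.hubert_model import get_soft_model, get_hubert_soft_content

# get_soft_model builds a HubertSoft and loads the "hubert" state dict from the file
hubert = get_soft_model("pre_ckpts/hubert_soft.pt").to("cuda")  # checkpoint path is an assumption

# the encoder expects 16 kHz mono audio
audio, _ = librosa.load("example.wav", sr=16000)
wav = torch.tensor(audio).unsqueeze(0)  # shape (1, T)

# get_hubert_soft_content adds the channel dimension, calls HubertSoft.units and
# returns roughly 50 frames of 256-dim soft units per second, moved back to CPU
units = get_hubert_soft_content(hubert, wav, device="cuda")
print(units.shape)  # (1, N, 256)
```

Note that `extract_features.py` in this diff keeps the HuBERT import commented out and extracts content features with ContentVec instead (see the note after Fig 1).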
+ [figure: content-encoder.png, "Soft-VC" architecture diagram]
+
+ Fig 1: Architecture of the voice conversion system. a) The discrete content encoder clusters audio features to produce a sequence of discrete speech units. b) The soft content encoder is trained to predict the discrete units. The acoustic model transforms the discrete/soft speech units into a target spectrogram. The vocoder converts the spectrogram into an audio waveform.
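For reference, DreamVoice's own feature pipeline (`extract_features.py` and `feats/contentvec_hf.py` in this diff) does not call this HuBERT-Soft encoder; it loads a ContentVec model from the Hugging Face hub. A minimal sketch of that path, mirroring the `__main__` block of `contentvec_hf.py` (the audio file name is a placeholder):

```python
import torch
import librosa
from dreamvoice.src.feats.contentvec_hf import get_content_model, get_content

# loads "lengyue233/content-vec-best" and puts it in eval mode
model = get_content_model().cuda()

# 16 kHz mono input; get_content pads by (400 - 320) // 2 on each side so the
# output has roughly one 768-dim frame per 320 input samples (50 frames per second)
audio, _ = librosa.load("example.wav", sr=16000)
wav = torch.tensor(audio).unsqueeze(0)  # shape (1, T)

content = get_content(model, wav, device="cuda")
print(content.shape)  # (1, ~T // 320, 768)
```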
+ +## Example Usage + +### Programmatic Usage + +```python +import torch, torchaudio + +# Load checkpoint (either hubert_soft or hubert_discrete) +hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda() + +# Load audio +wav, sr = torchaudio.load("path/to/wav") +assert sr == 16000 +wav = wav.unsqueeze(0).cuda() + +# Extract speech units +units = hubert.units(x) +``` + +### Script-Based Usage + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +## Training + +### Step 1: Dataset Preparation + +Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus. The training script expects the following tree structure for the dataset directory: + +``` +│ lengths.json +│ +└───wavs + ├───dev-* + │ ├───84 + │ ├───... + │ └───8842 + └───train-* + ├───19 + ├───... + └───8975 +``` + +The `train-*` and `dev-*` directories should contain the training and validation splits respectively. Note that there can be multiple `train` and `dev` folders e.g., `train-clean-100`, `train-other-500`, etc. Finally, the `lengths.json` file should contain key-value pairs with the file path and number of samples: + +```json +{ + "dev-clean/1272/128104/1272-128104-0000": 93680, + "dev-clean/1272/128104/1272-128104-0001": 77040, +} +``` + +### Step 2: Extract Discrete Speech Units + +Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script: + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +for example: + +``` +python encode.py discrete path/to/LibriSpeech/wavs path/to/LibriSpeech/discrete +``` + +At this point the directory tree should look like: + +``` +│ lengths.json +│ +├───discrete +│ ├───... +└───wavs + ├───... +``` + +### Step 3: Train the HuBERT-Soft Content Encoder + +``` +usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir + +Train HuBERT soft content encoder. + +positional arguments: + dataset-dir path to the data directory. + checkpoint-dir path to the checkpoint directory. + +optional arguments: + -h, --help show this help message and exit + --resume RESUME path to the checkpoint to resume from. + --warmstart whether to initialize from the fairseq HuBERT checkpoint. + --mask whether to use input masking. + --alpha ALPHA weight for the masked loss. 
+``` + +## Links + +- [Soft-VC repo](https://github.com/bshall/soft-vc) +- [Soft-VC paper](https://ieeexplore.ieee.org/abstract/document/9746484) +- [Official HuBERT repo](https://github.com/pytorch/fairseq) +- [HuBERT paper](https://arxiv.org/abs/2106.07447) + +## Citation + +If you found this work helpful please consider citing our paper: + +``` +@inproceedings{ + soft-vc-2022, + author={van Niekerk, Benjamin and Carbonneau, Marc-André and Zaïdi, Julian and Baas, Matthew and Seuté, Hugo and Kamper, Herman}, + booktitle={ICASSP}, + title={A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion}, + year={2022} +} +``` diff --git a/dreamvoice/src/feats/hubert/cluster.py b/dreamvoice/src/feats/hubert/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..18b754c73c63b79e943d51e76414f0056f05589f --- /dev/null +++ b/dreamvoice/src/feats/hubert/cluster.py @@ -0,0 +1,66 @@ +from pathlib import Path +import logging +import argparse + +import torch +import numpy as np +from sklearn.cluster import KMeans + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def cluster(args): + with open(args.subset) as file: + subset = [line.strip() for line in file] + + logger.info(f"Loading features from {args.in_dir}") + features = [] + for path in subset: + in_path = args.in_dir / path + features.append(np.load(in_path.with_suffix(".npy"))) + features = np.concatenate(features, axis=0) + + logger.info(f"Clustering features of shape: {features.shape}") + kmeans = KMeans(n_clusters=args.n_clusters).fit(features) + + checkpoint_path = args.checkpoint_dir / f"kmeans_{args.n_clusters}.pt" + checkpoint_path.parent.mkdir(exist_ok=True, parents=True) + torch.save( + checkpoint_path, + { + "n_features_in_": kmeans.n_features_in_, + "_n_threads": kmeans._n_threads, + "cluster_centers_": kmeans.cluster_centers_, + }, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Cluster speech features features.") + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the encoded dataset", + type=Path, + ) + parser.add_argument( + "subset", + matavar="subset", + help="path to the .txt file containing the list of files to cluster", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory", + type=Path, + ) + parser.add_argument( + "--n-clusters", + help="number of clusters", + type=int, + default=100, + ) + args = parser.parse_args() + cluster(args) diff --git a/dreamvoice/src/feats/hubert/content-encoder.png b/dreamvoice/src/feats/hubert/content-encoder.png new file mode 100644 index 0000000000000000000000000000000000000000..fc59d538a9383896cf0c36e1d4a3f5030fce38fe Binary files /dev/null and b/dreamvoice/src/feats/hubert/content-encoder.png differ diff --git a/dreamvoice/src/feats/hubert/encode.py b/dreamvoice/src/feats/hubert/encode.py new file mode 100644 index 0000000000000000000000000000000000000000..14246e985fb0e9dc157d290853af6dcf6036f61c --- /dev/null +++ b/dreamvoice/src/feats/hubert/encode.py @@ -0,0 +1,60 @@ +import argparse +import logging +import numpy as np +from pathlib import Path +from tqdm import tqdm + +import torch +import torchaudio +from torchaudio.functional import resample + + +def encode_dataset(args): + print(f"Loading hubert checkpoint") + hubert = torch.hub.load( + "bshall/hubert:main", + f"hubert_{args.model}", + trust_repo=True, + ).cuda() + + print(f"Encoding dataset at {args.in_dir}") + for in_path in 
tqdm(list(args.in_dir.rglob(f"*{args.extension}"))): + wav, sr = torchaudio.load(in_path) + wav = resample(wav, sr, 16000) + wav = wav.unsqueeze(0).cuda() + + with torch.inference_mode(): + units = hubert.units(wav) + + out_path = args.out_dir / in_path.relative_to(args.in_dir) + out_path.parent.mkdir(parents=True, exist_ok=True) + np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Encode an audio dataset.") + parser.add_argument( + "model", + help="available models (HuBERT-Soft or HuBERT-Discrete)", + choices=["soft", "discrete"], + ) + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the dataset directory.", + type=Path, + ) + parser.add_argument( + "out_dir", + metavar="out-dir", + help="path to the output directory.", + type=Path, + ) + parser.add_argument( + "--extension", + help="extension of the audio files (defaults to .flac).", + default=".flac", + type=str, + ) + args = parser.parse_args() + encode_dataset(args) diff --git a/dreamvoice/src/feats/hubert/hubconf.py b/dreamvoice/src/feats/hubert/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..b58749e4a40b29eab470686b27e06a97bfecb321 --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubconf.py @@ -0,0 +1,80 @@ +dependencies = ["torch", "torchaudio", "sklearn"] + +URLS = { + "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-discrete-96b248c5.pt", + "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt", + "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.2/kmeans100-50f36a95.pt", +} + +import torch +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from sklearn.cluster import KMeans + +from hubert import HubertDiscrete, HubertSoft + + +def hubert_discrete( + pretrained: bool = True, + progress: bool = True, +) -> HubertDiscrete: + r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + kmeans = kmeans100(pretrained=pretrained, progress=progress) + hubert = HubertDiscrete(kmeans) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], progress=progress + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def hubert_soft( + pretrained: bool = True, + progress: bool = True, +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model. + progress (bool): show progress bar when downloading model. 
+ """ + hubert = HubertSoft() + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-soft"], + progress=progress, + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def _kmeans( + num_clusters: int, pretrained: bool = True, progress: bool = True +) -> KMeans: + kmeans = KMeans(num_clusters) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS[f"kmeans{num_clusters}"], progress=progress + ) + kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"] + kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"] + kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy() + return kmeans + + +def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans: + r""" + k-means checkpoint for HuBERT-Discrete with 100 clusters. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + return _kmeans(100, pretrained, progress) diff --git a/dreamvoice/src/feats/hubert/hubert/__init__.py b/dreamvoice/src/feats/hubert/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e07f859e99f51dcf35639f26a3eb53a81c993f3 --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/__init__.py @@ -0,0 +1,5 @@ +from .model import ( + Hubert, + HubertDiscrete, + HubertSoft, +) diff --git a/dreamvoice/src/feats/hubert/hubert/dataset.py b/dreamvoice/src/feats/hubert/hubert/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3ac2b84f95340e088913e06db8e5db0a68e83c2e --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/dataset.py @@ -0,0 +1,91 @@ +import random +from pathlib import Path +import numpy as np +import json + +import torch +import torch.nn.functional as F +from torch.utils.data import Dataset +import torchaudio + + +class AcousticUnitsDataset(Dataset): + def __init__( + self, + root: Path, + sample_rate: int = 16000, + label_rate: int = 50, + min_samples: int = 32000, + max_samples: int = 250000, + train: bool = True, + ): + self.wavs_dir = root / "wavs" + self.units_dir = root / "discrete" + + with open(root / "lengths.json") as file: + self.lenghts = json.load(file) + + pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac" + metadata = ( + (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix()) + for path in self.wavs_dir.rglob(pattern) + ) + metadata = ((path, key) for path, key in metadata if key in self.lenghts) + self.metadata = [ + path for path, key in metadata if self.lenghts[key] > min_samples + ] + + self.sample_rate = sample_rate + self.label_rate = label_rate + self.min_samples = min_samples + self.max_samples = max_samples + self.train = train + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, index): + wav_path = self.metadata[index] + units_path = self.units_dir / wav_path.relative_to(self.wavs_dir) + + wav, _ = torchaudio.load(wav_path) + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + codes = np.load(units_path.with_suffix(".npy")) + + return wav, torch.from_numpy(codes).long() + + def collate(self, batch): + wavs, codes = zip(*batch) + wavs, codes = list(wavs), list(codes) + + wav_lengths = [wav.size(-1) for wav in wavs] + code_lengths = [code.size(-1) for code in codes] + + wav_frames = min(self.max_samples, *wav_lengths) + + collated_wavs, wav_offsets = [], [] + for wav in wavs: + wav_diff = wav.size(-1) - 
wav_frames + wav_offset = random.randint(0, wav_diff) + wav = wav[:, wav_offset : wav_offset + wav_frames] + + collated_wavs.append(wav) + wav_offsets.append(wav_offset) + + rate = self.label_rate / self.sample_rate + code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets] + code_frames = round(wav_frames * rate) + remaining_code_frames = [ + length - offset for length, offset in zip(code_lengths, code_offsets) + ] + code_frames = min(code_frames, *remaining_code_frames) + + collated_codes = [] + for code, code_offset in zip(codes, code_offsets): + code = code[code_offset : code_offset + code_frames] + collated_codes.append(code) + + wavs = torch.stack(collated_wavs, dim=0) + codes = torch.stack(collated_codes, dim=0) + + return wavs, codes diff --git a/dreamvoice/src/feats/hubert/hubert/model.py b/dreamvoice/src/feats/hubert/hubert/model.py new file mode 100644 index 0000000000000000000000000000000000000000..523dd95633ba73babff8b6836324ae0a7c2d267f --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/model.py @@ -0,0 +1,241 @@ +import copy +from typing import Optional, Tuple +import random + +from sklearn.cluster import KMeans + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + """Extract soft speech units. + + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimensions. 
+ """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class HubertDiscrete(Hubert): + """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self, kmeans: KMeans): + super().__init__(504) + self.kmeans = kmeans + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.LongTensor: + """Extract discrete speech units. + + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + LongTensor: soft speech units of shape (N,), where N is the number of frames. + """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav, layer=7) + x = self.kmeans.predict(x.squeeze().cpu().numpy()) + return torch.tensor(x, dtype=torch.long, device=wav.device) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.norm0(self.conv0(x))) + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = F.gelu(self.conv3(x)) + x = F.gelu(self.conv4(x)) + x = F.gelu(self.conv5(x)) + x = F.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = F.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + 
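# (the expected span count is mask_prob * sequence_length / mask_length; adding
# random.random() before the int() truncation rounds stochastically, so the
# expected number of masked spans is preserved on average)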
num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask diff --git a/dreamvoice/src/feats/hubert/hubert/utils.py b/dreamvoice/src/feats/hubert/hubert/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d42ba3acb822938f246dba27b3de81ec51aa72b0 --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/utils.py @@ -0,0 +1,61 @@ +import torch + + +class Metric: + def __init__(self): + self.steps = 0 + self.value = 0 + + def update(self, value): + self.steps += 1 + self.value += (value - self.value) / self.steps + return self.value + + def reset(self): + self.steps = 0 + self.value = 0 + + +def save_checkpoint( + checkpoint_dir, + hubert, + optimizer, + scaler, + step, + loss, + best, + logger, +): + state = { + "hubert": hubert.state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict(), + "step": step, + "loss": loss, + } + checkpoint_dir.mkdir(exist_ok=True, parents=True) + checkpoint_path = checkpoint_dir / f"model-{step}.pt" + torch.save(state, checkpoint_path) + if best: + best_path = checkpoint_dir / "model-best.pt" + torch.save(state, best_path) + logger.info(f"Saved checkpoint: {checkpoint_path.stem}") + + +def load_checkpoint( + load_path, + hubert, + optimizer, + scaler, + rank, + logger, +): + logger.info(f"Loading checkpoint from {load_path}") + checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"}) + hubert.load_state_dict(checkpoint["hubert"]) + if "scaler" in checkpoint: + scaler.load_state_dict(checkpoint["scaler"]) + if "optimizer" in checkpoint: + optimizer.load_state_dict(checkpoint["optimizer"]) + step, loss = checkpoint.get("step", 0), checkpoint.get("loss", float("inf")) + return step, loss diff --git a/dreamvoice/src/feats/hubert/train.py b/dreamvoice/src/feats/hubert/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5ca9de087f72e343ffb4e5ef00cdbb90765097 --- /dev/null +++ b/dreamvoice/src/feats/hubert/train.py @@ -0,0 +1,459 @@ +import argparse +import logging +from pathlib import Path + +import torch +import torch.cuda.amp as amp +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler +import 
torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from hubert.model import Hubert, URLS +from hubert.dataset import AcousticUnitsDataset +from hubert.utils import Metric, save_checkpoint, load_checkpoint + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +######################################################################################## +# Define hyperparameters for training: +######################################################################################## + +BATCH_SIZE = 32 +LEARNING_RATE = 2e-5 +BETAS = (0.9, 0.98) +EPS = 1e-06 +WEIGHT_DECAY = 1e-2 +MAX_NORM = 10 +STEPS = 25000 +LOG_INTERVAL = 5 +VALIDATION_INTERVAL = 1000 +CHECKPOINT_INTERVAL = 5000 +BACKEND = "nccl" +INIT_METHOD = "tcp://localhost:54321" + + +def train(rank, world_size, args): + dist.init_process_group( + BACKEND, + rank=rank, + world_size=world_size, + init_method=INIT_METHOD, + ) + + #################################################################################### + # Setup logging utilities: + #################################################################################### + + log_dir = args.checkpoint_dir / "logs" + log_dir.mkdir(exist_ok=True, parents=True) + + if rank == 0: + logger.setLevel(logging.INFO) + handler = logging.FileHandler(log_dir / f"{args.checkpoint_dir.stem}.log") + handler.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S" + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + else: + logger.setLevel(logging.ERROR) + + writer = SummaryWriter(log_dir) if rank == 0 else None + + #################################################################################### + # Initialize models + #################################################################################### + + hubert = Hubert(mask=args.mask).to(rank) + + if args.warmstart: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], map_location={"cuda:0": f"cuda:{rank}"} + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + + # don't use warmstart weights for label embeddings and proj layer + del checkpoint["hubert"]["label_embedding.weight"] + del checkpoint["hubert"]["proj.weight"] + del checkpoint["hubert"]["proj.bias"] + + hubert.load_state_dict(checkpoint["hubert"], strict=False) + + hubert = DDP(hubert, device_ids=[rank]) + + #################################################################################### + # Initialze optimizer and grad scaler + #################################################################################### + + optimizer = optim.AdamW( + hubert.parameters(), + lr=LEARNING_RATE, + betas=BETAS, + eps=EPS, + weight_decay=WEIGHT_DECAY, + ) + scaler = amp.GradScaler() + + #################################################################################### + # Initialize datasets and dataloaders + #################################################################################### + + train_dataset = AcousticUnitsDataset( + root=args.dataset_dir, + train=True, + ) + train_sampler = DistributedSampler(train_dataset, drop_last=True) + train_loader = DataLoader( + train_dataset, + collate_fn=train_dataset.collate, + batch_size=BATCH_SIZE, + sampler=train_sampler, + num_workers=8, + pin_memory=True, + shuffle=False, + drop_last=True, + ) + + validation_dataset = AcousticUnitsDataset( + root=args.dataset_dir, 
+ train=False, + ) + validation_loader = DataLoader( + validation_dataset, + batch_size=1, + shuffle=False, + num_workers=8, + pin_memory=True, + ) + + #################################################################################### + # Load checkpoint if args.resume is set + #################################################################################### + + if args.resume is not None: + global_step, best_loss = load_checkpoint( + load_path=args.resume, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + rank=rank, + logger=logger, + ) + else: + global_step, best_loss = 0, float("inf") + + # =================================================================================# + # Start training loop + # =================================================================================# + + n_epochs = STEPS // len(train_loader) + 1 + start_epoch = global_step // len(train_loader) + 1 + + logger.info("**" * 40) + logger.info(f"PyTorch version: {torch.__version__}") + logger.info(f"CUDA version: {torch.version.cuda}") + logger.info(f"CUDNN version: {torch.backends.cudnn.version()}") + logger.info(f"CUDNN enabled: {torch.backends.cudnn.enabled}") + logger.info(f"CUDNN deterministic: {torch.backends.cudnn.deterministic}") + logger.info(f"CUDNN benchmark: {torch.backends.cudnn.benchmark}") + logger.info(f"# of GPUS: {torch.cuda.device_count()}") + logger.info(f"batch size: {BATCH_SIZE}") + logger.info(f"iterations per epoch: {len(train_loader)}") + logger.info(f"# of epochs: {n_epochs}") + logger.info(f"started at epoch: {start_epoch}") + logger.info("**" * 40 + "\n") + + if args.mask: + average_masked_loss = Metric() + average_unmasked_loss = Metric() + average_masked_accuracy = Metric() + average_unmasked_accuracy = Metric() + + epoch_masked_loss = Metric() + epoch_unmasked_loss = Metric() + epoch_masked_accuracy = Metric() + epoch_unmasked_accuracy = Metric() + else: + average_loss = Metric() + average_accuracy = Metric() + + epoch_loss = Metric() + epoch_accuracy = Metric() + + validation_loss = Metric() + validation_accuracy = Metric() + + for epoch in range(start_epoch, n_epochs + 1): + train_sampler.set_epoch(epoch) + + hubert.train() + if args.mask: + epoch_masked_loss.reset() + epoch_unmasked_loss.reset() + epoch_masked_accuracy.reset() + epoch_unmasked_accuracy.reset() + else: + epoch_loss.reset() + epoch_accuracy.reset() + + for wavs, codes in train_loader: + global_step += 1 + wavs, codes = wavs.to(rank), codes.to(rank) + + ############################################################################ + # Compute training loss + ############################################################################ + + optimizer.zero_grad() + + with amp.autocast(): + logits, mask = hubert(wavs) + length = min( + mask.size(-1) if args.mask else float("inf"), codes.size(-1) + ) + logits = logits[:, :length, :] + codes = codes[:, :length] + if args.mask: + mask = mask[:, :length] + + if args.mask: + masked_loss = F.cross_entropy(logits[mask], codes[mask]) + unmasked_loss = F.cross_entropy(logits[~mask], codes[~mask]) + loss = args.alpha * masked_loss + (1 - args.alpha) * unmasked_loss + else: + loss = F.cross_entropy(logits.transpose(1, 2), codes) + + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + + nn.utils.clip_grad_norm_(hubert.parameters(), MAX_NORM) + + scaler.step(optimizer) + scaler.update() + + if args.mask: + masked_accuracy = logits[mask].argmax(dim=-1) == codes[mask] + masked_accuracy = torch.mean(masked_accuracy.float()) + + unmasked_accuracy = 
logits[~mask].argmax(dim=-1) == codes[~mask] + unmasked_accuracy = torch.mean(unmasked_accuracy.float()) + else: + accuracy = logits.argmax(dim=-1) == codes + accuracy = torch.mean(accuracy.float()) + + ############################################################################ + # Update and log training metrics + ############################################################################ + + if args.mask: + average_masked_loss.update(masked_loss.item()) + average_unmasked_loss.update(unmasked_loss.item()) + average_masked_accuracy.update(masked_accuracy.item()) + average_unmasked_accuracy.update(unmasked_accuracy.item()) + + epoch_masked_loss.update(masked_loss.item()) + epoch_unmasked_loss.update(unmasked_loss.item()) + epoch_masked_accuracy.update(masked_accuracy.item()) + epoch_unmasked_accuracy.update(unmasked_accuracy.item()) + else: + average_loss.update(loss.item()) + average_accuracy.update(accuracy.item()) + + epoch_loss.update(loss.item()) + epoch_accuracy.update(accuracy.item()) + + if rank == 0 and global_step % LOG_INTERVAL == 0: + if args.mask: + writer.add_scalar( + "train/masked_loss", + average_masked_loss.value, + global_step, + ) + writer.add_scalar( + "train/unmasked_loss", + average_unmasked_loss.value, + global_step, + ) + writer.add_scalar( + "train/masked_accuracy", + average_masked_accuracy.value * 100, + global_step, + ) + writer.add_scalar( + "train/unmasked_accuracy", + average_unmasked_accuracy.value * 100, + global_step, + ) + average_masked_loss.reset() + average_unmasked_loss.reset() + average_masked_accuracy.reset() + average_unmasked_accuracy.reset() + else: + writer.add_scalar( + "train/loss", + average_loss.value, + global_step, + ) + writer.add_scalar( + "train/accuracy", + average_accuracy.value, + global_step, + ) + average_loss.reset() + average_accuracy.reset() + + # --------------------------------------------------------------------------# + # Start validation loop + # --------------------------------------------------------------------------# + + if global_step % VALIDATION_INTERVAL == 0: + hubert.eval() + validation_loss.reset() + validation_accuracy.reset() + for wavs, codes in validation_loader: + wavs, codes = wavs.to(rank), codes.to(rank) + + with torch.no_grad(): + logits, _ = hubert(wavs) + logits = logits.transpose(1, 2) + + loss = F.cross_entropy(logits, codes) + + accuracy = logits.argmax(dim=1) == codes + accuracy = torch.mean(accuracy.float()) + + #################################################################### + # Update validation metrics + #################################################################### + + validation_loss.update(loss.item()) + validation_accuracy.update(accuracy.item()) + + hubert.train() + + ############################################################################ + # Log validation metrics + ############################################################################ + + if rank == 0: + writer.add_scalar( + "validation/unit_loss", + validation_loss.value, + global_step, + ) + writer.add_scalar( + "validation/unit_accuracy", + validation_accuracy.value * 100, + global_step, + ) + logger.info( + f"valid -- epoch: {epoch}, loss: {validation_loss.value:.4f}, accuracy: {validation_accuracy.value * 100:.2f}" + ) + + ############################################################################ + # Save model checkpoint + ############################################################################ + + new_best = best_loss > validation_loss.value + if new_best or global_step % CHECKPOINT_INTERVAL == 0: + if 
new_best: + logger.info("-------- new best model found!") + best_loss = validation_loss.value + + if rank == 0: + save_checkpoint( + checkpoint_dir=args.checkpoint_dir, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + step=global_step, + loss=validation_loss.value, + best=new_best, + logger=logger, + ) + + # -----------------------------------------------------------------------------# + # End validation loop + # -----------------------------------------------------------------------------# + + #################################################################################### + # Log training metrics + #################################################################################### + + logger.info( + f""" + train -- epoch: {epoch}, masked loss: {epoch_masked_loss.value:.4f}, unmasked loss: {epoch_unmasked_loss.value:.4f}, + masked accuracy: {epoch_masked_accuracy.value * 100:.2f}, umasked accuracy: {epoch_unmasked_accuracy.value * 100:.2f} + """ + ) + + # ==================================================================================# + # End training loop + # ==================================================================================# + + dist.destroy_process_group() + + +def train_hubert(args): + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Train HuBERT soft content encoder.") + parser.add_argument( + "dataset_dir", + metavar="dataset-dir", + help="path to the data directory.", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory.", + type=Path, + ) + parser.add_argument( + "--resume", + help="path to the checkpoint to resume from.", + type=Path, + ) + parser.add_argument( + "--warmstart", + help="whether to initialize from the fairseq HuBERT checkpoint.", + action="store_true", + ) + parser.add_argument( + "--mask", + help="whether to use input masking.", + action="store_true", + ) + parser.add_argument( + "--alpha", + help="weight for the masked loss.", + default=1, + type=float, + ) + args = parser.parse_args() + + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) diff --git a/dreamvoice/src/feats/hubert_model.py b/dreamvoice/src/feats/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd --- /dev/null +++ b/dreamvoice/src/feats/hubert_model.py @@ -0,0 +1,24 @@ +import torch, torchaudio +from .hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1) + # print(wav_16k_tensor.shape) + units = hmodel.units(wav_16k_tensor) + # print(units.shape) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py 
b/dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2 --- /dev/null +++ b/dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class DiffVC(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.speaker_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim'])) + self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) / + self.config['cls_embedding']['speaker_dim'] ** 0.5) + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, speaker, pitch, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype) + batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device) + speaker = torch.where(batch_mask, uncond, speaker) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + speaker = self.speaker_embedding(speaker) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_base_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 
'cuda' + + model = DiffVC(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + p = torch.rand(2, 256, 1).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + spk = torch.rand(2, 256).to(device) + + output = model(x, t, y, spk, pitch=p, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py b/dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2 --- /dev/null +++ b/dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class DiffVC_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if 
train_cfg: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + prompt = self.context_embedding(prompt) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_cross_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + p = torch.rand(2, 256, 1).to(device) + + output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py b/dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..266e177f862a0d66658ed1b7e9d73e1947755ab4 --- /dev/null +++ b/dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class P2E_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + def forward(self, target, t, prompt, prompt_mask=None, + train_cfg=False, cfg_prob=0.0): + B, C = target.shape + target = target.unsqueeze(-1).unsqueeze(-1) + + if train_cfg: + if cfg_prob > 0.0: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device) + fixed_embedding = 
self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + prompt = self.context_embedding(prompt) + # fix the bug that prompt will copy dtype from target in diffusers + target = target.to(prompt.dtype) + + output = self.unet(sample=target, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output.squeeze(-1).squeeze(-1) + + +if __name__ == "__main__": + with open('p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = P2E_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 256)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + + output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/src/model/__pycache__/model.cpython-310.pyc b/dreamvoice/src/model/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..564230e72743052a16a565fd58b54b6d88a234de Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model.cpython-310.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model.cpython-311.pyc b/dreamvoice/src/model/__pycache__/model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d044dc24ae43a0e18c8b14a3f3d61a073725134a Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model.cpython-311.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc b/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b07fb812aaf24610b9ac2f9e54746a53e61950b Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc b/dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c849e2bff7d11e4c03eac35ab1d4964d0d948677 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc b/dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ce98cc327edd0c0dedc479a3e950985e1794694 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc differ diff --git a/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7f1e6c9c0a5b94c07d393f37cf2aa2a8e8bd2f1 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc differ diff --git a/dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ab2868fb6e1eadb7ce8f20ee4ce52c05f62de68 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc differ diff --git a/dreamvoice/src/model/model.py b/dreamvoice/src/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2 --- /dev/null +++ b/dreamvoice/src/model/model.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from diffusers 
import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class DiffVC(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.speaker_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim'])) + self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) / + self.config['cls_embedding']['speaker_dim'] ** 0.5) + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, speaker, pitch, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype) + batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device) + speaker = torch.where(batch_mask, uncond, speaker) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + speaker = self.speaker_embedding(speaker) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_base_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + p = torch.rand(2, 256, 1).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + spk = torch.rand(2, 256).to(device) + + output = model(x, t, y, spk, pitch=p, 
train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/src/model/model_cross.py b/dreamvoice/src/model/model_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2 --- /dev/null +++ b/dreamvoice/src/model/model_cross.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class DiffVC_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, 
t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + prompt = self.context_embedding(prompt) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_cross_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + p = torch.rand(2, 256, 1).to(device) + + output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/src/model/p2e_cross.py b/dreamvoice/src/model/p2e_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..266e177f862a0d66658ed1b7e9d73e1947755ab4 --- /dev/null +++ b/dreamvoice/src/model/p2e_cross.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class P2E_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + def forward(self, target, t, prompt, prompt_mask=None, + train_cfg=False, cfg_prob=0.0): + B, C = target.shape + target = target.unsqueeze(-1).unsqueeze(-1) + + if train_cfg: + if cfg_prob > 0.0: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + prompt = self.context_embedding(prompt) + # fix the bug that prompt will copy dtype from target in diffusers + target = target.to(prompt.dtype) + + output = self.unet(sample=target, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output.squeeze(-1).squeeze(-1) + + +if __name__ == "__main__": + with open('p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device 
= 'cuda' + + model = P2E_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 256)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + + output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py b/dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f --- /dev/null +++ b/dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,): + super().__init__() + self.frame_length = frame_length + self.hop_length = hop_length + self.mel = transforms.MelSpectrogram( + sample_rate=sr, + n_fft=frame_length, + win_length=frame_length, + hop_length=hop_length, + center=False, + power=1.0, + norm="slaney", + n_mels=n_mel, + mel_scale="slaney", + f_min=f_min, + f_max=f_max + ) + + @torch.no_grad() + def forward(self, x, target_length=None): + x = F.pad(x, ((self.frame_length - self.hop_length) // 2, + (self.frame_length - self.hop_length) // 2), "reflect") + mel = self.mel(x) + + target_length = mel.shape[-1] if target_length is None else target_length + logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device) + logmel[:, :, :mel.shape[2]] = mel + + logmel = torch.log(torch.clamp(logmel, min=1e-5)) + return logmel \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/.ipynb_checkpoints/models-checkpoint.py b/dreamvoice/src/modules/BigVGAN/.ipynb_checkpoints/models-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/.ipynb_checkpoints/models-checkpoint.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
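The `LogMelSpectrogram` front end defined in `mel-checkpoint.py` above can be exercised on its own; a minimal sketch, assuming the non-checkpoint copy of the module is importable from `dreamvoice/src/modules/mel.py` and using a placeholder `example.wav` at 24 kHz:

```python
import torch
import librosa

# Assumed import path for the non-checkpoint copy of this module.
from dreamvoice.src.modules.mel import LogMelSpectrogram

# Default configuration: 24 kHz audio, 1920-sample frames, 480-sample hop, 128 mel bins.
mel_fn = LogMelSpectrogram(sr=24000, frame_length=1920, hop_length=480, n_mel=128)

wav, _ = librosa.load('example.wav', sr=24000)   # placeholder input clip
x = torch.from_numpy(wav).unsqueeze(0)           # [1, T]
logmel = mel_fn(x)                               # [1, 128, about T // 480] log-mel features
print(logmel.shape)
```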
+ + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/dreamvoice/src/modules/BigVGAN/LICENSE b/dreamvoice/src/modules/BigVGAN/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/README.md b/dreamvoice/src/modules/BigVGAN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6cff37786a486deb55bc070254027aa492c2e92 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/README.md @@ -0,0 +1,95 @@ +## BigVGAN: A Universal Neural Vocoder with Large-Scale Training +#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon + +
+ + +### [Paper](https://arxiv.org/abs/2206.04658) +### [Audio demo](https://bigvgan-demo.github.io/) + +## Installation +Clone the repository and install dependencies. +```shell +# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries +git clone https://github.com/NVIDIA/BigVGAN +pip install -r requirements.txt +``` + +Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. +``` shell +cd LibriTTS && \ +ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ +ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ +ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ +ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ +ln -s /path/to/your/LibriTTS/dev-other dev-other && \ +ln -s /path/to/your/LibriTTS/test-clean test-clean && \ +ln -s /path/to/your/LibriTTS/test-other test-other && \ +cd .. +``` + +## Training +Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. +```shell +python train.py \ +--config configs/bigvgan_24khz_100band.json \ +--input_wavs_dir LibriTTS \ +--input_training_file LibriTTS/train-full.txt \ +--input_validation_file LibriTTS/val-full.txt \ +--list_input_unseen_wavs_dir LibriTTS LibriTTS \ +--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ +--checkpoint_path exp/bigvgan +``` + +## Synthesis +Synthesize from BigVGAN model. Below is an example command for generating audio from the model. +It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. +```shell +python inference.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_wavs_dir /path/to/your/input_wav \ +--output_dir /path/to/your/output_wav +``` + +`inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. +It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. + +Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. +```shell +python inference_e2e.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_mels_dir /path/to/your/input_mel \ +--output_dir /path/to/your/output_wav +``` + +## Pretrained Models +We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). +One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. + +|Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| +|------|---|---|---|---|------|---| +|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| +|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| +|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| +|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| + +The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. +We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. +Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 
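For programmatic use, the same checkpoints can also be driven from Python through the `load_model` helper bundled in this package's `inference.py`; a short sketch, assuming a downloaded generator at `exp/bigvgan/g_05000000` with its `config.json` alongside it, a mel saved as `example_mel.npy`, and standard HiFi-GAN-style config fields such as `sampling_rate`:

```python
import numpy as np
import torch
from scipy.io.wavfile import write

from dreamvoice.src.modules.BigVGAN.inference import load_model
from dreamvoice.src.modules.BigVGAN.utils import MAX_WAV_VALUE

# load_model reads the config.json next to the checkpoint and strips weight norm.
generator, h = load_model('exp/bigvgan/g_05000000', device='cuda')

# Mel input shaped [1, num_mels, frames], matching the model's config.
mel = torch.from_numpy(np.load('example_mel.npy')).float().cuda()
if mel.dim() == 2:
    mel = mel.unsqueeze(0)

with torch.no_grad():
    audio = generator(mel).squeeze()             # float waveform in [-1, 1]
audio = (audio * MAX_WAV_VALUE).cpu().numpy().astype('int16')
write('generated.wav', h.sampling_rate, audio)
```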
+ + +## TODO + +Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. + + +## References +* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) + +* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) + +* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) + +* [Julius](https://github.com/adefossez/julius) (for low-pass filter) + +* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd8ec6a9f201528dae3177dec447dba562779d13 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..944bf499d4d9e2293856a3fc1b7b589f09b11bba Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6ad6022b905f0726278468c130c38e351229424 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f5238ccc2abe7d89ad07b8be8d28aaf2b434b24 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9452098b19c32715461aaed8412d15a78947021 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4da540abb6cee481f0857f4ed62b888194fa9b1f Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27e63b2b64f00899b7ec2910443d7058cfb05570 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ffcf33665e7ad868aa70941e5050c25c06e1277 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc 
differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..717995369d8f414c631aeda4d0d421b77ecc7ce3 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1fb7f580e6ffa7546e2a08bf4f1fb064422e0fc Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d62632c6c8d75cb2014386a0dd10d78831f7cfa4 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ef60d0e501cc1558847c17382aea818671cb73c Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fb9eb3da3c95ebeb39caad5e879e10beaff693d Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a8baa0a3aa82168837a209ad631a824fa21cb7a Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcde8e2762c89f5509b59c3537f8b2caa5bc36e3 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae498304bce7351eedd5837cc7488a7a7e8583b4 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/activations.py b/dreamvoice/src/modules/BigVGAN/activations/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..61f2808a5466b3cf4d041059700993af5527dd29 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/activations/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + self.beta = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + self.beta = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. 
+ SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__init__.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +from .filter import * +from .resample import * +from .act import * \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e7c086a019eff89bcbab80a1911bbe3824c7793 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70a72c2a6fe2b7600a0ae70c5bddc68543f7f916 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89ddc0d89af089d72dd97520e57b0b2375063336 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a28bab23adbe73558251485a19ba613c5c4b6df Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b00e6040d42b21f7a84c9c541ab0cec6964ace8f Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a08646fdf31cb36321bedf80b911f6dbfd5209c Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc 
b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a0de1a364c0d5cb89bfa72e4b015dc8652a10f7 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7325ef7be6657739011bc6c157db7986bea065a Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5df92a971fff07fd60e120ba8f0fd2633b1c34fe Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50726ae35e7fc66531f1f1cf9ef5014fa673290b Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa22888731722dd9bda74d239cad5135419a4f6d Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..273d9fb158e2cbdc8f3e4dad85efaa5b15cf7286 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/act.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/act.py new file mode 100644 index 0000000000000000000000000000000000000000..028debd697dd60458aae75010057df038bd3518a --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/act.py @@ -0,0 +1,28 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
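The `Snake` and `SnakeBeta` activations defined in `activations/activations.py` above can be exercised standalone; a minimal sketch, with the import path assumed from this repository's layout:

```python
import torch

from dreamvoice.src.modules.BigVGAN.activations.activations import Snake, SnakeBeta

x = torch.randn(2, 64, 1024)                 # [B, C, T]; one alpha (and beta) per channel

snake = Snake(in_features=64, alpha_logscale=True)
y = snake(x)                                 # x + (1 / alpha) * sin^2(alpha * x)

snakebeta = SnakeBeta(in_features=64, alpha_logscale=True)
z = snakebeta(x)                             # x + (1 / beta) * sin^2(alpha * x)

print(y.shape, z.shape)                      # both torch.Size([2, 64, 1024])
```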
+ +import torch.nn as nn +from .resample import UpSample1d, DownSample1d + + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/filter.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. + filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. 
+ super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/resample.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch.nn as nn +from torch.nn import functional as F +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/env.py b/dreamvoice/src/modules/BigVGAN/env.py new file mode 100644 index 0000000000000000000000000000000000000000..b8be238d4db710c8c9a338d336baea0138f18d1f --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/env.py @@ -0,0 +1,18 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/inference.py b/dreamvoice/src/modules/BigVGAN/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a739344db3ec9ae08560e5477a394cca32d4a6d9 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/inference.py @@ -0,0 +1,36 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from .env import AttrDict +from .utils import MAX_WAV_VALUE +from .models import BigVGAN as Generator +import librosa + + +def load_model(model_path, device='cuda'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = torch.load(model_path, map_location=device) + generator.load_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + diff --git a/dreamvoice/src/modules/BigVGAN/models.py b/dreamvoice/src/modules/BigVGAN/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/models.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/dreamvoice/src/modules/BigVGAN/utils.py b/dreamvoice/src/modules/BigVGAN/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed67f356aef6ce3af01b43d97d8aafb31c57b017 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/utils.py @@ -0,0 +1,81 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt +from scipy.io.wavfile import write + +MAX_WAV_VALUE = 32768.0 + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_spectrogram_clipped(spectrogram, clip_max=2.): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none', vmin=1e-6, vmax=clip_max) + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + +def save_audio(audio, path, sr): + # wav: torch with 1d shape + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + write(path, sr, audio) \ No newline at end of file diff --git 
a/dreamvoice/src/modules/__pycache__/mel.cpython-310.pyc b/dreamvoice/src/modules/__pycache__/mel.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce88fc172e8b0905e89324595156d7887cb217e6 Binary files /dev/null and b/dreamvoice/src/modules/__pycache__/mel.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/__pycache__/mel.cpython-311.pyc b/dreamvoice/src/modules/__pycache__/mel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9d4a199212776cb8616a01bb5f7bdc71f698026 Binary files /dev/null and b/dreamvoice/src/modules/__pycache__/mel.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/__pycache__/mel.cpython-39.pyc b/dreamvoice/src/modules/__pycache__/mel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73fd8d01f9aedd492a6a02bd70109ebe593b2ceb Binary files /dev/null and b/dreamvoice/src/modules/__pycache__/mel.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/mel.py b/dreamvoice/src/modules/mel.py new file mode 100644 index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f --- /dev/null +++ b/dreamvoice/src/modules/mel.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,): + super().__init__() + self.frame_length = frame_length + self.hop_length = hop_length + self.mel = transforms.MelSpectrogram( + sample_rate=sr, + n_fft=frame_length, + win_length=frame_length, + hop_length=hop_length, + center=False, + power=1.0, + norm="slaney", + n_mels=n_mel, + mel_scale="slaney", + f_min=f_min, + f_max=f_max + ) + + @torch.no_grad() + def forward(self, x, target_length=None): + x = F.pad(x, ((self.frame_length - self.hop_length) // 2, + (self.frame_length - self.hop_length) // 2), "reflect") + mel = self.mel(x) + + target_length = mel.shape[-1] if target_length is None else target_length + logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device) + logmel[:, :, :mel.shape[2]] = mel + + logmel = torch.log(torch.clamp(logmel, min=1e-5)) + return logmel \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/.ipynb_checkpoints/LICENSE-checkpoint b/dreamvoice/src/modules/speaker_encoder/.ipynb_checkpoints/LICENSE-checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/.ipynb_checkpoints/LICENSE-checkpoint @@ -0,0 +1,24 @@ +MIT License + +Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Original work Copyright (c) 2015 braindead (https://github.com/braindead) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in 
all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/src/modules/speaker_encoder/LICENSE b/dreamvoice/src/modules/speaker_encoder/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/LICENSE @@ -0,0 +1,24 @@ +MIT License + +Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Original work Copyright (c) 2015 braindead (https://github.com/braindead) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/src/modules/speaker_encoder/README.md b/dreamvoice/src/modules/speaker_encoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95663cf5b29be905a8422176f661a8f7745b5cb0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/README.md @@ -0,0 +1,64 @@ +# Real-Time Voice Cloning +This repository is an implementation of [Transfer Learning from Speaker Verification to +Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). + +SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. 
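
The first of these stages is the speaker encoder vendored under `encoder/` in this tree. Below is a minimal, hypothetical sketch of computing a voice embedding with it, based only on the `load_model`, `preprocess_wav`, and `embed_utterance` functions that appear later in this diff; the checkpoint path and the import root are placeholders and will differ depending on where the package sits in your project (here it lives under `dreamvoice/src/modules/speaker_encoder/`).

```python
# Hypothetical usage sketch for stage one (speaker encoder). Paths and the
# import root are assumptions; adjust them to your local package layout.
from pathlib import Path

from encoder import inference as encoder      # e.g. speaker_encoder.encoder in this repo
from encoder.audio import preprocess_wav

# Load the GE2E encoder weights once; outputs are returned on the CPU.
encoder.load_model(Path("encoder/saved_models/pretrained.pt"), device="cpu")

# Resample, normalize volume, and trim long silences, then embed the utterance.
wav = preprocess_wav("some_utterance.wav")
embed = encoder.embed_utterance(wav)          # unit-norm float32 array, shape (model_embedding_size,)
print(embed.shape)
```

This embedding is what the later stages condition on: by design it should capture speaker identity while being largely independent of the words spoken.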
+ +**Video demonstration** (click the picture): + +[![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) + + + +### Papers implemented +| URL | Designation | Title | Implementation source | +| --- | ----------- | ----- | --------------------- | +|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | +|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | +|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) +|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | + +## News +**10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. + +**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. + +**14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. + +**13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: +- **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. +- **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. + +**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. + + +## Setup + +### 1. Install Requirements +1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. +2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. +3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. +4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. +5. Install the remaining requirements with `pip install -r requirements.txt` + +### 2. (Optional) Download Pretrained Models +Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). + +### 3. 
(Optional) Test Configuration +Before you download any dataset, you can begin by testing your configuration with: + +`python demo_cli.py` + +If all tests pass, you're good to go. + +### 4. (Optional) Download Datasets +For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. + +### 5. Launch the Toolbox +You can then try the toolbox: + +`python demo_toolbox.py -d ` +or +`python demo_toolbox.py` + +depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/inference-checkpoint.py b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/inference-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/inference-checkpoint.py @@ -0,0 +1,211 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_data import * +from .model import SpeakerEncoder +from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram +from matplotlib import cm +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device="cpu"): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath, map_location="cpu") + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + _model = _model.to(device) + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +@torch.no_grad() +def embed_frames_batch(frames, use_torch=False): + if _model is None: + raise Exception("Model was not loaded. 
Call load_model() before inference.") + + if not use_torch: + frames = torch.from_numpy(frames) + frames = frames.to(_device) + + embeds = _model.forward(frames) + if not use_torch: + embeds = embeds.detach().cpu().numpy() + return embeds + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. + """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +@torch.no_grad() +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. 
+ :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. + """ + # Process the entire utterance if not using partials + if not using_partials: + frames = wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +@torch.no_grad() +def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): + # This torch version is designed to cope with a batch of same lengths wavs + if not using_partials: + frames = wav_to_mel_spectrogram_batch(wavs) + embeds = embed_frames_batch(frames) + if return_partials: + return embeds, None, None + return embeds + + wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= wavs.shape[-1]: + wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), + dtype=wavs.dtype, device=wavs.device)], 1) + + frames = wav_to_mel_spectrogram_batch(wavs) + frames_batch = [] + for i in range(len(frames)): + frames_batch += [frames[i][s] for s in mel_slices] + frames_batch = torch.stack(frames_batch, 0) + partial_embeds = embed_frames_batch(frames_batch, use_torch=True) + partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) + + raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) + embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) + + if return_partials: + return embeds, partial_embeds, wave_slices + return embeds + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/preprocess-checkpoint.py b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/preprocess-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/preprocess-checkpoint.py @@ 
-0,0 +1,177 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from multiprocess.pool import ThreadPool +from .params_data import * +from .config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. + """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + nationality.lower() in anglophone_nationalites] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__init__.py b/dreamvoice/src/modules/speaker_encoder/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15c3671020065c3fce2c4c2fc7559755bf817801 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19aa499c5843528516f37122c22dd016a7aa7772 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..198619d9f47b4706b813cbe59638c6209227ae86 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22ae43eedbaa103e831a2e6169054b8c37842ef7 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..167e3b4a022ebacbeb73a4db6a4a1cac597bd3b1 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1472b1678eb31f1810967fbdc2f58a8608a9f5d7 Binary files /dev/null and 
b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c398427ab385e5abf0274d9175cb15f6363eeb1c Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b104e5ded94920d3cbc07ae2cbb480807d84b6e5 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3c5c543b64e69f4f112bd06850ad50ebaacef52 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9e03f3b611dcf9d00938c48645fa77c38f1e7d0 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2719668e4e12b38078da2488a71639dd013136d3 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d7b54b8c3e251748d7a2da8e47e0c2997553f86 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6afe779501d6e220de041c7c133b03bf8c5cdb5 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6299e6dc1f67558a447c06b3bd2646015d606859 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-39.pyc 
b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa3c889ecee1a6f96a64896efc41f4652d58bb64 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8457e1f0a4ee2d82defdead8e92253dcee74e86f Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3134901b6849333f838c72b9c6a269fade00ce82 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1737b0fa586b3d5b6691a205c4239e32e1feed55 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/audio.py b/dreamvoice/src/modules/speaker_encoder/encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..de650b972fc7a4f3f8a698c128ee4642a373a6d6 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/audio.py @@ -0,0 +1,157 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from scipy.ndimage.morphology import binary_dilation +from .params_data import * +from pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +import torch +from torchaudio.transforms import Resample +from librosa.filters import mel as librosa_mel_fn + + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. 
+ """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def preprocess_wav_batch(wavs, source_sr=22050): + # This torch version is designed to cope with a batch of same lengths wavs + if sampling_rate != source_sr: + resample = Resample(source_sr, sampling_rate) + wavs = resample(wavs) + wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, + increase_only=True) + # Trimming silence is not implemented in this version yet! + return wavs_preprocessed + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. + """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def wav_to_mel_spectrogram_batch(wavs): + # This torch version is designed to cope with a batch of same lengths wavs + n_fft = int(sampling_rate * mel_window_length / 1000) + hop_length = int(sampling_rate * mel_window_step / 1000) + win_length = int(sampling_rate * mel_window_length / 1000) + window = torch.hann_window(n_fft).to(wavs) + mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, + n_mels=mel_n_channels)).to(wavs) + s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=True, return_complex=False) + real_part, imag_part = s.unbind(-1) + stftm = real_part**2 + imag_part**2 + mels = torch.matmul(mel_basis, stftm) + return torch.transpose(mels, 1, 2) + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) + + +def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): + # This torch version is designed to cope with a batch of same lengths wavs + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) + scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) + if increase_only: + mask = (dBFS_change > 0).to(scales) + elif decrease_only: + mask = (dBFS_change < 0).to(scales) + else: + mask = torch.zeros_like(scales) + scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) + return wavs * scales.unsqueeze(-1) + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
+ + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/config.py b/dreamvoice/src/modules/speaker_encoder/encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1f5aab0d3899c5e5045b40d4cecee1a11d844c --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/config.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/__init__.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9af30b406f2a8debe81a8275cb2682cbd896245a --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/__init__.py @@ -0,0 +1,4 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .speaker_verification_dataset import SpeakerVerificationDataset +from .speaker_verification_dataset import SpeakerVerificationDataLoader diff --git 
a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd5bb005923852327581e2dcaa03fec7dbce5b8 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py @@ -0,0 +1,39 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import random + +class RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d189c835859efefa686d49b53f4e79aa444d96 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. 
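        Example (editorial sketch, not part of the committed file; the directory name is
        hypothetical and n_frames=160 mirrors partials_n_frames from params_data.py):

            speaker = Speaker(Path("encoder_preprocessed/LibriSpeech_train-other-500_1234"))
            partials = speaker.random_partial(count=4, n_frames=160)
            # -> 4 tuples of (Utterance, frames, (start, end)), with frames shaped (160, mel_n_channels)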
+ """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..4080d636338bedcb8d1b8fc77945057027fd0ac1 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py @@ -0,0 +1,14 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +from typing import List +from .speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc31fee9e0d62545caa2599aebc22decfb50aa0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,58 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .speaker_batch import SpeakerBatch +from .speaker import Speaker +from ..params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/utterance.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..2b878c58fd7d70d3ba0b33def66912adc1c1a45d --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/utterance.py @@ -0,0 +1,28 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/inference.py b/dreamvoice/src/modules/speaker_encoder/encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/inference.py @@ -0,0 +1,211 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_data import * +from .model import SpeakerEncoder +from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram +from matplotlib import cm +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device="cpu"): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. 
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath, map_location="cpu") + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + _model = _model.to(device) + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +@torch.no_grad() +def embed_frames_batch(frames, use_torch=False): + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + if not use_torch: + frames = torch.from_numpy(frames) + frames = frames.to(_device) + + embeds = _model.forward(frames) + if not use_torch: + embeds = embeds.detach().cpu().numpy() + return embeds + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
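    Worked example (editorial note, assuming the defaults in params_data.py: sampling_rate=16000,
    mel_window_step=10, partials_n_frames=160, and the default overlap=0.5):

        wav_slices, mel_slices = compute_partial_slices(32000)   # a 2-second utterance at 16 kHz
        # samples_per_frame = 160 and frame_step = 80, so two partials are produced:
        #   mel_slices == [slice(0, 160), slice(80, 240)]
        #   wav_slices == [slice(0, 25600), slice(12800, 38400)]
        # and the caller is expected to pad the waveform up to wav_slices[-1].stop = 38400 samples.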
+ """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +@torch.no_grad() +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +@torch.no_grad() +def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): + # This torch version is designed to cope with a batch of same lengths wavs + if not using_partials: + frames = wav_to_mel_spectrogram_batch(wavs) + embeds = embed_frames_batch(frames) + if return_partials: + return embeds, None, None + return embeds + + wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= wavs.shape[-1]: + wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), + dtype=wavs.dtype, device=wavs.device)], 1) + + frames = wav_to_mel_spectrogram_batch(wavs) + frames_batch = [] + for i in range(len(frames)): + frames_batch += [frames[i][s] for s in mel_slices] + frames_batch = torch.stack(frames_batch, 0) + partial_embeds = embed_frames_batch(frames_batch, use_torch=True) + partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) + + raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) + embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) + + if return_partials: + return embeds, partial_embeds, wave_slices + return embeds + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/model.py b/dreamvoice/src/modules/speaker_encoder/encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8d246bc359ce1ffc6229ba8a4ced24d07b77e703 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/model.py @@ -0,0 +1,137 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_model import * +from .params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + 
self.lstm = nn.LSTM(input_size=mel_n_channels, + hidden_size=model_hidden_size, + num_layers=model_num_layers, + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. + :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. 
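        # (Editorial note, not part of the committed file.) Entry [i, u, j] holds the similarity of
        # utterance u of speaker i to the centroid of speaker j; when i == j the exclusive
        # (leave-one-out) centroid is used, otherwise the inclusive one. The result has shape
        # (speakers_per_batch, utterances_per_speaker, speakers_per_batch).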
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/params_data.py b/dreamvoice/src/modules/speaker_encoder/encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..62d04121aed3d7862889ad6c771055db9b74ab6e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/params_data.py @@ -0,0 +1,30 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
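# (Editorial note, not part of the committed file.) With vad_window_length = 30 ms, this allows
# roughly 6 * 30 = 180 ms of silence to be bridged, and the moving-average width above smooths
# over about 8 * 30 = 240 ms of audio.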
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/params_model.py b/dreamvoice/src/modules/speaker_encoder/encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c535205028bfec75ba7c58ea7e750ba3fff1633 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/params_model.py @@ -0,0 +1,12 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/preprocess.py b/dreamvoice/src/modules/speaker_encoder/encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/preprocess.py @@ -0,0 +1,177 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from multiprocess.pool import ThreadPool +from .params_data import * +from .config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. + """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. + if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + nationality.lower() in anglophone_nationalites] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." 
% + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/train.py b/dreamvoice/src/modules/speaker_encoder/encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..250d038a33b72d09dfe67811c917708aa0ea6714 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/train.py @@ -0,0 +1,127 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .visualizations import Visualizations +from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from .params_model import * +from .model import SpeakerEncoder +from .utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, + utterances_per_speaker, + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." % run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." 
% run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/__init__.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/argutils.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/argutils.py new file mode 100644 index 0000000000000000000000000000000000000000..6de50f3ec61f6b61798299726b13a1caa1638abb --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/argutils.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from pathlib import Path +import numpy as np +import argparse + +_type_priorities = [ # In decreasing order + Path, + str, + int, + float, + bool, +] + +def _priority(o): + p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) + if p is not None: + return p + p = next((i for i, t in 
enumerate(_type_priorities) if isinstance(o, t)), None) + if p is not None: + return p + return len(_type_priorities) + +def print_args(args: argparse.Namespace, parser=None): + args = vars(args) + if parser is None: + priorities = list(map(_priority, args.values())) + else: + all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] + priority = lambda p: all_params.index(p) if p in all_params else len(all_params) + priorities = list(map(priority, args.keys())) + + pad = max(map(len, args.keys())) + 3 + indices = np.lexsort((list(args.keys()), priorities)) + items = list(args.items()) + + print("Arguments:") + for i in indices: + param, value = items[i] + print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) + print("") + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/logmmse.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/logmmse.py new file mode 100644 index 0000000000000000000000000000000000000000..43de43e4c29821df5d20d8303ce491101a041a86 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/logmmse.py @@ -0,0 +1,222 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +import math +from scipy.special import expn +from collections import namedtuple + +NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") + + +def profile_noise(noise, sampling_rate, window_size=0): + """ + Creates a profile of the noise in a given waveform. + + :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. + :param sampling_rate: the sampling rate of the audio + :param window_size: the size of the window the logmmse algorithm operates on. A default value + will be picked if left as 0. + :return: a NoiseProfile object + """ + noise, dtype = to_float(noise) + noise += np.finfo(np.float64).eps + + if window_size == 0: + window_size = int(math.floor(0.02 * sampling_rate)) + + if window_size % 2 == 1: + window_size = window_size + 1 + + perc = 50 + len1 = int(math.floor(window_size * perc / 100)) + len2 = int(window_size - len1) + + win = np.hanning(window_size) + win = win * len2 / np.sum(win) + n_fft = 2 * window_size + + noise_mean = np.zeros(n_fft) + n_frames = len(noise) // window_size + for j in range(0, window_size * n_frames, window_size): + noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) + noise_mu2 = (noise_mean / n_frames) ** 2 + + return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) + + +def denoise(wav, noise_profile: NoiseProfile, eta=0.15): + """ + Cleans the noise from a speech waveform given a noise profile. The waveform must have the + same sampling rate as the one used to create the noise profile. + + :param wav: a speech waveform as a numpy array of floats or ints. + :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of + the same) waveform. + :param eta: voice threshold for noise update. While the voice activation detection value is + below this threshold, the noise profile will be continuously updated throughout the audio. + Set to 0 to disable updating the noise profile. + :return: the clean wav as a numpy array of floats or ints of the same length. 
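    Illustrative usage (editorial sketch, not part of the committed file; both arrays are
    hypothetical waveforms recorded at the same sampling rate):

        profile = profile_noise(noise_only_segment, 16000)
        cleaned = denoise(noisy_speech, profile)   # returns a waveform of the same length as the input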
+ """ + wav, dtype = to_float(wav) + wav += np.finfo(np.float64).eps + p = noise_profile + + nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) + x_final = np.zeros(nframes * p.len2) + + aa = 0.98 + mu = 0.98 + ksi_min = 10 ** (-25 / 10) + + x_old = np.zeros(p.len1) + xk_prev = np.zeros(p.len1) + noise_mu2 = p.noise_mu2 + for k in range(0, nframes * p.len2, p.len2): + insign = p.win * wav[k:k + p.window_size] + + spec = np.fft.fft(insign, p.n_fft, axis=0) + sig = np.absolute(spec) + sig2 = sig ** 2 + + gammak = np.minimum(sig2 / noise_mu2, 40) + + if xk_prev.all() == 0: + ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) + else: + ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) + ksi = np.maximum(ksi_min, ksi) + + log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) + vad_decision = np.sum(log_sigma_k) / p.window_size + if vad_decision < eta: + noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 + + a = ksi / (1 + ksi) + vk = a * gammak + ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) + hw = a * np.exp(ei_vk) + sig = sig * hw + xk_prev = sig ** 2 + xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) + xi_w = np.real(xi_w) + + x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] + x_old = xi_w[p.len1:p.window_size] + + output = from_float(x_final, dtype) + output = np.pad(output, (0, len(wav) - len(output)), mode="constant") + return output + + +## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that +## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of +## webrctvad +# def vad(wav, sampling_rate, eta=0.15, window_size=0): +# """ +# TODO: fix doc +# Creates a profile of the noise in a given waveform. +# +# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. +# :param sampling_rate: the sampling rate of the audio +# :param window_size: the size of the window the logmmse algorithm operates on. A default value +# will be picked if left as 0. +# :param eta: voice threshold for noise update. While the voice activation detection value is +# below this threshold, the noise profile will be continuously updated throughout the audio. +# Set to 0 to disable updating the noise profile. 
+# """ +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# if window_size == 0: +# window_size = int(math.floor(0.02 * sampling_rate)) +# +# if window_size % 2 == 1: +# window_size = window_size + 1 +# +# perc = 50 +# len1 = int(math.floor(window_size * perc / 100)) +# len2 = int(window_size - len1) +# +# win = np.hanning(window_size) +# win = win * len2 / np.sum(win) +# n_fft = 2 * window_size +# +# wav_mean = np.zeros(n_fft) +# n_frames = len(wav) // window_size +# for j in range(0, window_size * n_frames, window_size): +# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) +# noise_mu2 = (wav_mean / n_frames) ** 2 +# +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) +# vad = np.zeros(nframes * len2, dtype=np.bool) +# +# aa = 0.98 +# mu = 0.98 +# ksi_min = 10 ** (-25 / 10) +# +# xk_prev = np.zeros(len1) +# noise_mu2 = noise_mu2 +# for k in range(0, nframes * len2, len2): +# insign = win * wav[k:k + window_size] +# +# spec = np.fft.fft(insign, n_fft, axis=0) +# sig = np.absolute(spec) +# sig2 = sig ** 2 +# +# gammak = np.minimum(sig2 / noise_mu2, 40) +# +# if xk_prev.all() == 0: +# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) +# else: +# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) +# ksi = np.maximum(ksi_min, ksi) +# +# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) +# vad_decision = np.sum(log_sigma_k) / window_size +# if vad_decision < eta: +# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 +# print(vad_decision) +# +# a = ksi / (1 + ksi) +# vk = a * gammak +# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) +# hw = a * np.exp(ei_vk) +# sig = sig * hw +# xk_prev = sig ** 2 +# +# vad[k:k + len2] = vad_decision >= eta +# +# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") +# return vad + + +def to_float(_input): + if _input.dtype == np.float64: + return _input, _input.dtype + elif _input.dtype == np.float32: + return _input.astype(np.float64), _input.dtype + elif _input.dtype == np.uint8: + return (_input - 128) / 128., _input.dtype + elif _input.dtype == np.int16: + return _input / 32768., _input.dtype + elif _input.dtype == np.int32: + return _input / 2147483648., _input.dtype + raise ValueError('Unsupported wave file format') + + +def from_float(_input, dtype): + if dtype == np.float64: + return _input, np.float64 + elif dtype == np.float32: + return _input.astype(np.float32) + elif dtype == np.uint8: + return ((_input * 128) + 128).astype(np.uint8) + elif dtype == np.int16: + return (_input * 32768).astype(np.int16) + elif dtype == np.int32: + print(_input) + return (_input * 2147483648).astype(np.int32) + raise ValueError('Unsupported wave file format') diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/profiler.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f0176f632b58dfde15e31c04e79543b629bd4499 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/profiler.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from time import perf_counter as timer +from collections import OrderedDict +import numpy as np + + +class Profiler: + def __init__(self, summarize_every=5, disabled=False): + self.last_tick = timer() + self.logs = OrderedDict() + self.summarize_every = summarize_every + self.disabled = disabled + + def tick(self, name): + if 
self.disabled: + return + + # Log the time needed to execute that function + if not name in self.logs: + self.logs[name] = [] + if len(self.logs[name]) >= self.summarize_every: + self.summarize() + self.purge_logs() + self.logs[name].append(timer() - self.last_tick) + + self.reset_timer() + + def purge_logs(self): + for name in self.logs: + self.logs[name].clear() + + def reset_timer(self): + self.last_tick = timer() + + def summarize(self): + n = max(map(len, self.logs.values())) + assert n == self.summarize_every + print("\nAverage execution time over %d steps:" % n) + + name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] + pad = max(map(len, name_msgs)) + for name_msg, deltas in zip(name_msgs, self.logs.values()): + print(" %s mean: %4.0fms std: %4.0fms" % + (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) + print("", flush=True) + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/visualizations.py b/dreamvoice/src/modules/speaker_encoder/encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b0ffc1f3c54d85158521cac6d09f05dd21de6d --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/visualizations.py @@ -0,0 +1,180 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + self.losses = [] + self.eers = [] + print("Updating the visualizations every %d steps." % update_every) + + # If visdom is disabled TODO: use a better paradigm for that + self.disabled = disabled + if self.disabled: + return + + # Set the environment name + now = str(datetime.now().strftime("%d-%m %Hh%M")) + if env_name is None: + self.env_name = now + else: + self.env_name = "%s (%s)" % (env_name, now) + + # Connect to visdom and open the corresponding window in the browser + try: + self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) + except ConnectionError: + raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " + "start it.") + # webbrowser.open("http://localhost:8097/env/" + self.env_name) + + # Create the windows + self.loss_win = None + self.eer_win = None + # self.lr_win = None + self.implementation_win = None + self.projection_win = None + self.implementation_string = "" + + def log_params(self): + if self.disabled: + return + from encoder import params_data + from encoder import params_model + param_string = "Model parameters:
" + for param_name in (p for p in dir(params_model) if not p.startswith("__")): + value = getattr(params_model, param_name) + param_string += "\t%s: %s
" % (param_name, value) + param_string += "Data parameters:
" + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + param_string += "\t%s: %s
" % (param_name, value) + self.vis.text(param_string, opts={"title": "Parameters"}) + + def log_dataset(self, dataset: SpeakerVerificationDataset): + if self.disabled: + return + dataset_string = "" + dataset_string += "Speakers: %s\n" % len(dataset.speakers) + dataset_string += "\n" + dataset.get_logs() + dataset_string = dataset_string.replace("\n", "
") + self.vis.text(dataset_string, opts={"title": "Dataset"}) + + def log_implementation(self, params): + if self.disabled: + return + implementation_string = "" + for param, value in params.items(): + implementation_string += "%s: %s\n" % (param, value) + implementation_string = implementation_string.replace("\n", "
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/src/plugin_wrapper.py b/dreamvoice/src/plugin_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5 --- /dev/null +++ b/dreamvoice/src/plugin_wrapper.py @@ -0,0 +1,76 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.p2e_cross import P2E_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class DreamVG(object): + def __init__(self, + config_path='configs/plugin_cross.yaml', + ckpt_path='../ckpts/dreamvc_plugin.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = P2E_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + 
self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.spk_shape = config['model']['unet']['in_channels'] + + @torch.no_grad() + def inference(self, text, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + ): + text, text_mask = text + self.model.eval() + + gen_shape = (1, self.spk_shape) + + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, text, text_mask, train_cfg=False) + output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, text, text_mask, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 1/self.scale, self.shift) + # pred = torch.clip(pred, min=0.0, max=0.5) + return pred \ No newline at end of file diff --git a/dreamvoice/src/train_plugin.py b/dreamvoice/src/train_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/train_vc.py b/dreamvoice/src/train_vc.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py b/dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py b/dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a --- /dev/null +++ b/dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output + + +def save_plot(tensor, savepath): + tensor = tensor.squeeze().cpu() + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor + + +if __name__ == "__main__": + + a = torch.rand(2, 10) + target_len = 15 + + b = align_seq(a, target_len) \ No newline at end of file diff --git a/dreamvoice/src/utils/__init__.py b/dreamvoice/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/dreamvoice/src/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc b/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..405d01af1c119ddc6e4d9d75f19f83ffd18ade03 Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc b/dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16806b791a4fd0955747481eb9aeae12108cec3a Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc b/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36df2ec1198ddc4695a17a082bc6340e8e7d4fe8 Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc differ diff --git a/dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc b/dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ebbb3f58af0e9432e9c295fec282ecbe4f78f90f Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc differ diff --git a/dreamvoice/src/utils/utils.py b/dreamvoice/src/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a --- /dev/null +++ b/dreamvoice/src/utils/utils.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output + + +def save_plot(tensor, savepath): + tensor = tensor.squeeze().cpu() + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor + + +if __name__ == "__main__": + + a = torch.rand(2, 10) + target_len = 15 + + b = align_seq(a, target_len) \ No newline at end of file diff --git a/dreamvoice/src/vc_wrapper.py b/dreamvoice/src/vc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3 --- /dev/null +++ b/dreamvoice/src/vc_wrapper.py @@ -0,0 +1,144 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.model import DiffVC +from .model.model_cross import DiffVC_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class ReDiffVC(object): + def __init__(self, + config_path='configs/diffvc_base.yaml', + ckpt_path='../ckpts/dreamvc_base.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + 
self.model = DiffVC(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + spk_embed, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + + +class DreamVC(object): + def __init__(self, + config_path='configs/diffvc_cross.yaml', + ckpt_path='../ckpts/dreamvc_cross.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + text, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + text, text_mask = text + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, 
device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + diff --git a/example.py b/example.py new file mode 100644 index 0000000000000000000000000000000000000000..522acf6d173f802bc1bebb0aee107658d08508a1 --- /dev/null +++ b/example.py @@ -0,0 +1,47 @@ +from dreamvoice import DreamVoice + +# Plugin mode (DreamVG + ReDiffVC) +# Initialize DreamVoice in plugin mode with CUDA device +dreamvoice = DreamVoice(mode='plugin', device='cuda') +# Description of the target voice +prompt = 'young female voice, sounds young and cute' +# Provide the path to the content audio and generate the converted audio +gen_audio, sr = dreamvoice.genvc('examples/test1.wav', prompt) +# Save the converted audio +dreamvoice.save_audio('gen1.wav', gen_audio, sr) + +# Save the speaker embedding if you like the generated voice +dreamvoice.save_spk_embed('voice_stash1.pt') +# Load the saved speaker embedding +dreamvoice.load_spk_embed('voice_stash1.pt') +# Use the saved speaker embedding for another audio sample +gen_audio2, sr = dreamvoice.simplevc('examples/test2.wav', use_spk_cache=True) +dreamvoice.save_audio('gen2.wav', gen_audio2, sr) + + +# End-to-end mode (DreamVC) +# Initialize DreamVoice in end-to-end mode with CUDA device +dreamvoice = DreamVoice(mode='end2end', device='cuda') +# Provide the path to the content audio and generate the converted audio +gen_end2end, sr = dreamvoice.genvc('examples/test1.wav', prompt) +# Save the converted audio +dreamvoice.save_audio('gen_end2end.wav', gen_end2end, sr) + +# Note: End-to-end mode does not support saving speaker embeddings +# To use a voice generated in end-to-end mode, switch back to plugin mode +# and extract the speaker embedding from the generated audio +# Switch back to plugin mode +dreamvoice = DreamVoice(mode='plugin', device='cuda') +# Load the speaker audio from the previously generated file +gen_end2end2, sr = dreamvoice.simplevc('examples/test2.wav', speaker_audio='gen_end2end.wav') +# Save the new converted audio +dreamvoice.save_audio('gen_end2end2.wav', gen_end2end2, sr) + + +# Traditional VC +# Plugin mode can be used for traditional one-shot voice conversion +dreamvoice = DreamVoice(mode='plugin', device='cuda') +# Generate audio using traditional one-shot voice conversion +gen_tradition, sr = dreamvoice.simplevc('examples/test1.wav', speaker_audio='examples/speaker.wav') +# Save the converted audio +dreamvoice.save_audio('gen_tradition.wav', gen_tradition, sr) diff --git a/examples/speaker.wav b/examples/speaker.wav new file mode 100644 index 0000000000000000000000000000000000000000..8e9057806844aca84ae455b519d4f3e836b2c6d9 Binary files /dev/null and b/examples/speaker.wav differ diff --git 
a/examples/test1.wav b/examples/test1.wav new file mode 100644 index 0000000000000000000000000000000000000000..1384ae82d0794281542b46ed638f4dd17004df46 Binary files /dev/null and b/examples/test1.wav differ diff --git a/examples/test2.wav b/examples/test2.wav new file mode 100644 index 0000000000000000000000000000000000000000..3a47e034433c59b33fa5fae82ed15dfd9b71ae99 Binary files /dev/null and b/examples/test2.wav differ
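
For reference, a minimal standalone sketch of the classifier-free guidance step that DreamVG, ReDiffVC, and DreamVC all share, paired with the rescaling formula from dreamvoice/src/utils/utils.py. This is an illustrative sketch only: the tensor shape and the random tensors standing in for the conditional and unconditional model outputs are assumptions for demonstration, not values taken from the repo's configs or checkpoints.

# Illustrative sketch (assumptions: random stand-in tensors, arbitrary shape).
import torch


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    # Same formula as dreamvoice/src/utils/utils.py (arXiv:2305.08891, Sec. 3.4):
    # rescale the guided prediction toward the std of the conditional prediction.
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg


# Placeholder model outputs; in the wrappers these come from the conditional and
# unconditional (cfg) forward passes of the diffusion model at each DDIM step.
output_text = torch.randn(1, 1, 100, 500)
output_uncond = torch.randn(1, 1, 100, 500)

guidance_scale, guidance_rescale = 3, 0.7
# Classifier-free guidance combination used in plugin_wrapper.py / vc_wrapper.py.
output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
if guidance_rescale > 0.0:
    output_pred = rescale_noise_cfg(output_pred, output_text, guidance_rescale=guidance_rescale)
print(output_pred.shape)  # torch.Size([1, 1, 100, 500])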