diff --git a/dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py b/dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..135006046929ad3d8c385cec975f5854e881e01d --- /dev/null +++ b/dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py @@ -0,0 +1 @@ +from .api import DreamVoice \ No newline at end of file diff --git a/dreamvoice/.ipynb_checkpoints/api-checkpoint.py b/dreamvoice/.ipynb_checkpoints/api-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c373aa0089b79a2400b0cae867d31f7946e5b7 --- /dev/null +++ b/dreamvoice/.ipynb_checkpoints/api-checkpoint.py @@ -0,0 +1,295 @@ +import os +import requests +import yaml +import torch +import librosa +import numpy as np +import soundfile as sf +from pathlib import Path +from transformers import T5Tokenizer, T5EncoderModel +from tqdm import tqdm +from .src.vc_wrapper import ReDiffVC, DreamVC +from .src.plugin_wrapper import DreamVG +from .src.modules.speaker_encoder.encoder import inference as spk_encoder +from .src.modules.BigVGAN.inference import load_model as load_vocoder +from .src.feats.contentvec_hf import get_content_model, get_content + + +class DreamVoice: + def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16): + # Initial setup + script_dir = Path(__file__).resolve().parent + config_path = script_dir / config + + # Load configuration file + with open(config_path, 'r') as fp: + self.config = yaml.safe_load(fp) + + self.script_dir = script_dir + + # Ensure all checkpoints are downloaded + self._ensure_checkpoints_exist() + + # Initialize attributes + self.device = device + self.sr = self.config['sample_rate'] + + # Load vocoder + vocoder_path = script_dir / self.config['vocoder_path'] + self.hifigan, _ = load_vocoder(vocoder_path, device) + self.hifigan.eval() + + # Load content model + self.content_model = get_content_model().to(device) + + # Load tokenizer and text encoder + lm_path = self.config['lm_path'] + self.tokenizer = T5Tokenizer.from_pretrained(lm_path) + self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + + # Set mode + self.mode = mode + if mode == 'plugin': + self._init_plugin_mode() + elif mode == 'end2end': + self._init_end2end_mode() + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + # chunk inputs to 10s clips + self.chunk_size = chunk_size * 50 + + def _ensure_checkpoints_exist(self): + checkpoints = [ + ('vocoder_path', self.config.get('vocoder_url')), + ('vocoder_config_path', self.config.get('vocoder_config_url')), + ('speaker_path', self.config.get('speaker_url')), + ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')), + ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')), + ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')) + ] + + for path_key, url in checkpoints: + local_path = self._get_local_path(path_key) + if not local_path.exists() and url: + print(f"Downloading {path_key} from {url}") + self._download_file(url, local_path) + + def _get_local_path(self, path_key): + keys = path_key.split('.') + local_path = self.config + for key in keys: + local_path = local_path.get(key, {}) + return self.script_dir / local_path + + def _download_file(self, url, local_path): + try: + # Attempt to send a GET request to the URL + response = requests.get(url, stream=True) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except 
requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered: {e}") + + # Development mode: prompt user for Hugging Face API key + user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.") + self.hf_key = user_input if user_input else None + + # Set headers if an API key is provided + headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {} + + try: + # Attempt to send a GET request with headers in development mode + response = requests.get(url, stream=True, headers=headers) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered in dev mode: {e}") + response = None # Handle response accordingly in your code + + local_path.parent.mkdir(parents=True, exist_ok=True) + + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + t = tqdm(total=total_size, unit='iB', unit_scale=True) + + with open(local_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=block_size): + t.update(len(chunk)) + f.write(chunk) + t.close() + + def _init_plugin_mode(self): + # Initialize ReDiffVC + self.dreamvc = ReDiffVC( + config_path=self.script_dir / self.config['rediffvc']['config_path'], + ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'], + device=self.device + ) + + # Initialize DreamVG + self.dreamvg = DreamVG( + config_path=self.script_dir / self.config['dreamvg']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'], + device=self.device + ) + + # Load speaker encoder + spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device) + self.spk_encoder = spk_encoder + self.spk_embed_cache = None + + def _init_end2end_mode(self): + # Initialize DreamVC + self.dreamvc = DreamVC( + config_path=self.script_dir / self.config['dreamvc']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'], + device=self.device + ) + + def _load_content(self, audio_path): + content_audio, _ = librosa.load(audio_path, sr=16000) + # Calculate the required length to make it a multiple of 16*160 + target_length = ((len(content_audio) + 16*160 - 1) // (16*160)) * (16*160) + # Pad with zeros if necessary + if len(content_audio) < target_length: + content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant') + content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device) + content_clip = get_content(self.content_model, content_audio) + return content_clip + + def load_spk_embed(self, emb_path): + self.spk_embed_cache = torch.load(emb_path, map_location=self.device) + + def save_spk_embed(self, emb_path): + assert self.spk_embed_cache is not None + torch.save(self.spk_embed_cache.cpu(), emb_path) + + def save_audio(self, output_path, audio, sr): + sf.write(output_path, audio, samplerate=sr) + + @torch.no_grad() + def genvc(self, content_audio, prompt, + prompt_guidance_scale=3, prompt_guidance_rescale=0.0, + prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + content_clip = self._load_content(content_audio) + + text_batch = self.tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, text_mask = text_batch.input_ids.to(self.device), \ + 
text_batch.attention_mask.to(self.device) + text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0] + + if self.mode == 'plugin': + spk_embed = self.dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, eta=prompt_eta, + random_seed=prompt_random_seed) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + self.spk_embed_cache = spk_embed + + elif self.mode == 'end2end': + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference([text, text_mask], content_clip, + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, + eta=prompt_eta, random_seed=prompt_random_seed) + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + @torch.no_grad() + def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + assert self.mode == 'plugin' + if speaker_audio is not None: + speaker_audio, _ = librosa.load(speaker_audio, sr=16000) + speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device) + spk_embed = spk_encoder.embed_utterance_batch(speaker_audio) + self.spk_embed_cache = spk_embed + elif use_spk_cache: + assert self.spk_embed_cache is not None + spk_embed = self.spk_embed_cache + else: + raise NotImplementedError + + content_clip = self._load_content(content_audio) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + +if __name__ == '__main__': + dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda') + content_audio = 'test.wav' + speaker_audio = 'speaker.wav' + prompt = 'young female voice, 
sounds young and cute' + gen_audio, sr = dreamvoice.genvc('test.wav', prompt) + dreamvoice.save_audio('debug.wav', gen_audio, sr) \ No newline at end of file diff --git a/dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml b/dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99c2360c57f57b92ca9885db6100c787bc245d02 --- /dev/null +++ b/dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml @@ -0,0 +1,26 @@ +version: 1.0 + +sample_rate: 24000 +vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt' +vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt' +vocoder_config_path: 'ckpts/bigvgan_24k/config.json' +vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json' + +speaker_path: 'ckpts/spk_encoder/pretrained.pt' +speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt' +lm_path: 'google/flan-t5-base' + +dreamvc: + config_path: 'src/configs/diffvc_cross.yaml' + ckpt_path: 'ckpts/dreamvc_cross.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt' + +rediffvc: + config_path: 'src/configs/diffvc_base.yaml' + ckpt_path: 'ckpts/dreamvc_base.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt' + +dreamvg: + config_path: 'src/configs/plugin_cross.yaml' + ckpt_path: 'ckpts/dreamvc_plugin.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt' diff --git a/dreamvoice/__init__.py b/dreamvoice/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..135006046929ad3d8c385cec975f5854e881e01d --- /dev/null +++ b/dreamvoice/__init__.py @@ -0,0 +1 @@ +from .api import DreamVoice \ No newline at end of file diff --git a/dreamvoice/__pycache__/__init__.cpython-310.pyc b/dreamvoice/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7c5475eb77d8de07fe11c9d5d25465f28e31306 Binary files /dev/null and b/dreamvoice/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/__pycache__/__init__.cpython-311.pyc b/dreamvoice/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8526b942e2021f0cd0d020249790f61cdbf79394 Binary files /dev/null and b/dreamvoice/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/__pycache__/api.cpython-310.pyc b/dreamvoice/__pycache__/api.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3be2021bd271cb57cc65b0e990d8d0a074b6b81a Binary files /dev/null and b/dreamvoice/__pycache__/api.cpython-310.pyc differ diff --git a/dreamvoice/__pycache__/api.cpython-311.pyc b/dreamvoice/__pycache__/api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3bb6b037f07a4ed4e99f40d26b2481e300591f2 Binary files /dev/null and b/dreamvoice/__pycache__/api.cpython-311.pyc differ diff --git a/dreamvoice/api.py b/dreamvoice/api.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c373aa0089b79a2400b0cae867d31f7946e5b7 --- /dev/null +++ b/dreamvoice/api.py @@ -0,0 +1,295 @@ +import os +import requests +import yaml +import torch +import librosa +import numpy as np +import soundfile as sf +from pathlib import Path +from transformers import T5Tokenizer, 
T5EncoderModel +from tqdm import tqdm +from .src.vc_wrapper import ReDiffVC, DreamVC +from .src.plugin_wrapper import DreamVG +from .src.modules.speaker_encoder.encoder import inference as spk_encoder +from .src.modules.BigVGAN.inference import load_model as load_vocoder +from .src.feats.contentvec_hf import get_content_model, get_content + + +class DreamVoice: + def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16): + # Initial setup + script_dir = Path(__file__).resolve().parent + config_path = script_dir / config + + # Load configuration file + with open(config_path, 'r') as fp: + self.config = yaml.safe_load(fp) + + self.script_dir = script_dir + + # Ensure all checkpoints are downloaded + self._ensure_checkpoints_exist() + + # Initialize attributes + self.device = device + self.sr = self.config['sample_rate'] + + # Load vocoder + vocoder_path = script_dir / self.config['vocoder_path'] + self.hifigan, _ = load_vocoder(vocoder_path, device) + self.hifigan.eval() + + # Load content model + self.content_model = get_content_model().to(device) + + # Load tokenizer and text encoder + lm_path = self.config['lm_path'] + self.tokenizer = T5Tokenizer.from_pretrained(lm_path) + self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + + # Set mode + self.mode = mode + if mode == 'plugin': + self._init_plugin_mode() + elif mode == 'end2end': + self._init_end2end_mode() + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + # chunk inputs to 10s clips + self.chunk_size = chunk_size * 50 + + def _ensure_checkpoints_exist(self): + checkpoints = [ + ('vocoder_path', self.config.get('vocoder_url')), + ('vocoder_config_path', self.config.get('vocoder_config_url')), + ('speaker_path', self.config.get('speaker_url')), + ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')), + ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')), + ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url')) + ] + + for path_key, url in checkpoints: + local_path = self._get_local_path(path_key) + if not local_path.exists() and url: + print(f"Downloading {path_key} from {url}") + self._download_file(url, local_path) + + def _get_local_path(self, path_key): + keys = path_key.split('.') + local_path = self.config + for key in keys: + local_path = local_path.get(key, {}) + return self.script_dir / local_path + + def _download_file(self, url, local_path): + try: + # Attempt to send a GET request to the URL + response = requests.get(url, stream=True) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered: {e}") + + # Development mode: prompt user for Hugging Face API key + user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key.") + self.hf_key = user_input if user_input else None + + # Set headers if an API key is provided + headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {} + + try: + # Attempt to send a GET request with headers in development mode + response = requests.get(url, stream=True, headers=headers) + response.raise_for_status() # Ensure we raise an exception for HTTP errors + except requests.exceptions.RequestException as e: + # Log the error for debugging purposes + print(f"Error encountered in dev mode: {e}") + response = None # Handle response accordingly in your code + + 
local_path.parent.mkdir(parents=True, exist_ok=True) + + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + t = tqdm(total=total_size, unit='iB', unit_scale=True) + + with open(local_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=block_size): + t.update(len(chunk)) + f.write(chunk) + t.close() + + def _init_plugin_mode(self): + # Initialize ReDiffVC + self.dreamvc = ReDiffVC( + config_path=self.script_dir / self.config['rediffvc']['config_path'], + ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'], + device=self.device + ) + + # Initialize DreamVG + self.dreamvg = DreamVG( + config_path=self.script_dir / self.config['dreamvg']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'], + device=self.device + ) + + # Load speaker encoder + spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device) + self.spk_encoder = spk_encoder + self.spk_embed_cache = None + + def _init_end2end_mode(self): + # Initialize DreamVC + self.dreamvc = DreamVC( + config_path=self.script_dir / self.config['dreamvc']['config_path'], + ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'], + device=self.device + ) + + def _load_content(self, audio_path): + content_audio, _ = librosa.load(audio_path, sr=16000) + # Calculate the required length to make it a multiple of 16*160 + target_length = ((len(content_audio) + 16*160 - 1) // (16*160)) * (16*160) + # Pad with zeros if necessary + if len(content_audio) < target_length: + content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant') + content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device) + content_clip = get_content(self.content_model, content_audio) + return content_clip + + def load_spk_embed(self, emb_path): + self.spk_embed_cache = torch.load(emb_path, map_location=self.device) + + def save_spk_embed(self, emb_path): + assert self.spk_embed_cache is not None + torch.save(self.spk_embed_cache.cpu(), emb_path) + + def save_audio(self, output_path, audio, sr): + sf.write(output_path, audio, samplerate=sr) + + @torch.no_grad() + def genvc(self, content_audio, prompt, + prompt_guidance_scale=3, prompt_guidance_rescale=0.0, + prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + content_clip = self._load_content(content_audio) + + text_batch = self.tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, text_mask = text_batch.input_ids.to(self.device), \ + text_batch.attention_mask.to(self.device) + text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0] + + if self.mode == 'plugin': + spk_embed = self.dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, eta=prompt_eta, + random_seed=prompt_random_seed) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + 
gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + self.spk_embed_cache = spk_embed + + elif self.mode == 'end2end': + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference([text, text_mask], content_clip, + guidance_scale=prompt_guidance_scale, + guidance_rescale=prompt_guidance_rescale, + ddim_steps=prompt_ddim_steps, + eta=prompt_eta, random_seed=prompt_random_seed) + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + else: + raise NotImplementedError("Select mode from 'plugin' and 'end2end'") + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + @torch.no_grad() + def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False, + vc_guidance_scale=3, vc_guidance_rescale=0.7, + vc_ddim_steps=50, vc_eta=1, vc_random_seed=None, + ): + + assert self.mode == 'plugin' + if speaker_audio is not None: + speaker_audio, _ = librosa.load(speaker_audio, sr=16000) + speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device) + spk_embed = spk_encoder.embed_utterance_batch(speaker_audio) + self.spk_embed_cache = spk_embed + elif use_spk_cache: + assert self.spk_embed_cache is not None + spk_embed = self.spk_embed_cache + else: + raise NotImplementedError + + content_clip = self._load_content(content_audio) + + B, L, D = content_clip.shape + gen_audio_chunks = [] + num_chunks = (L + self.chunk_size - 1) // self.chunk_size + for i in range(num_chunks): + start_idx = i * self.chunk_size + end_idx = min((i + 1) * self.chunk_size, L) + content_clip_chunk = content_clip[:, start_idx:end_idx, :] + + gen_audio_chunk = self.dreamvc.inference( + spk_embed, content_clip_chunk, None, + guidance_scale=vc_guidance_scale, + guidance_rescale=vc_guidance_rescale, + ddim_steps=vc_ddim_steps, + eta=vc_eta, + random_seed=vc_random_seed) + + gen_audio_chunks.append(gen_audio_chunk) + + gen_audio = torch.cat(gen_audio_chunks, dim=-1) + + gen_audio = self.hifigan(gen_audio.squeeze(1)) + gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0) + + return gen_audio, self.sr + + +if __name__ == '__main__': + dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda') + content_audio = 'test.wav' + speaker_audio = 'speaker.wav' + prompt = 'young female voice, sounds young and cute' + gen_audio, sr = dreamvoice.genvc('test.wav', prompt) + dreamvoice.save_audio('debug.wav', gen_audio, sr) \ No newline at end of file diff --git a/dreamvoice/ckpts/bigvgan_24k/config.json b/dreamvoice/ckpts/bigvgan_24k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b675d4137fc8072bfb1c436d8f040a5d15e6a1 --- /dev/null +++ b/dreamvoice/ckpts/bigvgan_24k/config.json @@ -0,0 +1,44 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 32, + "learning_rate": 0.0001, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [10,6,4,2], + "upsample_kernel_sizes": [20,12,8,4], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "activation": "snakebeta", + "snake_logscale": true, + + "resolutions": [[1024, 
120, 600], [2048, 240, 1200], [512, 50, 240]], + "mpd_reshapes": [2, 3, 5, 7, 11], + "use_spectral_norm": false, + "discriminator_channel_mult": 1, + + "segment_size": 12000, + "num_mels": 128, + "n_fft": 1920, + "hop_size": 480, + "win_size": 1920, + + "sampling_rate": 24000, + + "fmin": 0, + "fmax": 12000, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt b/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09d182d11b6c54fdc56dc4244677f6a64aee32f --- /dev/null +++ b/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683a7baafedda8ec2fd2409deff61bd58ae66fbf10630550a17fcfed6f728977 +size 58405452 diff --git a/dreamvoice/ckpts/dreamvc_base.pt b/dreamvoice/ckpts/dreamvc_base.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef704709ef4ec22ee7d26c10da40df5a2d3c6fe3 --- /dev/null +++ b/dreamvoice/ckpts/dreamvc_base.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5abe034bf590e2ce0405c66e950dc61f041629731e959cb09e2009688cd1254c +size 300117179 diff --git a/dreamvoice/ckpts/dreamvc_cross.pt b/dreamvoice/ckpts/dreamvc_cross.pt new file mode 100644 index 0000000000000000000000000000000000000000..086c5e76e9fe898d80529c9fb066365d1e8433c4 --- /dev/null +++ b/dreamvoice/ckpts/dreamvc_cross.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b4eb1e62b1bf4e157edc2766b9b4461c0be0f7d98a970d6b087f3797c35920 +size 451974443 diff --git a/dreamvoice/ckpts/dreamvc_plugin.pt b/dreamvoice/ckpts/dreamvc_plugin.pt new file mode 100644 index 0000000000000000000000000000000000000000..7beb2c8aa484bd12d78adf11dfb23a2138c77e06 --- /dev/null +++ b/dreamvoice/ckpts/dreamvc_plugin.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2396f6b96e9057e73e20eee173d7aaded6b5eb70745a9f5282999c0ea9a4d848 +size 104892440 diff --git a/dreamvoice/ckpts/spk_encoder/pretrained.pt b/dreamvoice/ckpts/spk_encoder/pretrained.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cd2e41ea79e4aeb8414c7ed9993c42ab5b0dc28 --- /dev/null +++ b/dreamvoice/ckpts/spk_encoder/pretrained.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e +size 17090379 diff --git a/dreamvoice/dreamvc.yaml b/dreamvoice/dreamvc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99c2360c57f57b92ca9885db6100c787bc245d02 --- /dev/null +++ b/dreamvoice/dreamvc.yaml @@ -0,0 +1,26 @@ +version: 1.0 + +sample_rate: 24000 +vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt' +vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt' +vocoder_config_path: 'ckpts/bigvgan_24k/config.json' +vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json' + +speaker_path: 'ckpts/spk_encoder/pretrained.pt' +speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt' +lm_path: 'google/flan-t5-base' + +dreamvc: + config_path: 'src/configs/diffvc_cross.yaml' + ckpt_path: 'ckpts/dreamvc_cross.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt' + +rediffvc: + 
config_path: 'src/configs/diffvc_base.yaml' + ckpt_path: 'ckpts/dreamvc_base.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt' + +dreamvg: + config_path: 'src/configs/plugin_cross.yaml' + ckpt_path: 'ckpts/dreamvc_plugin.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt' diff --git a/dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3 --- /dev/null +++ b/dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import numpy as np +import soundfile as sf +import pandas as pd +# from feats.hubert_model import get_soft_model, get_hubert_soft_content +from feats.contentvec_hf import get_content_model, get_content +# from modules.speaker_encoder.encoder import inference as spk_encoder +# from pathlib import Path +from tqdm import tqdm +from multiprocessing import Process +import pyworld as pw + + +def resample_save(infolder, audio_path, model, + audio_sr=24000, content_sr=16000, min_length=1.92, + content_resolution=50, + save_path='features'): + if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False: + audio, sr = librosa.load(infolder + audio_path, sr=content_sr) + final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution) + # final_length = final_length / content_sr + + length = max(round(min_length*content_sr), round(final_length)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + + # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = get_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = content.cpu() + os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True) + torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt') + # print(audio_save.shape) + # print(content.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr)) + # print(save_path + '/' + 'audio_16k/' + audio_path) + + audio, sr = librosa.load(infolder + audio_path, sr=audio_sr) + length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + # print(audio_save.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr)) + + +def extract_f0(in_folder, audio_path, save_path): + audio, sr = librosa.load(in_folder + audio_path, sr=None) + assert sr == 16000 + if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False: + # wav = audio + # wav = np.pad(wav, int((1024-320)/2), mode='reflect') + # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr, + # fmin=librosa.note_to_hz('C2'), + # fmax=librosa.note_to_hz('C6')) + + _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000) + f0 = pw.stonemask(audio.astype(np.float64), _f0, t, 
sr)[:-1] + + f0 = np.nan_to_num(f0) + os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True) + # print(save_path + '/' + 'f0/' + audio_path + '.pt') + torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt') + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0_main(in_folder, audio_paths, save_path): + for audio_path in tqdm(audio_paths): + extract_f0(in_folder, audio_path, save_path) + + +if __name__ == '__main__': + df = pd.read_csv('../test_data/vc_meta.csv') + # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda') + model = get_content_model().to('cuda') + # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda") + for i in tqdm(range(len(df))): + row = df.iloc[i] + in_path = row['path'] + resample_save('../test_data/', in_path, model, save_path='../features/') + + in_folder = '../features/audio_16k/' + audio_files = list(df['path']) + save_path = '../features/' + cores = 6 + + subsets = chunks(audio_files, cores) + + for subset in subsets: + t = Process(target=extract_f0_main, args=(in_folder, subset, save_path)) + t.start() \ No newline at end of file diff --git a/dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5 --- /dev/null +++ b/dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py @@ -0,0 +1,76 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.p2e_cross import P2E_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class DreamVG(object): + def __init__(self, + config_path='configs/plugin_cross.yaml', + ckpt_path='../ckpts/dreamvc_plugin.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = P2E_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.spk_shape = config['model']['unet']['in_channels'] + + @torch.no_grad() + def inference(self, text, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + ): + text, text_mask = text + self.model.eval() + + gen_shape = (1, self.spk_shape) + + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, text, text_mask, train_cfg=False) + output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * 
(output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, text, text_mask, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 1/self.scale, self.shift) + # pred = torch.clip(pred, min=0.0, max=0.5) + return pred \ No newline at end of file diff --git a/dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py b/dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3 --- /dev/null +++ b/dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py @@ -0,0 +1,144 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.model import DiffVC +from .model.model_cross import DiffVC_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class ReDiffVC(object): + def __init__(self, + config_path='configs/diffvc_base.yaml', + ckpt_path='../ckpts/dreamvc_base.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + spk_embed, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred 
= rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + + +class DreamVC(object): + def __init__(self, + config_path='configs/diffvc_cross.yaml', + ckpt_path='../ckpts/dreamvc_cross.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + text, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + text, text_mask = text + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + diff --git a/dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56d7a532c84d2b4f4d4b2182f7ed813efff02fb7 Binary files /dev/null and b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc differ diff --git a/dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22b2da5da072649c7c6e7e6e25ba4b7f8c898f6b Binary files /dev/null and b/dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc differ diff --git a/dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc 
b/dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ba54c792a3a21ca2aa5e6e12d61ae46be9d98ca Binary files /dev/null and b/dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc differ diff --git a/dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc b/dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de887c0bf0026d809491b43e186c534dfe060130 Binary files /dev/null and b/dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc differ diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23 --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml @@ -0,0 +1,47 @@ +version: 1.0 + +system: "base" + +model: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..582dd72f52a61b1a268e45c0af0d55a2d730e551 --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml @@ -0,0 +1,34 @@ +version: 1.0 + +system: "base" + +diffwrap: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [128, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' \ No newline at end of file diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66 --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml @@ -0,0 +1,45 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + 
"UpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml @@ -0,0 +1,33 @@ +version: 1.0 + +system: "cross" + +diffwrap: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [100, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + \ No newline at end of file diff --git a/dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml b/dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e --- /dev/null +++ b/dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml @@ -0,0 +1,39 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + + unet: + sample_size: [1, 1] + in_channels: 256 + out_channels: 256 + layers_per_block: 2 + block_out_channels: [256] + down_block_types: + [ + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 0.05 + shift: -0.035 + \ No newline at end of file diff --git a/dreamvoice/src/configs/diffvc_base.yaml b/dreamvoice/src/configs/diffvc_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23 --- /dev/null +++ b/dreamvoice/src/configs/diffvc_base.yaml @@ -0,0 +1,47 @@ +version: 1.0 + +system: "base" + +model: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/diffvc_base_pitch.yaml 
b/dreamvoice/src/configs/diffvc_base_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d8b894cd095accdcb9eab7788e8088d0430eae1 --- /dev/null +++ b/dreamvoice/src/configs/diffvc_base_pitch.yaml @@ -0,0 +1,34 @@ +version: 1.0 + +system: "base" + +diffwrap: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [128, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' \ No newline at end of file diff --git a/dreamvoice/src/configs/diffvc_cross.yaml b/dreamvoice/src/configs/diffvc_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66 --- /dev/null +++ b/dreamvoice/src/configs/diffvc_cross.yaml @@ -0,0 +1,45 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/src/configs/diffvc_cross_pitch.yaml b/dreamvoice/src/configs/diffvc_cross_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff --- /dev/null +++ b/dreamvoice/src/configs/diffvc_cross_pitch.yaml @@ -0,0 +1,33 @@ +version: 1.0 + +system: "cross" + +diffwrap: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [100, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + \ No newline at end of file diff --git a/dreamvoice/src/configs/plugin_cross.yaml b/dreamvoice/src/configs/plugin_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e --- /dev/null +++ b/dreamvoice/src/configs/plugin_cross.yaml @@ -0,0 +1,39 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + + unet: + sample_size: [1, 1] + in_channels: 256 + out_channels: 256 + layers_per_block: 2 + block_out_channels: [256] + down_block_types: + [ + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + 
rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 0.05 + shift: -0.035 + \ No newline at end of file diff --git a/dreamvoice/src/debug.py b/dreamvoice/src/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/extract_features.py b/dreamvoice/src/extract_features.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3 --- /dev/null +++ b/dreamvoice/src/extract_features.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import numpy as np +import soundfile as sf +import pandas as pd +# from feats.hubert_model import get_soft_model, get_hubert_soft_content +from feats.contentvec_hf import get_content_model, get_content +# from modules.speaker_encoder.encoder import inference as spk_encoder +# from pathlib import Path +from tqdm import tqdm +from multiprocessing import Process +import pyworld as pw + + +def resample_save(infolder, audio_path, model, + audio_sr=24000, content_sr=16000, min_length=1.92, + content_resolution=50, + save_path='features'): + if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False: + audio, sr = librosa.load(infolder + audio_path, sr=content_sr) + final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution) + # final_length = final_length / content_sr + + length = max(round(min_length*content_sr), round(final_length)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + + # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = get_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = content.cpu() + os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True) + torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt') + # print(audio_save.shape) + # print(content.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr)) + # print(save_path + '/' + 'audio_16k/' + audio_path) + + audio, sr = librosa.load(infolder + audio_path, sr=audio_sr) + length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + # print(audio_save.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr)) + + +def extract_f0(in_folder, audio_path, save_path): + audio, sr = librosa.load(in_folder + audio_path, sr=None) + assert sr == 16000 + if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False: + # wav = audio + # wav = np.pad(wav, int((1024-320)/2), mode='reflect') + # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr, + # fmin=librosa.note_to_hz('C2'), + # fmax=librosa.note_to_hz('C6')) + + _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000) + f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1] + + f0 = np.nan_to_num(f0) + os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True) + # 
print(save_path + '/' + 'f0/' + audio_path + '.pt') + torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt') + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0_main(in_folder, audio_paths, save_path): + for audio_path in tqdm(audio_paths): + extract_f0(in_folder, audio_path, save_path) + + +if __name__ == '__main__': + df = pd.read_csv('../test_data/vc_meta.csv') + # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda') + model = get_content_model().to('cuda') + # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda") + for i in tqdm(range(len(df))): + row = df.iloc[i] + in_path = row['path'] + resample_save('../test_data/', in_path, model, save_path='../features/') + + in_folder = '../features/audio_16k/' + audio_files = list(df['path']) + save_path = '../features/' + cores = 6 + + subsets = chunks(audio_files, cores) + + for subset in subsets: + t = Process(target=extract_f0_main, args=(in_folder, subset, save_path)) + t.start() \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30 --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py @@ -0,0 +1,42 @@ +import torch +import librosa +from fairseq import checkpoint_utils +import torch.nn.functional as F + + +def get_model(vec_path): + print("load model(s) from {}".format(vec_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [vec_path], + suffix="", + ) + model = models[0] + model.eval() + return model + + +@torch.no_grad() +def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + feats = wav_16k_tensor + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.to(wav_16k_tensor.device), + "padding_mask": padding_mask.to(wav_16k_tensor.device), + "output_layer": layer + } + logits = hmodel.extract_features(**inputs)[0] + # feats = hmodel.final_proj(logits[0]) + return logits + + +if __name__ == '__main__': + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + model = get_model('../../ckpts/checkpoint_best_legacy_500.pt') + model = model.cuda() + content = get_content(model, torch.tensor([audio])) + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638 --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py @@ -0,0 +1,40 @@ +from transformers import HubertModel +import torch.nn as nn +import torch +import torch.nn.functional as F +import librosa + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + + # The final projection layer is only used for backward compatibility. + # Following https://github.com/auspicious3000/contentvec/issues/6 + # Remove this layer is necessary to achieve the desired outcome. 
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def get_content_model(config='lengyue233/content-vec-best'): + model = HubertModelWithFinalProj.from_pretrained(config) + model.eval() + return model + + +@torch.no_grad() +def get_content(model, wav_16k_tensor, device='cuda'): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + logits = model(wav_16k_tensor)['last_hidden_state'] + return logits + + +if __name__ == '__main__': + model = get_content_model().cuda() + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + audio = torch.tensor([audio]) + content = get_content(model, audio, 'cuda') + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py @@ -0,0 +1,24 @@ +import torch, torchaudio +from .hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1) + # print(wav_16k_tensor.shape) + units = hmodel.units(wav_16k_tensor) + # print(units.shape) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py b/dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..875c33d953e859609ab401e5d9b153581a8c5215 --- /dev/null +++ b/dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py @@ -0,0 +1,22 @@ +import torch, torchaudio +from hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return model + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device) + units = hmodel.units(wav_16k_tensor) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc b/dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4f38a3fc3fe39f026788685fd465d5899fcb704 Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc differ diff --git 
a/dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc b/dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d775fdddd3d767bc827f7d1261d161fa8b65d553 Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc differ diff --git a/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a3724af7934ca72887aa33c71dad7e4b8e13f6d Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc differ diff --git a/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9c28ffe5034e6ece3ce9635ed61a79f3ac38abb Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc differ diff --git a/dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc b/dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c141a6db71db045064c08c6cf0d3636d7fd46ba5 Binary files /dev/null and b/dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc differ diff --git a/dreamvoice/src/feats/contentvec.py b/dreamvoice/src/feats/contentvec.py new file mode 100644 index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30 --- /dev/null +++ b/dreamvoice/src/feats/contentvec.py @@ -0,0 +1,42 @@ +import torch +import librosa +from fairseq import checkpoint_utils +import torch.nn.functional as F + + +def get_model(vec_path): + print("load model(s) from {}".format(vec_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [vec_path], + suffix="", + ) + model = models[0] + model.eval() + return model + + +@torch.no_grad() +def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + feats = wav_16k_tensor + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.to(wav_16k_tensor.device), + "padding_mask": padding_mask.to(wav_16k_tensor.device), + "output_layer": layer + } + logits = hmodel.extract_features(**inputs)[0] + # feats = hmodel.final_proj(logits[0]) + return logits + + +if __name__ == '__main__': + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + model = get_model('../../ckpts/checkpoint_best_legacy_500.pt') + model = model.cuda() + content = get_content(model, torch.tensor([audio])) + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/contentvec_hf.py b/dreamvoice/src/feats/contentvec_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638 --- /dev/null +++ b/dreamvoice/src/feats/contentvec_hf.py @@ -0,0 +1,40 @@ +from transformers import HubertModel +import torch.nn as nn +import torch +import torch.nn.functional as F +import librosa + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + + # The final projection layer is only used for backward compatibility. 
+ # Following https://github.com/auspicious3000/contentvec/issues/6 + # Remove this layer is necessary to achieve the desired outcome. + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def get_content_model(config='lengyue233/content-vec-best'): + model = HubertModelWithFinalProj.from_pretrained(config) + model.eval() + return model + + +@torch.no_grad() +def get_content(model, wav_16k_tensor, device='cuda'): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + logits = model(wav_16k_tensor)['last_hidden_state'] + return logits + + +if __name__ == '__main__': + model = get_content_model().cuda() + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + audio = torch.tensor([audio]) + content = get_content(model, audio, 'cuda') + print(content) \ No newline at end of file diff --git a/dreamvoice/src/feats/hubert/.gitignore b/dreamvoice/src/feats/hubert/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0202868f93e8b1be2f925f2ec6b22f3df691e8c3 --- /dev/null +++ b/dreamvoice/src/feats/hubert/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# VSCode project settings +.vscode + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/dreamvoice/src/feats/hubert/LICENSE b/dreamvoice/src/feats/hubert/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e --- /dev/null +++ b/dreamvoice/src/feats/hubert/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Benjamin van Niekerk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/src/feats/hubert/README.md b/dreamvoice/src/feats/hubert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..68602858ed726acd4f99ce9fecca008f3511dc90 --- /dev/null +++ b/dreamvoice/src/feats/hubert/README.md @@ -0,0 +1,161 @@ +# HuBERT + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2111.02392) +[![demo](https://img.shields.io/static/v1?message=Audio%20Samples&logo=Github&labelColor=grey&color=blue&logoColor=white&label=%20&style=flat)](https://bshall.github.io/soft-vc/) +[![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb) + +Training and inference scripts for the HuBERT content encoders in [A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion](https://ieeexplore.ieee.org/abstract/document/9746484). +For more details see [soft-vc](https://github.com/bshall/soft-vc). Audio samples can be found [here](https://bshall.github.io/soft-vc/). Colab demo can be found [here](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb). + +
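This vendored encoder is wrapped for DreamVoice in `dreamvoice/src/feats/hubert_model.py` (added elsewhere in this diff). Below is a minimal sketch of how that wrapper is intended to be called, assuming a local `hubert_soft` checkpoint; the path mirrors the commented-out line in `extract_features.py` and is not shipped with this change:

```python
import torch
import librosa
from dreamvoice.src.feats.hubert_model import get_soft_model, get_hubert_soft_content

# get_soft_model builds a HubertSoft and loads the "hubert" state dict from the file
hubert = get_soft_model("pre_ckpts/hubert_soft.pt").to("cuda")  # checkpoint path is an assumption

# the encoder expects 16 kHz mono audio
audio, _ = librosa.load("example.wav", sr=16000)
wav = torch.tensor(audio).unsqueeze(0)  # shape (1, T)

# get_hubert_soft_content adds the channel dimension, calls HubertSoft.units and
# returns roughly 50 frames of 256-dim soft units per second, moved back to CPU
units = get_hubert_soft_content(hubert, wav, device="cuda")
print(units.shape)  # (1, N, 256)
```

Note that `extract_features.py` in this diff keeps the HuBERT import commented out and extracts content features with ContentVec instead (see the note after Fig 1).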
+ [figure: content-encoder.png, "Soft-VC" architecture diagram]
+
+ Fig 1: Architecture of the voice conversion system. a) The discrete content encoder clusters audio features to produce a sequence of discrete speech units. b) The soft content encoder is trained to predict the discrete units. The acoustic model transforms the discrete/soft speech units into a target spectrogram. The vocoder converts the spectrogram into an audio waveform.
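For reference, DreamVoice's own feature pipeline (`extract_features.py` and `feats/contentvec_hf.py` in this diff) does not call this HuBERT-Soft encoder; it loads a ContentVec model from the Hugging Face hub. A minimal sketch of that path, mirroring the `__main__` block of `contentvec_hf.py` (the audio file name is a placeholder):

```python
import torch
import librosa
from dreamvoice.src.feats.contentvec_hf import get_content_model, get_content

# loads "lengyue233/content-vec-best" and puts it in eval mode
model = get_content_model().cuda()

# 16 kHz mono input; get_content pads by (400 - 320) // 2 on each side so the
# output has roughly one 768-dim frame per 320 input samples (50 frames per second)
audio, _ = librosa.load("example.wav", sr=16000)
wav = torch.tensor(audio).unsqueeze(0)  # shape (1, T)

content = get_content(model, wav, device="cuda")
print(content.shape)  # (1, ~T // 320, 768)
```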
+ +## Example Usage + +### Programmatic Usage + +```python +import torch, torchaudio + +# Load checkpoint (either hubert_soft or hubert_discrete) +hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda() + +# Load audio +wav, sr = torchaudio.load("path/to/wav") +assert sr == 16000 +wav = wav.unsqueeze(0).cuda() + +# Extract speech units +units = hubert.units(x) +``` + +### Script-Based Usage + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +## Training + +### Step 1: Dataset Preparation + +Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus. The training script expects the following tree structure for the dataset directory: + +``` +│ lengths.json +│ +└───wavs + ├───dev-* + │ ├───84 + │ ├───... + │ └───8842 + └───train-* + ├───19 + ├───... + └───8975 +``` + +The `train-*` and `dev-*` directories should contain the training and validation splits respectively. Note that there can be multiple `train` and `dev` folders e.g., `train-clean-100`, `train-other-500`, etc. Finally, the `lengths.json` file should contain key-value pairs with the file path and number of samples: + +```json +{ + "dev-clean/1272/128104/1272-128104-0000": 93680, + "dev-clean/1272/128104/1272-128104-0001": 77040, +} +``` + +### Step 2: Extract Discrete Speech Units + +Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script: + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +for example: + +``` +python encode.py discrete path/to/LibriSpeech/wavs path/to/LibriSpeech/discrete +``` + +At this point the directory tree should look like: + +``` +│ lengths.json +│ +├───discrete +│ ├───... +└───wavs + ├───... +``` + +### Step 3: Train the HuBERT-Soft Content Encoder + +``` +usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir + +Train HuBERT soft content encoder. + +positional arguments: + dataset-dir path to the data directory. + checkpoint-dir path to the checkpoint directory. + +optional arguments: + -h, --help show this help message and exit + --resume RESUME path to the checkpoint to resume from. + --warmstart whether to initialize from the fairseq HuBERT checkpoint. + --mask whether to use input masking. + --alpha ALPHA weight for the masked loss. 
+``` + +## Links + +- [Soft-VC repo](https://github.com/bshall/soft-vc) +- [Soft-VC paper](https://ieeexplore.ieee.org/abstract/document/9746484) +- [Official HuBERT repo](https://github.com/pytorch/fairseq) +- [HuBERT paper](https://arxiv.org/abs/2106.07447) + +## Citation + +If you found this work helpful please consider citing our paper: + +``` +@inproceedings{ + soft-vc-2022, + author={van Niekerk, Benjamin and Carbonneau, Marc-André and Zaïdi, Julian and Baas, Matthew and Seuté, Hugo and Kamper, Herman}, + booktitle={ICASSP}, + title={A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion}, + year={2022} +} +``` diff --git a/dreamvoice/src/feats/hubert/cluster.py b/dreamvoice/src/feats/hubert/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..18b754c73c63b79e943d51e76414f0056f05589f --- /dev/null +++ b/dreamvoice/src/feats/hubert/cluster.py @@ -0,0 +1,66 @@ +from pathlib import Path +import logging +import argparse + +import torch +import numpy as np +from sklearn.cluster import KMeans + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def cluster(args): + with open(args.subset) as file: + subset = [line.strip() for line in file] + + logger.info(f"Loading features from {args.in_dir}") + features = [] + for path in subset: + in_path = args.in_dir / path + features.append(np.load(in_path.with_suffix(".npy"))) + features = np.concatenate(features, axis=0) + + logger.info(f"Clustering features of shape: {features.shape}") + kmeans = KMeans(n_clusters=args.n_clusters).fit(features) + + checkpoint_path = args.checkpoint_dir / f"kmeans_{args.n_clusters}.pt" + checkpoint_path.parent.mkdir(exist_ok=True, parents=True) + torch.save( + checkpoint_path, + { + "n_features_in_": kmeans.n_features_in_, + "_n_threads": kmeans._n_threads, + "cluster_centers_": kmeans.cluster_centers_, + }, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Cluster speech features features.") + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the encoded dataset", + type=Path, + ) + parser.add_argument( + "subset", + matavar="subset", + help="path to the .txt file containing the list of files to cluster", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory", + type=Path, + ) + parser.add_argument( + "--n-clusters", + help="number of clusters", + type=int, + default=100, + ) + args = parser.parse_args() + cluster(args) diff --git a/dreamvoice/src/feats/hubert/content-encoder.png b/dreamvoice/src/feats/hubert/content-encoder.png new file mode 100644 index 0000000000000000000000000000000000000000..fc59d538a9383896cf0c36e1d4a3f5030fce38fe Binary files /dev/null and b/dreamvoice/src/feats/hubert/content-encoder.png differ diff --git a/dreamvoice/src/feats/hubert/encode.py b/dreamvoice/src/feats/hubert/encode.py new file mode 100644 index 0000000000000000000000000000000000000000..14246e985fb0e9dc157d290853af6dcf6036f61c --- /dev/null +++ b/dreamvoice/src/feats/hubert/encode.py @@ -0,0 +1,60 @@ +import argparse +import logging +import numpy as np +from pathlib import Path +from tqdm import tqdm + +import torch +import torchaudio +from torchaudio.functional import resample + + +def encode_dataset(args): + print(f"Loading hubert checkpoint") + hubert = torch.hub.load( + "bshall/hubert:main", + f"hubert_{args.model}", + trust_repo=True, + ).cuda() + + print(f"Encoding dataset at {args.in_dir}") + for in_path in 
tqdm(list(args.in_dir.rglob(f"*{args.extension}"))): + wav, sr = torchaudio.load(in_path) + wav = resample(wav, sr, 16000) + wav = wav.unsqueeze(0).cuda() + + with torch.inference_mode(): + units = hubert.units(wav) + + out_path = args.out_dir / in_path.relative_to(args.in_dir) + out_path.parent.mkdir(parents=True, exist_ok=True) + np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Encode an audio dataset.") + parser.add_argument( + "model", + help="available models (HuBERT-Soft or HuBERT-Discrete)", + choices=["soft", "discrete"], + ) + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the dataset directory.", + type=Path, + ) + parser.add_argument( + "out_dir", + metavar="out-dir", + help="path to the output directory.", + type=Path, + ) + parser.add_argument( + "--extension", + help="extension of the audio files (defaults to .flac).", + default=".flac", + type=str, + ) + args = parser.parse_args() + encode_dataset(args) diff --git a/dreamvoice/src/feats/hubert/hubconf.py b/dreamvoice/src/feats/hubert/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..b58749e4a40b29eab470686b27e06a97bfecb321 --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubconf.py @@ -0,0 +1,80 @@ +dependencies = ["torch", "torchaudio", "sklearn"] + +URLS = { + "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-discrete-96b248c5.pt", + "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt", + "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.2/kmeans100-50f36a95.pt", +} + +import torch +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from sklearn.cluster import KMeans + +from hubert import HubertDiscrete, HubertSoft + + +def hubert_discrete( + pretrained: bool = True, + progress: bool = True, +) -> HubertDiscrete: + r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + kmeans = kmeans100(pretrained=pretrained, progress=progress) + hubert = HubertDiscrete(kmeans) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], progress=progress + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def hubert_soft( + pretrained: bool = True, + progress: bool = True, +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model. + progress (bool): show progress bar when downloading model. 
+ """ + hubert = HubertSoft() + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-soft"], + progress=progress, + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def _kmeans( + num_clusters: int, pretrained: bool = True, progress: bool = True +) -> KMeans: + kmeans = KMeans(num_clusters) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS[f"kmeans{num_clusters}"], progress=progress + ) + kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"] + kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"] + kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy() + return kmeans + + +def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans: + r""" + k-means checkpoint for HuBERT-Discrete with 100 clusters. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + return _kmeans(100, pretrained, progress) diff --git a/dreamvoice/src/feats/hubert/hubert/__init__.py b/dreamvoice/src/feats/hubert/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e07f859e99f51dcf35639f26a3eb53a81c993f3 --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/__init__.py @@ -0,0 +1,5 @@ +from .model import ( + Hubert, + HubertDiscrete, + HubertSoft, +) diff --git a/dreamvoice/src/feats/hubert/hubert/dataset.py b/dreamvoice/src/feats/hubert/hubert/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3ac2b84f95340e088913e06db8e5db0a68e83c2e --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/dataset.py @@ -0,0 +1,91 @@ +import random +from pathlib import Path +import numpy as np +import json + +import torch +import torch.nn.functional as F +from torch.utils.data import Dataset +import torchaudio + + +class AcousticUnitsDataset(Dataset): + def __init__( + self, + root: Path, + sample_rate: int = 16000, + label_rate: int = 50, + min_samples: int = 32000, + max_samples: int = 250000, + train: bool = True, + ): + self.wavs_dir = root / "wavs" + self.units_dir = root / "discrete" + + with open(root / "lengths.json") as file: + self.lenghts = json.load(file) + + pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac" + metadata = ( + (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix()) + for path in self.wavs_dir.rglob(pattern) + ) + metadata = ((path, key) for path, key in metadata if key in self.lenghts) + self.metadata = [ + path for path, key in metadata if self.lenghts[key] > min_samples + ] + + self.sample_rate = sample_rate + self.label_rate = label_rate + self.min_samples = min_samples + self.max_samples = max_samples + self.train = train + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, index): + wav_path = self.metadata[index] + units_path = self.units_dir / wav_path.relative_to(self.wavs_dir) + + wav, _ = torchaudio.load(wav_path) + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + codes = np.load(units_path.with_suffix(".npy")) + + return wav, torch.from_numpy(codes).long() + + def collate(self, batch): + wavs, codes = zip(*batch) + wavs, codes = list(wavs), list(codes) + + wav_lengths = [wav.size(-1) for wav in wavs] + code_lengths = [code.size(-1) for code in codes] + + wav_frames = min(self.max_samples, *wav_lengths) + + collated_wavs, wav_offsets = [], [] + for wav in wavs: + wav_diff = wav.size(-1) - 
wav_frames + wav_offset = random.randint(0, wav_diff) + wav = wav[:, wav_offset : wav_offset + wav_frames] + + collated_wavs.append(wav) + wav_offsets.append(wav_offset) + + rate = self.label_rate / self.sample_rate + code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets] + code_frames = round(wav_frames * rate) + remaining_code_frames = [ + length - offset for length, offset in zip(code_lengths, code_offsets) + ] + code_frames = min(code_frames, *remaining_code_frames) + + collated_codes = [] + for code, code_offset in zip(codes, code_offsets): + code = code[code_offset : code_offset + code_frames] + collated_codes.append(code) + + wavs = torch.stack(collated_wavs, dim=0) + codes = torch.stack(collated_codes, dim=0) + + return wavs, codes diff --git a/dreamvoice/src/feats/hubert/hubert/model.py b/dreamvoice/src/feats/hubert/hubert/model.py new file mode 100644 index 0000000000000000000000000000000000000000..523dd95633ba73babff8b6836324ae0a7c2d267f --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/model.py @@ -0,0 +1,241 @@ +import copy +from typing import Optional, Tuple +import random + +from sklearn.cluster import KMeans + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + """Extract soft speech units. + + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimensions. 
+ """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class HubertDiscrete(Hubert): + """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self, kmeans: KMeans): + super().__init__(504) + self.kmeans = kmeans + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.LongTensor: + """Extract discrete speech units. + + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + LongTensor: soft speech units of shape (N,), where N is the number of frames. + """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav, layer=7) + x = self.kmeans.predict(x.squeeze().cpu().numpy()) + return torch.tensor(x, dtype=torch.long, device=wav.device) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.norm0(self.conv0(x))) + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = F.gelu(self.conv3(x)) + x = F.gelu(self.conv4(x)) + x = F.gelu(self.conv5(x)) + x = F.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = F.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + 
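# (the expected span count is mask_prob * sequence_length / mask_length; adding
# random.random() before the int() truncation rounds stochastically, so the
# expected number of masked spans is preserved on average)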
num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask diff --git a/dreamvoice/src/feats/hubert/hubert/utils.py b/dreamvoice/src/feats/hubert/hubert/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d42ba3acb822938f246dba27b3de81ec51aa72b0 --- /dev/null +++ b/dreamvoice/src/feats/hubert/hubert/utils.py @@ -0,0 +1,61 @@ +import torch + + +class Metric: + def __init__(self): + self.steps = 0 + self.value = 0 + + def update(self, value): + self.steps += 1 + self.value += (value - self.value) / self.steps + return self.value + + def reset(self): + self.steps = 0 + self.value = 0 + + +def save_checkpoint( + checkpoint_dir, + hubert, + optimizer, + scaler, + step, + loss, + best, + logger, +): + state = { + "hubert": hubert.state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict(), + "step": step, + "loss": loss, + } + checkpoint_dir.mkdir(exist_ok=True, parents=True) + checkpoint_path = checkpoint_dir / f"model-{step}.pt" + torch.save(state, checkpoint_path) + if best: + best_path = checkpoint_dir / "model-best.pt" + torch.save(state, best_path) + logger.info(f"Saved checkpoint: {checkpoint_path.stem}") + + +def load_checkpoint( + load_path, + hubert, + optimizer, + scaler, + rank, + logger, +): + logger.info(f"Loading checkpoint from {load_path}") + checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"}) + hubert.load_state_dict(checkpoint["hubert"]) + if "scaler" in checkpoint: + scaler.load_state_dict(checkpoint["scaler"]) + if "optimizer" in checkpoint: + optimizer.load_state_dict(checkpoint["optimizer"]) + step, loss = checkpoint.get("step", 0), checkpoint.get("loss", float("inf")) + return step, loss diff --git a/dreamvoice/src/feats/hubert/train.py b/dreamvoice/src/feats/hubert/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5ca9de087f72e343ffb4e5ef00cdbb90765097 --- /dev/null +++ b/dreamvoice/src/feats/hubert/train.py @@ -0,0 +1,459 @@ +import argparse +import logging +from pathlib import Path + +import torch +import torch.cuda.amp as amp +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler +import 
torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from hubert.model import Hubert, URLS +from hubert.dataset import AcousticUnitsDataset +from hubert.utils import Metric, save_checkpoint, load_checkpoint + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +######################################################################################## +# Define hyperparameters for training: +######################################################################################## + +BATCH_SIZE = 32 +LEARNING_RATE = 2e-5 +BETAS = (0.9, 0.98) +EPS = 1e-06 +WEIGHT_DECAY = 1e-2 +MAX_NORM = 10 +STEPS = 25000 +LOG_INTERVAL = 5 +VALIDATION_INTERVAL = 1000 +CHECKPOINT_INTERVAL = 5000 +BACKEND = "nccl" +INIT_METHOD = "tcp://localhost:54321" + + +def train(rank, world_size, args): + dist.init_process_group( + BACKEND, + rank=rank, + world_size=world_size, + init_method=INIT_METHOD, + ) + + #################################################################################### + # Setup logging utilities: + #################################################################################### + + log_dir = args.checkpoint_dir / "logs" + log_dir.mkdir(exist_ok=True, parents=True) + + if rank == 0: + logger.setLevel(logging.INFO) + handler = logging.FileHandler(log_dir / f"{args.checkpoint_dir.stem}.log") + handler.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S" + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + else: + logger.setLevel(logging.ERROR) + + writer = SummaryWriter(log_dir) if rank == 0 else None + + #################################################################################### + # Initialize models + #################################################################################### + + hubert = Hubert(mask=args.mask).to(rank) + + if args.warmstart: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], map_location={"cuda:0": f"cuda:{rank}"} + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + + # don't use warmstart weights for label embeddings and proj layer + del checkpoint["hubert"]["label_embedding.weight"] + del checkpoint["hubert"]["proj.weight"] + del checkpoint["hubert"]["proj.bias"] + + hubert.load_state_dict(checkpoint["hubert"], strict=False) + + hubert = DDP(hubert, device_ids=[rank]) + + #################################################################################### + # Initialze optimizer and grad scaler + #################################################################################### + + optimizer = optim.AdamW( + hubert.parameters(), + lr=LEARNING_RATE, + betas=BETAS, + eps=EPS, + weight_decay=WEIGHT_DECAY, + ) + scaler = amp.GradScaler() + + #################################################################################### + # Initialize datasets and dataloaders + #################################################################################### + + train_dataset = AcousticUnitsDataset( + root=args.dataset_dir, + train=True, + ) + train_sampler = DistributedSampler(train_dataset, drop_last=True) + train_loader = DataLoader( + train_dataset, + collate_fn=train_dataset.collate, + batch_size=BATCH_SIZE, + sampler=train_sampler, + num_workers=8, + pin_memory=True, + shuffle=False, + drop_last=True, + ) + + validation_dataset = AcousticUnitsDataset( + root=args.dataset_dir, 
+ train=False, + ) + validation_loader = DataLoader( + validation_dataset, + batch_size=1, + shuffle=False, + num_workers=8, + pin_memory=True, + ) + + #################################################################################### + # Load checkpoint if args.resume is set + #################################################################################### + + if args.resume is not None: + global_step, best_loss = load_checkpoint( + load_path=args.resume, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + rank=rank, + logger=logger, + ) + else: + global_step, best_loss = 0, float("inf") + + # =================================================================================# + # Start training loop + # =================================================================================# + + n_epochs = STEPS // len(train_loader) + 1 + start_epoch = global_step // len(train_loader) + 1 + + logger.info("**" * 40) + logger.info(f"PyTorch version: {torch.__version__}") + logger.info(f"CUDA version: {torch.version.cuda}") + logger.info(f"CUDNN version: {torch.backends.cudnn.version()}") + logger.info(f"CUDNN enabled: {torch.backends.cudnn.enabled}") + logger.info(f"CUDNN deterministic: {torch.backends.cudnn.deterministic}") + logger.info(f"CUDNN benchmark: {torch.backends.cudnn.benchmark}") + logger.info(f"# of GPUS: {torch.cuda.device_count()}") + logger.info(f"batch size: {BATCH_SIZE}") + logger.info(f"iterations per epoch: {len(train_loader)}") + logger.info(f"# of epochs: {n_epochs}") + logger.info(f"started at epoch: {start_epoch}") + logger.info("**" * 40 + "\n") + + if args.mask: + average_masked_loss = Metric() + average_unmasked_loss = Metric() + average_masked_accuracy = Metric() + average_unmasked_accuracy = Metric() + + epoch_masked_loss = Metric() + epoch_unmasked_loss = Metric() + epoch_masked_accuracy = Metric() + epoch_unmasked_accuracy = Metric() + else: + average_loss = Metric() + average_accuracy = Metric() + + epoch_loss = Metric() + epoch_accuracy = Metric() + + validation_loss = Metric() + validation_accuracy = Metric() + + for epoch in range(start_epoch, n_epochs + 1): + train_sampler.set_epoch(epoch) + + hubert.train() + if args.mask: + epoch_masked_loss.reset() + epoch_unmasked_loss.reset() + epoch_masked_accuracy.reset() + epoch_unmasked_accuracy.reset() + else: + epoch_loss.reset() + epoch_accuracy.reset() + + for wavs, codes in train_loader: + global_step += 1 + wavs, codes = wavs.to(rank), codes.to(rank) + + ############################################################################ + # Compute training loss + ############################################################################ + + optimizer.zero_grad() + + with amp.autocast(): + logits, mask = hubert(wavs) + length = min( + mask.size(-1) if args.mask else float("inf"), codes.size(-1) + ) + logits = logits[:, :length, :] + codes = codes[:, :length] + if args.mask: + mask = mask[:, :length] + + if args.mask: + masked_loss = F.cross_entropy(logits[mask], codes[mask]) + unmasked_loss = F.cross_entropy(logits[~mask], codes[~mask]) + loss = args.alpha * masked_loss + (1 - args.alpha) * unmasked_loss + else: + loss = F.cross_entropy(logits.transpose(1, 2), codes) + + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + + nn.utils.clip_grad_norm_(hubert.parameters(), MAX_NORM) + + scaler.step(optimizer) + scaler.update() + + if args.mask: + masked_accuracy = logits[mask].argmax(dim=-1) == codes[mask] + masked_accuracy = torch.mean(masked_accuracy.float()) + + unmasked_accuracy = 
logits[~mask].argmax(dim=-1) == codes[~mask] + unmasked_accuracy = torch.mean(unmasked_accuracy.float()) + else: + accuracy = logits.argmax(dim=-1) == codes + accuracy = torch.mean(accuracy.float()) + + ############################################################################ + # Update and log training metrics + ############################################################################ + + if args.mask: + average_masked_loss.update(masked_loss.item()) + average_unmasked_loss.update(unmasked_loss.item()) + average_masked_accuracy.update(masked_accuracy.item()) + average_unmasked_accuracy.update(unmasked_accuracy.item()) + + epoch_masked_loss.update(masked_loss.item()) + epoch_unmasked_loss.update(unmasked_loss.item()) + epoch_masked_accuracy.update(masked_accuracy.item()) + epoch_unmasked_accuracy.update(unmasked_accuracy.item()) + else: + average_loss.update(loss.item()) + average_accuracy.update(accuracy.item()) + + epoch_loss.update(loss.item()) + epoch_accuracy.update(accuracy.item()) + + if rank == 0 and global_step % LOG_INTERVAL == 0: + if args.mask: + writer.add_scalar( + "train/masked_loss", + average_masked_loss.value, + global_step, + ) + writer.add_scalar( + "train/unmasked_loss", + average_unmasked_loss.value, + global_step, + ) + writer.add_scalar( + "train/masked_accuracy", + average_masked_accuracy.value * 100, + global_step, + ) + writer.add_scalar( + "train/unmasked_accuracy", + average_unmasked_accuracy.value * 100, + global_step, + ) + average_masked_loss.reset() + average_unmasked_loss.reset() + average_masked_accuracy.reset() + average_unmasked_accuracy.reset() + else: + writer.add_scalar( + "train/loss", + average_loss.value, + global_step, + ) + writer.add_scalar( + "train/accuracy", + average_accuracy.value, + global_step, + ) + average_loss.reset() + average_accuracy.reset() + + # --------------------------------------------------------------------------# + # Start validation loop + # --------------------------------------------------------------------------# + + if global_step % VALIDATION_INTERVAL == 0: + hubert.eval() + validation_loss.reset() + validation_accuracy.reset() + for wavs, codes in validation_loader: + wavs, codes = wavs.to(rank), codes.to(rank) + + with torch.no_grad(): + logits, _ = hubert(wavs) + logits = logits.transpose(1, 2) + + loss = F.cross_entropy(logits, codes) + + accuracy = logits.argmax(dim=1) == codes + accuracy = torch.mean(accuracy.float()) + + #################################################################### + # Update validation metrics + #################################################################### + + validation_loss.update(loss.item()) + validation_accuracy.update(accuracy.item()) + + hubert.train() + + ############################################################################ + # Log validation metrics + ############################################################################ + + if rank == 0: + writer.add_scalar( + "validation/unit_loss", + validation_loss.value, + global_step, + ) + writer.add_scalar( + "validation/unit_accuracy", + validation_accuracy.value * 100, + global_step, + ) + logger.info( + f"valid -- epoch: {epoch}, loss: {validation_loss.value:.4f}, accuracy: {validation_accuracy.value * 100:.2f}" + ) + + ############################################################################ + # Save model checkpoint + ############################################################################ + + new_best = best_loss > validation_loss.value + if new_best or global_step % CHECKPOINT_INTERVAL == 0: + if 
new_best: + logger.info("-------- new best model found!") + best_loss = validation_loss.value + + if rank == 0: + save_checkpoint( + checkpoint_dir=args.checkpoint_dir, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + step=global_step, + loss=validation_loss.value, + best=new_best, + logger=logger, + ) + + # -----------------------------------------------------------------------------# + # End validation loop + # -----------------------------------------------------------------------------# + + #################################################################################### + # Log training metrics + #################################################################################### + + logger.info( + f""" + train -- epoch: {epoch}, masked loss: {epoch_masked_loss.value:.4f}, unmasked loss: {epoch_unmasked_loss.value:.4f}, + masked accuracy: {epoch_masked_accuracy.value * 100:.2f}, umasked accuracy: {epoch_unmasked_accuracy.value * 100:.2f} + """ + ) + + # ==================================================================================# + # End training loop + # ==================================================================================# + + dist.destroy_process_group() + + +def train_hubert(args): + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Train HuBERT soft content encoder.") + parser.add_argument( + "dataset_dir", + metavar="dataset-dir", + help="path to the data directory.", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory.", + type=Path, + ) + parser.add_argument( + "--resume", + help="path to the checkpoint to resume from.", + type=Path, + ) + parser.add_argument( + "--warmstart", + help="whether to initialize from the fairseq HuBERT checkpoint.", + action="store_true", + ) + parser.add_argument( + "--mask", + help="whether to use input masking.", + action="store_true", + ) + parser.add_argument( + "--alpha", + help="weight for the masked loss.", + default=1, + type=float, + ) + args = parser.parse_args() + + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) diff --git a/dreamvoice/src/feats/hubert_model.py b/dreamvoice/src/feats/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd --- /dev/null +++ b/dreamvoice/src/feats/hubert_model.py @@ -0,0 +1,24 @@ +import torch, torchaudio +from .hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1) + # print(wav_16k_tensor.shape) + units = hmodel.units(wav_16k_tensor) + # print(units.shape) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py 
b/dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2 --- /dev/null +++ b/dreamvoice/src/model/.ipynb_checkpoints/model-checkpoint.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class DiffVC(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.speaker_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim'])) + self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) / + self.config['cls_embedding']['speaker_dim'] ** 0.5) + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, speaker, pitch, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype) + batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device) + speaker = torch.where(batch_mask, uncond, speaker) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + speaker = self.speaker_embedding(speaker) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_base_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 
'cuda' + + model = DiffVC(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + p = torch.rand(2, 256, 1).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + spk = torch.rand(2, 256).to(device) + + output = model(x, t, y, spk, pitch=p, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py b/dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2 --- /dev/null +++ b/dreamvoice/src/model/.ipynb_checkpoints/model_cross-checkpoint.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class DiffVC_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if 
train_cfg: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + prompt = self.context_embedding(prompt) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_cross_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + p = torch.rand(2, 256, 1).to(device) + + output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py b/dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..266e177f862a0d66658ed1b7e9d73e1947755ab4 --- /dev/null +++ b/dreamvoice/src/model/.ipynb_checkpoints/p2e_cross-checkpoint.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class P2E_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + def forward(self, target, t, prompt, prompt_mask=None, + train_cfg=False, cfg_prob=0.0): + B, C = target.shape + target = target.unsqueeze(-1).unsqueeze(-1) + + if train_cfg: + if cfg_prob > 0.0: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device) + fixed_embedding = 
self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + prompt = self.context_embedding(prompt) + # fix the bug that prompt will copy dtype from target in diffusers + target = target.to(prompt.dtype) + + output = self.unet(sample=target, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output.squeeze(-1).squeeze(-1) + + +if __name__ == "__main__": + with open('p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = P2E_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 256)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + + output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/src/model/__pycache__/model.cpython-310.pyc b/dreamvoice/src/model/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..564230e72743052a16a565fd58b54b6d88a234de Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model.cpython-310.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model.cpython-311.pyc b/dreamvoice/src/model/__pycache__/model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d044dc24ae43a0e18c8b14a3f3d61a073725134a Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model.cpython-311.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc b/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b07fb812aaf24610b9ac2f9e54746a53e61950b Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model_cross.cpython-310.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc b/dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c849e2bff7d11e4c03eac35ab1d4964d0d948677 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model_cross.cpython-311.pyc differ diff --git a/dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc b/dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ce98cc327edd0c0dedc479a3e950985e1794694 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/model_cross.cpython-39.pyc differ diff --git a/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7f1e6c9c0a5b94c07d393f37cf2aa2a8e8bd2f1 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-310.pyc differ diff --git a/dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ab2868fb6e1eadb7ce8f20ee4ce52c05f62de68 Binary files /dev/null and b/dreamvoice/src/model/__pycache__/p2e_cross.cpython-311.pyc differ diff --git a/dreamvoice/src/model/model.py b/dreamvoice/src/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2 --- /dev/null +++ b/dreamvoice/src/model/model.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from diffusers 
import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class DiffVC(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.speaker_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim'])) + self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) / + self.config['cls_embedding']['speaker_dim'] ** 0.5) + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, speaker, pitch, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype) + batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device) + speaker = torch.where(batch_mask, uncond, speaker) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + speaker = self.speaker_embedding(speaker) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_base_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + p = torch.rand(2, 256, 1).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + spk = torch.rand(2, 256).to(device) + + output = model(x, t, y, spk, pitch=p, 
train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/src/model/model_cross.py b/dreamvoice/src/model/model_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2 --- /dev/null +++ b/dreamvoice/src/model/model_cross.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class DiffVC_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, 
t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + prompt = self.context_embedding(prompt) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_cross_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + p = torch.rand(2, 256, 1).to(device) + + output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/src/model/p2e_cross.py b/dreamvoice/src/model/p2e_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..266e177f862a0d66658ed1b7e9d73e1947755ab4 --- /dev/null +++ b/dreamvoice/src/model/p2e_cross.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class P2E_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + def forward(self, target, t, prompt, prompt_mask=None, + train_cfg=False, cfg_prob=0.0): + B, C = target.shape + target = target.unsqueeze(-1).unsqueeze(-1) + + if train_cfg: + if cfg_prob > 0.0: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + prompt = self.context_embedding(prompt) + # fix the bug that prompt will copy dtype from target in diffusers + target = target.to(prompt.dtype) + + output = self.unet(sample=target, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output.squeeze(-1).squeeze(-1) + + +if __name__ == "__main__": + with open('p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device 
= 'cuda' + + model = P2E_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 256)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + + output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py b/dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f --- /dev/null +++ b/dreamvoice/src/modules/.ipynb_checkpoints/mel-checkpoint.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,): + super().__init__() + self.frame_length = frame_length + self.hop_length = hop_length + self.mel = transforms.MelSpectrogram( + sample_rate=sr, + n_fft=frame_length, + win_length=frame_length, + hop_length=hop_length, + center=False, + power=1.0, + norm="slaney", + n_mels=n_mel, + mel_scale="slaney", + f_min=f_min, + f_max=f_max + ) + + @torch.no_grad() + def forward(self, x, target_length=None): + x = F.pad(x, ((self.frame_length - self.hop_length) // 2, + (self.frame_length - self.hop_length) // 2), "reflect") + mel = self.mel(x) + + target_length = mel.shape[-1] if target_length is None else target_length + logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device) + logmel[:, :, :mel.shape[2]] = mel + + logmel = torch.log(torch.clamp(logmel, min=1e-5)) + return logmel \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/.ipynb_checkpoints/models-checkpoint.py b/dreamvoice/src/modules/BigVGAN/.ipynb_checkpoints/models-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/.ipynb_checkpoints/models-checkpoint.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
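The `LogMelSpectrogram` front end defined in `mel-checkpoint.py` above can be exercised on its own; a minimal sketch, assuming the non-checkpoint copy of the module is importable from `dreamvoice/src/modules/mel.py` and using a placeholder `example.wav` at 24 kHz:

```python
import torch
import librosa

# Assumed import path for the non-checkpoint copy of this module.
from dreamvoice.src.modules.mel import LogMelSpectrogram

# Default configuration: 24 kHz audio, 1920-sample frames, 480-sample hop, 128 mel bins.
mel_fn = LogMelSpectrogram(sr=24000, frame_length=1920, hop_length=480, n_mel=128)

wav, _ = librosa.load('example.wav', sr=24000)   # placeholder input clip
x = torch.from_numpy(wav).unsqueeze(0)           # [1, T]
logmel = mel_fn(x)                               # [1, 128, about T // 480] log-mel features
print(logmel.shape)
```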
+ + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/dreamvoice/src/modules/BigVGAN/LICENSE b/dreamvoice/src/modules/BigVGAN/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/README.md b/dreamvoice/src/modules/BigVGAN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6cff37786a486deb55bc070254027aa492c2e92 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/README.md @@ -0,0 +1,95 @@ +## BigVGAN: A Universal Neural Vocoder with Large-Scale Training +#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon + +
+ + +### [Paper](https://arxiv.org/abs/2206.04658) +### [Audio demo](https://bigvgan-demo.github.io/) + +## Installation +Clone the repository and install dependencies. +```shell +# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries +git clone https://github.com/NVIDIA/BigVGAN +pip install -r requirements.txt +``` + +Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. +``` shell +cd LibriTTS && \ +ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ +ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ +ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ +ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ +ln -s /path/to/your/LibriTTS/dev-other dev-other && \ +ln -s /path/to/your/LibriTTS/test-clean test-clean && \ +ln -s /path/to/your/LibriTTS/test-other test-other && \ +cd .. +``` + +## Training +Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. +```shell +python train.py \ +--config configs/bigvgan_24khz_100band.json \ +--input_wavs_dir LibriTTS \ +--input_training_file LibriTTS/train-full.txt \ +--input_validation_file LibriTTS/val-full.txt \ +--list_input_unseen_wavs_dir LibriTTS LibriTTS \ +--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ +--checkpoint_path exp/bigvgan +``` + +## Synthesis +Synthesize from BigVGAN model. Below is an example command for generating audio from the model. +It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. +```shell +python inference.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_wavs_dir /path/to/your/input_wav \ +--output_dir /path/to/your/output_wav +``` + +`inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. +It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. + +Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. +```shell +python inference_e2e.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_mels_dir /path/to/your/input_mel \ +--output_dir /path/to/your/output_wav +``` + +## Pretrained Models +We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). +One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. + +|Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| +|------|---|---|---|---|------|---| +|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| +|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| +|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| +|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| + +The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. +We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. +Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 
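For programmatic use, the same checkpoints can also be driven from Python through the `load_model` helper bundled in this package's `inference.py`; a short sketch, assuming a downloaded generator at `exp/bigvgan/g_05000000` with its `config.json` alongside it, a mel saved as `example_mel.npy`, and standard HiFi-GAN-style config fields such as `sampling_rate`:

```python
import numpy as np
import torch
from scipy.io.wavfile import write

from dreamvoice.src.modules.BigVGAN.inference import load_model
from dreamvoice.src.modules.BigVGAN.utils import MAX_WAV_VALUE

# load_model reads the config.json next to the checkpoint and strips weight norm.
generator, h = load_model('exp/bigvgan/g_05000000', device='cuda')

# Mel input shaped [1, num_mels, frames], matching the model's config.
mel = torch.from_numpy(np.load('example_mel.npy')).float().cuda()
if mel.dim() == 2:
    mel = mel.unsqueeze(0)

with torch.no_grad():
    audio = generator(mel).squeeze()             # float waveform in [-1, 1]
audio = (audio * MAX_WAV_VALUE).cpu().numpy().astype('int16')
write('generated.wav', h.sampling_rate, audio)
```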
+ + +## TODO + +Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. + + +## References +* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) + +* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) + +* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) + +* [Julius](https://github.com/adefossez/julius) (for low-pass filter) + +* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd8ec6a9f201528dae3177dec447dba562779d13 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..944bf499d4d9e2293856a3fc1b7b589f09b11bba Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6ad6022b905f0726278468c130c38e351229424 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/env.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f5238ccc2abe7d89ad07b8be8d28aaf2b434b24 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9452098b19c32715461aaed8412d15a78947021 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4da540abb6cee481f0857f4ed62b888194fa9b1f Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/inference.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27e63b2b64f00899b7ec2910443d7058cfb05570 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ffcf33665e7ad868aa70941e5050c25c06e1277 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-310.pyc 
differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..717995369d8f414c631aeda4d0d421b77ecc7ce3 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1fb7f580e6ffa7546e2a08bf4f1fb064422e0fc Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/models.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d62632c6c8d75cb2014386a0dd10d78831f7cfa4 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ef60d0e501cc1558847c17382aea818671cb73c Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fb9eb3da3c95ebeb39caad5e879e10beaff693d Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/__pycache__/utils.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a8baa0a3aa82168837a209ad631a824fa21cb7a Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcde8e2762c89f5509b59c3537f8b2caa5bc36e3 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae498304bce7351eedd5837cc7488a7a7e8583b4 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/activations/__pycache__/activations.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/activations/activations.py b/dreamvoice/src/modules/BigVGAN/activations/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..61f2808a5466b3cf4d041059700993af5527dd29 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/activations/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + self.beta = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + self.beta = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. 
+ SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__init__.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +from .filter import * +from .resample import * +from .act import * \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e7c086a019eff89bcbab80a1911bbe3824c7793 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70a72c2a6fe2b7600a0ae70c5bddc68543f7f916 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89ddc0d89af089d72dd97520e57b0b2375063336 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a28bab23adbe73558251485a19ba613c5c4b6df Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b00e6040d42b21f7a84c9c541ab0cec6964ace8f Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a08646fdf31cb36321bedf80b911f6dbfd5209c Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc 
b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a0de1a364c0d5cb89bfa72e4b015dc8652a10f7 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7325ef7be6657739011bc6c157db7986bea065a Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5df92a971fff07fd60e120ba8f0fd2633b1c34fe Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50726ae35e7fc66531f1f1cf9ef5014fa673290b Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa22888731722dd9bda74d239cad5135419a4f6d Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-39.pyc b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..273d9fb158e2cbdc8f3e4dad85efaa5b15cf7286 Binary files /dev/null and b/dreamvoice/src/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/act.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/act.py new file mode 100644 index 0000000000000000000000000000000000000000..028debd697dd60458aae75010057df038bd3518a --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/act.py @@ -0,0 +1,28 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
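The `Snake` and `SnakeBeta` activations defined in `activations/activations.py` above can be exercised standalone; a minimal sketch, with the import path assumed from this repository's layout:

```python
import torch

from dreamvoice.src.modules.BigVGAN.activations.activations import Snake, SnakeBeta

x = torch.randn(2, 64, 1024)                 # [B, C, T]; one alpha (and beta) per channel

snake = Snake(in_features=64, alpha_logscale=True)
y = snake(x)                                 # x + (1 / alpha) * sin^2(alpha * x)

snakebeta = SnakeBeta(in_features=64, alpha_logscale=True)
z = snakebeta(x)                             # x + (1 / beta) * sin^2(alpha * x)

print(y.shape, z.shape)                      # both torch.Size([2, 64, 1024])
```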
+ +import torch.nn as nn +from .resample import UpSample1d, DownSample1d + + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/filter.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. + filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. 
+ super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/alias_free_torch/resample.py b/dreamvoice/src/modules/BigVGAN/alias_free_torch/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/alias_free_torch/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch.nn as nn +from torch.nn import functional as F +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/env.py b/dreamvoice/src/modules/BigVGAN/env.py new file mode 100644 index 0000000000000000000000000000000000000000..b8be238d4db710c8c9a338d336baea0138f18d1f --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/env.py @@ -0,0 +1,18 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) \ No newline at end of file diff --git a/dreamvoice/src/modules/BigVGAN/inference.py b/dreamvoice/src/modules/BigVGAN/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a739344db3ec9ae08560e5477a394cca32d4a6d9 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/inference.py @@ -0,0 +1,36 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from .env import AttrDict +from .utils import MAX_WAV_VALUE +from .models import BigVGAN as Generator +import librosa + + +def load_model(model_path, device='cuda'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = torch.load(model_path, map_location=device) + generator.load_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + diff --git a/dreamvoice/src/modules/BigVGAN/models.py b/dreamvoice/src/modules/BigVGAN/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/models.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/dreamvoice/src/modules/BigVGAN/utils.py b/dreamvoice/src/modules/BigVGAN/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed67f356aef6ce3af01b43d97d8aafb31c57b017 --- /dev/null +++ b/dreamvoice/src/modules/BigVGAN/utils.py @@ -0,0 +1,81 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt +from scipy.io.wavfile import write + +MAX_WAV_VALUE = 32768.0 + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_spectrogram_clipped(spectrogram, clip_max=2.): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none', vmin=1e-6, vmax=clip_max) + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + +def save_audio(audio, path, sr): + # wav: torch with 1d shape + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + write(path, sr, audio) \ No newline at end of file diff --git 
a/dreamvoice/src/modules/__pycache__/mel.cpython-310.pyc b/dreamvoice/src/modules/__pycache__/mel.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce88fc172e8b0905e89324595156d7887cb217e6 Binary files /dev/null and b/dreamvoice/src/modules/__pycache__/mel.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/__pycache__/mel.cpython-311.pyc b/dreamvoice/src/modules/__pycache__/mel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9d4a199212776cb8616a01bb5f7bdc71f698026 Binary files /dev/null and b/dreamvoice/src/modules/__pycache__/mel.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/__pycache__/mel.cpython-39.pyc b/dreamvoice/src/modules/__pycache__/mel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73fd8d01f9aedd492a6a02bd70109ebe593b2ceb Binary files /dev/null and b/dreamvoice/src/modules/__pycache__/mel.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/mel.py b/dreamvoice/src/modules/mel.py new file mode 100644 index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f --- /dev/null +++ b/dreamvoice/src/modules/mel.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,): + super().__init__() + self.frame_length = frame_length + self.hop_length = hop_length + self.mel = transforms.MelSpectrogram( + sample_rate=sr, + n_fft=frame_length, + win_length=frame_length, + hop_length=hop_length, + center=False, + power=1.0, + norm="slaney", + n_mels=n_mel, + mel_scale="slaney", + f_min=f_min, + f_max=f_max + ) + + @torch.no_grad() + def forward(self, x, target_length=None): + x = F.pad(x, ((self.frame_length - self.hop_length) // 2, + (self.frame_length - self.hop_length) // 2), "reflect") + mel = self.mel(x) + + target_length = mel.shape[-1] if target_length is None else target_length + logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device) + logmel[:, :, :mel.shape[2]] = mel + + logmel = torch.log(torch.clamp(logmel, min=1e-5)) + return logmel \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/.ipynb_checkpoints/LICENSE-checkpoint b/dreamvoice/src/modules/speaker_encoder/.ipynb_checkpoints/LICENSE-checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/.ipynb_checkpoints/LICENSE-checkpoint @@ -0,0 +1,24 @@ +MIT License + +Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Original work Copyright (c) 2015 braindead (https://github.com/braindead) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in 
all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/src/modules/speaker_encoder/LICENSE b/dreamvoice/src/modules/speaker_encoder/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/LICENSE @@ -0,0 +1,24 @@ +MIT License + +Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Original work Copyright (c) 2015 braindead (https://github.com/braindead) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/src/modules/speaker_encoder/README.md b/dreamvoice/src/modules/speaker_encoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95663cf5b29be905a8422176f661a8f7745b5cb0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/README.md @@ -0,0 +1,64 @@ +# Real-Time Voice Cloning +This repository is an implementation of [Transfer Learning from Speaker Verification to +Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). + +SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. 
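
The first of these stages is the speaker encoder vendored under `encoder/` in this tree. Below is a minimal, hypothetical sketch of computing a voice embedding with it, based only on the `load_model`, `preprocess_wav`, and `embed_utterance` functions that appear later in this diff; the checkpoint path and the import root are placeholders and will differ depending on where the package sits in your project (here it lives under `dreamvoice/src/modules/speaker_encoder/`).

```python
# Hypothetical usage sketch for stage one (speaker encoder). Paths and the
# import root are assumptions; adjust them to your local package layout.
from pathlib import Path

from encoder import inference as encoder      # e.g. speaker_encoder.encoder in this repo
from encoder.audio import preprocess_wav

# Load the GE2E encoder weights once; outputs are returned on the CPU.
encoder.load_model(Path("encoder/saved_models/pretrained.pt"), device="cpu")

# Resample, normalize volume, and trim long silences, then embed the utterance.
wav = preprocess_wav("some_utterance.wav")
embed = encoder.embed_utterance(wav)          # unit-norm float32 array, shape (model_embedding_size,)
print(embed.shape)
```

This embedding is what the later stages condition on: by design it should capture speaker identity while being largely independent of the words spoken.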
+ +**Video demonstration** (click the picture): + +[![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) + + + +### Papers implemented +| URL | Designation | Title | Implementation source | +| --- | ----------- | ----- | --------------------- | +|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | +|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | +|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) +|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | + +## News +**10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. + +**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. + +**14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. + +**13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: +- **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. +- **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. + +**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. + + +## Setup + +### 1. Install Requirements +1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. +2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. +3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. +4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. +5. Install the remaining requirements with `pip install -r requirements.txt` + +### 2. (Optional) Download Pretrained Models +Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). + +### 3. 
(Optional) Test Configuration +Before you download any dataset, you can begin by testing your configuration with: + +`python demo_cli.py` + +If all tests pass, you're good to go. + +### 4. (Optional) Download Datasets +For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. + +### 5. Launch the Toolbox +You can then try the toolbox: + +`python demo_toolbox.py -d ` +or +`python demo_toolbox.py` + +depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/inference-checkpoint.py b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/inference-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/inference-checkpoint.py @@ -0,0 +1,211 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_data import * +from .model import SpeakerEncoder +from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram +from matplotlib import cm +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device="cpu"): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath, map_location="cpu") + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + _model = _model.to(device) + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +@torch.no_grad() +def embed_frames_batch(frames, use_torch=False): + if _model is None: + raise Exception("Model was not loaded. 
Call load_model() before inference.") + + if not use_torch: + frames = torch.from_numpy(frames) + frames = frames.to(_device) + + embeds = _model.forward(frames) + if not use_torch: + embeds = embeds.detach().cpu().numpy() + return embeds + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. + """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +@torch.no_grad() +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. 
+ :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. + """ + # Process the entire utterance if not using partials + if not using_partials: + frames = wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +@torch.no_grad() +def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): + # This torch version is designed to cope with a batch of same lengths wavs + if not using_partials: + frames = wav_to_mel_spectrogram_batch(wavs) + embeds = embed_frames_batch(frames) + if return_partials: + return embeds, None, None + return embeds + + wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= wavs.shape[-1]: + wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), + dtype=wavs.dtype, device=wavs.device)], 1) + + frames = wav_to_mel_spectrogram_batch(wavs) + frames_batch = [] + for i in range(len(frames)): + frames_batch += [frames[i][s] for s in mel_slices] + frames_batch = torch.stack(frames_batch, 0) + partial_embeds = embed_frames_batch(frames_batch, use_torch=True) + partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) + + raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) + embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) + + if return_partials: + return embeds, partial_embeds, wave_slices + return embeds + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/preprocess-checkpoint.py b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/preprocess-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/.ipynb_checkpoints/preprocess-checkpoint.py @@ 
-0,0 +1,177 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from multiprocess.pool import ThreadPool +from .params_data import * +from .config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. + """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + nationality.lower() in anglophone_nationalites] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__init__.py b/dreamvoice/src/modules/speaker_encoder/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15c3671020065c3fce2c4c2fc7559755bf817801 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19aa499c5843528516f37122c22dd016a7aa7772 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..198619d9f47b4706b813cbe59638c6209227ae86 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/__init__.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22ae43eedbaa103e831a2e6169054b8c37842ef7 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..167e3b4a022ebacbeb73a4db6a4a1cac597bd3b1 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1472b1678eb31f1810967fbdc2f58a8608a9f5d7 Binary files /dev/null and 
b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/audio.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c398427ab385e5abf0274d9175cb15f6363eeb1c Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b104e5ded94920d3cbc07ae2cbb480807d84b6e5 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3c5c543b64e69f4f112bd06850ad50ebaacef52 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/inference.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9e03f3b611dcf9d00938c48645fa77c38f1e7d0 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2719668e4e12b38078da2488a71639dd013136d3 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d7b54b8c3e251748d7a2da8e47e0c2997553f86 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/model.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6afe779501d6e220de041c7c133b03bf8c5cdb5 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6299e6dc1f67558a447c06b3bd2646015d606859 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-39.pyc 
b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa3c889ecee1a6f96a64896efc41f4652d58bb64 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_data.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8457e1f0a4ee2d82defdead8e92253dcee74e86f Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-310.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-311.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3134901b6849333f838c72b9c6a269fade00ce82 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-311.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-39.pyc b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1737b0fa586b3d5b6691a205c4239e32e1feed55 Binary files /dev/null and b/dreamvoice/src/modules/speaker_encoder/encoder/__pycache__/params_model.cpython-39.pyc differ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/audio.py b/dreamvoice/src/modules/speaker_encoder/encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..de650b972fc7a4f3f8a698c128ee4642a373a6d6 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/audio.py @@ -0,0 +1,157 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from scipy.ndimage.morphology import binary_dilation +from .params_data import * +from pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +import torch +from torchaudio.transforms import Resample +from librosa.filters import mel as librosa_mel_fn + + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. 
+ """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def preprocess_wav_batch(wavs, source_sr=22050): + # This torch version is designed to cope with a batch of same lengths wavs + if sampling_rate != source_sr: + resample = Resample(source_sr, sampling_rate) + wavs = resample(wavs) + wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, + increase_only=True) + # Trimming silence is not implemented in this version yet! + return wavs_preprocessed + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. + """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def wav_to_mel_spectrogram_batch(wavs): + # This torch version is designed to cope with a batch of same lengths wavs + n_fft = int(sampling_rate * mel_window_length / 1000) + hop_length = int(sampling_rate * mel_window_step / 1000) + win_length = int(sampling_rate * mel_window_length / 1000) + window = torch.hann_window(n_fft).to(wavs) + mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, + n_mels=mel_n_channels)).to(wavs) + s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=True, return_complex=False) + real_part, imag_part = s.unbind(-1) + stftm = real_part**2 + imag_part**2 + mels = torch.matmul(mel_basis, stftm) + return torch.transpose(mels, 1, 2) + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) + + +def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): + # This torch version is designed to cope with a batch of same lengths wavs + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) + scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) + if increase_only: + mask = (dBFS_change > 0).to(scales) + elif decrease_only: + mask = (dBFS_change < 0).to(scales) + else: + mask = torch.zeros_like(scales) + scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) + return wavs * scales.unsqueeze(-1) + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
+ + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/config.py b/dreamvoice/src/modules/speaker_encoder/encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1f5aab0d3899c5e5045b40d4cecee1a11d844c --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/config.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/__init__.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9af30b406f2a8debe81a8275cb2682cbd896245a --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/__init__.py @@ -0,0 +1,4 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .speaker_verification_dataset import SpeakerVerificationDataset +from .speaker_verification_dataset import SpeakerVerificationDataLoader diff --git 
a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd5bb005923852327581e2dcaa03fec7dbce5b8 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py @@ -0,0 +1,39 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import random + +class RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d189c835859efefa686d49b53f4e79aa444d96 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. 
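        Example (editorial sketch, not part of the committed file; the directory name is
        hypothetical and n_frames=160 mirrors partials_n_frames from params_data.py):

            speaker = Speaker(Path("encoder_preprocessed/LibriSpeech_train-other-500_1234"))
            partials = speaker.random_partial(count=4, n_frames=160)
            # -> 4 tuples of (Utterance, frames, (start, end)), with frames shaped (160, mel_n_channels)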
+ """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..4080d636338bedcb8d1b8fc77945057027fd0ac1 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py @@ -0,0 +1,14 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +from typing import List +from .speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc31fee9e0d62545caa2599aebc22decfb50aa0 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,58 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .speaker_batch import SpeakerBatch +from .speaker import Speaker +from ..params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/utterance.py b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..2b878c58fd7d70d3ba0b33def66912adc1c1a45d --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/data_objects/utterance.py @@ -0,0 +1,28 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/inference.py b/dreamvoice/src/modules/speaker_encoder/encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/inference.py @@ -0,0 +1,211 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_data import * +from .model import SpeakerEncoder +from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram +from matplotlib import cm +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device="cpu"): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. 
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath, map_location="cpu") + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + _model = _model.to(device) + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +@torch.no_grad() +def embed_frames_batch(frames, use_torch=False): + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + if not use_torch: + frames = torch.from_numpy(frames) + frames = frames.to(_device) + + embeds = _model.forward(frames) + if not use_torch: + embeds = embeds.detach().cpu().numpy() + return embeds + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
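    Worked example (editorial note, assuming the defaults in params_data.py: sampling_rate=16000,
    mel_window_step=10, partials_n_frames=160, and the default overlap=0.5):

        wav_slices, mel_slices = compute_partial_slices(32000)   # a 2-second utterance at 16 kHz
        # samples_per_frame = 160 and frame_step = 80, so two partials are produced:
        #   mel_slices == [slice(0, 160), slice(80, 240)]
        #   wav_slices == [slice(0, 25600), slice(12800, 38400)]
        # and the caller is expected to pad the waveform up to wav_slices[-1].stop = 38400 samples.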
+ """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +@torch.no_grad() +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +@torch.no_grad() +def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): + # This torch version is designed to cope with a batch of same lengths wavs + if not using_partials: + frames = wav_to_mel_spectrogram_batch(wavs) + embeds = embed_frames_batch(frames) + if return_partials: + return embeds, None, None + return embeds + + wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= wavs.shape[-1]: + wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), + dtype=wavs.dtype, device=wavs.device)], 1) + + frames = wav_to_mel_spectrogram_batch(wavs) + frames_batch = [] + for i in range(len(frames)): + frames_batch += [frames[i][s] for s in mel_slices] + frames_batch = torch.stack(frames_batch, 0) + partial_embeds = embed_frames_batch(frames_batch, use_torch=True) + partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) + + raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) + embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) + + if return_partials: + return embeds, partial_embeds, wave_slices + return embeds + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/model.py b/dreamvoice/src/modules/speaker_encoder/encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8d246bc359ce1ffc6229ba8a4ced24d07b77e703 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/model.py @@ -0,0 +1,137 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_model import * +from .params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + 
self.lstm = nn.LSTM(input_size=mel_n_channels, + hidden_size=model_hidden_size, + num_layers=model_num_layers, + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. + :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. 
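        # (Editorial note, not part of the committed file.) Entry [i, u, j] holds the similarity of
        # utterance u of speaker i to the centroid of speaker j; when i == j the exclusive
        # (leave-one-out) centroid is used, otherwise the inclusive one. The result has shape
        # (speakers_per_batch, utterances_per_speaker, speakers_per_batch).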
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/params_data.py b/dreamvoice/src/modules/speaker_encoder/encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..62d04121aed3d7862889ad6c771055db9b74ab6e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/params_data.py @@ -0,0 +1,30 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
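# (Editorial note, not part of the committed file.) With vad_window_length = 30 ms, this allows
# roughly 6 * 30 = 180 ms of silence to be bridged, and the moving-average width above smooths
# over about 8 * 30 = 240 ms of audio.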
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/params_model.py b/dreamvoice/src/modules/speaker_encoder/encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c535205028bfec75ba7c58ea7e750ba3fff1633 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/params_model.py @@ -0,0 +1,12 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/preprocess.py b/dreamvoice/src/modules/speaker_encoder/encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/preprocess.py @@ -0,0 +1,177 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from multiprocess.pool import ThreadPool +from .params_data import * +from .config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. + """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. + if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + nationality.lower() in anglophone_nationalites] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." 
% + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/train.py b/dreamvoice/src/modules/speaker_encoder/encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..250d038a33b72d09dfe67811c917708aa0ea6714 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/train.py @@ -0,0 +1,127 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .visualizations import Visualizations +from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from .params_model import * +from .model import SpeakerEncoder +from .utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, + utterances_per_speaker, + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." % run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." 
% run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/__init__.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/argutils.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/argutils.py new file mode 100644 index 0000000000000000000000000000000000000000..6de50f3ec61f6b61798299726b13a1caa1638abb --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/argutils.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from pathlib import Path +import numpy as np +import argparse + +_type_priorities = [ # In decreasing order + Path, + str, + int, + float, + bool, +] + +def _priority(o): + p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) + if p is not None: + return p + p = next((i for i, t in 
enumerate(_type_priorities) if isinstance(o, t)), None) + if p is not None: + return p + return len(_type_priorities) + +def print_args(args: argparse.Namespace, parser=None): + args = vars(args) + if parser is None: + priorities = list(map(_priority, args.values())) + else: + all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] + priority = lambda p: all_params.index(p) if p in all_params else len(all_params) + priorities = list(map(priority, args.keys())) + + pad = max(map(len, args.keys())) + 3 + indices = np.lexsort((list(args.keys()), priorities)) + items = list(args.items()) + + print("Arguments:") + for i in indices: + param, value = items[i] + print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) + print("") + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/logmmse.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/logmmse.py new file mode 100644 index 0000000000000000000000000000000000000000..43de43e4c29821df5d20d8303ce491101a041a86 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/logmmse.py @@ -0,0 +1,222 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +import math +from scipy.special import expn +from collections import namedtuple + +NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") + + +def profile_noise(noise, sampling_rate, window_size=0): + """ + Creates a profile of the noise in a given waveform. + + :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. + :param sampling_rate: the sampling rate of the audio + :param window_size: the size of the window the logmmse algorithm operates on. A default value + will be picked if left as 0. + :return: a NoiseProfile object + """ + noise, dtype = to_float(noise) + noise += np.finfo(np.float64).eps + + if window_size == 0: + window_size = int(math.floor(0.02 * sampling_rate)) + + if window_size % 2 == 1: + window_size = window_size + 1 + + perc = 50 + len1 = int(math.floor(window_size * perc / 100)) + len2 = int(window_size - len1) + + win = np.hanning(window_size) + win = win * len2 / np.sum(win) + n_fft = 2 * window_size + + noise_mean = np.zeros(n_fft) + n_frames = len(noise) // window_size + for j in range(0, window_size * n_frames, window_size): + noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) + noise_mu2 = (noise_mean / n_frames) ** 2 + + return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) + + +def denoise(wav, noise_profile: NoiseProfile, eta=0.15): + """ + Cleans the noise from a speech waveform given a noise profile. The waveform must have the + same sampling rate as the one used to create the noise profile. + + :param wav: a speech waveform as a numpy array of floats or ints. + :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of + the same) waveform. + :param eta: voice threshold for noise update. While the voice activation detection value is + below this threshold, the noise profile will be continuously updated throughout the audio. + Set to 0 to disable updating the noise profile. + :return: the clean wav as a numpy array of floats or ints of the same length. 
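    Illustrative usage (editorial sketch, not part of the committed file; both arrays are
    hypothetical waveforms recorded at the same sampling rate):

        profile = profile_noise(noise_only_segment, 16000)
        cleaned = denoise(noisy_speech, profile)   # returns a waveform of the same length as the input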
+ """ + wav, dtype = to_float(wav) + wav += np.finfo(np.float64).eps + p = noise_profile + + nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) + x_final = np.zeros(nframes * p.len2) + + aa = 0.98 + mu = 0.98 + ksi_min = 10 ** (-25 / 10) + + x_old = np.zeros(p.len1) + xk_prev = np.zeros(p.len1) + noise_mu2 = p.noise_mu2 + for k in range(0, nframes * p.len2, p.len2): + insign = p.win * wav[k:k + p.window_size] + + spec = np.fft.fft(insign, p.n_fft, axis=0) + sig = np.absolute(spec) + sig2 = sig ** 2 + + gammak = np.minimum(sig2 / noise_mu2, 40) + + if xk_prev.all() == 0: + ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) + else: + ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) + ksi = np.maximum(ksi_min, ksi) + + log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) + vad_decision = np.sum(log_sigma_k) / p.window_size + if vad_decision < eta: + noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 + + a = ksi / (1 + ksi) + vk = a * gammak + ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) + hw = a * np.exp(ei_vk) + sig = sig * hw + xk_prev = sig ** 2 + xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) + xi_w = np.real(xi_w) + + x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] + x_old = xi_w[p.len1:p.window_size] + + output = from_float(x_final, dtype) + output = np.pad(output, (0, len(wav) - len(output)), mode="constant") + return output + + +## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that +## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of +## webrctvad +# def vad(wav, sampling_rate, eta=0.15, window_size=0): +# """ +# TODO: fix doc +# Creates a profile of the noise in a given waveform. +# +# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. +# :param sampling_rate: the sampling rate of the audio +# :param window_size: the size of the window the logmmse algorithm operates on. A default value +# will be picked if left as 0. +# :param eta: voice threshold for noise update. While the voice activation detection value is +# below this threshold, the noise profile will be continuously updated throughout the audio. +# Set to 0 to disable updating the noise profile. 
+# """ +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# if window_size == 0: +# window_size = int(math.floor(0.02 * sampling_rate)) +# +# if window_size % 2 == 1: +# window_size = window_size + 1 +# +# perc = 50 +# len1 = int(math.floor(window_size * perc / 100)) +# len2 = int(window_size - len1) +# +# win = np.hanning(window_size) +# win = win * len2 / np.sum(win) +# n_fft = 2 * window_size +# +# wav_mean = np.zeros(n_fft) +# n_frames = len(wav) // window_size +# for j in range(0, window_size * n_frames, window_size): +# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) +# noise_mu2 = (wav_mean / n_frames) ** 2 +# +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) +# vad = np.zeros(nframes * len2, dtype=np.bool) +# +# aa = 0.98 +# mu = 0.98 +# ksi_min = 10 ** (-25 / 10) +# +# xk_prev = np.zeros(len1) +# noise_mu2 = noise_mu2 +# for k in range(0, nframes * len2, len2): +# insign = win * wav[k:k + window_size] +# +# spec = np.fft.fft(insign, n_fft, axis=0) +# sig = np.absolute(spec) +# sig2 = sig ** 2 +# +# gammak = np.minimum(sig2 / noise_mu2, 40) +# +# if xk_prev.all() == 0: +# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) +# else: +# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) +# ksi = np.maximum(ksi_min, ksi) +# +# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) +# vad_decision = np.sum(log_sigma_k) / window_size +# if vad_decision < eta: +# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 +# print(vad_decision) +# +# a = ksi / (1 + ksi) +# vk = a * gammak +# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) +# hw = a * np.exp(ei_vk) +# sig = sig * hw +# xk_prev = sig ** 2 +# +# vad[k:k + len2] = vad_decision >= eta +# +# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") +# return vad + + +def to_float(_input): + if _input.dtype == np.float64: + return _input, _input.dtype + elif _input.dtype == np.float32: + return _input.astype(np.float64), _input.dtype + elif _input.dtype == np.uint8: + return (_input - 128) / 128., _input.dtype + elif _input.dtype == np.int16: + return _input / 32768., _input.dtype + elif _input.dtype == np.int32: + return _input / 2147483648., _input.dtype + raise ValueError('Unsupported wave file format') + + +def from_float(_input, dtype): + if dtype == np.float64: + return _input, np.float64 + elif dtype == np.float32: + return _input.astype(np.float32) + elif dtype == np.uint8: + return ((_input * 128) + 128).astype(np.uint8) + elif dtype == np.int16: + return (_input * 32768).astype(np.int16) + elif dtype == np.int32: + print(_input) + return (_input * 2147483648).astype(np.int32) + raise ValueError('Unsupported wave file format') diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/utils/profiler.py b/dreamvoice/src/modules/speaker_encoder/encoder/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f0176f632b58dfde15e31c04e79543b629bd4499 --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/utils/profiler.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from time import perf_counter as timer +from collections import OrderedDict +import numpy as np + + +class Profiler: + def __init__(self, summarize_every=5, disabled=False): + self.last_tick = timer() + self.logs = OrderedDict() + self.summarize_every = summarize_every + self.disabled = disabled + + def tick(self, name): + if 
self.disabled: + return + + # Log the time needed to execute that function + if not name in self.logs: + self.logs[name] = [] + if len(self.logs[name]) >= self.summarize_every: + self.summarize() + self.purge_logs() + self.logs[name].append(timer() - self.last_tick) + + self.reset_timer() + + def purge_logs(self): + for name in self.logs: + self.logs[name].clear() + + def reset_timer(self): + self.last_tick = timer() + + def summarize(self): + n = max(map(len, self.logs.values())) + assert n == self.summarize_every + print("\nAverage execution time over %d steps:" % n) + + name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] + pad = max(map(len, name_msgs)) + for name_msg, deltas in zip(name_msgs, self.logs.values()): + print(" %s mean: %4.0fms std: %4.0fms" % + (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) + print("", flush=True) + \ No newline at end of file diff --git a/dreamvoice/src/modules/speaker_encoder/encoder/visualizations.py b/dreamvoice/src/modules/speaker_encoder/encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b0ffc1f3c54d85158521cac6d09f05dd21de6d --- /dev/null +++ b/dreamvoice/src/modules/speaker_encoder/encoder/visualizations.py @@ -0,0 +1,180 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + self.losses = [] + self.eers = [] + print("Updating the visualizations every %d steps." % update_every) + + # If visdom is disabled TODO: use a better paradigm for that + self.disabled = disabled + if self.disabled: + return + + # Set the environment name + now = str(datetime.now().strftime("%d-%m %Hh%M")) + if env_name is None: + self.env_name = now + else: + self.env_name = "%s (%s)" % (env_name, now) + + # Connect to visdom and open the corresponding window in the browser + try: + self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) + except ConnectionError: + raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " + "start it.") + # webbrowser.open("http://localhost:8097/env/" + self.env_name) + + # Create the windows + self.loss_win = None + self.eer_win = None + # self.lr_win = None + self.implementation_win = None + self.projection_win = None + self.implementation_string = "" + + def log_params(self): + if self.disabled: + return + from encoder import params_data + from encoder import params_model + param_string = "Model parameters:
" + for param_name in (p for p in dir(params_model) if not p.startswith("__")): + value = getattr(params_model, param_name) + param_string += "\t%s: %s
" % (param_name, value) + param_string += "Data parameters:
" + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + param_string += "\t%s: %s
" % (param_name, value) + self.vis.text(param_string, opts={"title": "Parameters"}) + + def log_dataset(self, dataset: SpeakerVerificationDataset): + if self.disabled: + return + dataset_string = "" + dataset_string += "Speakers: %s\n" % len(dataset.speakers) + dataset_string += "\n" + dataset.get_logs() + dataset_string = dataset_string.replace("\n", "
") + self.vis.text(dataset_string, opts={"title": "Dataset"}) + + def log_implementation(self, params): + if self.disabled: + return + implementation_string = "" + for param, value in params.items(): + implementation_string += "%s: %s\n" % (param, value) + implementation_string = implementation_string.replace("\n", "
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/src/plugin_wrapper.py b/dreamvoice/src/plugin_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5 --- /dev/null +++ b/dreamvoice/src/plugin_wrapper.py @@ -0,0 +1,76 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.p2e_cross import P2E_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class DreamVG(object): + def __init__(self, + config_path='configs/plugin_cross.yaml', + ckpt_path='../ckpts/dreamvc_plugin.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = P2E_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + 
self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.spk_shape = config['model']['unet']['in_channels'] + + @torch.no_grad() + def inference(self, text, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + ): + text, text_mask = text + self.model.eval() + + gen_shape = (1, self.spk_shape) + + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, text, text_mask, train_cfg=False) + output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, text, text_mask, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 1/self.scale, self.shift) + # pred = torch.clip(pred, min=0.0, max=0.5) + return pred \ No newline at end of file diff --git a/dreamvoice/src/train_plugin.py b/dreamvoice/src/train_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/train_vc.py b/dreamvoice/src/train_vc.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py b/dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/dreamvoice/src/utils/.ipynb_checkpoints/__init__-checkpoint.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py b/dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a --- /dev/null +++ b/dreamvoice/src/utils/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output + + +def save_plot(tensor, savepath): + tensor = tensor.squeeze().cpu() + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor + + +if __name__ == "__main__": + + a = torch.rand(2, 10) + target_len = 15 + + b = align_seq(a, target_len) \ No newline at end of file diff --git a/dreamvoice/src/utils/__init__.py b/dreamvoice/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/dreamvoice/src/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc b/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..405d01af1c119ddc6e4d9d75f19f83ffd18ade03 Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc b/dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16806b791a4fd0955747481eb9aeae12108cec3a Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc b/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36df2ec1198ddc4695a17a082bc6340e8e7d4fe8 Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/utils.cpython-310.pyc differ diff --git a/dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc b/dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ebbb3f58af0e9432e9c295fec282ecbe4f78f90f Binary files /dev/null and b/dreamvoice/src/utils/__pycache__/utils.cpython-311.pyc differ diff --git a/dreamvoice/src/utils/utils.py b/dreamvoice/src/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a --- /dev/null +++ b/dreamvoice/src/utils/utils.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output + + +def save_plot(tensor, savepath): + tensor = tensor.squeeze().cpu() + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor + + +if __name__ == "__main__": + + a = torch.rand(2, 10) + target_len = 15 + + b = align_seq(a, target_len) \ No newline at end of file diff --git a/dreamvoice/src/vc_wrapper.py b/dreamvoice/src/vc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3 --- /dev/null +++ b/dreamvoice/src/vc_wrapper.py @@ -0,0 +1,144 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.model import DiffVC +from .model.model_cross import DiffVC_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class ReDiffVC(object): + def __init__(self, + config_path='configs/diffvc_base.yaml', + ckpt_path='../ckpts/dreamvc_base.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + 
self.model = DiffVC(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + spk_embed, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + + +class DreamVC(object): + def __init__(self, + config_path='configs/diffvc_cross.yaml', + ckpt_path='../ckpts/dreamvc_cross.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + text, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + text, text_mask = text + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, 
device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + diff --git a/example.py b/example.py new file mode 100644 index 0000000000000000000000000000000000000000..522acf6d173f802bc1bebb0aee107658d08508a1 --- /dev/null +++ b/example.py @@ -0,0 +1,47 @@ +from dreamvoice import DreamVoice + +# Plugin mode (DreamVG + ReDiffVC) +# Initialize DreamVoice in plugin mode with CUDA device +dreamvoice = DreamVoice(mode='plugin', device='cuda') +# Description of the target voice +prompt = 'young female voice, sounds young and cute' +# Provide the path to the content audio and generate the converted audio +gen_audio, sr = dreamvoice.genvc('examples/test1.wav', prompt) +# Save the converted audio +dreamvoice.save_audio('gen1.wav', gen_audio, sr) + +# Save the speaker embedding if you like the generated voice +dreamvoice.save_spk_embed('voice_stash1.pt') +# Load the saved speaker embedding +dreamvoice.load_spk_embed('voice_stash1.pt') +# Use the saved speaker embedding for another audio sample +gen_audio2, sr = dreamvoice.simplevc('examples/test2.wav', use_spk_cache=True) +dreamvoice.save_audio('gen2.wav', gen_audio2, sr) + + +# End-to-end mode (DreamVC) +# Initialize DreamVoice in end-to-end mode with CUDA device +dreamvoice = DreamVoice(mode='end2end', device='cuda') +# Provide the path to the content audio and generate the converted audio +gen_end2end, sr = dreamvoice.genvc('examples/test1.wav', prompt) +# Save the converted audio +dreamvoice.save_audio('gen_end2end.wav', gen_end2end, sr) + +# Note: End-to-end mode does not support saving speaker embeddings +# To use a voice generated in end-to-end mode, switch back to plugin mode +# and extract the speaker embedding from the generated audio +# Switch back to plugin mode +dreamvoice = DreamVoice(mode='plugin', device='cuda') +# Load the speaker audio from the previously generated file +gen_end2end2, sr = dreamvoice.simplevc('examples/test2.wav', speaker_audio='gen_end2end.wav') +# Save the new converted audio +dreamvoice.save_audio('gen_end2end2.wav', gen_end2end2, sr) + + +# Traditional VC +# Plugin mode can be used for traditional one-shot voice conversion +dreamvoice = DreamVoice(mode='plugin', device='cuda') +# Generate audio using traditional one-shot voice conversion +gen_tradition, sr = dreamvoice.simplevc('examples/test1.wav', speaker_audio='examples/speaker.wav') +# Save the converted audio +dreamvoice.save_audio('gen_tradition.wav', gen_tradition, sr) diff --git a/examples/speaker.wav b/examples/speaker.wav new file mode 100644 index 0000000000000000000000000000000000000000..8e9057806844aca84ae455b519d4f3e836b2c6d9 Binary files /dev/null and b/examples/speaker.wav differ diff --git 
a/examples/test1.wav b/examples/test1.wav new file mode 100644 index 0000000000000000000000000000000000000000..1384ae82d0794281542b46ed638f4dd17004df46 Binary files /dev/null and b/examples/test1.wav differ diff --git a/examples/test2.wav b/examples/test2.wav new file mode 100644 index 0000000000000000000000000000000000000000..3a47e034433c59b33fa5fae82ed15dfd9b71ae99 Binary files /dev/null and b/examples/test2.wav differ
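
For reference, a minimal standalone sketch of the classifier-free guidance step that DreamVG, ReDiffVC, and DreamVC all share, paired with the rescaling formula from dreamvoice/src/utils/utils.py. This is an illustrative sketch only: the tensor shape and the random tensors standing in for the conditional and unconditional model outputs are assumptions for demonstration, not values taken from the repo's configs or checkpoints.

# Illustrative sketch (assumptions: random stand-in tensors, arbitrary shape).
import torch


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    # Same formula as dreamvoice/src/utils/utils.py (arXiv:2305.08891, Sec. 3.4):
    # rescale the guided prediction toward the std of the conditional prediction.
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg


# Placeholder model outputs; in the wrappers these come from the conditional and
# unconditional (cfg) forward passes of the diffusion model at each DDIM step.
output_text = torch.randn(1, 1, 100, 500)
output_uncond = torch.randn(1, 1, 100, 500)

guidance_scale, guidance_rescale = 3, 0.7
# Classifier-free guidance combination used in plugin_wrapper.py / vc_wrapper.py.
output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
if guidance_rescale > 0.0:
    output_pred = rescale_noise_cfg(output_pred, output_text, guidance_rescale=guidance_rescale)
print(output_pred.shape)  # torch.Size([1, 1, 100, 500])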