0308-022448-Synchronize_GitHub_update_improve_inference_speed
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- AR/__pycache__/__init__.cpython-310.pyc +0 -0
- AR/data/bucket_sampler.py +2 -1
- AR/data/data_module.py +4 -2
- AR/data/dataset.py +2 -1
- AR/models/__pycache__/__init__.cpython-310.pyc +0 -0
- AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc +0 -0
- AR/models/__pycache__/t2s_model.cpython-310.pyc +0 -0
- AR/models/__pycache__/utils.cpython-310.pyc +0 -0
- AR/models/t2s_lightning_module.py +4 -3
- AR/models/t2s_lightning_module_onnx.py +2 -1
- AR/models/t2s_model.py +165 -44
- AR/models/t2s_model_onnx.py +2 -1
- AR/models/utils.py +72 -3
- AR/modules/__pycache__/__init__.cpython-310.pyc +0 -0
- AR/modules/__pycache__/activation.cpython-310.pyc +0 -0
- AR/modules/__pycache__/embedding.cpython-310.pyc +0 -0
- AR/modules/__pycache__/lr_schedulers.cpython-310.pyc +0 -0
- AR/modules/__pycache__/optim.cpython-310.pyc +0 -0
- AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc +0 -0
- AR/modules/__pycache__/scaling.cpython-310.pyc +0 -0
- AR/modules/__pycache__/transformer.cpython-310.pyc +0 -0
- AR/modules/lr_schedulers.py +2 -1
- AR/modules/patched_mha_with_cache.py +4 -2
- AR/modules/scaling.py +1 -1
- AR/text_processing/phonemizer.py +2 -1
- AR/text_processing/symbols.py +2 -1
- MODELS/21/1.mp3 +0 -0
- MODELS/21/11.mp3 +0 -0
- MODELS/21/191.mp3 +0 -0
- MODELS/21/21.ckpt +0 -3
- MODELS/21/21.pth +0 -3
- MODELS/21/s1.mp3 +0 -0
- MODELS/21/s2.mp3 +0 -0
- MODELS/21/s3.mp3 +0 -0
- MODELS/22/22.ckpt +0 -3
- MODELS/22/22.pth +0 -3
- MODELS/22/passion.mp3 +0 -0
- MODELS/22/s1.mp3 +0 -0
- MODELS/22/s2.mp3 +0 -0
- MODELS/22/s3.mp3 +0 -0
- MODELS/22/slow_calm.mp3 +0 -0
- MODELS/22/speed.mp3 +0 -0
- MODELS/31/1.mp3 +0 -0
- MODELS/31/148.mp3 +0 -0
- MODELS/31/31.ckpt +0 -3
- MODELS/31/31.pth +0 -3
- MODELS/31/96.mp3 +0 -0
- MODELS/31/s1.mp3 +0 -0
- MODELS/31/s2.mp3 +0 -0
- MODELS/31/s3.mp3 +0 -0
AR/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/AR/__pycache__/__init__.cpython-310.pyc and b/AR/__pycache__/__init__.cpython-310.pyc differ
AR/data/bucket_sampler.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py
+# reference: https://github.com/lifeiteng/vall-e
 import itertools
 import math
 import random
AR/data/data_module.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
+# reference: https://github.com/lifeiteng/vall-e
 from pytorch_lightning import LightningDataModule
 from AR.data.bucket_sampler import DistributedBucketSampler
 from AR.data.dataset import Text2SemanticDataset
@@ -41,7 +42,8 @@ class Text2SemanticDataModule(LightningDataModule):
         # pad_val=self.config['data']['pad_val'])
 
     def train_dataloader(self):
-        batch_size = self.config["train"]["batch_size"]
+        batch_size = self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"]
+        batch_size = max(min(batch_size,len(self._train_dataset)//4),1)  # guard against nothing being saved
         sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
         return DataLoader(
             self._train_dataset,
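The halving above reflects that the DPO forward pass (added to `AR/models/t2s_model.py` below) runs the decoder twice per batch, once on the real targets and once on synthetic rejected targets, roughly doubling memory per sample; the clamp keeps the sampler from ending up with zero batches. A minimal sketch of the same arithmetic, with made-up config values and dataset length:

```python
# Minimal sketch of the batch-size logic above; the config values and
# dataset length are made-up illustrations, not values from this repo.
config = {"train": {"batch_size": 8, "if_dpo": True}}
train_dataset_len = 10  # stand-in for len(self._train_dataset)

# DPO runs a second forward pass on the rejected sequence, so halve the batch.
batch_size = (
    config["train"]["batch_size"] // 2
    if config["train"].get("if_dpo", False)
    else config["train"]["batch_size"]
)
# Clamp: at most a quarter of the dataset, but never below 1, so the
# sampler always yields batches and checkpoints still get saved.
batch_size = max(min(batch_size, train_dataset_len // 4), 1)
print(batch_size)  # -> 2
```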
AR/data/dataset.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py
+# reference: https://github.com/lifeiteng/vall-e
 import pdb
 import sys
 
AR/models/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/AR/models/__pycache__/__init__.cpython-310.pyc and b/AR/models/__pycache__/__init__.cpython-310.pyc differ

AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc CHANGED
Binary files a/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc and b/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc differ

AR/models/__pycache__/t2s_model.cpython-310.pyc CHANGED
Binary files a/AR/models/__pycache__/t2s_model.cpython-310.pyc and b/AR/models/__pycache__/t2s_model.cpython-310.pyc differ

AR/models/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/AR/models/__pycache__/utils.cpython-310.pyc and b/AR/models/__pycache__/utils.cpython-310.pyc differ
AR/models/t2s_lightning_module.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
+# reference: https://github.com/lifeiteng/vall-e
 import os, sys
 
 now_dir = os.getcwd()
@@ -11,7 +12,6 @@ from AR.models.t2s_model import Text2SemanticDecoder
 from AR.modules.lr_schedulers import WarmupCosineLRSchedule
 from AR.modules.optim import ScaledAdam
 
-
 class Text2SemanticLightningModule(LightningModule):
     def __init__(self, config, output_dir, is_train=True):
         super().__init__()
@@ -35,7 +35,8 @@ class Text2SemanticLightningModule(LightningModule):
     def training_step(self, batch: Dict, batch_idx: int):
        opt = self.optimizers()
        scheduler = self.lr_schedulers()
-        loss, acc = self.model.forward(
+        forward = self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old
+        loss, acc = forward(
             batch["phoneme_ids"],
             batch["phoneme_ids_len"],
             batch["semantic_ids"],
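With this dispatch, existing configs that never set `if_dpo` silently keep the original cross-entropy path (`forward_old`). A stand-in sketch of the selection pattern; the `Model` class and `config` dict here are hypothetical, only the dispatch mirrors the diff:

```python
# Hypothetical stand-ins illustrating the if_dpo dispatch in training_step.
class Model:
    def forward(self, batch):      # DPO path: CE loss + preference loss
        return "ce + dpo"
    def forward_old(self, batch):  # original path: CE loss only
        return "ce"

config = {"train": {"if_dpo": False}}
model = Model()

# .get with a False default means old configs are untouched by the new flag.
forward = model.forward if config["train"].get("if_dpo", False) else model.forward_old
print(forward({}))  # -> "ce"
```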
AR/models/t2s_lightning_module_onnx.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
+# reference: https://github.com/lifeiteng/vall-e
 import os, sys
 
 now_dir = os.getcwd()
AR/models/t2s_model.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
+# reference: https://github.com/lifeiteng/vall-e
 import torch
 from tqdm import tqdm
 
@@ -8,6 +9,9 @@ from AR.models.utils import (
     sample,
     logits_to_probs,
     multinomial_sample_one_no_sync,
+    dpo_loss,
+    make_reject_y,
+    get_batch_logps
 )
 from AR.modules.embedding import SinePositionalEmbedding
 from AR.modules.embedding import TokenEmbedding
@@ -85,11 +89,104 @@ class Text2SemanticDecoder(nn.Module):
             ignore_index=self.EOS,
         )
 
+    def make_input_data(self, x, x_lens, y, y_lens, bert_feature):
+        x = self.ar_text_embedding(x)
+        x = x + self.bert_proj(bert_feature.transpose(1, 2))
+        x = self.ar_text_position(x)
+        x_mask = make_pad_mask(x_lens)
+
+        y_mask = make_pad_mask(y_lens)
+        y_mask_int = y_mask.type(torch.int64)
+        codes = y.type(torch.int64) * (1 - y_mask_int)
+
+        # Training
+        # AR Decoder
+        y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS)
+        x_len = x_lens.max()
+        y_len = y_lens.max()
+        y_emb = self.ar_audio_embedding(y)
+        y_pos = self.ar_audio_position(y_emb)
+
+        xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
+
+        ar_xy_padding_mask = xy_padding_mask
+
+        x_attn_mask = F.pad(
+            torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device),
+            (0, y_len),
+            value=True,
+        )
+
+        y_attn_mask = F.pad(
+            torch.triu(
+                torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
+                diagonal=1,
+            ),
+            (x_len, 0),
+            value=False,
+        )
+
+        xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
+        bsz, src_len = x.shape[0], x_len + y_len
+        _xy_padding_mask = (
+            ar_xy_padding_mask.view(bsz, 1, 1, src_len)
+            .expand(-1, self.num_head, -1, -1)
+            .reshape(bsz * self.num_head, 1, src_len)
+        )
+        xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask)
+        new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
+        new_attn_mask.masked_fill_(xy_attn_mask, float("-inf"))
+        xy_attn_mask = new_attn_mask
+        # x and the complete y are fed to the model in a single pass
+        xy_pos = torch.concat([x, y_pos], dim=1)
+
+        return xy_pos, xy_attn_mask, targets
+
     def forward(self, x, x_lens, y, y_lens, bert_feature):
         """
         x: phoneme_ids
        y: semantic_ids
         """
+
+        reject_y, reject_y_lens = make_reject_y(y, y_lens)
+
+        xy_pos, xy_attn_mask, targets = self.make_input_data(x, x_lens, y, y_lens, bert_feature)
+
+        xy_dec, _ = self.h(
+            (xy_pos, None),
+            mask=xy_attn_mask,
+        )
+        x_len = x_lens.max()
+        logits = self.ar_predict_layer(xy_dec[:, x_len:])
+
+        ###### DPO #############
+        reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(x, x_lens, reject_y, reject_y_lens, bert_feature)
+
+        reject_xy_dec, _ = self.h(
+            (reject_xy_pos, None),
+            mask=reject_xy_attn_mask,
+        )
+        x_len = x_lens.max()
+        reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len:])
+
+        # loss
+        # from feiteng: more duration per step should mean more gradient update, hence sum
+
+        loss_1 = F.cross_entropy(logits.permute(0, 2, 1), targets, reduction="sum")
+        acc = self.ar_accuracy_metric(logits.permute(0, 2, 1).detach(), targets).item()
+
+        A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets)
+        loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True)
+
+        loss = loss_1 + loss_2
+
+        return loss, acc
+
+    def forward_old(self, x, x_lens, y, y_lens, bert_feature):
+        """
+        x: phoneme_ids
+        y: semantic_ids
+        """
         x = self.ar_text_embedding(x)
         x = x + self.bert_proj(bert_feature.transpose(1, 2))
         x = self.ar_text_position(x)
@@ -231,6 +328,7 @@ class Text2SemanticDecoder(nn.Module):
         prompts,  #### reference-audio tokens
         bert_feature,
         top_k: int = -100,
+        top_p: int = 100,
         early_stop_num: int = -1,
         temperature: float = 1.0,
     ):
@@ -240,7 +338,7 @@ class Text2SemanticDecoder(nn.Module):
 
         # AR Decoder
         y = prompts
-        prefix_len = y.shape[1]
+
         x_len = x.shape[1]
         x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
         stop = False
@@ -256,47 +354,41 @@ class Text2SemanticDecoder(nn.Module):
             "first_infer": 1,
             "stage": 0,
         }
-            …
-                [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1
-            )
-            cache["y_emb"] = y_emb
+        ################### first step ##########################
+        if y is not None:
+            y_emb = self.ar_audio_embedding(y)
+            y_len = y_emb.shape[1]
+            prefix_len = y.shape[1]
             y_pos = self.ar_audio_position(y_emb)
-            …
-            y_len = …
-            …
+            xy_pos = torch.concat([x, y_pos], dim=1)
+            cache["y_emb"] = y_emb
+            ref_free = False
+        else:
+            y_emb = None
+            y_len = 0
+            prefix_len = 0
+            y_pos = None
+            xy_pos = x
+            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
+            ref_free = True
+
+        x_attn_mask_pad = F.pad(
             x_attn_mask,
             (0, y_len),  ### extend xx's all-zero block to all-zero xx plus all-one xy, (x, x+y)
             value=True,
         )
-            …
-            ### bottom row (this one is correct)
-            xy_attn_mask = torch.zeros(
-                (1, x_len + y_len), dtype=torch.bool, device=xy_pos.device
-            )
-            # pdb.set_trace()
-            ### the caching is the key part
-            # print(1111,xy_pos.shape,xy_attn_mask.shape,x_len,y_len)
+        y_attn_mask = F.pad(  ### extend yy's upper-right 1s into the xy 0s on the left, (y, x+y)
+            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
+            (x_len, 0),
+            value=False,
+        )
+        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
+            x.device
+        )
+
+        for idx in tqdm(range(1500)):
+
             xy_dec, _ = self.h((xy_pos, None), mask=xy_attn_mask, cache=cache)
             logits = self.ar_predict_layer(
                 xy_dec[:, -1]
@@ -305,8 +397,12 @@ class Text2SemanticDecoder(nn.Module):
             if(idx==0):  ### the first step must not emit EOS, or nothing is generated
                 logits = logits[:, :-1]  ### drop the probability of the 1024 stop token
             samples = sample(
-                logits[0], y, top_k=top_k, top_p=…
+                logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
             )[0].unsqueeze(0)
+            # the semantic_ids generated this step plus the previous y form the new y
+            # print(samples.shape)#[1,1]# the first 1 is bs
+            y = torch.concat([y, samples], dim=1)
+
             if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
                 print("use early stop num:", early_stop_num)
                 stop = True
@@ -315,13 +411,38 @@ class Text2SemanticDecoder(nn.Module):
                 # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
                 stop = True
             if stop:
-                if prompts.shape[1] == y.shape[1]:
+                # if prompts.shape[1] == y.shape[1]:
+                #     y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                #     print("bad zero prediction")
+                if y.shape[1]==0:
                     y = torch.concat([y, torch.zeros_like(samples)], dim=1)
                     print("bad zero prediction")
                 print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
                 break
-            …
-            y = torch.concat([y, samples], dim=1)
+
+            ####################### update next step ###################################
             cache["first_infer"] = 0
-            …
+            if cache["y_emb"] is not None:
+                y_emb = torch.cat(
+                    [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], dim = 1
+                )
+                cache["y_emb"] = y_emb
+                y_pos = self.ar_audio_position(y_emb)
+                xy_pos = y_pos[:, -1:]
+            else:
+                y_emb = self.ar_audio_embedding(y[:, -1:])
+                cache["y_emb"] = y_emb
+                y_pos = self.ar_audio_position(y_emb)
+                xy_pos = y_pos
+                y_len = y_pos.shape[1]
+
+            ### rightmost column (this one is wrong)
+            # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
+            # xy_attn_mask[:,-1]=False
+            ### bottom row (this one is correct)
+            xy_attn_mask = torch.zeros(
                (1, x_len + y_len), dtype=torch.bool, device=xy_pos.device
+            )
+        if ref_free:
+            return y[:, :-1], 0
+        return y[:, :-1], idx-1
AR/models/t2s_model_onnx.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
+# reference: https://github.com/lifeiteng/vall-e
 import torch
 from tqdm import tqdm
 
AR/models/utils.py CHANGED
@@ -1,7 +1,8 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py
+# reference: https://github.com/lifeiteng/vall-e
 import torch
 import torch.nn.functional as F
-
+from typing import Tuple
 
 def sequence_mask(length, max_length=None):
     if max_length is None:
@@ -114,7 +115,8 @@ def logits_to_probs(
     top_p: Optional[int] = None,
     repetition_penalty: float = 1.0,
 ):
-    previous_tokens = previous_tokens.squeeze()
+    if previous_tokens is not None:
+        previous_tokens = previous_tokens.squeeze()
     # print(logits.shape,previous_tokens.shape)
     # pdb.set_trace()
     if previous_tokens is not None and repetition_penalty != 1.0:
@@ -158,3 +160,70 @@ def sample(
     )
     idx_next = multinomial_sample_one_no_sync(probs)
     return idx_next, probs
+
+def dpo_loss(policy_chosen_logps: torch.FloatTensor,
+             policy_rejected_logps: torch.FloatTensor,
+             reference_chosen_logps: torch.FloatTensor,
+             reference_rejected_logps: torch.FloatTensor,
+             beta: float,
+             reference_free: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+    pi_logratios = policy_chosen_logps - policy_rejected_logps
+    ref_logratios = reference_chosen_logps - reference_rejected_logps
+
+    if reference_free:
+        ref_logratios = 0
+
+    logits = pi_logratios - ref_logratios
+
+    losses = -F.logsigmoid(beta * logits)
+    chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach()
+    rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach()
+
+    return losses.mean(), chosen_rewards, rejected_rewards
+
+def get_batch_logps(logits_target: torch.FloatTensor, logits_reject: torch.FloatTensor, labels_target: torch.LongTensor, labels_reject: torch.LongTensor, average_log_prob: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+
+    # dummy token; we'll ignore the losses on these tokens later
+
+    per_token_logps_target = torch.gather(logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2)).squeeze(2)
+    per_token_logps_reject = torch.gather(logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2)).squeeze(2)
+
+    return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1)
+
+def make_reject_y(y_o, y_lens):
+    def repeat_P(y):
+        range_idx, _ = torch.randint(0, len(y), size=(2,)).sort()
+        pre = y[:range_idx[0]]
+        shf = y[range_idx[1]:]
+        range_text = y[range_idx[0]:range_idx[1]]
+        new_y = torch.cat([pre, range_text, range_text, shf])
+        return new_y
+    def lost_P(y):
+        range_idx, _ = torch.randint(0, len(y), size=(2,)).sort()
+        pre = y[:range_idx[0]]
+        shf = y[range_idx[1]:]
+        range_text = y[range_idx[0]:range_idx[1]]
+        new_y = torch.cat([pre, shf])
+        return new_y
+    bs = len(y_lens)
+    reject_y = []
+    reject_y_lens = []
+    for b in range(bs):
+        process_item_idx = torch.randint(0, 1, size=(1,))[0]
+        if process_item_idx == 0:
+            new_y = repeat_P(y_o[b])
+            reject_y.append(new_y)
+            reject_y_lens.append(len(new_y))
+        elif process_item_idx == 1:
+            new_y = lost_P(y_o[b])
+            reject_y.append(new_y)
+            reject_y_lens.append(len(new_y))
+    max_length = max(reject_y_lens)
+    for b in range(bs):
+        pad_length = max_length - reject_y_lens[b]
+        reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0)
+
+    reject_y = torch.stack(reject_y, dim = 0)
+    reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device)
+
+    return reject_y, reject_y_lens
AR/modules/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/__init__.cpython-310.pyc and b/AR/modules/__pycache__/__init__.cpython-310.pyc differ

AR/modules/__pycache__/activation.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/activation.cpython-310.pyc and b/AR/modules/__pycache__/activation.cpython-310.pyc differ

AR/modules/__pycache__/embedding.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/embedding.cpython-310.pyc and b/AR/modules/__pycache__/embedding.cpython-310.pyc differ

AR/modules/__pycache__/lr_schedulers.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc and b/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc differ

AR/modules/__pycache__/optim.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/optim.cpython-310.pyc and b/AR/modules/__pycache__/optim.cpython-310.pyc differ

AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc and b/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc differ

AR/modules/__pycache__/scaling.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/scaling.cpython-310.pyc and b/AR/modules/__pycache__/scaling.cpython-310.pyc differ

AR/modules/__pycache__/transformer.cpython-310.pyc CHANGED
Binary files a/AR/modules/__pycache__/transformer.cpython-310.pyc and b/AR/modules/__pycache__/transformer.cpython-310.pyc differ
AR/modules/lr_schedulers.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
+# reference: https://github.com/lifeiteng/vall-e
 import math
 
 import torch
AR/modules/patched_mha_with_cache.py CHANGED
@@ -5,8 +5,8 @@ from torch.nn.functional import (
     _none_or_dtype,
     _in_projection_packed,
 )
-…
-…
+from torch.nn import functional as F
+import torch
 # Tensor = torch.Tensor
 # from typing import Callable, List, Optional, Tuple, Union
 
@@ -448,9 +448,11 @@ def multi_head_attention_forward_patched(
     k = k.view(bsz, num_heads, src_len, head_dim)
     v = v.view(bsz, num_heads, src_len, head_dim)
 
+    # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
     attn_output = scaled_dot_product_attention(
         q, k, v, attn_mask, dropout_p, is_causal
     )
+
     attn_output = (
         attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
     )
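Routing the patched attention through `scaled_dot_product_attention` is the other half of the speedup: PyTorch can dispatch to a fused FlashAttention or memory-efficient kernel instead of materializing the full attention matrix, and the commented-out `sdp_kernel` context manager shows how the backend choice could be pinned. A minimal sketch checking that SDPA matches the unfused reference path (the shapes are arbitrary examples):

```python
# SDPA computes softmax(QK^T / sqrt(d)) V, the same result as the
# unfused path, but can dispatch to fused kernels on supported hardware.
import math
import torch
import torch.nn.functional as F

q = torch.randn(2, 4, 16, 8)  # (batch, heads, seq, head_dim)
k = torch.randn(2, 4, 16, 8)
v = torch.randn(2, 4, 16, 8)

fused = F.scaled_dot_product_attention(q, k, v)

# Reference implementation the fused call replaces.
scores = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
manual = scores.softmax(dim=-1) @ v

print(torch.allclose(fused, manual, atol=1e-5))  # True
```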
AR/modules/scaling.py CHANGED
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-…
+import logging
 import math
 import random
 from typing import Optional
AR/text_processing/phonemizer.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
+# reference: https://github.com/lifeiteng/vall-e
 import itertools
 import re
 from typing import Dict
AR/text_processing/symbols.py CHANGED
@@ -1,4 +1,5 @@
-# modified from https://github.com/
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
+# reference: https://github.com/lifeiteng/vall-e
 PAD = "_"
 PUNCTUATION = ';:,.!?¡¿—…"«»“” '
 LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
MODELS/21/1.mp3 DELETED
Binary file (30.9 kB)

MODELS/21/11.mp3 DELETED
Binary file (28 kB)

MODELS/21/191.mp3 DELETED
Binary file (29.5 kB)

MODELS/21/21.ckpt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c4b29bb398a9dbed95c50489a2633f90a01c0c4ae1e4432f5d37d388401f9887
-size 155077753

MODELS/21/21.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bfb359648e858765e9c1e3f7d51869aec9f607d18efd90d059cb83f1a7988141
-size 84927748

MODELS/21/s1.mp3 DELETED
Binary file (29 kB)

MODELS/21/s2.mp3 DELETED
Binary file (29 kB)

MODELS/21/s3.mp3 DELETED
Binary file (28.5 kB)

MODELS/22/22.ckpt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c3632e3d1876f7a8e86850f346338c5e2390d09f382891277acf77a4e1a65a25
-size 155083315

MODELS/22/22.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3dfe7fe2765b179db75d8e12bd2b32e1f8d624dcee9a3fecdecbc94904757c29
-size 84927982

MODELS/22/passion.mp3 DELETED
Binary file (131 kB)

MODELS/22/s1.mp3 DELETED
Binary file (26.8 kB)

MODELS/22/s2.mp3 DELETED
Binary file (33.1 kB)

MODELS/22/s3.mp3 DELETED
Binary file (30.2 kB)

MODELS/22/slow_calm.mp3 DELETED
Binary file (79.2 kB)

MODELS/22/speed.mp3 DELETED
Binary file (122 kB)

MODELS/31/1.mp3 DELETED
Binary file (111 kB)

MODELS/31/148.mp3 DELETED
Binary file (86.8 kB)

MODELS/31/31.ckpt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:532d92b5b2a1550ed1151aa2d0a801a2fc390fc7b87a7d0278ca7af4cad50c7f
-size 155084485

MODELS/31/31.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e3d128cd00c3853ebe375dd5aeccd979c55a7e8d036cc41843507e2191ccd6d3
-size 84929396

MODELS/31/96.mp3 DELETED
Binary file (83.4 kB)

MODELS/31/s1.mp3 DELETED
Binary file (32.2 kB)

MODELS/31/s2.mp3 DELETED
Binary file (43 kB)

MODELS/31/s3.mp3 DELETED
Binary file (39.1 kB)