将一个batch中的padding策略，从padding on right改为了padding on left。

2025-10-08 07:49:59 +08:00 · 2024-04-12 00:49:44 +08:00 · 2024-04-12 00:49:44 +08:00 · 5fe9842069
commit 5fe9842069
parent 3706ad1b8b
8 changed files with 569 additions and 125 deletions
--- a/GPT_SoVITS/AR/data/data_module.py
+++ b/GPT_SoVITS/AR/data/data_module.py
@ -32,6 +32,7 @@ class Text2SemanticDataModule(LightningDataModule):
            semantic_path=self.train_semantic_path,
            max_sec=self.config["data"]["max_sec"],
            pad_val=self.config["data"]["pad_val"],
+            padding_on_left=self.config["train"]["padding_on_left"],
        )
        self._dev_dataset = self._train_dataset
        # self._dev_dataset = Text2SemanticDataset(
--- a/GPT_SoVITS/AR/data/dataset.py
+++ b/GPT_SoVITS/AR/data/dataset.py
@ -55,9 +55,10 @@ class Text2SemanticDataset(Dataset):
        min_ps_ratio: int = 3,
        # max value of phoneme/sec
        max_ps_ratio: int = 25,
+        padding_on_left:bool=False,
    ) -> None:
        super().__init__()
-
+        self.padding_on_left=padding_on_left
        self.semantic_data = pd.read_csv(
            semantic_path, delimiter="\t", encoding="utf-8"
        )
@ -164,7 +165,9 @@ class Text2SemanticDataset(Dataset):
            # if len(semantic_ids) > 1000:###########3
            #     num_deleted_bigger += 1
            #     continue
-
+            if (len(semantic_ids)+len(phoneme_ids)) > 1000:###########3
+                num_deleted_bigger += 1
+                continue
            ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz)

            if (
@ -173,6 +176,7 @@ class Text2SemanticDataset(Dataset):
                num_deleted_ps += 1
                # print(item_name)
                continue
+            idx_len=[]
            
            self.semantic_phoneme.append((semantic_ids, phoneme_ids))
            idx += 1
@ -253,8 +257,8 @@ class Text2SemanticDataset(Dataset):
        phoneme_ids_lens: List[int] = []
        semantic_ids: List[torch.Tensor] = []
        semantic_ids_lens: List[int] = []
-        # return

+        if not self.padding_on_left:
            for item in examples:
                sample_index.append(item["idx"])
                phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64))
@ -294,6 +298,33 @@ class Text2SemanticDataset(Dataset):
                "bert_feature": bert_padded,
            }

+        else:
+            for item in examples:
+                sample_index.append(item["idx"])
+                phoneme_ids.append(torch.LongTensor(np.array(item["phoneme_ids"], dtype=np.int64)))
+                semantic_ids.append(torch.LongTensor(np.array(item["semantic_ids"], dtype=np.int64)))
+                phoneme_ids_lens.append(item["phoneme_ids_len"])
+                semantic_ids_lens.append(item["semantic_ids_len"])
+
+            phoneme_ids_lens = torch.tensor(phoneme_ids_lens)
+            semantic_ids_lens = torch.tensor(semantic_ids_lens)
+            bert_features: List[torch.Tensor] = [item["bert_feature"] for item in examples]
+
+            return {
+                # List[int]
+                "ids": sample_index,
+                # List[torch.Tensor] (B, max_phoneme_length)
+                "phoneme_ids": phoneme_ids,
+                # torch.Tensor (B)
+                "phoneme_ids_len": phoneme_ids_lens,
+                # List[torch.Tensor] (B, max_semantic_ids_length)
+                "semantic_ids": semantic_ids,
+                # torch.Tensor (B)
+                "semantic_ids_len": semantic_ids_lens,
+                # List[torch.Tensor] (B, 1024, max_phoneme_length)
+                "bert_feature": bert_features,
+            }
+

 if __name__ == "__main__":
    root_dir = "/data/docker/liujing04/gpt-vits/prepare/dump_mix/"
--- a/GPT_SoVITS/AR/models/t2s_lightning_module.py
+++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py
@ -14,7 +14,7 @@ from AR.modules.optim import ScaledAdam

 class Text2SemanticLightningModule(LightningModule):
    def __init__(self, config, output_dir, is_train=True,  flash_attn_enabled:bool = False):
-        super().__init__()
+        super(Text2SemanticLightningModule,self).__init__()
        self.config = config
        self.top_k = 3
        self.model = Text2SemanticDecoder(config=config, top_k=self.top_k,flash_attn_enabled=flash_attn_enabled)
@ -35,7 +35,14 @@ class Text2SemanticLightningModule(LightningModule):
    def training_step(self, batch: Dict, batch_idx: int):
        opt = self.optimizers()
        scheduler = self.lr_schedulers()
-        forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old
+        forward=None
+        if self.config["train"].get("if_dpo",False):
+            forward=self.model.forward
+        elif self.config["train"].get("padding_on_left",False):
+            forward=self.model.forward_old_padding_on_left
+        else:
+            forward=self.model.forward_old
+
        loss, acc = forward(
            batch["phoneme_ids"],
            batch["phoneme_ids_len"],
@ -56,6 +63,7 @@ class Text2SemanticLightningModule(LightningModule):
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
+            batch_size=batch["phoneme_ids_len"].shape[0],
        )
        self.log(
            "lr",
@ -63,6 +71,7 @@ class Text2SemanticLightningModule(LightningModule):
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
+            batch_size=batch["phoneme_ids_len"].shape[0],
        )
        self.log(
            f"top_{self.top_k}_acc",
@ -71,7 +80,10 @@ class Text2SemanticLightningModule(LightningModule):
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
+            batch_size=batch["phoneme_ids_len"].shape[0],
        )
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()

    def validation_step(self, batch: Dict, batch_idx: int):
        return
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@ -1,5 +1,6 @@
 # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
 # reference: https://github.com/lifeiteng/vall-e
+import math
 import os, sys
 now_dir = os.getcwd()
 sys.path.append(now_dir)
@ -38,7 +39,6 @@ default_config = {
    "EOS": 1024,
 }

-
@torch.jit.script
 class T2SMLP:
    def __init__(self, w1, b1, w2, b2):
@ -363,6 +363,7 @@ class Text2SemanticDecoder(nn.Module):

        return loss, acc
    
+    #padding on right
    def forward_old(self, x, x_lens, y, y_lens, bert_feature):
        """
        x: phoneme_ids
@ -425,6 +426,91 @@ class Text2SemanticDecoder(nn.Module):
        acc = self.ar_accuracy_metric(logits.detach(), targets).item()
        return loss, acc
    
+    def forward_old_padding_on_left(self,  # training pass: builds a left-padded batch from per-sample lists, returns (loss, acc)
+                    x:List[torch.Tensor],  # one phoneme-id tensor per sample
+                    x_lens:torch.LongTensor,  # phoneme length of each sample
+                    y:List[torch.Tensor],  # one semantic-id tensor per sample
+                    y_lens:torch.LongTensor,  # semantic length of each sample
+                    bert_feature:List[torch.Tensor]):  # per-sample BERT features; an entry may be None
+        """
+        x: phoneme_ids
+        y: semantic_ids
+        """
+        device = x[0].device
+        x_len = x_lens.max()  # longest phoneme sequence in the batch
+        y_len = y_lens.max()  # longest semantic sequence in the batch
+        batch_size = len(x)
+    
+        xy_pos = torch.zeros((batch_size, x_len+y_len, self.embedding_dim)).to(device)  # zero-initialised, so the leading pad slots stay zero
+        targets:List[torch.LongTensor] = []
+        xy_attn_mask_list = []
+        for i in range(batch_size):
+            padding_len = (x_len-x_lens[i])+(y_len-y_lens[i])  # number of leading pad slots for sample i
+            
+            x_item=self.ar_text_embedding(x[i].unsqueeze(0))
+            if bert_feature[i] is not None:
+                x_item = x_item + self.bert_proj(bert_feature[i].transpose(0, 1).unsqueeze(0))  # assumes bert_feature[i] is (feature_dim, phoneme_len) - TODO confirm
+                
+            # No explicit F.pad(x_item, (0, 0, padding_len, 0)) here: the sample is written into a right-aligned slice of xy_pos below.
+            x_item = self.ar_text_position(x_item).squeeze(0)
+            y_item = self.ar_audio_position(self.ar_audio_embedding(y[i].unsqueeze(0))).squeeze(0)
+            
+            xy_pos[i, padding_len:padding_len+x_lens[i],:] = x_item  # text goes right after the pad slots
+            xy_pos[i, padding_len+x_lens[i]:,:] = y_item  # semantic tokens fill the rest of the row
+            target = torch.zeros(y_lens[i], dtype=torch.long).to(device)
+            target[:-1] = y[i][1:]  # next-token targets: y shifted left by one
+            target[-1] = self.EOS  # the final target is EOS
+            targets.append(target.unsqueeze(0))
+
+            x_attn_mask = torch.zeros((x_len+(y_len-y_lens[i]), x_len+y_len), dtype=torch.bool).to(device)  # rows for the pad + text positions; True entries are filled with finfo.min below
+            x_attn_mask[:, -y_lens[i]:] = True  # those rows are blocked from the semantic columns
+            y_attn_mask = F.pad(  # rows for the semantic positions
+                torch.triu(
+                    torch.ones(y_lens[i], y_lens[i], dtype=torch.bool).to(device),
+                    diagonal=1,  # strictly-upper triangle: later semantic positions are blocked
+                ),
+                (x_len+(y_len-y_lens[i]), 0),  # False columns prepended for the pad + text positions
+                value=False,
+            )
+            attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
+            if padding_len>0:
+                attn_mask[:, :padding_len] = True  # every row is blocked from the leading pad columns
+            xy_attn_mask_list.append(attn_mask)
+            
+        xy_attn_mask = torch.stack(xy_attn_mask_list, dim=0)
+        new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=xy_pos.dtype)
+        new_attn_mask.masked_fill_(xy_attn_mask, torch.finfo(xy_pos.dtype).min)  # bool mask -> float mask: finfo.min where True, 0 elsewhere
+        xy_attn_mask = new_attn_mask
+        xy_attn_mask = (xy_attn_mask.view(batch_size, 1, x_len+y_len, x_len+y_len)
+                                    .expand(-1, self.num_head, -1, -1)  # one copy of each sample's mask per head
+                                    .reshape(batch_size * self.num_head, x_len+y_len, x_len+y_len))
+        
+        
+        # x and the complete y are fed into the model in a single pass
+        # xy_pos = torch.concat([x, y_pos], dim=1)
+        xy_dec, _ = self.h(
+            (xy_pos, None),
+            mask=xy_attn_mask,
+        )
+        logits = [self.ar_predict_layer(xy_dec[i, -y_lens[i]:, :].unsqueeze(0)).permute(0, 2, 1) for i in range(batch_size)]  # score only the last y_lens[i] positions; permute puts the class axis second
+
+        # loss
+        # from feiteng: the more duration in a step, the larger the gradient update should be, so use sum
+        loss = None
+        acc = None
+        for i in range(batch_size):
+            if loss is None:
+                loss = F.cross_entropy(logits[i], targets[i], reduction="sum")
+                acc = self.ar_accuracy_metric(logits[i].detach(), targets[i].detach()).item()
+            else:
+                loss += F.cross_entropy(logits[i], targets[i], reduction="sum")
+                acc += self.ar_accuracy_metric(logits[i].detach(), targets[i].detach()).item()
+        acc /= batch_size  # unweighted mean of the per-sample accuracies
+        
+        # loss = F.cross_entropy(logits, targets, reduction="sum")
+        # acc = self.ar_accuracy_metric(logits.detach(), targets).item()
+        return loss, acc
+
    # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么
    def infer(
        self,
@ -512,85 +598,80 @@ class Text2SemanticDecoder(nn.Module):
        top_p: int = 100,
        early_stop_num: int = -1,
        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        dtype:torch.dtype = torch.float32,
    ):
-        # 先对phones进行embedding、对bert_features进行project，再pad到相同长度，以缓解复读问题。（可能还有其他因素导致复读）
-        max_len = 0
-        for x_item, bert_item in zip(x, bert_feature):
-            max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
-        x_list = [self.ar_text_embedding(item) for item in x]
-        x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
-        x = torch.stack(x_list, dim=0)

-        bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
-        bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
-        bert_feature = torch.stack(bert_features_list, dim=0)
+        device = x[0].device
+        x_len = x_lens.max()
+        batch_size = len(x)
        
-        # bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
-        # x = self.ar_text_embedding(x)
-        x = x + bert_feature 
-        x = self.ar_text_position(x)
-
-        # AR Decoder
        y = prompts
-        
-        x_len = x.shape[1]
-        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
        stop = False
-        # print(1111111,self.num_layers)
-
        k_cache = None
        v_cache = None
        ###################  first step ##########################
-        if y is not None:
-            y_emb = self.ar_audio_embedding(y)
-            y_len = y_emb.shape[1]
-            prefix_len = y.shape[1]
-            y_pos = self.ar_audio_position(y_emb)
-            xy_pos = torch.concat([x, y_pos], dim=1)
-            ref_free = False
-        else:
+        if y is None:
            y_emb = None
            y_len = 0
            prefix_len = 0
            y_pos = None
-            xy_pos = x
-            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
+            y = torch.zeros(batch_size, 0, dtype=torch.int, device=device)
            ref_free = True

+        else:
+            y_emb = self.ar_audio_embedding(y)
+            y_len = y_emb.shape[1]
+            prefix_len = y.shape[1]
+            y_pos = self.ar_audio_position(y_emb)
+            ref_free = False

-        ##### create mask #####
-        bsz = x.shape[0]
-        src_len = x_len + y_len
-        y_lens = torch.LongTensor([y_len]*bsz).to(x.device)
-        y_mask = make_pad_mask(y_lens)
-        x_mask = make_pad_mask(x_lens)
        
-        # (bsz, x_len + y_len)
-        xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
+        xy_pos = torch.zeros((batch_size, x_len+y_len, self.embedding_dim),dtype=dtype).to(device)
+        # ar_xy_padding_mask = torch.zeros((batch_size, x_len+y_len), device=device, dtype=torch.bool)
+        xy_attn_mask_list = []
+        for i in range(batch_size):
+            padding_len = (x_len-x_lens[i])
            
-        x_mask = F.pad(
-            x_attn_mask,
-            (0, y_len),  ###xx的纯0扩展到xx纯0+xy纯1，(x,x+y)
-            value=True,
-        )
-        y_mask = F.pad(  ###yy的右上1扩展到左边xy的0,(y,x+y)
-            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
+            x_item=self.ar_text_embedding(x[i].unsqueeze(0))
+            if bert_feature[i] is not None:
+                x_item = x_item + self.bert_proj(bert_feature[i].transpose(0, 1).unsqueeze(0))
+                
+            # x_item = F.pad(x_item, (0, 0, padding_len, 0), value=0)
+            x_item = self.ar_text_position(x_item).squeeze(0)
+            
+            xy_pos[i, padding_len:padding_len+x_lens[i],:] = x_item
+            if not ref_free:
+                xy_pos[i, padding_len+x_lens[i]:,:] = y_pos[i]
+
+            x_attn_mask = torch.zeros((x_len, x_len+y_len), dtype=torch.bool).to(device)
+            if not ref_free:
+                x_attn_mask[:, -y_len:] = True
+            y_attn_mask = F.pad(
+                torch.triu(
+                    torch.ones(y_len, y_len, dtype=torch.bool).to(device),
+                    diagonal=1,
+                ),
                (x_len, 0),
                value=False,
            )
+            attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
+            if padding_len>0:
+                attn_mask[:, :padding_len] = True
+            xy_attn_mask_list.append(attn_mask)
+            
+        xy_attn_mask = torch.stack(xy_attn_mask_list, dim=0)
+        new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=xy_pos.dtype)
+        new_attn_mask.masked_fill_(xy_attn_mask, torch.finfo(xy_pos.dtype).min)
+        xy_attn_mask = new_attn_mask
+        xy_attn_mask = (xy_attn_mask.view(batch_size, 1, x_len+y_len, x_len+y_len)
+                                    .expand(-1, self.num_head, -1, -1))
                                        
-        xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz, -1, -1).to(x.device)
-        # xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
-        xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(-1, src_len, src_len)
-        xy_attn_mask = xy_mask.logical_or(xy_padding_mask)
-        xy_attn_mask = xy_attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1)
-        new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
-        xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))
        
        ###### decode #####
-        y_list = [None]*y.shape[0]
-        batch_idx_map = list(range(y.shape[0]))
-        idx_list = [None]*y.shape[0]
+        y_list = [None]*batch_size
+        batch_idx_map = list(range(batch_size))
+        idx_list = [None]*batch_size
        for idx in tqdm(range(1500)):
            if idx == 0:
                xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask)
@ -606,7 +687,7 @@ class Text2SemanticDecoder(nn.Module):
                logits = logits[:, :-1]
                
            samples = sample(
-                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
+                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
            )[0]

            y = torch.concat([y, samples], dim=1)
@ -659,12 +740,12 @@ class Text2SemanticDecoder(nn.Module):
            xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device)

        if (None in idx_list):
-            for i in range(x.shape[0]):
+            for i in range(batch_size):
                if idx_list[i] is None:
                    idx_list[i] = 1500-1  ###如果没有生成到EOS，就用最大长度代替
                    
        if ref_free:
-            return y_list, [0]*x.shape[0]
+            return y_list, [0]*batch_size
        return y_list, idx_list
    
    def infer_panel_batch_only(
@ -677,6 +758,8 @@ class Text2SemanticDecoder(nn.Module):
        top_p: int = 100,
        early_stop_num: int = -1,
        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        **kwargs
    ):
        # 先对phones进行embedding、对bert_features进行project，再pad到相同长度，以缓解复读问题。（可能还有其他因素导致复读）
        max_len = 0
@ -772,7 +855,7 @@ class Text2SemanticDecoder(nn.Module):
            if(idx==0):###第一次跑不能EOS否则没有了
                logits = logits[:, :-1]  ###刨除1024终止符号的概率
            samples = sample(
-                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
+                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
            )[0]
            # 本次生成的 semantic_ids 和之前的 y 构成新的 y
            # print(samples.shape)#[1,1]#第一个1是bs
@ -855,3 +938,297 @@ class Text2SemanticDecoder(nn.Module):
        if ref_free:
            return y_list, [0]*x.shape[0]
        return y_list, idx_list
+    
+    # padding on right: batched inference where the text is embedded per sample and then right-padded to a common length
+    def infer_panel_batch_infer_with_flash_attn_old(
+        self,
+        x:List[torch.LongTensor],  ##### all text tokens
+        x_lens:torch.LongTensor,
+        prompts:torch.LongTensor,  #### reference-audio tokens
+        bert_feature:List[torch.LongTensor],
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.35,
+        **kwargs
+    ):
+        # Embed the phones and project the bert_features first, then pad to a common length, to mitigate the repetition problem. (Other factors may also cause repetition.)
+        max_len = 0
+        for x_item, bert_item in zip(x, bert_feature):
+            max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
+        x_list = [self.ar_text_embedding(item) for item in x]
+        x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
+        x = torch.stack(x_list, dim=0)
+
+        bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
+        bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
+        bert_feature = torch.stack(bert_features_list, dim=0)
+        
+        # bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
+        # x = self.ar_text_embedding(x)
+        x = x + bert_feature 
+        x = self.ar_text_position(x)
+
+        # AR Decoder
+        y = prompts
+        
+        x_len = x.shape[1]
+        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)  # text-to-text block: nothing blocked
+        stop = False
+        # print(1111111,self.num_layers)
+
+        k_cache = None
+        v_cache = None
+        ###################  first step ##########################
+        if y is not None:
+            y_emb = self.ar_audio_embedding(y)
+            y_len = y_emb.shape[1]
+            prefix_len = y.shape[1]
+            y_pos = self.ar_audio_position(y_emb)
+            xy_pos = torch.concat([x, y_pos], dim=1)
+            ref_free = False
+        else:
+            y_emb = None
+            y_len = 0
+            prefix_len = 0
+            y_pos = None
+            xy_pos = x
+            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)  # empty prompt: zero-length y per batch row
+            ref_free = True
+
+
+        ##### create mask #####
+        bsz = x.shape[0]
+        src_len = x_len + y_len
+        y_lens = torch.LongTensor([y_len]*bsz).to(x.device)  # the prompt has the same length for every row
+        y_mask = make_pad_mask(y_lens)
+        x_mask = make_pad_mask(x_lens)  # presumably True at padded text positions - verify against make_pad_mask
+        
+        # (bsz, x_len + y_len)
+        xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
+
+        x_mask = F.pad(
+            x_attn_mask,
+            (0, y_len),  ### extend the all-0 x-x block with an all-1 x-y block, shape (x, x+y)
+            value=True,
+        )
+        y_mask = F.pad(  ### extend the upper-right-1 y-y block with 0s on the left (the y-x part), shape (y, x+y)
+            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
+            (x_len, 0),
+            value=False,
+        )
+        
+        xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz, -1, -1).to(x.device)
+        # xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
+        xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(-1, src_len, src_len)
+        xy_attn_mask = xy_mask.logical_or(xy_padding_mask)  # blocked if either the structural mask or the padding mask says so
+        xy_attn_mask = xy_attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1)
+        new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
+        xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))  # float mask: -inf where blocked, 0 elsewhere
+        
+        ###### decode #####
+        y_list = [None]*y.shape[0]
+        batch_idx_map = list(range(y.shape[0]))  # current row -> original batch position
+        idx_list = [None]*y.shape[0]
+        for idx in tqdm(range(1500)):
+            if idx == 0:
+                xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask)
+            else:
+                xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache)
+
+            logits = self.ar_predict_layer(
+                xy_dec[:, -1]
+            )
+
+            if idx == 0:
+                xy_attn_mask = None  # the mask is only passed to process_prompt on the first step
+                logits = logits[:, :-1]  # first step only: drop the last class (presumably EOS - TODO confirm)
+                
+            samples = sample(
+                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
+            )[0]
+
+            y = torch.concat([y, samples], dim=1)
+            
+            ####### drop sequences that have finished generating from the batch, to further cut computation
+            reserved_idx_of_batch_for_y = None
+            if (self.EOS in samples[:, 0]) or \
+                (self.EOS in torch.argmax(logits, dim=-1)):  ### stop once EOS is generated
+                    l = samples[:, 0]==self.EOS
+                    removed_idx_of_batch_for_y = torch.where(l==True)[0].tolist()
+                    reserved_idx_of_batch_for_y = torch.where(l==False)[0]
+                    # batch_indexs = torch.tensor(batch_idx_map, device=y.device)[removed_idx_of_batch_for_y]
+                    for i in removed_idx_of_batch_for_y:
+                        batch_index = batch_idx_map[i]
+                        idx_list[batch_index] = idx - 1
+                        y_list[batch_index] = y[i, :-1]  # strip the EOS sample that was just appended
+                
+                    batch_idx_map = [batch_idx_map[i] for i in reserved_idx_of_batch_for_y.tolist()]
+                
+            # keep only the sequences in the batch that have not finished generating
+            if reserved_idx_of_batch_for_y is not None:
+                # index = torch.LongTensor(batch_idx_map).to(y.device)
+                y = torch.index_select(y, dim=0, index=reserved_idx_of_batch_for_y)
+                if k_cache is not None :
+                    for i in range(len(k_cache)):
+                        k_cache[i] = torch.index_select(k_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
+                        v_cache[i] = torch.index_select(v_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
+                
+                
+            if (early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num) or idx==1499:
+                print("use early stop num:", early_stop_num)  # NOTE(review): also printed when only the 1500-step cap (idx==1499) triggered
+                stop = True
+                for i, batch_index in enumerate(batch_idx_map):
+                    batch_index = batch_idx_map[i]  # NOTE(review): same value enumerate already yielded
+                    idx_list[batch_index] = idx
+                    y_list[batch_index] = y[i, :-1]
+                
+            if not (None in idx_list):
+                stop = True
+                
+            if stop:
+                if y.shape[1]==0:
+                    y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                    print("bad zero prediction")
+                print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
+                break
+
+            ####################### update next step ###################################
+            y_emb = self.ar_audio_embedding(y[:, -1:])  # embed only the newest token
+            xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device)
+
+        if (None in idx_list):
+            for i in range(x.shape[0]):
+                if idx_list[i] is None:
+                    idx_list[i] = 1500-1  ### if EOS was never generated, use the maximum length instead
+                    
+        if ref_free:
+            return y_list, [0]*x.shape[0]
+        return y_list, idx_list
+
+    def infer_panel_old(  # inference loop driven by the dict cache passed to self.h; samples from batch row 0 only
+        self,
+        x,  ##### all text tokens
+        x_lens,
+        prompts,  #### reference-audio tokens
+        bert_feature,
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+    ):
+        x = self.ar_text_embedding(x)
+        x = x + self.bert_proj(bert_feature.transpose(1, 2))
+        x = self.ar_text_position(x)
+
+        # AR Decoder
+        y = prompts
+        
+        x_len = x.shape[1]
+        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)  # text-to-text block: nothing blocked
+        stop = False
+        # print(1111111,self.num_layers)
+        cache = {
+            "all_stage": self.num_layers,
+            "k": [None] * self.num_layers,  ### hand-written according to the config
+            "v": [None] * self.num_layers,
+            # "xy_pos":None,## the y_pos positional encoding differs every step so it cannot be cached; xy_pos has to be rebuilt each time. This is mostly down to how it is written (the history could be kept consistent), but the cost is negligible so it is left alone
+            "y_emb": None,  ## only the newest samples need embedding; then concatenate them with the history
+            # "logits":None,### the original already computes only the tail and concatenates, nothing to do
+            # "xy_dec":None,### not needed; only the last one is used for logits anyway
+            "first_infer": 1,
+            "stage": 0,
+        }
+        ###################  first step ##########################
+        if y is not None:
+            y_emb = self.ar_audio_embedding(y)
+            y_len = y_emb.shape[1]
+            prefix_len = y.shape[1]
+            y_pos = self.ar_audio_position(y_emb)
+            xy_pos = torch.concat([x, y_pos], dim=1)
+            cache["y_emb"] = y_emb
+            ref_free = False
+        else:
+            y_emb = None
+            y_len = 0
+            prefix_len = 0
+            y_pos = None
+            xy_pos = x
+            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)  # empty prompt: zero-length y per batch row
+            ref_free = True
+
+        x_attn_mask_pad = F.pad(
+                    x_attn_mask,
+                    (0, y_len),  ### extend the all-0 x-x block with an all-1 x-y block, shape (x, x+y)
+                    value=True,
+                )
+        y_attn_mask = F.pad(  ### extend the upper-right-1 y-y block with 0s on the left (the y-x part), shape (y, x+y)
+            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
+            (x_len, 0),
+            value=False,
+        )
+        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
+            x.device
+        )
+        
+
+        for idx in tqdm(range(1500)):  # hard cap of 1500 decoding steps
+            
+            xy_dec, _ = self.h((xy_pos, None), mask=xy_attn_mask, cache=cache)
+            logits = self.ar_predict_layer(
+                xy_dec[:, -1]
+            )  ## no change needed: with the cache there is only one frame by default, so taking the last frame is equivalent
+            # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
+            if(idx==0):### the first step must not produce EOS, otherwise nothing is generated
+                logits = logits[:, :-1]  ### drop the probability of the 1024 end symbol
+            samples = sample(
+                logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
+            )[0].unsqueeze(0)
+            # the semantic_ids generated in this step plus the previous y form the new y
+            # print(samples.shape)#[1,1]# the first 1 is the batch size
+            y = torch.concat([y, samples], dim=1) 
+
+            if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+                print("use early stop num:", early_stop_num)
+                stop = True
+
+            if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
+                # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
+                stop = True
+            if stop:
+                # if prompts.shape[1] == y.shape[1]:
+                #     y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                #     print("bad zero prediction")
+                if y.shape[1]==0:
+                    y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                    print("bad zero prediction")
+                print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
+                break
+            
+            ####################### update next step ###################################
+            cache["first_infer"] = 0
+            if cache["y_emb"] is not None:
+                y_emb = torch.cat(
+                    [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], dim = 1
+                )
+                cache["y_emb"] = y_emb
+                y_pos = self.ar_audio_position(y_emb)
+                xy_pos = y_pos[:, -1:]  # only the newest position is fed to the next step
+            else:
+                y_emb = self.ar_audio_embedding(y[:, -1:])
+                cache["y_emb"] = y_emb
+                y_pos = self.ar_audio_position(y_emb)
+                xy_pos = y_pos
+            y_len = y_pos.shape[1]
+
+            ### rightmost column (this is wrong)
+            # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
+            # xy_attn_mask[:,-1]=False
+            ### bottom row (this is correct)
+            xy_attn_mask = torch.zeros(
+                (1, x_len + y_len), dtype=torch.bool, device=xy_pos.device
+            )
+        if ref_free:
+            return y[:, :-1], 0  # y without its final column
+        return y[:, :-1], idx-1
--- a/GPT_SoVITS/AR/models/utils.py
+++ b/GPT_SoVITS/AR/models/utils.py
@ -11,7 +11,7 @@ def sequence_mask(length, max_length=None):
    return x.unsqueeze(0) < length.unsqueeze(1)


-def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+def make_pad_mask(lengths: torch.Tensor, max_len: int = 0, padding_left:bool=False) -> torch.Tensor:
    """
    Args:
      lengths:
@ -35,7 +35,9 @@ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    n = lengths.size(0)
    seq_range = torch.arange(0, max_len, device=lengths.device)
    expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len)
-
+    if padding_left:
+        return expaned_lengths < (max_len-lengths.unsqueeze(-1))
+    else:
        return expaned_lengths >= lengths.unsqueeze(-1)


--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -63,7 +63,7 @@ def set_seed(seed:int):
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
-            # torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.deterministic = True
            # torch.backends.cudnn.benchmark = False
            # torch.backends.cudnn.enabled = True
    except:
@ -435,8 +435,7 @@ class TTS:
                 device:torch.device=torch.device("cpu"),
                 precision:torch.dtype=torch.float32,
                 ):
-        # 但是这里不能套，反而会负优化
-        # with torch.no_grad():
+
        _data:list = []
        index_and_len_list = []
        for idx, item in enumerate(data):
@ -484,8 +483,7 @@ class TTS:
            norm_text_batch = []
            bert_max_len = 0
            phones_max_len = 0
-            # 但是这里也不能套，反而会负优化
-            # with torch.no_grad():
+
            for item in item_list:
                if prompt_data is not None:
                    all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\
@ -533,6 +531,12 @@ class TTS:
            # all_bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) for item in all_bert_features_list]
            # all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)
            
+            #### padding on left
+            # all_phones_list = [F.pad(item,(max_len-item.shape[0],0),value=0) for item in all_phones_list]
+            # all_phones_batch = torch.stack(all_phones_list, dim=0)
+            # all_bert_features_list = [F.pad(item,(max_len-item.shape[1],0,0,0), value=0) for item in all_bert_features_list]
+            # all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)
+            
            batch = {
                "phones": phones_batch,
                "phones_len": torch.LongTensor(phones_len_list).to(device),
@ -569,7 +573,6 @@ class TTS:
        '''
        self.stop_flag = True
    
-    # 使用装饰器
    @torch.no_grad()
    def run(self, inputs:dict):
        """
@ -586,9 +589,10 @@ class TTS:
                    "top_k": 5,                   # int. top k sampling
                    "top_p": 1,                   # float. top p sampling
                    "temperature": 1,             # float. temperature for sampling
+                    "repetition_penalty": 1.35,   # float. repetition penalty for sampling of T2S model.
                    "text_split_method": "cut0",  # str. text split method, see text_segmentation_method.py for details.
                    "batch_size": 1,              # int. batch size for inference
-                    "batch_threshold": 0.75,      # float. threshold for batch splitting.
+                    "batch_threshold": 1,      # float. threshold for batch splitting.
                    "split_bucket": True,         # bool. whether to split the batch into multiple buckets.
                    "return_fragment": False,     # bool. step by step return the audio fragment.
                    "speed_factor":1.0,           # float. control the speed of the synthesized audio.
@ -608,6 +612,7 @@ class TTS:
        top_k:int = inputs.get("top_k", 5)
        top_p:float = inputs.get("top_p", 1)
        temperature:float = inputs.get("temperature", 1)
+        repetition_penalty: float = inputs.get("repetition_penalty", 1.35)
        text_split_method:str = inputs.get("text_split_method", "cut0")
        batch_size = inputs.get("batch_size", 1)
        batch_threshold = inputs.get("batch_threshold", 0.75)
@ -618,9 +623,16 @@ class TTS:
        seed = inputs.get("seed", -1)
        seed = -1 if seed in ["", None] else seed
        actual_seed = set_seed(seed)
+        padding_on_left = inputs.get("padding_on_left", False)
+
+        if padding_on_left:
+            print("padding on left")
+            self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_batch_infer_with_flash_attn
+        else:
+            print("padding on right")
+            self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_batch_infer_with_flash_attn_old
        
        if return_fragment:
-            # split_bucket = False
            print(i18n("分段返回模式已开启"))
            if split_bucket:
                split_bucket = False
@ -746,7 +758,6 @@ class TTS:
                else:
                    prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device)
                
-
                pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
                    all_phoneme_ids,
                    all_phoneme_lens,
@ -756,7 +767,9 @@ class TTS:
                    top_k=top_k,
                    top_p=top_p,
                    temperature=temperature,
+                    repetition_penalty = repetition_penalty,
                    early_stop_num=self.configs.hz * self.configs.max_sec,
+                    dtype = self.precision,
                )
                t4 = ttime()
                t_34 += t4 - t3
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -89,12 +89,14 @@ sovits_path = tts_config.vits_weights_path
 def inference(text, text_lang, 
              ref_audio_path, prompt_text, 
              prompt_lang, top_k, 
-              top_p, temperature, 
+              top_p, temperature, repetition_penalty,
              text_split_method, batch_size, 
              speed_factor, ref_text_free,
              split_bucket, fragment_interval,
-              seed,
+              seed, keep_random, padding_on_left
              ):
+    if keep_random:
+        seed = random.randrange(1 << 32)
    actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
    inputs={
        "text": text,
@ -105,6 +107,7 @@ def inference(text, text_lang,
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
+        "repetition_penalty": repetition_penalty,
        "text_split_method": cut_method[text_split_method],
        "batch_size":int(batch_size),
        "speed_factor":float(speed_factor),
@ -112,6 +115,7 @@ def inference(text, text_lang,
        "return_fragment":False,
        "fragment_interval":fragment_interval,
        "seed":actual_seed,
+        "padding_on_left":padding_on_left
    }
    for item in tts_pipeline.run(inputs):
        yield item, actual_seed
@ -197,6 +201,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
                top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
                temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
+                repetition_penalty = gr.Slider(minimum=0.5,maximum=4.0,step=0.01,label=i18n("repetition_penalty"),value=1.35,interactive=True)
            with gr.Column():
                how_to_cut = gr.Radio(
                    label=i18n("怎么切"),
@ -207,6 +212,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                with gr.Row():
                    split_bucket = gr.Checkbox(label=i18n("数据分桶(可能会降低一点计算量，选就对了)"), value=True, interactive=True, show_label=True)
                    seed = gr.Number(label=i18n("随机种子"),value=-1)
+                    keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True)
+                    padding_on_left = gr.Checkbox(label=i18n("左侧补齐"), value=True, interactive=True, show_label=True)
            # with gr.Column():
                output = gr.Audio(label=i18n("输出的语音"))
                with gr.Row():
@ -219,11 +226,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
            [
                text,text_language, inp_ref, 
                prompt_text, prompt_language, 
-                top_k, top_p, temperature, 
+                top_k, top_p, temperature, repetition_penalty,
                how_to_cut, batch_size, 
                speed_factor, ref_text_free,
                split_bucket,fragment_interval,
-                seed
+                seed, keep_random, padding_on_left
             ],
            [output, seed],
        )
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@ -126,6 +126,7 @@ def main(args):
        benchmark=False,
        fast_dev_run=False,
        strategy = DDPStrategy(
+            find_unused_parameters=True,
            process_group_backend="nccl" if platform.system() != "Windows" else "gloo"
        ) if torch.cuda.is_available() else "auto",
        precision=config["train"]["precision"],