Merge pull request #730 from ChasonJiang/fast_inference

增加flash attention选项，防止影响训练
2026-07-03 04:30:24 +08:00 · 2024-03-10 14:45:16 +08:00 · 2024-03-10 14:45:16 +08:00 · a680939ce2
commit a680939ce2
parent 8fd56afe91 174c4bbab3
6 changed files with 225 additions and 44 deletions
--- a/GPT_SoVITS/AR/models/t2s_lightning_module.py
+++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py
@ -13,11 +13,11 @@ from AR.modules.lr_schedulers import WarmupCosineLRSchedule
 from AR.modules.optim import ScaledAdam

 class Text2SemanticLightningModule(LightningModule):
-    def __init__(self, config, output_dir, is_train=True):
+    def __init__(self, config, output_dir, is_train=True,  flash_attn_enabled:bool = False):
        super().__init__()
        self.config = config
        self.top_k = 3
-        self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
+        self.model = Text2SemanticDecoder(config=config, top_k=self.top_k,flash_attn_enabled=flash_attn_enabled)
        pretrained_s1 = config.get("pretrained_s1")
        if pretrained_s1 and is_train:
            # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@ -1,7 +1,9 @@
 # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
 # reference: https://github.com/lifeiteng/vall-e
+import os, sys
+now_dir = os.getcwd()
+sys.path.append(now_dir)
 from typing import List
-
 import torch
 from tqdm import tqdm

@ -174,7 +176,7 @@ class T2STransformer:


 class Text2SemanticDecoder(nn.Module):
-    def __init__(self, config, norm_first=False, top_k=3):
+    def __init__(self, config, norm_first=False, top_k=3, flash_attn_enabled:bool=False):
        super(Text2SemanticDecoder, self).__init__()
        self.model_dim = config["model"]["hidden_dim"]
        self.embedding_dim = config["model"]["embedding_dim"]
@ -226,37 +228,42 @@ class Text2SemanticDecoder(nn.Module):
            multidim_average="global",
            ignore_index=self.EOS,
        )
-
-        blocks = []
-
-        for i in range(self.num_layers):
-            layer = self.h.layers[i]
-            t2smlp = T2SMLP(
-                layer.linear1.weight,
-                layer.linear1.bias,
-                layer.linear2.weight,
-                layer.linear2.bias
-            )
-
-            block = T2SBlock(
-                self.num_head,
-                self.model_dim,
-                t2smlp,
-                layer.self_attn.in_proj_weight,
-                layer.self_attn.in_proj_bias,
-                layer.self_attn.out_proj.weight,
-                layer.self_attn.out_proj.bias,
-                layer.norm1.weight,
-                layer.norm1.bias,
-                layer.norm1.eps,
-                layer.norm2.weight,
-                layer.norm2.bias,
-                layer.norm2.eps
-            )
-
-            blocks.append(block)
        
-        self.t2s_transformer = T2STransformer(self.num_layers, blocks)
+        if not flash_attn_enabled:
+            print("Not Using Flash Attention")
+            self.infer_panel = self.infer_panel_batch_only
+        else:
+            print("Using Flash Attention")
+            blocks = []
+
+            for i in range(self.num_layers):
+                layer = self.h.layers[i]
+                t2smlp = T2SMLP(
+                    layer.linear1.weight,
+                    layer.linear1.bias,
+                    layer.linear2.weight,
+                    layer.linear2.bias
+                )
+
+                block = T2SBlock(
+                    self.num_head,
+                    self.model_dim,
+                    t2smlp,
+                    layer.self_attn.in_proj_weight,
+                    layer.self_attn.in_proj_bias,
+                    layer.self_attn.out_proj.weight,
+                    layer.self_attn.out_proj.bias,
+                    layer.norm1.weight,
+                    layer.norm1.bias,
+                    layer.norm1.eps,
+                    layer.norm2.weight,
+                    layer.norm2.bias,
+                    layer.norm2.eps
+                )
+
+                blocks.append(block)
+            
+            self.t2s_transformer = T2STransformer(self.num_layers, blocks)

    def make_input_data(self, x, x_lens, y, y_lens, bert_feature):
        x = self.ar_text_embedding(x)
@ -640,6 +647,168 @@ class Text2SemanticDecoder(nn.Module):
                if idx_list[i] is None:
                    idx_list[i] = 1500-1  ###如果没有生成到EOS，就用最大长度代替
                    
+        if ref_free:
+            return y_list, [0]*x.shape[0]
+        return y_list, idx_list
+    
+    def infer_panel_batch_only(
+        self,
+        x,  #####全部文本token
+        x_lens,
+        prompts,  ####参考音频token
+        bert_feature,
+        top_k: int = -100,
+        top_p: int = 100,
+        early_stop_num: int = -1,
+        temperature: float = 1.0,
+    ):
+        x = self.ar_text_embedding(x)
+        x = x + self.bert_proj(bert_feature.transpose(1, 2))
+        x = self.ar_text_position(x)
+
+        # AR Decoder
+        y = prompts
+        
+        x_len = x.shape[1]
+        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
+        stop = False
+        # print(1111111,self.num_layers)
+        cache = {
+            "all_stage": self.num_layers,
+            "k": [None] * self.num_layers,  ###根据配置自己手写
+            "v": [None] * self.num_layers,
+            # "xy_pos":None,##y_pos位置编码每次都不一样的没法缓存，每次都要重新拼xy_pos.主要还是写法原因，其实是可以历史统一一样的，但也没啥计算量就不管了
+            "y_emb": None,  ##只需要对最新的samples求emb，再拼历史的就行
+            # "logits":None,###原版就已经只对结尾求再拼接了，不用管
+            # "xy_dec":None,###不需要，本来只需要最后一个做logits
+            "first_infer": 1,
+            "stage": 0,
+        }
+        ###################  first step ##########################
+        if y is not None:
+            y_emb = self.ar_audio_embedding(y)
+            y_len = y_emb.shape[1]
+            prefix_len = y.shape[1]
+            y_pos = self.ar_audio_position(y_emb)
+            xy_pos = torch.concat([x, y_pos], dim=1)
+            cache["y_emb"] = y_emb
+            ref_free = False
+        else:
+            y_emb = None
+            y_len = 0
+            prefix_len = 0
+            y_pos = None
+            xy_pos = x
+            y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
+            ref_free = True
+
+        x_attn_mask_pad = F.pad(
+                    x_attn_mask,
+                    (0, y_len),  ###xx的纯0扩展到xx纯0+xy纯1，(x,x+y)
+                    value=True,
+                )
+        y_attn_mask = F.pad(  ###yy的右上1扩展到左边xy的0,(y,x+y)
+            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
+            (x_len, 0),
+            value=False,
+        )
+        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
+            x.device
+        )
+        
+        y_list = [None]*y.shape[0]
+        batch_idx_map = list(range(y.shape[0]))
+        idx_list = [None]*y.shape[0]
+        for idx in tqdm(range(1500)):
+            
+            xy_dec, _ = self.h((xy_pos, None), mask=xy_attn_mask, cache=cache)
+            logits = self.ar_predict_layer(
+                xy_dec[:, -1]
+            )  ##不用改，如果用了cache的默认就是只有一帧，取最后一帧一样的
+            # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
+            if(idx==0):###第一次跑不能EOS否则没有了
+                logits = logits[:, :-1]  ###刨除1024终止符号的概率
+            samples = sample(
+                logits, y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
+            )[0]
+            # 本次生成的 semantic_ids 和之前的 y 构成新的 y
+            # print(samples.shape)#[1,1]#第一个1是bs
+            y = torch.concat([y, samples], dim=1) 
+
+            # 移除已经生成完毕的序列
+            reserved_idx_of_batch_for_y = None
+            if (self.EOS in torch.argmax(logits, dim=-1)) or \
+                (self.EOS in samples[:, 0]):  ###如果生成到EOS，则停止
+                    l = samples[:, 0]==self.EOS
+                    removed_idx_of_batch_for_y = torch.where(l==True)[0].tolist()
+                    reserved_idx_of_batch_for_y = torch.where(l==False)[0]
+                    # batch_indexs = torch.tensor(batch_idx_map, device=y.device)[removed_idx_of_batch_for_y]
+                    for i in removed_idx_of_batch_for_y:
+                        batch_index = batch_idx_map[i]
+                        idx_list[batch_index] = idx - 1
+                        y_list[batch_index] = y[i, :-1]
+                
+                    batch_idx_map = [batch_idx_map[i] for i in reserved_idx_of_batch_for_y.tolist()]
+                
+            # 只保留未生成完毕的序列 
+            if reserved_idx_of_batch_for_y is not None:
+                # index = torch.LongTensor(batch_idx_map).to(y.device)
+                y = torch.index_select(y, dim=0, index=reserved_idx_of_batch_for_y)
+                if cache["y_emb"] is not None:
+                    cache["y_emb"] = torch.index_select(cache["y_emb"], dim=0, index=reserved_idx_of_batch_for_y)
+                if cache["k"] is not None:
+                    for i in range(self.num_layers):
+                        # 因为kv转置了，所以batch dim是1
+                        cache["k"][i] = torch.index_select(cache["k"][i], dim=1, index=reserved_idx_of_batch_for_y)
+                        cache["v"][i] = torch.index_select(cache["v"][i], dim=1, index=reserved_idx_of_batch_for_y)
+                
+                
+            if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+                print("use early stop num:", early_stop_num)
+                stop = True
+                
+            if not (None in idx_list):
+                # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
+                stop = True
+            if stop:
+                # if prompts.shape[1] == y.shape[1]:
+                #     y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                #     print("bad zero prediction")
+                if y.shape[1]==0:
+                    y = torch.concat([y, torch.zeros_like(samples)], dim=1)
+                    print("bad zero prediction")
+                print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
+                break
+            
+            ####################### update next step ###################################
+            cache["first_infer"] = 0
+            if cache["y_emb"] is not None:
+                y_emb = torch.cat(
+                    [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], dim = 1
+                )
+                cache["y_emb"] = y_emb
+                y_pos = self.ar_audio_position(y_emb)
+                xy_pos = y_pos[:, -1:]
+            else:
+                y_emb = self.ar_audio_embedding(y[:, -1:])
+                cache["y_emb"] = y_emb
+                y_pos = self.ar_audio_position(y_emb)
+                xy_pos = y_pos
+            y_len = y_pos.shape[1]
+
+            ###最右边一列（是错的）
+            # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
+            # xy_attn_mask[:,-1]=False
+            ###最下面一行（是对的）
+            xy_attn_mask = torch.zeros(
+                (1, x_len + y_len), dtype=torch.bool, device=xy_pos.device
+            )
+            
+        if (None in idx_list):
+            for i in range(x.shape[0]):
+                if idx_list[i] is None:
+                    idx_list[i] = 1500-1  ###如果没有生成到EOS，就用最大长度代替
+                    
        if ref_free:
            return y_list, [0]*x.shape[0]
        return y_list, idx_list
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -17,8 +17,8 @@ from time import time as ttime
 from tools.i18n.i18n import I18nAuto
 from my_utils import load_audio
 from module.mel_processing import spectrogram_torch
-from .text_segmentation_method import splits
-from .TextPreprocessor import TextPreprocessor
+from TTS_infer_pack.text_segmentation_method import splits
+from TTS_infer_pack.TextPreprocessor import TextPreprocessor
 i18n = I18nAuto()

 # configs/tts_infer.yaml
@ -30,6 +30,7 @@ default:
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
+  flash_attn_enabled: true
  
 custom:
  device: cuda
@ -38,7 +39,7 @@ custom:
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
-  
+  flash_attn_enabled: true


 """
@ -63,7 +64,8 @@ class TTS_Config:
                "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
                "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
                "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
-                "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+                "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+                "flash_attn_enabled": True
            }
        self.configs:dict = configs.get("custom", self.default_configs)
        
@ -73,6 +75,7 @@ class TTS_Config:
        self.vits_weights_path = self.configs.get("vits_weights_path")
        self.bert_base_path = self.configs.get("bert_base_path")
        self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path")
+        self.flash_attn_enabled = self.configs.get("flash_attn_enabled")
        
        
        self.max_sec = None
@ -103,7 +106,8 @@ class TTS_Config:
                "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
                "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
                "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
-                "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+                "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+                "flash_attn_enabled": True
            },
            "custom": {
                "device": str(self.device),
@ -111,7 +115,8 @@ class TTS_Config:
                "t2s_weights_path": self.t2s_weights_path,
                "vits_weights_path": self.vits_weights_path,
                "bert_base_path": self.bert_base_path,
-                "cnhuhbert_base_path": self.cnhuhbert_base_path
+                "cnhuhbert_base_path": self.cnhuhbert_base_path,
+                "flash_attn_enabled": self.flash_attn_enabled
            }
        }
        if configs_path is None:
@ -128,6 +133,7 @@ class TTS_Config:
        string += "t2s_weights_path: {}\n".format(self.t2s_weights_path)
        string += "vits_weights_path: {}\n".format(self.vits_weights_path)
        string += "cnhuhbert_base_path: {}\n".format(self.cnhuhbert_base_path)
+        string += "flash_attn_enabled: {}\n".format(self.flash_attn_enabled)
        string += "----------------------------------------\n"
        return string

@ -231,7 +237,8 @@ class TTS:
        dict_s1 = torch.load(weights_path, map_location=self.configs.device)
        config = dict_s1["config"]
        self.configs.max_sec = config["data"]["max_sec"]
-        t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+        t2s_model = Text2SemanticLightningModule(config, "****", is_train=False, 
+                                                 flash_attn_enabled=self.configs.flash_attn_enabled)
        t2s_model.load_state_dict(dict_s1["weight"])
        if self.configs.is_half:
            t2s_model = t2s_model.half()
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@ -1,4 +1,7 @@

+import os, sys
+now_dir = os.getcwd()
+sys.path.append(now_dir)

 import re
 import torch
@ -7,7 +10,7 @@ from typing import Dict, List, Tuple
 from text.cleaner import clean_text
 from text import cleaned_text_to_sequence
 from transformers import AutoModelForMaskedLM, AutoTokenizer
-from .text_segmentation_method import splits, get_method as get_seg_method
+from TTS_infer_pack.text_segmentation_method import splits, get_method as get_seg_method

 # from tools.i18n.i18n import I18nAuto

--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@ -2,6 +2,7 @@ custom:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cuda
+  flash_attn_enabled: true
  is_half: true
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
@ -9,6 +10,7 @@ default:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
+  flash_attn_enabled: true
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -20,7 +20,6 @@ logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
 logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 import pdb
 import torch
-# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py


 infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
@ -33,8 +32,9 @@ is_half = eval(os.environ.get("is_half", "True")) and not torch.backends.mps.is_
 import gradio as gr
 from TTS_infer_pack.TTS import TTS, TTS_Config
 from TTS_infer_pack.text_segmentation_method import cut1, cut2, cut3, cut4, cut5
-from tools.i18n.i18n import I18nAuto
 from TTS_infer_pack.text_segmentation_method import get_method
+from tools.i18n.i18n import I18nAuto
+
 i18n = I18nAuto()

 os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # 确保直接启动推理UI时也能够设置。