同步了0306更新

2026-06-05 22:08:15 +08:00 · 2024-03-07 14:31:33 +08:00 · 2024-03-07 14:31:33 +08:00 · 74f81f3009
commit 74f81f3009
parent a2fd134288
3 changed files with 156 additions and 78 deletions
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@ -10,7 +10,7 @@ from AR.models.utils import (
    logits_to_probs,
    multinomial_sample_one_no_sync,
    dpo_loss,
-    make_reject_y, 
+    make_reject_y,
    get_batch_logps
 )
 from AR.modules.embedding import SinePositionalEmbedding
@ -22,6 +22,11 @@ from torch import nn
 from torch.nn import functional as F
 from torchmetrics.classification import MulticlassAccuracy
 try:
    from flash_attn import flash_attn_with_kvcache
 except ImportError:
    flash_attn_with_kvcache = None
 default_config = {
    "embedding_dim": 512,
    "hidden_dim": 512,
@ -116,7 +121,7 @@ class Text2SemanticDecoder(nn.Module):
            (0, y_len),
            value=True,
        )
-        
+
        y_attn_mask = F.pad(
            torch.triu(
                torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
@ -177,7 +182,7 @@ class Text2SemanticDecoder(nn.Module):
        A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets)
        loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True)
-        
+
        loss = loss_1 + loss_2
        return loss, acc
@ -246,14 +251,14 @@ class Text2SemanticDecoder(nn.Module):
    # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么
    def infer(
-        self,
+            self,
-        x,
+            x,
-        x_lens,
+            x_lens,
-        prompts,
+            prompts,
-        bert_feature,
+            bert_feature,
-        top_k: int = -100,
+            top_k: int = -100,
-        early_stop_num: int = -1,
+            early_stop_num: int = -1,
-        temperature: float = 1.0,
+            temperature: float = 1.0,
    ):
        x = self.ar_text_embedding(x)
        x = x + self.bert_proj(bert_feature.transpose(1, 2))
@ -321,16 +326,100 @@ class Text2SemanticDecoder(nn.Module):
        # 错位
        return targets[:, :-1], targets[:, 1:]
    def infer_one_step(self, x, xy_attn_mask, k_cache, v_cache, cache_seqlens):
        hidden_dim = x.shape[-1]
        for layer_id in range(self.num_layers):
            layer = self.h.layers[layer_id]
            q, k, v = F.linear(
                x,
                layer.self_attn.in_proj_weight,
                layer.self_attn.in_proj_bias
            ).chunk(3, dim=-1)
            batch_size = q.shape[0]
            q_len = q.shape[1]
            if flash_attn_with_kvcache is None:
                past_k = k_cache[layer_id]
                past_v = v_cache[layer_id]
                if past_k is not None:
                    k = torch.cat([past_k, k], 1)
                    v = torch.cat([past_v, v], 1)
                k_cache[layer_id] = k
                v_cache[layer_id] = v
                kv_len = k.shape[1]
                q = q.view(batch_size, q_len, layer.self_attn.num_heads, -1).transpose(1, 2)
                k = k.view(batch_size, kv_len, layer.self_attn.num_heads, -1).transpose(1, 2)
                v = v.view(batch_size, kv_len, layer.self_attn.num_heads, -1).transpose(1, 2)
                if xy_attn_mask is None:
                    attn = F.scaled_dot_product_attention(q, k, v)
                else:
                    attn = F.scaled_dot_product_attention(q, k, v, ~xy_attn_mask)
                attn = attn.permute(2, 0, 1, 3).reshape(-1, hidden_dim)
            else:
                q = q.view(batch_size, q_len, layer.self_attn.num_heads, -1)
                k = k.view(batch_size, q_len, layer.self_attn.num_heads, -1)
                v = v.view(batch_size, q_len, layer.self_attn.num_heads, -1)
                if xy_attn_mask is None:
                    attn = flash_attn_with_kvcache(q, k_cache[layer_id], v_cache[layer_id], k, v, cache_seqlens=cache_seqlens, causal=True)
                else:
                    # NOTE: there's a slight difference with the result produced by SDPA.
                    x_len = (~xy_attn_mask).sum(1)[0].item()
                    attn_x = flash_attn_with_kvcache(
                        q[:, :x_len],
                        k_cache[layer_id],
                        v_cache[layer_id],
                        k[:, :x_len],
                        v[:, :x_len],
                        cache_seqlens=cache_seqlens,
                        causal=False
                    )
                    attn_y = flash_attn_with_kvcache(
                        q[:, x_len:],
                        k_cache[layer_id],
                        v_cache[layer_id],
                        k[:, x_len:],
                        v[:, x_len:],
                        cache_seqlens=cache_seqlens + x_len,
                        causal=True
                    )
                    attn = torch.cat([attn_x, attn_y], dim=1)
                attn = attn.view(-1, hidden_dim)
            attn_out = F.linear(attn, layer.self_attn.out_proj.weight, layer.self_attn.out_proj.bias)
            x = layer.norm1(x + attn_out, None)
            x = layer.norm2(x + layer.linear2(F.relu(layer.linear1(x))), None)
        xy_dec = x
        logits = self.ar_predict_layer(
            xy_dec[:, -1]
        )
        return logits
    def infer_panel(
-        self,
+            self,
-        x,  #####全部文本token
+            x,  #####全部文本token
-        x_lens,
+            x_lens,
-        prompts,  ####参考音频token
+            prompts,  ####参考音频token
-        bert_feature,
+            bert_feature,
-        top_k: int = -100,
+            top_k: int = -100,
-        top_p: int = 100,
+            top_p: int = 100,
-        early_stop_num: int = -1,
+            early_stop_num: int = -1,
-        temperature: float = 1.0,
+            temperature: float = 1.0,
    ):
        x = self.ar_text_embedding(x)
        x = x + self.bert_proj(bert_feature.transpose(1, 2))
@ -338,22 +427,18 @@ class Text2SemanticDecoder(nn.Module):
        # AR Decoder
        y = prompts
-        
+
        x_len = x.shape[1]
        x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
        stop = False
        # print(1111111,self.num_layers)
-        cache = {
+
-            "all_stage": self.num_layers,
+        if flash_attn_with_kvcache is not None:
-            "k": [None] * self.num_layers,  ###根据配置自己手写
+            k_cache = [torch.empty(x.shape[0], 2048, 16, 32, dtype=x.dtype, device=x.device) for _ in range(self.num_layers)]
-            "v": [None] * self.num_layers,
+            v_cache = [torch.empty(x.shape[0], 2048, 16, 32, dtype=x.dtype, device=x.device) for _ in range(self.num_layers)]
-            # "xy_pos":None,##y_pos位置编码每次都不一样的没法缓存，每次都要重新拼xy_pos.主要还是写法原因，其实是可以历史统一一样的，但也没啥计算量就不管了
+        else:
-            "y_emb": None,  ##只需要对最新的samples求emb，再拼历史的就行
+            k_cache = [None] * self.num_layers
-            # "logits":None,###原版就已经只对结尾求再拼接了，不用管
+            v_cache = [None] * self.num_layers
            # "xy_dec":None,###不需要，本来只需要最后一个做logits
            "first_infer": 1,
            "stage": 0,
        }
        ###################  first step ##########################
        if y is not None:
            y_emb = self.ar_audio_embedding(y)
@ -361,7 +446,6 @@ class Text2SemanticDecoder(nn.Module):
            prefix_len = y.shape[1]
            y_pos = self.ar_audio_position(y_emb)
            xy_pos = torch.concat([x, y_pos], dim=1)
            cache["y_emb"] = y_emb
            ref_free = False
        else:
            y_emb = None
@ -373,10 +457,10 @@ class Text2SemanticDecoder(nn.Module):
            ref_free = True
        x_attn_mask_pad = F.pad(
-                    x_attn_mask,
+            x_attn_mask,
-                    (0, y_len),  ###xx的纯0扩展到xx纯0+xy纯1，(x,x+y)
+            (0, y_len),  ###xx的纯0扩展到xx纯0+xy纯1，(x,x+y)
-                    value=True,
+            value=True,
-                )
+        )
        y_attn_mask = F.pad(  ###yy的右上1扩展到左边xy的0,(y,x+y)
            torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
            (x_len, 0),
@ -385,64 +469,43 @@ class Text2SemanticDecoder(nn.Module):
        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
            x.device
        )
-        
+
        cache_seqlens = torch.zeros(x.shape[0], dtype=torch.int32, device=x.device)
        for idx in tqdm(range(1500)):
-            
+            logits = self.infer_one_step(xy_pos, xy_attn_mask, k_cache, v_cache, cache_seqlens)
-            xy_dec, _ = self.h((xy_pos, None), mask=xy_attn_mask, cache=cache)
+
-            logits = self.ar_predict_layer(
+            if idx == 0:
-                xy_dec[:, -1]
+                cache_seqlens += xy_pos.shape[1]
-            )  ##不用改，如果用了cache的默认就是只有一帧，取最后一帧一样的
+            else:
-            # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
+                cache_seqlens += 1
-            if(idx==0):###第一次跑不能EOS否则没有了
+            xy_attn_mask = None
-                logits = logits[:, :-1]  ###刨除1024终止符号的概率
+
            if idx == 0:
                logits = logits[:, :-1]
            samples = sample(
                logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
            )[0].unsqueeze(0)
-            # 本次生成的 semantic_ids 和之前的 y 构成新的 y
+
-            # print(samples.shape)#[1,1]#第一个1是bs
+            y = torch.concat([y, samples], dim=1)
            y = torch.concat([y, samples], dim=1) 
            if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
                print("use early stop num:", early_stop_num)
                stop = True
            if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
                # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
                stop = True
            if stop:
-                # if prompts.shape[1] == y.shape[1]:
+                if y.shape[1] == 0:
                #     y = torch.concat([y, torch.zeros_like(samples)], dim=1)
                #     print("bad zero prediction")
                if y.shape[1]==0:
                    y = torch.concat([y, torch.zeros_like(samples)], dim=1)
                    print("bad zero prediction")
                print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
                break
            ####################### update next step ###################################
            cache["first_infer"] = 0
            if cache["y_emb"] is not None:
                y_emb = torch.cat(
                    [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], dim = 1
                )
                cache["y_emb"] = y_emb
                y_pos = self.ar_audio_position(y_emb)
                xy_pos = y_pos[:, -1:]
            else:
                y_emb = self.ar_audio_embedding(y[:, -1:])
                cache["y_emb"] = y_emb
                y_pos = self.ar_audio_position(y_emb)
                xy_pos = y_pos
            y_len = y_pos.shape[1]
-            ###最右边一列（是错的）
+            ####################### update next step ###################################
-            # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
+            y_emb = self.ar_audio_embedding(y[:, -1:])
-            # xy_attn_mask[:,-1]=False
+            xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, prompts.shape[1] + idx]
-            ###最下面一行（是对的）
+
            xy_attn_mask = torch.zeros(
                (1, x_len + y_len), dtype=torch.bool, device=xy_pos.device
            )
        if ref_free:
            return y[:, :-1], 0
-        return y[:, :-1], idx-1
+        return y[:, :-1], idx - 1
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@ -135,6 +135,19 @@
 4-colab修复不开启公网url
 ### 20240306更新
 1-推理加速50%（RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested）https://github.com/RVC-Boss/GPT-SoVITS/pull/672
 2-如果用faster whisper非中文ASR不再需要先下中文funasr模型
 3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610
 4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675
 5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573
 todolist：
 1-中文多音字推理优化
--- a/tools/uvr5/uvr5_weights/.gitignore
+++ b/tools/uvr5/uvr5_weights/.gitignore
@ -0,0 +1,2 @@
 *
 !.gitignore