From 6feafac1df77bd8e3cbab92d6b29d626112b646f Mon Sep 17 00:00:00 2001
From: csh <458761603@qq.com>
Date: Fri, 13 Jun 2025 17:33:20 +0800
Subject: [PATCH] =?UTF-8?q?perf(export=5Ftorch=5Fscript):=20=E7=BC=93?=
 =?UTF-8?q?=E5=AD=98=20Vits=20=E4=B8=AD=E7=94=A8=E5=88=B0=E7=9A=84=20hann?=
 =?UTF-8?q?=5Fwindow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/export_torch_script.py      | 22 +++++++++++++---------
 GPT_SoVITS/export_torch_script_v3v4.py | 10 +++++++---
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py
index 66999c0a..bf32ed6e 100644
--- a/GPT_SoVITS/export_torch_script.py
+++ b/GPT_SoVITS/export_torch_script.py
@@ -129,8 +129,8 @@ def sample(
 
 
 @torch.jit.script
-def spectrogram_torch(y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False):
-    hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype)
+def spectrogram_torch(hann_window:Tensor, y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False):
+    # hann_window is precomputed once by the caller (formerly torch.hann_window(win_size) on y's device/dtype) to avoid rebuilding it on every call.
     y = torch.nn.functional.pad(
         y.unsqueeze(1),
         (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
@@ -349,7 +349,7 @@ class T2STransformer:
 
 
 class VitsModel(nn.Module):
-    def __init__(self, vits_path, version=None):
+    def __init__(self, vits_path, version=None, is_half=True, device="cpu"):
         super().__init__()
         # dict_s2 = torch.load(vits_path,map_location="cpu")
         dict_s2 = load_sovits_new(vits_path)
@@ -374,11 +374,18 @@ class VitsModel(nn.Module):
             n_speakers=self.hps.data.n_speakers,
             **self.hps.model,
         )
-        self.vq_model.eval()
         self.vq_model.load_state_dict(dict_s2["weight"], strict=False)
+        self.vq_model.dec.remove_weight_norm()
+        if is_half:
+            self.vq_model = self.vq_model.half()
+        self.vq_model = self.vq_model.to(device)
+        self.vq_model.eval()
+        self.hann_window = torch.hann_window(self.hps.data.win_length, device=device, dtype= torch.float16 if is_half else torch.float32)
+
 
     def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0, sv_emb=None):
         refer = spectrogram_torch(
+            self.hann_window,
             ref_audio,
             self.hps.data.filter_length,
             self.hps.data.sampling_rate,
@@ -668,7 +675,7 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
     ssl_content = ssl(ref_audio).to(device)
 
     # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
-    vits = VitsModel(vits_path).to(device)
+    vits = VitsModel(vits_path,device=device,is_half=False)
     vits.eval()
 
     # gpt_path = "GPT_weights_v2/xw-e15.ckpt"
@@ -766,10 +773,7 @@ def export_prov2(
     sv_model = ExportERes2NetV2(sv_cn_model)
 
     # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
-    vits = VitsModel(vits_path, version)
-    if is_half:
-        vits.vq_model = vits.vq_model.half()
-    vits.to(device)
+    vits = VitsModel(vits_path, version,is_half=is_half,device=device)
     vits.eval()
 
     # gpt_path = "GPT_weights_v2/xw-e15.ckpt"
diff --git a/GPT_SoVITS/export_torch_script_v3v4.py b/GPT_SoVITS/export_torch_script_v3v4.py
index 55d27282..1fd63b3c 100644
--- a/GPT_SoVITS/export_torch_script_v3v4.py
+++ b/GPT_SoVITS/export_torch_script_v3v4.py
@@ -243,6 +243,7 @@ class ExportGPTSovitsHalf(torch.nn.Module):
         self.sampling_rate: int = hps.data.sampling_rate
         self.hop_length: int = hps.data.hop_length
         self.win_length: int = hps.data.win_length
+        self.hann_window = torch.hann_window(self.win_length, device=device, dtype=torch.float32)
 
     def forward(
         self,
@@ -255,6 +256,7 @@ class ExportGPTSovitsHalf(torch.nn.Module):
         top_k,
     ):
         refer = spectrogram_torch(
+            self.hann_window,
             ref_audio_32k,
             self.filter_length,
             self.sampling_rate,
@@ -321,6 +323,7 @@ class ExportGPTSovitsV4Half(torch.nn.Module):
         self.sampling_rate: int = hps.data.sampling_rate
         self.hop_length: int = hps.data.hop_length
         self.win_length: int = hps.data.win_length
+        self.hann_window = torch.hann_window(self.win_length, device=device, dtype=torch.float32)
 
     def forward(
         self,
@@ -333,6 +336,7 @@ class ExportGPTSovitsV4Half(torch.nn.Module):
         top_k,
     ):
         refer = spectrogram_torch(
+            self.hann_window,
             ref_audio_32k,
             self.filter_length,
             self.sampling_rate,
@@ -1149,7 +1153,7 @@ def export_2(version="v3"):
         raw_t2s = raw_t2s.half().to(device)
     t2s_m = T2SModel(raw_t2s).half().to(device)
     t2s_m.eval()
-    t2s_m = torch.jit.script(t2s_m)
+    t2s_m = torch.jit.script(t2s_m).to(device)
     t2s_m.eval()
     # t2s_m.top_k = 15
     logger.info("t2s_m ok")
@@ -1251,6 +1255,6 @@ def test_export_gpt_sovits_v3():
 
 
 with torch.no_grad():
-    export_1("onnx/ad/ref.wav","你这老坏蛋，我找了你这么久，真没想到在这里找到你。他说。","v4")
-    # export_2("v4")
+    # export_1("onnx/ad/ref.wav","你这老坏蛋，我找了你这么久，真没想到在这里找到你。他说。","v4")
+    export_2("v4")
     # test_export_gpt_sovits_v3()