From 0ba0ebcf6534d12b0c63b981249d62657cc1eb7c Mon Sep 17 00:00:00 2001
From: ChasonJiang <1440499136@qq.com>
Date: Mon, 21 Apr 2025 22:10:39 +0800
Subject: [PATCH] =?UTF-8?q?=E9=80=82=E9=85=8Dv4=E7=89=88=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/TTS_infer_pack/TTS.py   | 23 ++++++++++++-----------
 GPT_SoVITS/configs/tts_infer.yaml  |  6 +++---
 GPT_SoVITS/inference_webui_fast.py |  2 +-
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 19b01116..13cc513c 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -106,11 +106,12 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
 resample_transform_dict = {}
 
 
-def resample(audio_tensor, sr0, device):
+def resample(audio_tensor, sr0, sr1, device):
     global resample_transform_dict
-    if sr0 not in resample_transform_dict:
-        resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
-    return resample_transform_dict[sr0](audio_tensor)
+    key = "%s-%s-%s" % (sr0, sr1, device)  # one cached transform per (source rate, target rate, device); each is moved to its device on creation
+    if key not in resample_transform_dict:
+        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
+    return resample_transform_dict[key](audio_tensor)
 
 
 class DictToAttrRecursive(dict):
@@ -1372,9 +1373,10 @@ class TTS:
         if ref_audio.shape[0] == 2:
             ref_audio = ref_audio.mean(0).unsqueeze(0)
 
-        tgt_sr = self.vocoder_configs["sr"]
+        # Target rate is now fixed per model version instead of read from self.vocoder_configs["sr"].
+        tgt_sr = 24000 if self.configs.version == "v3" else 32000
         if ref_sr != tgt_sr:
-            ref_audio = resample(ref_audio, ref_sr, self.configs.device)
+            ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device)
 
         mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
         mel2 = norm_spec(mel2)
@@ -1437,12 +1439,11 @@ class TTS:
         ref_audio = ref_audio.to(self.configs.device).float()
         if ref_audio.shape[0] == 2:
             ref_audio = ref_audio.mean(0).unsqueeze(0)
-        if ref_sr != 24000:
-            ref_audio = resample(ref_audio, ref_sr, self.configs.device)
-
-        tgt_sr = self.vocoder_configs["sr"]
+            
+        # Target rate is now fixed per model version instead of read from self.vocoder_configs["sr"].
+        tgt_sr = 24000 if self.configs.version == "v3" else 32000
         if ref_sr != tgt_sr:
-            ref_audio = resample(ref_audio, ref_sr, self.configs.device)
+            ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device)
 
         mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
         mel2 = norm_spec(mel2)
diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml
index 20c41a20..85e8f9d7 100644
--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@@ -3,9 +3,9 @@ custom:
   cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
   device: cuda
   is_half: true
-  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
-  version: v2
-  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+  t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
+  version: v3
+  vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
 v1:
   bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
   cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py
index 99cd44d0..547b1e04 100644
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@@ -397,7 +397,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                         minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True
                     )
                     sample_steps = gr.Radio(
-                        label=i18n("采样步数(仅对V3生效)"), value=32, choices=[4, 8, 16, 32], visible=True
+                        label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32], visible=True
                     )
                 with gr.Row():
                     fragment_interval = gr.Slider(