From c62f629aa722c5755047dc5b786e80003eff0ce9 Mon Sep 17 00:00:00 2001
From: wishhyt <24300810017@m.fudan.edu.cn>
Date: Wed, 18 Mar 2026 10:46:19 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=20TTS=20=E9=9F=B3?=
 =?UTF-8?q?=E9=A2=91=E5=90=8E=E5=A4=84=E7=90=86=E4=B8=AD=E7=9A=84=E5=A4=9A?=
 =?UTF-8?q?=E4=B8=AA=E7=BC=BA=E9=99=B7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 修复音频超采样时 int16 双重转换导致整数溢出（CRITICAL）
   - audio_postprocess 中 `audio = (audio * 32768).astype(np.int16)` 位于
     if/else 块之外无条件执行，当 super_sampling=True 时音频已在分支内
     转为 int16，再次乘以 32768 导致溢出和音频完全失真
   - 同时修复 super_sampling=True 但超分模型不存在时 torch.Tensor 调用
     .astype() 的 AttributeError

2. 修复 batched vocoder 推理中 padding_len=0 导致音频丢失（HIGH）
   - 当 padding_len 恰好为 0 时，`-0 * upsample_rate == 0`，切片
     `audio[x:0]` 返回空张量，导致整段音频丢失

3. 修复加载 LoRA 权重时 SoVITS 底模文件缺失却错误地抛出 FileExistsError（LOW）
   - 该分支处理的是文件不存在的情况，应抛出 FileNotFoundError

Made-with: Cursor
---
 GPT_SoVITS/TTS_infer_pack/TTS.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 9c8344b0..667f1a4a 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -499,7 +499,7 @@ class TTS:
 
         if if_lora_v3 == True and os.path.exists(path_sovits) == False:
             info = path_sovits + i18n("SoVITS %s 底模缺失，无法加载相应 LoRA 权重" % model_version)
-            raise FileExistsError(info)
+            raise FileNotFoundError(info)
 
         # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
         dict_s2 = load_sovits_new(weights_path)
@@ -1578,16 +1578,15 @@ class TTS:
                 max_audio = np.abs(audio).max()
                 if max_audio > 1:
                     audio /= max_audio
-            audio = (audio * 32768).astype(np.int16)
+                audio = (audio * 32768).astype(np.int16)
+            else:
+                audio = audio.cpu().numpy()
+                audio = (audio * 32768).astype(np.int16)
             t2 = time.perf_counter()
             print(f"超采样用时：{t2 - t1:.3f}s")
         else:
-            # audio = audio.float() * 32768
-            # audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy()
-
             audio = audio.cpu().numpy()
-
-        audio = (audio * 32768).astype(np.int16)
+            audio = (audio * 32768).astype(np.int16)
 
 
         # try:
@@ -1768,7 +1767,10 @@ class TTS:
             pos += chunk_len * upsample_rate
 
         audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
-        audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
+        if padding_len > 0:
+            audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
+        else:
+            audio = audio[overlapped_len * upsample_rate :]
 
         audio_fragments = []
         for feat_len in feat_lens: