From c62f629aa722c5755047dc5b786e80003eff0ce9 Mon Sep 17 00:00:00 2001 From: wishhyt <24300810017@m.fudan.edu.cn> Date: Wed, 18 Mar 2026 10:46:19 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=20TTS=20=E9=9F=B3?= =?UTF-8?q?=E9=A2=91=E5=90=8E=E5=A4=84=E7=90=86=E4=B8=AD=E7=9A=84=E5=A4=9A?= =?UTF-8?q?=E4=B8=AA=E7=BC=BA=E9=99=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 修复音频超采样时 int16 双重转换导致整数溢出(CRITICAL) - audio_postprocess 中 `audio = (audio * 32768).astype(np.int16)` 位于 if/else 块之外无条件执行,当 super_sampling=True 时音频已在分支内 转为 int16,再次乘以 32768 导致溢出和音频完全失真 - 同时修复 super_sampling=True 但超分模型不存在时 torch.Tensor 调用 .astype() 的 AttributeError 2. 修复 batched vocoder 推理中 padding_len=0 导致音频丢失(HIGH) - 当 padding_len 恰好为 0 时,`-0 * upsample_rate == 0`,切片 `audio[x:0]` 返回空张量,导致整段音频丢失 3. 修复文件不存在时错误地抛出 FileExistsError(LOW) - 应为 FileNotFoundError Made-with: Cursor --- GPT_SoVITS/TTS_infer_pack/TTS.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 9c8344b0..667f1a4a 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -499,7 +499,7 @@ class TTS: if if_lora_v3 == True and os.path.exists(path_sovits) == False: info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version) - raise FileExistsError(info) + raise FileNotFoundError(info) # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False) dict_s2 = load_sovits_new(weights_path) @@ -1578,16 +1578,15 @@ class TTS: max_audio = np.abs(audio).max() if max_audio > 1: audio /= max_audio - audio = (audio * 32768).astype(np.int16) + audio = (audio * 32768).astype(np.int16) + else: + audio = audio.cpu().numpy() + audio = (audio * 32768).astype(np.int16) t2 = time.perf_counter() print(f"超采样用时:{t2 - t1:.3f}s") else: - # audio = audio.float() * 32768 - # audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy() - audio = audio.cpu().numpy() - - audio = (audio * 32768).astype(np.int16) + audio = (audio * 32768).astype(np.int16) # try: @@ -1768,7 +1767,10 @@ class TTS: pos += chunk_len * upsample_rate audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate) - audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate] + if padding_len > 0: + audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate] + else: + audio = audio[overlapped_len * upsample_rate :] audio_fragments = [] for feat_len in feat_lens: