From d08214dd22e8837b4f9eb92f94a1d419806e182a Mon Sep 17 00:00:00 2001 From: ChasonJiang <1440499136@qq.com> Date: Tue, 1 Jul 2025 22:27:03 +0800 Subject: [PATCH] modified: GPT_SoVITS/TTS_infer_pack/TTS.py --- GPT_SoVITS/TTS_infer_pack/TTS.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index d2a2d3ce..813117a2 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1799,9 +1799,16 @@ class TTS: f2 = audio_fragments[i + 1] w1 = f1[-overlap_len:] w2 = f2[:overlap_len] - assert w1.shape == w2.shape - corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1] - idx = corr.argmax() + w2 = w2[-w2.shape[-1]//2:] + # assert w1.shape == w2.shape + corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1)).view(-1) + + squared_sum = F.conv1d(w1.view(1, 1, -1)**2, torch.ones_like(w2).view(1, 1, -1)).view(-1)+ 1e-8 + idx = (corr/squared_sum.sqrt()).argmax() + + print(f"seg_idx: {idx}") + + # idx = corr.argmax() f1_ = f1[: -(overlap_len - idx)] audio_fragments[i] = f1_