From 6bce575d69ef6d415de4d5e17b2c9e1c9df05545 Mon Sep 17 00:00:00 2001
From: ChasonJiang <1440499136@qq.com>
Date: Wed, 26 Nov 2025 14:41:42 +0800
Subject: [PATCH] Support streaming inference with a fixed chunk length;
 optimize the SOLA algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/AR/models/t2s_model.py |  3 +-
 GPT_SoVITS/TTS_infer_pack/TTS.py  | 56 ++++++++++++++++---------------
 api_v2.py                         | 33 ++++++++++--------
 3 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py
index d72aa393..0caadd04 100644
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@@ -940,9 +940,10 @@ class Text2SemanticDecoder(nn.Module):
             elif streaming_mode and (mute_emb_sim_matrix is None) and (token_counter >= chunk_length):
-                token_counter == chunk_length
                 yield y[:, -token_counter:], False
+                curr_ptr += token_counter
                 token_counter = 0
+
             ####################### update next step ###################################
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 1d25e30a..be3d3a19 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -1014,18 +1014,19 @@ class TTS:
                     "text_split_method": "cut1", # str. text split method, see text_segmentation_method.py for details.
                     "batch_size": 1, # int. batch size for inference
                     "batch_threshold": 0.75, # float. threshold for batch splitting.
-                    "split_bucket: True, # bool. whether to split the batch into multiple buckets.
-                    "return_fragment": False, # bool. step by step return the audio fragment.
+                    "split_bucket": True, # bool. whether to split the batch into multiple buckets.
                     "speed_factor":1.0, # float. control the speed of the synthesized audio.
                     "fragment_interval":0.3, # float. to control the interval of the audio fragment.
                     "seed": -1, # int. random seed for reproducibility.
                     "parallel_infer": True, # bool. whether to use parallel inference.
-                    "repetition_penalty": 1.35 # float. repetition penalty for T2S model.
+                    "repetition_penalty": 1.35, # float. repetition penalty for T2S model.
                     "sample_steps": 32, # int. number of sampling steps for VITS model V3.
                     "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
+                    "return_fragment": False, # bool. return the audio fragments one by one. (Best quality, slowest response speed; the legacy streaming mode)
+                    "streaming_mode": False, # bool. return audio chunk by chunk. (Medium quality, slow response speed)
                     "overlap_length": 2, # int. overlap length of semantic tokens for streaming mode.
-                    "min_chunk_length: 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
+                    "min_chunk_length": 16, # int. the minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
+                    "fixed_length_chunk": False, # bool. stream fixed-length chunks for the fastest response, at some cost in quality. (Lowest quality, fastest response speed)
                 }
         returns:
             Tuple[int, np.ndarray]: sampling rate and audio data.
@@ -1058,6 +1059,7 @@ class TTS:
         streaming_mode = inputs.get("streaming_mode", False)
         overlap_length = inputs.get("overlap_length", 2)
         min_chunk_length = inputs.get("min_chunk_length", 16)
+        fixed_length_chunk = inputs.get("fixed_length_chunk", False)
         chunk_split_thershold = 0.0  # cosine-similarity threshold between a semantic token and the mute token; a token scoring above it counts as a valid split point.
 
         if parallel_infer and not streaming_mode:
@@ -1367,7 +1369,7 @@ class TTS:
                         repetition_penalty=repetition_penalty,
                         streaming_mode=True,
                         chunk_length=min_chunk_length,
-                        mute_emb_sim_matrix=self.configs.mute_emb_sim_matrix,
+                        mute_emb_sim_matrix=self.configs.mute_emb_sim_matrix if not fixed_length_chunk else None,
                         chunk_split_thershold=chunk_split_thershold,
                     )
                     t4 = time.perf_counter()
@@ -1456,11 +1458,6 @@ class TTS:
                                 else audio_chunk_[last_audio_chunk.shape[0]-overlap_size:]
                         )
-                        # audio_chunk_ = (
-                        #     audio_chunk_[overlap_size:-overlap_size] if not is_final \
-                        #         else audio_chunk_[overlap_size:]
-                        # )
-
                         last_latent = latent
                         last_audio_chunk = audio_chunk
                         yield self.audio_postprocess(
@@ -1785,30 +1782,35 @@ class TTS:
         self,
         audio_fragments: List[torch.Tensor],
         overlap_len: int,
+        search_len: int = 320,
     ):
-        for i in range(len(audio_fragments) - 1):
-            f1 = audio_fragments[i]
-            f2 = audio_fragments[i + 1]
-            w1 = f1[-overlap_len:]
-            w2 = f2[:overlap_len]
-            w2 = w2[-w2.shape[-1]//2:]
-            # assert w1.shape == w2.shape
-            corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1)).view(-1)
+        # overlap_len -= search_len
 
-            squared_sum = F.conv1d(w1.view(1, 1, -1)**2, torch.ones_like(w2).view(1, 1, -1)).view(-1) + 1e-8
-            idx = (corr/squared_sum.sqrt()).argmax()
+        dtype = audio_fragments[0].dtype
+
+        for i in range(len(audio_fragments) - 1):
+            f1 = audio_fragments[i].float()
+            f2 = audio_fragments[i + 1].float()
+            w1 = f1[-overlap_len:]
+            w2 = f2[:overlap_len + search_len]
+            # w2 = w2[-w2.shape[-1]//2:]
+            # assert w1.shape == w2.shape
+            corr_norm = F.conv1d(w2.view(1, 1, -1), w1.view(1, 1, -1)).view(-1)
+
+            corr_den = F.conv1d(w2.view(1, 1, -1)**2, torch.ones_like(w1).view(1, 1, -1)).view(-1) + 1e-8
+            idx = (corr_norm / corr_den.sqrt()).argmax()
             print(f"seg_idx: {idx}")
             # idx = corr.argmax()
-            f1_ = f1[: -(overlap_len - idx)]
+            f1_ = f1[:-overlap_len]
             audio_fragments[i] = f1_
             f2_ = f2[idx:]
-            window = torch.hann_window((overlap_len - idx) * 2, device=f1.device, dtype=f1.dtype)
-            f2_[: (overlap_len - idx)] = (
-                window[: (overlap_len - idx)] * f2_[: (overlap_len - idx)]
-                + window[(overlap_len - idx) :] * f1[-(overlap_len - idx) :]
+            window = torch.hann_window(overlap_len * 2, device=f1.device, dtype=f1.dtype)
+            f2_[:overlap_len] = (
+                window[:overlap_len] * f2_[:overlap_len]
+                + window[overlap_len:] * f1[-overlap_len:]
             )
             # window = torch.sin(torch.arange((overlap_len - idx), device=f1.device) * np.pi / (overlap_len - idx))
@@ -1819,4 +1821,4 @@
 
             audio_fragments[i + 1] = f2_
 
-        return torch.cat(audio_fragments, 0)
+        return torch.cat(audio_fragments, 0).to(dtype)
diff --git a/api_v2.py b/api_v2.py
index 7aeb5c16..21e9c0c5 100644
--- a/api_v2.py
+++ b/api_v2.py
@@ -35,15 +35,17 @@ POST:
     "batch_threshold": 0.75, # float. threshold for batch splitting.
     "split_bucket": True, # bool. whether to split the batch into multiple buckets.
     "speed_factor":1.0, # float. control the speed of the synthesized audio.
-    "streaming_mode": False, # bool. whether to return a streaming response.
+    "fragment_interval":0.3, # float. to control the interval of the audio fragment.
     "seed": -1, # int. random seed for reproducibility.
     "parallel_infer": True, # bool. whether to use parallel inference.
"repetition_penalty": 1.35, # float. repetition penalty for T2S model. "sample_steps": 32, # int. number of sampling steps for VITS model V3. "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. + "return_fragment": False, # bool. step by step return the audio fragment. (Best Quality, Slowest response speed. old version of streaming mode) + "streaming_mode": False, # bool. return audio chunk by chunk. (Medium quality, Slow response speed) "overlap_length": 2, # int. overlap length of semantic tokens for streaming mode. - "min_chunk_length: 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size) - "return_fragment": False, # bool. step by step return the audio fragment. (old version of streaming mode) + "min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size) + "fixed_length_chunk": False, # bool. When turned on, it can achieve faster streaming response, but with lower quality. (lower quality, faster response speed) } ``` @@ -176,6 +178,7 @@ class TTS_Request(BaseModel): overlap_length: int = 2 min_chunk_length: int = 16 return_fragment: bool = False + fixed_length_chunk: bool = False ### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files @@ -313,7 +316,7 @@ async def tts_handle(req: dict): "text": "", # str.(required) text to be synthesized "text_lang: "", # str.(required) language of the text to be synthesized "ref_audio_path": "", # str.(required) reference audio path - "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker synthesis + "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion "prompt_text": "", # str.(optional) prompt text for the reference audio "prompt_lang": "", # str.(required) language of the prompt text for the reference audio "top_k": 5, # int. top k sampling @@ -322,19 +325,19 @@ async def tts_handle(req: dict): "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details. "batch_size": 1, # int. batch size for inference "batch_threshold": 0.75, # float. threshold for batch splitting. - "split_bucket: True, # bool. whether to split the batch into multiple buckets. + "split_bucket": True, # bool. whether to split the batch into multiple buckets. "speed_factor":1.0, # float. control the speed of the synthesized audio. "fragment_interval":0.3, # float. to control the interval of the audio fragment. "seed": -1, # int. random seed for reproducibility. - "media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac". - "streaming_mode": False, # bool. whether to return a streaming response. - "parallel_infer": True, # bool.(optional) whether to use parallel inference. - "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. + "parallel_infer": True, # bool. whether to use parallel inference. + "repetition_penalty": 1.35, # float. repetition penalty for T2S model. "sample_steps": 32, # int. number of sampling steps for VITS model V3. "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. + "return_fragment": False, # bool. step by step return the audio fragment. (Best Quality, Slowest response speed. old version of streaming mode) + "streaming_mode": False, # bool. return audio chunk by chunk. (Medium quality, Slow response speed) "overlap_length": 2, # int. 
-                "min_chunk_length: 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
-                "return_fragment": False, # bool. step by step return the audio fragment. (old version of streaming mode)
+                "min_chunk_length": 16, # int. the minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
+                "fixed_length_chunk": False, # bool. stream fixed-length chunks for the fastest response, at some cost in quality. (Lowest quality, fastest response speed)
             }
     returns:
         StreamingResponse: audio stream response.
@@ -402,7 +405,7 @@ async def tts_get_endpoint(
     top_k: int = 5,
     top_p: float = 1,
     temperature: float = 1,
-    text_split_method: str = "cut0",
+    text_split_method: str = "cut5",
     batch_size: int = 1,
     batch_threshold: float = 0.75,
     split_bucket: bool = True,
@@ -410,14 +413,15 @@ async def tts_get_endpoint(
     fragment_interval: float = 0.3,
     seed: int = -1,
     media_type: str = "wav",
-    streaming_mode: bool = False,
     parallel_infer: bool = True,
    repetition_penalty: float = 1.35,
     sample_steps: int = 32,
     super_sampling: bool = False,
+    return_fragment: bool = False,
+    streaming_mode: bool = False,
     overlap_length: int = 2,
     min_chunk_length: int = 16,
-    return_fragment: bool = False,
+    fixed_length_chunk: bool = False,
 ):
     req = {
         "text": text,
@@ -445,6 +449,7 @@ async def tts_get_endpoint(
         "overlap_length": int(overlap_length),
         "min_chunk_length": int(min_chunk_length),
         "return_fragment": return_fragment,
+        "fixed_length_chunk": fixed_length_chunk,
     }
     return await tts_handle(req)
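
Reviewer notes.

On the t2s_model.py change: when `fixed_length_chunk` is enabled, `mute_emb_sim_matrix` is passed as `None`, so the decoder takes the fixed-length branch above and flushes a chunk every `chunk_length` tokens instead of waiting for a mute-token split point. A minimal sketch of that yield pattern, with a plain tensor standing in for the autoregressive decode loop (`fixed_chunk_stream` and the simulated token stream are illustrative, not part of this patch):

```python
import torch

def fixed_chunk_stream(tokens: torch.Tensor, chunk_length: int = 16):
    """Yield (chunk, is_final) pairs every chunk_length tokens."""
    y = tokens.new_empty((1, 0))          # accumulated sequence, shape (1, T)
    token_counter = 0                     # tokens generated since the last flush
    curr_ptr = 0                          # start of the not-yet-emitted suffix
    for t in tokens:                      # stand-in for the AR decode loop
        y = torch.cat([y, t.view(1, 1)], dim=1)
        token_counter += 1
        if token_counter >= chunk_length:
            yield y[:, -token_counter:], False   # emit a fixed-size chunk
            curr_ptr += token_counter            # advance past what was emitted
            token_counter = 0
    yield y[:, curr_ptr:], True           # flush the remainder as the final chunk

for chunk, is_final in fixed_chunk_stream(torch.arange(40)):
    print(chunk.shape, is_final)          # (1, 16), (1, 16), then the (1, 8) tail
```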
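On the reworked splice in TTS.py: this is a SOLA-style (synchronized overlap-add) alignment. Instead of correlating two equal-length windows, it now slides the fixed tail of the previous fragment across a `search_len`-sample search region at the head of the next fragment, picks the offset with the highest energy-normalized cross-correlation, and crossfades over a constant `overlap_len` with a Hann window. A self-contained sketch of the same technique for two 1-D float fragments (`splice_pair` is an illustrative name; it assumes `len(f2) >= overlap_len + search_len`):

```python
import torch
import torch.nn.functional as F

def splice_pair(f1: torch.Tensor, f2: torch.Tensor,
                overlap_len: int, search_len: int = 320) -> torch.Tensor:
    w1 = f1[-overlap_len:]                      # reference: tail of fragment 1
    w2 = f2[:overlap_len + search_len]          # search region: head of fragment 2
    # conv1d slides w1 across w2 -> one correlation value per candidate offset
    corr = F.conv1d(w2.view(1, 1, -1), w1.view(1, 1, -1)).view(-1)
    energy = F.conv1d(w2.view(1, 1, -1) ** 2,
                      torch.ones_like(w1).view(1, 1, -1)).view(-1) + 1e-8
    idx = (corr / energy.sqrt()).argmax()       # best offset in [0, search_len]
    f2 = f2[idx:]                               # drop the misaligned head
    win = torch.hann_window(overlap_len * 2, device=f1.device, dtype=f1.dtype)
    # fade f2 in while fading f1's tail out over a constant-length overlap
    head = win[:overlap_len] * f2[:overlap_len] + win[overlap_len:] * f1[-overlap_len:]
    return torch.cat([f1[:-overlap_len], head, f2[overlap_len:]], dim=0)
```

Normalizing by the energy of each candidate window, rather than taking an argmax over the raw correlation, keeps loud regions from dominating the alignment; and because the overlap length is now constant, the Hann window can be built once per pair instead of depending on the chosen offset.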
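On the api_v2.py side, the new flag simply rides along with the existing query parameters. A hypothetical client call for the streaming path; the host, port, and reference-audio path are assumptions, while `/tts` and the parameter names come from the docstring above:

```python
import requests

params = {
    "text": "Streaming test.",
    "text_lang": "en",
    "ref_audio_path": "ref.wav",     # illustrative path to a reference clip
    "prompt_lang": "en",
    "media_type": "raw",             # raw PCM is simplest to play as it arrives
    "streaming_mode": True,
    "fixed_length_chunk": True,      # the new fast path: fixed-size chunks
    "min_chunk_length": 16,
    "overlap_length": 2,
}

with requests.get("http://127.0.0.1:9880/tts", params=params, stream=True) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None):
        if chunk:
            ...  # hand the PCM bytes to an audio sink as they arrive
```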