Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2026-01-09 11:57:01 +08:00

Compare commits: 18 commits, 216f60e39c ... e78aada32e
| SHA1 |
|---|
| e78aada32e |
| c767f0b83b |
| 9080a967d5 |
| 51df9f7384 |
| bfca0f6b2d |
| abe984395c |
| cc89c3660e |
| 539ad38d31 |
| 6df7921d56 |
| 8d212a9255 |
| 2fb92e74a2 |
| be16f387f1 |
| 7e6a607b9e |
| ab5e8dc0ae |
| b666becb19 |
| 47426d18e7 |
| b8de0ec0ac |
| f20f17c2c0 |
@@ -707,10 +707,12 @@ class Text2SemanticDecoder(nn.Module):
             if idx == 0:
                 attn_mask = F.pad(attn_mask[:, :, -1].unsqueeze(-2), (0, 1), value=False)
                 logits = logits[:, :-1]
             else:
                 attn_mask = F.pad(attn_mask, (0, 1), value=False)

+            if idx < 11:  ###至少预测出10个token不然不给停止(0.4s)
+                logits = logits[:, :-1]

             samples = sample(
                 logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
             )[0]
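The inline comment on the added guard reads, roughly, "require at least 10 predicted tokens before allowing a stop (about 0.4 s)": while idx < 11, the last logit column, which holds the stop token, is sliced off so sampling cannot terminate early. Below is a minimal standalone sketch of the same idea; the function name, the vocabulary size, and the EOS-in-last-column convention are illustrative assumptions, not code from this change.

```python
import torch


def apply_min_length_guard(logits: torch.Tensor, step: int, min_tokens: int = 10) -> torch.Tensor:
    """Drop the stop-token column until at least `min_tokens` tokens have been generated.

    Assumes `logits` has shape (batch, vocab) with the stop/EOS token in the last
    column, mirroring the `if idx < 11: logits = logits[:, :-1]` check in the diff above.
    """
    if step < min_tokens + 1:
        return logits[:, :-1]  # stop token removed, so it can never be sampled
    return logits


# Toy usage with a made-up vocabulary of 1024 tokens plus one EOS column.
fake_logits = torch.randn(1, 1025)
print(apply_min_length_guard(fake_logits, step=3).shape)   # torch.Size([1, 1024])
print(apply_min_length_guard(fake_logits, step=20).shape)  # torch.Size([1, 1025])
```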
@@ -1008,7 +1008,7 @@ class TTS:
             "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
             "prompt_text": "", # str.(optional) prompt text for the reference audio
             "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-            "top_k": 5, # int. top k sampling
+            "top_k": 15, # int. top k sampling
             "top_p": 1, # float. top p sampling
             "temperature": 1, # float. temperature for sampling
             "text_split_method": "cut1", # str. text split method, see text_segmentation_method.py for details.
@@ -1039,7 +1039,7 @@ class TTS:
         aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
         prompt_text: str = inputs.get("prompt_text", "")
         prompt_lang: str = inputs.get("prompt_lang", "")
-        top_k: int = inputs.get("top_k", 5)
+        top_k: int = inputs.get("top_k", 15)
         top_p: float = inputs.get("top_p", 1)
         temperature: float = inputs.get("temperature", 1)
         text_split_method: str = inputs.get("text_split_method", "cut1")
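Both hunks raise the default top_k used by TTS.run from 5 to 15, once in the parameter docstring and once in the inputs.get fallback; requests that pass top_k explicitly behave exactly as before, since the new value only applies when the key is absent. A small illustration of that lookup (the dicts below are stand-ins, not real TTS.run inputs):

```python
# Stand-in request dicts; only the "top_k" lookup is being illustrated.
defaulted = {"text": "hello", "prompt_lang": "en"}             # no "top_k" key
explicit = {"text": "hello", "prompt_lang": "en", "top_k": 5}  # caller-chosen value

print(defaulted.get("top_k", 15))  # 15 -> the new default applies
print(explicit.get("top_k", 15))   # 5  -> an explicit value still wins
```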
@@ -385,7 +385,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                     minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
                 )
                 with gr.Row():
-                    top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
+                    top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True)
                     top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
                 with gr.Row():
                     temperature = gr.Slider(
@@ -347,7 +347,7 @@ Use v4 from v1/v2/v3 environment:

 2. Clone the latest codes from github.

-3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
+3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.

 ## V2Pro Release Notes
@@ -27,7 +27,7 @@ POST:
     "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
     "prompt_text": "", # str.(optional) prompt text for the reference audio
     "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-    "top_k": 5, # int. top k sampling
+    "top_k": 15, # int. top k sampling
     "top_p": 1, # float. top p sampling
     "temperature": 1, # float. temperature for sampling
     "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
@@ -158,7 +158,7 @@ class TTS_Request(BaseModel):
     aux_ref_audio_paths: list = None
     prompt_lang: str = None
     prompt_text: str = ""
-    top_k: int = 5
+    top_k: int = 15
     top_p: float = 1
     temperature: float = 1
     text_split_method: str = "cut5"
@@ -355,7 +355,7 @@ async def tts_handle(req: dict):
             "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
             "prompt_text": "", # str.(optional) prompt text for the reference audio
             "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-            "top_k": 5, # int. top k sampling
+            "top_k": 15, # int. top k sampling
             "top_p": 1, # float. top p sampling
             "temperature": 1, # float. temperature for sampling
             "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
@@ -460,7 +460,7 @@ async def tts_get_endpoint(
     aux_ref_audio_paths: list = None,
     prompt_lang: str = None,
     prompt_text: str = "",
-    top_k: int = 5,
+    top_k: int = 15,
     top_p: float = 1,
     temperature: float = 1,
     text_split_method: str = "cut5",
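All four hunks make the same change in the API layer: the documented and default top_k moves from 5 to 15 for both the POST body and the GET query parameters. A hypothetical client call follows; the /tts route, the 127.0.0.1:9880 address, and the text/text_lang/ref_audio_path fields are assumptions drawn from the wider project, not shown in this diff.

```python
# Hypothetical POST against the api_v2 service; route, host, port, and the
# fields not shown in the excerpt above (text, text_lang, ref_audio_path) are assumptions.
import requests

payload = {
    "text": "Hello there.",
    "text_lang": "en",
    "ref_audio_path": "ref.wav",   # placeholder reference clip
    "prompt_text": "",
    "prompt_lang": "en",
    # "top_k" omitted on purpose: the server-side default is now 15 instead of 5
    "top_p": 1,
    "temperature": 1,
    "text_split_method": "cut5",
}

resp = requests.post("http://127.0.0.1:9880/tts", json=payload)
resp.raise_for_status()
with open("output.wav", "wb") as f:   # assumes the response body is the rendered audio
    f.write(resp.content)
```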
go-webui-simple-mode.bat (new file, 2 lines)

@@ -0,0 +1,2 @@
+runtime\python.exe -I webui_simple.py zh_CN
+pause
@@ -19,7 +19,7 @@ torchaudio
 modelscope
 sentencepiece
 transformers>=4.43,<=4.50
-peft
+peft<0.18.0
 chardet
 PyYAML
 psutil
webui_simple.py (new file, 2083 lines; diff suppressed because it is too large)