Merge 04014b959e19ef150f54997f406575d36e14040e into 95354647c87fc86a416361b39e7268afd95e193d

2025-10-07 23:48:48 +08:00 · 2024-07-02 02:01:34 +08:00 · 2024-07-02 02:01:34 +08:00 · ae339ef8d9
commit ae339ef8d9
parent 95354647c8 04014b959e
3 changed files with 15 additions and 3 deletions
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@ -328,6 +328,7 @@ class Text2SemanticDecoder(nn.Module):
        prompts,  ####参考音频token
        bert_feature,
        top_k: int = -100,
+        min_p: float = 0.0,
        top_p: int = 100,
        early_stop_num: int = -1,
        temperature: float = 1.0,
@ -397,7 +398,7 @@ class Text2SemanticDecoder(nn.Module):
            if(idx==0):###第一次跑不能EOS否则没有了
                logits = logits[:, :-1]  ###刨除1024终止符号的概率
            samples = sample(
-                logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
+                logits[0], y, top_k=top_k, min_p=min_p, top_p=top_p, repetition_penalty=1.35, temperature=temperature
            )[0].unsqueeze(0)
            # 本次生成的 semantic_ids 和之前的 y 构成新的 y
            # print(samples.shape)#[1,1]#第一个1是bs
--- a/GPT_SoVITS/AR/models/utils.py
+++ b/GPT_SoVITS/AR/models/utils.py
@ -112,6 +112,7 @@ def logits_to_probs(
    previous_tokens: Optional[torch.Tensor] = None,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
    top_p: Optional[int] = None,
    repetition_penalty: float = 1.0,
 ):
@ -127,6 +128,12 @@ def logits_to_probs(
        )
        logits.scatter_(dim=0, index=previous_tokens, src=score)

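+    # Min-p sampling: mask out every token whose score is below min_p * max_prob, where max_prob is the largest value in the distribution. NOTE(review): the comparison below is applied to the raw logits, not to softmax probabilities; confirm this is intended.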
+    if min_p is not None and min_p < 1.0 and min_p > 0.0:
+        max_prob = torch.max(logits)
+        logits = torch.where(logits < min_p * max_prob, -float("Inf"), logits)
+
+    # Top-p sampling: sample from the smallest set of tokens whose cumulative probability mass exceeds the threshold `top_p`.
    if top_p is not None and top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(
@ -139,8 +146,10 @@ def logits_to_probs(
        )
        logits = logits.masked_fill(indices_to_remove, -float("Inf"))

+    # Scale the logits by the temperature (floored at 1e-5 to avoid division by zero).
    logits = logits / max(temperature, 1e-5)

+    # Top-k sampling: sample only from the k most likely tokens.
    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        pivot = v.select(-1, -1).unsqueeze(-1)
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -312,7 +312,7 @@ def merge_short_text_in_array(texts, threshold):
            result[len(result) - 1] += text
    return result

-def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False):
+def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, min_p=0.0, top_p=0.6, temperature=0.6, ref_free = False):
    if prompt_text is None or len(prompt_text) == 0:
        ref_free = True
    t0 = ttime()
@ -402,6 +402,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                bert,
                # prompt_phone_len=ph_offset,
                top_k=top_k,
+                min_p=min_p,
                top_p=top_p,
                temperature=temperature,
                early_stop_num=hz * max_sec,
@ -612,6 +613,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
            with gr.Row():
                gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低)："))
                top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
+                min_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("min_p"),value=0.0,interactive=True)
                top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
                temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
            inference_button = gr.Button(i18n("合成语音"), variant="primary")
@ -619,7 +621,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:

        inference_button.click(
            get_tts_wav,
-            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
+            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, min_p, top_p, temperature, ref_text_free],
            [output],
        )