From b9a9c88a8c7beb0e29fea689188fa715b8733f93 Mon Sep 17 00:00:00 2001 From: PocketDocLabs Date: Tue, 21 May 2024 03:56:13 -0700 Subject: [PATCH] modified: .gitignore modified: GPT_SoVITS/AR/models/t2s_model.py modified: GPT_SoVITS/AR/models/utils.py modified: GPT_SoVITS/inference_webui.py --- .gitignore | 4 +++- GPT_SoVITS/AR/models/t2s_model.py | 3 ++- GPT_SoVITS/AR/models/utils.py | 9 +++++++++ GPT_SoVITS/inference_webui.py | 6 ++++-- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c484cf22..f476f95f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ logs reference GPT_weights SoVITS_weights -TEMP \ No newline at end of file +TEMP +gweight.txt +sweight.txt diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index c8ad3d82..c53b6383 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -329,6 +329,7 @@ class Text2SemanticDecoder(nn.Module): bert_feature, top_k: int = -100, top_p: int = 100, + min_p: int = 80, early_stop_num: int = -1, temperature: float = 1.0, ): @@ -397,7 +398,7 @@ class Text2SemanticDecoder(nn.Module): if(idx==0):###第一次跑不能EOS否则没有了 logits = logits[:, :-1] ###刨除1024终止符号的概率 samples = sample( - logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature + logits[0], y, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=1.35, temperature=temperature )[0].unsqueeze(0) # 本次生成的 semantic_ids 和之前的 y 构成新的 y # print(samples.shape)#[1,1]#第一个1是bs diff --git a/GPT_SoVITS/AR/models/utils.py b/GPT_SoVITS/AR/models/utils.py index 9678c7e1..94a4cfee 100644 --- a/GPT_SoVITS/AR/models/utils.py +++ b/GPT_SoVITS/AR/models/utils.py @@ -112,6 +112,7 @@ def logits_to_probs( previous_tokens: Optional[torch.Tensor] = None, temperature: float = 1.0, top_k: Optional[int] = None, + min_p: Optional[float] = None, top_p: Optional[int] = None, repetition_penalty: float = 1.0, ): @@ -127,6 +128,12 @@ def logits_to_probs( ) 
logits.scatter_(dim=0, index=previous_tokens, src=score) + # Min-p sampling: drop tokens whose probability is below min_p * max_prob, where max_prob is the highest token probability; the threshold is computed on softmax probabilities because raw logits can be negative, in which case scaling the max logit by min_p would mask every token + if min_p is not None and 0.0 < min_p < 1.0: + probs = torch.nn.functional.softmax(logits, dim=-1) + logits = logits.masked_fill(probs < min_p * probs.max(dim=-1, keepdim=True).values, -float("Inf")) + + # Top-p sampling, we sample from the smallest set of tokens whose cumulative probability mass exceeds the threshold `p` if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cum_probs = torch.cumsum( @@ -139,8 +146,10 @@ def logits_to_probs( ) logits = logits.masked_fill(indices_to_remove, -float("Inf")) + # Use temperature to smooth the distribution logits = logits / max(temperature, 1e-5) + # Top-k sampling, we sample only from the top k most likely tokens if top_k is not None: v, _ = torch.topk(logits, min(top_k, logits.size(-1))) pivot = v.select(-1, -1).unsqueeze(-1) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4fe8045d..4a723e5b 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -311,7 +311,7 @@ def merge_short_text_in_array(texts, threshold): result[len(result) - 1] += text return result -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False): +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, min_p=0.8, top_p=1.0, temperature=0.6, ref_free = False): if prompt_text is None or len(prompt_text) == 0: ref_free = True t0 = ttime() @@ -399,6 +399,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, bert, # prompt_phone_len=ph_offset, top_k=top_k, + min_p=min_p, top_p=top_p, temperature=temperature, early_stop_num=hz * max_sec, @@ -586,6 +587,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as 
app: with gr.Row(): gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低):")) top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) + min_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("min_p"),value=0.8,interactive=True) top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) inference_button = gr.Button(i18n("合成语音"), variant="primary") @@ -593,7 +595,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: inference_button.click( get_tts_wav, - [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free], + [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, min_p, top_p, temperature, ref_text_free], [output], )