diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 7f37b77..42b7fda 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -312,7 +312,11 @@ def merge_short_text_in_array(texts, threshold): result[len(result) - 1] += text return result -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1): +##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature +# cache_tokens={}#暂未实现清理机制 +cache=None +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False): + global cache if prompt_text is None or len(prompt_text) == 0: ref_free = True t0 = ttime() @@ -395,38 +399,30 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) t2 = ttime() - with torch.no_grad(): - # pred_semantic = t2s_model.model.infer( - pred_semantic, idx = t2s_model.model.infer_panel( - all_phoneme_ids, - all_phoneme_len, - None if ref_free else prompt, - bert, - # prompt_phone_len=ph_offset, - top_k=top_k, - top_p=top_p, - temperature=temperature, - early_stop_num=hz * max_sec, - ) + # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature) + if(type(cache)!=type(None)and if_freeze==True):pred_semantic=cache + else: + with torch.no_grad(): + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + None if ref_free else prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) + pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) + cache=pred_semantic t3 = ttime() - # print(pred_semantic.shape,idx) - pred_semantic = pred_semantic[:, -idx:].unsqueeze( - 0 - ) # .unsqueeze(0)#mq要多unsqueeze一次 refer = get_spepc(hps, ref_wav_path) # .to(device) if is_half == True: refer = refer.half().to(device) else: refer = refer.to(device) - # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] - audio = ( - vq_model.decode( - pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer,speed=speed - ) - .detach() - .cpu() - .numpy()[0, 0] - ) ###试试重建不带上prompt部分 + audio = (vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer,speed=speed).detach().cpu().numpy()[0, 0]) max_audio=np.abs(audio).max()#简单防止16bit爆音 if max_audio>1:audio/=max_audio audio_opt.append(audio) @@ -611,29 +607,36 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: ) gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) with gr.Row(): - text = gr.Textbox(label=i18n("需要合成的文本"), value="") - text_language = gr.Dropdown( - label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") - ) - how_to_cut = gr.Radio( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, - ) - with gr.Row(): - gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低):")) + with gr.Column(): + text = gr.Textbox(label=i18n("需要合成的文本"), value="") + text_language = gr.Dropdown( + label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") + ) + how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], + value=i18n("凑四句一切"), + interactive=True, + ) + with gr.Column(): + gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低。不懂就用默认):")) top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=10,interactive=True) top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) - with gr.Row(): - speed = gr.Slider(minimum=0.5,maximum=2,step=0.05,label=i18n("speed"),value=1,interactive=True) + with gr.Column(): + gr.Markdown(value=i18n("语速调整,高为更快")) + if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速。防止随机性。"), value=False, interactive=True, show_label=True) + speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True) + # with gr.Column(): + # gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。")) + # phoneme=gr.Textbox(label=i18n("音素框"), value="") + # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") inference_button = gr.Button(i18n("合成语音"), variant="primary") output = gr.Audio(label=i18n("输出的语音")) inference_button.click( get_tts_wav, - [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed], + [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze], [output], )