diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index c59544a..6ac02e2 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -7,6 +7,8 @@
 全部按日文识别
 '''
 import logging
+import traceback
+
 logging.getLogger("markdown_it").setLevel(logging.ERROR)
 logging.getLogger("urllib3").setLevel(logging.ERROR)
 logging.getLogger("httpcore").setLevel(logging.ERROR)
@@ -14,6 +16,7 @@ logging.getLogger("httpx").setLevel(logging.ERROR)
 logging.getLogger("asyncio").setLevel(logging.ERROR)
 logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
 logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
+logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
 import LangSegment, os, re, sys, json
 import pdb
 import torch
@@ -257,6 +260,8 @@ change_gpt_weights(gpt_path)
 def get_spepc(hps, filename):
     audio = load_audio(filename, int(hps.data.sampling_rate))
     audio = torch.FloatTensor(audio)
+    maxx=audio.abs().max()
+    if(maxx>1):audio/=min(2,maxx)
     audio_norm = audio
     audio_norm = audio_norm.unsqueeze(0)
     spec = spectrogram_torch(
@@ -387,7 +392,7 @@ def merge_short_text_in_array(texts, threshold):
 ##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature
 # cache_tokens={}#暂未实现清理机制
 cache= {}
-def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False):
+def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False,inp_refs=123):
     global cache
     if ref_wav_path:pass
     else:gr.Warning(i18n('请上传参考音频'))
@@ -498,12 +503,16 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
                 pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
                 cache[i_text]=pred_semantic
         t3 = ttime()
-        refer = get_spepc(hps, ref_wav_path)  # .to(device)
-        if is_half == True:
-            refer = refer.half().to(device)
-        else:
-            refer = refer.to(device)
-        audio = (vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer,speed=speed).detach().cpu().numpy()[0, 0])
+        refers=[]
+        if(inp_refs):
+            for path in inp_refs:
+                try:
+                    refer = get_spepc(hps, path.name).to(dtype).to(device)
+                    refers.append(refer)
+                except:
+                    traceback.print_exc()
+        if(len(refers)==0):refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
+        audio = (vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers,speed=speed).detach().cpu().numpy()[0, 0])
         max_audio=np.abs(audio).max()#简单防止16bit爆音
         if max_audio>1:audio/=max_audio
         audio_opt.append(audio)
@@ -692,13 +701,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                 prompt_language = gr.Dropdown(
                     label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"), scale=14
                 )
+                inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple",scale=13)
         gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3'))
         with gr.Row():
             with gr.Column(scale=13):
                 text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26)
             with gr.Column(scale=7):
                 text_language = gr.Dropdown(
-                    label=i18n("需要合成的语种"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1
+                    label=i18n("需要合成的语种")+i18n(".限制范围越小判别效果越好。"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1
                 )
                 how_to_cut = gr.Dropdown(
                     label=i18n("怎么切"),
@@ -707,10 +717,10 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                     interactive=True, scale=1
                 )
                 gr.Markdown(value=html_center(i18n("语速调整,高为更快")))
-                if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速。防止随机性。"), value=False, interactive=True,show_label=True, scale=1)
+                if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), value=False, interactive=True,show_label=True, scale=1)
                 speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True, scale=1)
                 gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):")))
-                top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=10,interactive=True, scale=1)
+                top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=15,interactive=True, scale=1)
                 top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True, scale=1)
                 temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True, scale=1)
             # with gr.Column():
@@ -723,7 +733,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
 
         inference_button.click(
             get_tts_wav,
-            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze],
+            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs],
             [output],
         )
         SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language])
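
The behavioral core of this patch is the multi-reference decode path in the +503 hunk: get_tts_wav now accepts the optional inp_refs file list, converts each upload to a spectrogram, skips any file that fails to load, falls back to the single mandatory reference when none survive, and hands the whole list (rather than one tensor) to vq_model.decode. A minimal standalone sketch of that logic follows; collect_refers is a hypothetical helper name, and get_spepc, hps, dtype, and device are the module-level names from inference_webui.py, passed in here only so the function is self-contained:

    import traceback

    def collect_refers(inp_refs, ref_wav_path, get_spepc, hps, dtype, device):
        """Sketch of the reference-spectrogram gathering added in this diff.

        Every optional reference upload is tried; one that fails to load is
        skipped (traceback printed) instead of aborting synthesis. If nothing
        usable remains, the single mandatory reference is used, so the old
        single-reference behavior is the fallback rather than an error path.
        """
        refers = []
        if inp_refs:
            for path in inp_refs:
                try:
                    # Gradio file objects expose their temp path via `.name`
                    refers.append(get_spepc(hps, path.name).to(dtype).to(device))
                except Exception:
                    traceback.print_exc()
        if not refers:
            refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
        return refers  # passed as a list to vq_model.decode(..., refers, ...)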