diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 6a7a8f7e..610efed7 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -4,12 +4,30 @@
 logging.getLogger("urllib3").setLevel(logging.ERROR)
 logging.getLogger("httpcore").setLevel(logging.ERROR)
 logging.getLogger("httpx").setLevel(logging.ERROR)
 logging.getLogger("asyncio").setLevel(logging.ERROR)
+
+logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
+logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 import pdb
-os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
-gpt_path = os.environ.get(
-    "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-)
-sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
+
+if os.path.exists("./gweight.txt"):
+    with open("./gweight.txt", "r", encoding="utf-8") as file:
+        gweight_data = file.read()
+    gpt_path = os.environ.get(
+        "gpt_path", gweight_data)
+else:
+    gpt_path = os.environ.get(
+        "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
+
+if os.path.exists("./sweight.txt"):
+    with open("./sweight.txt", "r", encoding="utf-8") as file:
+        sweight_data = file.read()
+    sovits_path = os.environ.get("sovits_path", sweight_data)
+else:
+    sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
+# gpt_path = os.environ.get(
+#     "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+# )
+# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
 cnhubert_base_path = os.environ.get(
     "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base"
 )
@@ -62,7 +80,7 @@ def get_bert_feature(text, word2ph):
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt")
         for i in inputs:
-            inputs[i] = inputs[i].to(device)  #####输入是long不用管精度问题,精度随bert_model
+            inputs[i] = inputs[i].to(device)
         res = bert_model(**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
     assert len(word2ph) == len(text)
@@ -119,13 +137,15 @@ def change_sovits_weights(sovits_path):
         n_speakers=hps.data.n_speakers,
         **hps.model
     )
-    del vq_model.enc_q
+    if "pretrained" not in sovits_path:
+        del vq_model.enc_q
     if is_half == True:
         vq_model = vq_model.half().to(device)
     else:
         vq_model = vq_model.to(device)
     vq_model.eval()
     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
+    with open("./sweight.txt", "w", encoding="utf-8") as f: f.write(sovits_path)
 change_sovits_weights(sovits_path)

 def change_gpt_weights(gpt_path):
@@ -142,6 +162,7 @@ def change_gpt_weights(gpt_path):
     t2s_model.eval()
     total = sum([param.nelement() for param in t2s_model.parameters()])
     print("Number of parameter: %.2fM" % (total / 1e6))
+    with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path)
 change_gpt_weights(gpt_path)

 def get_spepc(hps, filename):
@@ -167,7 +188,84 @@ dict_language={
 }


-def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
+def splite_en_inf(sentence, language):
+    pattern = re.compile(r'[a-zA-Z. ]+')
+    textlist = []
+    langlist = []
+    pos = 0
+    for match in pattern.finditer(sentence):
+        start, end = match.span()
+        if start > pos:
+            textlist.append(sentence[pos:start])
+            langlist.append(language)
+        textlist.append(sentence[start:end])
+        langlist.append("en")
+        pos = end
+    if pos < len(sentence):
+        textlist.append(sentence[pos:])
+        langlist.append(language)
+
+    return textlist, langlist
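Note on `splite_en_inf` (added above): it tags runs of ASCII letters, periods, and spaces as "en" and gives everything between them the caller-supplied language, which is what lets a Chinese or Japanese sentence carry embedded English. A minimal standalone sketch of that behaviour, with an invented sample sentence:

```python
import re

def split_en(sentence, language):
    # Runs of ASCII letters / '.' / spaces are tagged "en"; everything
    # in between keeps the caller's language tag (mirrors splite_en_inf).
    pattern = re.compile(r"[a-zA-Z. ]+")
    textlist, langlist, pos = [], [], 0
    for match in pattern.finditer(sentence):
        start, end = match.span()
        if start > pos:
            textlist.append(sentence[pos:start])
            langlist.append(language)
        textlist.append(sentence[start:end])
        langlist.append("en")
        pos = end
    if pos < len(sentence):
        textlist.append(sentence[pos:])
        langlist.append(language)
    return textlist, langlist

print(split_en("这是一个 test 句子", "zh"))
# (['这是一个', ' test ', '句子'], ['zh', 'en', 'zh'])
```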
+
+
+def clean_text_inf(text, language):
+    phones, word2ph, norm_text = clean_text(text, language)
+    phones = cleaned_text_to_sequence(phones)
+
+    return phones, word2ph, norm_text
+
+
+def get_bert_inf(phones, word2ph, norm_text, language):
+    if language == "zh":
+        bert = get_bert_feature(norm_text, word2ph).to(device)
+    else:
+        bert = torch.zeros(
+            (1024, len(phones)),
+            dtype=torch.float16 if is_half == True else torch.float32,
+        ).to(device)
+
+    return bert
+
+
+def nonen_clean_text_inf(text, language):
+    textlist, langlist = splite_en_inf(text, language)
+    phones_list = []
+    word2ph_list = []
+    norm_text_list = []
+    for i in range(len(textlist)):
+        lang = langlist[i]
+        phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+        phones_list.append(phones)
+        if lang == "en" or lang == "ja":
+            pass
+        else:
+            word2ph_list.append(word2ph)
+        norm_text_list.append(norm_text)
+    print(word2ph_list)
+    phones = sum(phones_list, [])
+    word2ph = sum(word2ph_list, [])
+    norm_text = ' '.join(norm_text_list)
+
+    return phones, word2ph, norm_text
+
+
+def nonen_get_bert_inf(text, language):
+    textlist, langlist = splite_en_inf(text, language)
+    print(textlist)
+    print(langlist)
+    bert_list = []
+    for i in range(len(textlist)):
+        text = textlist[i]
+        lang = langlist[i]
+        phones, word2ph, norm_text = clean_text_inf(text, lang)
+        bert = get_bert_inf(phones, word2ph, norm_text, lang)
+        bert_list.append(bert)
+    bert = torch.cat(bert_list, dim=1)
+
+    return bert
+
+# i18n("不切"), i18n("凑五句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切")
+def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切")):
     t0 = ttime()
     prompt_text = prompt_text.strip("\n")
     prompt_language, text = prompt_language, text.strip("\n")
@@ -196,27 +294,38 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
     t1 = ttime()
     prompt_language = dict_language[prompt_language]
     text_language = dict_language[text_language]
-    phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
-    phones1 = cleaned_text_to_sequence(phones1)
-    texts = text.split("\n")
+
+    if prompt_language == "en":
+        phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
+    else:
+        phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language)
+    if how_to_cut == i18n("凑五句一切"): text = cut1(text)
+    elif how_to_cut == i18n("凑50字一切"): text = cut2(text)
+    elif how_to_cut == i18n("按中文句号。切"): text = cut3(text)
+    elif how_to_cut == i18n("按英文句号.切"): text = cut4(text)
+    text = text.replace("\n\n", "\n").replace("\n\n", "\n").replace("\n\n", "\n")
+    if text[-1] not in splits: text += "。" if text_language != "en" else "."
+    texts = text.split("\n")
     audio_opt = []
+    if prompt_language == "en":
+        bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
+    else:
+        bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
+
     for text in texts:
         # 解决输入目标文本的空行导致报错的问题
         if (len(text.strip()) == 0):
             continue
-        phones2, word2ph2, norm_text2 = clean_text(text, text_language)
-        phones2 = cleaned_text_to_sequence(phones2)
-        if prompt_language == "zh":
-            bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
+        if text_language == "en":
+            phones2, word2ph2, norm_text2 = clean_text_inf(text, text_language)
         else:
-            bert1 = torch.zeros(
-                (1024, len(phones1)),
-                dtype=torch.float16 if is_half == True else torch.float32,
-            ).to(device)
-        if text_language == "zh":
-            bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
+            phones2, word2ph2, norm_text2 = nonen_clean_text_inf(text, text_language)
+
+        if text_language == "en":
+            bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
         else:
-            bert2 = torch.zeros((1024, len(phones2))).to(bert1)
+            bert2 = nonen_get_bert_inf(text, text_language)
+
         bert = torch.cat([bert1, bert2], 1)

         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
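Note on the mixed-language BERT path: `nonen_get_bert_inf` builds a `(1024, n_phones)` feature block per segment ("zh" segments from the BERT model, anything else zero-filled via `get_bert_inf`) and concatenates along dim 1, so the result always lines up with the flattened phoneme sequence. A shape-only sketch with invented segment sizes:

```python
import torch

# (language, phoneme count) per segment -- invented example data
segments = [("zh", 7), ("en", 4), ("zh", 5)]

bert_list = []
for lang, n_phones in segments:
    if lang == "zh":
        feat = torch.randn(1024, n_phones)  # stand-in for get_bert_feature(...)
    else:
        feat = torch.zeros(1024, n_phones)  # non-zh segments contribute zeros
    bert_list.append(feat)

bert = torch.cat(bert_list, dim=1)
assert bert.shape == (1024, 16)  # 7 + 4 + 5 phonemes
```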
@@ -339,6 +448,9 @@ def cut2(inp):
 def cut3(inp):
     inp = inp.strip("\n")
     return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
+def cut4(inp):
+    inp = inp.strip("\n")
+    return "\n".join(["%s." % item for item in inp.strip(".").split(".")])

 def custom_sort_key(s):
     # 使用正则表达式提取字符串中的数字部分和非数字部分
@@ -445,17 +557,24 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
             prompt_language = gr.Dropdown(
                 label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文")], value=i18n("中文")
             )
-        gr.Markdown(value=i18n("*请填写需要合成的目标文本"))
+        gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
         with gr.Row():
             text = gr.Textbox(label=i18n("需要合成的文本"), value="")
             text_language = gr.Dropdown(
                 label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文")], value=i18n("中文")
             )
+            how_to_cut = gr.Radio(
+                label=i18n("怎么切"),
+                choices=[i18n("不切"), i18n("凑五句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切")],
+                value=i18n("凑50字一切"),
+                interactive=True,
+            )
             inference_button = gr.Button(i18n("合成语音"), variant="primary")
             output = gr.Audio(label=i18n("输出的语音"))
+
         inference_button.click(
             get_tts_wav,
-            [inp_ref, prompt_text, prompt_language, text, text_language],
+            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut],
             [output],
         )
         history_audio = []
@@ -485,10 +604,12 @@
             button1 = gr.Button(i18n("凑五句一切"), variant="primary")
             button2 = gr.Button(i18n("凑50字一切"), variant="primary")
             button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
+            button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
             text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
         button1.click(cut1, [text_inp], [text_opt])
         button2.click(cut2, [text_inp], [text_opt])
         button3.click(cut3, [text_inp], [text_opt])
+        button4.click(cut4, [text_inp], [text_opt])
     gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。"))

 app.queue(concurrency_count=511, max_size=1022).launch(
diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py
index 9499db4a..88c9d858 100644
--- a/GPT_SoVITS/prepare_datasets/1-get-text.py
+++ b/GPT_SoVITS/prepare_datasets/1-get-text.py
@@ -47,12 +47,12 @@ if os.path.exists(txt_path) == False:
     bert_dir = "%s/3-bert" % (opt_dir)
     os.makedirs(opt_dir, exist_ok=True)
     os.makedirs(bert_dir, exist_ok=True)
-if torch.cuda.is_available():
-    device = "cuda:0"
-elif torch.backends.mps.is_available():
-    device = "mps"
-else:
-    device = "cpu"
+    if torch.cuda.is_available():
+        device = "cuda:0"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
     tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
     bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
     if is_half == True:
diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py
index a3cf0a3d..9ab56a48 100644
--- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py
+++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py
@@ -38,12 +38,12 @@ semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
 if os.path.exists(semantic_path) == False:
     os.makedirs(opt_dir, exist_ok=True)

-if torch.cuda.is_available():
-    device = "cuda"
-elif torch.backends.mps.is_available():
-    device = "mps"
-else:
-    device = "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
 hps = utils.get_hparams_from_file(s2config_path)
 vq_model = SynthesizerTrn(
     hps.data.filter_length // 2 + 1,
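Note on the two dataset-script hunks above: both now pick the device inside the `if os.path.exists(...) == False:` block, with the same CUDA, then Apple MPS, then CPU fallback that enables Mac training. A minimal standalone version of that fallback:

```python
import torch

def pick_device() -> str:
    # Prefer CUDA, fall back to Apple-silicon MPS, else CPU,
    # mirroring the device selection in both dataset scripts.
    if torch.cuda.is_available():
        return "cuda:0"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = pick_device()
```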
"cuda:0" -elif torch.backends.mps.is_available(): - device = "mps" -else: - device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) if is_half == True: diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index a3cf0a3d..9ab56a48 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -38,12 +38,12 @@ semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) if os.path.exists(semantic_path) == False: os.makedirs(opt_dir, exist_ok=True) -if torch.cuda.is_available(): - device = "cuda" -elif torch.backends.mps.is_available(): - device = "mps" -else: - device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" hps = utils.get_hparams_from_file(s2config_path) vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index eafb179e..9f62abe9 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -455,6 +455,35 @@ class ToneSandhi: "电子", "人人", "虎虎", + "幺幺", + "干嘛", + "学子", + "哈哈", + "数数", + "袅袅", + "局地", + "以下", + "娃哈哈", + "花花草草", + "留得", + "耕地", + "想想", + "熙熙", + "攘攘", + "卵子", + "死死", + "冉冉", + "恳恳", + "佼佼", + "吵吵", + "打打", + "考考", + "整整", + "莘莘", + "落地", + "算子", + "家家户户", + "青青", } self.punc = ":,;。?!“”‘’':,;.?!" diff --git a/api.py b/api.py index 1b5b6a08..60ed9fff 100644 --- a/api.py +++ b/api.py @@ -13,7 +13,7 @@ `-dt` - `默认参考音频文本` `-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"` -`-d` - `推理设备, "cuda","cpu"` +`-d` - `推理设备, "cuda","cpu","mps"` `-a` - `绑定地址, 默认"127.0.0.1"` `-p` - `绑定端口, 默认9880, 可在 config.py 中指定` `-fp` - `覆盖 config.py 使用全精度` @@ -139,7 +139,6 @@ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help=" parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种") parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps") -parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度") diff --git a/config.py b/config.py index 897f53c1..97369e70 100644 --- a/config.py +++ b/config.py @@ -29,6 +29,18 @@ webui_port_subfix = 9871 api_port = 9880 +gpu_name = torch.cuda.get_device_name(0) +if ( + ("16" in gpu_name and "V100" not in gpu_name.upper()) + or "P40" in gpu_name.upper() + or "P10" in gpu_name.upper() + or "1060" in gpu_name + or "1070" in gpu_name + or "1080" in gpu_name +): + is_half=False + +if(infer_device=="cpu"):is_half=False class Config: def __init__(self): diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 93fc8be7..beaba424 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -29,3 +29,22 @@ 3-优化模型文件排序逻辑 4-中文分词使用jieba_fast代替jieba + +### 20240126更新 + +1-支持输出文本中英混合、日英混合 + +2-输出可选切分模式 + +3-修复uvr5读取到目录自动跳出的问题 + +4-修复多个换行导致推理报错 + +5-去除推理界面大量冗余log + +6-支持mac训练推理 + 
diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md
index 93fc8be7..beaba424 100644
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@@ -29,3 +29,22 @@
 3-优化模型文件排序逻辑

 4-中文分词使用jieba_fast代替jieba
+
+### 20240126更新
+
+1-支持输出文本中英混合、日英混合
+
+2-输出可选切分模式
+
+3-修复uvr5读取到目录自动跳出的问题
+
+4-修复多个换行导致推理报错
+
+5-去除推理界面大量冗余log
+
+6-支持mac训练推理
+
+7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。
+
+
+
diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py
index 71e7ebcc..97170bf0 100644
--- a/tools/uvr5/webui.py
+++ b/tools/uvr5/webui.py
@@ -52,39 +52,32 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
         paths = [path.name for path in paths]
         for path in paths:
             inp_path = os.path.join(inp_root, path)
-            need_reformat = 1
-            done = 0
+            if os.path.isfile(inp_path) == False: continue
             try:
-                y, sr = librosa.load(inp_path, sr=None)
-                info = sf.info(inp_path)
-                channels = info.channels
-                if channels == 2 and sr == 44100:
-                    need_reformat = 0
-                    pre_fun._path_audio_(
-                        inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
-                    )
-                    done = 1
-                else:
+                done = 0
+                try:
+                    y, sr = librosa.load(inp_path, sr=None)
+                    info = sf.info(inp_path)
+                    channels = info.channels
+                    if channels == 2 and sr == 44100:
+                        need_reformat = 0
+                        pre_fun._path_audio_(
+                            inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
+                        )
+                        done = 1
+                    else:
+                        need_reformat = 1
+                except:
                     need_reformat = 1
-            except:
-                need_reformat = 1
-                traceback.print_exc()
-            if need_reformat == 1:
-                tmp_path = "%s/%s.reformatted.wav" % (
-                    os.path.join(os.environ["TEMP"]),
-                    os.path.basename(inp_path),
-                )
-                y_resampled = librosa.resample(y, sr, 44100)
-                sf.write(tmp_path, y_resampled, 44100, "PCM_16")
-                inp_path = tmp_path
-                try:
-                    if done == 0:
-                        pre_fun._path_audio_(
-                            inp_path, save_root_ins, save_root_vocal, format0
+                    traceback.print_exc()
+                if need_reformat == 1:
+                    tmp_path = "%s/%s.reformatted.wav" % (
+                        os.path.join(os.environ["TEMP"]),
+                        os.path.basename(inp_path),
                     )
+                    y_resampled = librosa.resample(y, sr, 44100)
+                    sf.write(tmp_path, y_resampled, 44100, "PCM_16")
+                    inp_path = tmp_path
                 try:
                     if done == 0:
                         pre_fun._path_audio_(
@@ -93,10 +86,21 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
                     infos.append("%s->Success" % (os.path.basename(inp_path)))
                     yield "\n".join(infos)
                 except:
-                    infos.append(
-                        "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
-                    )
-                    yield "\n".join(infos)
+                    try:
+                        if done == 0:
+                            pre_fun._path_audio_(
+                                inp_path, save_root_ins, save_root_vocal, format0
+                            )
+                        infos.append("%s->Success" % (os.path.basename(inp_path)))
+                        yield "\n".join(infos)
+                    except:
+                        infos.append(
+                            "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
+                        )
+                        yield "\n".join(infos)
+            except:
+                infos.append("Oh my god. %s->%s" % (os.path.basename(inp_path), traceback.format_exc()))
+                yield "\n".join(infos)
     except:
         infos.append(traceback.format_exc())
         yield "\n".join(infos)
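Note on the uvr5 hunk: the rewritten loop now skips non-files, and anything that is not already 44.1 kHz stereo is rewritten to a temporary 16-bit PCM wav before separation instead of aborting the batch. A sketch of just that reformat step (paths invented; the patch loads with librosa's default mono downmix, while this sketch also keeps or duplicates channels to stereo, and it uses the `orig_sr`/`target_sr` keywords because librosa >= 0.10 no longer accepts them positionally, unlike the positional call in the patch):

```python
import numpy as np
import librosa
import soundfile as sf

inp_path = "input.flac"  # hypothetical input
y, sr = librosa.load(inp_path, sr=None, mono=False)
if y.ndim == 1:
    y = np.stack([y, y])  # duplicate mono to stereo
if sr != 44100:
    y = librosa.resample(y, orig_sr=sr, target_sr=44100)
sf.write("input.reformatted.wav", y.T, 44100, subtype="PCM_16")
```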