From 7259cc95fd669c9290d6800079120f469070b5cf Mon Sep 17 00:00:00 2001 From: Wu Zichen Date: Fri, 26 Jan 2024 01:38:03 +0800 Subject: [PATCH 01/19] Update usage introduction, fix an error --- api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/api.py b/api.py index 1b5b6a08..60ed9fff 100644 --- a/api.py +++ b/api.py @@ -13,7 +13,7 @@ `-dt` - `默认参考音频文本` `-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"` -`-d` - `推理设备, "cuda","cpu"` +`-d` - `推理设备, "cuda","cpu","mps"` `-a` - `绑定地址, 默认"127.0.0.1"` `-p` - `绑定端口, 默认9880, 可在 config.py 中指定` `-fp` - `覆盖 config.py 使用全精度` @@ -139,7 +139,6 @@ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help=" parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种") parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps") -parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度") From 3029560356cec459a7c109cf08c2be5243a233ba Mon Sep 17 00:00:00 2001 From: Wu Zichen Date: Fri, 26 Jan 2024 01:40:18 +0800 Subject: [PATCH 02/19] Fix indentation issue --- GPT_SoVITS/prepare_datasets/1-get-text.py | 12 ++++++------ GPT_SoVITS/prepare_datasets/3-get-semantic.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index 9499db4a..88c9d858 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -47,12 +47,12 @@ if os.path.exists(txt_path) == False: bert_dir = "%s/3-bert" % (opt_dir) os.makedirs(opt_dir, exist_ok=True) os.makedirs(bert_dir, exist_ok=True) -if torch.cuda.is_available(): - device = "cuda:0" -elif torch.backends.mps.is_available(): - device = "mps" -else: - device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) if is_half == True: diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index a3cf0a3d..9ab56a48 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -38,12 +38,12 @@ semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) if os.path.exists(semantic_path) == False: os.makedirs(opt_dir, exist_ok=True) -if torch.cuda.is_available(): - device = "cuda" -elif torch.backends.mps.is_available(): - device = "mps" -else: - device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" hps = utils.get_hparams_from_file(s2config_path) vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, From afdce90c41f0be96f2ea33956ae310c25cf1f0e9 Mon Sep 17 00:00:00 2001 From: Lei Hao Date: Fri, 26 Jan 2024 10:34:22 +0800 Subject: [PATCH 03/19] Resolved the issue of duplicate inference in the Bert1 model --- GPT_SoVITS/inference_webui.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 
deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 246748ae..11042c07 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -185,19 +185,22 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones1 = cleaned_text_to_sequence(phones1) texts = text.split("\n") audio_opt = [] + + if prompt_language == "zh": + bert1 = get_bert_feature(norm_text1, word2ph1).to(device) + else: + bert1 = torch.zeros( + (1024, len(phones1)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + for text in texts: # 解决输入目标文本的空行导致报错的问题 if (len(text.strip()) == 0): continue phones2, word2ph2, norm_text2 = clean_text(text, text_language) phones2 = cleaned_text_to_sequence(phones2) - if prompt_language == "zh": - bert1 = get_bert_feature(norm_text1, word2ph1).to(device) - else: - bert1 = torch.zeros( - (1024, len(phones1)), - dtype=torch.float16 if is_half == True else torch.float32, - ).to(device) + if text_language == "zh": bert2 = get_bert_feature(norm_text2, word2ph2).to(device) else: From 177f3c6fc9268258d359a4320010722a368f7d3d Mon Sep 17 00:00:00 2001 From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> Date: Fri, 26 Jan 2024 10:35:13 +0800 Subject: [PATCH 04/19] =?UTF-8?q?=E4=B8=AD=E8=8B=B1/=E6=97=A5=E8=8B=B1?= =?UTF-8?q?=E6=B7=B7=E5=90=88=E6=8E=A8=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 如题,支持在中文/日语模式下夹杂英语内容 --- GPT_SoVITS/inference_webui.py | 108 ++++++++++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 13 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 7626bc41..1c5dab61 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -165,6 +165,83 @@ dict_language={ } +def splite_en_inf(sentence, language): + pattern = re.compile(r'[a-zA-Z. 
]+') + textlist = [] + langlist = [] + pos = 0 + for match in pattern.finditer(sentence): + start, end = match.span() + if start > pos: + textlist.append(sentence[pos:start]) + langlist.append(language) + textlist.append(sentence[start:end]) + langlist.append("en") + pos = end + if pos < len(sentence): + textlist.append(sentence[pos:]) + langlist.append(language) + + return textlist, langlist + + +def clean_text_inf(text, language): + phones, word2ph, norm_text = clean_text(text, language) + phones = cleaned_text_to_sequence(phones) + + return phones, word2ph, norm_text + + +def get_bert_inf(phones, word2ph, norm_text, language): + if language == "zh": + bert = get_bert_feature(norm_text, word2ph).to(device) + else: + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + + return bert + + +def nonen_clean_text_inf(text, language): + textlist, langlist = splite_en_inf(text, language) + phones_list = [] + word2ph_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(textlist[i], lang) + phones_list.append(phones) + if lang=="en" or "ja": + pass + else: + word2ph_list.append(word2ph) + norm_text_list.append(norm_text) + print(word2ph_list) + phones = sum(phones_list, []) + word2ph = sum(word2ph_list, []) + norm_text = ' '.join(norm_text_list) + + return phones, word2ph, norm_text + + +def nonen_get_bert_inf(text, language): + textlist, langlist = splite_en_inf(text, language) + print(textlist) + print(langlist) + bert_list = [] + for i in range(len(textlist)): + text = textlist[i] + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(text, lang) + bert = get_bert_inf(phones, word2ph, norm_text, lang) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + + return bert + + def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language): t0 = ttime() prompt_text = prompt_text.strip("\n") @@ -194,27 +271,32 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) t1 = ttime() prompt_language = dict_language[prompt_language] text_language = dict_language[text_language] - phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language) - phones1 = cleaned_text_to_sequence(phones1) + if prompt_language == "en": + phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language) + else: + phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language) texts = text.split("\n") audio_opt = [] for text in texts: # 解决输入目标文本的空行导致报错的问题 if (len(text.strip()) == 0): continue - phones2, word2ph2, norm_text2 = clean_text(text, text_language) - phones2 = cleaned_text_to_sequence(phones2) - if prompt_language == "zh": - bert1 = get_bert_feature(norm_text1, word2ph1).to(device) + + if text_language == "en": + phones2, word2ph2, norm_text2 = clean_text_inf(text, text_language) else: - bert1 = torch.zeros( - (1024, len(phones1)), - dtype=torch.float16 if is_half == True else torch.float32, - ).to(device) - if text_language == "zh": - bert2 = get_bert_feature(norm_text2, word2ph2).to(device) + phones2, word2ph2, norm_text2 = nonen_clean_text_inf(text, text_language) + + if prompt_language == "en": + bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language) else: - bert2 = torch.zeros((1024, len(phones2))).to(bert1) + bert1 = nonen_get_bert_inf(prompt_text, prompt_language) + + if text_language == "en": + bert2 = get_bert_inf(phones2, word2ph2, norm_text2, 
text_language) + else: + bert2 = nonen_get_bert_inf(text, text_language) + bert = torch.cat([bert1, bert2], 1) all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) From 813cf96e508ba1bb2c658f38c7cc77b797fb4082 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:49:33 +0800 Subject: [PATCH 05/19] Add files via upload --- tools/uvr5/webui.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index 71e7ebcc..f1357e5e 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -1,12 +1,11 @@ import os import traceback,gradio as gr import logging -from tools.i18n.i18n import I18nAuto +from i18n.i18n import I18nAuto i18n = I18nAuto() logger = logging.getLogger(__name__) -import librosa -import soundfile as sf +import ffmpeg import torch import sys from mdxnet import MDXNetDereverb @@ -20,8 +19,7 @@ for name in os.listdir(weight_uvr5_root): device=sys.argv[1] is_half=sys.argv[2] -webui_port_uvr5=int(sys.argv[3]) -is_share=eval(sys.argv[4]) + def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] @@ -55,17 +53,16 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format need_reformat = 1 done = 0 try: - y, sr = librosa.load(inp_path, sr=None) - info = sf.info(inp_path) - channels = info.channels - if channels == 2 and sr == 44100: + info = ffmpeg.probe(inp_path, cmd="ffprobe") + if ( + info["streams"][0]["channels"] == 2 + and info["streams"][0]["sample_rate"] == "44100" + ): need_reformat = 0 pre_fun._path_audio_( inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 ) done = 1 - else: - need_reformat = 1 except: need_reformat = 1 traceback.print_exc() @@ -74,8 +71,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format os.path.join(os.environ["TEMP"]), os.path.basename(inp_path), ) - y_resampled = librosa.resample(y, sr, 44100) - sf.write(tmp_path, y_resampled, 44100, "PCM_16") + os.system( + "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" + % (inp_path, tmp_path) + ) inp_path = tmp_path try: if done == 0: @@ -116,10 +115,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format yield "\n".join(infos) -with gr.Blocks(title="RVC WebUI") as app: +with gr.Blocks(title="UVR5 WebUI") as app: gr.Markdown( value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + "MIT license. https://github.com/Anjok07/ultimatevocalremovergui" ) with gr.Tabs(): with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): @@ -144,7 +143,7 @@ with gr.Blocks(title="RVC WebUI") as app: minimum=0, maximum=20, step=1, - label=i18n("人声提取激进程度"), + label="人声提取激进程度", value=10, interactive=True, visible=False, # 先不开放调整 @@ -180,7 +179,6 @@ with gr.Blocks(title="RVC WebUI") as app: app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, - share=is_share, - server_port=webui_port_uvr5, + server_port=9873, quiet=True, -) +) \ No newline at end of file From fa0c06b62ce89dd5662ccf7dc37f0a19f49a9c53 Mon Sep 17 00:00:00 2001 From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> Date: Fri, 26 Jan 2024 12:23:40 +0800 Subject: [PATCH 06/19] Update inference_webui.py https://github.com/RVC-Boss/GPT-SoVITS/pull/205 --- GPT_SoVITS/inference_webui.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 1c5dab61..8e826faa 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -277,6 +277,10 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language) texts = text.split("\n") audio_opt = [] + if prompt_language == "en": + bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language) + else: + bert1 = nonen_get_bert_inf(prompt_text, prompt_language) for text in texts: # 解决输入目标文本的空行导致报错的问题 if (len(text.strip()) == 0): @@ -286,11 +290,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones2, word2ph2, norm_text2 = clean_text_inf(text, text_language) else: phones2, word2ph2, norm_text2 = nonen_clean_text_inf(text, text_language) - - if prompt_language == "en": - bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language) - else: - bert1 = nonen_get_bert_inf(prompt_text, prompt_language) if text_language == "en": bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language) From 0bcdf0155c340b32d18a33aaf7a96f43b8f1e91e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Fri, 26 Jan 2024 14:09:50 +0800 Subject: [PATCH 07/19] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E8=AE=B0=E5=BF=86=E5=8A=9F=E8=83=BD=EF=BC=8C=E4=B8=8D=E7=94=A8?= =?UTF-8?q?=E4=BA=8C=E6=AC=A1=E9=80=89=E6=8B=A9=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加模型记忆功能,不用二次选择模型 --- GPT_SoVITS/inference_webui.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index bb571833..fdee8d9d 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -6,10 +6,25 @@ logging.getLogger("httpx").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR) import pdb -gpt_path = os.environ.get( - "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" -) -sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth") +if os.path.exists("./gweight.txt"): + with open("./gweight.txt", 'r',encoding="utf-8") as file: + gweight_data = file.read() + gpt_path = os.environ.get( + "gpt_path", gweight_data) +else: + gpt_path = os.environ.get( + "gpt_path", 
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt") + +if os.path.exists("./sweight.txt"): + with open("./sweight.txt", 'r',encoding="utf-8") as file: + sweight_data = file.read() + sovits_path = os.environ.get("sovits_path", sweight_data) +else: + sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth") +# gpt_path = os.environ.get( +# "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +# ) +# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth") cnhubert_base_path = os.environ.get( "cnhubert_base_path", "pretrained_models/chinese-hubert-base" ) @@ -124,6 +139,7 @@ def change_sovits_weights(sovits_path): vq_model = vq_model.to(device) vq_model.eval() print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) + with open("./sweight.txt","w",encoding="utf-8")as f:f.write(sovits_path) change_sovits_weights(sovits_path) def change_gpt_weights(gpt_path): @@ -140,6 +156,7 @@ def change_gpt_weights(gpt_path): t2s_model.eval() total = sum([param.nelement() for param in t2s_model.parameters()]) print("Number of parameter: %.2fM" % (total / 1e6)) + with open("./gweight.txt","w",encoding="utf-8")as f:f.write(gpt_path) change_gpt_weights(gpt_path) def get_spepc(hps, filename): From 8adcecd19ad2cba330a89d6d9a2744bd97f8846f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Fri, 26 Jan 2024 15:40:14 +0800 Subject: [PATCH 08/19] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=BC=95=E5=AF=BC?= =?UTF-8?q?=E9=9F=B3=E9=A2=91=E8=AE=B0=E5=BF=86=E3=80=81=E5=BC=95=E5=AF=BC?= =?UTF-8?q?=E9=9F=B3=E9=A2=91=E6=96=87=E6=9C=AC=E8=AE=B0=E5=BF=86=E3=80=81?= =?UTF-8?q?=E8=AF=AD=E7=A7=8D=E8=AE=B0=E5=BF=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加引导音频记忆、引导音频文本记忆、语种记忆 --- GPT_SoVITS/inference_webui.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index fdee8d9d..9a9dd10b 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -64,6 +64,23 @@ elif torch.backends.mps.is_available(): else: device = "cpu" +# 操作记忆功能 + +file_path = './audio_log.txt' + +upload_audio_path = None +upload_audio_text = "" +upload_audio_lanuage = "中文" + +if os.path.exists(file_path): + with open(file_path, 'r',encoding="utf-8") as file: + text_data = file.read() + text_data = text_data.split("|") + + upload_audio_path = text_data[0] + upload_audio_text = text_data[1] + upload_audio_lanuage = text_data[2] + tokenizer = AutoTokenizer.from_pretrained(bert_path) bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) if is_half == True: @@ -183,6 +200,7 @@ dict_language={ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language): + with open("./audio_log.txt","w",encoding="utf-8")as f:f.write(f"{wav_path_log}|{prompt_text}|{prompt_language}") t0 = ttime() prompt_text = prompt_text.strip("\n") prompt_language, text = prompt_language, text.strip("\n") From 16196b6f38fe1dd381440fabb434976669d47b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Fri, 26 Jan 2024 15:42:49 +0800 Subject: [PATCH 09/19] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=BC=95=E5=AF=BC?= =?UTF-8?q?=E9=9F=B3=E9=A2=91=E8=AE=B0=E5=BF=86=E3=80=81=E5=BC=95=E5=AF=BC?= =?UTF-8?q?=E9=9F=B3=E9=A2=91=E6=96=87=E6=9C=AC=E8=AE=B0=E5=BF=86=E3=80=81?= =?UTF-8?q?=E8=AF=AD=E7=A7=8D=E8=AE=B0=E5=BF=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
增加引导音频记忆、引导音频文本记忆、语种记忆 --- GPT_SoVITS/inference_webui.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 9a9dd10b..3b10fa0f 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -418,10 +418,10 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[]) gr.Markdown(value=i18n("*请上传并填写参考信息")) with gr.Row(): - inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath") - prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="") + inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath",value=upload_audio_path) + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value=upload_audio_text) prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文") + label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n(upload_audio_lanuage) ) gr.Markdown(value=i18n("*请填写需要合成的目标文本")) with gr.Row(): From 2d1ddeca42db90c3fe2d0cd79480fd544d87f02b Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 16:34:11 +0800 Subject: [PATCH 10/19] Add files via upload --- tools/uvr5/webui.py | 94 ++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index f1357e5e..97170bf0 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -1,11 +1,12 @@ import os import traceback,gradio as gr import logging -from i18n.i18n import I18nAuto +from tools.i18n.i18n import I18nAuto i18n = I18nAuto() logger = logging.getLogger(__name__) -import ffmpeg +import librosa +import soundfile as sf import torch import sys from mdxnet import MDXNetDereverb @@ -19,7 +20,8 @@ for name in os.listdir(weight_uvr5_root): device=sys.argv[1] is_half=sys.argv[2] - +webui_port_uvr5=int(sys.argv[3]) +is_share=eval(sys.argv[4]) def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] @@ -50,40 +52,32 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format paths = [path.name for path in paths] for path in paths: inp_path = os.path.join(inp_root, path) - need_reformat = 1 - done = 0 + if(os.path.isfile(inp_path)==False):continue try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 + done = 0 + try: + y, sr = librosa.load(inp_path, sr=None) + info = sf.info(inp_path) + channels = info.channels + if channels == 2 and sr == 44100: + need_reformat = 0 + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 + ) + done = 1 + else: + need_reformat = 1 + except: + need_reformat = 1 + traceback.print_exc() + if need_reformat == 1: + tmp_path = "%s/%s.reformatted.wav" % ( + os.path.join(os.environ["TEMP"]), + os.path.basename(inp_path), ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % ( - os.path.join(os.environ["TEMP"]), - os.path.basename(inp_path), - ) - os.system( - "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" - % (inp_path, tmp_path) - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - 
infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: + y_resampled = librosa.resample(y, sr, 44100) + sf.write(tmp_path, y_resampled, 44100, "PCM_16") + inp_path = tmp_path try: if done == 0: pre_fun._path_audio_( @@ -92,10 +86,21 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format infos.append("%s->Success" % (os.path.basename(inp_path))) yield "\n".join(infos) except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) + try: + if done == 0: + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + yield "\n".join(infos) + except: + infos.append( + "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) + ) + yield "\n".join(infos) + except: + infos.append("Oh my god. %s->%s"%(os.path.basename(inp_path), traceback.format_exc())) + yield "\n".join(infos) except: infos.append(traceback.format_exc()) yield "\n".join(infos) @@ -115,10 +120,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format yield "\n".join(infos) -with gr.Blocks(title="UVR5 WebUI") as app: +with gr.Blocks(title="RVC WebUI") as app: gr.Markdown( value= - "MIT license. https://github.com/Anjok07/ultimatevocalremovergui" + i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Tabs(): with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): @@ -143,7 +148,7 @@ with gr.Blocks(title="UVR5 WebUI") as app: minimum=0, maximum=20, step=1, - label="人声提取激进程度", + label=i18n("人声提取激进程度"), value=10, interactive=True, visible=False, # 先不开放调整 @@ -179,6 +184,7 @@ with gr.Blocks(title="UVR5 WebUI") as app: app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, - server_port=9873, + share=is_share, + server_port=webui_port_uvr5, quiet=True, -) \ No newline at end of file +) From 725471b0bfe527153b2712ac80dc617a06049d50 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 17:54:21 +0800 Subject: [PATCH 11/19] =?UTF-8?q?Revert=20"=E5=BC=95=E5=AF=BC=E9=9F=B3?= =?UTF-8?q?=E9=A2=91=E8=AE=B0=E5=BF=86=EF=BC=8C=E6=96=87=E6=9C=AC=E8=AE=B0?= =?UTF-8?q?=E5=BF=86=E5=92=8C=E8=AF=AD=E7=A7=8D=E8=AE=B0=E5=BF=86"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/inference_webui.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 1ac11c33..574b9d8d 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -67,23 +67,6 @@ elif torch.backends.mps.is_available(): else: device = "cpu" -# 操作记忆功能 - -file_path = './audio_log.txt' - -upload_audio_path = None -upload_audio_text = "" -upload_audio_lanuage = "中文" - -if os.path.exists(file_path): - with open(file_path, 'r',encoding="utf-8") as file: - text_data = file.read() - text_data = text_data.split("|") - - upload_audio_path = text_data[0] - upload_audio_text = text_data[1] - upload_audio_lanuage = text_data[2] - tokenizer = AutoTokenizer.from_pretrained(bert_path) bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) if is_half == True: @@ -280,7 +263,6 @@ def nonen_get_bert_inf(text, language): def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language): - with open("./audio_log.txt","w",encoding="utf-8")as f:f.write(f"{wav_path_log}|{prompt_text}|{prompt_language}") t0 = ttime() prompt_text = prompt_text.strip("\n") prompt_language, text = prompt_language, text.strip("\n") @@ -500,10 +482,10 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[]) gr.Markdown(value=i18n("*请上传并填写参考信息")) with gr.Row(): - inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath",value=upload_audio_path) - prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value=upload_audio_text) + inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath") + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="") prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n(upload_audio_lanuage) + label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文") ) gr.Markdown(value=i18n("*请填写需要合成的目标文本")) with gr.Row(): From 9fe955c1bf5f94546c9f699141281f2661c8a180 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:01:15 +0800 Subject: [PATCH 12/19] Add files via upload --- GPT_SoVITS/inference_webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 574b9d8d..51e18a0e 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -296,7 +296,7 @@ def 
get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language) else: phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language) - texts = text.split("\n") + texts = text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n").split("\n") audio_opt = [] if prompt_language == "en": bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language) From 84ee471936b332bc2ccee024d6dfdedab4f0dc7b Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:11:09 +0800 Subject: [PATCH 13/19] Add files via upload --- config.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/config.py b/config.py index 897f53c1..a95daf92 100644 --- a/config.py +++ b/config.py @@ -29,6 +29,19 @@ webui_port_subfix = 9871 api_port = 9880 +gpu_name = torch.cuda.get_device_name(0) +if ( + ("16" in gpu_name and "V100" not in gpu_name.upper()) + or "P40" in gpu_name.upper() + or "P10" in gpu_name.upper() + or "1060" in gpu_name + or "1070" in gpu_name + or "1080" in gpu_name +): + is_half=False + +if(is_half==False and infer_device=="cuda"):infer_device="cpu" + class Config: def __init__(self): From f4148cf77fb899c22bcdd4e773d2f24ab34a73e7 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:48:30 +0800 Subject: [PATCH 14/19] =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=90=88=E6=88=90?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E4=B8=AD=E8=8B=B1=E6=B7=B7=E5=90=88=E3=80=81?= =?UTF-8?q?=E6=97=A5=E8=8B=B1=E6=B7=B7=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/inference_webui.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 51e18a0e..ad36b29d 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -135,7 +135,8 @@ def change_sovits_weights(sovits_path): n_speakers=hps.data.n_speakers, **hps.model ) - del vq_model.enc_q + if("pretrained"not in sovits_path): + del vq_model.enc_q if is_half == True: vq_model = vq_model.half().to(device) else: @@ -261,8 +262,8 @@ def nonen_get_bert_inf(text, language): return bert - -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language): +#i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切") +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,how_to_cut=i18n("不切")): t0 = ttime() prompt_text = prompt_text.strip("\n") prompt_language, text = prompt_language, text.strip("\n") @@ -296,7 +297,13 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language) else: phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language) - texts = text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n").split("\n") + if(how_to_cut==i18n("凑五句一切")):text=cut1(text) + elif(how_to_cut==i18n("凑50字一切")):text=cut2(text) + elif(how_to_cut==i18n("按中文句号。切")):text=cut3(text) + elif(how_to_cut==i18n("按英文句号.切")):text=cut4(text) + text = text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n") + if(text[-1]not in splits):text+="。"if text_language=="zh"else "." 
+ texts=text.split("\n") audio_opt = [] if prompt_language == "en": bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language) @@ -439,6 +446,9 @@ def cut2(inp): def cut3(inp): inp = inp.strip("\n") return "\n".join(["%s。" % item for item in inp.strip("。").split("。")]) +def cut4(inp): + inp = inp.strip("\n") + return "\n".join(["%s." % item for item in inp.strip(".").split(".")]) def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 @@ -487,17 +497,24 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: prompt_language = gr.Dropdown( label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文") ) - gr.Markdown(value=i18n("*请填写需要合成的目标文本")) + gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。")) with gr.Row(): text = gr.Textbox(label=i18n("需要合成的文本"), value="") text_language = gr.Dropdown( label=i18n("需要合成的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文") ) + how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切"),], + value=i18n("凑50字一切"), + interactive=True, + ) inference_button = gr.Button(i18n("合成语音"), variant="primary") output = gr.Audio(label=i18n("输出的语音")) + inference_button.click( get_tts_wav, - [inp_ref, prompt_text, prompt_language, text, text_language], + [inp_ref, prompt_text, prompt_language, text, text_language,how_to_cut], [output], ) @@ -507,10 +524,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: button1 = gr.Button(i18n("凑五句一切"), variant="primary") button2 = gr.Button(i18n("凑50字一切"), variant="primary") button3 = gr.Button(i18n("按中文句号。切"), variant="primary") + button4 = gr.Button(i18n("按英文句号.切"), variant="primary") text_opt = gr.Textbox(label=i18n("切分后文本"), value="") button1.click(cut1, [text_inp], [text_opt]) button2.click(cut2, [text_inp], [text_opt]) button3.click(cut3, [text_inp], [text_opt]) + button4.click(cut4, [text_inp], [text_opt]) gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。")) app.queue(concurrency_count=511, max_size=1022).launch( From b24893f4d16789cad23337942727651155d15181 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:54:13 +0800 Subject: [PATCH 15/19] Update inference_webui.py --- GPT_SoVITS/inference_webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index ad36b29d..aad39920 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -302,7 +302,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, elif(how_to_cut==i18n("按中文句号。切")):text=cut3(text) elif(how_to_cut==i18n("按英文句号.切")):text=cut4(text) text = text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n") - if(text[-1]not in splits):text+="。"if text_language=="zh"else "." + if(text[-1]not in splits):text+="。"if text_language!="en"else "." 
texts=text.split("\n") audio_opt = [] if prompt_language == "en": From 8651717a511721db90b764804d0701aea62b6ba7 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:58:23 +0800 Subject: [PATCH 16/19] Update Changelog_CN.md --- docs/cn/Changelog_CN.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 93fc8be7..beaba424 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -29,3 +29,22 @@ 3-优化模型文件排序逻辑 4-中文分词使用jieba_fast代替jieba + +### 20240126更新 + +1-支持输出文本中英混合、日英混合 + +2-输出可选切分模式 + +3-修复uvr5读取到目录自动跳出的问题 + +4-修复多个换行导致推理报错 + +5-去除推理界面大量冗余log + +6-支持mac训练推理 + +7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。 + + + From 6f1b4fd756996f2f3bc9543389f932575a5922cf Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:59:34 +0800 Subject: [PATCH 17/19] Update config.py --- config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config.py b/config.py index a95daf92..97369e70 100644 --- a/config.py +++ b/config.py @@ -40,8 +40,7 @@ if ( ): is_half=False -if(is_half==False and infer_device=="cuda"):infer_device="cpu" - +if(infer_device=="cpu"):is_half=False class Config: def __init__(self): From 5c6ba3ebcbd4c34a1ea5b96f130ea8a5ae60d818 Mon Sep 17 00:00:00 2001 From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:43:35 +0800 Subject: [PATCH 18/19] Update not neuraL tone words list. --- GPT_SoVITS/text/tone_sandhi.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index eafb179e..68f497db 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -455,6 +455,35 @@ class ToneSandhi: "电子", "人人", "虎虎", + '幺幺', + '干嘛', + '学子', + '哈哈', + '数数', + '袅袅', + '局地', + '以下', + '娃哈哈', + '花花草草', + '留得', + '耕地', + '想想', + '熙熙', + '攘攘', + '卵子', + '死死', + '冉冉', + '恳恳', + '佼佼', + '吵吵', + '打打', + '考考', + '整整', + '莘莘', + '落地', + '算子', + '家家户户', + '青青', } self.punc = ":,;。?!“”‘’':,;.?!" From 9b6f359d05bc10d931a0250609b545a12726a30b Mon Sep 17 00:00:00 2001 From: KakaruHayate <97896816+KakaruHayate@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:48:52 +0800 Subject: [PATCH 19/19] fix " ' " --- GPT_SoVITS/text/tone_sandhi.py | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index 68f497db..9f62abe9 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -455,35 +455,35 @@ class ToneSandhi: "电子", "人人", "虎虎", - '幺幺', - '干嘛', - '学子', - '哈哈', - '数数', - '袅袅', - '局地', - '以下', - '娃哈哈', - '花花草草', - '留得', - '耕地', - '想想', - '熙熙', - '攘攘', - '卵子', - '死死', - '冉冉', - '恳恳', - '佼佼', - '吵吵', - '打打', - '考考', - '整整', - '莘莘', - '落地', - '算子', - '家家户户', - '青青', + "幺幺", + "干嘛", + "学子", + "哈哈", + "数数", + "袅袅", + "局地", + "以下", + "娃哈哈", + "花花草草", + "留得", + "耕地", + "想想", + "熙熙", + "攘攘", + "卵子", + "死死", + "冉冉", + "恳恳", + "佼佼", + "吵吵", + "打打", + "考考", + "整整", + "莘莘", + "落地", + "算子", + "家家户户", + "青青", } self.punc = ":,;。?!“”‘’':,;.?!"
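
Editor's note: for readers who want to see the mixed-language handling introduced by PATCH 04/19 and wired into synthesis by PATCH 14/19 in isolation, below is a minimal, self-contained sketch of the segmentation step. The patch's actual helper is `splite_en_inf` in `GPT_SoVITS/inference_webui.py`; the function name here is illustrative, not the repository API. In the patched `get_tts_wav`, each returned segment is then cleaned per-language and BERT-encoded separately, and the per-segment features are concatenated along the time axis (`torch.cat(bert_list, dim=1)`).

```python
import re

def split_english_segments(sentence: str, language: str):
    """Split a sentence into alternating base-language / English runs.

    Standalone sketch of the splite_en_inf helper from PATCH 04: runs of
    ASCII letters (plus '.' and spaces) are tagged "en"; every other run
    keeps the caller-supplied language tag ("zh" or "ja").
    """
    pattern = re.compile(r"[a-zA-Z. ]+")
    textlist, langlist = [], []
    pos = 0
    for match in pattern.finditer(sentence):
        start, end = match.span()
        if start > pos:
            # text before this English run stays in the base language
            textlist.append(sentence[pos:start])
            langlist.append(language)
        textlist.append(sentence[start:end])
        langlist.append("en")
        pos = end
    if pos < len(sentence):
        # trailing non-English text
        textlist.append(sentence[pos:])
        langlist.append(language)
    return textlist, langlist


if __name__ == "__main__":
    print(split_english_segments("这个 bug 已经修复了。", "zh"))
    # (['这个', ' bug ', '已经修复了。'], ['zh', 'en', 'zh'])
```

Note that the character class covers only letters, '.', and spaces, so a token such as "GPT-SoVITS" is split into two English runs with the hyphen falling back to the base-language tag; digits are likewise treated as base-language text by this regex.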