From b1a60801e6ebde7695f78758cc8614e0c3d68a3e Mon Sep 17 00:00:00 2001 From: Ftps Date: Wed, 17 Jan 2024 00:57:07 +0900 Subject: [PATCH 01/58] remove full-width replace init to gitignore --- tools/damo_asr/models/.gitignore | 2 ++ tools/damo_asr/models/init | 1 - tools/init | 1 - tools/uvr5/init | 1 - tools/uvr5/lib/utils.py | 2 +- 5 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 tools/damo_asr/models/.gitignore delete mode 100644 tools/damo_asr/models/init delete mode 100644 tools/init delete mode 100644 tools/uvr5/init diff --git a/tools/damo_asr/models/.gitignore b/tools/damo_asr/models/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/tools/damo_asr/models/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/tools/damo_asr/models/init b/tools/damo_asr/models/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/damo_asr/models/init +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tools/init b/tools/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/init +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tools/uvr5/init b/tools/uvr5/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/uvr5/init +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tools/uvr5/lib/utils.py b/tools/uvr5/lib/utils.py index 946eb0c..5e8cd22 100644 --- a/tools/uvr5/lib/utils.py +++ b/tools/uvr5/lib/utils.py @@ -24,7 +24,7 @@ def make_padding(width, cropsize, offset): def inference(X_spec, device, model, aggressiveness, data): """ - data : dic configs + data : dic configs """ def _execute( From b28194ea7612215f4cca8976fe13311edc26d4f6 Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:01:03 +0100 Subject: [PATCH 02/58] Code refactor --- webui.py | 1319 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 964 insertions(+), 355 deletions(-) diff --git a/webui.py b/webui.py index dbccba7..703c597 100644 --- a/webui.py +++ 
b/webui.py @@ -1,35 +1,48 @@ -import json,yaml,warnings,torch +import json, yaml, warnings, torch import platform warnings.filterwarnings("ignore") torch.manual_seed(233333) -import os,pdb,sys +import os, sys + now_dir = os.getcwd() tmp = os.path.join(now_dir, "TEMP") os.makedirs(tmp, exist_ok=True) os.environ["TEMP"] = tmp import site -site_packages_root="%s/runtime/Lib/site-packages"%now_dir + +site_packages_root = "%s/runtime/Lib/site-packages" % now_dir for path in site.getsitepackages(): - if("site-packages"in path):site_packages_root=path + if "site-packages" in path: + site_packages_root = path os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" -with open("%s/users.pth"%(site_packages_root),"w")as f: - f.write("%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"%(now_dir,now_dir,now_dir,now_dir,now_dir)) +with open("%s/users.pth" % (site_packages_root), "w") as f: + f.write( + "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5" + % (now_dir, now_dir, now_dir, now_dir, now_dir) + ) import traceback + sys.path.append(now_dir) -import shutil -import pdb import gradio as gr from subprocess import Popen -import signal -from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix +from config import ( + python_exec, + infer_device, + is_half, + exp_root, + webui_port_main, + webui_port_infer_tts, + webui_port_uvr5, + webui_port_subfix, +) from i18n.i18n import I18nAuto + i18n = I18nAuto() -from scipy.io import wavfile -from tools.my_utils import load_audio from multiprocessing import cpu_count -n_cpu=cpu_count() + +n_cpu = cpu_count() # 判断是否有能用来训练和加速推理的N卡 ngpu = torch.cuda.device_count() @@ -40,11 +53,42 @@ if_gpu_ok = False if torch.cuda.is_available() or ngpu != 0: for i in range(ngpu): gpu_name = torch.cuda.get_device_name(i) - if any(value in gpu_name.upper()for value in 
["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L"]): + if any( + value in gpu_name.upper() + for value in [ + "10", + "16", + "20", + "30", + "40", + "A2", + "A3", + "A4", + "P4", + "A50", + "500", + "A60", + "70", + "80", + "90", + "M4", + "T4", + "TITAN", + "L", + ] + ): # A10#A100#V100#A40#P40#M40#K80#A4500 if_gpu_ok = True # 至少有一张能用的N卡 gpu_infos.append("%s\t%s" % (i, gpu_name)) - mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4)) + mem.append( + int( + torch.cuda.get_device_properties(i).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + ) if if_gpu_ok and len(gpu_infos) > 0: gpu_info = "\n".join(gpu_infos) default_batch_size = min(mem) // 2 @@ -53,230 +97,395 @@ else: default_batch_size = 1 gpus = "-".join([i[0] for i in gpu_infos]) -pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth" -pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth" +pretrained_gpt_name = ( + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +) + + def get_weights_names(): SoVITS_names = [pretrained_sovits_name] for name in os.listdir(SoVITS_weight_root): - if name.endswith(".pth"):SoVITS_names.append(name) + if name.endswith(".pth"): + SoVITS_names.append(name) GPT_names = [pretrained_gpt_name] for name in os.listdir(GPT_weight_root): - if name.endswith(".ckpt"): GPT_names.append(name) - return SoVITS_names,GPT_names -SoVITS_weight_root="SoVITS_weights" -GPT_weight_root="GPT_weights" -os.makedirs(SoVITS_weight_root,exist_ok=True) -os.makedirs(GPT_weight_root,exist_ok=True) -SoVITS_names,GPT_names = get_weights_names() + if name.endswith(".ckpt"): + GPT_names.append(name) + return SoVITS_names, GPT_names + + +SoVITS_weight_root = "SoVITS_weights" +GPT_weight_root = "GPT_weights" +os.makedirs(SoVITS_weight_root, exist_ok=True) 
+os.makedirs(GPT_weight_root, exist_ok=True) +SoVITS_names, GPT_names = get_weights_names() + def change_choices(): SoVITS_names, GPT_names = get_weights_names() - return {"choices": sorted(SoVITS_names), "__type__": "update"}, {"choices": sorted(GPT_names), "__type__": "update"} + return {"choices": sorted(SoVITS_names), "__type__": "update"}, { + "choices": sorted(GPT_names), + "__type__": "update", + } + + +p_label = None +p_uvr5 = None +p_asr = None +p_tts_inference = None + +system = platform.system() -p_label=None -p_uvr5=None -p_asr=None -p_tts_inference=None -system=platform.system() def kill_process(pid): - if(system=="Windows"): + if system == "Windows": cmd = "taskkill /t /f /pid %s" % pid else: - cmd = "kill -9 %s"%pid + cmd = "kill -9 %s" % pid print(cmd) - os.system(cmd)###linux上杀了webui,可能还会没杀干净。。。 + os.system(cmd) ###linux上杀了webui,可能还会没杀干净。。。 # os.kill(p_label.pid,19)#主进程#控制台进程#python子进程###不好使,连主进程的webui一起关了,辣鸡 -def change_label(if_label,path_list): + +def change_label(if_label, path_list): global p_label - if(if_label==True and p_label==None): - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s'%(python_exec,path_list,webui_port_subfix) + if if_label == True and p_label == None: + cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s' % ( + python_exec, + path_list, + webui_port_subfix, + ) yield "打标工具WebUI已开启" print(cmd) p_label = Popen(cmd, shell=True) - elif(if_label==False and p_label!=None): + elif if_label == False and p_label != None: kill_process(p_label.pid) - p_label=None + p_label = None yield "打标工具WebUI已关闭" + def change_uvr5(if_uvr5): global p_uvr5 - if(if_uvr5==True and p_uvr5==None): - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5) + if if_uvr5 == True and p_uvr5 == None: + cmd = '"%s" tools/uvr5/webui.py "%s" %s %s' % ( + python_exec, + infer_device, + is_half, + webui_port_uvr5, + ) yield "UVR5已开启" print(cmd) p_uvr5 = Popen(cmd, shell=True) - 
elif(if_uvr5==False and p_uvr5!=None): + elif if_uvr5 == False and p_uvr5 != None: kill_process(p_uvr5.pid) - p_uvr5=None + p_uvr5 = None yield "UVR5已关闭" -def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path): + +def change_tts_inference( + if_tts, bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path +): global p_tts_inference - if(if_tts==True and p_tts_inference==None): - os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path) - os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path) - os.environ["cnhubert_base_path"]=cnhubert_base_path - os.environ["bert_path"]=bert_path - os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number - os.environ["is_half"]=str(is_half) - os.environ["infer_ttswebui"]=str(webui_port_infer_tts) - cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec) + if if_tts == True and p_tts_inference == None: + os.environ["gpt_path"] = ( + gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) + ) + os.environ["sovits_path"] = ( + sovits_path + if "/" in sovits_path + else "%s/%s" % (SoVITS_weight_root, sovits_path) + ) + os.environ["cnhubert_base_path"] = cnhubert_base_path + os.environ["bert_path"] = bert_path + os.environ["_CUDA_VISIBLE_DEVICES"] = gpu_number + os.environ["is_half"] = str(is_half) + os.environ["infer_ttswebui"] = str(webui_port_infer_tts) + cmd = '"%s" GPT_SoVITS/inference_webui.py' % (python_exec) yield "TTS推理进程已开启" print(cmd) p_tts_inference = Popen(cmd, shell=True) - elif(if_tts==False and p_tts_inference!=None): + elif if_tts == False and p_tts_inference != None: kill_process(p_tts_inference.pid) - p_tts_inference=None + p_tts_inference = None yield "TTS推理进程已关闭" def open_asr(asr_inp_dir): global p_asr - if(p_asr==None): - cmd = '"%s" tools/damo_asr/cmd-asr.py "%s"'%(python_exec,asr_inp_dir) - yield 
"ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} + if p_asr == None: + cmd = '"%s" tools/damo_asr/cmd-asr.py "%s"' % (python_exec, asr_inp_dir) + yield "ASR任务开启:%s" % cmd, {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } print(cmd) p_asr = Popen(cmd, shell=True) p_asr.wait() - p_asr=None - yield "ASR任务完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + p_asr = None + yield "ASR任务完成", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "已有正在进行的ASR任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + def close_asr(): global p_asr - if(p_asr!=None): + if p_asr != None: kill_process(p_asr.pid) - p_asr=None - return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + p_asr = None + return ( + "已终止ASR进程", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -''' + +""" button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Bb,button1Ba_open,button1Ba_close]) button1Ba_close.click(close1Ba, [], [info1Bb,button1Ba_open,button1Ba_close]) -''' -p_train_SoVITS=None -def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D): - global p_train_SoVITS - if(p_train_SoVITS==None): - with open("GPT_SoVITS/configs/s2.json")as f: - data=f.read() - data=json.loads(data) - s2_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - 
data["train"]["text_low_lr_rate"]=text_low_lr_rate - data["train"]["pretrained_s2G"]=pretrained_s2G - data["train"]["pretrained_s2D"]=pretrained_s2D - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["save_every_epoch"]=save_every_epoch - data["train"]["gpu_numbers"]=gpu_numbers1Ba - data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir - data["save_weight_dir"]=SoVITS_weight_root - data["name"]=exp_name - tmp_config_path="TEMP/tmp_s2.json" - with open(tmp_config_path,"w")as f:f.write(json.dumps(data)) +""" +p_train_SoVITS = None - cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path) - yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} + +def open1Ba( + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, +): + global p_train_SoVITS + if p_train_SoVITS == None: + with open("GPT_SoVITS/configs/s2.json") as f: + data = f.read() + data = json.loads(data) + s2_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s2" % (s2_dir), exist_ok=True) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["train"]["text_low_lr_rate"] = text_low_lr_rate + data["train"]["pretrained_s2G"] = pretrained_s2G + data["train"]["pretrained_s2D"] = pretrained_s2D + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["save_every_epoch"] = save_every_epoch + data["train"]["gpu_numbers"] = gpu_numbers1Ba + data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir + data["save_weight_dir"] = SoVITS_weight_root + data["name"] = exp_name + tmp_config_path = "TEMP/tmp_s2.json" + with open(tmp_config_path, "w") as f: + f.write(json.dumps(data)) + + cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % ( + 
python_exec, + tmp_config_path, + ) + yield "SoVITS训练开始:%s" % cmd, {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } print(cmd) p_train_SoVITS = Popen(cmd, shell=True) p_train_SoVITS.wait() - p_train_SoVITS=None - yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + p_train_SoVITS = None + yield "SoVITS训练完成", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务", { + "__type__": "update", + "visible": False, + }, {"__type__": "update", "visible": True} + def close1Ba(): global p_train_SoVITS - if(p_train_SoVITS!=None): + if p_train_SoVITS != None: kill_process(p_train_SoVITS.pid) - p_train_SoVITS=None - return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + p_train_SoVITS = None + return ( + "已终止SoVITS训练", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -p_train_GPT=None -def open1Bb(batch_size,total_epoch,exp_name,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1): + +p_train_GPT = None + + +def open1Bb( + batch_size, + total_epoch, + exp_name, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers, + pretrained_s1, +): global p_train_GPT - if(p_train_GPT==None): - with open("GPT_SoVITS/configs/s1longer.yaml")as f: - data=f.read() - data=yaml.load(data, Loader=yaml.FullLoader) - s1_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["pretrained_s1"]=pretrained_s1 - data["train"]["save_every_n_epoch"]=save_every_epoch - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["if_save_latest"]=if_save_latest - 
data["train"]["half_weights_save_dir"]=GPT_weight_root - data["train"]["exp_name"]=exp_name - data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir - data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir - data["output_dir"]="%s/logs_s1"%s1_dir + if p_train_GPT == None: + with open("GPT_SoVITS/configs/s1longer.yaml") as f: + data = f.read() + data = yaml.load(data, Loader=yaml.FullLoader) + s1_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s1" % (s1_dir), exist_ok=True) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["pretrained_s1"] = pretrained_s1 + data["train"]["save_every_n_epoch"] = save_every_epoch + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["if_save_latest"] = if_save_latest + data["train"]["half_weights_save_dir"] = GPT_weight_root + data["train"]["exp_name"] = exp_name + data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir + data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir + data["output_dir"] = "%s/logs_s1" % s1_dir - os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",") - os.environ["hz"]="25hz" - tmp_config_path="TEMP/tmp_s1.yaml" - with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False)) + os.environ["_CUDA_VISIBLE_DEVICES"] = gpu_numbers.replace("-", ",") + os.environ["hz"] = "25hz" + tmp_config_path = "TEMP/tmp_s1.yaml" + with open(tmp_config_path, "w") as f: + f.write(yaml.dump(data, default_flow_style=False)) # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) - cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path) - yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} + cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" ' % ( + python_exec, 
+ tmp_config_path, + ) + yield "GPT训练开始:%s" % cmd, {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } print(cmd) p_train_GPT = Popen(cmd, shell=True) p_train_GPT.wait() - p_train_GPT=None - yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + p_train_GPT = None + yield "GPT训练完成", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务", { + "__type__": "update", + "visible": False, + }, {"__type__": "update", "visible": True} + def close1Bb(): global p_train_GPT - if(p_train_GPT!=None): + if p_train_GPT != None: kill_process(p_train_GPT.pid) - p_train_GPT=None - return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + p_train_GPT = None + return ( + "已终止GPT训练", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps_slice=[] -def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts): + +ps_slice = [] + + +def open_slice( + inp, + opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + n_parts, +): global ps_slice - if(os.path.exists(inp)==False): - yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + if os.path.exists(inp) == False: + yield "输入路径不存在", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } return - if os.path.isfile(inp):n_parts=1 - elif os.path.isdir(inp):pass + if os.path.isfile(inp): + n_parts = 1 + elif os.path.isdir(inp): + pass else: - yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "输入路径存在但既不是文件也不是文件夹", {"__type__": "update", "visible": True}, { + "__type__": "update", + 
"visible": False, + } return - if (ps_slice == []): + if ps_slice == []: for i_part in range(n_parts): - cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts) + cmd = ( + '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' + "" + % ( + python_exec, + inp, + opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + i_part, + n_parts, + ) + ) print(cmd) p = Popen(cmd, shell=True) ps_slice.append(p) - yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "切割执行中", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } for p in ps_slice: p.wait() - ps_slice=[] - yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + ps_slice = [] + yield "切割结束", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + def close_slice(): global ps_slice - if (ps_slice != []): + if ps_slice != []: for p_slice in ps_slice: try: kill_process(p_slice.pid) except: traceback.print_exc() - ps_slice=[] - return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps_slice = [] + return ( + "已终止所有切割进程", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -''' + +""" inp_text= os.environ.get("inp_text") inp_wav_dir= os.environ.get("inp_wav_dir") exp_name= os.environ.get("exp_name") @@ -285,53 +494,71 @@ all_parts= os.environ.get("all_parts") os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") 
opt_dir= os.environ.get("opt_dir")#"/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name bert_pretrained_dir= os.environ.get("bert_pretrained_dir")#"/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" -''' -ps1a=[] -def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): +""" +ps1a = [] + + +def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): global ps1a - if (ps1a == []): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":"%s/%s"%(exp_root,exp_name), - "bert_pretrained_dir":bert_pretrained_dir, + if ps1a == []: + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": "%s/%s" % (exp_root, exp_name), + "bert_pretrained_dir": bert_pretrained_dir, } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { "i_part": str(i_part), "all_parts": str(all_parts), "_CUDA_VISIBLE_DEVICES": gpu_names[i_part], - "is_half": str(is_half) + "is_half": str(is_half), } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1a.append(p) - yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "文本进程执行中", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } for p in ps1a: p.wait() - ps1a=[] - yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + ps1a = [] + yield "文本进程结束", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield 
"已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + def close1a(): global ps1a - if (ps1a != []): + if ps1a != []: for p1a in ps1a: try: kill_process(p1a.pid) except: traceback.print_exc() - ps1a=[] - return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} -''' + ps1a = [] + return ( + "已终止所有1a进程", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +""" inp_text= os.environ.get("inp_text") inp_wav_dir= os.environ.get("inp_wav_dir") exp_name= os.environ.get("exp_name") @@ -340,21 +567,23 @@ all_parts= os.environ.get("all_parts") os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") opt_dir= os.environ.get("opt_dir") cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") -''' -ps1b=[] -def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): +""" +ps1b = [] + + +def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): global ps1b - if (ps1b == []): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":"%s/%s"%(exp_root,exp_name), - "cnhubert_base_dir":ssl_pretrained_dir, - "is_half": str(is_half) + if ps1b == []: + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": "%s/%s" % (exp_root, exp_name), + "cnhubert_base_dir": ssl_pretrained_dir, + "is_half": str(is_half), } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -364,29 +593,47 @@ def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec + cmd = ( + '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec + ) print(cmd) p = 
Popen(cmd, shell=True) ps1b.append(p) - yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } for p in ps1b: p.wait() - ps1b=[] - yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + ps1b = [] + yield "SSL提取进程结束", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", { + "__type__": "update", + "visible": False, + }, {"__type__": "update", "visible": True} + def close1b(): global ps1b - if (ps1b != []): + if ps1b != []: for p1b in ps1b: try: kill_process(p1b.pid) except: traceback.print_exc() - ps1b=[] - return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} -''' + ps1b = [] + return ( + "已终止所有1b进程", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + +""" inp_text= os.environ.get("inp_text") exp_name= os.environ.get("exp_name") i_part= os.environ.get("i_part") @@ -394,21 +641,23 @@ all_parts= os.environ.get("all_parts") os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") opt_dir= os.environ.get("opt_dir") pretrained_s2G= os.environ.get("pretrained_s2G") -''' -ps1c=[] -def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): +""" +ps1c = [] + + +def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): global ps1c - if (ps1c == []): - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":"%s/%s"%(exp_root,exp_name), - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", - "is_half": str(is_half) + if ps1c == []: + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": "%s/%s" % 
(exp_root, exp_name), + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", + "is_half": str(is_half), } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -418,48 +667,76 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1c.append(p) - yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } for p in ps1c: p.wait() - ps1c=[] - yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + ps1c = [] + yield "语义token提取进程结束", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", { + "__type__": "update", + "visible": False, + }, {"__type__": "update", "visible": True} + def close1c(): global ps1c - if (ps1c != []): + if ps1c != []: for p1c in ps1c: try: kill_process(p1c.pid) except: traceback.print_exc() - ps1c=[] - return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1c = [] + return ( + "已终止所有语义token进程", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + + #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G -ps1abc=[] -def 
open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path): +ps1abc = [] + + +def open1abc( + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + ssl_pretrained_dir, + pretrained_s2G_path, +): global ps1abc - if (ps1abc == []): - opt_dir="%s/%s"%(exp_root,exp_name) + if ps1abc == []: + opt_dir = "%s/%s" % (exp_root, exp_name) try: #############################1a - path_text="%s/2-name2text.txt" % opt_dir - if(os.path.exists(path_text)==False): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, - "is_half": str(is_half) + path_text = "%s/2-name2text.txt" % opt_dir + if os.path.exists(path_text) == False: + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, + "is_half": str(is_half), } - gpu_names=gpu_numbers1a.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1a.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -469,34 +746,43 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() + yield "进度:1a-ing", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + for p in ps1abc: + p.wait() opt = [] - for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) + for i_part in range( + all_parts + ): # 
txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) - with open(txt_path, "r",encoding="utf8") as f: + with open(txt_path, "r", encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(txt_path) - with open(path_text, "w",encoding="utf8") as f: + with open(path_text, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] - #############################1b - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "cnhubert_base_dir":ssl_pretrained_dir, + yield "进度:1a-done", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, } - gpu_names=gpu_numbers1Ba.split("-") - all_parts=len(gpu_names) + ps1abc = [] + #############################1b + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "cnhubert_base_dir": ssl_pretrained_dir, + } + gpu_names = gpu_numbers1Ba.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -506,26 +792,36 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec + cmd = ( + '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' + % python_exec + ) print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() - yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] + yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + for p in ps1abc: + p.wait() + yield "进度:1a1b-done", {"__type__": 
"update", "visible": False}, { + "__type__": "update", + "visible": True, + } + ps1abc = [] #############################1c path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if(os.path.exists(path_semantic)==False): - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", + if os.path.exists(path_semantic) == False: + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", } - gpu_names=gpu_numbers1c.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1c.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -535,74 +831,137 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec + cmd = ( + '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' + % python_exec + ) print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() + yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + for p in ps1abc: + p.wait() opt = ["item_name semantic_audio"] for i_part in range(all_parts): semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) - with open(semantic_path, "r",encoding="utf8") as f: + with open(semantic_path, "r", encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(semantic_path) - with open(path_semantic, "w",encoding="utf8") as f: + with open(path_semantic, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "进度:all-done", 
{"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } ps1abc = [] - yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield "一键三连进程结束", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } except: traceback.print_exc() close1abc() - yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield "一键三连中途报错", {"__type__": "update", "visible": True}, { + "__type__": "update", + "visible": False, + } else: - yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { + "__type__": "update", + "visible": True, + } + def close1abc(): global ps1abc - if (ps1abc != []): + if ps1abc != []: for p1abc in ps1abc: try: kill_process(p1abc.pid) except: traceback.print_exc() - ps1abc=[] - return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1abc = [] + return ( + "已终止所有一键三连进程", + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( - value= - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." + value="本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." ) with gr.Tabs(): - with gr.TabItem("0-前置数据集获取工具"):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 + with gr.TabItem("0-前置数据集获取工具"): # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 gr.Markdown(value="0a-UVR5人声伴奏分离&去混响去延迟工具") with gr.Row(): - if_uvr5 = gr.Checkbox(label="是否开启UVR5-WebUI",show_label=True) + if_uvr5 = gr.Checkbox(label="是否开启UVR5-WebUI", show_label=True) uvr5_info = gr.Textbox(label="UVR5进程输出信息") gr.Markdown(value="0b-语音切分工具") with gr.Row(): with gr.Row(): - slice_inp_path=gr.Textbox(label="音频自动切分输入路径,可文件可文件夹",value="") - slice_opt_root=gr.Textbox(label="切分后的子音频的输出根目录",value="output/slicer_opt") - threshold=gr.Textbox(label="threshold:音量小于这个值视作静音的备选切割点",value="-34") - min_length=gr.Textbox(label="min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值",value="4000") - min_interval=gr.Textbox(label="min_interval:最短切割间隔",value="300") - hop_size=gr.Textbox(label="hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",value="10") - max_sil_kept=gr.Textbox(label="max_sil_kept:切完后静音最多留多长",value="500") + slice_inp_path = gr.Textbox(label="音频自动切分输入路径,可文件可文件夹", value="") + slice_opt_root = gr.Textbox( + label="切分后的子音频的输出根目录", value="output/slicer_opt" + ) + threshold = gr.Textbox( + label="threshold:音量小于这个值视作静音的备选切割点", value="-34" + ) + min_length = gr.Textbox( + label="min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值", value="4000" + ) + min_interval = gr.Textbox(label="min_interval:最短切割间隔", value="300") + hop_size = gr.Textbox( + label="hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)", value="10" + ) + max_sil_kept = gr.Textbox( + label="max_sil_kept:切完后静音最多留多长", value="500" + ) with gr.Row(): - open_slicer_button=gr.Button("开启语音切割", variant="primary",visible=True) - close_slicer_button=gr.Button("终止语音切割", variant="primary",visible=False) - _max=gr.Slider(minimum=0,maximum=1,step=0.05,label="max:归一化后最大值多少",value=0.9,interactive=True) - alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label="alpha_mix:混多少比例归一化后音频进来",value=0.25,interactive=True) - 
n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label="切割使用的进程数",value=4,interactive=True) + open_slicer_button = gr.Button( + "开启语音切割", variant="primary", visible=True + ) + close_slicer_button = gr.Button( + "终止语音切割", variant="primary", visible=False + ) + _max = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label="max:归一化后最大值多少", + value=0.9, + interactive=True, + ) + alpha = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label="alpha_mix:混多少比例归一化后音频进来", + value=0.25, + interactive=True, + ) + n_process = gr.Slider( + minimum=1, + maximum=n_cpu, + step=1, + label="切割使用的进程数", + value=4, + interactive=True, + ) slicer_info = gr.Textbox(label="语音切割进程输出信息") gr.Markdown(value="0c-中文批量离线ASR工具") with gr.Row(): - open_asr_button = gr.Button("开启离线批量ASR", variant="primary",visible=True) - close_asr_button = gr.Button("终止ASR进程", variant="primary",visible=False) + open_asr_button = gr.Button( + "开启离线批量ASR", variant="primary", visible=True + ) + close_asr_button = gr.Button( + "终止ASR进程", variant="primary", visible=False + ) asr_inp_dir = gr.Textbox( label="批量ASR(中文only)输入文件夹路径", value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx", @@ -611,115 +970,365 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: asr_info = gr.Textbox(label="ASR进程输出信息") gr.Markdown(value="0d-语音文本校对标注工具") with gr.Row(): - if_label = gr.Checkbox(label="是否开启打标WebUI",show_label=True) + if_label = gr.Checkbox(label="是否开启打标WebUI", show_label=True) path_list = gr.Textbox( label="打标数据标注文件路径", value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", interactive=True, ) label_info = gr.Textbox(label="打标工具进程输出信息") - if_label.change(change_label, [if_label,path_list], [label_info]) + if_label.change(change_label, [if_label, path_list], [label_info]) if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info]) - open_asr_button.click(open_asr, [asr_inp_dir], [asr_info,open_asr_button,close_asr_button]) - close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button]) - open_slicer_button.click(open_slice, 
[slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button]) - close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button]) + open_asr_button.click( + open_asr, [asr_inp_dir], [asr_info, open_asr_button, close_asr_button] + ) + close_asr_button.click( + close_asr, [], [asr_info, open_asr_button, close_asr_button] + ) + open_slicer_button.click( + open_slice, + [ + slice_inp_path, + slice_opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + n_process, + ], + [slicer_info, open_slicer_button, close_slicer_button], + ) + close_slicer_button.click( + close_slice, [], [slicer_info, open_slicer_button, close_slicer_button] + ) with gr.TabItem("1-GPT-SoVITS-TTS"): with gr.Row(): exp_name = gr.Textbox(label="*实验/模型名", value="xxx", interactive=True) - gpu_info = gr.Textbox(label="显卡信息", value=gpu_info, visible=True, interactive=False) - pretrained_s2G = gr.Textbox(label="预训练的SoVITS-G模型路径", value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True) - pretrained_s2D = gr.Textbox(label="预训练的SoVITS-D模型路径", value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True) - pretrained_s1 = gr.Textbox(label="预训练的GPT模型路径", value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True) + gpu_info = gr.Textbox( + label="显卡信息", value=gpu_info, visible=True, interactive=False + ) + pretrained_s2G = gr.Textbox( + label="预训练的SoVITS-G模型路径", + value="GPT_SoVITS/pretrained_models/s2G488k.pth", + interactive=True, + ) + pretrained_s2D = gr.Textbox( + label="预训练的SoVITS-D模型路径", + value="GPT_SoVITS/pretrained_models/s2D488k.pth", + interactive=True, + ) + pretrained_s1 = gr.Textbox( + label="预训练的GPT模型路径", + value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + interactive=True, + ) with gr.TabItem("1A-训练集格式化工具"): 
gr.Markdown(value="输出logs/实验名目录下应有23456开头的文件和文件夹") with gr.Row(): - inp_text = gr.Textbox(label="*文本标注文件",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True) - inp_wav_dir = gr.Textbox(label="*训练集音频文件目录",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",interactive=True) + inp_text = gr.Textbox( + label="*文本标注文件", + value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list", + interactive=True, + ) + inp_wav_dir = gr.Textbox( + label="*训练集音频文件目录", + value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", + interactive=True, + ) gr.Markdown(value="1Aa-文本内容") with gr.Row(): - gpu_numbers1a = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) - bert_pretrained_dir = gr.Textbox(label="预训练的中文BERT模型路径",value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False) - button1a_open = gr.Button("开启文本获取", variant="primary",visible=True) - button1a_close = gr.Button("终止文本获取进程", variant="primary",visible=False) - info1a=gr.Textbox(label="文本进程输出信息") + gpu_numbers1a = gr.Textbox( + label="GPU卡号以-分割,每个卡号一个进程", + value="%s-%s" % (gpus, gpus), + interactive=True, + ) + bert_pretrained_dir = gr.Textbox( + label="预训练的中文BERT模型路径", + value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + interactive=False, + ) + button1a_open = gr.Button("开启文本获取", variant="primary", visible=True) + button1a_close = gr.Button( + "终止文本获取进程", variant="primary", visible=False + ) + info1a = gr.Textbox(label="文本进程输出信息") gr.Markdown(value="1Ab-SSL自监督特征提取") with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) - cnhubert_base_dir = gr.Textbox(label="预训练的SSL模型路径",value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False) - button1b_open = gr.Button("开启SSL提取", variant="primary",visible=True) - button1b_close = gr.Button("终止SSL提取进程", variant="primary",visible=False) - info1b=gr.Textbox(label="SSL进程输出信息") + gpu_numbers1Ba = gr.Textbox( + label="GPU卡号以-分割,每个卡号一个进程", + value="%s-%s" % (gpus, gpus), + 
interactive=True, + ) + cnhubert_base_dir = gr.Textbox( + label="预训练的SSL模型路径", + value="GPT_SoVITS/pretrained_models/chinese-hubert-base", + interactive=False, + ) + button1b_open = gr.Button( + "开启SSL提取", variant="primary", visible=True + ) + button1b_close = gr.Button( + "终止SSL提取进程", variant="primary", visible=False + ) + info1b = gr.Textbox(label="SSL进程输出信息") gr.Markdown(value="1Ac-语义token提取") with gr.Row(): - gpu_numbers1c = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) - button1c_open = gr.Button("开启语义token提取", variant="primary",visible=True) - button1c_close = gr.Button("终止语义token提取进程", variant="primary",visible=False) - info1c=gr.Textbox(label="语义token提取进程输出信息") + gpu_numbers1c = gr.Textbox( + label="GPU卡号以-分割,每个卡号一个进程", + value="%s-%s" % (gpus, gpus), + interactive=True, + ) + button1c_open = gr.Button( + "开启语义token提取", variant="primary", visible=True + ) + button1c_close = gr.Button( + "终止语义token提取进程", variant="primary", visible=False + ) + info1c = gr.Textbox(label="语义token提取进程输出信息") gr.Markdown(value="1Aabc-训练集格式化一键三连") with gr.Row(): - button1abc_open = gr.Button("开启一键三连", variant="primary",visible=True) - button1abc_close = gr.Button("终止一键三连", variant="primary",visible=False) - info1abc=gr.Textbox(label="一键三连进程输出信息") - button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close]) - button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close]) - button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close]) - button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close]) - button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close]) - button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close]) - button1abc_open.click(open1abc, 
[inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close]) - button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close]) + button1abc_open = gr.Button( + "开启一键三连", variant="primary", visible=True + ) + button1abc_close = gr.Button( + "终止一键三连", variant="primary", visible=False + ) + info1abc = gr.Textbox(label="一键三连进程输出信息") + button1a_open.click( + open1a, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1a, bert_pretrained_dir], + [info1a, button1a_open, button1a_close], + ) + button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close]) + button1b_open.click( + open1b, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], + [info1b, button1b_open, button1b_close], + ) + button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close]) + button1c_open.click( + open1c, + [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], + [info1c, button1c_open, button1c_close], + ) + button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close]) + button1abc_open.click( + open1abc, + [ + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + cnhubert_base_dir, + pretrained_s2G, + ], + [info1abc, button1abc_open, button1abc_close], + ) + button1abc_close.click( + close1abc, [], [info1abc, button1abc_open, button1abc_close] + ) with gr.TabItem("1B-微调训练"): gr.Markdown(value="1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。") with gr.Row(): - batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) - total_epoch = gr.Slider(minimum=1,maximum=20,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True) - text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label="文本模块学习率权重",value=0.4,interactive=True) - save_every_epoch = 
gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True) - if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) - gpu_numbers1Ba = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程", value="%s" % (gpus), interactive=True) + batch_size = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, + ) + total_epoch = gr.Slider( + minimum=1, + maximum=20, + step=1, + label=i18n("总训练轮数total_epoch,不建议太高"), + value=8, + interactive=True, + ) + text_low_lr_rate = gr.Slider( + minimum=0.2, + maximum=0.6, + step=0.05, + label="文本模块学习率权重", + value=0.4, + interactive=True, + ) + save_every_epoch = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=4, + interactive=True, + ) + if_save_latest = gr.Checkbox( + label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + gpu_numbers1Ba = gr.Textbox( + label="GPU卡号以-分割,每个卡号一个进程", + value="%s" % (gpus), + interactive=True, + ) with gr.Row(): - button1Ba_open = gr.Button("开启SoVITS训练", variant="primary",visible=True) - button1Ba_close = gr.Button("终止SoVITS训练", variant="primary",visible=False) - info1Ba=gr.Textbox(label="SoVITS训练进程输出信息") + button1Ba_open = gr.Button( + "开启SoVITS训练", variant="primary", visible=True + ) + button1Ba_close = gr.Button( + "终止SoVITS训练", variant="primary", visible=False + ) + info1Ba = gr.Textbox(label="SoVITS训练进程输出信息") gr.Markdown(value="1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。") with gr.Row(): - batch_size1Bb = 
gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) - total_epoch1Bb = gr.Slider(minimum=2,maximum=100,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True) - if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) - save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True) - gpu_numbers1Bb = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程", value="%s" % (gpus), interactive=True) + batch_size1Bb = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, + ) + total_epoch1Bb = gr.Slider( + minimum=2, + maximum=100, + step=1, + label=i18n("总训练轮数total_epoch"), + value=15, + interactive=True, + ) + if_save_latest1Bb = gr.Checkbox( + label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights1Bb = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + save_every_epoch1Bb = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + interactive=True, + ) + gpu_numbers1Bb = gr.Textbox( + label="GPU卡号以-分割,每个卡号一个进程", + value="%s" % (gpus), + interactive=True, + ) with gr.Row(): - button1Bb_open = gr.Button("开启GPT训练", variant="primary",visible=True) - button1Bb_close = gr.Button("终止GPT训练", variant="primary",visible=False) - info1Bb=gr.Textbox(label="GPT训练进程输出信息") - button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close]) - button1Ba_close.click(close1Ba, [], 
[info1Ba,button1Ba_open,button1Ba_close]) - button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close]) - button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close]) + button1Bb_open = gr.Button( + "开启GPT训练", variant="primary", visible=True + ) + button1Bb_close = gr.Button( + "终止GPT训练", variant="primary", visible=False + ) + info1Bb = gr.Textbox(label="GPT训练进程输出信息") + button1Ba_open.click( + open1Ba, + [ + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + ], + [info1Ba, button1Ba_open, button1Ba_close], + ) + button1Ba_close.click( + close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close] + ) + button1Bb_open.click( + open1Bb, + [ + batch_size1Bb, + total_epoch1Bb, + exp_name, + if_save_latest1Bb, + if_save_every_weights1Bb, + save_every_epoch1Bb, + gpu_numbers1Bb, + pretrained_s1, + ], + [info1Bb, button1Bb_open, button1Bb_close], + ) + button1Bb_close.click( + close1Bb, [], [info1Bb, button1Bb_open, button1Bb_close] + ) with gr.TabItem("1C-推理"): - gr.Markdown(value="选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。") + gr.Markdown( + value="选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。" + ) with gr.Row(): - GPT_dropdown = gr.Dropdown(label="*GPT模型列表", choices=sorted(GPT_names),value=pretrained_gpt_name) - SoVITS_dropdown = gr.Dropdown(label="*SoVITS模型列表", choices=sorted(SoVITS_names),value=pretrained_sovits_name) - gpu_number_1C=gr.Textbox(label="GPU卡号,只能填1个整数", value=gpus, interactive=True) + GPT_dropdown = gr.Dropdown( + label="*GPT模型列表", + choices=sorted(GPT_names), + value=pretrained_gpt_name, + ) + SoVITS_dropdown = gr.Dropdown( + label="*SoVITS模型列表", + choices=sorted(SoVITS_names), + value=pretrained_sovits_name, + ) + 
gpu_number_1C = gr.Textbox( + label="GPU卡号,只能填1个整数", value=gpus, interactive=True + ) refresh_button = gr.Button("刷新模型路径", variant="primary") - refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown]) + refresh_button.click( + fn=change_choices, + inputs=[], + outputs=[SoVITS_dropdown, GPT_dropdown], + ) with gr.Row(): if_tts = gr.Checkbox(label="是否开启TTS推理WebUI", show_label=True) tts_info = gr.Textbox(label="TTS推理WebUI进程输出信息") - if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info]) - with gr.TabItem("2-GPT-SoVITS-变声"):gr.Markdown(value="施工中,请静候佳音") + if_tts.change( + change_tts_inference, + [ + if_tts, + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + ], + [tts_info], + ) + with gr.TabItem("2-GPT-SoVITS-变声"): + gr.Markdown(value="施工中,请静候佳音") - ''' + """ os.environ["gpt_path"]=gpt_path os.environ["sovits_path"]=sovits_path#bert_pretrained_dir os.environ["cnhubert_base_path"]=cnhubert_base_path#cnhubert_base_dir os.environ["bert_path"]=bert_path os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number - ''' + """ app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", From 4e064427223713d305ac271c961f31f8977d774d Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:01:16 +0100 Subject: [PATCH 03/58] Fix reqs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 5ab846f..eef377f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ torchaudio modelscope sentencepiece transformers +yaml \ No newline at end of file From 2c18beebb71432e5665e9a6a620b2fde8466fb23 Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:01:27 +0100 Subject: [PATCH 04/58] add gitignore --- .gitignore | 2 ++ 1 file changed, 2 
insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3e82a98 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +env +runtime \ No newline at end of file From e71eb24b28d64b9f025bc659a1918bbe1e072783 Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:03:38 +0100 Subject: [PATCH 05/58] fix reqs x2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index eef377f..665602e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,4 @@ torchaudio modelscope sentencepiece transformers -yaml \ No newline at end of file +PyYAML \ No newline at end of file From c761de73f40155f3ed5048c688fe2a92807865a4 Mon Sep 17 00:00:00 2001 From: Ftps Date: Wed, 17 Jan 2024 01:04:01 +0900 Subject: [PATCH 06/58] add --- GPT_SoVITS/pretrained_models/.gitignore | 2 ++ GPT_SoVITS/pretrained_models/init | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 GPT_SoVITS/pretrained_models/.gitignore delete mode 100644 GPT_SoVITS/pretrained_models/init diff --git a/GPT_SoVITS/pretrained_models/.gitignore b/GPT_SoVITS/pretrained_models/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/GPT_SoVITS/pretrained_models/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/GPT_SoVITS/pretrained_models/init b/GPT_SoVITS/pretrained_models/init deleted file mode 100644 index 8b13789..0000000 --- a/GPT_SoVITS/pretrained_models/init +++ /dev/null @@ -1 +0,0 @@ - From 9031ac9a92b4d36c20f8f78e6af7c24f2348d064 Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:04:43 +0100 Subject: [PATCH 07/58] change i18n folder --- GPT_SoVITS/process_ckpt.py | 2 +- {i18n => tools/i18n}/i18n.py | 0 {i18n => tools/i18n}/locale/en_US.json | 0 {i18n => tools/i18n}/locale/es_ES.json | 0 {i18n => 
tools/i18n}/locale/fr_FR.json | 0 {i18n => tools/i18n}/locale/it_IT.json | 0 {i18n => tools/i18n}/locale/ja_JP.json | 0 {i18n => tools/i18n}/locale/ru_RU.json | 0 {i18n => tools/i18n}/locale/tr_TR.json | 0 {i18n => tools/i18n}/locale/zh_CN.json | 0 {i18n => tools/i18n}/locale/zh_HK.json | 0 {i18n => tools/i18n}/locale/zh_SG.json | 0 {i18n => tools/i18n}/locale/zh_TW.json | 0 {i18n => tools/i18n}/locale_diff.py | 0 {i18n => tools/i18n}/scan_i18n.py | 0 tools/uvr5/webui.py | 2 +- webui.py | 2 +- 17 files changed, 3 insertions(+), 3 deletions(-) rename {i18n => tools/i18n}/i18n.py (100%) rename {i18n => tools/i18n}/locale/en_US.json (100%) rename {i18n => tools/i18n}/locale/es_ES.json (100%) rename {i18n => tools/i18n}/locale/fr_FR.json (100%) rename {i18n => tools/i18n}/locale/it_IT.json (100%) rename {i18n => tools/i18n}/locale/ja_JP.json (100%) rename {i18n => tools/i18n}/locale/ru_RU.json (100%) rename {i18n => tools/i18n}/locale/tr_TR.json (100%) rename {i18n => tools/i18n}/locale/zh_CN.json (100%) rename {i18n => tools/i18n}/locale/zh_HK.json (100%) rename {i18n => tools/i18n}/locale/zh_SG.json (100%) rename {i18n => tools/i18n}/locale/zh_TW.json (100%) rename {i18n => tools/i18n}/locale_diff.py (100%) rename {i18n => tools/i18n}/scan_i18n.py (100%) diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index ed64cf2..170dbb3 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -4,7 +4,7 @@ import traceback from collections import OrderedDict import torch -from i18n.i18n import I18nAuto +from tools.i18n.i18n import I18nAuto i18n = I18nAuto() def savee(ckpt, name, epoch, steps, hps): try: diff --git a/i18n/i18n.py b/tools/i18n/i18n.py similarity index 100% rename from i18n/i18n.py rename to tools/i18n/i18n.py diff --git a/i18n/locale/en_US.json b/tools/i18n/locale/en_US.json similarity index 100% rename from i18n/locale/en_US.json rename to tools/i18n/locale/en_US.json diff --git a/i18n/locale/es_ES.json 
b/tools/i18n/locale/es_ES.json similarity index 100% rename from i18n/locale/es_ES.json rename to tools/i18n/locale/es_ES.json diff --git a/i18n/locale/fr_FR.json b/tools/i18n/locale/fr_FR.json similarity index 100% rename from i18n/locale/fr_FR.json rename to tools/i18n/locale/fr_FR.json diff --git a/i18n/locale/it_IT.json b/tools/i18n/locale/it_IT.json similarity index 100% rename from i18n/locale/it_IT.json rename to tools/i18n/locale/it_IT.json diff --git a/i18n/locale/ja_JP.json b/tools/i18n/locale/ja_JP.json similarity index 100% rename from i18n/locale/ja_JP.json rename to tools/i18n/locale/ja_JP.json diff --git a/i18n/locale/ru_RU.json b/tools/i18n/locale/ru_RU.json similarity index 100% rename from i18n/locale/ru_RU.json rename to tools/i18n/locale/ru_RU.json diff --git a/i18n/locale/tr_TR.json b/tools/i18n/locale/tr_TR.json similarity index 100% rename from i18n/locale/tr_TR.json rename to tools/i18n/locale/tr_TR.json diff --git a/i18n/locale/zh_CN.json b/tools/i18n/locale/zh_CN.json similarity index 100% rename from i18n/locale/zh_CN.json rename to tools/i18n/locale/zh_CN.json diff --git a/i18n/locale/zh_HK.json b/tools/i18n/locale/zh_HK.json similarity index 100% rename from i18n/locale/zh_HK.json rename to tools/i18n/locale/zh_HK.json diff --git a/i18n/locale/zh_SG.json b/tools/i18n/locale/zh_SG.json similarity index 100% rename from i18n/locale/zh_SG.json rename to tools/i18n/locale/zh_SG.json diff --git a/i18n/locale/zh_TW.json b/tools/i18n/locale/zh_TW.json similarity index 100% rename from i18n/locale/zh_TW.json rename to tools/i18n/locale/zh_TW.json diff --git a/i18n/locale_diff.py b/tools/i18n/locale_diff.py similarity index 100% rename from i18n/locale_diff.py rename to tools/i18n/locale_diff.py diff --git a/i18n/scan_i18n.py b/tools/i18n/scan_i18n.py similarity index 100% rename from i18n/scan_i18n.py rename to tools/i18n/scan_i18n.py diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index 051ece5..11b39f5 100644 --- a/tools/uvr5/webui.py 
+++ b/tools/uvr5/webui.py @@ -1,7 +1,7 @@ import os import traceback,gradio as gr import logging -from i18n.i18n import I18nAuto +from tools.i18n.i18n import I18nAuto i18n = I18nAuto() logger = logging.getLogger(__name__) diff --git a/webui.py b/webui.py index 703c597..e51b9e6 100644 --- a/webui.py +++ b/webui.py @@ -37,7 +37,7 @@ from config import ( webui_port_uvr5, webui_port_subfix, ) -from i18n.i18n import I18nAuto +from tools.i18n.i18n import I18nAuto i18n = I18nAuto() from multiprocessing import cpu_count From ebe96e0ebe25d6ac96ad1728f1a5d34fb94d4898 Mon Sep 17 00:00:00 2001 From: Ftps Date: Wed, 17 Jan 2024 01:05:50 +0900 Subject: [PATCH 08/58] add --- GPT_SoVITS/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 GPT_SoVITS/init diff --git a/GPT_SoVITS/init b/GPT_SoVITS/init deleted file mode 100644 index 8b13789..0000000 --- a/GPT_SoVITS/init +++ /dev/null @@ -1 +0,0 @@ - From 0d92575115ccd9d444854e8fc7244cd814d9f1cf Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:10:27 +0100 Subject: [PATCH 09/58] Code refactor + remove unused imports --- GPT_SoVITS/inference_webui.py | 342 +++++++++++++++++++++------------- GPT_SoVITS/process_ckpt.py | 9 +- GPT_SoVITS/s1_train.py | 127 ++++++++----- GPT_SoVITS/s2_train.py | 340 ++++++++++++++++++++++++--------- GPT_SoVITS/utils.py | 188 +++++++++++++------ 5 files changed, 671 insertions(+), 335 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4917d32..7d79fbd 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1,49 +1,55 @@ import os -gpt_path=os.environ.get("gpt_path","pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt") -sovits_path=os.environ.get("sovits_path","pretrained_models/s2G488k.pth") -cnhubert_base_path=os.environ.get("cnhubert_base_path","pretrained_models/chinese-hubert-base") 
-bert_path=os.environ.get("bert_path","pretrained_models/chinese-roberta-wwm-ext-large") -infer_ttswebui=os.environ.get("infer_ttswebui",9872) -infer_ttswebui=int(infer_ttswebui) -if("_CUDA_VISIBLE_DEVICES"in os.environ): - os.environ["CUDA_VISIBLE_DEVICES"]=os.environ["_CUDA_VISIBLE_DEVICES"] -is_half=eval(os.environ.get("is_half","True")) + +gpt_path = os.environ.get( + "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +) +sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth") +cnhubert_base_path = os.environ.get( + "cnhubert_base_path", "pretrained_models/chinese-hubert-base" +) +bert_path = os.environ.get( + "bert_path", "pretrained_models/chinese-roberta-wwm-ext-large" +) +infer_ttswebui = os.environ.get("infer_ttswebui", 9872) +infer_ttswebui = int(infer_ttswebui) +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +is_half = eval(os.environ.get("is_half", "True")) import gradio as gr from transformers import AutoModelForMaskedLM, AutoTokenizer -import sys,torch,numpy as np -from pathlib import Path -import os,pdb,utils,librosa,math,traceback,requests,argparse,torch,multiprocessing,pandas as pd,torch.multiprocessing as mp,soundfile +import torch, numpy as np +import os, librosa, torch + # torch.backends.cuda.sdp_kernel("flash") # torch.backends.cuda.enable_flash_sdp(True) # torch.backends.cuda.enable_mem_efficient_sdp(True) # Not avaliable if torch version is lower than 2.0 # torch.backends.cuda.enable_math_sdp(True) -from random import shuffle -from AR.utils import get_newest_ckpt -from glob import glob -from tqdm import tqdm from feature_extractor import cnhubert -cnhubert.cnhubert_base_path=cnhubert_base_path -from io import BytesIO + +cnhubert.cnhubert_base_path = cnhubert_base_path from module.models import SynthesizerTrn from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from AR.utils.io import load_yaml_config from 
text import cleaned_text_to_sequence -from text.cleaner import text_to_sequence, clean_text +from text.cleaner import clean_text from time import time as ttime from module.mel_processing import spectrogram_torch from my_utils import load_audio -device="cuda" +device = "cuda" tokenizer = AutoTokenizer.from_pretrained(bert_path) -bert_model=AutoModelForMaskedLM.from_pretrained(bert_path) -if(is_half==True):bert_model=bert_model.half().to(device) -else:bert_model=bert_model.to(device) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +if is_half == True: + bert_model = bert_model.half().to(device) +else: + bert_model = bert_model.to(device) + + # bert_model=bert_model.to(device) def get_bert_feature(text, word2ph): with torch.no_grad(): inputs = tokenizer(text, return_tensors="pt") for i in inputs: - inputs[i] = inputs[i].to(device)#####输入是long不用管精度问题,精度随bert_model + inputs[i] = inputs[i].to(device) #####输入是long不用管精度问题,精度随bert_model res = bert_model(**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] assert len(word2ph) == len(text) @@ -55,9 +61,12 @@ def get_bert_feature(text, word2ph): # if(is_half==True):phone_level_feature=phone_level_feature.half() return phone_level_feature.T + n_semantic = 1024 -dict_s2=torch.load(sovits_path,map_location="cpu") -hps=dict_s2["config"] +dict_s2 = torch.load(sovits_path, map_location="cpu") +hps = dict_s2["config"] + + class DictToAttrRecursive: def __init__(self, input_dict): for key, value in input_dict.items(): @@ -67,206 +76,271 @@ class DictToAttrRecursive: else: setattr(self, key, value) + hps = DictToAttrRecursive(hps) -hps.model.semantic_frame_rate="25hz" -dict_s1=torch.load(gpt_path,map_location="cpu") -config=dict_s1["config"] -ssl_model=cnhubert.get_model() -if(is_half==True):ssl_model=ssl_model.half().to(device) -else:ssl_model=ssl_model.to(device) +hps.model.semantic_frame_rate = "25hz" +dict_s1 = torch.load(gpt_path, map_location="cpu") +config = 
dict_s1["config"] +ssl_model = cnhubert.get_model() +if is_half == True: + ssl_model = ssl_model.half().to(device) +else: + ssl_model = ssl_model.to(device) vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model) -if(is_half==True):vq_model=vq_model.half().to(device) -else:vq_model=vq_model.to(device) + **hps.model +) +if is_half == True: + vq_model = vq_model.half().to(device) +else: + vq_model = vq_model.to(device) vq_model.eval() -print(vq_model.load_state_dict(dict_s2["weight"],strict=False)) +print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) hz = 50 -max_sec = config['data']['max_sec'] +max_sec = config["data"]["max_sec"] # t2s_model = Text2SemanticLightningModule.load_from_checkpoint(checkpoint_path=gpt_path, config=config, map_location="cpu")#########todo -t2s_model = Text2SemanticLightningModule(config,"ojbk",is_train=False) +t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False) t2s_model.load_state_dict(dict_s1["weight"]) -if(is_half==True):t2s_model=t2s_model.half() -t2s_model=t2s_model.to(device) +if is_half == True: + t2s_model = t2s_model.half() +t2s_model = t2s_model.to(device) t2s_model.eval() total = sum([param.nelement() for param in t2s_model.parameters()]) print("Number of parameter: %.2fM" % (total / 1e6)) + + def get_spepc(hps, filename): - audio=load_audio(filename,int(hps.data.sampling_rate)) - audio=torch.FloatTensor(audio) + audio = load_audio(filename, int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) return spec -dict_language={ - "中文":"zh", - 
"英文":"en", - "日文":"ja" -} -def get_tts_wav(ref_wav_path,prompt_text,prompt_language,text,text_language): + +dict_language = {"中文": "zh", "英文": "en", "日文": "ja"} + + +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language): t0 = ttime() - prompt_text=prompt_text.strip("\n") - prompt_language,text=prompt_language,text.strip("\n") + prompt_text = prompt_text.strip("\n") + prompt_language, text = prompt_language, text.strip("\n") with torch.no_grad(): wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙 wav16k = torch.from_numpy(wav16k) - if(is_half==True):wav16k=wav16k.half().to(device) - else:wav16k=wav16k.to(device) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float() + if is_half == True: + wav16k = wav16k.half().to(device) + else: + wav16k = wav16k.to(device) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ + "last_hidden_state" + ].transpose( + 1, 2 + ) # .float() codes = vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] t1 = ttime() - prompt_language=dict_language[prompt_language] - text_language=dict_language[text_language] + prompt_language = dict_language[prompt_language] + text_language = dict_language[text_language] phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language) - phones1=cleaned_text_to_sequence(phones1) - texts=text.split("\n") + phones1 = cleaned_text_to_sequence(phones1) + texts = text.split("\n") audio_opt = [] - zero_wav=np.zeros(int(hps.data.sampling_rate*0.3),dtype=np.float16 if is_half==True else np.float32) + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) for text in texts: phones2, word2ph2, norm_text2 = clean_text(text, text_language) phones2 = cleaned_text_to_sequence(phones2) - if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device) - else:bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else 
torch.float32).to(device) - if(text_language=="zh"):bert2 = get_bert_feature(norm_text2, word2ph2).to(device) - else:bert2 = torch.zeros((1024, len(phones2))).to(bert1) + if prompt_language == "zh": + bert1 = get_bert_feature(norm_text1, word2ph1).to(device) + else: + bert1 = torch.zeros( + (1024, len(phones1)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + if text_language == "zh": + bert2 = get_bert_feature(norm_text2, word2ph2).to(device) + else: + bert2 = torch.zeros((1024, len(phones2))).to(bert1) bert = torch.cat([bert1, bert2], 1) - all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) + all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) bert = bert.to(device).unsqueeze(0) all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) prompt = prompt_semantic.unsqueeze(0).to(device) t2 = ttime() with torch.no_grad(): # pred_semantic = t2s_model.model.infer( - pred_semantic,idx = t2s_model.model.infer_panel( + pred_semantic, idx = t2s_model.model.infer_panel( all_phoneme_ids, all_phoneme_len, prompt, bert, # prompt_phone_len=ph_offset, - top_k=config['inference']['top_k'], - early_stop_num=hz * max_sec) + top_k=config["inference"]["top_k"], + early_stop_num=hz * max_sec, + ) t3 = ttime() # print(pred_semantic.shape,idx) - pred_semantic = pred_semantic[:,-idx:].unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次 - refer = get_spepc(hps, ref_wav_path)#.to(device) - if(is_half==True):refer=refer.half().to(device) - else:refer=refer.to(device) + pred_semantic = pred_semantic[:, -idx:].unsqueeze( + 0 + ) # .unsqueeze(0)#mq要多unsqueeze一次 + refer = get_spepc(hps, ref_wav_path) # .to(device) + if is_half == True: + refer = refer.half().to(device) + else: + refer = refer.to(device) # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] - audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), 
refer).detach().cpu().numpy()[0, 0]###试试重建不带上prompt部分 + audio = ( + vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer + ) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 audio_opt.append(audio) audio_opt.append(zero_wav) t4 = ttime() print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) - yield hps.data.sampling_rate,(np.concatenate(audio_opt,0)*32768).astype(np.int16) + yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype( + np.int16 + ) + + +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} # 不考虑省略号 -splits={",","。","?","!",",",".","?","!","~",":",":","—","…",}#不考虑省略号 def split(todo_text): todo_text = todo_text.replace("……", "。").replace("——", ",") - if (todo_text[-1] not in splits): todo_text += "。" + if todo_text[-1] not in splits: + todo_text += "。" i_split_head = i_split_tail = 0 len_text = len(todo_text) todo_texts = [] - while (1): - if (i_split_head >= len_text): break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入 - if (todo_text[i_split_head] in splits): + while 1: + if i_split_head >= len_text: + break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入 + if todo_text[i_split_head] in splits: i_split_head += 1 todo_texts.append(todo_text[i_split_tail:i_split_head]) i_split_tail = i_split_head else: i_split_head += 1 return todo_texts + + def cut1(inp): - inp=inp.strip("\n") - inps=split(inp) - split_idx=list(range(0,len(inps),5)) - split_idx[-1]=None - if(len(split_idx)>1): - opts=[] - for idx in range(len(split_idx)-1): - opts.append("".join(inps[split_idx[idx]:split_idx[idx+1]])) + inp = inp.strip("\n") + inps = split(inp) + split_idx = list(range(0, len(inps), 5)) + split_idx[-1] = None + if len(split_idx) > 1: + opts = [] + for idx in range(len(split_idx) - 1): + opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]])) else: - opts=[inp] + opts = [inp] return "\n".join(opts) + def cut2(inp): - inp=inp.strip("\n") - inps=split(inp) - 
if(len(inps)<2):return [inp] - opts=[] - summ=0 - tmp_str="" + inp = inp.strip("\n") + inps = split(inp) + if len(inps) < 2: + return [inp] + opts = [] + summ = 0 + tmp_str = "" for i in range(len(inps)): - summ+=len(inps[i]) - tmp_str+=inps[i] - if(summ>50): - summ=0 + summ += len(inps[i]) + tmp_str += inps[i] + if summ > 50: + summ = 0 opts.append(tmp_str) - tmp_str="" - if(tmp_str!=""):opts.append(tmp_str) - if(len(opts[-1])<50):##如果最后一个太短了,和前一个合一起 - opts[-2]=opts[-2]+opts[-1] - opts=opts[:-1] + tmp_str = "" + if tmp_str != "": + opts.append(tmp_str) + if len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 + opts[-2] = opts[-2] + opts[-1] + opts = opts[:-1] return "\n".join(opts) + def cut3(inp): - inp=inp.strip("\n") - return "\n".join(["%s。"%item for item in inp.strip("。").split("。")]) + inp = inp.strip("\n") + return "\n".join(["%s。" % item for item in inp.strip("。").split("。")]) + with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( - value= - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." + value="本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." ) # with gr.Tabs(): # with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): with gr.Group(): - gr.Markdown( - value= - "*请上传并填写参考信息" - ) + gr.Markdown(value="*请上传并填写参考信息") with gr.Row(): inp_ref = gr.Audio(label="请上传参考音频", type="filepath") - prompt_text= gr.Textbox(label="参考音频的文本",value="") - prompt_language= gr.Dropdown(label="参考音频的语种",choices=["中文","英文","日文"],value="中文") - gr.Markdown( - value= - "*请填写需要合成的目标文本" - ) + prompt_text = gr.Textbox(label="参考音频的文本", value="") + prompt_language = gr.Dropdown( + label="参考音频的语种", choices=["中文", "英文", "日文"], value="中文" + ) + gr.Markdown(value="*请填写需要合成的目标文本") with gr.Row(): - text=gr.Textbox(label="需要合成的文本",value="") - text_language = gr.Dropdown(label="需要合成的语种", choices=["中文", "英文", "日文"],value="中文") - inference_button=gr.Button("合成语音", variant="primary") + text = gr.Textbox(label="需要合成的文本", value="") + text_language = gr.Dropdown( + label="需要合成的语种", choices=["中文", "英文", "日文"], value="中文" + ) + inference_button = gr.Button("合成语音", variant="primary") output = gr.Audio(label="输出的语音") - inference_button.click(get_tts_wav, [inp_ref, prompt_text,prompt_language, text,text_language], [output]) - - gr.Markdown( - value= - "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。" + inference_button.click( + get_tts_wav, + [inp_ref, prompt_text, prompt_language, text, text_language], + [output], ) + + gr.Markdown(value="文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。") with gr.Row(): - text_inp=gr.Textbox(label="需要合成的切分前文本",value="") + text_inp = gr.Textbox(label="需要合成的切分前文本", value="") button1 = gr.Button("凑五句一切", variant="primary") button2 = gr.Button("凑50字一切", variant="primary") button3 = gr.Button("按中文句号。切", variant="primary") text_opt = gr.Textbox(label="切分后文本", value="") - button1.click(cut1,[text_inp],[text_opt]) - button2.click(cut2,[text_inp],[text_opt]) - button3.click(cut3,[text_inp],[text_opt]) - gr.Markdown( - value= - "后续将支持混合语种编码文本输入。" - ) + button1.click(cut1, [text_inp], [text_opt]) 
+ button2.click(cut2, [text_inp], [text_opt]) + button3.click(cut3, [text_inp], [text_opt]) + gr.Markdown(value="后续将支持混合语种编码文本输入。") app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, server_port=infer_ttswebui, quiet=True, -) \ No newline at end of file +) diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 170dbb3..7483337 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -1,11 +1,12 @@ -import os -import sys import traceback from collections import OrderedDict import torch from tools.i18n.i18n import I18nAuto + i18n = I18nAuto() + + def savee(ckpt, name, epoch, steps, hps): try: opt = OrderedDict() @@ -15,8 +16,8 @@ def savee(ckpt, name, epoch, steps, hps): continue opt["weight"][key] = ckpt[key].half() opt["config"] = hps - opt["info"] = "%sepoch_%siteration" % (epoch,steps) - torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir,name)) + opt["info"] = "%sepoch_%siteration" % (epoch, steps) + torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) return "Success." 
except: return traceback.format_exc() diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py index 37166cb..4a77006 100644 --- a/GPT_SoVITS/s1_train.py +++ b/GPT_SoVITS/s1_train.py @@ -2,56 +2,84 @@ import os import pdb -if("_CUDA_VISIBLE_DEVICES"in os.environ): - os.environ["CUDA_VISIBLE_DEVICES"]=os.environ["_CUDA_VISIBLE_DEVICES"] +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] import argparse import logging from pathlib import Path -import torch,platform +import torch, platform from pytorch_lightning import seed_everything from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.loggers import TensorBoardLogger#WandbLogger +from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger from pytorch_lightning.strategies import DDPStrategy from AR.data.data_module import Text2SemanticDataModule from AR.models.t2s_lightning_module import Text2SemanticLightningModule from AR.utils.io import load_yaml_config -logging.getLogger('numba').setLevel(logging.WARNING) -logging.getLogger('matplotlib').setLevel(logging.WARNING) -torch.set_float32_matmul_precision('high') + +logging.getLogger("numba").setLevel(logging.WARNING) +logging.getLogger("matplotlib").setLevel(logging.WARNING) +torch.set_float32_matmul_precision("high") from AR.utils import get_newest_ckpt from collections import OrderedDict + + class my_model_ckpt(ModelCheckpoint): - def __init__(self,config,if_save_latest,if_save_every_weights,half_weights_save_dir,exp_name,**kwargs): + def __init__( + self, + config, + if_save_latest, + if_save_every_weights, + half_weights_save_dir, + exp_name, + **kwargs + ): super().__init__(**kwargs) - self.if_save_latest=if_save_latest - self.if_save_every_weights=if_save_every_weights - self.half_weights_save_dir=half_weights_save_dir - self.exp_name=exp_name - self.config=config + self.if_save_latest = if_save_latest + 
self.if_save_every_weights = if_save_every_weights + self.half_weights_save_dir = half_weights_save_dir + self.exp_name = exp_name + self.config = config def on_train_epoch_end(self, trainer, pl_module): - if not self._should_skip_saving_checkpoint(trainer) and self._should_save_on_train_epoch_end(trainer): + if not self._should_skip_saving_checkpoint( + trainer + ) and self._should_save_on_train_epoch_end(trainer): monitor_candidates = self._monitor_candidates(trainer) - if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0: - if(self.if_save_latest==True):####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt - to_clean=list(os.listdir(self.dirpath)) + if ( + self._every_n_epochs >= 1 + and (trainer.current_epoch + 1) % self._every_n_epochs == 0 + ): + if ( + self.if_save_latest == True + ): ####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt + to_clean = list(os.listdir(self.dirpath)) self._save_topk_checkpoint(trainer, monitor_candidates) - if (self.if_save_latest == True): + if self.if_save_latest == True: for name in to_clean: try: - os.remove("%s/%s"%(self.dirpath,name)) - except:pass - if(self.if_save_every_weights==True): - to_save_od=OrderedDict() - to_save_od["weight"]=OrderedDict() - dictt=trainer.strategy._lightning_module.state_dict() - for key in dictt:to_save_od["weight"][key]=dictt[key].half() - to_save_od["config"]=self.config - to_save_od["info"]="GPT-e%s"%(trainer.current_epoch+1) - torch.save(to_save_od,"%s/%s-e%s.ckpt"%(self.half_weights_save_dir,self.exp_name,trainer.current_epoch+1)) + os.remove("%s/%s" % (self.dirpath, name)) + except: + pass + if self.if_save_every_weights == True: + to_save_od = OrderedDict() + to_save_od["weight"] = OrderedDict() + dictt = trainer.strategy._lightning_module.state_dict() + for key in dictt: + to_save_od["weight"][key] = dictt[key].half() + to_save_od["config"] = self.config + to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) + torch.save( + to_save_od, + "%s/%s-e%s.ckpt" + % ( 
+ self.half_weights_save_dir, + self.exp_name, + trainer.current_epoch + 1, + ), + ) self._save_last_checkpoint(trainer, monitor_candidates) @@ -61,41 +89,45 @@ def main(args): output_dir = Path(config["output_dir"]) output_dir.mkdir(parents=True, exist_ok=True) - ckpt_dir = output_dir / 'ckpt' + ckpt_dir = output_dir / "ckpt" ckpt_dir.mkdir(parents=True, exist_ok=True) - seed_everything(config["train"]["seed"], workers=True) ckpt_callback: ModelCheckpoint = my_model_ckpt( config=config, - if_save_latest=config["train"]["if_save_latest"], if_save_every_weights=config["train"]["if_save_every_weights"], half_weights_save_dir=config["train"]["half_weights_save_dir"], exp_name=config["train"]["exp_name"], + if_save_latest=config["train"]["if_save_latest"], + if_save_every_weights=config["train"]["if_save_every_weights"], + half_weights_save_dir=config["train"]["half_weights_save_dir"], + exp_name=config["train"]["exp_name"], save_top_k=-1, - monitor='top_3_acc', - mode='max', + monitor="top_3_acc", + mode="max", save_on_train_epoch_end=True, every_n_epochs=config["train"]["save_every_n_epoch"], dirpath=ckpt_dir, ) - logger = TensorBoardLogger( - name=output_dir.stem, - save_dir=output_dir - ) + logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir) trainer: Trainer = Trainer( max_epochs=config["train"]["epochs"], - accelerator='gpu', + accelerator="gpu", # val_check_interval=9999999999999999999999,###不要验证 # check_val_every_n_epoch=None, limit_val_batches=0, devices=-1, benchmark=False, fast_dev_run=False, - strategy=DDPStrategy(process_group_backend="nccl"if platform.system()!="Windows"else "gloo"), + strategy=DDPStrategy( + process_group_backend="nccl" if platform.system() != "Windows" else "gloo" + ), precision=config["train"]["precision"], - logger=logger,num_sanity_val_steps=0, - callbacks=[ckpt_callback]) + logger=logger, + num_sanity_val_steps=0, + callbacks=[ckpt_callback], + ) model: Text2SemanticLightningModule = Text2SemanticLightningModule( - 
config, output_dir) + config, output_dir + ) data_module: Text2SemanticDataModule = Text2SemanticDataModule( config, @@ -116,14 +148,15 @@ def main(args): # srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '-c', - '--config_file', + "-c", + "--config_file", type=str, - default='configs/s1longer.yaml', - help='path of config file') + default="configs/s1longer.yaml", + help="path of config file", + ) # args for dataset # parser.add_argument('--train_semantic_path',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/6-name2semantic.tsv') # parser.add_argument('--train_phoneme_path', type=str, default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/2-name2text.txt') diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index 7a455eb..d2ec262 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -1,4 +1,5 @@ -import utils,os +import utils, os + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") import torch @@ -6,11 +7,12 @@ from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter import torch.multiprocessing as mp -import torch.distributed as dist,traceback +import torch.distributed as dist, traceback from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging,traceback +import logging, traceback + logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) @@ -20,37 +22,42 @@ from module import commons from module.data_utils import ( TextAudioSpeakerLoader, TextAudioSpeakerCollate, - DistributedBucketSampler + DistributedBucketSampler, 
) from module.models import ( SynthesizerTrn, MultiPeriodDiscriminator, ) -from module.losses import ( - generator_loss, - discriminator_loss, - feature_loss, - kl_loss -) +from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch from process_ckpt import savee + torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = False ###反正A100fp32更快,那试试tf32吧 torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True -torch.set_float32_matmul_precision('medium')#最低精度但最快(也就快一丁点),对于结果造成不了影响 +torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响 # from config import pretrained_s2G,pretrained_s2D global_step = 0 + + def main(): """Assume Single Node Multi GPUs Training Only""" assert torch.cuda.is_available(), "CPU training is not allowed." n_gpus = torch.cuda.device_count() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = str(randint(20000, 55555)) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) - mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) + mp.spawn( + run, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) def run(rank, n_gpus, hps): @@ -62,21 +69,54 @@ def run(rank, n_gpus, hps): writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) - dist.init_process_group(backend='gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus,rank=rank) + dist.init_process_group( + backend="gloo" if os.name == "nt" else "nccl", + init_method="env://", + world_size=n_gpus, + rank=rank, + ) torch.manual_seed(hps.train.seed) torch.cuda.set_device(rank) - train_dataset = TextAudioSpeakerLoader(hps.data)######## + train_dataset = TextAudioSpeakerLoader(hps.data) ######## train_sampler = DistributedBucketSampler( train_dataset, hps.train.batch_size, - [32, 300, 400, 
500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900], + [ + 32, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1100, + 1200, + 1300, + 1400, + 1500, + 1600, + 1700, + 1800, + 1900, + ], num_replicas=n_gpus, rank=rank, - shuffle=True) + shuffle=True, + ) collate_fn = TextAudioSpeakerCollate() - train_loader = DataLoader(train_dataset, num_workers=6, shuffle=False, pin_memory=True, - collate_fn=collate_fn, batch_sampler=train_sampler,persistent_workers=True,prefetch_factor=16) + train_loader = DataLoader( + train_dataset, + num_workers=6, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=16, + ) # if rank == 0: # eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) # eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, @@ -87,17 +127,21 @@ def run(rank, n_gpus, hps): hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model).cuda(rank) + **hps.model, + ).cuda(rank) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) for name, param in net_g.named_parameters(): if not param.requires_grad: - print(name,"not requires_grad") + print(name, "not requires_grad") te_p = list(map(id, net_g.enc_p.text_embedding.parameters())) et_p = list(map(id, net_g.enc_p.encoder_text.parameters())) mrte_p = list(map(id, net_g.enc_p.mrte.parameters())) - base_params = filter(lambda p: id(p) not in te_p+et_p+mrte_p and p.requires_grad, net_g.parameters()) + base_params = filter( + lambda p: id(p) not in te_p + et_p + mrte_p and p.requires_grad, + net_g.parameters(), + ) # te_p=net_g.enc_p.text_embedding.parameters() # et_p=net_g.enc_p.encoder_text.parameters() @@ -106,31 +150,46 @@ def run(rank, n_gpus, hps): optim_g = torch.optim.AdamW( # filter(lambda p: p.requires_grad, net_g.parameters()),###默认所有层lr一致 [ - 
{"params":base_params,"lr":hps.train.learning_rate}, - {"params":net_g.enc_p.text_embedding.parameters(),"lr":hps.train.learning_rate*hps.train.text_low_lr_rate}, - {"params":net_g.enc_p.encoder_text.parameters(),"lr":hps.train.learning_rate*hps.train.text_low_lr_rate}, - {"params":net_g.enc_p.mrte.parameters(),"lr":hps.train.learning_rate*hps.train.text_low_lr_rate}, + {"params": base_params, "lr": hps.train.learning_rate}, + { + "params": net_g.enc_p.text_embedding.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + { + "params": net_g.enc_p.encoder_text.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + { + "params": net_g.enc_p.mrte.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, ], hps.train.learning_rate, betas=hps.train.betas, - eps=hps.train.eps) + eps=hps.train.eps, + ) optim_d = torch.optim.AdamW( net_d.parameters(), hps.train.learning_rate, betas=hps.train.betas, - eps=hps.train.eps) - net_g = DDP(net_g, device_ids=[rank],find_unused_parameters=True) - net_d = DDP(net_d, device_ids=[rank],find_unused_parameters=True) + eps=hps.train.eps, + ) + net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) try: # 如果能加载自动resume _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2"%hps.data.exp_dir, "D_*.pth"), net_d, optim_d + utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "D_*.pth"), + net_d, + optim_d, ) # D多半加载没事 if rank == 0: logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2"%hps.data.exp_dir, "G_*.pth"), net_g, optim_g + utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "G_*.pth"), + net_g, + optim_g, ) global_step = (epoch_str - 1) * 
len(train_loader) # epoch_str = 1 @@ -144,7 +203,8 @@ def run(rank, n_gpus, hps): logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) print( net_g.module.load_state_dict( - torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],strict=False + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, ) ) ##测试不加载优化器 if hps.train.pretrained_s2D != "": @@ -159,8 +219,12 @@ def run(rank, n_gpus, hps): # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=-1) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=hps.train.lr_decay, last_epoch=-1 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=hps.train.lr_decay, last_epoch=-1 + ) for _ in range(epoch_str): scheduler_g.step() scheduler_d.step() @@ -169,17 +233,39 @@ def run(rank, n_gpus, hps): for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: - train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, - # [train_loader, eval_loader], logger, [writer, writer_eval]) - [train_loader, None], logger, [writer, writer_eval]) + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + # [train_loader, eval_loader], logger, [writer, writer_eval]) + [train_loader, None], + logger, + [writer, writer_eval], + ) else: - train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, - [train_loader, None], None, None) + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + 
[optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + ) scheduler_g.step() scheduler_d.step() -def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): +def train_and_evaluate( + rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers +): net_g, net_d = nets optim_g, optim_d = optims # scheduler_g, scheduler_d = schedulers @@ -192,17 +278,39 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade net_g.train() net_d.train() - for batch_idx, (ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths) in tqdm(enumerate(train_loader)): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) + for batch_idx, ( + ssl, + ssl_lengths, + spec, + spec_lengths, + y, + y_lengths, + text, + text_lengths, + ) in tqdm(enumerate(train_loader)): + spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( + rank, non_blocking=True + ) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( + rank, non_blocking=True + ) ssl = ssl.cuda(rank, non_blocking=True) - ssl.requires_grad=False + ssl.requires_grad = False # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda(rank, non_blocking=True) + text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( + rank, non_blocking=True + ) with autocast(enabled=hps.train.fp16_run): - y_hat, kl_ssl, ids_slice, x_mask, z_mask, \ - (z, z_p, m_p, logs_p, m_q, logs_q), stats_ssl = net_g(ssl, spec, spec_lengths, text, text_lengths) + ( + y_hat, + kl_ssl, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + stats_ssl, + ) = net_g(ssl, spec, spec_lengths, text, text_lengths) mel = spec_to_mel_torch( spec, @@ -210,8 +318,11 
@@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, - hps.data.mel_fmax) - y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1), hps.data.filter_length, @@ -220,15 +331,19 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, - hps.data.mel_fmax + hps.data.mel_fmax, ) - y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice + y = commons.slice_segments( + y, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) loss_disc_all = loss_disc optim_d.zero_grad() scaler.scale(loss_disc_all).backward() @@ -256,32 +371,54 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade if rank == 0: if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] + lr = optim_g.param_groups[0]["lr"] losses = [loss_disc, loss_gen, loss_fm, loss_mel, kl_ssl, loss_kl] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. 
* batch_idx / len(train_loader))) + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) + ) + ) logger.info([x.item() for x in losses] + [global_step, lr]) - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, - "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc_all, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } scalar_dict.update( - {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl_ssl": kl_ssl, "loss/g/kl": loss_kl}) + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl_ssl": kl_ssl, + "loss/g/kl": loss_kl, + } + ) # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - "all/stats_ssl": utils.plot_spectrogram_to_numpy(stats_ssl[0].data.cpu().numpy()), + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + "all/stats_ssl": utils.plot_spectrogram_to_numpy( + stats_ssl[0].data.cpu().numpy() + ), } utils.summarize( writer=writer, global_step=global_step, images=image_dict, - scalars=scalar_dict) + scalars=scalar_dict, + ) global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: @@ -290,14 +427,18 @@ def train_and_evaluate(rank, epoch, 
hps, nets, optims, schedulers, scaler, loade optim_g, hps.train.learning_rate, epoch, - os.path.join("%s/logs_s2"%hps.data.exp_dir, "G_{}.pth".format(global_step)), + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(global_step) + ), ) utils.save_checkpoint( net_d, optim_d, hps.train.learning_rate, epoch, - os.path.join("%s/logs_s2"%hps.data.exp_dir, "D_{}.pth".format(global_step)), + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(global_step) + ), ) else: utils.save_checkpoint( @@ -305,14 +446,18 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade optim_g, hps.train.learning_rate, epoch, - os.path.join("%s/logs_s2"%hps.data.exp_dir, "G_{}.pth".format(233333333333)), + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(233333333333) + ), ) utils.save_checkpoint( net_d, optim_d, hps.train.learning_rate, epoch, - os.path.join("%s/logs_s2"%hps.data.exp_dir, "D_{}.pth".format(233333333333)), + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(233333333333) + ), ) if rank == 0 and hps.train.if_save_every_weights == True: if hasattr(net_g, "module"): @@ -334,11 +479,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade ) ) - - if rank == 0: - logger.info('====> Epoch: {}'.format(epoch)) - + logger.info("====> Epoch: {}".format(epoch)) def evaluate(hps, generator, eval_loader, writer_eval): @@ -347,15 +489,25 @@ def evaluate(hps, generator, eval_loader, writer_eval): audio_dict = {} print("Evaluating ...") with torch.no_grad(): - for batch_idx, (ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths) in enumerate(eval_loader): + for batch_idx, ( + ssl, + ssl_lengths, + spec, + spec_lengths, + y, + y_lengths, + text, + text_lengths, + ) in enumerate(eval_loader): print(111) spec, spec_lengths = spec.cuda(), spec_lengths.cuda() y, y_lengths = y.cuda(), y_lengths.cuda() ssl = ssl.cuda() text, text_lengths = text.cuda(), 
text_lengths.cuda() for test in [0, 1]: - - y_hat, mask, *_ = generator.module.infer(ssl,spec, spec_lengths,text, text_lengths, test=test) + y_hat, mask, *_ = generator.module.infer( + ssl, spec, spec_lengths, text, text_lengths, test=test + ) y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length mel = spec_to_mel_torch( @@ -364,7 +516,8 @@ def evaluate(hps, generator, eval_loader, writer_eval): hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, - hps.data.mel_fmax) + hps.data.mel_fmax, + ) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1).float(), hps.data.filter_length, @@ -373,16 +526,26 @@ def evaluate(hps, generator, eval_loader, writer_eval): hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, - hps.data.mel_fmax + hps.data.mel_fmax, ) - image_dict.update({ - f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) - }) - audio_dict.update({ - f"gen/audio_{batch_idx}_{test}": y_hat[0, :, :y_hat_lengths[0]] - }) - image_dict.update({f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) - audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, :y_lengths[0]]}) + image_dict.update( + { + f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].cpu().numpy() + ) + } + ) + audio_dict.update( + {f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]]} + ) + image_dict.update( + { + f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy( + mel[0].cpu().numpy() + ) + } + ) + audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) # y_hat, mask, *_ = generator.module.infer(ssl, spec_lengths, speakers, y=None) # audio_dict.update({ @@ -394,9 +557,10 @@ def evaluate(hps, generator, eval_loader, writer_eval): global_step=global_step, images=image_dict, audios=audio_dict, - audio_sampling_rate=hps.data.sampling_rate + audio_sampling_rate=hps.data.sampling_rate, ) generator.train() + if __name__ == "__main__": main() diff --git 
a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py index e3ed89b..0ce03b3 100644 --- a/GPT_SoVITS/utils.py +++ b/GPT_SoVITS/utils.py @@ -12,8 +12,9 @@ import numpy as np from scipy.io.wavfile import read import torch import logging -logging.getLogger('numba').setLevel(logging.ERROR) -logging.getLogger('matplotlib').setLevel(logging.ERROR) + +logging.getLogger("numba").setLevel(logging.ERROR) +logging.getLogger("matplotlib").setLevel(logging.ERROR) MATPLOTLIB_FLAG = False @@ -23,13 +24,17 @@ logger = logging def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False): assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None and not skip_optimizer and checkpoint_dict['optimizer'] is not None: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - saved_state_dict = checkpoint_dict['model'] - if hasattr(model, 'module'): + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None + and not skip_optimizer + and checkpoint_dict["optimizer"] is not None + ): + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): state_dict = model.module.state_dict() else: state_dict = model.state_dict() @@ -39,41 +44,63 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False # assert "quantizer" not in k # print("load", k) new_state_dict[k] = saved_state_dict[k] - assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) + assert saved_state_dict[k].shape == v.shape, ( + saved_state_dict[k].shape, + v.shape, + ) except: traceback.print_exc() - print("error, %s is not in the checkpoint" % k)#shape不对也会,比如text_embedding当cleaner修改时 + print( + 
"error, %s is not in the checkpoint" % k + ) # shape不对也会,比如text_embedding当cleaner修改时 new_state_dict[k] = v - if hasattr(model, 'module'): + if hasattr(model, "module"): model.module.load_state_dict(new_state_dict) else: model.load_state_dict(new_state_dict) print("load ") - logger.info("Loaded checkpoint '{}' (iteration {})".format( - checkpoint_path, iteration)) + logger.info( + "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration) + ) return model, optimizer, learning_rate, iteration def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info("Saving model and optimizer state at iteration {} to {}".format( - iteration, checkpoint_path)) - if hasattr(model, 'module'): + logger.info( + "Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): state_dict = model.module.state_dict() else: state_dict = model.state_dict() - torch.save({'model': state_dict, - 'iteration': iteration, - 'optimizer': optimizer.state_dict(), - 'learning_rate': learning_rate}, checkpoint_path) + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) -def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): for k, v in scalars.items(): writer.add_scalar(k, v, global_step) for k, v in histograms.items(): writer.add_histogram(k, v, global_step) for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats='HWC') + writer.add_image(k, v, global_step, dataformats="HWC") for k, v in audios.items(): writer.add_audio(k, v, global_step, audio_sampling_rate) @@ -90,23 +117,23 @@ def plot_spectrogram_to_numpy(spectrogram): global MATPLOTLIB_FLAG if not 
MATPLOTLIB_FLAG: import matplotlib + matplotlib.use("Agg") MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger('matplotlib') + mpl_logger = logging.getLogger("matplotlib") mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt import numpy as np fig, ax = plt.subplots(figsize=(10, 2)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", - interpolation='none') + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") plt.colorbar(im, ax=ax) plt.xlabel("Frames") plt.ylabel("Channels") plt.tight_layout() fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) plt.close() return data @@ -116,26 +143,28 @@ def plot_alignment_to_numpy(alignment, info=None): global MATPLOTLIB_FLAG if not MATPLOTLIB_FLAG: import matplotlib + matplotlib.use("Agg") MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger('matplotlib') + mpl_logger = logging.getLogger("matplotlib") mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt import numpy as np fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', - interpolation='none') + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) fig.colorbar(im, ax=ax) - xlabel = 'Decoder timestep' + xlabel = "Decoder timestep" if info is not None: - xlabel += '\n\n' + info + xlabel += "\n\n" + info plt.xlabel(xlabel) - plt.ylabel('Encoder timestep') + plt.ylabel("Encoder timestep") plt.tight_layout() fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) plt.close() return data @@ -147,16 +176,31 @@ def load_wav_to_torch(full_path): def 
load_filepaths_and_text(filename, split="|"): - with open(filename, encoding='utf-8') as f: + with open(filename, encoding="utf-8") as f: filepaths_and_text = [line.strip().split(split) for line in f] return filepaths_and_text def get_hparams(init=True, stage=1): parser = argparse.ArgumentParser() - parser.add_argument('-c', '--config', type=str, default="./configs/s2.json",help='JSON file for configuration') - parser.add_argument('-p', '--pretrain', type=str, required=False,default=None,help='pretrain dir') - parser.add_argument('-rs', '--resume_step', type=int, required=False,default=None,help='resume step') + parser.add_argument( + "-c", + "--config", + type=str, + default="./configs/s2.json", + help="JSON file for configuration", + ) + parser.add_argument( + "-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir" + ) + parser.add_argument( + "-rs", + "--resume_step", + type=int, + required=False, + default=None, + help="resume step", + ) # parser.add_argument('-e', '--exp_dir', type=str, required=False,default=None,help='experiment directory') # parser.add_argument('-g', '--pretrained_s2G', type=str, required=False,default=None,help='pretrained sovits gererator weights') # parser.add_argument('-d', '--pretrained_s2D', type=str, required=False,default=None,help='pretrained sovits discriminator weights') @@ -172,7 +216,7 @@ def get_hparams(init=True, stage=1): hparams.pretrain = args.pretrain hparams.resume_step = args.resume_step # hparams.data.exp_dir = args.exp_dir - if stage ==1: + if stage == 1: model_dir = hparams.s1_ckpt_dir else: model_dir = hparams.s2_ckpt_dir @@ -186,29 +230,38 @@ def get_hparams(init=True, stage=1): return hparams - -def clean_checkpoints(path_to_models='logs/44k/', n_ckpts_to_keep=2, sort_by_time=True): +def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True): """Freeing up space by deleting saved ckpts - Arguments: - path_to_models -- Path to the model directory - 
n_ckpts_to_keep -- Number of ckpts to keep, excluding G_0.pth and D_0.pth - sort_by_time -- True -> chronologically delete ckpts - False -> lexicographically delete ckpts - """ + Arguments: + path_to_models -- Path to the model directory + n_ckpts_to_keep -- Number of ckpts to keep, excluding G_0.pth and D_0.pth + sort_by_time -- True -> chronologically delete ckpts + False -> lexicographically delete ckpts + """ import re - ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] - name_key = (lambda _f: int(re.compile('._(\d+)\.pth').match(_f).group(1))) - time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f))) + + ckpts_files = [ + f + for f in os.listdir(path_to_models) + if os.path.isfile(os.path.join(path_to_models, f)) + ] + name_key = lambda _f: int(re.compile("._(\d+)\.pth").match(_f).group(1)) + time_key = lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)) sort_key = time_key if sort_by_time else name_key - x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pth')], - key=sort_key) - to_del = [os.path.join(path_to_models, fn) for fn in - (x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])] + x_sorted = lambda _x: sorted( + [f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], + key=sort_key, + ) + to_del = [ + os.path.join(path_to_models, fn) + for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + ] del_info = lambda fn: logger.info(f".. 
Free up space by deleting ckpt {fn}") del_routine = lambda x: [os.remove(x), del_info(x)] rs = [del_routine(fn) for fn in to_del] + def get_hparams_from_dir(model_dir): config_save_path = os.path.join(model_dir, "config.json") with open(config_save_path, "r") as f: @@ -228,12 +281,15 @@ def get_hparams_from_file(config_path): hparams = HParams(**config) return hparams + def check_git_hash(model_dir): source_dir = os.path.dirname(os.path.realpath(__file__)) if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - )) + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) return cur_hash = subprocess.getoutput("git rev-parse HEAD") @@ -242,8 +298,11 @@ def check_git_hash(model_dir): if os.path.exists(path): saved_hash = open(path).read() if saved_hash != cur_hash: - logger.warn("git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8])) + logger.warn( + "git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) else: open(path, "w").write(cur_hash) @@ -263,7 +322,7 @@ def get_logger(model_dir, filename="train.log"): return logger -class HParams(): +class HParams: def __init__(self, **kwargs): for k, v in kwargs.items(): if type(v) == dict: @@ -294,5 +353,10 @@ class HParams(): def __repr__(self): return self.__dict__.__repr__() -if __name__ == '__main__': - print(load_wav_to_torch('/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac')) \ No newline at end of file + +if __name__ == "__main__": + print( + load_wav_to_torch( + "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac" + ) + ) From 0d3d47f3c31d1ae0ab99d10df8b2c3a2e2d1bb94 Mon Sep 17 00:00:00 2001 From: Blaise <133521603+blaise-tk@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:14:18 +0100 Subject: [PATCH 10/58] more code refactor --- GPT_SoVITS/AR/data/bucket_sampler.py | 60 +- GPT_SoVITS/AR/data/data_module.py | 30 +- GPT_SoVITS/AR/data/dataset.py | 175 +-- GPT_SoVITS/AR/models/t2s_lightning_module.py | 132 +- GPT_SoVITS/AR/models/t2s_model.py | 231 ++-- GPT_SoVITS/AR/models/utils.py | 28 +- GPT_SoVITS/AR/modules/activation.py | 161 ++- GPT_SoVITS/AR/modules/embedding.py | 35 +- GPT_SoVITS/AR/modules/lr_schedulers.py | 39 +- .../AR/modules/patched_mha_with_cache.py | 177 ++- GPT_SoVITS/AR/modules/scaling.py | 120 +- GPT_SoVITS/AR/modules/transformer.py | 191 +-- GPT_SoVITS/AR/text_processing/phonemizer.py | 30 +- GPT_SoVITS/AR/text_processing/symbols.py | 4 +- GPT_SoVITS/AR/utils/io.py | 22 +- GPT_SoVITS/configs/s1.yaml | 52 +- GPT_SoVITS/configs/s1big.yaml | 52 +- GPT_SoVITS/configs/s1big2.yaml | 52 +- GPT_SoVITS/configs/s1longer.yaml | 52 +- GPT_SoVITS/configs/s1mq.yaml | 144 +-- GPT_SoVITS/configs/train.yaml | 52 +- GPT_SoVITS/feature_extractor/cnhubert.py | 25 +- GPT_SoVITS/feature_extractor/whisper_enc.py | 11 +- GPT_SoVITS/module/attentions.py | 1075 ++++++++++------- GPT_SoVITS/module/commons.py 
| 236 ++-- GPT_SoVITS/module/core_vq.py | 52 +- GPT_SoVITS/module/data_utils.py | 187 ++- GPT_SoVITS/module/losses.py | 97 +- GPT_SoVITS/module/mel_processing.py | 98 +- GPT_SoVITS/module/models.py | 617 ++++++---- GPT_SoVITS/module/modules.py | 824 ++++++++----- GPT_SoVITS/module/mrte_model.py | 204 ++-- GPT_SoVITS/module/quantize.py | 23 +- GPT_SoVITS/module/transforms.py | 172 +-- GPT_SoVITS/prepare_datasets/0-pipeline.py | 67 +- GPT_SoVITS/prepare_datasets/1-get-text.py | 120 +- .../prepare_datasets/2-get-hubert-wav32k.py | 111 +- GPT_SoVITS/prepare_datasets/3-get-semantic.py | 79 +- GPT_SoVITS/text/chinese.py | 97 +- GPT_SoVITS/text/cleaner.py | 26 +- GPT_SoVITS/text/english.py | 104 +- GPT_SoVITS/text/japanese.py | 70 +- GPT_SoVITS/text/symbols.py | 395 +++++- GPT_SoVITS/text/tone_sandhi.py | 610 ++++++++-- 44 files changed, 4516 insertions(+), 2623 deletions(-) diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py index ee59479..7d752db 100644 --- a/GPT_SoVITS/AR/data/bucket_sampler.py +++ b/GPT_SoVITS/AR/data/bucket_sampler.py @@ -16,7 +16,7 @@ __all__ = [ "DistributedBucketSampler", ] -T_co = TypeVar('T_co', covariant=True) +T_co = TypeVar("T_co", covariant=True) class DistributedBucketSampler(Sampler[T_co]): @@ -28,28 +28,30 @@ class DistributedBucketSampler(Sampler[T_co]): sort batches """ - def __init__(self, - dataset: Dataset, - num_replicas: Optional[int]=None, - rank: Optional[int]=None, - shuffle: bool=True, - seed: int=0, - drop_last: bool=False, - batch_size: int=32) -> None: + def __init__( + self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False, + batch_size: int = 32, + ) -> None: if num_replicas is None: if not dist.is_available(): - raise RuntimeError( - "Requires distributed package to be available") + raise RuntimeError("Requires distributed package to be available") num_replicas = 
dist.get_world_size() if rank is None: if not dist.is_available(): - raise RuntimeError( - "Requires distributed package to be available") + raise RuntimeError("Requires distributed package to be available") rank = dist.get_rank() torch.cuda.set_device(rank) if rank >= num_replicas or rank < 0: - raise ValueError("Invalid rank {}, rank should be in the interval" - " [0, {}]".format(rank, num_replicas - 1)) + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1) + ) self.dataset = dataset self.num_replicas = num_replicas self.rank = rank @@ -57,19 +59,20 @@ class DistributedBucketSampler(Sampler[T_co]): self.drop_last = drop_last # If the dataset length is evenly divisible by # of replicas, then there # is no need to drop any data, since the dataset will be split equally. - if self.drop_last and len( - self. - dataset) % self.num_replicas != 0: # type: ignore[arg-type] + if ( + self.drop_last and len(self.dataset) % self.num_replicas != 0 + ): # type: ignore[arg-type] # Split to nearest available length that is evenly divisible. # This is to ensure each rank receives the same amount of data when # using this Sampler. 
self.num_samples = math.ceil( - (len(self.dataset) - self.num_replicas) / - self.num_replicas # type: ignore[arg-type] + (len(self.dataset) - self.num_replicas) + / self.num_replicas # type: ignore[arg-type] ) else: self.num_samples = math.ceil( - len(self.dataset) / self.num_replicas) # type: ignore[arg-type] + len(self.dataset) / self.num_replicas + ) # type: ignore[arg-type] self.total_size = self.num_samples * self.num_replicas self.shuffle = shuffle self.seed = seed @@ -84,7 +87,7 @@ class DistributedBucketSampler(Sampler[T_co]): id_with_lengths.sort(key=lambda x: x[1]) return id_with_lengths - def make_buckets(self, bucket_width: float=2.0): + def make_buckets(self, bucket_width: float = 2.0): buckets = [] cur = [] max_sec = bucket_width @@ -114,8 +117,8 @@ class DistributedBucketSampler(Sampler[T_co]): shuffled_bucket = list(itertools.chain(*shuffled_bucket)) n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) batches = [ - shuffled_bucket[b * grouped_batch_size:(b + 1) * - grouped_batch_size] for b in range(n_batch) + shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] + for b in range(n_batch) ] shuffle(batches) indices = list(itertools.chain(*batches)) @@ -129,15 +132,16 @@ class DistributedBucketSampler(Sampler[T_co]): if padding_size <= len(indices): indices += indices[:padding_size] else: - indices += (indices * math.ceil(padding_size / - len(indices)))[:padding_size] + indices += (indices * math.ceil(padding_size / len(indices)))[ + :padding_size + ] else: # remove tail of data to make it evenly divisible. 
- indices = indices[:self.total_size] + indices = indices[: self.total_size] assert len(indices) == self.total_size # subsample - indices = indices[self.rank:self.total_size:self.num_replicas] + indices = indices[self.rank : self.total_size : self.num_replicas] assert len(indices) == self.num_samples return iter(indices) diff --git a/GPT_SoVITS/AR/data/data_module.py b/GPT_SoVITS/AR/data/data_module.py index 4c300f1..f3d895a 100644 --- a/GPT_SoVITS/AR/data/data_module.py +++ b/GPT_SoVITS/AR/data/data_module.py @@ -6,14 +6,21 @@ from torch.utils.data import DataLoader class Text2SemanticDataModule(LightningDataModule): - def __init__(self, config, train_semantic_path, train_phoneme_path,dev_semantic_path=None, dev_phoneme_path=None): + def __init__( + self, + config, + train_semantic_path, + train_phoneme_path, + dev_semantic_path=None, + dev_phoneme_path=None, + ): super().__init__() self.config = config self.train_semantic_path = train_semantic_path self.train_phoneme_path = train_phoneme_path self.dev_semantic_path = dev_semantic_path self.dev_phoneme_path = dev_phoneme_path - self.num_workers = self.config['data']['num_workers'] + self.num_workers = self.config["data"]["num_workers"] def prepare_data(self): pass @@ -22,8 +29,9 @@ class Text2SemanticDataModule(LightningDataModule): self._train_dataset = Text2SemanticDataset( phoneme_path=self.train_phoneme_path, semantic_path=self.train_semantic_path, - max_sec=self.config['data']['max_sec'], - pad_val=self.config['data']['pad_val']) + max_sec=self.config["data"]["max_sec"], + pad_val=self.config["data"]["pad_val"], + ) self._dev_dataset = self._train_dataset # self._dev_dataset = Text2SemanticDataset( # phoneme_path=self.dev_phoneme_path, @@ -33,9 +41,8 @@ class Text2SemanticDataModule(LightningDataModule): # pad_val=self.config['data']['pad_val']) def train_dataloader(self): - batch_size = self.config['train']['batch_size'] - sampler = DistributedBucketSampler( - self._train_dataset, batch_size=batch_size) + 
batch_size = self.config["train"]["batch_size"] + sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) return DataLoader( self._train_dataset, batch_size=batch_size, @@ -43,7 +50,7 @@ class Text2SemanticDataModule(LightningDataModule): collate_fn=self._train_dataset.collate, num_workers=self.num_workers, persistent_workers=True, - prefetch_factor=16 + prefetch_factor=16, ) def val_dataloader(self): @@ -52,9 +59,9 @@ class Text2SemanticDataModule(LightningDataModule): batch_size=1, shuffle=False, collate_fn=self._train_dataset.collate, - num_workers=max(self.num_workers,12), + num_workers=max(self.num_workers, 12), persistent_workers=True, - prefetch_factor=16 + prefetch_factor=16, ) # 这个会使用到嘛? @@ -63,4 +70,5 @@ class Text2SemanticDataModule(LightningDataModule): self._dev_dataset, batch_size=1, shuffle=False, - collate_fn=self._train_dataset.collate) + collate_fn=self._train_dataset.collate, + ) diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py index 72c9e2e..47adacc 100644 --- a/GPT_SoVITS/AR/data/dataset.py +++ b/GPT_SoVITS/AR/data/dataset.py @@ -1,21 +1,24 @@ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/t2s_dataset.py import pdb import sys + # sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert") -import traceback,os +import traceback, os from typing import Dict from typing import List import numpy as np import pandas as pd -import torch,json +import torch, json from torch.utils.data import DataLoader from torch.utils.data import Dataset from transformers import AutoTokenizer from text import cleaned_text_to_sequence + # from config import exp_dir + def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0): seq = sequences[0] ndim = seq.ndim @@ -28,44 +31,52 @@ def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0 padded_sequences = [] for seq, length in zip(sequences, seq_lengths): - padding = [(0, 0)] * axis 
+ [(0, max_length - length)] + [(0, 0)] * ( - ndim - axis - 1) - padded_seq = np.pad( - seq, padding, mode='constant', constant_values=pad_value) + padding = ( + [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) + ) + padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value) padded_sequences.append(padded_seq) batch = np.stack(padded_sequences) return batch + class Text2SemanticDataset(Dataset): """dataset class for text tokens to semantic model training.""" - def __init__(self, - phoneme_path: str, - semantic_path: str, - max_sample: int = None, - max_sec: int = 100, - pad_val: int = 1024, - # min value of phoneme/sec - min_ps_ratio: int = 3, - # max value of phoneme/sec - max_ps_ratio: int = 25) -> None: + def __init__( + self, + phoneme_path: str, + semantic_path: str, + max_sample: int = None, + max_sec: int = 100, + pad_val: int = 1024, + # min value of phoneme/sec + min_ps_ratio: int = 3, + # max value of phoneme/sec + max_ps_ratio: int = 25, + ) -> None: super().__init__() - self.semantic_data = pd.read_csv(semantic_path, delimiter='\t', encoding="utf-8") + self.semantic_data = pd.read_csv( + semantic_path, delimiter="\t", encoding="utf-8" + ) # get dict - self.path2=phoneme_path#"%s/2-name2text.txt"%exp_dir#phoneme_path - self.path3="%s/3-bert"%(os.path.basename(phoneme_path))#"%s/3-bert"%exp_dir#bert_dir - self.path6=semantic_path#"%s/6-name2semantic.tsv"%exp_dir#semantic_path + self.path2 = phoneme_path # "%s/2-name2text.txt"%exp_dir#phoneme_path + self.path3 = "%s/3-bert" % ( + os.path.basename(phoneme_path) + ) # "%s/3-bert"%exp_dir#bert_dir + self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path assert os.path.exists(self.path2) assert os.path.exists(self.path6) - self.phoneme_data={} - with open(self.path2,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") + self.phoneme_data = {} + with open(self.path2, "r", encoding="utf8") as f: + lines = 
f.read().strip("\n").split("\n") for line in lines: - tmp=line.split("\t") - if(len(tmp)!=4):continue - self.phoneme_data[tmp[0]]=[tmp[1],tmp[2],tmp[3]] + tmp = line.split("\t") + if len(tmp) != 4: + continue + self.phoneme_data[tmp[0]] = [tmp[1], tmp[2], tmp[3]] # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item() # pad for semantic tokens @@ -74,7 +85,7 @@ class Text2SemanticDataset(Dataset): # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f:data = f.read() # data=json.loads(data)["model"]["semantic_frame_rate"]#50hz # self.hz=int(data[:-2])# - self.hz=int(os.environ.get("hz","25hz")[:-2]) + self.hz = int(os.environ.get("hz", "25hz")[:-2]) # max seconds of semantic token self.max_sec = max_sec @@ -100,7 +111,6 @@ class Text2SemanticDataset(Dataset): # self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large") # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large") - def init_batch(self): semantic_data_len = len(self.semantic_data) phoneme_data_len = len(self.phoneme_data.keys()) @@ -113,7 +123,7 @@ class Text2SemanticDataset(Dataset): for i in range(semantic_data_len): # 先依次遍历 # get str - item_name = self.semantic_data['item_name'][i] + item_name = self.semantic_data["item_name"][i] # print(self.phoneme_data) try: phoneme, word2ph, text = self.phoneme_data[item_name] @@ -123,16 +133,18 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue - semantic_str = self.semantic_data['semantic_audio'][i] + semantic_str = self.semantic_data["semantic_audio"][i] # get token list - semantic_ids = [int(idx) for idx in semantic_str.split(' ')] + semantic_ids = [int(idx) for idx in semantic_str.split(" ")] # (T), 是否需要变成 (1, T) -> 不需要,因为需要求 len # 过滤掉太长的样本 - if len(semantic_ids) > self.max_sec * self.hz:#########1###根据token个数推测总时长过滤时长60s(config里)#40*25=1k + if ( + len(semantic_ids) > 
self.max_sec * self.hz + ): #########1###根据token个数推测总时长过滤时长60s(config里)#40*25=1k num_deleted_bigger += 1 continue # (T, ), 这个速度不会很慢,所以可以在一开始就处理,无需在 __getitem__ 里面单个处理#### - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") try: phoneme_ids = cleaned_text_to_sequence(phoneme) @@ -142,7 +154,9 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue # if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行 - if len(phoneme_ids) >self.max_sec * self.hz/2.5:###########2:改为恒定限制为semantic/2.5就行 + if ( + len(phoneme_ids) > self.max_sec * self.hz / 2.5 + ): ###########2:改为恒定限制为semantic/2.5就行 num_deleted_ps += 1 continue # if len(semantic_ids) > 1000:###########3 @@ -151,7 +165,9 @@ class Text2SemanticDataset(Dataset): ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz) - if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio:##########4#3~25#每秒多少个phone + if ( + ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio + ): ##########4#3~25#每秒多少个phone num_deleted_ps += 1 # print(item_name) continue @@ -160,16 +176,16 @@ class Text2SemanticDataset(Dataset): idx += 1 self.item_names.append(item_name) - min_num=100#20直接不补#30补了也不存ckpt - leng =len(self.semantic_phoneme) - if(leng 0: print(f"there are {num_not_in} semantic datas not in phoneme datas") if num_deleted_bigger > 0: @@ -181,13 +197,13 @@ class Text2SemanticDataset(Dataset): print( f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}" ) - ''' + """ there are 31 semantic datas not in phoneme datas deleted 34 audios who's duration are bigger than 54 seconds deleted 3190 audios who's phoneme/sec are bigger than 25 or smaller than 3 dataset.__len__(): 366463 - ''' + """ # 345410 for LibriTTS print("dataset.__len__():", self.__len__()) @@ -204,22 +220,24 @@ class Text2SemanticDataset(Dataset): # semantic tokens target semantic_ids_len = len(semantic_ids) - flag=0 + flag = 0 path_bert = "%s/%s.pt" % (self.path3, 
item_name) - if(os.path.exists(path_bert)==True):bert_feature = torch.load(path_bert,map_location="cpu") - else:flag=1 - if(flag==1): + if os.path.exists(path_bert) == True: + bert_feature = torch.load(path_bert, map_location="cpu") + else: + flag = 1 + if flag == 1: # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32) - bert_feature=None + bert_feature = None else: assert bert_feature.shape[-1] == len(phoneme_ids) return { - 'idx': idx, - 'phoneme_ids': phoneme_ids, - 'phoneme_ids_len': phoneme_ids_len, - 'semantic_ids': semantic_ids, - 'semantic_ids_len': semantic_ids_len, - 'bert_feature': bert_feature, + "idx": idx, + "phoneme_ids": phoneme_ids, + "phoneme_ids_len": phoneme_ids_len, + "semantic_ids": semantic_ids, + "semantic_ids_len": semantic_ids_len, + "bert_feature": bert_feature, } def get_sample_length(self, idx: int): @@ -235,7 +253,6 @@ class Text2SemanticDataset(Dataset): semantic_ids_lens: List[int] = [] # return - for item in examples: sample_index.append(item["idx"]) phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64)) @@ -256,9 +273,9 @@ class Text2SemanticDataset(Dataset): bert_padded.zero_() for idx, item in enumerate(examples): - bert = item['bert_feature'] - if(bert!=None): - bert_padded[idx, :, :bert.shape[-1]] = bert + bert = item["bert_feature"] + if bert != None: + bert_padded[idx, :, : bert.shape[-1]] = bert return { # List[int] @@ -276,27 +293,27 @@ class Text2SemanticDataset(Dataset): } -if __name__ == '__main__': - root_dir = '/data/docker/liujing04/gpt-vits/prepare/dump_mix/' +if __name__ == "__main__": + root_dir = "/data/docker/liujing04/gpt-vits/prepare/dump_mix/" dataset = Text2SemanticDataset( - phoneme_path=root_dir + 'phoneme_train.npy', - semantic_path=root_dir + 'semantic_train.tsv') + phoneme_path=root_dir + "phoneme_train.npy", + semantic_path=root_dir + "semantic_train.tsv", + ) batch_size = 12 dataloader = DataLoader( - dataset, - batch_size=batch_size, - collate_fn=dataset.collate, - 
shuffle=False) + dataset, batch_size=batch_size, collate_fn=dataset.collate, shuffle=False + ) for i, batch in enumerate(dataloader): - if(i%1000==0):print(i) + if i % 1000 == 0: + print(i) # if i == 0: # print('batch["ids"]:', batch["ids"]) - # print('batch["phoneme_ids"]:', batch["phoneme_ids"], - # batch["phoneme_ids"].shape) - # print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"], - # batch["phoneme_ids_len"].shape) - # print('batch["semantic_ids"]:', batch["semantic_ids"], - # batch["semantic_ids"].shape) - # print('batch["semantic_ids_len"]:', batch["semantic_ids_len"], - # batch["semantic_ids_len"].shape) + # print('batch["phoneme_ids"]:', batch["phoneme_ids"], + # batch["phoneme_ids"].shape) + # print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"], + # batch["phoneme_ids_len"].shape) + # print('batch["semantic_ids"]:', batch["semantic_ids"], + # batch["semantic_ids"].shape) + # print('batch["semantic_ids_len"]:', batch["semantic_ids_len"], + # batch["semantic_ids_len"].shape) diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 149d88e..f9dfc64 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -1,5 +1,6 @@ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py -import os,sys +import os, sys + now_dir = os.getcwd() sys.path.append(now_dir) from typing import Dict @@ -12,29 +13,35 @@ from AR.modules.optim import ScaledAdam class Text2SemanticLightningModule(LightningModule): - def __init__(self, config, output_dir,is_train=True): + def __init__(self, config, output_dir, is_train=True): super().__init__() self.config = config self.top_k = 3 self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) - pretrained_s1=config.get("pretrained_s1") - if(pretrained_s1 and is_train): + pretrained_s1 = config.get("pretrained_s1") + if pretrained_s1 and is_train: # 
print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) - print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["weight"])) + print( + self.load_state_dict( + torch.load(pretrained_s1, map_location="cpu")["weight"] + ) + ) if is_train: self.automatic_optimization = False self.save_hyperparameters() - self.eval_dir = output_dir / 'eval' + self.eval_dir = output_dir / "eval" self.eval_dir.mkdir(parents=True, exist_ok=True) def training_step(self, batch: Dict, batch_idx: int): - opt = self.optimizers() scheduler = self.lr_schedulers() loss, acc = self.model.forward( - batch['phoneme_ids'], batch['phoneme_ids_len'], - batch['semantic_ids'], batch['semantic_ids_len'], - batch['bert_feature']) + batch["phoneme_ids"], + batch["phoneme_ids_len"], + batch["semantic_ids"], + batch["semantic_ids_len"], + batch["bert_feature"], + ) self.manual_backward(loss) if batch_idx > 0 and batch_idx % 4 == 0: opt.step() @@ -47,63 +54,67 @@ class Text2SemanticLightningModule(LightningModule): on_step=True, on_epoch=True, prog_bar=True, - sync_dist=True) + sync_dist=True, + ) self.log( "lr", scheduler.get_last_lr()[0], on_epoch=True, prog_bar=True, - sync_dist=True) + sync_dist=True, + ) self.log( f"top_{self.top_k}_acc", acc, on_step=True, on_epoch=True, prog_bar=True, - sync_dist=True) + sync_dist=True, + ) - def validation_step(self, batch: Dict, batch_idx: int):return - # # get loss - # loss, acc = self.model.forward( - # batch['phoneme_ids'], batch['phoneme_ids_len'], - # batch['semantic_ids'], batch['semantic_ids_len'], - # batch['bert_feature'] - # ) - # - # self.log( - # "val_total_loss", - # loss, - # on_step=True, - # on_epoch=True, - # prog_bar=True, - # sync_dist=True) - # self.log( - # f"val_top_{self.top_k}_acc", - # acc, - # on_step=True, - # on_epoch=True, - # prog_bar=True, - # sync_dist=True) - # - # # get infer output - # semantic_len = batch['semantic_ids'].size(1) - # prompt_len = min(int(semantic_len * 0.5), 150) - # 
prompt = batch['semantic_ids'][:, :prompt_len] - # pred_semantic = self.model.infer(batch['phoneme_ids'], - # batch['phoneme_ids_len'], prompt, - # batch['bert_feature'] - # ) - # save_name = f'semantic_toks_{batch_idx}.pt' - # save_path = os.path.join(self.eval_dir, save_name) - # torch.save(pred_semantic.detach().cpu(), save_path) + def validation_step(self, batch: Dict, batch_idx: int): + return + + # # get loss + # loss, acc = self.model.forward( + # batch['phoneme_ids'], batch['phoneme_ids_len'], + # batch['semantic_ids'], batch['semantic_ids_len'], + # batch['bert_feature'] + # ) + # + # self.log( + # "val_total_loss", + # loss, + # on_step=True, + # on_epoch=True, + # prog_bar=True, + # sync_dist=True) + # self.log( + # f"val_top_{self.top_k}_acc", + # acc, + # on_step=True, + # on_epoch=True, + # prog_bar=True, + # sync_dist=True) + # + # # get infer output + # semantic_len = batch['semantic_ids'].size(1) + # prompt_len = min(int(semantic_len * 0.5), 150) + # prompt = batch['semantic_ids'][:, :prompt_len] + # pred_semantic = self.model.infer(batch['phoneme_ids'], + # batch['phoneme_ids_len'], prompt, + # batch['bert_feature'] + # ) + # save_name = f'semantic_toks_{batch_idx}.pt' + # save_path = os.path.join(self.eval_dir, save_name) + # torch.save(pred_semantic.detach().cpu(), save_path) def configure_optimizers(self): model_parameters = self.model.parameters() parameters_names = [] - parameters_names.append([ - name_param_pair[0] - for name_param_pair in self.model.named_parameters() - ]) + parameters_names.append( + [name_param_pair[0] for name_param_pair in self.model.named_parameters()] + ) lm_opt = ScaledAdam( model_parameters, lr=0.01, @@ -111,18 +122,19 @@ class Text2SemanticLightningModule(LightningModule): clipping_scale=2.0, parameters_names=parameters_names, show_dominant_parameters=False, - clipping_update_period=1000, ) + clipping_update_period=1000, + ) return { "optimizer": lm_opt, "lr_scheduler": { - "scheduler": - WarmupCosineLRSchedule( + 
"scheduler": WarmupCosineLRSchedule( lm_opt, - init_lr=self.config['optimizer']['lr_init'], - peak_lr=self.config['optimizer']['lr'], - end_lr=self.config['optimizer']['lr_end'], - warmup_steps=self.config['optimizer']['warmup_steps'], - total_steps=self.config['optimizer']['decay_steps']) - } + init_lr=self.config["optimizer"]["lr_init"], + peak_lr=self.config["optimizer"]["lr"], + end_lr=self.config["optimizer"]["lr_end"], + warmup_steps=self.config["optimizer"]["warmup_steps"], + total_steps=self.config["optimizer"]["decay_steps"], + ) + }, } diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index 9f5337e..9f8330b 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -3,7 +3,12 @@ import torch from tqdm import tqdm from AR.models.utils import make_pad_mask -from AR.models.utils import topk_sampling,sample,logits_to_probs,multinomial_sample_one_no_sync +from AR.models.utils import ( + topk_sampling, + sample, + logits_to_probs, + multinomial_sample_one_no_sync, +) from AR.modules.embedding import SinePositionalEmbedding from AR.modules.embedding import TokenEmbedding from AR.modules.transformer import LayerNorm @@ -22,35 +27,39 @@ default_config = { "p_dropout": 0.0, "vocab_size": 1024 + 1, "phoneme_vocab_size": 512, - "EOS": 1024 + "EOS": 1024, } class Text2SemanticDecoder(nn.Module): def __init__(self, config, norm_first=False, top_k=3): super(Text2SemanticDecoder, self).__init__() - self.model_dim = config['model']["hidden_dim"] - self.embedding_dim = config['model']["embedding_dim"] - self.num_head = config['model']["head"] - self.num_layers = config['model']["n_layer"] + self.model_dim = config["model"]["hidden_dim"] + self.embedding_dim = config["model"]["embedding_dim"] + self.num_head = config["model"]["head"] + self.num_layers = config["model"]["n_layer"] self.norm_first = norm_first - self.vocab_size = config['model']["vocab_size"] - self.phoneme_vocab_size = 
config['model']["phoneme_vocab_size"] - self.p_dropout = config['model']["dropout"] - self.EOS = config['model']["EOS"] + self.vocab_size = config["model"]["vocab_size"] + self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] + self.p_dropout = config["model"]["dropout"] + self.EOS = config["model"]["EOS"] self.norm_first = norm_first assert self.EOS == self.vocab_size - 1 # should be same as num of kmeans bin # assert self.EOS == 1024 self.bert_proj = nn.Linear(1024, self.embedding_dim) self.ar_text_embedding = TokenEmbedding( - self.embedding_dim, self.phoneme_vocab_size, self.p_dropout) + self.embedding_dim, self.phoneme_vocab_size, self.p_dropout + ) self.ar_text_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True) + self.embedding_dim, dropout=0.1, scale=False, alpha=True + ) self.ar_audio_embedding = TokenEmbedding( - self.embedding_dim, self.vocab_size, self.p_dropout) + self.embedding_dim, self.vocab_size, self.p_dropout + ) self.ar_audio_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True) + self.embedding_dim, dropout=0.1, scale=False, alpha=True + ) self.h = TransformerEncoder( TransformerEncoderLayer( @@ -59,28 +68,30 @@ class Text2SemanticDecoder(nn.Module): dim_feedforward=self.model_dim * 4, dropout=0.1, batch_first=True, - norm_first=norm_first, ), + norm_first=norm_first, + ), num_layers=self.num_layers, - norm=LayerNorm(self.model_dim) if norm_first else None, ) + norm=LayerNorm(self.model_dim) if norm_first else None, + ) - self.ar_predict_layer = nn.Linear( - self.model_dim, self.vocab_size, bias=False) - self.loss_fct = nn.CrossEntropyLoss(reduction='sum') + self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) + self.loss_fct = nn.CrossEntropyLoss(reduction="sum") self.ar_accuracy_metric = MulticlassAccuracy( self.vocab_size, top_k=top_k, average="micro", multidim_average="global", - ignore_index=self.EOS, ) + 
ignore_index=self.EOS, + ) def forward(self, x, x_lens, y, y_lens, bert_feature): - ''' + """ x: phoneme_ids y: semantic_ids - ''' + """ x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1,2)) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) x = self.ar_text_position(x) x_mask = make_pad_mask(x_lens) @@ -102,18 +113,23 @@ class Text2SemanticDecoder(nn.Module): x_attn_mask = F.pad( torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), (0, y_len), - value=True, ) + value=True, + ) y_attn_mask = F.pad( torch.triu( torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), - diagonal=1, ), + diagonal=1, + ), (x_len, 0), - value=False, ) + value=False, + ) xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) bsz, src_len = x.shape[0], x_len + y_len - _xy_padding_mask = (ar_xy_padding_mask.view(bsz, 1, 1, src_len) - .expand(-1, self.num_head, -1, -1) - .reshape(bsz * self.num_head, 1, src_len)) + _xy_padding_mask = ( + ar_xy_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, self.num_head, -1, -1) + .reshape(bsz * self.num_head, 1, src_len) + ) xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) @@ -122,26 +138,28 @@ class Text2SemanticDecoder(nn.Module): xy_pos = torch.concat([x, y_pos], dim=1) xy_dec, _ = self.h( (xy_pos, None), - mask=xy_attn_mask, ) + mask=xy_attn_mask, + ) logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1) # loss # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum - loss = F.cross_entropy(logits, targets, reduction='sum') + loss = F.cross_entropy(logits, targets, reduction="sum") acc = self.ar_accuracy_metric(logits.detach(), targets).item() return loss, acc # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么 - def infer(self, - x, - x_lens, - prompts, - bert_feature, - top_k: int=-100, - early_stop_num: int=-1, - temperature: float=1.0): - + def infer( + 
self, + x, + x_lens, + prompts, + bert_feature, + top_k: int = -100, + early_stop_num: int = -1, + temperature: float = 1.0, + ): x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1,2)) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) x = self.ar_text_position(x) # AR Decoder @@ -159,35 +177,37 @@ class Text2SemanticDecoder(nn.Module): x_attn_mask_pad = F.pad( x_attn_mask, (0, y_len), - value=True, ) + value=True, + ) y_attn_mask = F.pad( - torch.triu( - torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), (x_len, 0), - value=False, ) - xy_attn_mask = torch.concat( - [x_attn_mask_pad, y_attn_mask], dim=0).to(y.device) + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( + y.device + ) xy_dec, _ = self.h( (xy_pos, None), - mask=xy_attn_mask, ) + mask=xy_attn_mask, + ) logits = self.ar_predict_layer(xy_dec[:, -1]) samples = topk_sampling( - logits, top_k=top_k, top_p=1.0, temperature=temperature) + logits, top_k=top_k, top_p=1.0, temperature=temperature + ) - if early_stop_num != -1 and (y.shape[1] - prefix_len - ) > early_stop_num: + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: print("use early stop num:", early_stop_num) stop = True - if torch.argmax( - logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS) stop = True if stop: if prompts.shape[1] == y.shape[1]: y = torch.concat([y, torch.zeros_like(samples)], dim=1) - print('bad zero prediction') + print("bad zero prediction") print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") break # 本次生成的 semantic_ids 和之前的 y 构成新的 y @@ -198,23 +218,24 @@ class Text2SemanticDecoder(nn.Module): return y def pad_y_eos(self, y, y_mask_int, eos_id): - targets = F.pad( - y, (0, 1), value=0) + 
eos_id * F.pad( - y_mask_int, (0, 1), value=1) + targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad( + y_mask_int, (0, 1), value=1 + ) # 错位 return targets[:, :-1], targets[:, 1:] - def infer_panel(self, - x,#####全部文本token - x_lens, - prompts,####参考音频token - bert_feature, - top_k: int=-100, - early_stop_num: int=-1, - temperature: float=1.0): - + def infer_panel( + self, + x, #####全部文本token + x_lens, + prompts, ####参考音频token + bert_feature, + top_k: int = -100, + early_stop_num: int = -1, + temperature: float = 1.0, + ): x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1,2)) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) x = self.ar_text_position(x) # AR Decoder @@ -224,75 +245,81 @@ class Text2SemanticDecoder(nn.Module): x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) stop = False # print(1111111,self.num_layers) - cache={ - "all_stage":self.num_layers, - "k":[None]*self.num_layers,###根据配置自己手写 - "v":[None]*self.num_layers, + cache = { + "all_stage": self.num_layers, + "k": [None] * self.num_layers, ###根据配置自己手写 + "v": [None] * self.num_layers, # "xy_pos":None,##y_pos位置编码每次都不一样的没法缓存,每次都要重新拼xy_pos.主要还是写法原因,其实是可以历史统一一样的,但也没啥计算量就不管了 - "y_emb":None,##只需要对最新的samples求emb,再拼历史的就行 + "y_emb": None, ##只需要对最新的samples求emb,再拼历史的就行 # "logits":None,###原版就已经只对结尾求再拼接了,不用管 # "xy_dec":None,###不需要,本来只需要最后一个做logits - "first_infer":1, - "stage":0 + "first_infer": 1, + "stage": 0, } for idx in tqdm(range(1500)): - if(cache["first_infer"]==1): + if cache["first_infer"] == 1: y_emb = self.ar_audio_embedding(y) else: - y_emb = torch.cat([cache["y_emb"],self.ar_audio_embedding(y[:,-1:])],1) - cache["y_emb"]=y_emb + y_emb = torch.cat( + [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 + ) + cache["y_emb"] = y_emb y_pos = self.ar_audio_position(y_emb) # x 和逐渐增长的 y 一起输入给模型 - if(cache["first_infer"]==1): + if cache["first_infer"] == 1: xy_pos = torch.concat([x, y_pos], dim=1) else: - xy_pos=y_pos[:,-1:] + xy_pos = y_pos[:, -1:] y_len = 
y_pos.shape[1] ###以下3个不做缓存 - if (cache["first_infer"] == 1): + if cache["first_infer"] == 1: x_attn_mask_pad = F.pad( - x_attn_mask, - (0, y_len),###xx的纯0扩展到xx纯0+xy纯1,(x,x+y) - value=True, ) - y_attn_mask = F.pad(###yy的右上1扩展到左边xy的0,(y,x+y) - torch.triu( - torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + x_attn_mask, + (0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y) + value=True, + ) + y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y) + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), (x_len, 0), - value=False, ) - xy_attn_mask = torch.concat( - [x_attn_mask_pad, y_attn_mask], dim=0).to(y.device) + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( + y.device + ) else: ###最右边一列(是错的) # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device) # xy_attn_mask[:,-1]=False ###最下面一行(是对的) - xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool, device=xy_pos.device) + xy_attn_mask = torch.zeros( + (1, x_len + y_len), dtype=torch.bool, device=xy_pos.device + ) # pdb.set_trace() ###缓存重头戏 # print(1111,xy_pos.shape,xy_attn_mask.shape,x_len,y_len) - xy_dec, _ = self.h( - (xy_pos, None), - mask=xy_attn_mask,cache=cache ) - logits = self.ar_predict_layer(xy_dec[:, -1])##不用改,如果用了cache的默认就是只有一帧,取最后一帧一样的 + xy_dec, _ = self.h((xy_pos, None), mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer( + xy_dec[:, -1] + ) ##不用改,如果用了cache的默认就是只有一帧,取最后一帧一样的 # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature) - samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) - if early_stop_num != -1 and (y.shape[1] - prefix_len - ) > early_stop_num: + samples = sample( + logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35 + )[0].unsqueeze(0) + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: print("use early stop num:", early_stop_num) stop = True - if torch.argmax( - logits, dim=-1)[0] == self.EOS 
or samples[0, 0] == self.EOS: + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS) stop = True if stop: if prompts.shape[1] == y.shape[1]: y = torch.concat([y, torch.zeros_like(samples)], dim=1) - print('bad zero prediction') + print("bad zero prediction") print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") break # 本次生成的 semantic_ids 和之前的 y 构成新的 y # print(samples.shape)#[1,1]#第一个1是bs y = torch.concat([y, samples], dim=1) - cache["first_infer"]=0 - return y,idx + cache["first_infer"] = 0 + return y, idx diff --git a/GPT_SoVITS/AR/models/utils.py b/GPT_SoVITS/AR/models/utils.py index dfe1d8a..25fe446 100644 --- a/GPT_SoVITS/AR/models/utils.py +++ b/GPT_SoVITS/AR/models/utils.py @@ -2,6 +2,7 @@ import torch import torch.nn.functional as F + def sequence_mask(length, max_length=None): if max_length is None: max_length = length.max() @@ -9,7 +10,7 @@ def sequence_mask(length, max_length=None): return x.unsqueeze(0) < length.unsqueeze(1) -def make_pad_mask(lengths: torch.Tensor, max_len: int=0) -> torch.Tensor: +def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: """ Args: lengths: @@ -38,11 +39,9 @@ def make_pad_mask(lengths: torch.Tensor, max_len: int=0) -> torch.Tensor: # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py -def top_k_top_p_filtering(logits, - top_k=0, - top_p=1.0, - filter_value=-float("Inf"), - min_tokens_to_keep=1): +def top_k_top_p_filtering( + logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 +): """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) @@ -53,16 +52,14 @@ def top_k_top_p_filtering(logits, From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), - logits.size(-1)) # 
Safety check + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] logits[indices_to_remove] = filter_value if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum( - F.softmax(sorted_logits, dim=-1), dim=-1) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p @@ -70,13 +67,13 @@ def top_k_top_p_filtering(logits, # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() sorted_indices_to_remove[..., 0] = 0 # scatter sorted tensors to original indexing indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove) + 1, sorted_indices, sorted_indices_to_remove + ) logits[indices_to_remove] = filter_value return logits @@ -100,6 +97,8 @@ def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): from typing import Optional, Tuple + + def multinomial_sample_one_no_sync( probs_sort, ): # Does multinomial sampling without a cuda synchronization @@ -115,7 +114,7 @@ def logits_to_probs( top_p: Optional[int] = None, repetition_penalty: float = 1.0, ): - previous_tokens=previous_tokens.squeeze() + previous_tokens = previous_tokens.squeeze() # print(logits.shape,previous_tokens.shape) # pdb.set_trace() if previous_tokens is not None and repetition_penalty != 1.0: @@ -159,4 +158,3 @@ def sample( ) 
idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs - diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py index 50631e9..5ca888b 100644 --- a/GPT_SoVITS/AR/modules/activation.py +++ b/GPT_SoVITS/AR/modules/activation.py @@ -13,7 +13,9 @@ from torch.nn.parameter import Parameter from torch.nn import functional as F from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched -F.multi_head_attention_forward=multi_head_attention_forward_patched + +F.multi_head_attention_forward = multi_head_attention_forward_patched + class MultiheadAttention(Module): r"""Allows the model to jointly attend to information @@ -76,66 +78,71 @@ class MultiheadAttention(Module): bias_v: Optional[torch.Tensor] def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - kdim=None, - vdim=None, - batch_first=False, - linear1_cls=Linear, - linear2_cls=Linear, - device=None, - dtype=None, ) -> None: + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + linear1_cls=Linear, + linear2_cls=Linear, + device=None, + dtype=None, + ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = (self.kdim == embed_dim and - self.vdim == embed_dim) + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" if add_bias_kv: - 
self.bias_k = Parameter( - torch.empty((1, 1, embed_dim), **factory_kwargs)) - self.bias_v = Parameter( - torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) else: self.bias_k = self.bias_v = None if linear1_cls == Linear: if not self._qkv_same_embed_dim: self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs)) + torch.empty((embed_dim, embed_dim), **factory_kwargs) + ) self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs)) + torch.empty((embed_dim, self.kdim), **factory_kwargs) + ) self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs)) + torch.empty((embed_dim, self.vdim), **factory_kwargs) + ) self.register_parameter("in_proj_weight", None) else: self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)) + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + ) self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs)) + torch.empty(3 * embed_dim, **factory_kwargs) + ) else: self.register_parameter("in_proj_bias", None) self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs) + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) self._reset_parameters() else: @@ -143,7 +150,8 @@ class MultiheadAttention(Module): raise NotImplementedError else: self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs) + embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + ) self.in_proj_weight = self.in_proj_linear.weight self.register_parameter("q_proj_weight", None) @@ -156,7 +164,8 @@ class MultiheadAttention(Module): 
self.register_parameter("in_proj_bias", None) self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs) + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) if self.bias_k is not None: xavier_normal_(self.bias_k) @@ -190,14 +199,15 @@ class MultiheadAttention(Module): super(MultiheadAttention, self).__setstate__(state) def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - key_padding_mask: Optional[Tensor]=None, - need_weights: bool=True, - attn_mask: Optional[Tensor]=None, - average_attn_weights: bool=True,cache=None + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + cache=None, ) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: @@ -251,23 +261,26 @@ class MultiheadAttention(Module): if key_padding_mask is not None: _kpm_dtype = key_padding_mask.dtype if _kpm_dtype != torch.bool and not torch.is_floating_point( - key_padding_mask): + key_padding_mask + ): raise AssertionError( "only bool and floating types of key_padding_mask are supported" ) why_not_fast_path = "" if not is_batched: - why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}" + why_not_fast_path = ( + f"input not batched; expected query.dim() of 3 but got {query.dim()}" + ) elif query is not key or key is not value: # When lifting this restriction, don't forget to either # enforce that the dtypes all match or test cases where # they don't! 
why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" - elif (self.in_proj_bias is not None and - query.dtype != self.in_proj_bias.dtype): + elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" - elif (self.in_proj_weight is not None and - query.dtype != self.in_proj_weight.dtype): + elif ( + self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype + ): # this case will fail anyway, but at least they'll get a useful error message. why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" elif self.training: @@ -288,29 +301,41 @@ class MultiheadAttention(Module): why_not_fast_path = "attn_mask was not None" elif query.is_nested and key_padding_mask is not None: why_not_fast_path = ( - "key_padding_mask is not supported with NestedTensor input") + "key_padding_mask is not supported with NestedTensor input" + ) elif self.num_heads % 2 == 1: why_not_fast_path = "num_heads is odd" elif torch.is_autocast_enabled(): why_not_fast_path = "autocast is enabled" if not why_not_fast_path: - tensor_args = (query, key, value, self.in_proj_weight, - self.in_proj_bias, self.out_proj.weight, - self.out_proj.bias, ) + tensor_args = ( + query, + key, + value, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + ) # We have to use list comprehensions below because TorchScript does not support # generator expressions. 
if torch.overrides.has_torch_function(tensor_args): why_not_fast_path = "some Tensor argument has_torch_function" - elif not all([(x is None or x.is_cuda or "cpu" in str(x.device)) - for x in tensor_args]): - why_not_fast_path = ( - "some Tensor argument is neither CUDA nor CPU") + elif not all( + [ + (x is None or x.is_cuda or "cpu" in str(x.device)) + for x in tensor_args + ] + ): + why_not_fast_path = "some Tensor argument is neither CUDA nor CPU" elif torch.is_grad_enabled() and any( - [x is not None and x.requires_grad for x in tensor_args]): + [x is not None and x.requires_grad for x in tensor_args] + ): why_not_fast_path = ( "grad is enabled and at least one of query or the " - "input/output projection weights or biases requires_grad") + "input/output projection weights or biases requires_grad" + ) if not why_not_fast_path: return torch._native_multi_head_attention( query, @@ -322,17 +347,21 @@ class MultiheadAttention(Module): self.in_proj_bias, self.out_proj.weight, self.out_proj.bias, - key_padding_mask - if key_padding_mask is not None else attn_mask, + key_padding_mask if key_padding_mask is not None else attn_mask, need_weights, average_attn_weights, - 1 if key_padding_mask is not None else 0 - if attn_mask is not None else None, ) + 1 + if key_padding_mask is not None + else 0 + if attn_mask is not None + else None, + ) any_nested = query.is_nested or key.is_nested or value.is_nested assert not any_nested, ( "MultiheadAttention does not support NestedTensor outside of its fast path. 
" - + f"The fast path was not hit because {why_not_fast_path}") + + f"The fast path was not hit because {why_not_fast_path}" + ) if self.batch_first and is_batched: # make sure that the transpose op does not affect the "is" property @@ -343,9 +372,7 @@ class MultiheadAttention(Module): query, key = [x.transpose(1, 0) for x in (query, key)] value = key else: - query, key, value = [ - x.transpose(1, 0) for x in (query, key, value) - ] + query, key, value = [x.transpose(1, 0) for x in (query, key, value)] if not self._qkv_same_embed_dim: attn_output, attn_output_weights = F.multi_head_attention_forward( @@ -370,7 +397,9 @@ class MultiheadAttention(Module): q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, v_proj_weight=self.v_proj_weight, - average_attn_weights=average_attn_weights,cache=cache ) + average_attn_weights=average_attn_weights, + cache=cache, + ) else: attn_output, attn_output_weights = F.multi_head_attention_forward( query, @@ -390,7 +419,9 @@ class MultiheadAttention(Module): key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask, - average_attn_weights=average_attn_weights,cache=cache ) + average_attn_weights=average_attn_weights, + cache=cache, + ) if self.batch_first and is_batched: return attn_output.transpose(1, 0), attn_output_weights else: diff --git a/GPT_SoVITS/AR/modules/embedding.py b/GPT_SoVITS/AR/modules/embedding.py index 35063c7..3a382f9 100644 --- a/GPT_SoVITS/AR/modules/embedding.py +++ b/GPT_SoVITS/AR/modules/embedding.py @@ -7,10 +7,11 @@ from torch import nn class TokenEmbedding(nn.Module): def __init__( - self, - embedding_dim: int, - vocab_size: int, - dropout: float=0.0, ): + self, + embedding_dim: int, + vocab_size: int, + dropout: float = 0.0, + ): super().__init__() self.vocab_size = vocab_size @@ -24,7 +25,7 @@ class TokenEmbedding(nn.Module): return self.word_embeddings.weight def embedding(self, index: int) -> torch.Tensor: - return self.word_embeddings.weight[index:index + 1] + 
return self.word_embeddings.weight[index : index + 1] def forward(self, x: torch.Tensor): x = self.word_embeddings(x) @@ -34,11 +35,12 @@ class TokenEmbedding(nn.Module): class SinePositionalEmbedding(nn.Module): def __init__( - self, - embedding_dim: int, - dropout: float=0.0, - scale: bool=False, - alpha: bool=False, ): + self, + embedding_dim: int, + dropout: float = 0.0, + scale: bool = False, + alpha: bool = False, + ): super().__init__() self.embedding_dim = embedding_dim self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 @@ -59,13 +61,14 @@ class SinePositionalEmbedding(nn.Module): pe = torch.zeros(x.size(1), self.embedding_dim) if self.reverse: position = torch.arange( - x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) + x.size(1) - 1, -1, -1.0, dtype=torch.float32 + ).unsqueeze(1) else: - position = torch.arange( - 0, x.size(1), dtype=torch.float32).unsqueeze(1) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) div_term = torch.exp( - torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.embedding_dim)) + torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.embedding_dim) + ) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0) @@ -74,5 +77,5 @@ class SinePositionalEmbedding(nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: self.extend_pe(x) output = x.unsqueeze(-1) if x.ndim == 2 else x - output = output * self.x_scale + self.alpha * self.pe[:, :x.size(1)] + output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] return self.dropout(output) diff --git a/GPT_SoVITS/AR/modules/lr_schedulers.py b/GPT_SoVITS/AR/modules/lr_schedulers.py index 955d804..7dec462 100644 --- a/GPT_SoVITS/AR/modules/lr_schedulers.py +++ b/GPT_SoVITS/AR/modules/lr_schedulers.py @@ -12,14 +12,16 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): Implements Warmup 
learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. """ - def __init__(self, - optimizer, - init_lr, - peak_lr, - end_lr, - warmup_steps=10000, - total_steps=400000, - current_step=0): + def __init__( + self, + optimizer, + init_lr, + peak_lr, + end_lr, + warmup_steps=10000, + total_steps=400000, + current_step=0, + ): self.init_lr = init_lr self.peak_lr = peak_lr self.end_lr = end_lr @@ -33,10 +35,10 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): self._last_lr = [self.lr] def set_lr(self, lr): - self._last_lr = [g['lr'] for g in self.optimizer.param_groups] + self._last_lr = [g["lr"] for g in self.optimizer.param_groups] for g in self.optimizer.param_groups: # g['lr'] = lr - g['lr'] = self.end_lr###锁定用线性 + g["lr"] = self.end_lr ###锁定用线性 def step(self): if self._current_step < self.warmup_steps: @@ -47,7 +49,8 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): else: decay_ratio = (self._current_step - self.warmup_steps) / ( - self.total_steps - self.warmup_steps) + self.total_steps - self.warmup_steps + ) if decay_ratio < 0.0 or decay_ratio > 1.0: raise RuntimeError( "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." @@ -55,25 +58,19 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) - self.lr=lr=self.end_lr=0.002###锁定用线性###不听话,直接锁定! + self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 
self.set_lr(lr) self.lr = lr self._current_step += 1 return self.lr - -if __name__ == '__main__': +if __name__ == "__main__": m = nn.Linear(10, 10) opt = Adam(m.parameters(), lr=1e-4) s = WarmupCosineLRSchedule( - opt, - 1e-6, - 2e-4, - 1e-6, - warmup_steps=2000, - total_steps=20000, - current_step=0) + opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 + ) lrs = [] for i in range(25000): s.step() diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py index bfb748e..5720670 100644 --- a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py @@ -1,9 +1,16 @@ from torch.nn.functional import * -from torch.nn.functional import _mha_shape_check,_canonical_mask,_none_or_dtype,_in_projection_packed +from torch.nn.functional import ( + _mha_shape_check, + _canonical_mask, + _none_or_dtype, + _in_projection_packed, +) + # import torch # Tensor = torch.Tensor # from typing import Callable, List, Optional, Tuple, Union + def multi_head_attention_forward_patched( query: Tensor, key: Tensor, @@ -29,7 +36,8 @@ def multi_head_attention_forward_patched( static_k: Optional[Tensor] = None, static_v: Optional[Tensor] = None, average_attn_weights: bool = True, - is_causal: bool = False,cache=None + is_causal: bool = False, + cache=None, ) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: @@ -105,7 +113,17 @@ def multi_head_attention_forward_patched( :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`. 
""" - tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias) + tens_ops = ( + query, + key, + value, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + out_proj_weight, + out_proj_bias, + ) if has_torch_function(tens_ops): return handle_torch_function( multi_head_attention_forward, @@ -134,10 +152,13 @@ def multi_head_attention_forward_patched( v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v, - average_attn_weights=average_attn_weights,cache=cache + average_attn_weights=average_attn_weights, + cache=cache, ) - is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads) + is_batched = _mha_shape_check( + query, key, value, key_padding_mask, attn_mask, num_heads + ) # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input # is batched, run the computation and before returning squeeze the @@ -159,7 +180,7 @@ def multi_head_attention_forward_patched( mask_name="key_padding_mask", other_type=_none_or_dtype(attn_mask), other_name="attn_mask", - target_type=query.dtype + target_type=query.dtype, ) if is_causal and attn_mask is None: @@ -184,59 +205,84 @@ def multi_head_attention_forward_patched( check_other=False, ) - if key_padding_mask is not None: # We have the attn_mask, and use that to merge kpm into it. # Turn off use of is_causal hint, as the merged mask is no # longer causal. 
is_causal = False - assert embed_dim == embed_dim_to_check, \ - f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + assert ( + embed_dim == embed_dim_to_check + ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" if isinstance(embed_dim, torch.Tensor): # embed_dim can be a tensor when JIT tracing - head_dim = embed_dim.div(num_heads, rounding_mode='trunc') + head_dim = embed_dim.div(num_heads, rounding_mode="trunc") else: head_dim = embed_dim // num_heads - assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" + assert ( + head_dim * num_heads == embed_dim + ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" if use_separate_proj_weight: # allow MHA to have different embedding dimensions when separate projection weights are used - assert key.shape[:2] == value.shape[:2], \ - f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + assert ( + key.shape[:2] == value.shape[:2] + ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" else: - assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}" + assert ( + key.shape == value.shape + ), f"key shape {key.shape} does not match value shape {value.shape}" # # compute in-projection # if not use_separate_proj_weight: - assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None" + assert ( + in_proj_weight is not None + ), "use_separate_proj_weight is False but in_proj_weight is None" q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) else: - assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None" - assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None" - assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight 
is None" + assert ( + q_proj_weight is not None + ), "use_separate_proj_weight is True but q_proj_weight is None" + assert ( + k_proj_weight is not None + ), "use_separate_proj_weight is True but k_proj_weight is None" + assert ( + v_proj_weight is not None + ), "use_separate_proj_weight is True but v_proj_weight is None" if in_proj_bias is None: b_q = b_k = b_v = None else: b_q, b_k, b_v = in_proj_bias.chunk(3) - q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v) - if(cache!=None): - if(cache["first_infer"]==1): - cache["k"][cache["stage"]]=k + q, k, v = _in_projection( + query, + key, + value, + q_proj_weight, + k_proj_weight, + v_proj_weight, + b_q, + b_k, + b_v, + ) + if cache != None: + if cache["first_infer"] == 1: + cache["k"][cache["stage"]] = k # print(0,cache["k"].shape) - cache["v"][cache["stage"]]=v - else:###12个layer每个都要留自己的cache_kv + cache["v"][cache["stage"]] = v + else: ###12个layer每个都要留自己的cache_kv # print(1,cache["k"].shape) - cache["k"][cache["stage"]]=torch.cat([cache["k"][cache["stage"]],k],0)##本来时序是1,但是proj的时候可能transpose了所以时序到0维了 - cache["v"][cache["stage"]]=torch.cat([cache["v"][cache["stage"]],v],0) + cache["k"][cache["stage"]] = torch.cat( + [cache["k"][cache["stage"]], k], 0 + ) ##本来时序是1,但是proj的时候可能transpose了所以时序到0维了 + cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]], v], 0) # print(2, cache["k"].shape) src_len = cache["k"][cache["stage"]].shape[0] - k=cache["k"][cache["stage"]] - v=cache["v"][cache["stage"]] + k = cache["k"][cache["stage"]] + v = cache["v"][cache["stage"]] # if attn_mask is not None: # attn_mask=attn_mask[-1:,] - # print(attn_mask.shape,attn_mask) + # print(attn_mask.shape,attn_mask) cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] # print(2333,cache) # prep attention mask @@ -255,14 +301,20 @@ def multi_head_attention_forward_patched( if attn_mask.dim() == 2: correct_2d_size = (tgt_len, src_len) if attn_mask.shape != correct_2d_size: - raise 
RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.") + raise RuntimeError( + f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}." + ) attn_mask = attn_mask.unsqueeze(0) elif attn_mask.dim() == 3: correct_3d_size = (bsz * num_heads, tgt_len, src_len) if attn_mask.shape != correct_3d_size: - raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.") + raise RuntimeError( + f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}." + ) else: - raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported") + raise RuntimeError( + f"attn_mask's dimension {attn_mask.dim()} is not supported" + ) # add bias along batch dimension (currently second) if bias_k is not None and bias_v is not None: @@ -286,26 +338,34 @@ def multi_head_attention_forward_patched( k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1) else: # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert static_k.size(0) == bsz * num_heads, \ - f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" - assert static_k.size(2) == head_dim, \ - f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" + assert ( + static_k.size(0) == bsz * num_heads + ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" + assert ( + static_k.size(2) == head_dim + ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" k = static_k if static_v is None: v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) else: # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert static_v.size(0) == bsz * num_heads, \ - f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" - assert static_v.size(2) == head_dim, \ - f"expecting 
static_v.size(2) of {head_dim}, but got {static_v.size(2)}" + assert ( + static_v.size(0) == bsz * num_heads + ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" + assert ( + static_v.size(2) == head_dim + ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" v = static_v # add zero attention along batch dimension (now first) if add_zero_attn: zero_attn_shape = (bsz * num_heads, 1, head_dim) - k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1) - v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1) + k = torch.cat( + [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1 + ) + v = torch.cat( + [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1 + ) if attn_mask is not None: attn_mask = pad(attn_mask, (0, 1)) if key_padding_mask is not None: @@ -316,10 +376,15 @@ def multi_head_attention_forward_patched( # merge key padding and attention masks if key_padding_mask is not None: - assert key_padding_mask.shape == (bsz, src_len), \ - f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" - key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). 
\ - expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) + assert key_padding_mask.shape == ( + bsz, + src_len, + ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" + key_padding_mask = ( + key_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, num_heads, -1, -1) + .reshape(bsz * num_heads, 1, src_len) + ) if attn_mask is None: attn_mask = key_padding_mask else: @@ -337,10 +402,14 @@ def multi_head_attention_forward_patched( B, Nt, E = q.shape q_scaled = q / math.sqrt(E) - assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights" + assert not ( + is_causal and attn_mask is None + ), "FIXME: is_causal not implemented for need_weights" if attn_mask is not None: - attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1)) + attn_output_weights = torch.baddbmm( + attn_mask, q_scaled, k.transpose(-2, -1) + ) else: attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) attn_output_weights = softmax(attn_output_weights, dim=-1) @@ -349,7 +418,9 @@ def multi_head_attention_forward_patched( attn_output = torch.bmm(attn_output_weights, v) - attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) + attn_output = ( + attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) + ) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) @@ -377,8 +448,12 @@ def multi_head_attention_forward_patched( k = k.view(bsz, num_heads, src_len, head_dim) v = v.view(bsz, num_heads, src_len, head_dim) - attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) - attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) + attn_output = scaled_dot_product_attention( + q, k, v, attn_mask, dropout_p, is_causal + ) + attn_output = ( + attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, 
embed_dim) + ) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) diff --git a/GPT_SoVITS/AR/modules/scaling.py b/GPT_SoVITS/AR/modules/scaling.py index ec31d61..9256a8c 100644 --- a/GPT_SoVITS/AR/modules/scaling.py +++ b/GPT_SoVITS/AR/modules/scaling.py @@ -61,8 +61,9 @@ class DoubleSwishFunction(torch.autograd.Function): # floors), should be expectation-preserving. floor = -0.043637 ceil = 1.2 - d_scaled = (deriv - floor) * (255.0 / (ceil - floor) - ) + torch.rand_like(deriv) + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + deriv + ) if __name__ == "__main__": # for self-testing only. assert d_scaled.min() >= 0.0 @@ -75,7 +76,7 @@ class DoubleSwishFunction(torch.autograd.Function): @staticmethod def backward(ctx, y_grad: Tensor) -> Tensor: - (d, ) = ctx.saved_tensors + (d,) = ctx.saved_tensors # the same constants as used in forward pass. floor = -0.043637 ceil = 1.2 @@ -96,11 +97,12 @@ class DoubleSwish(torch.nn.Module): class ActivationBalancerFunction(torch.autograd.Function): @staticmethod def forward( - ctx, - x: Tensor, - scale_factor: Tensor, - sign_factor: Optional[Tensor], - channel_dim: int, ) -> Tensor: + ctx, + x: Tensor, + scale_factor: Tensor, + sign_factor: Optional[Tensor], + channel_dim: int, + ) -> Tensor: if channel_dim < 0: channel_dim += x.ndim ctx.channel_dim = channel_dim @@ -125,16 +127,22 @@ class ActivationBalancerFunction(torch.autograd.Function): scale_factor = scale_factor.unsqueeze(-1) factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5) neg_delta_grad = x_grad.abs() * factor - return (x_grad - neg_delta_grad, None, None, None, ) + return ( + x_grad - neg_delta_grad, + None, + None, + None, + ) def _compute_scale_factor( - x: Tensor, - channel_dim: int, - min_abs: float, - max_abs: float, - gain_factor: float, - max_factor: float, ) -> Tensor: + x: Tensor, + channel_dim: int, + min_abs: float, + max_abs: float, + gain_factor: 
float, + max_factor: float, +) -> Tensor: if channel_dim < 0: channel_dim += x.ndim sum_dims = [d for d in range(x.ndim) if d != channel_dim] @@ -145,23 +153,25 @@ def _compute_scale_factor( else: # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if # x_abs)_mean , min_abs. - below_threshold = ( - (min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp( - min=0, max=max_factor) + below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp( + min=0, max=max_factor + ) above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp( - min=0, max=max_factor) + min=0, max=max_factor + ) return below_threshold - above_threshold def _compute_sign_factor( - x: Tensor, - channel_dim: int, - min_positive: float, - max_positive: float, - gain_factor: float, - max_factor: float, ) -> Tensor: + x: Tensor, + channel_dim: int, + min_positive: float, + max_positive: float, + gain_factor: float, + max_factor: float, +) -> Tensor: if channel_dim < 0: channel_dim += x.ndim sum_dims = [d for d in range(x.ndim) if d != channel_dim] @@ -171,18 +181,18 @@ def _compute_sign_factor( else: # 0 if proportion_positive >= min_positive, else can be # as large as max_factor. - factor1 = ((min_positive - proportion_positive) * - (gain_factor / min_positive)).clamp_( - min=0, max=max_factor) + factor1 = ( + (min_positive - proportion_positive) * (gain_factor / min_positive) + ).clamp_(min=0, max=max_factor) if max_positive == 1.0: factor2 = 0.0 else: # 0 if self.proportion_positive <= max_positive, else can be # as large as -max_factor. 
- factor2 = ((proportion_positive - max_positive) * - (gain_factor / (1.0 - max_positive))).clamp_( - min=0, max=max_factor) + factor2 = ( + (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive)) + ).clamp_(min=0, max=max_factor) sign_factor = factor1 - factor2 # require min_positive != 0 or max_positive != 1: assert not isinstance(sign_factor, float) @@ -230,17 +240,18 @@ class ActivationBalancer(torch.nn.Module): """ def __init__( - self, - num_channels: int, - channel_dim: int, - min_positive: float=0.05, - max_positive: float=0.95, - max_factor: float=0.04, - sign_gain_factor: float=0.01, - scale_gain_factor: float=0.02, - min_abs: float=0.2, - max_abs: float=100.0, - min_prob: float=0.1, ): + self, + num_channels: int, + channel_dim: int, + min_positive: float = 0.05, + max_positive: float = 0.95, + max_factor: float = 0.04, + sign_gain_factor: float = 0.01, + scale_gain_factor: float = 0.02, + min_abs: float = 0.2, + max_abs: float = 100.0, + min_prob: float = 0.1, + ): super(ActivationBalancer, self).__init__() self.num_channels = num_channels self.channel_dim = channel_dim @@ -260,8 +271,7 @@ class ActivationBalancer(torch.nn.Module): self.register_buffer("count", torch.tensor(0, dtype=torch.int64)) def forward(self, x: Tensor) -> Tensor: - if (torch.jit.is_scripting() or not x.requires_grad or - torch.jit.is_tracing()): + if torch.jit.is_scripting() or not x.requires_grad or torch.jit.is_tracing(): return _no_op(x) count = self.cpu_count @@ -276,7 +286,7 @@ class ActivationBalancer(torch.nn.Module): # the prob of doing some work exponentially decreases from 0.5 till it hits # a floor at min_prob (==0.1, by default) - prob = max(self.min_prob, 0.5**(1 + (count / 4000.0))) + prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0))) if random.random() < prob: sign_gain_factor = 0.5 @@ -287,7 +297,8 @@ class ActivationBalancer(torch.nn.Module): self.min_positive, self.max_positive, gain_factor=self.sign_gain_factor / prob, - 
max_factor=self.max_factor, ) + max_factor=self.max_factor, + ) else: sign_factor = None @@ -297,23 +308,28 @@ class ActivationBalancer(torch.nn.Module): min_abs=self.min_abs, max_abs=self.max_abs, gain_factor=self.scale_gain_factor / prob, - max_factor=self.max_factor, ) + max_factor=self.max_factor, + ) return ActivationBalancerFunction.apply( x, scale_factor, sign_factor, - self.channel_dim, ) + self.channel_dim, + ) else: return _no_op(x) -def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0, - min_prob=0.25) -> nn.Sequential: +def BalancedDoubleSwish( + d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25 +) -> nn.Sequential: """ ActivationBalancer -> DoubleSwish """ balancer = ActivationBalancer( - d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob) + d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob + ) return nn.Sequential( balancer, - DoubleSwish(), ) + DoubleSwish(), + ) diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py index 04f0b1b..7921f48 100644 --- a/GPT_SoVITS/AR/modules/transformer.py +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -26,26 +26,28 @@ class LayerNorm(nn.Module): elementwise_affine: bool def __init__( - self, - normalized_shape: _shape_t, - eps: float=1e-5, - elementwise_affine: bool=True, - device=None, - dtype=None, ) -> None: + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + device=None, + dtype=None, + ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super(LayerNorm, self).__init__() if isinstance(normalized_shape, numbers.Integral): # mypy error: incompatible types in assignment - normalized_shape = (normalized_shape, ) # type: ignore[assignment] - self.normalized_shape = tuple( - normalized_shape) # type: ignore[arg-type] + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] self.eps = eps 
self.elementwise_affine = elementwise_affine if self.elementwise_affine: self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs)) + torch.empty(self.normalized_shape, **factory_kwargs) + ) self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs)) + torch.empty(self.normalized_shape, **factory_kwargs) + ) else: self.register_parameter("weight", None) self.register_parameter("bias", None) @@ -57,36 +59,43 @@ class LayerNorm(nn.Module): nn.init.ones_(self.weight) nn.init.zeros_(self.bias) - def forward(self, input: Tensor, embedding: Any=None) -> Tensor: + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: if isinstance(input, tuple): input, embedding = input - return (F.layer_norm( - input, - self.normalized_shape, - self.weight, - self.bias, - self.eps, ), embedding, ) + return ( + F.layer_norm( + input, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + ), + embedding, + ) assert embedding is None - return F.layer_norm(input, self.normalized_shape, self.weight, - self.bias, self.eps) + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps + ) def extra_repr(self) -> str: return ( "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__)) + "elementwise_affine={elementwise_affine}".format(**self.__dict__) + ) class IdentityNorm(nn.Module): def __init__( - self, - d_model: int, - eps: float=1e-5, - device=None, - dtype=None, ) -> None: + self, + d_model: int, + eps: float = 1e-5, + device=None, + dtype=None, + ) -> None: super(IdentityNorm, self).__init__() - def forward(self, input: Tensor, embedding: Any=None) -> Tensor: + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: if isinstance(input, tuple): return input @@ -121,11 +130,13 @@ class TransformerEncoder(nn.Module): self.norm = norm def forward( - self, - src: Tensor, - mask: Optional[Tensor]=None, - src_key_padding_mask: 
Optional[Tensor]=None, - return_layer_states: bool=False,cache=None ) -> Tensor: + self, + src: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + return_layer_states: bool = False, + cache=None, + ) -> Tensor: r"""Pass the input through the encoder layers in turn. Args: @@ -144,7 +155,9 @@ class TransformerEncoder(nn.Module): output = mod( output, src_mask=mask, - src_key_padding_mask=src_key_padding_mask, cache=cache) + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) layer_states.append(output[0]) if self.norm is not None: @@ -154,9 +167,12 @@ class TransformerEncoder(nn.Module): output = src for mod in self.layers: - output = mod(output, - src_mask=mask, - src_key_padding_mask=src_key_padding_mask, cache=cache) + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) if self.norm is not None: output = self.norm(output) @@ -168,43 +184,47 @@ class TransformerEncoderLayer(nn.Module): __constants__ = ["batch_first", "norm_first"] def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int=2048, - dropout: float=0.1, - activation: Union[str, Callable[[Tensor], Tensor]]=F.relu, - batch_first: bool=False, - norm_first: bool=False, - device=None, - dtype=None, - linear1_self_attention_cls: nn.Module=nn.Linear, - linear2_self_attention_cls: nn.Module=nn.Linear, - linear1_feedforward_cls: nn.Module=nn.Linear, - linear2_feedforward_cls: nn.Module=nn.Linear, - layer_norm_cls: nn.Module=LayerNorm, - layer_norm_eps: float=1e-5, - adaptive_layer_norm=False, ) -> None: + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + batch_first: bool = False, + norm_first: bool = False, + device=None, + dtype=None, + linear1_self_attention_cls: nn.Module = nn.Linear, + linear2_self_attention_cls: nn.Module = nn.Linear, + linear1_feedforward_cls: nn.Module = nn.Linear, 
+ linear2_feedforward_cls: nn.Module = nn.Linear, + layer_norm_cls: nn.Module = LayerNorm, + layer_norm_eps: float = 1e-5, + adaptive_layer_norm=False, + ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super(TransformerEncoderLayer, self).__init__() # print(233333333333,d_model,nhead) # import os # os._exit(2333333) self.self_attn = MultiheadAttention( - d_model,#512 16 + d_model, # 512 16 nhead, dropout=dropout, batch_first=batch_first, linear1_cls=linear1_self_attention_cls, linear2_cls=linear2_self_attention_cls, - **factory_kwargs, ) + **factory_kwargs, + ) # Implementation of Feedforward model - self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, - **factory_kwargs) + self.linear1 = linear1_feedforward_cls( + d_model, dim_feedforward, **factory_kwargs + ) self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, - **factory_kwargs) + self.linear2 = linear2_feedforward_cls( + dim_feedforward, d_model, **factory_kwargs + ) self.norm_first = norm_first self.dropout1 = nn.Dropout(dropout) @@ -230,11 +250,9 @@ class TransformerEncoderLayer(nn.Module): norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) if layer_norm_cls == IdentityNorm: - norm2 = BalancedBasicNorm( - d_model, eps=layer_norm_eps, **factory_kwargs) + norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) else: - norm2 = layer_norm_cls( - d_model, eps=layer_norm_eps, **factory_kwargs) + norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) if adaptive_layer_norm: self.norm1 = AdaptiveLayerNorm(d_model, norm1) @@ -249,10 +267,12 @@ class TransformerEncoderLayer(nn.Module): self.activation = F.relu def forward( - self, - src: Tensor, - src_mask: Optional[Tensor]=None, - src_key_padding_mask: Optional[Tensor]=None,cache=None ) -> Tensor: + self, + src: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + cache=None, + ) -> Tensor: r"""Pass 
the input through the encoder layer. Args: @@ -272,7 +292,8 @@ class TransformerEncoderLayer(nn.Module): if src_key_padding_mask is not None: _skpm_dtype = src_key_padding_mask.dtype if _skpm_dtype != torch.bool and not torch.is_floating_point( - src_key_padding_mask): + src_key_padding_mask + ): raise AssertionError( "only bool and floating types of key_padding_mask are supported" ) @@ -281,12 +302,15 @@ class TransformerEncoderLayer(nn.Module): x = x + self._sa_block( self.norm1(x, stage_embedding), src_mask, - src_key_padding_mask,cache=cache ) + src_key_padding_mask, + cache=cache, + ) x = x + self._ff_block(self.norm2(x, stage_embedding)) else: x = self.norm1( - x + self._sa_block(x, src_mask, src_key_padding_mask,cache=cache), - stage_embedding, ) + x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), + stage_embedding, + ) x = self.norm2(x + self._ff_block(x), stage_embedding) if is_src_tuple: @@ -295,12 +319,14 @@ class TransformerEncoderLayer(nn.Module): # self-attention block def _sa_block( - self, - x: Tensor, - attn_mask: Optional[Tensor], - key_padding_mask: Optional[Tensor],cache=None ) -> Tensor: + self, + x: Tensor, + attn_mask: Optional[Tensor], + key_padding_mask: Optional[Tensor], + cache=None, + ) -> Tensor: # print(x.shape,attn_mask.shape,key_padding_mask) - #torch.Size([1, 188, 512]) torch.Size([188, 188]) None + # torch.Size([1, 188, 512]) torch.Size([188, 188]) None # import os # os._exit(23333) x = self.self_attn( @@ -309,7 +335,9 @@ class TransformerEncoderLayer(nn.Module): x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, - need_weights=False,cache=cache )[0] + need_weights=False, + cache=cache, + )[0] return self.dropout1(x) # feed forward block @@ -328,20 +356,23 @@ class AdaptiveLayerNorm(nn.Module): self.d_model = d_model self.eps = self.norm.eps - def forward(self, input: Tensor, embedding: Tensor=None) -> Tensor: + def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: if isinstance(input, 
tuple): input, embedding = input weight, bias = torch.split( self.project_layer(embedding), split_size_or_sections=self.d_model, - dim=-1, ) + dim=-1, + ) return (weight * self.norm(input) + bias, embedding) weight, bias = torch.split( self.project_layer(embedding), split_size_or_sections=self.d_model, - dim=-1, ) + dim=-1, + ) return weight * self.norm(input) + bias + def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py index 83ecfb7..9fcf5c0 100644 --- a/GPT_SoVITS/AR/text_processing/phonemizer.py +++ b/GPT_SoVITS/AR/text_processing/phonemizer.py @@ -27,46 +27,44 @@ class GruutPhonemizer: "—": "—", "…": "… ", "«": "«", - "»": "»" + "»": "»", } - self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" + self._punctuation_regexp: str = ( + rf"([{''.join(self._special_cases_dict.keys())}])" + ) def _normalize_punctuation(self, text: str) -> str: - text = regex.sub(fr"\pZ+{self._punctuation_regexp}", r"\1", text) - text = regex.sub(fr"{self._punctuation_regexp}(\pL)", r"\1 \2", text) + text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) + text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) text = regex.sub(r"\pZ+", r" ", text) return text.strip() def _convert_punctuation(self, word: Word) -> str: if not word.phonemes: - return '' - if word.phonemes[0] in ['‖', '|']: + return "" + if word.phonemes[0] in ["‖", "|"]: return word.text.strip() - phonemes = ''.join(word.phonemes) + phonemes = "".join(word.phonemes) # remove modifier characters ˈˌː with regex - phonemes = re.sub(r'[ˈˌː͡]', '', phonemes) + phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) return phonemes.strip() - def phonemize(self, text: str, espeak: bool=False) -> str: + def phonemize(self, text: str, espeak: bool = False) -> str: text_to_phonemize: str = self._normalize_punctuation(text) sents: List[Sentence] = [ 
sent - for sent in self._phonemizer( - text_to_phonemize, lang="en-us", espeak=espeak) + for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) ] words: List[str] = [ self._convert_punctuation(word) for word in itertools.chain(*sents) ] - return ' '.join(words) + return " ".join(words) def transform(self, phonemes): # convert phonemes to ids # dictionary is in symbols.py - return [ - self.symbol_to_id[p] for p in phonemes - if p in self.symbol_to_id.keys() - ] + return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] if __name__ == "__main__": diff --git a/GPT_SoVITS/AR/text_processing/symbols.py b/GPT_SoVITS/AR/text_processing/symbols.py index 6bc9a0c..c57e2d4 100644 --- a/GPT_SoVITS/AR/text_processing/symbols.py +++ b/GPT_SoVITS/AR/text_processing/symbols.py @@ -1,7 +1,7 @@ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/symbols.py -PAD = '_' +PAD = "_" PUNCTUATION = ';:,.!?¡¿—…"«»“” ' -LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' +LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) SPACE_ID = SYMBOLS.index(" ") diff --git a/GPT_SoVITS/AR/utils/io.py b/GPT_SoVITS/AR/utils/io.py index 24f1be6..52f1f3c 100644 --- a/GPT_SoVITS/AR/utils/io.py +++ b/GPT_SoVITS/AR/utils/io.py @@ -11,22 +11,24 @@ def load_yaml_config(path): def save_config_to_yaml(config, path): - assert path.endswith('.yaml') - with open(path, 'w') as f: + assert path.endswith(".yaml") + with open(path, "w") as f: f.write(yaml.dump(config)) f.close() def write_args(args, path): - args_dict = dict((name, getattr(args, name)) for name in dir(args) - if not name.startswith('_')) - with open(path, 'a') as args_file: - args_file.write('==> torch version: {}\n'.format(torch.__version__)) + 
args_dict = dict( + (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") + ) + with open(path, "a") as args_file: + args_file.write("==> torch version: {}\n".format(torch.__version__)) args_file.write( - '==> cudnn version: {}\n'.format(torch.backends.cudnn.version())) - args_file.write('==> Cmd:\n') + "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) + ) + args_file.write("==> Cmd:\n") args_file.write(str(sys.argv)) - args_file.write('\n==> args:\n') + args_file.write("\n==> args:\n") for k, v in sorted(args_dict.items()): - args_file.write(' %s: %s\n' % (str(k), str(v))) + args_file.write(" %s: %s\n" % (str(k), str(v))) args_file.close() diff --git a/GPT_SoVITS/configs/s1.yaml b/GPT_SoVITS/configs/s1.yaml index 5481b9b..f8ae17d 100644 --- a/GPT_SoVITS/configs/s1.yaml +++ b/GPT_SoVITS/configs/s1.yaml @@ -1,31 +1,31 @@ train: - seed: 1234 - epochs: 300 - batch_size: 8 - gradient_accumulation: 4 - save_every_n_epoch: 1 - precision: 16 - gradient_clip: 1.0 + seed: 1234 + epochs: 300 + batch_size: 8 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 16 + gradient_clip: 1.0 optimizer: - lr: 0.01 - lr_init: 0.00001 - lr_end: 0.0001 - warmup_steps: 2000 - decay_steps: 40000 + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 data: - max_eval_sample: 8 - max_sec: 54 - num_workers: 1 - pad_val: 1024 # same with EOS in model + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model model: - vocab_size: 1025 - phoneme_vocab_size: 512 - embedding_dim: 512 - hidden_dim: 512 - head: 16 - linear_units: 2048 - n_layer: 12 - dropout: 0 - EOS: 1024 + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 512 + hidden_dim: 512 + head: 16 + linear_units: 2048 + n_layer: 12 + dropout: 0 + EOS: 1024 inference: - top_k: 5 \ No newline at end of file + top_k: 5 diff --git a/GPT_SoVITS/configs/s1big.yaml b/GPT_SoVITS/configs/s1big.yaml index 3a17ae5..a811150 
100644 --- a/GPT_SoVITS/configs/s1big.yaml +++ b/GPT_SoVITS/configs/s1big.yaml @@ -1,31 +1,31 @@ train: - seed: 1234 - epochs: 300 - batch_size: 8 - gradient_accumulation: 4 - save_every_n_epoch: 1 - precision: 16-mixed - gradient_clip: 1.0 + seed: 1234 + epochs: 300 + batch_size: 8 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 optimizer: - lr: 0.01 - lr_init: 0.00001 - lr_end: 0.0001 - warmup_steps: 2000 - decay_steps: 40000 + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 data: - max_eval_sample: 8 - max_sec: 54 - num_workers: 1 - pad_val: 1024 # same with EOS in model + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model model: - vocab_size: 1025 - phoneme_vocab_size: 512 - embedding_dim: 1024 - hidden_dim: 1024 - head: 16 - linear_units: 2048 - n_layer: 16 - dropout: 0 - EOS: 1024 + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 1024 + hidden_dim: 1024 + head: 16 + linear_units: 2048 + n_layer: 16 + dropout: 0 + EOS: 1024 inference: - top_k: 5 \ No newline at end of file + top_k: 5 diff --git a/GPT_SoVITS/configs/s1big2.yaml b/GPT_SoVITS/configs/s1big2.yaml index 1037fc7..b8b889b 100644 --- a/GPT_SoVITS/configs/s1big2.yaml +++ b/GPT_SoVITS/configs/s1big2.yaml @@ -1,31 +1,31 @@ train: - seed: 1234 - epochs: 300 - batch_size: 12 - gradient_accumulation: 4 - save_every_n_epoch: 1 - precision: 16-mixed - gradient_clip: 1.0 + seed: 1234 + epochs: 300 + batch_size: 12 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 optimizer: - lr: 0.01 - lr_init: 0.00001 - lr_end: 0.0001 - warmup_steps: 2000 - decay_steps: 40000 + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 data: - max_eval_sample: 8 - max_sec: 54 - num_workers: 1 - pad_val: 1024 # same with EOS in model + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in 
model model: - vocab_size: 1025 - phoneme_vocab_size: 512 - embedding_dim: 1024 - hidden_dim: 1024 - head: 16 - linear_units: 2048 - n_layer: 6 - dropout: 0 - EOS: 1024 + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 1024 + hidden_dim: 1024 + head: 16 + linear_units: 2048 + n_layer: 6 + dropout: 0 + EOS: 1024 inference: - top_k: 5 \ No newline at end of file + top_k: 5 diff --git a/GPT_SoVITS/configs/s1longer.yaml b/GPT_SoVITS/configs/s1longer.yaml index b238abd..3f57abd 100644 --- a/GPT_SoVITS/configs/s1longer.yaml +++ b/GPT_SoVITS/configs/s1longer.yaml @@ -1,31 +1,31 @@ train: - seed: 1234 - epochs: 20 - batch_size: 8 - save_every_n_epoch: 1 - precision: 16-mixed - gradient_clip: 1.0 + seed: 1234 + epochs: 20 + batch_size: 8 + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 optimizer: - lr: 0.01 - lr_init: 0.00001 - lr_end: 0.0001 - warmup_steps: 2000 - decay_steps: 40000 + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 data: - max_eval_sample: 8 - max_sec: 54 - num_workers: 4 - pad_val: 1024 # same with EOS in model + max_eval_sample: 8 + max_sec: 54 + num_workers: 4 + pad_val: 1024 # same with EOS in model model: - vocab_size: 1025 - phoneme_vocab_size: 512 - embedding_dim: 512 - hidden_dim: 512 - head: 16 - linear_units: 2048 - n_layer: 24 - dropout: 0 - EOS: 1024 - random_bert: 0 + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 512 + hidden_dim: 512 + head: 16 + linear_units: 2048 + n_layer: 24 + dropout: 0 + EOS: 1024 + random_bert: 0 inference: - top_k: 5 \ No newline at end of file + top_k: 5 diff --git a/GPT_SoVITS/configs/s1mq.yaml b/GPT_SoVITS/configs/s1mq.yaml index 19aac92..b554fd3 100644 --- a/GPT_SoVITS/configs/s1mq.yaml +++ b/GPT_SoVITS/configs/s1mq.yaml @@ -1,77 +1,77 @@ train: - seed: 1234 - epochs: 100 - batch_size: 6 - gradient_accumulation: 4 - save_every_n_epoch: 1 - precision: 32 - gradient_clip: 1.0 + seed: 1234 + epochs: 100 + batch_size: 6 + 
gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 32 + gradient_clip: 1.0 optimizer: - lr: 0.01 - lr_init: 0.00001 - lr_end: 0.0001 - warmup_steps: 2000 - decay_steps: 40000 + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 data: - max_eval_sample: 8 - max_sec: 40 - num_workers: 1 - pad_val: 1024 # same with EOS in model + max_eval_sample: 8 + max_sec: 40 + num_workers: 1 + pad_val: 1024 # same with EOS in model model: - saving_path: "ckpt/" - resume_checkpoint: null - vocoder_config_path: "quantizer/new_ckpt/config.json" - vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" - datadir: "/home/liweiche/GigaSpeech/wavs" - metapath: "/home/liweiche/GigaSpeech/train2.json" - val_metapath: "/home/liweiche/GigaSpeech/dev2.json" - sampledir: "logs/" - pretrained_path: null - lr: 0.0001 - batch_size: 200.0 - train_bucket_size: 8192 - training_step: 800000 - optim_flat_percent: 0.0 - warmup_step: 50 - adam_beta1: 0.9 - adam_beta2: 0.98 - ffd_size: 3072 - hidden_size: 768 - enc_nlayers: 6 - dec_nlayers: 6 - nheads: 12 - ar_layer: 4 - ar_ffd_size: 1024 - ar_hidden_size: 256 - ar_nheads: 4 - aligner_softmax_temp: 1.0 - layer_norm_eps: 0.00001 - speaker_embed_dropout: 0.05 - label_smoothing: 0.0 - val_check_interval: 5000 - check_val_every_n_epoch: 1 - precision: "fp16" - nworkers: 16 - distributed: true - accelerator: "ddp" - version: null - accumulate_grad_batches: 1 - use_repetition_token: true - use_repetition_gating: false - repetition_penalty: 1.0 - sampling_temperature: 1.0 - top_k: -1 - min_top_k: 3 - top_p: 0.8 - sample_num: 4 - length_penalty_max_length: 15000 - length_penalty_max_prob: 0.95 - max_input_length: 2048 - max_output_length: 2000 - sample_rate: 16000 - n_codes: 1024 - n_cluster_groups: 1 - phone_context_window: 4 - phoneset_size: 1000 + saving_path: "ckpt/" + resume_checkpoint: null + vocoder_config_path: "quantizer/new_ckpt/config.json" + vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" + datadir: 
"/home/liweiche/GigaSpeech/wavs" + metapath: "/home/liweiche/GigaSpeech/train2.json" + val_metapath: "/home/liweiche/GigaSpeech/dev2.json" + sampledir: "logs/" + pretrained_path: null + lr: 0.0001 + batch_size: 200.0 + train_bucket_size: 8192 + training_step: 800000 + optim_flat_percent: 0.0 + warmup_step: 50 + adam_beta1: 0.9 + adam_beta2: 0.98 + ffd_size: 3072 + hidden_size: 768 + enc_nlayers: 6 + dec_nlayers: 6 + nheads: 12 + ar_layer: 4 + ar_ffd_size: 1024 + ar_hidden_size: 256 + ar_nheads: 4 + aligner_softmax_temp: 1.0 + layer_norm_eps: 0.00001 + speaker_embed_dropout: 0.05 + label_smoothing: 0.0 + val_check_interval: 5000 + check_val_every_n_epoch: 1 + precision: "fp16" + nworkers: 16 + distributed: true + accelerator: "ddp" + version: null + accumulate_grad_batches: 1 + use_repetition_token: true + use_repetition_gating: false + repetition_penalty: 1.0 + sampling_temperature: 1.0 + top_k: -1 + min_top_k: 3 + top_p: 0.8 + sample_num: 4 + length_penalty_max_length: 15000 + length_penalty_max_prob: 0.95 + max_input_length: 2048 + max_output_length: 2000 + sample_rate: 16000 + n_codes: 1024 + n_cluster_groups: 1 + phone_context_window: 4 + phoneset_size: 1000 inference: - top_k: 5 \ No newline at end of file + top_k: 5 diff --git a/GPT_SoVITS/configs/train.yaml b/GPT_SoVITS/configs/train.yaml index a61e90d..be53335 100644 --- a/GPT_SoVITS/configs/train.yaml +++ b/GPT_SoVITS/configs/train.yaml @@ -1,32 +1,32 @@ gpu: - n_card: 1 - n_process_per_card: 2 + n_card: 1 + n_process_per_card: 2 io: - text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS - save_every_n_epoch: 1 - precision: 16-mixed - gradient_clip: 1.0 + text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 optimizer: - lr: 0.01 - lr_init: 0.00001 - lr_end: 0.0001 - warmup_steps: 2000 - decay_steps: 40000 + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 data: - max_eval_sample: 8 - max_sec: 54 - num_workers: 1 - 
pad_val: 1024 # same with EOS in model + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model model: - vocab_size: 1025 - phoneme_vocab_size: 512 - embedding_dim: 512 - hidden_dim: 512 - head: 16 - linear_units: 2048 - n_layer: 24 - dropout: 0 - EOS: 1024 - random_bert: 0 + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 512 + hidden_dim: 512 + head: 16 + linear_units: 2048 + n_layer: 24 + dropout: 0 + EOS: 1024 + random_bert: 0 inference: - top_k: 5 \ No newline at end of file + top_k: 5 diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index 048dc85..dc155bd 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -11,23 +11,30 @@ logging.getLogger("numba").setLevel(logging.WARNING) from transformers import ( Wav2Vec2FeatureExtractor, HubertModel, - Wav2Vec2Model, ) import utils import torch.nn as nn -cnhubert_base_path=None +cnhubert_base_path = None + + class CNHubert(nn.Module): def __init__(self): super().__init__() self.model = HubertModel.from_pretrained(cnhubert_base_path) - self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(cnhubert_base_path) + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + cnhubert_base_path + ) + def forward(self, x): - input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) + input_values = self.feature_extractor( + x, return_tensors="pt", sampling_rate=16000 + ).input_values.to(x.device) feats = self.model(input_values)["last_hidden_state"] return feats + # class CNHubertLarge(nn.Module): # def __init__(self): # super().__init__() @@ -59,12 +66,12 @@ class CNHubert(nn.Module): # return feats - def get_model(): model = CNHubert() model.eval() return model + # def get_large_model(): # model = CNHubertLarge() # model.eval() @@ -80,18 +87,18 @@ def get_model(): # model.eval() # return model + def 
get_content(hmodel, wav_16k_tensor): with torch.no_grad(): feats = hmodel(wav_16k_tensor) - return feats.transpose(1,2) + return feats.transpose(1, 2) -if __name__ == '__main__': +if __name__ == "__main__": model = get_model() src_path = "/Users/Shared/原音频2.wav" wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) model = model wav_16k_tensor = wav_16k_tensor - feats = get_content(model,wav_16k_tensor) + feats = get_content(model, wav_16k_tensor) print(feats.shape) - diff --git a/GPT_SoVITS/feature_extractor/whisper_enc.py b/GPT_SoVITS/feature_extractor/whisper_enc.py index 023f751..983c3e4 100644 --- a/GPT_SoVITS/feature_extractor/whisper_enc.py +++ b/GPT_SoVITS/feature_extractor/whisper_enc.py @@ -3,20 +3,23 @@ import torch def get_model(): import whisper - model = whisper.load_model("small", device='cpu') + + model = whisper.load_model("small", device="cpu") return model.encoder def get_content(model=None, wav_16k_tensor=None): from whisper import log_mel_spectrogram, pad_or_trim + dev = next(model.parameters()).device mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] # if torch.cuda.is_available(): # mel = mel.to(torch.float16) feature_len = mel.shape[-1] // 2 - assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" + assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" with torch.no_grad(): - feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1,2) + feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ + :1, :feature_len, : + ].transpose(1, 2) return feature - diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index 07672e2..a2e9e51 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -4,315 +4,432 @@ from torch import nn from torch.nn import functional as F from module import commons -from module. 
modules import LayerNorm - +from module.modules import LayerNorm + class Encoder(nn.Module): - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,isflow=False, **kwargs): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + isflow=False, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - if isflow: - cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2*hidden_channels*n_layers, 1) - self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) - self.cond_layer = weight_norm_modules(cond_layer, name='weight') - self.gin_channels = kwargs["gin_channels"] - def forward(self, x, x_mask, g=None): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - if g is not None: - g = self.cond_layer(g) + self.drop = nn.Dropout(p_dropout) + self.attn_layers = 
nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + if isflow: + cond_layer = torch.nn.Conv1d( + kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 + ) + self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) + self.cond_layer = weight_norm_modules(cond_layer, name="weight") + self.gin_channels = kwargs["gin_channels"] - for i in range(self.n_layers): - if g is not None: - x = self.cond_pre(x) - cond_offset = i * 2 * self.hidden_channels - g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] - x = commons.fused_add_tanh_sigmoid_multiply( - x, - g_l, - torch.IntTensor([self.hidden_channels])) - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) + def forward(self, x, x_mask, g=None): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + if g is not None: + g = self.cond_layer(g) - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x + for i in range(self.n_layers): + if g is not None: + x = self.cond_pre(x) + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + x = commons.fused_add_tanh_sigmoid_multiply( + x, g_l, torch.IntTensor([self.hidden_channels]) + ) + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + 
y) + x = x * x_mask + return x class Decoder(nn.Module): - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init - self.drop = nn.Dropout(p_dropout) - self.self_attn_layers = nn.ModuleList() - self.norm_layers_0 = nn.ModuleList() - self.encdec_attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) - self.norm_layers_0.append(LayerNorm(hidden_channels)) - self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + 
self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) - def forward(self, x, x_mask, h, h_mask): - """ - x: decoder input - h: encoder output - """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) - encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.self_attn_layers[i](x, x, self_attn_mask) - y = self.drop(y) - x = self.norm_layers_0[i](x + y) + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) - y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) 
+ + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x class MultiHeadAttention(nn.Module): - def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): - super().__init__() - assert channels % n_heads == 0 + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, 
self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - nn.init.xavier_uniform_(self.conv_v.weight) - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) - x = self.conv_o(x) - return x + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + x, self.attn = self.attention(q, k, v, mask=attn_mask) - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." 
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert t_s == t_t, "Local attention is only available for self-attention." - block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) - scores = scores.masked_fill(block_mask == 0, -1e4) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) - output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn + x = self.conv_o(x) + return x - def _matmul_with_relative_values(self, x, y): - """ - x: [b, h, l, m] - y: [h or 1, m, d] - ret: [b, h, l, d] - """ - ret = torch.matmul(x, y.unsqueeze(0)) - return ret + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - def _matmul_with_relative_keys(self, x, y): - """ - x: [b, h, l, d] - y: 
[h or 1, m, d] - ret: [b, h, l, m] - """ - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. 
- pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] - return used_relative_embeddings + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret - def _relative_position_to_absolute_position(self, x): - """ - x: [b, h, l, 2*l-1] - ret: [b, h, l, l] - """ - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. 
+ pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings - # Reshape and slice out the padded elements. - x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] - return x_final + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) - def _absolute_position_to_relative_position(self, x): - """ - x: [b, h, l, l] - ret: [b, h, l, 2*l-1] - """ - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) - x_flat = x.view([batch, heads, length**2 + length*(length -1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] - return x_final + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) - def _attention_bias_proximal(self, length): - """Bias for self-attention to encourage attention to close positions. - Args: - length: an integer scalar. 
- Returns: - a Tensor with shape [1, 1, length, length] - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) class FFN(nn.Module): - def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal - if causal: - self.padding = self._causal_padding - else: - self.padding = self._same_padding + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) - self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) - self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) - def forward(self, x, x_mask): - x = self.conv_1(self.padding(x * x_mask)) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(self.padding(x * x_mask)) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, 
commons.convert_pad_shape(padding)) - return x + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x import torch.nn as nn @@ -320,195 +437,273 @@ from torch.nn.utils import remove_weight_norm, weight_norm class Depthwise_Separable_Conv1D(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - bias=True, - padding_mode='zeros', # TODO: refine this type - device=None, - dtype=None - ): - super().__init__() - self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, - groups=in_channels, stride=stride, padding=padding, dilation=dilation, bias=bias, - padding_mode=padding_mode, device=device, dtype=dtype) - self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, - device=device, dtype=dtype) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + bias=True, + padding_mode="zeros", # TODO: refine this type + device=None, + dtype=None, + ): + 
super().__init__() + self.depth_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + self.point_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + bias=bias, + device=device, + dtype=dtype, + ) - def forward(self, input): - return self.point_conv(self.depth_conv(input)) + def forward(self, input): + return self.point_conv(self.depth_conv(input)) - def weight_norm(self): - self.depth_conv = weight_norm(self.depth_conv, name='weight') - self.point_conv = weight_norm(self.point_conv, name='weight') + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name="weight") + self.point_conv = weight_norm(self.point_conv, name="weight") - def remove_weight_norm(self): - self.depth_conv = remove_weight_norm(self.depth_conv, name='weight') - self.point_conv = remove_weight_norm(self.point_conv, name='weight') + def remove_weight_norm(self): + self.depth_conv = remove_weight_norm(self.depth_conv, name="weight") + self.point_conv = remove_weight_norm(self.point_conv, name="weight") class Depthwise_Separable_TransposeConv1D(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - output_padding=0, - bias=True, - dilation=1, - padding_mode='zeros', # TODO: refine this type - device=None, - dtype=None - ): - super().__init__() - self.depth_conv = nn.ConvTranspose1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, - groups=in_channels, stride=stride, output_padding=output_padding, - padding=padding, dilation=dilation, bias=bias, padding_mode=padding_mode, - device=device, dtype=dtype) - self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, - device=device, dtype=dtype) + def __init__( + self, 
+ in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bias=True, + dilation=1, + padding_mode="zeros", # TODO: refine this type + device=None, + dtype=None, + ): + super().__init__() + self.depth_conv = nn.ConvTranspose1d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + output_padding=output_padding, + padding=padding, + dilation=dilation, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + self.point_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + bias=bias, + device=device, + dtype=dtype, + ) - def forward(self, input): - return self.point_conv(self.depth_conv(input)) + def forward(self, input): + return self.point_conv(self.depth_conv(input)) - def weight_norm(self): - self.depth_conv = weight_norm(self.depth_conv, name='weight') - self.point_conv = weight_norm(self.point_conv, name='weight') + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name="weight") + self.point_conv = weight_norm(self.point_conv, name="weight") - def remove_weight_norm(self): - remove_weight_norm(self.depth_conv, name='weight') - remove_weight_norm(self.point_conv, name='weight') + def remove_weight_norm(self): + remove_weight_norm(self.depth_conv, name="weight") + remove_weight_norm(self.point_conv, name="weight") -def weight_norm_modules(module, name='weight', dim=0): - if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D): - module.weight_norm() - return module - else: - return weight_norm(module, name, dim) +def weight_norm_modules(module, name="weight", dim=0): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( + module, Depthwise_Separable_TransposeConv1D + ): + module.weight_norm() + return module + else: + return weight_norm(module, name, dim) -def remove_weight_norm_modules(module, name='weight'): - if 
isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D): - module.remove_weight_norm() - else: - remove_weight_norm(module, name) +def remove_weight_norm_modules(module, name="weight"): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( + module, Depthwise_Separable_TransposeConv1D + ): + module.remove_weight_norm() + else: + remove_weight_norm(module, name) class FFT(nn.Module): - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., - proximal_bias=False, proximal_init=True, isflow = False, **kwargs): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - if isflow: - cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2*hidden_channels*n_layers, 1) - self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) - self.cond_layer = weight_norm_modules(cond_layer, name='weight') - self.gin_channels = kwargs["gin_channels"] - self.drop = nn.Dropout(p_dropout) - self.self_attn_layers = nn.ModuleList() - self.norm_layers_0 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - for i in range(self.n_layers): - self.self_attn_layers.append( - MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, - proximal_init=proximal_init)) - self.norm_layers_0.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers=1, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + 
proximal_init=True, + isflow=False, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + if isflow: + cond_layer = torch.nn.Conv1d( + kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 + ) + self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) + self.cond_layer = weight_norm_modules(cond_layer, name="weight") + self.gin_channels = kwargs["gin_channels"] + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) - def forward(self, x, x_mask, g = None): - """ - x: decoder input - h: encoder output - """ - if g is not None: - g = self.cond_layer(g) + def forward(self, x, x_mask, g=None): + """ + x: decoder input + h: encoder output + """ + if g is not None: + g = self.cond_layer(g) - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) - x = x * x_mask - for i in range(self.n_layers): - if g is not None: - x = self.cond_pre(x) - cond_offset = i * 2 * self.hidden_channels - g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] - x = commons.fused_add_tanh_sigmoid_multiply( - x, - g_l, - torch.IntTensor([self.hidden_channels])) - y = 
self.self_attn_layers[i](x, x, self_attn_mask) - y = self.drop(y) - x = self.norm_layers_0[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - x = x * x_mask - return x + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + x = x * x_mask + for i in range(self.n_layers): + if g is not None: + x = self.cond_pre(x) + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + x = commons.fused_add_tanh_sigmoid_multiply( + x, g_l, torch.IntTensor([self.hidden_channels]) + ) + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + x = x * x_mask + return x class TransformerCouplingLayer(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - n_layers, - n_heads, - p_dropout=0, - filter_channels=0, - mean_only=False, - wn_sharing_parameter=None, - gin_channels = 0 - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only + def __init__( + self, + channels, + hidden_channels, + kernel_size, + n_layers, + n_heads, + p_dropout=0, + filter_channels=0, + mean_only=False, + wn_sharing_parameter=None, + gin_channels=0, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Encoder(hidden_channels, filter_channels, n_heads, n_layers, 
kernel_size, p_dropout, isflow = True, gin_channels = gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = ( + Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + isflow=True, + gin_channels=gin_channels, + ) + if wn_sharing_parameter is None + else wn_sharing_parameter + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels]*2, 1) - else: - m = stats - logs = torch.zeros_like(m) + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1,2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x \ No newline at end of file + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/GPT_SoVITS/module/commons.py b/GPT_SoVITS/module/commons.py index 7c9b028..e96cf92 100644 --- 
a/GPT_SoVITS/module/commons.py +++ b/GPT_SoVITS/module/commons.py @@ -1,189 +1,189 @@ import math -import numpy as np import torch -from torch import nn from torch.nn import functional as F def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) def get_padding(kernel_size, dilation=1): - return int((kernel_size*dilation - dilation)/2) + return int((kernel_size * dilation - dilation) / 2) def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape def intersperse(lst, item): - result = [item] * (len(lst) * 2 + 1) - result[1::2] = lst - return result + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result def kl_divergence(m_p, logs_p, m_q, logs_q): - """KL(P||Q)""" - kl = (logs_q - logs_p) - 0.5 - kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) - return kl + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl def rand_gumbel(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 - return -torch.log(-torch.log(uniform_samples)) + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) def rand_gumbel_like(x): - g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) - return g + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, :, idx_str:idx_end] - return ret + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, d, t = x.size() - if x_lengths is None: - x_lengths = t - ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str -def get_timing_signal_1d( - length, channels, min_timescale=1.0, max_timescale=1.0e4): - position = torch.arange(length, dtype=torch.float) - num_timescales = channels // 2 - log_timescale_increment = ( - math.log(float(max_timescale) / float(min_timescale)) / - 
(num_timescales - 1)) - inv_timescales = min_timescale * torch.exp( - torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) - scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) - signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) - signal = F.pad(signal, [0, 0, 0, channels % 2]) - signal = signal.view(1, channels, length) - return signal +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return x + signal.to(dtype=x.dtype, device=x.device) + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) def subsequent_mask(length): - mask = torch.tril(torch.ones(length, 
length)).unsqueeze(0).unsqueeze(0) - return mask + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask @torch.jit.script def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape def shift_1d(x): - x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] - return x + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - device = duration.device - - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2,3) * mask - return path + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = 
duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path def clip_grad_value_(parameters, clip_value, norm_type=2): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - norm_type = float(norm_type) - if clip_value is not None: - clip_value = float(clip_value) - - total_norm = 0 - for p in parameters: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) if clip_value is not None: - p.grad.data.clamp_(min=-clip_value, max=clip_value) - total_norm = total_norm ** (1. 
/ norm_type) - return total_norm + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm def squeeze(x, x_mask=None, n_sqz=2): - b, c, t = x.size() + b, c, t = x.size() - t = (t // n_sqz) * n_sqz - x = x[:, :, :t] - x_sqz = x.view(b, c, t // n_sqz, n_sqz) - x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) + t = (t // n_sqz) * n_sqz + x = x[:, :, :t] + x_sqz = x.view(b, c, t // n_sqz, n_sqz) + x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) - if x_mask is not None: - x_mask = x_mask[:, :, n_sqz - 1::n_sqz] - else: - x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) - return x_sqz * x_mask, x_mask + if x_mask is not None: + x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] + else: + x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) + return x_sqz * x_mask, x_mask def unsqueeze(x, x_mask=None, n_sqz=2): - b, c, t = x.size() + b, c, t = x.size() - x_unsqz = x.view(b, n_sqz, c // n_sqz, t) - x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) + x_unsqz = x.view(b, n_sqz, c // n_sqz, t) + x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) - if x_mask is not None: - x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) - else: - x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) - return x_unsqz * x_mask, x_mask + if x_mask is not None: + x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) + else: + x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) + return x_unsqz * x_mask, x_mask diff --git a/GPT_SoVITS/module/core_vq.py b/GPT_SoVITS/module/core_vq.py index 9121f3a..a5e22d6 
100644 --- a/GPT_SoVITS/module/core_vq.py +++ b/GPT_SoVITS/module/core_vq.py @@ -76,10 +76,8 @@ def kmeans(samples, num_clusters: int, num_iters: int = 10): print("kmeans start ... ") for _ in tqdm(range(num_iters)): - diffs = rearrange(samples, "n d -> n () d") - rearrange( - means, "c d -> () c d" - ) - dists = -(diffs ** 2).sum(dim=-1) + diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d") + dists = -(diffs**2).sum(dim=-1) buckets = dists.max(dim=-1).indices bins = torch.bincount(buckets, minlength=num_clusters) @@ -110,6 +108,7 @@ class EuclideanCodebook(nn.Module): that have an exponential moving average cluster size less than the specified threshold with randomly selected vector from the current batch. """ + def __init__( self, dim: int, @@ -122,7 +121,9 @@ class EuclideanCodebook(nn.Module): ): super().__init__() self.decay = decay - init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros + init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = ( + uniform_init if not kmeans_init else torch.zeros + ) embed = init_fn(codebook_size, dim) self.codebook_size = codebook_size @@ -147,7 +148,7 @@ class EuclideanCodebook(nn.Module): self.cluster_size.data.copy_(cluster_size) self.inited.data.copy_(torch.Tensor([True])) # Make sure all buffers across workers are in sync after initialization - #broadcast_tensors(self.buffers()) + # broadcast_tensors(self.buffers()) def replace_(self, samples, mask): modified_codebook = torch.where( @@ -165,7 +166,7 @@ class EuclideanCodebook(nn.Module): batch_samples = rearrange(batch_samples, "... d -> (...) d") self.replace_(batch_samples, mask=expired_codes) - #broadcast_tensors(self.buffers()) + # broadcast_tensors(self.buffers()) def preprocess(self, x): x = rearrange(x, "... d -> (...) d") @@ -246,6 +247,7 @@ class VectorQuantization(nn.Module): randomly selected vector from the current batch. 
commitment_weight (float): Weight for commitment loss. """ + def __init__( self, dim: int, @@ -256,22 +258,31 @@ class VectorQuantization(nn.Module): kmeans_init: bool = True, kmeans_iters: int = 50, threshold_ema_dead_code: int = 2, - commitment_weight: float = 1., + commitment_weight: float = 1.0, ): super().__init__() _codebook_dim: int = default(codebook_dim, dim) requires_projection = _codebook_dim != dim - self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()) - self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()) + self.project_in = ( + nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity() + ) + self.project_out = ( + nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity() + ) self.epsilon = epsilon self.commitment_weight = commitment_weight - self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size, - kmeans_init=kmeans_init, kmeans_iters=kmeans_iters, - decay=decay, epsilon=epsilon, - threshold_ema_dead_code=threshold_ema_dead_code) + self._codebook = EuclideanCodebook( + dim=_codebook_dim, + codebook_size=codebook_size, + kmeans_init=kmeans_init, + kmeans_iters=kmeans_iters, + decay=decay, + epsilon=epsilon, + threshold_ema_dead_code=threshold_ema_dead_code, + ) self.codebook_size = codebook_size @property @@ -316,13 +327,16 @@ class ResidualVectorQuantization(nn.Module): """Residual vector quantization implementation. Follows Algorithm 1. 
in https://arxiv.org/pdf/2107.03312.pdf """ + def __init__(self, *, num_quantizers, **kwargs): super().__init__() self.layers = nn.ModuleList( [VectorQuantization(**kwargs) for _ in range(num_quantizers)] ) - def forward(self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None): + def forward( + self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None + ): quantized_out = 0.0 residual = x @@ -345,7 +359,9 @@ class ResidualVectorQuantization(nn.Module): out_losses, out_indices = map(torch.stack, (all_losses, all_indices)) return quantized_out, out_indices, out_losses, out_quantized - def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int]= None) -> torch.Tensor: + def encode( + self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None + ) -> torch.Tensor: residual = x all_indices = [] n_q = n_q or len(self.layers) @@ -358,10 +374,10 @@ class ResidualVectorQuantization(nn.Module): out_indices = torch.stack(all_indices) return out_indices - def decode(self, q_indices: torch.Tensor, st: int=0) -> torch.Tensor: + def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor: quantized_out = torch.tensor(0.0, device=q_indices.device) for i, indices in enumerate(q_indices): layer = self.layers[st + i] quantized = layer.decode(indices) quantized_out = quantized_out + quantized - return quantized_out \ No newline at end of file + return quantized_out diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index ea3fe77..15f401d 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -1,6 +1,6 @@ -import time,logging +import time, logging import os -import random,traceback +import random, traceback import numpy as np import torch import torch.utils.data @@ -16,41 +16,44 @@ import torch import requests from scipy.io import wavfile from io import BytesIO + # from config import exp_dir from my_utils import load_audio + class 
TextAudioSpeakerLoader(torch.utils.data.Dataset): """ - 1) loads audio, speaker_id, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. + 1) loads audio, speaker_id, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. """ def __init__(self, hparams, val=False): - exp_dir=hparams.exp_dir - self.path2="%s/2-name2text.txt"%exp_dir - self.path4="%s/4-cnhubert"%exp_dir - self.path5="%s/5-wav32k"%exp_dir + exp_dir = hparams.exp_dir + self.path2 = "%s/2-name2text.txt" % exp_dir + self.path4 = "%s/4-cnhubert" % exp_dir + self.path5 = "%s/5-wav32k" % exp_dir assert os.path.exists(self.path2) assert os.path.exists(self.path4) assert os.path.exists(self.path5) - names4=set([name[:-3]for name in list(os.listdir(self.path4))])#去除.pt后缀 - names5=set(os.listdir(self.path5)) - self.phoneme_data={} - with open(self.path2,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") + names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀 + names5 = set(os.listdir(self.path5)) + self.phoneme_data = {} + with open(self.path2, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") for line in lines: - tmp=line.split("\t") - if(len(tmp)!=4):continue - self.phoneme_data[tmp[0]]=[tmp[1]] + tmp = line.split("\t") + if len(tmp) != 4: + continue + self.phoneme_data[tmp[0]] = [tmp[1]] - self.audiopaths_sid_text=list(set(self.phoneme_data)&names4&names5) - tmp=self.audiopaths_sid_text - leng=len(tmp) - min_num=100 - if(leng duration > 0.6 or self.val): + if 54 > duration > 0.6 or self.val: audiopaths_sid_text_new.append([audiopath, phoneme_ids]) lengths.append(size // (2 * self.hop_length)) else: @@ -90,7 +93,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): continue print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur) print("total left: ", len(audiopaths_sid_text_new)) - assert 
len(audiopaths_sid_text_new)>1#至少能凑够batch size,这里todo + assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo self.audiopaths_sid_text = audiopaths_sid_text_new self.lengths = lengths @@ -98,30 +101,41 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): audiopath, phoneme_ids = audiopath_sid_text text = torch.FloatTensor(phoneme_ids) try: - spec, wav = self.get_audio("%s/%s"%(self.path5,audiopath)) + spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): - ssl = torch.load("%s/%s.pt"%(self.path4,audiopath),map_location="cpu") - if(ssl.shape[-1]!=spec.shape[-1]): - typee=ssl.dtype - ssl=F.pad(ssl.float(),(0,1),mode="replicate").to(typee) - ssl.requires_grad=False + ssl = torch.load( + "%s/%s.pt" % (self.path4, audiopath), map_location="cpu" + ) + if ssl.shape[-1] != spec.shape[-1]: + typee = ssl.dtype + ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) + ssl.requires_grad = False except: traceback.print_exc() spec = torch.zeros(1025, 100) - wav = torch.zeros(1, 100*self.hop_length) - ssl=torch.zeros(1,768,100) - text=text[-1:] + wav = torch.zeros(1, 100 * self.hop_length) + ssl = torch.zeros(1, 768, 100) + text = text[-1:] print("load audio or ssl error!!!!!!", audiopath) # print(ssl.requires_grad,spec.requires_grad,wav.requires_grad,text.requires_grad) return (ssl, spec, wav, text) def get_audio(self, filename): - audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio_array = load_audio( + filename, self.sampling_rate + ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 # print(filename,audio_array.max(),audio_array.min(),audio_array.mean()) - audio=torch.FloatTensor(audio_array)#/32768 + audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length,self.sampling_rate, self.hop_length, self.win_length,center=False) + spec = spectrogram_torch( + audio_norm, + 
self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) spec = torch.squeeze(spec, 0) return spec, audio_norm @@ -131,39 +145,51 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): def __getitem__(self, index): # with torch.no_grad(): - return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) + return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) def __len__(self): return len(self.audiopaths_sid_text) def random_slice(self, ssl, wav, mel): - assert abs(ssl.shape[-1]- wav.shape[-1]//self.hop_length) < 3, ("first", ssl.shape, wav.shape) + assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ( + "first", + ssl.shape, + wav.shape, + ) len_mel = mel.shape[1] if self.val: - reference_mel = mel[:, :len_mel//3] + reference_mel = mel[:, : len_mel // 3] return reference_mel, ssl, wav, mel dir = random.randint(0, 1) - sep_point = random.randint(int(len_mel//3), int(len_mel//3*2)) + sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2)) if dir == 0: reference_mel = mel[:, :sep_point] ssl = ssl[:, :, sep_point:] - wav2 = wav[:, sep_point*self.hop_length:] + wav2 = wav[:, sep_point * self.hop_length :] mel = mel[:, sep_point:] else: reference_mel = mel[:, sep_point:] ssl = ssl[:, :, :sep_point] - wav2 = wav[:, :sep_point*self.hop_length] + wav2 = wav[:, : sep_point * self.hop_length] mel = mel[:, :sep_point] - assert abs(ssl.shape[-1]- wav2.shape[-1]//self.hop_length) < 3, (ssl.shape, wav.shape,wav2.shape, mel.shape, sep_point,self.hop_length, sep_point*self.hop_length, dir) + assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, ( + ssl.shape, + wav.shape, + wav2.shape, + mel.shape, + sep_point, + self.hop_length, + sep_point * self.hop_length, + dir, + ) return reference_mel, ssl, wav2, mel -class TextAudioSpeakerCollate(): - """ Zero-pads model inputs and targets - """ +class TextAudioSpeakerCollate: + """Zero-pads model inputs and targets""" def 
__init__(self, return_ids=False): self.return_ids = return_ids @@ -176,8 +202,8 @@ class TextAudioSpeakerCollate(): """ # Right zero-pad all one-hot text sequences to max input length _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) + torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True + ) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) @@ -194,7 +220,7 @@ class TextAudioSpeakerCollate(): spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) - text_padded = torch.LongTensor(len(batch), max_text_len) + text_padded = torch.LongTensor(len(batch), max_text_len) spec_padded.zero_() wav_padded.zero_() @@ -205,23 +231,31 @@ class TextAudioSpeakerCollate(): row = batch[ids_sorted_decreasing[i]] ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav + wav_padded[i, :, : wav.size(1)] = wav wav_lengths[i] = wav.size(1) text = row[3] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) - - return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths + return ( + ssl_padded, + ssl_lengths, + spec_padded, + spec_lengths, + wav_padded, + wav_lengths, + text_padded, + text_lengths, + ) class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): @@ -234,7 +268,15 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): Ex) boundaries = [b1, b2, b3] -> any x s.t. 
length(x) <= b1 or length(x) > b3 are discarded. """ - def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) self.lengths = dataset.lengths # print(233333333333333,self.lengths,dir(dataset)) @@ -254,7 +296,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): buckets[idx_bucket].append(i) for i in range(len(buckets) - 1, 0, -1): - # for i in range(len(buckets) - 1, -1, -1): + # for i in range(len(buckets) - 1, -1, -1): if len(buckets[i]) == 0: buckets.pop(i) self.boundaries.pop(i + 1) @@ -263,7 +305,9 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): for i in range(len(buckets)): len_bucket = len(buckets[i]) total_batch_size = self.num_replicas * self.batch_size - rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size num_samples_per_bucket.append(len_bucket + rem) return buckets, num_samples_per_bucket @@ -289,14 +333,23 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): # add extra samples to make it evenly divisible rem = num_samples_bucket - len_bucket - ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) # subsample - ids_bucket = ids_bucket[self.rank::self.num_replicas] + ids_bucket = ids_bucket[self.rank :: self.num_replicas] # batching for j in range(len(ids_bucket) // self.batch_size): - batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * 
self.batch_size + ] + ] batches.append(batch) if self.shuffle: diff --git a/GPT_SoVITS/module/losses.py b/GPT_SoVITS/module/losses.py index 50fdf85..b23fc8c 100644 --- a/GPT_SoVITS/module/losses.py +++ b/GPT_SoVITS/module/losses.py @@ -5,64 +5,69 @@ from torch.nn import functional as F def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 + return loss * 2 def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses = [] - g_losses = [] - for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - dr = dr.float() - dg = dg.float() - r_loss = torch.mean((1-dr)**2) - g_loss = torch.mean(dg**2) - loss += (r_loss + g_loss) - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) - return loss, r_losses, g_losses + return loss, r_losses, g_losses def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - for dg in disc_outputs: - dg = dg.float() - l = torch.mean((1-dg)**2) - gen_losses.append(l) - loss += l + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l - return loss, gen_losses + return loss, gen_losses def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): - """ - z_p, logs_q: [b, h, t_t] - m_p, logs_p: [b, h, t_t] - """ - z_p = z_p.float() - logs_q = logs_q.float() - m_p = m_p.float() - logs_p = logs_p.float() - z_mask = 
z_mask.float() + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l - kl = logs_p - logs_q - 0.5 - kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p) - kl = torch.sum(kl * z_mask) - l = kl / torch.sum(z_mask) - return l def mle_loss(z, m, logs, logdet, mask): - l = torch.sum(logs) + 0.5 * torch.sum(torch.exp(-2 * logs) * ((z - m)**2)) # neg normal likelihood w/o the constant term - l = l - torch.sum(logdet) # log jacobian determinant - l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes - l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term - return l \ No newline at end of file + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2) + ) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + l = l / torch.sum( + torch.ones_like(z) * mask + ) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l diff --git a/GPT_SoVITS/module/mel_processing.py b/GPT_SoVITS/module/mel_processing.py index 0ef5608..503825e 100644 --- a/GPT_SoVITS/module/mel_processing.py +++ b/GPT_SoVITS/module/mel_processing.py @@ -49,21 +49,37 @@ hann_window = {} def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - if torch.min(y) < -1.: - print('min value is ', torch.min(y)) - if torch.max(y) > 1.: - print('max value is ', torch.max(y)) + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) global hann_window - dtype_device = str(y.dtype) + '_' + str(y.device) - wnsize_dtype_device = str(win_size) + 
'_' + dtype_device + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) - y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) y = y.squeeze(1) - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) return spec @@ -71,37 +87,63 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): global mel_basis - dtype_device = str(spec.dtype) + '_' + str(spec.device) - fmax_dtype_device = str(fmax) + '_' + dtype_device + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) spec = 
torch.matmul(mel_basis[fmax_dtype_device], spec) spec = spectral_normalize_torch(spec) return spec -def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): - if torch.min(y) < -1.: - print('min value is ', torch.min(y)) - if torch.max(y) > 1.: - print('max value is ', torch.max(y)) +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) global mel_basis, hann_window - dtype_device = str(y.dtype) + '_' + str(y.device) - fmax_dtype_device = str(fmax) + '_' + dtype_device - wnsize_dtype_device = str(win_size) + '_' + dtype_device + dtype_device = str(y.dtype) + "_" + str(y.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=y.dtype, device=y.device + ) if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) - y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) y = y.squeeze(1) - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, 
pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 2361b64..c99485c 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -12,12 +12,21 @@ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from module.commons import init_weights, get_padding from module.mrte_model import MRTE -from module.quantize import ResidualVectorQuantizer +from module.quantize import ResidualVectorQuantizer from text import symbols from torch.cuda.amp import autocast + class StochasticDurationPredictor(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + ): super().__init__() filter_channels = in_channels # it needs to be removed from future version. 
self.in_channels = in_channels @@ -31,21 +40,29 @@ class StochasticDurationPredictor(nn.Module): self.flows = nn.ModuleList() self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.post_convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) self.post_flows = nn.ModuleList() self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.post_flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -66,7 +83,10 @@ class StochasticDurationPredictor(nn.Module): h_w = self.post_pre(w) h_w = self.post_convs(h_w, x_mask) h_w = self.post_proj(h_w) * x_mask - e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask + e_q = ( + torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) + * x_mask + ) z_q = e_q for flow in self.post_flows: z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) @@ -74,8 +94,13 @@ class StochasticDurationPredictor(nn.Module): z_u, z1 = torch.split(z_q, [1, 1], 1) u = torch.sigmoid(z_u) * 
x_mask z0 = (w - u) * x_mask - logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) - logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q + logdet_tot_q += torch.sum( + (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] + ) + logq = ( + torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) + - logdet_tot_q + ) logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) @@ -84,12 +109,18 @@ class StochasticDurationPredictor(nn.Module): for flow in flows: z, logdet = flow(z, x_mask, g=x, reverse=reverse) logdet_tot = logdet_tot + logdet - nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot + nll = ( + torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) + - logdet_tot + ) return nll + logq # [b] else: flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale + z = ( + torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) + * noise_scale + ) for flow in flows: z = flow(z, x_mask, g=x, reverse=reverse) z0, z1 = torch.split(z, [1, 1], 1) @@ -98,7 +129,9 @@ class StochasticDurationPredictor(nn.Module): class DurationPredictor(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): super().__init__() self.in_channels = in_channels @@ -108,9 +141,13 @@ class DurationPredictor(nn.Module): self.gin_channels = gin_channels self.drop = nn.Dropout(p_dropout) - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d(filter_channels, 
filter_channels, kernel_size, padding=kernel_size // 2) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) @@ -135,15 +172,17 @@ class DurationPredictor(nn.Module): class TextEncoder(nn.Module): - def __init__(self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - latent_channels=192): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + latent_channels=192, + ): super().__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels @@ -160,17 +199,14 @@ class TextEncoder(nn.Module): hidden_channels, filter_channels, n_heads, - n_layers//2, + n_layers // 2, kernel_size, - p_dropout) + p_dropout, + ) self.encoder_text = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) self.text_embedding = nn.Embedding(len(symbols), hidden_channels) self.mrte = MRTE() @@ -179,21 +215,25 @@ class TextEncoder(nn.Module): hidden_channels, filter_channels, n_heads, - n_layers//2, + n_layers // 2, kernel_size, - p_dropout) - + p_dropout, + ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, y, y_lengths, text, text_lengths, ge, test=None): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( + y.dtype + ) y = self.ssl_proj(y * y_mask) * y_mask y = self.encoder_ssl(y * y_mask, y_mask) - text_mask = torch.unsqueeze(commons.sequence_mask(text_lengths, text.size(1)), 1).to(y.dtype) - if test == 1 : + text_mask = torch.unsqueeze( + commons.sequence_mask(text_lengths, text.size(1)), 1 + ).to(y.dtype) + if test == 1: 
text[:, :] = 0 text = self.text_embedding(text).transpose(1, 2) text = self.encoder_text(text * text_mask, text_mask) @@ -208,9 +248,9 @@ class TextEncoder(nn.Module): def extract_latent(self, x): x = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(x) - return codes.transpose(0,1) - def decode_latent(self, codes, y_mask, refer,refer_mask, ge): + return codes.transpose(0, 1) + def decode_latent(self, codes, y_mask, refer, refer_mask, ge): quantized = self.quantizer.decode(codes) y = self.vq_proj(quantized) * y_mask @@ -224,15 +264,18 @@ class TextEncoder(nn.Module): m, logs = torch.split(stats, self.out_channels, dim=1) return y, m, logs, y_mask, quantized + class ResidualCouplingBlock(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): super().__init__() self.channels = channels self.hidden_channels = hidden_channels @@ -245,8 +288,16 @@ class ResidualCouplingBlock(nn.Module): self.flows = nn.ModuleList() for i in range(n_flows): self.flows.append( - modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, - gin_channels=gin_channels, mean_only=True)) + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): @@ -260,14 +311,16 @@ class ResidualCouplingBlock(nn.Module): class PosteriorEncoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): 
super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -278,13 +331,21 @@ class PosteriorEncoder(nn.Module): self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): - if(g!=None): + if g != None: g = g.detach() - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -294,14 +355,16 @@ class PosteriorEncoder(nn.Module): class WNEncoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -312,11 +375,20 @@ class WNEncoder(nn.Module): self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) self.proj = nn.Conv1d(hidden_channels, out_channels, 1) self.norm = modules.LayerNorm(out_channels) + def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) x = 
self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) out = self.proj(x) * x_mask @@ -325,24 +397,45 @@ class WNEncoder(nn.Module): class Generator(torch.nn.Module): - def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append(weight_norm( - ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)), - k, u, padding=(k - u) // 2))) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) @@ -373,7 +466,7 @@ class Generator(torch.nn.Module): return x def remove_weight_norm(self): - print('Removing weight norm...') + print("Removing weight norm...") for l in self.ups: 
remove_weight_norm(l) for l in self.resblocks: @@ -386,13 +479,55 @@ class DiscriminatorP(torch.nn.Module): self.period = period self.use_spectral_norm = use_spectral_norm norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), - ]) + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): @@ -421,14 +556,16 @@ class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, 
groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ]) + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) def forward(self, x): @@ -451,7 +588,9 @@ class MultiPeriodDiscriminator(torch.nn.Module): periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): @@ -469,31 +608,40 @@ class MultiPeriodDiscriminator(torch.nn.Module): return y_d_rs, y_d_gs, fmap_rs, fmap_gs + class ReferenceEncoder(nn.Module): - ''' + """ inputs --- [N, Ty/r, n_mels*r] mels outputs --- [N, ref_enc_gru_size] - ''' + """ def __init__(self, spec_channels, gin_channels=0): - super().__init__() self.spec_channels = spec_channels ref_enc_filters = [32, 32, 64, 64, 128, 128] K = len(ref_enc_filters) filters = [1] + ref_enc_filters - convs = [weight_norm(nn.Conv2d(in_channels=filters[i], - out_channels=filters[i + 1], - kernel_size=(3, 3), - stride=(2, 2), - padding=(1, 1))) for i in range(K)] + convs = [ + weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] self.convs = nn.ModuleList(convs) # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) - self.gru = nn.GRU(input_size=ref_enc_filters[-1] * 
out_channels, - hidden_size=256 // 2, - batch_first=True) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) self.proj = nn.Linear(128, gin_channels) def forward(self, inputs): @@ -527,23 +675,31 @@ class Quantizer_module(torch.nn.Module): self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e) def forward(self, x): - d = torch.sum(x ** 2, 1, keepdim=True) + torch.sum(self.embedding.weight ** 2, 1) - 2 * torch.matmul(x, self.embedding.weight.T) + d = ( + torch.sum(x**2, 1, keepdim=True) + + torch.sum(self.embedding.weight**2, 1) + - 2 * torch.matmul(x, self.embedding.weight.T) + ) min_indicies = torch.argmin(d, 1) z_q = self.embedding(min_indicies) return z_q, min_indicies + class Quantizer(torch.nn.Module): def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160): super(Quantizer, self).__init__() assert embed_dim % n_code_groups == 0 - self.quantizer_modules = nn.ModuleList([ - Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups) - ]) + self.quantizer_modules = nn.ModuleList( + [ + Quantizer_module(n_codes, embed_dim // n_code_groups) + for _ in range(n_code_groups) + ] + ) self.n_code_groups = n_code_groups self.embed_dim = embed_dim def forward(self, xin): - #B, C, T + # B, C, T B, C, T = xin.shape xin = xin.transpose(1, 2) x = xin.reshape(-1, self.embed_dim) @@ -553,38 +709,41 @@ class Quantizer(torch.nn.Module): for _x, m in zip(x, self.quantizer_modules): _z_q, _min_indicies = m(_x) z_q.append(_z_q) - min_indicies.append(_min_indicies) #B * T, + min_indicies.append(_min_indicies) # B * T, z_q = torch.cat(z_q, -1).reshape(xin.shape) - loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( + (z_q - xin.detach()) ** 2 + ) z_q = xin + (z_q - xin).detach() z_q = z_q.transpose(1, 2) codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) 
return z_q, loss, codes.transpose(1, 2) def embed(self, x): - #idx: N, 4, T - x=x.transpose(1, 2) + # idx: N, 4, T + x = x.transpose(1, 2) x = torch.split(x, 1, 2) ret = [] for q, embed in zip(x, self.quantizer_modules): q = embed.embedding(q.squeeze(-1)) ret.append(q) ret = torch.cat(ret, -1) - return ret.transpose(1, 2) #N, C, T + return ret.transpose(1, 2) # N, C, T class CodePredictor(nn.Module): - def __init__(self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - n_q=8, - dims=1024, - ssl_dim=768 - ): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_q=8, + dims=1024, + ssl_dim=768, + ): super().__init__() self.hidden_channels = hidden_channels self.filter_channels = filter_channels @@ -594,19 +753,18 @@ class CodePredictor(nn.Module): self.p_dropout = p_dropout self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) - self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels) + self.ref_enc = modules.MelStyleEncoder( + ssl_dim, style_vector_dim=hidden_channels + ) self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) - self.out_proj = nn.Conv1d(hidden_channels, (n_q-1) * dims, 1) + self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) self.n_q = n_q self.dims = dims + def forward(self, x, x_mask, refer, codes, infer=False): x = x.detach() x = self.vq_proj(x * x_mask) * x_mask @@ -614,7 +772,9 @@ class CodePredictor(nn.Module): x = x + g x = self.encoder(x * x_mask, x_mask) x = self.out_proj(x * x_mask) * x_mask - logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3) + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( + 2, 3 + ) target = codes[1:].transpose(0, 1) if not infer: logits = logits.reshape(-1, 
self.dims) @@ -626,44 +786,44 @@ class CodePredictor(nn.Module): correct_top10 = torch.any(top10_preds == target.unsqueeze(-1), dim=-1) top3_acc = 100 * torch.mean(correct_top10.float()).detach().cpu().item() - print('Top-10 Accuracy:', top3_acc, "%") + print("Top-10 Accuracy:", top3_acc, "%") pred_codes = torch.argmax(logits, dim=-1) acc = 100 * torch.mean((pred_codes == target).float()).detach().cpu().item() - print('Top-1 Accuracy:', acc, "%") + print("Top-1 Accuracy:", acc, "%") return pred_codes.transpose(0, 1) - class SynthesizerTrn(nn.Module): """ - Synthesizer for Training - """ - - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - **kwargs): + Synthesizer for Training + """ + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + **kwargs + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -685,34 +845,50 @@ class SynthesizerTrn(nn.Module): self.use_sdp = use_sdp self.enc_p = TextEncoder( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - self.enc_q = PosteriorEncoder(spec_channels, 
inter_channels, hidden_channels, 5, 1, 16, - gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels + ) - self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels) + self.ref_enc = modules.MelStyleEncoder( + spec_channels, style_vector_dim=gin_channels + ) ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) if freeze_quantizer: self.ssl_proj.requires_grad_(False) self.quantizer.requires_grad_(False) @@ -721,56 +897,85 @@ class SynthesizerTrn(nn.Module): # self.enc_p.mrte.requires_grad_(False) def forward(self, ssl, y, y_lengths, text, text_lengths): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( + y.dtype + ) ge = self.ref_enc(y * y_mask, y_mask) with autocast(enabled=False): ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, 
quantized_list = self.quantizer(ssl, layers=[0]) + quantized, codes, commit_loss, quantized_list = self.quantizer( + ssl, layers=[0] + ) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate( + quantized, size=int(quantized.shape[-1] * 2), mode="nearest" + ) - x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) + x, m_p, logs_p, y_mask = self.enc_p( + quantized, y_lengths, text, text_lengths, ge + ) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) z_p = self.flow(z, y_mask, g=ge) - z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) o = self.dec(z_slice, g=ge) - return o, commit_loss, ids_slice, y_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), quantized + return ( + o, + commit_loss, + ids_slice, + y_mask, + y_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + quantized, + ) def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, noise_scale=0.5): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( + y.dtype + ) ge = self.ref_enc(y * y_mask, y_mask) - ssl = self.ssl_proj(ssl) + ssl = self.ssl_proj(ssl) quantized, codes, commit_loss, _ = self.quantizer(ssl, layers=[0]) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate( + quantized, size=int(quantized.shape[-1] * 2), mode="nearest" + ) - x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, test=test) + x, m_p, logs_p, y_mask = self.enc_p( + quantized, y_lengths, text, text_lengths, ge, test=test + ) z_p = m_p + 
torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) o = self.dec((z * y_mask)[:, :, :], g=ge) - return o,y_mask, (z, z_p, m_p, logs_p) - + return o, y_mask, (z, z_p, m_p, logs_p) @torch.no_grad() - def decode(self, codes,text, refer, noise_scale=0.5): + def decode(self, codes, text, refer, noise_scale=0.5): refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) - refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) + refer_mask = torch.unsqueeze( + commons.sequence_mask(refer_lengths, refer.size(2)), 1 + ).to(refer.dtype) ge = self.ref_enc(refer * refer_mask, refer_mask) - y_lengths = torch.LongTensor([codes.size(2)*2]).to(codes.device) + y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate( + quantized, size=int(quantized.shape[-1] * 2), mode="nearest" + ) - x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) + x, m_p, logs_p, y_mask = self.enc_p( + quantized, y_lengths, text, text_lengths, ge + ) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) @@ -779,6 +984,6 @@ class SynthesizerTrn(nn.Module): return o def extract_latent(self, x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) + return codes.transpose(0, 1) diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index 711cc5b..f444745 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -17,193 +17,282 @@ LRELU_SLOPE = 0.1 class LayerNorm(nn.Module): 
- def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - class ConvReluNorm(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
- self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential( - nn.ReLU(), - nn.Dropout(p_dropout)) - for _ in range(n_layers-1): - self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout + """ + Dialted and 
Depth-Separable Convolution + """ - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size ** i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, - groups=channels, dilation=dilation, padding=padding - )) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask class WN(torch.nn.Module): - def 
__init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() - assert(kernel_size % 2 == 1) - self.hidden_channels =hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) - if gin_channels != 0: - cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - for i in range(n_layers): - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') - self.in_layers.append(in_layer) + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 
+ self.in_layers.append(in_layer) - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) - if g is not None: - g = self.cond_layer(g) + if g is not None: + g = self.cond_layer(g) - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] - else: - g_l = torch.zeros_like(x_in) + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) - acts = commons.fused_add_tanh_sigmoid_multiply( - x_in, - g_l, - n_channels_tensor) - acts = self.drop(acts) + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:,:self.hidden_channels,:] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:,self.hidden_channels:,:] - else: - 
output = output + res_skip_acts - return output * x_mask + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): super(ResBlock1, self).__init__() - self.convs1 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]))) - ]) + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) self.convs1.apply(init_weights) - self.convs2 = nn.ModuleList([ - 
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, - padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, - padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, - padding=get_padding(kernel_size, 1))) - ]) + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) self.convs2.apply(init_weights) def forward(self, x, x_mask=None): @@ -231,12 +320,30 @@ class ResBlock1(torch.nn.Module): class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): super(ResBlock2, self).__init__() - self.convs = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]))) - ]) + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) self.convs.apply(init_weights) def forward(self, x, x_mask=None): @@ -256,147 +363,169 @@ class ResBlock2(torch.nn.Module): class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = 
torch.exp(x) * x_mask - return x - + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels,1)) - self.logs = nn.Parameter(torch.zeros(channels,1)) + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1,2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x class ResidualCouplingLayer(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size 
= kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels]*2, 1) - else: - m = stats - logs = torch.zeros_like(m) + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) - if not reverse: - x1 = m + x1 * 
torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1,2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x class ConvFlow(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 
- self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
- unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_derivatives = h[..., 2 * self.num_bins:] + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] - x1, logabsdet = piecewise_rational_quadratic_transform(x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails='linear', - tail_bound=self.tail_bound - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1,2]) - if not reverse: - return x, logdet - else: - return x + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x class LinearNorm(nn.Module): - def __init__(self, - in_channels, - out_channels, - bias=True, - spectral_norm=False, - ): + def __init__( + self, + in_channels, + out_channels, + bias=True, + spectral_norm=False, + ): super(LinearNorm, self).__init__() self.fc = nn.Linear(in_channels, out_channels, bias) @@ -417,10 +546,10 @@ class Mish(nn.Module): class Conv1dGLU(nn.Module): - ''' + """ Conv1d + GLU(Gated Linear Unit) with residual connection. For GLU refer to https://arxiv.org/abs/1612.08083 paper. 
- ''' + """ def __init__(self, in_channels, out_channels, kernel_size, dropout): super(Conv1dGLU, self).__init__() @@ -438,29 +567,32 @@ class Conv1dGLU(nn.Module): class ConvNorm(nn.Module): - def __init__(self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=None, - dilation=1, - bias=True, - spectral_norm=False, - ): + def __init__( + self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=None, + dilation=1, + bias=True, + spectral_norm=False, + ): super(ConvNorm, self).__init__() if padding is None: - assert (kernel_size % 2 == 1) + assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) - self.conv = torch.nn.Conv1d(in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - bias=bias) + self.conv = torch.nn.Conv1d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) if spectral_norm: self.conv = nn.utils.spectral_norm(self.conv) @@ -471,9 +603,9 @@ class ConvNorm(nn.Module): class MultiHeadAttention(nn.Module): - ''' Multi-Head Attention module ''' + """Multi-Head Attention module""" - def __init__(self, n_head, d_model, d_k, d_v, dropout=0., spectral_norm=False): + def __init__(self, n_head, d_model, d_k, d_v, dropout=0.0, spectral_norm=False): super().__init__() self.n_head = n_head @@ -484,7 +616,9 @@ class MultiHeadAttention(nn.Module): self.w_ks = nn.Linear(d_model, n_head * d_k) self.w_vs = nn.Linear(d_model, n_head * d_v) - self.attention = ScaledDotProductAttention(temperature=np.power(d_model, 0.5), dropout=dropout) + self.attention = ScaledDotProductAttention( + temperature=np.power(d_model, 0.5), dropout=dropout + ) self.fc = nn.Linear(n_head * d_v, d_model) self.dropout = nn.Dropout(dropout) @@ -504,12 +638,9 @@ class MultiHeadAttention(nn.Module): q = self.w_qs(x).view(sz_b, len_x, n_head, d_k) k = self.w_ks(x).view(sz_b, len_x, n_head, d_k) v = 
self.w_vs(x).view(sz_b, len_x, n_head, d_v) - q = q.permute(2, 0, 1, 3).contiguous().view(-1, - len_x, d_k) # (n*b) x lq x dk - k = k.permute(2, 0, 1, 3).contiguous().view(-1, - len_x, d_k) # (n*b) x lk x dk - v = v.permute(2, 0, 1, 3).contiguous().view(-1, - len_x, d_v) # (n*b) x lv x dv + q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_k) # (n*b) x lq x dk + k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_k) # (n*b) x lk x dk + v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_v) # (n*b) x lv x dv if mask is not None: slf_mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. @@ -518,8 +649,9 @@ class MultiHeadAttention(nn.Module): output, attn = self.attention(q, k, v, mask=slf_mask) output = output.view(n_head, sz_b, len_x, d_v) - output = output.permute(1, 2, 0, 3).contiguous().view( - sz_b, len_x, -1) # b x lq x (n*dv) + output = ( + output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) + ) # b x lq x (n*dv) output = self.fc(output) @@ -528,7 +660,7 @@ class MultiHeadAttention(nn.Module): class ScaledDotProductAttention(nn.Module): - ''' Scaled Dot-Product Attention ''' + """Scaled Dot-Product Attention""" def __init__(self, temperature, dropout): super().__init__() @@ -551,14 +683,17 @@ class ScaledDotProductAttention(nn.Module): class MelStyleEncoder(nn.Module): - ''' MelStyleEncoder ''' + """MelStyleEncoder""" - def __init__(self, n_mel_channels=80, - style_hidden=128, - style_vector_dim=256, - style_kernel_size=5, - style_head=2, - dropout=0.1): + def __init__( + self, + n_mel_channels=80, + style_hidden=128, + style_vector_dim=256, + style_kernel_size=5, + style_head=2, + dropout=0.1, + ): super(MelStyleEncoder, self).__init__() self.in_dim = n_mel_channels self.hidden_dim = style_hidden @@ -573,7 +708,7 @@ class MelStyleEncoder(nn.Module): nn.Dropout(self.dropout), LinearNorm(self.hidden_dim, self.hidden_dim), Mish(), - nn.Dropout(self.dropout) + nn.Dropout(self.dropout), ) self.temporal = nn.Sequential( @@ -581,9 +716,13 @@ 
class MelStyleEncoder(nn.Module): Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout), ) - self.slf_attn = MultiHeadAttention(self.n_head, self.hidden_dim, - self.hidden_dim // self.n_head, self.hidden_dim // self.n_head, - self.dropout) + self.slf_attn = MultiHeadAttention( + self.n_head, + self.hidden_dim, + self.hidden_dim // self.n_head, + self.hidden_dim // self.n_head, + self.dropout, + ) self.fc = LinearNorm(self.hidden_dim, self.out_dim) @@ -598,11 +737,13 @@ class MelStyleEncoder(nn.Module): return out def forward(self, x, mask=None): - x = x.transpose(1,2) + x = x.transpose(1, 2) if mask is not None: - mask = (mask.int()==0).squeeze(1) + mask = (mask.int() == 0).squeeze(1) max_len = x.shape[1] - slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None + slf_attn_mask = ( + mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None + ) # spectral x = self.spectral(x) @@ -644,7 +785,9 @@ class MelStyleEncoderVAE(nn.Module): mu = self.fc1(enc_out) logvar = self.fc2(enc_out) posterior = D.Normal(mu, torch.exp(logvar)) - kl_divergence = D.kl_divergence(posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar))) + kl_divergence = D.kl_divergence( + posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar)) + ) loss_kl = kl_divergence.mean() z = posterior.rsample() @@ -656,11 +799,12 @@ class MelStyleEncoderVAE(nn.Module): if manual_latent is None: if random_sample: dev = next(self.parameters()).device - posterior = D.Normal(torch.zeros(1, self.z_latent_dim, device=dev), - torch.ones(1, self.z_latent_dim, device=dev)) + posterior = D.Normal( + torch.zeros(1, self.z_latent_dim, device=dev), + torch.ones(1, self.z_latent_dim, device=dev), + ) z = posterior.rsample() else: - enc_out = self.ref_encoder(inputs.transpose(1, 2)) mu = self.fc1(enc_out) z = mu @@ -681,7 +825,9 @@ class ActNorm(nn.Module): def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs): if x_mask is 
None: - x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype) + x_mask = torch.ones(x.size(0), 1, x.size(2)).to( + device=x.device, dtype=x.dtype + ) x_len = torch.sum(x_mask, [1, 2]) if not self.initialized: self.initialize(x, x_mask) @@ -707,10 +853,12 @@ class ActNorm(nn.Module): denom = torch.sum(x_mask, [0, 2]) m = torch.sum(x * x_mask, [0, 2]) / denom m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom - v = m_sq - (m ** 2) + v = m_sq - (m**2) logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) - bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) + bias_init = ( + (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) + ) logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) self.bias.data.copy_(bias_init) @@ -720,19 +868,21 @@ class ActNorm(nn.Module): class InvConvNear(nn.Module): def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs): super().__init__() - assert (n_split % 2 == 0) + assert n_split % 2 == 0 self.channels = channels self.n_split = n_split self.no_jacobian = no_jacobian - w_init = torch.linalg.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] + w_init = torch.linalg.qr( + torch.FloatTensor(self.n_split, self.n_split).normal_() + )[0] if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] self.weight = nn.Parameter(w_init) def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs): b, c, t = x.size() - assert (c % self.n_split == 0) + assert c % self.n_split == 0 if x_mask is None: x_mask = 1 x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t @@ -740,7 +890,11 @@ class InvConvNear(nn.Module): x_len = torch.sum(x_mask, [1, 2]) x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) - x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t) + x = ( + x.permute(0, 1, 3, 2, 4) + .contiguous() + .view(b, self.n_split, c // self.n_split, t) + ) if reverse: if hasattr(self, 
"weight_inv"): diff --git a/GPT_SoVITS/module/mrte_model.py b/GPT_SoVITS/module/mrte_model.py index e936c76..b0cd242 100644 --- a/GPT_SoVITS/module/mrte_model.py +++ b/GPT_SoVITS/module/mrte_model.py @@ -5,46 +5,74 @@ from torch import nn from torch.nn.utils import remove_weight_norm, weight_norm from module.attentions import MultiHeadAttention + class MRTE(nn.Module): - def __init__(self, - content_enc_channels=192, - hidden_size=512, - out_channels=192, - kernel_size=5, - n_heads=4, - ge_layer = 2 - ): + def __init__( + self, + content_enc_channels=192, + hidden_size=512, + out_channels=192, + kernel_size=5, + n_heads=4, + ge_layer=2, + ): super(MRTE, self).__init__() - self.cross_attention = MultiHeadAttention(hidden_size,hidden_size,n_heads) - self.c_pre = nn.Conv1d(content_enc_channels,hidden_size, 1) - self.text_pre = nn.Conv1d(content_enc_channels,hidden_size, 1) - self.c_post = nn.Conv1d(hidden_size,out_channels, 1) + self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads) + self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) + self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) + self.c_post = nn.Conv1d(hidden_size, out_channels, 1) def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None): - if(ge==None):ge=0 + if ge == None: + ge = 0 attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1) ssl_enc = self.c_pre(ssl_enc * ssl_mask) text_enc = self.text_pre(text * text_mask) if test != None: if test == 0: - x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge + x = ( + self.cross_attention( + ssl_enc * ssl_mask, text_enc * text_mask, attn_mask + ) + + ssl_enc + + ge + ) elif test == 1: x = ssl_enc + ge - elif test ==2: - x = self.cross_attention(ssl_enc*0 * ssl_mask, text_enc * text_mask, attn_mask) + ge + elif test == 2: + x = ( + self.cross_attention( + ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask + ) + + ge + ) else: raise ValueError("test should be 
0,1,2") else: - x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge + x = ( + self.cross_attention( + ssl_enc * ssl_mask, text_enc * text_mask, attn_mask + ) + + ssl_enc + + ge + ) x = self.c_post(x * ssl_mask) return x - + class SpeakerEncoder(torch.nn.Module): - def __init__(self, mel_n_channels=80, model_num_layers=2, model_hidden_size=256, model_embedding_size=256): + def __init__( + self, + mel_n_channels=80, + model_num_layers=2, + model_hidden_size=256, + model_embedding_size=256, + ): super(SpeakerEncoder, self).__init__() - self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.lstm = nn.LSTM( + mel_n_channels, model_hidden_size, model_num_layers, batch_first=True + ) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() @@ -56,13 +84,15 @@ class SpeakerEncoder(torch.nn.Module): class MELEncoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -81,80 +111,82 @@ class MELEncoder(nn.Module): x = self.enc(x) x = self.proj(x) return x - + class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers): - super(WN, self).__init__() - assert(kernel_size % 2 == 1) - self.hidden_channels =hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers - self.in_layers = torch.nn.ModuleList() - 
self.res_skip_layers = torch.nn.ModuleList() + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() - for i in range(n_layers): - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = weight_norm(in_layer) - self.in_layers.append(in_layer) + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = weight_norm(in_layer) + self.in_layers.append(in_layer) - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = weight_norm(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) - def forward(self, x): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) + def forward(self, x): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) - for i in range(self.n_layers): - x_in = self.in_layers[i](x) + for i in range(self.n_layers): + x_in = self.in_layers[i](x) - acts = fused_add_tanh_sigmoid_multiply( - x_in, - n_channels_tensor) + acts = fused_add_tanh_sigmoid_multiply(x_in, n_channels_tensor) - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = 
res_skip_acts[:,:self.hidden_channels,:] - x = (x + res_acts) - output = output + res_skip_acts[:,self.hidden_channels:,:] - else: - output = output + res_skip_acts - return output + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = x + res_acts + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output - def remove_weight_norm(self): - for l in self.in_layers: - remove_weight_norm(l) - for l in self.res_skip_layers: - remove_weight_norm(l) + def remove_weight_norm(self): + for l in self.in_layers: + remove_weight_norm(l) + for l in self.res_skip_layers: + remove_weight_norm(l) @torch.jit.script def fused_add_tanh_sigmoid_multiply(input, n_channels): - n_channels_int = n_channels[0] - t_act = torch.tanh(input[:, :n_channels_int, :]) - s_act = torch.sigmoid(input[:, n_channels_int:, :]) - acts = t_act * s_act - return acts + n_channels_int = n_channels[0] + t_act = torch.tanh(input[:, :n_channels_int, :]) + s_act = torch.sigmoid(input[:, n_channels_int:, :]) + acts = t_act * s_act + return acts - -if __name__ == '__main__': - content_enc = torch.randn(3,192,100) - content_mask = torch.ones(3,1,100) - ref_mel = torch.randn(3,128,30) - ref_mask = torch.ones(3,1,30) +if __name__ == "__main__": + content_enc = torch.randn(3, 192, 100) + content_mask = torch.ones(3, 1, 100) + ref_mel = torch.randn(3, 128, 30) + ref_mask = torch.ones(3, 1, 30) model = MRTE() - out = model(content_enc,content_mask,ref_mel,ref_mask) - print(out.shape) \ No newline at end of file + out = model(content_enc, content_mask, ref_mel, ref_mask) + print(out.shape) diff --git a/GPT_SoVITS/module/quantize.py b/GPT_SoVITS/module/quantize.py index cdbdeea..f9a5c63 100644 --- a/GPT_SoVITS/module/quantize.py +++ b/GPT_SoVITS/module/quantize.py @@ -38,6 +38,7 @@ class ResidualVectorQuantizer(nn.Module): that have an exponential moving average cluster size 
less than the specified threshold with randomly selected vector from the current batch. """ + def __init__( self, dimension: int = 256, @@ -66,7 +67,12 @@ class ResidualVectorQuantizer(nn.Module): threshold_ema_dead_code=self.threshold_ema_dead_code, ) - def forward(self, x: torch.Tensor, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None) -> QuantizedResult: + def forward( + self, + x: torch.Tensor, + n_q: tp.Optional[int] = None, + layers: tp.Optional[list] = None, + ) -> QuantizedResult: """Residual vector quantization on the given input tensor. Args: x (torch.Tensor): Input tensor. @@ -79,12 +85,17 @@ class ResidualVectorQuantizer(nn.Module): """ n_q = n_q if n_q else self.n_q if layers and max(layers) >= n_q: - raise ValueError(f'Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B.') - quantized, codes, commit_loss, quantized_list = self.vq(x, n_q=n_q, layers=layers) + raise ValueError( + f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." + ) + quantized, codes, commit_loss, quantized_list = self.vq( + x, n_q=n_q, layers=layers + ) return quantized, codes, torch.mean(commit_loss), quantized_list - - def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor: + def encode( + self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None + ) -> torch.Tensor: """Encode a given input tensor with the specified sample rate at the given bandwidth. The RVQ encode method sets the appropriate number of quantizer to use and returns indices for each quantizer. @@ -105,4 +116,4 @@ class ResidualVectorQuantizer(nn.Module): st (int): Start to decode input codes from which layers. Default: 0. 
""" quantized = self.vq.decode(codes, st=st) - return quantized \ No newline at end of file + return quantized diff --git a/GPT_SoVITS/module/transforms.py b/GPT_SoVITS/module/transforms.py index 4793d67..a11f799 100644 --- a/GPT_SoVITS/module/transforms.py +++ b/GPT_SoVITS/module/transforms.py @@ -9,66 +9,63 @@ DEFAULT_MIN_BIN_HEIGHT = 1e-3 DEFAULT_MIN_DERIVATIVE = 1e-3 -def piecewise_rational_quadratic_transform(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails=None, - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): if tails is None: spline_fn = rational_quadratic_spline spline_kwargs = {} else: spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = { - 'tails': tails, - 'tail_bound': tail_bound - } + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs ) return outputs, logabsdet def searchsorted(bin_locations, inputs, eps=1e-6): bin_locations[..., -1] += eps - return torch.sum( - inputs[..., 
None] >= bin_locations, - dim=-1 - ) - 1 + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 -def unconstrained_rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails='linear', - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) outside_interval_mask = ~inside_interval_mask outputs = torch.zeros_like(inputs) logabsdet = torch.zeros_like(inputs) - if tails == 'linear': + if tails == "linear": unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) constant = np.log(np.exp(1 - min_derivative) - 1) unnormalized_derivatives[..., 0] = constant @@ -77,45 +74,57 @@ def unconstrained_rational_quadratic_spline(inputs, outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError('{} tails are not implemented.'.format(tails)) + raise RuntimeError("{} tails are not implemented.".format(tails)) - outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( inputs=inputs[inside_interval_mask], unnormalized_widths=unnormalized_widths[inside_interval_mask, :], unnormalized_heights=unnormalized_heights[inside_interval_mask, :], unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], inverse=inverse, - left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, + left=-tail_bound, + 
right=tail_bound, + bottom=-tail_bound, + top=tail_bound, min_bin_width=min_bin_width, min_bin_height=min_bin_height, - min_derivative=min_derivative + min_derivative=min_derivative, ) return outputs, logabsdet -def rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0., right=1., bottom=0., top=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError('Input to a transform is not within its domain') + raise ValueError("Input to a transform is not within its domain") num_bins = unnormalized_widths.shape[-1] if min_bin_width * num_bins > 1.0: - raise ValueError('Minimal bin width too large for the number of bins') + raise ValueError("Minimal bin width too large for the number of bins") if min_bin_height * num_bins > 1.0: - raise ValueError('Minimal bin height too large for the number of bins') + raise ValueError("Minimal bin height too large for the number of bins") widths = F.softmax(unnormalized_widths, dim=-1) widths = min_bin_width + (1 - min_bin_width * num_bins) * widths cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) cumwidths = (right - left) * cumwidths + left cumwidths[..., 0] = left cumwidths[..., -1] = right @@ -126,7 +135,7 @@ def rational_quadratic_spline(inputs, heights = F.softmax(unnormalized_heights, dim=-1) heights = min_bin_height + (1 - min_bin_height * num_bins) * heights cumheights = 
torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) cumheights = (top - bottom) * cumheights + bottom cumheights[..., 0] = bottom cumheights[..., -1] = top @@ -150,15 +159,13 @@ def rational_quadratic_spline(inputs, input_heights = heights.gather(-1, bin_idx)[..., 0] if inverse: - a = (((inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta) - + input_heights * (input_delta - input_derivatives))) - b = (input_heights * input_derivatives - - (inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta)) - c = - input_delta * (inputs - input_cumheights) + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) discriminant = b.pow(2) - 4 * a * c assert (discriminant >= 0).all() @@ -167,11 +174,15 @@ def rational_quadratic_spline(inputs, outputs = root * input_bin_widths + input_cumwidths theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2)) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) logabsdet = torch.log(derivative_numerator) 
- 2 * torch.log(denominator) return outputs, -logabsdet @@ -179,15 +190,20 @@ def rational_quadratic_spline(inputs, theta = (inputs - input_cumwidths) / input_bin_widths theta_one_minus_theta = theta * (1 - theta) - numerator = input_heights * (input_delta * theta.pow(2) - + input_derivatives * theta_one_minus_theta) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) outputs = input_cumheights + numerator / denominator - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2)) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) return outputs, logabsdet diff --git a/GPT_SoVITS/prepare_datasets/0-pipeline.py b/GPT_SoVITS/prepare_datasets/0-pipeline.py index 4b90a68..4979ed2 100644 --- a/GPT_SoVITS/prepare_datasets/0-pipeline.py +++ b/GPT_SoVITS/prepare_datasets/0-pipeline.py @@ -1,50 +1,81 @@ -import os,torch,sys +import os, torch, sys from subprocess import Popen + now_dir = os.getcwd() sys.path.append(now_dir) -from config import text_path,wav_dir,n_card,n_process_per_card,exp_name,n_parts,exp_dir -os.makedirs("%s/logs_s1"%exp_dir,exist_ok=True) -os.makedirs("%s/logs_s2"%exp_dir,exist_ok=True) +from config import ( + text_path, + wav_dir, + n_card, + exp_name, + n_parts, + exp_dir, +) + +os.makedirs("%s/logs_s1" % exp_dir, exist_ok=True) +os.makedirs("%s/logs_s2" % exp_dir, exist_ok=True) ##############step1 -ps=[] +ps = [] for i_part in 
range(n_parts): - cmd="python prepare/1-get-text.py %s %s %s %s %s %s"%(text_path,wav_dir,exp_name,i_part,n_parts,i_part%n_card) + cmd = "python prepare/1-get-text.py %s %s %s %s %s %s" % ( + text_path, + wav_dir, + exp_name, + i_part, + n_parts, + i_part % n_card, + ) print(cmd) p = Popen(cmd, shell=True) ps.append(p) for p in ps: p.wait() -opt=[] +opt = [] for i_part in range(n_parts): txt_path = "%s/2-name2text-%s.txt" % (exp_dir, i_part) - with open(txt_path,"r")as f: - opt+=f.read().strip("\n").split("\n") + with open(txt_path, "r") as f: + opt += f.read().strip("\n").split("\n") os.remove(txt_path) -with open("%s/2-name2text.txt"%exp_dir,"w")as f:f.write("\n".join(opt)+"\n") +with open("%s/2-name2text.txt" % exp_dir, "w") as f: + f.write("\n".join(opt) + "\n") ############step2 -ps=[] +ps = [] for i_part in range(n_parts): - cmd="python prepare/2-get-hubert-wav32k.py %s %s %s %s %s %s"%(text_path,wav_dir,exp_name,i_part,n_parts,i_part%n_card) + cmd = "python prepare/2-get-hubert-wav32k.py %s %s %s %s %s %s" % ( + text_path, + wav_dir, + exp_name, + i_part, + n_parts, + i_part % n_card, + ) print(cmd) p = Popen(cmd, shell=True) ps.append(p) for p in ps: p.wait() #############step3 -ps=[] +ps = [] for i_part in range(n_parts): - cmd="python prepare/3-get-semantic.py %s %s %s %s %s"%(text_path,exp_name,i_part,n_parts,i_part%n_card) + cmd = "python prepare/3-get-semantic.py %s %s %s %s %s" % ( + text_path, + exp_name, + i_part, + n_parts, + i_part % n_card, + ) print(cmd) p = Popen(cmd, shell=True) ps.append(p) for p in ps: p.wait() -opt=["item_name semantic_audio"] +opt = ["item_name semantic_audio"] for i_part in range(n_parts): semantic_path = "%s/6-name2semantic-%s.tsv" % (exp_dir, i_part) - with open(semantic_path,"r")as f: - opt+=f.read().strip("\n").split("\n") + with open(semantic_path, "r") as f: + opt += f.read().strip("\n").split("\n") os.remove(semantic_path) -with open("%s/6-name2semantic.tsv"%exp_dir,"w")as f:f.write("\n".join(opt)+"\n") +with 
open("%s/6-name2semantic.tsv" % exp_dir, "w") as f: + f.write("\n".join(opt) + "\n") diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index 5abd353..8579693 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -2,16 +2,16 @@ import os -inp_text= os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir= os.environ.get("opt_dir") -bert_pretrained_dir= os.environ.get("bert_pretrained_dir") -is_half=eval(os.environ.get("is_half","True")) -import sys,numpy as np,traceback,pdb +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") +opt_dir = os.environ.get("opt_dir") +bert_pretrained_dir = os.environ.get("bert_pretrained_dir") +is_half = eval(os.environ.get("is_half", "True")) +import sys, numpy as np, traceback, pdb import os.path from glob import glob from tqdm import tqdm @@ -31,25 +31,29 @@ import numpy as np from time import time as ttime import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) -txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) -if(os.path.exists(txt_path)==False): - bert_dir="%s/3-bert"%(opt_dir) - os.makedirs(opt_dir,exist_ok=True) - os.makedirs(bert_dir,exist_ok=True) - device="cuda:0" + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = 
os.path.basename(path) + tmp_path = "%s/%s%s.pth" % (dir, ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) +if os.path.exists(txt_path) == False: + bert_dir = "%s/3-bert" % (opt_dir) + os.makedirs(opt_dir, exist_ok=True) + os.makedirs(bert_dir, exist_ok=True) + device = "cuda:0" tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) - bert_model=AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) - if (is_half == True): + bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) + if is_half == True: bert_model = bert_model.half().to(device) else: bert_model = bert_model.to(device) + def get_bert_feature(text, word2ph): with torch.no_grad(): inputs = tokenizer(text, return_tensors="pt") @@ -67,51 +71,55 @@ if(os.path.exists(txt_path)==False): phone_level_feature = torch.cat(phone_level_feature, dim=0) return phone_level_feature.T - def process(data,res): - for name,text,lan in data: + + def process(data, res): + for name, text, lan in data: try: - name=os.path.basename(name) - phones, word2ph, norm_text=clean_text(text.replace("%", '-').replace('¥', ','),lan) - path_bert="%s/%s.pt"%(bert_dir,name) - if (os.path.exists(path_bert) == False and lan == "zh"): + name = os.path.basename(name) + phones, word2ph, norm_text = clean_text( + text.replace("%", "-").replace("¥", ","), lan + ) + path_bert = "%s/%s.pt" % (bert_dir, name) + if os.path.exists(path_bert) == False and lan == "zh": bert_feature = get_bert_feature(norm_text, word2ph) assert bert_feature.shape[-1] == len(phones) # torch.save(bert_feature, path_bert) my_save(bert_feature, path_bert) phones = " ".join(phones) # res.append([name,phones]) - res.append([name,phones, word2ph, norm_text]) + res.append([name, phones, word2ph, norm_text]) except: print(name, text, traceback.format_exc()) - todo=[] - res=[] - with open(inp_text,"r",encoding="utf8")as f: - 
lines=f.read().strip("\n").split("\n") + todo = [] + res = [] + with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") - language_v1_to_language_v2={ - "ZH":"zh", - "zh":"zh", - "JP":"ja", - "jp":"ja", - "JA":"ja", - "ja":"ja", - "EN":"en", - "en":"en", - "En":"en", + language_v1_to_language_v2 = { + "ZH": "zh", + "zh": "zh", + "JP": "ja", + "jp": "ja", + "JA": "ja", + "ja": "ja", + "EN": "en", + "en": "en", + "En": "en", } - for line in lines[int(i_part)::int(all_parts)]: + for line in lines[int(i_part) :: int(all_parts)]: try: - wav_name,spk_name,language,text=line.split("|") + wav_name, spk_name, language, text = line.split("|") # todo.append([name,text,"zh"]) - todo.append([wav_name,text,language_v1_to_language_v2.get(language,language)]) + todo.append( + [wav_name, text, language_v1_to_language_v2.get(language, language)] + ) except: - print(line,traceback.format_exc()) - - process(todo,res) - opt=[] - for name,phones, word2ph, norm_text in res: - opt.append("%s\t%s\t%s\t%s"%(name,phones, word2ph, norm_text)) - with open(txt_path,"w",encoding="utf8")as f: - f.write("\n".join(opt)+"\n") + print(line, traceback.format_exc()) + process(todo, res) + opt = [] + for name, phones, word2ph, norm_text in res: + opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text)) + with open(txt_path, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index a5075ff..25cb4a8 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -1,20 +1,23 @@ # -*- coding: utf-8 -*- -import sys,os -inp_text= os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= 
os.environ.get("_CUDA_VISIBLE_DEVICES") -from feature_extractor import cnhubert -opt_dir= os.environ.get("opt_dir") -cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") -is_half=eval(os.environ.get("is_half","True")) +import sys, os -import pdb,traceback,numpy as np,logging +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") +from feature_extractor import cnhubert + +opt_dir = os.environ.get("opt_dir") +cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir") +is_half = eval(os.environ.get("is_half", "True")) + +import pdb, traceback, numpy as np, logging from scipy.io import wavfile -import librosa,torch +import librosa, torch + now_dir = os.getcwd() sys.path.append(now_dir) from my_utils import load_audio @@ -32,63 +35,75 @@ from my_utils import load_audio from time import time as ttime import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) -hubert_dir="%s/4-cnhubert"%(opt_dir) -wav32dir="%s/5-wav32k"%(opt_dir) -os.makedirs(opt_dir,exist_ok=True) -os.makedirs(hubert_dir,exist_ok=True) -os.makedirs(wav32dir,exist_ok=True) -maxx=0.95 -alpha=0.5 -device="cuda:0" -model=cnhubert.get_model() -if(is_half==True): - model=model.half().to(device) +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s/%s%s.pth" % (dir, ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +hubert_dir = "%s/4-cnhubert" % (opt_dir) +wav32dir = "%s/5-wav32k" % (opt_dir) +os.makedirs(opt_dir, 
exist_ok=True) +os.makedirs(hubert_dir, exist_ok=True) +os.makedirs(wav32dir, exist_ok=True) + +maxx = 0.95 +alpha = 0.5 +device = "cuda:0" +model = cnhubert.get_model() +if is_half == True: + model = model.half().to(device) else: model = model.to(device) + + def name2go(wav_name): - hubert_path="%s/%s.pt"%(hubert_dir,wav_name) - if(os.path.exists(hubert_path)):return - wav_path="%s/%s"%(inp_wav_dir,wav_name) + hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) + if os.path.exists(hubert_path): + return + wav_path = "%s/%s" % (inp_wav_dir, wav_name) tmp_audio = load_audio(wav_path, 32000) tmp_max = np.abs(tmp_audio).max() if tmp_max > 2.2: print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) return - tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio - tmp_audio = librosa.resample( - tmp_audio32, orig_sr=32000, target_sr=16000 - ) + tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ( + (1 - alpha) * 32768 + ) * tmp_audio + tmp_audio = librosa.resample(tmp_audio32, orig_sr=32000, target_sr=16000) tensor_wav16 = torch.from_numpy(tmp_audio) - if (is_half == True): - tensor_wav16=tensor_wav16.half().to(device) + if is_half == True: + tensor_wav16 = tensor_wav16.half().to(device) else: tensor_wav16 = tensor_wav16.to(device) - ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) - if np.isnan(ssl.detach().numpy()).sum()!= 0:return + ssl = ( + model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"] + .transpose(1, 2) + .cpu() + ) # torch.Size([1, 768, 215]) + if np.isnan(ssl.detach().numpy()).sum() != 0: + return wavfile.write( - "%s/%s"%(wav32dir,wav_name), + "%s/%s" % (wav32dir, wav_name), 32000, tmp_audio32.astype("int16"), ) # torch.save(ssl,hubert_path ) - my_save(ssl,hubert_path ) + my_save(ssl, hubert_path) -with open(inp_text,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") -for line in lines[int(i_part)::int(all_parts)]: +with 
open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + +for line in lines[int(i_part) :: int(all_parts)]: try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name=os.path.basename(wav_name) + wav_name = os.path.basename(wav_name) name2go(wav_name) except: - print(line,traceback.format_exc()) + print(line, traceback.format_exc()) diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index 69f8e3e..7cee6e4 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -1,24 +1,27 @@ import os -inp_text= os.environ.get("inp_text") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir= os.environ.get("opt_dir") -pretrained_s2G= os.environ.get("pretrained_s2G") -s2config_path= os.environ.get("s2config_path") -is_half=eval(os.environ.get("is_half","True")) -import math,traceback + +inp_text = os.environ.get("inp_text") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") +opt_dir = os.environ.get("opt_dir") +pretrained_s2G = os.environ.get("pretrained_s2G") +s2config_path = os.environ.get("s2config_path") +is_half = eval(os.environ.get("is_half", "True")) +import math, traceback import multiprocessing -import sys,pdb +import sys, pdb + now_dir = os.getcwd() sys.path.append(now_dir) from random import shuffle import torch.multiprocessing as mp from glob import glob from tqdm import tqdm -import logging,librosa,utils,torch +import logging, librosa, utils, torch from module.models import SynthesizerTrn + logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G @@ -30,52 +33,58 @@ 
logging.getLogger("numba").setLevel(logging.WARNING) # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name -hubert_dir="%s/4-cnhubert"%(opt_dir) -semantic_path="%s/6-name2semantic-%s.tsv"%(opt_dir,i_part) -if(os.path.exists(semantic_path)==False): - os.makedirs(opt_dir,exist_ok=True) +hubert_dir = "%s/4-cnhubert" % (opt_dir) +semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) +if os.path.exists(semantic_path) == False: + os.makedirs(opt_dir, exist_ok=True) - device="cuda:0" + device = "cuda:0" hps = utils.get_hparams_from_file(s2config_path) vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model) - if(is_half==True): - vq_model=vq_model.half().to(device) + **hps.model + ) + if is_half == True: + vq_model = vq_model.half().to(device) else: vq_model = vq_model.to(device) vq_model.eval() # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True) # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) - print(vq_model.load_state_dict(torch.load(pretrained_s2G,map_location="cpu")["weight"], strict=False)) + print( + vq_model.load_state_dict( + torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False + ) + ) - def name2go(wav_name,lines): + def name2go(wav_name, lines): hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) - if(os.path.exists(hubert_path)==False):return + if os.path.exists(hubert_path) == False: + return ssl_content = torch.load(hubert_path, map_location="cpu") - if(is_half==True): - ssl_content=ssl_content.half().to(device) + if is_half == True: + ssl_content = ssl_content.half().to(device) else: ssl_content = ssl_content.to(device) codes = vq_model.extract_latent(ssl_content) semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) - lines.append("%s\t%s"%(wav_name,semantic)) + lines.append("%s\t%s" % (wav_name, semantic)) - with 
open(inp_text,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") + with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") - lines1=[] - for line in lines[int(i_part)::int(all_parts)]: + lines1 = [] + for line in lines[int(i_part) :: int(all_parts)]: # print(line) try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name=os.path.basename(wav_name) + wav_name = os.path.basename(wav_name) # name2go(name,lines1) - name2go(wav_name,lines1) + name2go(wav_name, lines1) except: - print(line,traceback.format_exc()) - with open(semantic_path,"w",encoding="utf8")as f:f.write("\n".join(lines1)) - + print(line, traceback.format_exc()) + with open(semantic_path, "w", encoding="utf8") as f: + f.write("\n".join(lines1)) diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py index 03bdefb..64c8818 100644 --- a/GPT_SoVITS/text/chinese.py +++ b/GPT_SoVITS/text/chinese.py @@ -6,49 +6,56 @@ import cn2an from pypinyin import lazy_pinyin, Style import sys + sys.path.append("/data/docker/liujing04/gpt-vits/gpt-vits-master") from text.symbols import punctuation from text.tone_sandhi import ToneSandhi current_file_path = os.path.dirname(__file__) -pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in - open(os.path.join(current_file_path, 'opencpop-strict.txt')).readlines()} +pinyin_to_symbol_map = { + line.split("\t")[0]: line.strip().split("\t")[1] + for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() +} import jieba.posseg as psg rep_map = { - ':': ',', - ';': ',', - ',': ',', - '。': '.', - '!': '!', - '?': '?', - '\n': '.', + ":": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", "·": ",", - '、': ",", - '...': '…', - '$': '.', - '/': ',', - '—': "-" + "、": ",", + "...": "…", + "$": ".", + "/": ",", + "—": "-", } tone_modifier = ToneSandhi() + def replace_punctuation(text): - text = 
text.replace("嗯", "恩").replace("呣","母") - pattern = re.compile('|'.join(re.escape(p) for p in rep_map.keys())) + text = text.replace("嗯", "恩").replace("呣", "母") + pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub(r'[^\u4e00-\u9fa5'+"".join(punctuation)+r']+', '', replaced_text) + replaced_text = re.sub( + r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text + ) return replaced_text + def g2p(text): - pattern = r'(?<=[{0}])\s*'.format(''.join(punctuation)) - sentences = [i for i in re.split(pattern, text) if i.strip()!=''] + pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) + sentences = [i for i in re.split(pattern, text) if i.strip() != ""] phones, word2ph = _g2p(sentences) return phones, word2ph @@ -56,10 +63,10 @@ def g2p(text): def _get_initials_finals(word): initials = [] finals = [] - orig_initials = lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.INITIALS) + orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) orig_finals = lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 + ) for c, v in zip(orig_initials, orig_finals): initials.append(c) finals.append(v) @@ -72,17 +79,16 @@ def _g2p(segments): for seg in segments: pinyins = [] # Replace all English words in the sentence - seg = re.sub('[a-zA-Z]+', '', seg) + seg = re.sub("[a-zA-Z]+", "", seg) seg_cut = psg.lcut(seg) initials = [] finals = [] seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) for word, pos in seg_cut: - if pos == 'eng': + if pos == "eng": continue sub_initials, sub_finals = _get_initials_finals(word) - sub_finals = tone_modifier.modified_tone(word, pos, - sub_finals) + sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) initials.append(sub_initials) finals.append(sub_finals) @@ -91,7 +97,7 @@ def _g2p(segments): 
finals = sum(finals, []) # for c, v in zip(initials, finals): - raw_pinyin = c+v + raw_pinyin = c + v # NOTE: post process for pypinyin outputs # we discriminate i, ii and iii if c == v: @@ -102,40 +108,40 @@ def _g2p(segments): v_without_tone = v[:-1] tone = v[-1] - pinyin = c+v_without_tone - assert tone in '12345' + pinyin = c + v_without_tone + assert tone in "12345" if c: # 多音节 v_rep_map = { - "uei": 'ui', - 'iou': 'iu', - 'uen': 'un', + "uei": "ui", + "iou": "iu", + "uen": "un", } if v_without_tone in v_rep_map.keys(): - pinyin = c+v_rep_map[v_without_tone] + pinyin = c + v_rep_map[v_without_tone] else: # 单音节 pinyin_rep_map = { - 'ing': 'ying', - 'i': 'yi', - 'in': 'yin', - 'u': 'wu', + "ing": "ying", + "i": "yi", + "in": "yin", + "u": "wu", } if pinyin in pinyin_rep_map.keys(): pinyin = pinyin_rep_map[pinyin] else: single_rep_map = { - 'v': 'yu', - 'e': 'e', - 'i': 'y', - 'u': 'w', + "v": "yu", + "e": "e", + "i": "y", + "u": "w", } if pinyin[0] in single_rep_map.keys(): - pinyin = single_rep_map[pinyin[0]]+pinyin[1:] + pinyin = single_rep_map[pinyin[0]] + pinyin[1:] assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) - new_c, new_v = pinyin_to_symbol_map[pinyin].split(' ') + new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ") new_v = new_v + tone phone = [new_c, new_v] word2ph.append(len(phone)) @@ -144,9 +150,8 @@ def _g2p(segments): return phones_list, word2ph - def text_normalize(text): - numbers = re.findall(r'\d+(?:\.?\d+)?', text) + numbers = re.findall(r"\d+(?:\.?\d+)?", text) for number in numbers: text = text.replace(number, cn2an.an2cn(number), 1) text = replace_punctuation(text) @@ -154,7 +159,7 @@ def text_normalize(text): return text -if __name__ == '__main__': +if __name__ == "__main__": text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" text = "呣呣呣~就是…大人的鼹鼠党吧?" 
text = "你好" diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index dc4bd73..e5a9b1b 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -1,29 +1,27 @@ from text import chinese, japanese, cleaned_text_to_sequence, symbols, english -language_module_map = { - 'zh': chinese, - "ja": japanese, - 'en': english -} +language_module_map = {"zh": chinese, "ja": japanese, "en": english} special = [ - ('%', 'zh', "SP"), - ('¥', 'zh', "SP2"), - ('^', 'zh', "SP3"), + ("%", "zh", "SP"), + ("¥", "zh", "SP2"), + ("^", "zh", "SP3"), # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 ] + + def clean_text(text, language): for special_s, special_l, target_symbol in special: if special_s in text and language == special_l: return clean_special(text, language, special_s, target_symbol) language_module = language_module_map[language] norm_text = language_module.text_normalize(text) - if(language=="zh"): + if language == "zh": phones, word2ph = language_module.g2p(norm_text) assert len(phones) == sum(word2ph) assert len(norm_text) == len(word2ph) else: phones = language_module.g2p(norm_text) - word2ph=None + word2ph = None for ph in phones: assert ph in symbols @@ -41,17 +39,17 @@ def clean_special(text, language, special_s, target_symbol): new_ph = [] for ph in phones: assert ph in symbols - if ph == ',': + if ph == ",": new_ph.append(target_symbol) else: new_ph.append(ph) return new_ph + def text_to_sequence(text, language): phones = clean_text(text) return cleaned_text_to_sequence(phones) -if __name__ == '__main__': - print(clean_text("你好%啊啊啊额、还是到付红四方。", 'zh')) - +if __name__ == "__main__": + print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) diff --git a/GPT_SoVITS/text/english.py b/GPT_SoVITS/text/english.py index bf48db1..bd68ddf 100644 --- a/GPT_SoVITS/text/english.py +++ b/GPT_SoVITS/text/english.py @@ -8,20 +8,87 @@ from string import punctuation from text import symbols current_file_path = os.path.dirname(__file__) -CMU_DICT_PATH = 
os.path.join(current_file_path, 'cmudict.rep') -CACHE_PATH = os.path.join(current_file_path, 'cmudict_cache.pickle') +CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") +CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle") _g2p = G2p() -arpa = {'AH0', 'S', 'AH1', 'EY2', 'AE2', 'EH0', 'OW2', 'UH0', 'NG', 'B', 'G', 'AY0', 'M', 'AA0', 'F', 'AO0', 'ER2', 'UH1', 'IY1', 'AH2', 'DH', 'IY0', 'EY1', 'IH0', 'K', 'N', 'W', 'IY2', 'T', 'AA1', 'ER1', 'EH2', 'OY0', 'UH2', 'UW1', 'Z', 'AW2', 'AW1', 'V', 'UW2', 'AA2', 'ER', 'AW0', 'UW0', 'R', 'OW1', 'EH1', 'ZH', 'AE0', 'IH2', 'IH', 'Y', 'JH', 'P', 'AY1', 'EY0', 'OY2', 'TH', 'HH', 'D', 'ER0', 'CH', 'AO1', 'AE1', 'AO2', 'OY1', 'AY2', 'IH1', 'OW0', 'L', 'SH'} +arpa = { + "AH0", + "S", + "AH1", + "EY2", + "AE2", + "EH0", + "OW2", + "UH0", + "NG", + "B", + "G", + "AY0", + "M", + "AA0", + "F", + "AO0", + "ER2", + "UH1", + "IY1", + "AH2", + "DH", + "IY0", + "EY1", + "IH0", + "K", + "N", + "W", + "IY2", + "T", + "AA1", + "ER1", + "EH2", + "OY0", + "UH2", + "UW1", + "Z", + "AW2", + "AW1", + "V", + "UW2", + "AA2", + "ER", + "AW0", + "UW0", + "R", + "OW1", + "EH1", + "ZH", + "AE0", + "IH2", + "IH", + "Y", + "JH", + "P", + "AY1", + "EY0", + "OY2", + "TH", + "HH", + "D", + "ER0", + "CH", + "AO1", + "AE1", + "AO2", + "OY1", + "AY2", + "IH1", + "OW0", + "L", + "SH", +} def replace_phs(phs): - rep_map = { - ';': ',', - ':': ',', - '\'': '-', - '"': '-' - } + rep_map = {";": ",", ":": ",", "'": "-", '"': "-"} phs_new = [] for ph in phs: if ph in symbols: @@ -29,9 +96,10 @@ def replace_phs(phs): elif ph in rep_map.keys(): phs_new.append(rep_map[ph]) else: - print('ph not in symbols: ', ph) + print("ph not in symbols: ", ph) return phs_new + def read_dict(): g2p_dict = {} start_line = 49 @@ -41,13 +109,13 @@ def read_dict(): while line: if line_index >= start_line: line = line.strip() - word_split = line.split(' ') + word_split = line.split(" ") word = word_split[0] - syllable_split = word_split[1].split(' - ') + 
syllable_split = word_split[1].split(" - ") g2p_dict[word] = [] for syllable in syllable_split: - phone_split = syllable.split(' ') + phone_split = syllable.split(" ") g2p_dict[word].append(phone_split) line_index = line_index + 1 @@ -57,13 +125,13 @@ def read_dict(): def cache_dict(g2p_dict, file_path): - with open(file_path, 'wb') as pickle_file: + with open(file_path, "wb") as pickle_file: pickle.dump(g2p_dict, pickle_file) def get_dict(): if os.path.exists(CACHE_PATH): - with open(CACHE_PATH, 'rb') as pickle_file: + with open(CACHE_PATH, "rb") as pickle_file: g2p_dict = pickle.load(pickle_file) else: g2p_dict = read_dict() @@ -71,6 +139,7 @@ def get_dict(): return g2p_dict + eng_dict = get_dict() @@ -78,8 +147,8 @@ def text_normalize(text): # todo: eng text normalize return text.replace(";", ",") -def g2p(text): +def g2p(text): phones = [] words = re.split(r"([,;.\-\?\!\s+])", text) for w in words: @@ -97,6 +166,7 @@ def g2p(text): return replace_phs(phones) + if __name__ == "__main__": # print(get_dict()) print(g2p("hello")) @@ -106,4 +176,4 @@ if __name__ == "__main__": # for group in syllables: # for ph in group: # all_phones.add(ph) - # print(all_phones) \ No newline at end of file + # print(all_phones) diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py index f1263e4..1cef2db 100644 --- a/GPT_SoVITS/text/japanese.py +++ b/GPT_SoVITS/text/japanese.py @@ -8,57 +8,63 @@ from text import symbols # Regular expression matching Japanese without punctuation marks: _japanese_characters = re.compile( - r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') + r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" +) # Regular expression matching non-Japanese characters or punctuation marks: _japanese_marks = re.compile( - r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') + 
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" +) # List of (symbol, Japanese) pairs for marks: -_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ - ('%', 'パーセント') -]] +_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] # List of (consonant, sokuon) pairs: -_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ - (r'Q([↑↓]*[kg])', r'k#\1'), - (r'Q([↑↓]*[tdjʧ])', r't#\1'), - (r'Q([↑↓]*[sʃ])', r's\1'), - (r'Q([↑↓]*[pb])', r'p#\1') -]] +_real_sokuon = [ + (re.compile("%s" % x[0]), x[1]) + for x in [ + (r"Q([↑↓]*[kg])", r"k#\1"), + (r"Q([↑↓]*[tdjʧ])", r"t#\1"), + (r"Q([↑↓]*[sʃ])", r"s\1"), + (r"Q([↑↓]*[pb])", r"p#\1"), + ] +] # List of (consonant, hatsuon) pairs: -_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ - (r'N([↑↓]*[pbm])', r'm\1'), - (r'N([↑↓]*[ʧʥj])', r'n^\1'), - (r'N([↑↓]*[tdn])', r'n\1'), - (r'N([↑↓]*[kg])', r'ŋ\1') -]] - +_real_hatsuon = [ + (re.compile("%s" % x[0]), x[1]) + for x in [ + (r"N([↑↓]*[pbm])", r"m\1"), + (r"N([↑↓]*[ʧʥj])", r"n^\1"), + (r"N([↑↓]*[tdn])", r"n\1"), + (r"N([↑↓]*[kg])", r"ŋ\1"), + ] +] def post_replace_ph(ph): rep_map = { - ':': ',', - ';': ',', - ',': ',', - '。': '.', - '!': '!', - '?': '?', - '\n': '.', + ":": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", "·": ",", - '、': ",", - '...': '…' + "、": ",", + "...": "…", } if ph in rep_map.keys(): ph = rep_map[ph] if ph in symbols: return ph if ph not in symbols: - ph = 'UNK' + ph = "UNK" return ph + def symbols_to_japanese(text): for regex, replacement in _symbols_to_japanese: text = re.sub(regex, replacement, text) @@ -66,7 +72,7 @@ def symbols_to_japanese(text): def preprocess_jap(text): - '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' + """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" text = symbols_to_japanese(text) sentences = 
re.split(_japanese_marks, text) marks = re.findall(_japanese_marks, text) @@ -77,13 +83,15 @@ def preprocess_jap(text): text += p.split(" ") if i < len(marks): - text += [marks[i].replace(' ', '')] + text += [marks[i].replace(" ", "")] return text + def text_normalize(text): # todo: jap text normalize return text + def g2p(norm_text): phones = preprocess_jap(norm_text) phones = [post_replace_ph(i) for i in phones] @@ -91,7 +99,7 @@ def g2p(norm_text): return phones -if __name__ == '__main__': +if __name__ == "__main__": for line in open("../../../Downloads/transcript_utf8.txt").readlines(): text = line.split(":")[1] phones = g2p(text) diff --git a/GPT_SoVITS/text/symbols.py b/GPT_SoVITS/text/symbols.py index 5322a92..97e3938 100644 --- a/GPT_SoVITS/text/symbols.py +++ b/GPT_SoVITS/text/symbols.py @@ -1,24 +1,397 @@ import os # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 -punctuation = ['!', '?', '…', ",", "."]#@是SP停顿 +punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 punctuation.append("-") -pu_symbols = punctuation + ["SP", 'SP2', 'SP3', "UNK"] +pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"] # pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"] -pad = '_' +pad = "_" -c = ['AA', 'EE', 'OO', 'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 'sh', 't', 'w', 'x', 'y', 'z', 'zh'] -v = ['E1', 'En1', 'a1', 'ai1', 'an1', 'ang1', 'ao1', 'e1', 'ei1', 'en1', 'eng1', 'er1', 'i1', 'i01', 'ia1', 'ian1', 'iang1', 'iao1', 'ie1', 'in1', 'ing1', 'iong1', 'ir1', 'iu1', 'o1', 'ong1', 'ou1', 'u1', 'ua1', 'uai1', 'uan1', 'uang1', 'ui1', 'un1', 'uo1', 'v1', 'van1', 've1', 'vn1', 'E2', 'En2', 'a2', 'ai2', 'an2', 'ang2', 'ao2', 'e2', 'ei2', 'en2', 'eng2', 'er2', 'i2', 'i02', 'ia2', 'ian2', 'iang2', 'iao2', 'ie2', 'in2', 'ing2', 'iong2', 'ir2', 'iu2', 'o2', 'ong2', 'ou2', 'u2', 'ua2', 'uai2', 'uan2', 'uang2', 'ui2', 'un2', 'uo2', 'v2', 'van2', 've2', 'vn2', 'E3', 'En3', 'a3', 'ai3', 'an3', 'ang3', 'ao3', 'e3', 'ei3', 'en3', 'eng3', 
'er3', 'i3', 'i03', 'ia3', 'ian3', 'iang3', 'iao3', 'ie3', 'in3', 'ing3', 'iong3', 'ir3', 'iu3', 'o3', 'ong3', 'ou3', 'u3', 'ua3', 'uai3', 'uan3', 'uang3', 'ui3', 'un3', 'uo3', 'v3', 'van3', 've3', 'vn3', 'E4', 'En4', 'a4', 'ai4', 'an4', 'ang4', 'ao4', 'e4', 'ei4', 'en4', 'eng4', 'er4', 'i4', 'i04', 'ia4', 'ian4', 'iang4', 'iao4', 'ie4', 'in4', 'ing4', 'iong4', 'ir4', 'iu4', 'o4', 'ong4', 'ou4', 'u4', 'ua4', 'uai4', 'uan4', 'uang4', 'ui4', 'un4', 'uo4', 'v4', 'van4', 've4', 'vn4', 'E5', 'En5', 'a5', 'ai5', 'an5', 'ang5', 'ao5', 'e5', 'ei5', 'en5', 'eng5', 'er5', 'i5', 'i05', 'ia5', 'ian5', 'iang5', 'iao5', 'ie5', 'in5', 'ing5', 'iong5', 'ir5', 'iu5', 'o5', 'ong5', 'ou5', 'u5', 'ua5', 'uai5', 'uan5', 'uang5', 'ui5', 'un5', 'uo5', 'v5', 'van5', 've5', 'vn5'] +c = [ + "AA", + "EE", + "OO", + "b", + "c", + "ch", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "sh", + "t", + "w", + "x", + "y", + "z", + "zh", +] +v = [ + "E1", + "En1", + "a1", + "ai1", + "an1", + "ang1", + "ao1", + "e1", + "ei1", + "en1", + "eng1", + "er1", + "i1", + "i01", + "ia1", + "ian1", + "iang1", + "iao1", + "ie1", + "in1", + "ing1", + "iong1", + "ir1", + "iu1", + "o1", + "ong1", + "ou1", + "u1", + "ua1", + "uai1", + "uan1", + "uang1", + "ui1", + "un1", + "uo1", + "v1", + "van1", + "ve1", + "vn1", + "E2", + "En2", + "a2", + "ai2", + "an2", + "ang2", + "ao2", + "e2", + "ei2", + "en2", + "eng2", + "er2", + "i2", + "i02", + "ia2", + "ian2", + "iang2", + "iao2", + "ie2", + "in2", + "ing2", + "iong2", + "ir2", + "iu2", + "o2", + "ong2", + "ou2", + "u2", + "ua2", + "uai2", + "uan2", + "uang2", + "ui2", + "un2", + "uo2", + "v2", + "van2", + "ve2", + "vn2", + "E3", + "En3", + "a3", + "ai3", + "an3", + "ang3", + "ao3", + "e3", + "ei3", + "en3", + "eng3", + "er3", + "i3", + "i03", + "ia3", + "ian3", + "iang3", + "iao3", + "ie3", + "in3", + "ing3", + "iong3", + "ir3", + "iu3", + "o3", + "ong3", + "ou3", + "u3", + "ua3", + "uai3", + "uan3", + "uang3", + "ui3", + 
"un3", + "uo3", + "v3", + "van3", + "ve3", + "vn3", + "E4", + "En4", + "a4", + "ai4", + "an4", + "ang4", + "ao4", + "e4", + "ei4", + "en4", + "eng4", + "er4", + "i4", + "i04", + "ia4", + "ian4", + "iang4", + "iao4", + "ie4", + "in4", + "ing4", + "iong4", + "ir4", + "iu4", + "o4", + "ong4", + "ou4", + "u4", + "ua4", + "uai4", + "uan4", + "uang4", + "ui4", + "un4", + "uo4", + "v4", + "van4", + "ve4", + "vn4", + "E5", + "En5", + "a5", + "ai5", + "an5", + "ang5", + "ao5", + "e5", + "ei5", + "en5", + "eng5", + "er5", + "i5", + "i05", + "ia5", + "ian5", + "iang5", + "iao5", + "ie5", + "in5", + "ing5", + "iong5", + "ir5", + "iu5", + "o5", + "ong5", + "ou5", + "u5", + "ua5", + "uai5", + "uan5", + "uang5", + "ui5", + "un5", + "uo5", + "v5", + "van5", + "ve5", + "vn5", +] -v_without_tone = ['E', 'En', 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'i0', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'ir', 'iu', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn'] +v_without_tone = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "e", + "ei", + "en", + "eng", + "er", + "i", + "i0", + "ia", + "ian", + "iang", + "iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "o", + "ong", + "ou", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", +] # japanese -ja_symbols = ['I', 'N', 'U', 'a', 'b', 'by', 'ch', 'cl', 'd', 'dy', 'e', 'f', 'g', 'gy', 'h', 'hy', 'i', 'j', 'k', 'ky', - 'm', 'my', 'n', 'ny', 'o', 'p', 'py', 'r', 'ry', 's', 'sh', 't', 'ts', 'u', 'v', 'w', 'y', 'z'] +ja_symbols = [ + "I", + "N", + "U", + "a", + "b", + "by", + "ch", + "cl", + "d", + "dy", + "e", + "f", + "g", + "gy", + "h", + "hy", + "i", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "p", + "py", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "u", + "v", + "w", + "y", + "z", +] -arpa = {'AH0', 'S', 'AH1', 'EY2', 'AE2', 'EH0', 'OW2', 'UH0', 'NG', 'B', 'G', 'AY0', 'M', 
'AA0', 'F', 'AO0', 'ER2', 'UH1', 'IY1', 'AH2', 'DH', 'IY0', 'EY1', 'IH0', 'K', 'N', 'W', 'IY2', 'T', 'AA1', 'ER1', 'EH2', 'OY0', 'UH2', 'UW1', 'Z', 'AW2', 'AW1', 'V', 'UW2', 'AA2', 'ER', 'AW0', 'UW0', 'R', 'OW1', 'EH1', 'ZH', 'AE0', 'IH2', 'IH', 'Y', 'JH', 'P', 'AY1', 'EY0', 'OY2', 'TH', 'HH', 'D', 'ER0', 'CH', 'AO1', 'AE1', 'AO2', 'OY1', 'AY2', 'IH1', 'OW0', 'L', 'SH'} +arpa = { + "AH0", + "S", + "AH1", + "EY2", + "AE2", + "EH0", + "OW2", + "UH0", + "NG", + "B", + "G", + "AY0", + "M", + "AA0", + "F", + "AO0", + "ER2", + "UH1", + "IY1", + "AH2", + "DH", + "IY0", + "EY1", + "IH0", + "K", + "N", + "W", + "IY2", + "T", + "AA1", + "ER1", + "EH2", + "OY0", + "UH2", + "UW1", + "Z", + "AW2", + "AW1", + "V", + "UW2", + "AA2", + "ER", + "AW0", + "UW0", + "R", + "OW1", + "EH1", + "ZH", + "AE0", + "IH2", + "IH", + "Y", + "JH", + "P", + "AY1", + "EY0", + "OY2", + "TH", + "HH", + "D", + "ER0", + "CH", + "AO1", + "AE1", + "AO2", + "OY1", + "AY2", + "IH1", + "OW0", + "L", + "SH", +} symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) symbols = sorted(set(symbols)) -if __name__ == '__main__': - print(len(symbols)) \ No newline at end of file +if __name__ == "__main__": + print(len(symbols)) diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index bf3893f..f987a3f 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -19,51 +19,442 @@ from pypinyin import lazy_pinyin from pypinyin import Style -class ToneSandhi(): +class ToneSandhi: def __init__(self): self.must_neural_tone_words = { - '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', - '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊', - '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去', - '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号', - '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当', - '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻', - '舒服', '舒坦', '舌头', '自在', '膏药', 
'脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂', - '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆', - '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂', - '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿', - '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台', - '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算', - '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨', - '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快', - '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜', - '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔', - '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事', - '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', - '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', - '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实', - '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', - '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', - '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', - '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', - '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', - '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', - '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', - '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', - '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', - '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', - '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', - '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', - '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', - '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', - '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', - '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', 
'大人', '泥鳅', - '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', - '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', - '扫把', '惦记' + "麻烦", + "麻利", + "鸳鸯", + "高粱", + "骨头", + "骆驼", + "马虎", + "首饰", + "馒头", + "馄饨", + "风筝", + "难为", + "队伍", + "阔气", + "闺女", + "门道", + "锄头", + "铺盖", + "铃铛", + "铁匠", + "钥匙", + "里脊", + "里头", + "部分", + "那么", + "道士", + "造化", + "迷糊", + "连累", + "这么", + "这个", + "运气", + "过去", + "软和", + "转悠", + "踏实", + "跳蚤", + "跟头", + "趔趄", + "财主", + "豆腐", + "讲究", + "记性", + "记号", + "认识", + "规矩", + "见识", + "裁缝", + "补丁", + "衣裳", + "衣服", + "衙门", + "街坊", + "行李", + "行当", + "蛤蟆", + "蘑菇", + "薄荷", + "葫芦", + "葡萄", + "萝卜", + "荸荠", + "苗条", + "苗头", + "苍蝇", + "芝麻", + "舒服", + "舒坦", + "舌头", + "自在", + "膏药", + "脾气", + "脑袋", + "脊梁", + "能耐", + "胳膊", + "胭脂", + "胡萝", + "胡琴", + "胡同", + "聪明", + "耽误", + "耽搁", + "耷拉", + "耳朵", + "老爷", + "老实", + "老婆", + "老头", + "老太", + "翻腾", + "罗嗦", + "罐头", + "编辑", + "结实", + "红火", + "累赘", + "糨糊", + "糊涂", + "精神", + "粮食", + "簸箕", + "篱笆", + "算计", + "算盘", + "答应", + "笤帚", + "笑语", + "笑话", + "窟窿", + "窝囊", + "窗户", + "稳当", + "稀罕", + "称呼", + "秧歌", + "秀气", + "秀才", + "福气", + "祖宗", + "砚台", + "码头", + "石榴", + "石头", + "石匠", + "知识", + "眼睛", + "眯缝", + "眨巴", + "眉毛", + "相声", + "盘算", + "白净", + "痢疾", + "痛快", + "疟疾", + "疙瘩", + "疏忽", + "畜生", + "生意", + "甘蔗", + "琵琶", + "琢磨", + "琉璃", + "玻璃", + "玫瑰", + "玄乎", + "狐狸", + "状元", + "特务", + "牲口", + "牙碜", + "牌楼", + "爽快", + "爱人", + "热闹", + "烧饼", + "烟筒", + "烂糊", + "点心", + "炊帚", + "灯笼", + "火候", + "漂亮", + "滑溜", + "溜达", + "温和", + "清楚", + "消息", + "浪头", + "活泼", + "比方", + "正经", + "欺负", + "模糊", + "槟榔", + "棺材", + "棒槌", + "棉花", + "核桃", + "栅栏", + "柴火", + "架势", + "枕头", + "枇杷", + "机灵", + "本事", + "木头", + "木匠", + "朋友", + "月饼", + "月亮", + "暖和", + "明白", + "时候", + "新鲜", + "故事", + "收拾", + "收成", + "提防", + "挖苦", + "挑剔", + "指甲", + "指头", + "拾掇", + "拳头", + "拨弄", + "招牌", + "招呼", + "抬举", + "护士", + "折腾", + "扫帚", + "打量", + "打算", + "打点", + "打扮", + "打听", + "打发", + "扎实", + "扁担", + "戒指", + "懒得", + "意识", + "意思", + "情形", + "悟性", + "怪物", + "思量", + 
"怎么", + "念头", + "念叨", + "快活", + "忙活", + "志气", + "心思", + "得罪", + "张罗", + "弟兄", + "开通", + "应酬", + "庄稼", + "干事", + "帮手", + "帐篷", + "希罕", + "师父", + "师傅", + "巴结", + "巴掌", + "差事", + "工夫", + "岁数", + "屁股", + "尾巴", + "少爷", + "小气", + "小伙", + "将就", + "对头", + "对付", + "寡妇", + "家伙", + "客气", + "实在", + "官司", + "学问", + "学生", + "字号", + "嫁妆", + "媳妇", + "媒人", + "婆家", + "娘家", + "委屈", + "姑娘", + "姐夫", + "妯娌", + "妥当", + "妖精", + "奴才", + "女婿", + "头发", + "太阳", + "大爷", + "大方", + "大意", + "大夫", + "多少", + "多么", + "外甥", + "壮实", + "地道", + "地方", + "在乎", + "困难", + "嘴巴", + "嘱咐", + "嘟囔", + "嘀咕", + "喜欢", + "喇嘛", + "喇叭", + "商量", + "唾沫", + "哑巴", + "哈欠", + "哆嗦", + "咳嗽", + "和尚", + "告诉", + "告示", + "含糊", + "吓唬", + "后头", + "名字", + "名堂", + "合同", + "吆喝", + "叫唤", + "口袋", + "厚道", + "厉害", + "千斤", + "包袱", + "包涵", + "匀称", + "勤快", + "动静", + "动弹", + "功夫", + "力气", + "前头", + "刺猬", + "刺激", + "别扭", + "利落", + "利索", + "利害", + "分析", + "出息", + "凑合", + "凉快", + "冷战", + "冤枉", + "冒失", + "养活", + "关系", + "先生", + "兄弟", + "便宜", + "使唤", + "佩服", + "作坊", + "体面", + "位置", + "似的", + "伙计", + "休息", + "什么", + "人家", + "亲戚", + "亲家", + "交情", + "云彩", + "事情", + "买卖", + "主意", + "丫头", + "丧气", + "两口", + "东西", + "东家", + "世故", + "不由", + "不在", + "下水", + "下巴", + "上头", + "上司", + "丈夫", + "丈人", + "一辈", + "那个", + "菩萨", + "父亲", + "母亲", + "咕噜", + "邋遢", + "费用", + "冤家", + "甜头", + "介绍", + "荒唐", + "大人", + "泥鳅", + "幸福", + "熟悉", + "计划", + "扑腾", + "蜡烛", + "姥爷", + "照顾", + "喉咙", + "吉他", + "弄堂", + "蚂蚱", + "凤凰", + "拖沓", + "寒碜", + "糟蹋", + "倒腾", + "报复", + "逻辑", + "盘缠", + "喽啰", + "牢骚", + "咖喱", + "扫把", + "惦记", } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" + "男子", + "女子", + "分子", + "原子", + "量子", + "莲子", + "石子", + "瓜子", + "电子", + "人人", + "虎虎", } self.punc = ":,;。?!“”‘’':,;.?!" 
@@ -72,14 +463,15 @@ class ToneSandhi(): # word: "家里" # pos: "s" # finals: ['ia1', 'i3'] - def _neural_sandhi(self, word: str, pos: str, - finals: List[str]) -> List[str]: - + def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[0] in { - "n", "v", "a" - } and word not in self.must_not_neural_tone_words: + if ( + j - 1 >= 0 + and item == word[j - 1] + and pos[0] in {"n", "v", "a"} + and word not in self.must_not_neural_tone_words + ): finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": @@ -89,9 +481,12 @@ class ToneSandhi(): # e.g. 走了, 看着, 去过 elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: finals[-1] = finals[-1][:-1] + "5" - elif len(word) > 1 and word[-1] in "们子" and pos in { - "r", "n" - } and word not in self.must_not_neural_tone_words: + elif ( + len(word) > 1 + and word[-1] in "们子" + and pos in {"r", "n"} + and word not in self.must_not_neural_tone_words + ): finals[-1] = finals[-1][:-1] + "5" # e.g. 
桌上, 地下, 家里 elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: @@ -100,21 +495,26 @@ class ToneSandhi(): elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": finals[-1] = finals[-1][:-1] + "5" # 个做量词 - elif (ge_idx >= 1 and - (word[ge_idx - 1].isnumeric() or - word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个': + elif ( + ge_idx >= 1 + and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ) or word == "个": finals[ge_idx] = finals[ge_idx][:-1] + "5" else: - if word in self.must_neural_tone_words or word[ - -2:] in self.must_neural_tone_words: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): finals[-1] = finals[-1][:-1] + "5" word_list = self._split_word(word) - finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]] + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] for i, word in enumerate(word_list): # conventional neural in Chinese - if word in self.must_neural_tone_words or word[ - -2:] in self.must_neural_tone_words: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): finals_list[i][-1] = finals_list[i][-1][:-1] + "5" finals = sum(finals_list, []) return finals @@ -126,15 +526,15 @@ class ToneSandhi(): else: for i, char in enumerate(word): # "不" before tone4 should be bu2, e.g. 不怕 - if char == "不" and i + 1 < len(word) and finals[i + - 1][-1] == "4": + if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": finals[i] = finals[i][:-1] + "2" return finals def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: # "一" in number sequences, e.g. 一零零, 二一零 if word.find("一") != -1 and all( - [item.isnumeric() for item in word if item != "一"]): + [item.isnumeric() for item in word if item != "一"] + ): return finals # "一" between reduplication words shold be yi5, e.g. 
看一看 elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: @@ -161,10 +561,10 @@ class ToneSandhi(): first_subword = word_list[0] first_begin_idx = word.find(first_subword) if first_begin_idx == 0: - second_subword = word[len(first_subword):] + second_subword = word[len(first_subword) :] new_word_list = [first_subword, second_subword] else: - second_subword = word[:-len(first_subword)] + second_subword = word[: -len(first_subword)] new_word_list = [second_subword, first_subword] return new_word_list @@ -182,18 +582,19 @@ class ToneSandhi(): elif len(word_list[0]) == 1: finals[1] = finals[1][:-1] + "2" else: - finals_list = [ - finals[:len(word_list[0])], finals[len(word_list[0]):] - ] + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] if len(finals_list) == 2: for i, sub in enumerate(finals_list): # e.g. 所有/人 if self._all_tone_three(sub) and len(sub) == 2: finals_list[i][0] = finals_list[i][0][:-1] + "2" # e.g. 好/喜欢 - elif i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and \ - finals_list[0][-1][-1] == "3": - + elif ( + i == 1 + and not self._all_tone_three(sub) + and finals_list[i][0][-1] == "3" + and finals_list[0][-1][-1] == "3" + ): finals_list[0][-1] = finals_list[0][-1][:-1] + "2" finals = sum(finals_list, []) # split idiom into two words who's length is 2 @@ -222,7 +623,7 @@ class ToneSandhi(): new_seg.append((word, pos)) last_word = word[:] if last_word == "不": - new_seg.append((last_word, 'd')) + new_seg.append((last_word, "d")) last_word = "" return new_seg @@ -236,12 +637,21 @@ class ToneSandhi(): new_seg = [] # function 1 for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ - 0] == seg[i + 1][0] and seg[i - 1][1] == "v": + if ( + i - 1 >= 0 + and word == "一" + and i + 1 < len(seg) + and seg[i - 1][0] == seg[i + 1][0] + and seg[i - 1][1] == "v" + ): new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] else: - if i - 2 >= 0 and 
seg[i - 1][0] == "一" and seg[i - 2][ - 0] == word and pos == "v": + if ( + i - 2 >= 0 + and seg[i - 1][0] == "一" + and seg[i - 2][0] == word + and pos == "v" + ): continue else: new_seg.append([word, pos]) @@ -257,22 +667,27 @@ class ToneSandhi(): # the first and the second words are all_tone_three def _merge_continuous_three_tones( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and self._all_tone_three( - sub_finals_list[i - 1]) and self._all_tone_three( - sub_finals_list[i]) and not merge_last[i - 1]: + if ( + i - 1 >= 0 + and self._all_tone_three(sub_finals_list[i - 1]) + and self._all_tone_three(sub_finals_list[i]) + and not merge_last[i - 1] + ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len( - seg[i - 1][0]) + len(seg[i][0]) <= 3: + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -287,21 +702,27 @@ class ToneSandhi(): # the last char of first word and the first char of second word is tone_three def _merge_continuous_three_tones_2( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] 
* len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \ - merge_last[i - 1]: + if ( + i - 1 >= 0 + and sub_finals_list[i - 1][-1][-1] == "3" + and sub_finals_list[i][0][-1] == "3" + and not merge_last[i - 1] + ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len( - seg[i - 1][0]) + len(seg[i][0]) <= 3: + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -313,14 +734,13 @@ class ToneSandhi(): def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and word == "儿" and seg[i-1][0] != "#": + if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": new_seg[-1][0] = new_seg[-1][0] + seg[i][0] else: new_seg.append([word, pos]) return new_seg - def _merge_reduplication( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] for i, (word, pos) in enumerate(seg): if new_seg and word == new_seg[-1][0]: @@ -329,8 +749,7 @@ class ToneSandhi(): new_seg.append([word, pos]) return new_seg - def pre_merge_for_modify( - self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: seg = self._merge_bu(seg) try: seg = self._merge_yi(seg) @@ -349,8 +768,7 @@ class ToneSandhi(): seg = self._merge_er(seg) return seg - def modified_tone(self, word: str, pos: str, - finals: List[str]) -> List[str]: + def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) 
From fd4d0ea444b5b4a3d73eec437135cbb96d25edaa Mon Sep 17 00:00:00 2001 From: spicysama <122108331+AnyaCoder@users.noreply.github.com> Date: Wed, 17 Jan 2024 01:52:11 +0800 Subject: [PATCH 11/58] Update README.md for Ubuntu Ubuntu 22.04 LTS reported: >>> OSError: libsox.so: cannot open shared object file: No such file or directory >>> RuntimeError: Error in dlopen: libavutil.so.58: cannot open shared object file: No such file or directory >>> DEBUG:torchaudio._extension.utils:Attempting to load FFmpeg version 6, 5, 4. not found --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9a7486e..efa67dc 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ pip install modelscope torchaudio sentencepiece funasr ```bash sudo apt install ffmpeg +sudo apt install libsox-dev +conda install -c conda-forge 'ffmpeg<7' ``` #### MacOS Users @@ -115,4 +117,4 @@ Special thanks to the following projects and contributors: - [audio-slicer](https://github.com/openvpi/audio-slicer) - [SubFix](https://github.com/cronrpc/SubFix) - [FFmpeg](https://github.com/FFmpeg/FFmpeg) -- [gradio](https://github.com/gradio-app/gradio) \ No newline at end of file +- [gradio](https://github.com/gradio-app/gradio) From e3a008dbea83c8bc134114b7d0ed0f20386b0d63 Mon Sep 17 00:00:00 2001 From: spicysama <122108331+AnyaCoder@users.noreply.github.com> Date: Wed, 17 Jan 2024 02:55:54 +0800 Subject: [PATCH 12/58] FIx: cannot identify one class to a dict(needed) To implement recursive construction while retaining the characteristics of the original dictionary, we can slightly modify the DictToAttrRecursive class. This allows each object to retain its characteristics as a dictionary while accessing keys and values as attributes. 
--- GPT_SoVITS/inference_webui.py | 50 +++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4917d32..9018dc1 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -10,25 +10,15 @@ if("_CUDA_VISIBLE_DEVICES"in os.environ): is_half=eval(os.environ.get("is_half","True")) import gradio as gr from transformers import AutoModelForMaskedLM, AutoTokenizer -import sys,torch,numpy as np -from pathlib import Path -import os,pdb,utils,librosa,math,traceback,requests,argparse,torch,multiprocessing,pandas as pd,torch.multiprocessing as mp,soundfile -# torch.backends.cuda.sdp_kernel("flash") -# torch.backends.cuda.enable_flash_sdp(True) -# torch.backends.cuda.enable_mem_efficient_sdp(True) # Not avaliable if torch version is lower than 2.0 -# torch.backends.cuda.enable_math_sdp(True) -from random import shuffle -from AR.utils import get_newest_ckpt -from glob import glob -from tqdm import tqdm +import numpy as np +import librosa,torch from feature_extractor import cnhubert cnhubert.cnhubert_base_path=cnhubert_base_path -from io import BytesIO + from module.models import SynthesizerTrn from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from AR.utils.io import load_yaml_config from text import cleaned_text_to_sequence -from text.cleaner import text_to_sequence, clean_text +from text.cleaner import clean_text from time import time as ttime from module.mel_processing import spectrogram_torch from my_utils import load_audio @@ -58,16 +48,36 @@ def get_bert_feature(text, word2ph): n_semantic = 1024 dict_s2=torch.load(sovits_path,map_location="cpu") hps=dict_s2["config"] -class DictToAttrRecursive: + +class DictToAttrRecursive(dict): def __init__(self, input_dict): + super().__init__(input_dict) for key, value in input_dict.items(): if isinstance(value, dict): - # 如果值是字典,递归调用构造函数 - setattr(self, key, 
DictToAttrRecursive(value)) - else: - setattr(self, key, value) + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") hps = DictToAttrRecursive(hps) + hps.model.semantic_frame_rate="25hz" dict_s1=torch.load(gpt_path,map_location="cpu") config=dict_s1["config"] @@ -269,4 +279,4 @@ app.queue(concurrency_count=511, max_size=1022).launch( inbrowser=True, server_port=infer_ttswebui, quiet=True, -) \ No newline at end of file +) From b8c4cbd9004b139e6cf6f3364ba174d9451aa3ba Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Jan 2024 08:48:27 +0800 Subject: [PATCH 13/58] Create install.sh --- install.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 install.sh diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..7417622 --- /dev/null +++ b/install.sh @@ -0,0 +1,4 @@ +#!/bin/bash +conda install gcc gxx ffmpeg cmake +conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia +pip install -r requirements.txt From 88191fe20268a5c4e15a6b77a6c78052db54a115 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Jan 2024 08:50:06 +0800 Subject: [PATCH 14/58] Update requirements.txt 1.tqdm==4.59.0 may have dependency issues 2.missing requirement chardet --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5ab846f..3b5c195 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ pytorch-lightning gradio==3.14.0 ffmpeg-python 
onnxruntime -tqdm==4.59.0 +tqdm funasr cn2an pypinyin @@ -17,3 +17,4 @@ torchaudio modelscope sentencepiece transformers +chardet From d54a55b6809e535fa3f77dce54a796cf0657baa3 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Jan 2024 08:55:32 +0800 Subject: [PATCH 15/58] Update README.md --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9a7486e..4c00b4f 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- Tested with Python 3.9, PyTorch 2.0.1, and CUDA 11. +### Quick Install with Conda +```bash +conda create -n GPTSoVits python=3.9 +conda activate GPTSoVits +bash install.sh +``` + ### Pip Packages ```bash -pip install torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm==4.59.0 cn2an pypinyin pyopenjtalk g2p_en +pip install torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet ``` ### Additional Requirements @@ -115,4 +122,4 @@ Special thanks to the following projects and contributors: - [audio-slicer](https://github.com/openvpi/audio-slicer) - [SubFix](https://github.com/cronrpc/SubFix) - [FFmpeg](https://github.com/FFmpeg/FFmpeg) -- [gradio](https://github.com/gradio-app/gradio) \ No newline at end of file +- [gradio](https://github.com/gradio-app/gradio) From 7e533c6995523748026db98584cdf9855c636368 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Jan 2024 08:58:19 +0800 Subject: [PATCH 16/58] Update install.sh --- install.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 7417622..8dadb26 100644 --- a/install.sh +++ b/install.sh @@ -1,4 +1,6 @@ #!/bin/bash -conda install gcc gxx ffmpeg cmake +conda install -c conda-forge gcc +conda install -c conda-forge gxx +conda install 
ffmpeg cmake conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia pip install -r requirements.txt From e78fd0de96fa319c253d4e194afc3916de778b50 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Jan 2024 08:59:38 +0800 Subject: [PATCH 17/58] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c00b4f..ee9cf5d 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- Tested with Python 3.9, PyTorch 2.0.1, and CUDA 11. -### Quick Install with Conda +#### Quick Install with Conda + ```bash conda create -n GPTSoVits python=3.9 conda activate GPTSoVits From d78c872047b8286ddeafc69a09243f8d2f22b859 Mon Sep 17 00:00:00 2001 From: Alex Z Date: Wed, 17 Jan 2024 09:01:36 +0800 Subject: [PATCH 18/58] Update README.md --- README.md | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ee9cf5d..36a1360 100644 --- a/README.md +++ b/README.md @@ -38,21 +38,21 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- Tested with Python 3.9, PyTorch 2.0.1, and CUDA 11. 
-#### Quick Install with Conda +### Quick Install with Conda ```bash conda create -n GPTSoVits python=3.9 conda activate GPTSoVits bash install.sh ``` - -### Pip Packages +### Install Manually +#### Pip Packages ```bash pip install torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet ``` -### Additional Requirements +#### Additional Requirements If you need Chinese ASR (supported by FunASR), install: @@ -60,21 +60,26 @@ If you need Chinese ASR (supported by FunASR), install: pip install modelscope torchaudio sentencepiece funasr ``` -### FFmpeg +#### FFmpeg -#### Ubuntu/Debian Users +##### Conda Users +```bash +conda install ffmpeg +``` + +##### Ubuntu/Debian Users ```bash sudo apt install ffmpeg ``` -#### MacOS Users +##### MacOS Users ```bash brew install ffmpeg ``` -#### Windows Users +##### Windows Users Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. 
From bac65693c65bc6f936b4c1ef3cc5526d794184fe Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 16:17:34 +0800 Subject: [PATCH 19/58] Update config.py --- config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/config.py b/config.py index 2333646..3ca2d1d 100644 --- a/config.py +++ b/config.py @@ -1,11 +1,9 @@ -import platform - is_half=True exp_root="logs" -python_exec="runtime\python"if platform.system()=="Windows"else "python" +python_exec=sys.executable or "python" infer_device="cuda" webui_port_main=9874 webui_port_uvr5=9873 webui_port_infer_tts=9872 -webui_port_subfix=9871 \ No newline at end of file +webui_port_subfix=9871 From 318ebe5d9f33dc18633ca9bb17df0c827a5efc5d Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 16:17:43 +0800 Subject: [PATCH 20/58] Update config.py --- config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/config.py b/config.py index 3ca2d1d..8e9721a 100644 --- a/config.py +++ b/config.py @@ -1,3 +1,4 @@ +import sys is_half=True exp_root="logs" python_exec=sys.executable or "python" From 3ae64b3ac1ad3e914ece81e022262e09ea274069 Mon Sep 17 00:00:00 2001 From: Rice Cake Date: Wed, 17 Jan 2024 16:41:48 +0800 Subject: [PATCH 21/58] Update README.md --- README.md | 88 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index d1b97d9..de4dd57 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,31 @@ -# GPT-SoVITS - Voice Conversion and Text-to-Speech WebUI +
-## Demo Video and Features +

GPT-SoVITS

+A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI based on VITS.

-Check out our demo video in Chinese: [Bilibili Demo](https://www.bilibili.com/video/BV12g4y1m7Uw/) +[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange +)](https://github.com/RVC-Boss/GPT-SoVITS) + +
+ +[![Licence](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) +[![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) + +[![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk) + +[**English**](./README.md) | [**中文简体**](./docs/cn/README.md) + +
+ +------ + + + +> Check out our [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw) here! https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb -### Features: - +## Features: 1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion. 2. **Few-shot TTS:** Fine-tune the model with just 1 minute of training data for improved voice similarity and realism. @@ -16,27 +34,11 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- 4. **WebUI Tools:** Integrated tools include voice accompaniment separation, automatic training set segmentation, Chinese ASR, and text labeling, assisting beginners in creating training datasets and GPT/SoVITS models. -## Todo List - -0. **High Priority:** - - Localization in Japanese and English. - - User guide. - -1. **Features:** - - Zero-shot voice conversion (5s) / few-shot voice conversion (1min). - - TTS speaking speed control. - - Enhanced TTS emotion control. - - Experiment with changing SoVITS token inputs to probability distribution of vocabs. - - Improve English and Japanese text frontend. - - Develop tiny and larger-sized TTS models. - - Colab scripts. - - Expand training dataset (2k -> 10k). - -## Requirements (How to Install) +## Environment Preparation ### Python and PyTorch Version -Tested with Python 3.9, PyTorch 2.0.1, and CUDA 11. +Tested with Python 3.9, PyTorch 2.0.1, and CUDA 11. ### Quick Install with Conda @@ -85,14 +87,6 @@ brew install ffmpeg Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. -### Pretrained Models - -Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS\pretrained_models`. 
- -For Chinese ASR, download models from [Damo ASR Models](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files) and place them in `tools/damo_asr/models`. - -For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. - ## Dataset Format The TTS annotation .list file format: @@ -101,18 +95,33 @@ The TTS annotation .list file format: vocal_path|speaker_name|language|text ``` -Example: - -``` -D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. -``` - Language dictionary: - 'zh': Chinese - 'ja': Japanese - 'en': English +Example: + +``` +D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. +``` +## Todo List + +0. **High Priority:** + - Localization in Japanese and English. + - User guide. + +1. **Features:** + - Zero-shot voice conversion (5s) / few-shot voice conversion (1min). + - TTS speaking speed control. + - Enhanced TTS emotion control. + - Experiment with changing SoVITS token inputs to probability distribution of vocabs. + - Improve English and Japanese text frontend. + - Develop tiny and larger-sized TTS models. + - Colab scripts. + - Expand training dataset (2k -> 10k). 
+ ## Credits Special thanks to the following projects and contributors: @@ -131,3 +140,8 @@ Special thanks to the following projects and contributors: - [SubFix](https://github.com/cronrpc/SubFix) - [FFmpeg](https://github.com/FFmpeg/FFmpeg) - [gradio](https://github.com/gradio-app/gradio) + +## Thanks to all contributors for their efforts + + + From 567f633cdf5ea6e6c4c7f53516fb9fa6dfdff43d Mon Sep 17 00:00:00 2001 From: Rice Cake Date: Wed, 17 Jan 2024 16:49:15 +0800 Subject: [PATCH 22/58] Update README.md --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index de4dd57..4aee3e6 100644 --- a/README.md +++ b/README.md @@ -108,19 +108,19 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. ``` ## Todo List -0. **High Priority:** - - Localization in Japanese and English. - - User guide. +- [ ] **High Priority:** + - [ ] Localization in Japanese and English. + - [ ] User guide. -1. **Features:** - - Zero-shot voice conversion (5s) / few-shot voice conversion (1min). - - TTS speaking speed control. - - Enhanced TTS emotion control. - - Experiment with changing SoVITS token inputs to probability distribution of vocabs. - - Improve English and Japanese text frontend. - - Develop tiny and larger-sized TTS models. - - Colab scripts. - - Expand training dataset (2k -> 10k). +- [ ] **Features:** + - [ ] Zero-shot voice conversion (5s) / few-shot voice conversion (1min). + - [ ] TTS speaking speed control. + - [ ] Enhanced TTS emotion control. + - [ ] Experiment with changing SoVITS token inputs to probability distribution of vocabs. + - [ ] Improve English and Japanese text frontend. + - [ ] Develop tiny and larger-sized TTS models. + - [ ] Colab scripts. + - [ ] Expand training dataset (2k -> 10k). 
## Credits From 021a65064316189db0eaac29dadf66a5a74a6dde Mon Sep 17 00:00:00 2001 From: Rice Cake Date: Wed, 17 Jan 2024 17:04:54 +0800 Subject: [PATCH 23/58] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 4aee3e6..e930814 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,6 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI based on VITS.
[![Licence](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) -[![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk) - [**English**](./README.md) | [**中文简体**](./docs/cn/README.md) From 180991c2bf2c2a9d3a0af9dba3d6b2b7a8b865f2 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:30:18 +0800 Subject: [PATCH 24/58] Update README.md --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e930814..60b1f46 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@
-

GPT-SoVITS

-A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI based on VITS.

+

GPT-SoVITS-WebUI

+A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange )](https://github.com/RVC-Boss/GPT-SoVITS) @@ -34,6 +34,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ## Environment Preparation +If you are windows users (tested with win>=10), you don't need read this part. Just download the [Integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main), unzip it and double-click go-webui.bat to start GPT-SoVITS-WebUI. + ### Python and PyTorch Version Tested with Python 3.9, PyTorch 2.0.1, and CUDA 11. @@ -85,6 +87,16 @@ brew install ffmpeg Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. +### Pretrained Models + + +Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS\pretrained_models`. + +For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/damo_asr/models`. + +For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. 
+ + ## Dataset Format The TTS annotation .list file format: From ae9a31a241643a81543518143a80aa1e31983224 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:41:32 +0800 Subject: [PATCH 25/58] Delete tools/init --- tools/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tools/init diff --git a/tools/init b/tools/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/init +++ /dev/null @@ -1 +0,0 @@ - From affaf181aa5f9e95f4c8c2d73c3854974e1c3c6c Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:41:38 +0800 Subject: [PATCH 26/58] Delete tools/uvr5/init --- tools/uvr5/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tools/uvr5/init diff --git a/tools/uvr5/init b/tools/uvr5/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/uvr5/init +++ /dev/null @@ -1 +0,0 @@ - From 37629ad7b351dbfeae4348d6e74e3ee8791ecbf9 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:41:45 +0800 Subject: [PATCH 27/58] Delete GPT_SoVITS/pretrained_models/init --- GPT_SoVITS/pretrained_models/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 GPT_SoVITS/pretrained_models/init diff --git a/GPT_SoVITS/pretrained_models/init b/GPT_SoVITS/pretrained_models/init deleted file mode 100644 index 8b13789..0000000 --- a/GPT_SoVITS/pretrained_models/init +++ /dev/null @@ -1 +0,0 @@ - From f1afa6b1600b5a409435b91fd29a18de05367056 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:41:50 +0800 Subject: [PATCH 28/58] Delete GPT_SoVITS/init --- GPT_SoVITS/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 GPT_SoVITS/init diff --git a/GPT_SoVITS/init b/GPT_SoVITS/init deleted file mode 100644 index 8b13789..0000000 --- a/GPT_SoVITS/init +++ /dev/null @@ -1 +0,0 @@ - From a55eb3faa0c34f9b3b9d18e191f3cd7f4f03fe92 
Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:42:21 +0800 Subject: [PATCH 29/58] Delete tools/damo_asr/models/init --- tools/damo_asr/models/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tools/damo_asr/models/init diff --git a/tools/damo_asr/models/init b/tools/damo_asr/models/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/damo_asr/models/init +++ /dev/null @@ -1 +0,0 @@ - From 90f940ed91c47e5eafba3c9dfbd30814e549a1ca Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:42:25 +0800 Subject: [PATCH 30/58] Delete tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/init --- tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/init diff --git a/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/init b/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/init deleted file mode 100644 index 8b13789..0000000 --- a/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/init +++ /dev/null @@ -1 +0,0 @@ - From cc632b985d96cee9ccf844e1919b36759173e1a3 Mon Sep 17 00:00:00 2001 From: spicysama <122108331+AnyaCoder@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:43:32 +0800 Subject: [PATCH 31/58] Update dataset.py pandas csv file doesn't have keys called "item_name", "sematic_text",update a method "iloc". which is more accurate. 
--- GPT_SoVITS/AR/data/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py index 47adacc..b1ea69e 100644 --- a/GPT_SoVITS/AR/data/dataset.py +++ b/GPT_SoVITS/AR/data/dataset.py @@ -116,6 +116,7 @@ class Text2SemanticDataset(Dataset): phoneme_data_len = len(self.phoneme_data.keys()) print("semantic_data_len:", semantic_data_len) print("phoneme_data_len:", phoneme_data_len) + print(self.semantic_data) idx = 0 num_not_in = 0 num_deleted_bigger = 0 @@ -123,7 +124,7 @@ class Text2SemanticDataset(Dataset): for i in range(semantic_data_len): # 先依次遍历 # get str - item_name = self.semantic_data["item_name"][i] + item_name = self.semantic_data.iloc[i,0] # print(self.phoneme_data) try: phoneme, word2ph, text = self.phoneme_data[item_name] @@ -133,7 +134,7 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue - semantic_str = self.semantic_data["semantic_audio"][i] + semantic_str = self.semantic_data.iloc[i,1] # get token list semantic_ids = [int(idx) for idx in semantic_str.split(" ")] # (T), 是否需要变成 (1, T) -> 不需要,因为需要求 len From 5bbbd0efcf669bcb81fdb555087c9338e8f0db85 Mon Sep 17 00:00:00 2001 From: Erythrocyte3803 <2544390577@qq.com> Date: Wed, 17 Jan 2024 21:00:37 +0900 Subject: [PATCH 32/58] reupload missing language file --- i18n/locale/en_US.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/es_ES.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/fr_FR.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/it_IT.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/ja_JP.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/ru_RU.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/tr_TR.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/zh_CN.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/zh_HK.json | 135 +++++++++++++++++++++++++++++++++++++++++ 
i18n/locale/zh_SG.json | 135 +++++++++++++++++++++++++++++++++++++++++ i18n/locale/zh_TW.json | 135 +++++++++++++++++++++++++++++++++++++++++ 11 files changed, 1485 insertions(+) create mode 100644 i18n/locale/en_US.json create mode 100644 i18n/locale/es_ES.json create mode 100644 i18n/locale/fr_FR.json create mode 100644 i18n/locale/it_IT.json create mode 100644 i18n/locale/ja_JP.json create mode 100644 i18n/locale/ru_RU.json create mode 100644 i18n/locale/tr_TR.json create mode 100644 i18n/locale/zh_CN.json create mode 100644 i18n/locale/zh_HK.json create mode 100644 i18n/locale/zh_SG.json create mode 100644 i18n/locale/zh_TW.json diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json new file mode 100644 index 0000000..d585505 --- /dev/null +++ b/i18n/locale/en_US.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.", + "A模型权重": "Weight (w) for Model A:", + "A模型路径": "Path to Model A:", + "B模型路径": "Path to Model B:", + "E:\\语音音频+标注\\米津玄师\\src": "C:\\Users\\Desktop\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 curve file (optional). One pitch per line. Replaces the default F0 and pitch modulation:", + "Index Rate": "Index Rate", + "Onnx导出": "Export Onnx", + "Onnx输出路径": "Onnx Export Path:", + "RVC模型路径": "RVC Model Path:", + "ckpt处理": "ckpt Processing", + "harvest进程数": "Number of CPU processes used for harvest pitch algorithm", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Enter the GPU index(es) separated by '-', e.g., 0-0-1 to use 2 processes in GPU0 and 1 process in GPU1", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Step 1: Fill in the experimental configuration. Experimental data is stored in the 'logs' folder, with each experiment having a separate folder. 
Manually enter the experiment name path, which contains the experimental configuration, logs, and trained model files.", + "step1:正在处理数据": "Step 1: Processing data", + "step2:正在提取音高&正在提取特征": "step2:Pitch extraction & feature extraction", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Step 2b: Use CPU to extract pitch (if the model has pitch), use GPU to extract features (select GPU index):", + "step3: 填写训练设置, 开始训练模型和索引": "Step 3: Fill in the training settings and start training the model and index", + "step3a:正在训练模型": "Step 3a: Model training started", + "一键训练": "One-click training", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Multiple audio files can also be imported. If a folder path exists, this input is ignored.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch processing for vocal accompaniment separation using the UVR5 model.
Example of a valid folder path format: D:\\path\\to\\input\\folder (copy it from the file manager address bar).
The model is divided into three categories:
1. Preserve vocals: Choose this option for audio without harmonies. It preserves vocals better than HP5. It includes two built-in models: HP2 and HP3. HP3 may slightly leak accompaniment but preserves vocals slightly better than HP2.
2. Preserve main vocals only: Choose this option for audio with harmonies. It may weaken the main vocals. It includes one built-in model: HP5.
3. De-reverb and de-delay models (by FoxJoy):
  (1) MDX-Net: The best choice for stereo reverb removal but cannot remove mono reverb;
 (234) DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverb and can remove mono reverb, but not very effectively for heavily reverberated high-frequency content.
De-reverb/de-delay notes:
1. The processing time for the DeEcho-DeReverb model is approximately twice as long as the other two DeEcho models.
2. The MDX-Net-Dereverb model is quite slow.
3. The recommended cleanest configuration is to apply MDX-Net first and then DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Enter the GPU index(es) separated by '-', e.g., 0-1-2 to use GPU 0, 1, and 2:", + "伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Save name:", + "保存的文件名, 默认空为和源文件同名": "Save file name (default: same as the source file):", + "保存的模型名不带后缀": "Saved model name (without extension):", + "保存频率save_every_epoch": "Save frequency (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy:", + "修改": "Modify", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modify model information (only supported for small model files extracted from the 'weights' folder)", + "停止音频转换": "Stop audio conversion", + "全流程结束!": "All processes have been completed!", + "刷新音色列表和索引路径": "Refresh voice list and index path", + "加载模型": "Load model", + "加载预训练底模D路径": "Load pre-trained base model D path:", + "加载预训练底模G路径": "Load pre-trained base model G path:", + "单次推理": "Single Inference", + "卸载音色省显存": "Unload voice to save GPU memory:", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. 
Set to 0 for no resampling:", + "否": "No", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Response threshold", + "响度因子": "loudness factor", + "处理数据": "Process data", + "导出Onnx模型": "Export Onnx Model", + "导出文件格式": "Export file format", + "常见问题解答": "FAQ (Frequently Asked Questions)", + "常规设置": "General settings", + "开始音频转换": "Start audio conversion", + "很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.", + "性能设置": "Performance settings", + "总训练轮数total_epoch": "Total training epochs (total_epoch):", + "批量推理": "Batch Inference", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').", + "指定输出主人声文件夹": "Specify the output folder for vocals:", + "指定输出文件夹": "Specify output folder:", + "指定输出非主人声文件夹": "Specify the output folder for accompaniment:", + "推理时间(ms):": "Inference time (ms):", + "推理音色": "Inferencing voice:", + "提取": "Extract", + "提取音高和处理数据使用的CPU进程数": "Number of CPU processes used for pitch extraction and data processing:", + "是": "Yes", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Save only the latest '.ckpt' file to save disk space:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Save a small final model to the 'weights' folder at each save point:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Cache all training sets to GPU memory. Caching small datasets (less than 10 minutes) can speed up training, but caching large datasets will consume a lot of GPU memory and may not provide much speed improvement:", + "显卡信息": "GPU Information", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "This software is open source under the MIT license. The author does not have any control over the software. Users who use the software and distribute the sounds exported by the software are solely responsible.
If you do not agree with this clause, you cannot use or reference any codes and files within the software package. See the root directory Agreement-LICENSE.txt for details.", + "查看": "View", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "View model information (only supported for small model files extracted from the 'weights' folder)", + "检索特征占比": "Search feature ratio (controls accent strength, too high has artifacting):", + "模型": "Model", + "模型推理": "Model Inference", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model extraction (enter the path of the large file model under the 'logs' folder). This is useful if you want to stop training halfway and manually extract and save a small model file, or if you want to test an intermediate model:", + "模型是否带音高指导": "Whether the model has pitch guidance:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Whether the model has pitch guidance (required for singing, optional for speech):", + "模型是否带音高指导,1是0否": "Whether the model has pitch guidance (1: yes, 0: no):", + "模型版本型号": "Model architecture version:", + "模型融合, 可用于测试音色融合": "Model fusion, can be used to test timbre fusion", + "模型路径": "Path to Model:", + "每张显卡的batch_size": "Batch size per GPU:", + "淡入淡出长度": "Fade length", + "版本": "Version", + "特征提取": "Feature extraction", + "特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. 
If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.", + "目标采样率": "Target sample rate:", + "算法延迟(ms):": "Algorithmic delays(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:", + "融合": "Fusion", + "要改的模型信息": "Model information to be modified:", + "要置入的模型信息": "Model information to be placed:", + "训练": "Train", + "训练模型": "Train model", + "训练特征索引": "Train feature index", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.", + "请指定说话人id": "Please specify the speaker/singer ID:", + "请选择index文件": "Please choose the .index file", + "请选择pth文件": "Please choose the .pth file", + "请选择说话人id": "Select Speaker/Singer ID:", + "转换": "Convert", + "输入实验名": "Enter the experiment name:", + "输入待处理音频文件夹路径": "Enter the path of the audio folder to be processed:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):", + "输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. 
Closer to 1 will be more of a consistently loud volume:", + "输入监听": "Input voice monitor", + "输入训练文件夹路径": "Enter the path of the training folder:", + "输入设备": "Input device", + "输入降噪": "Input noise reduction", + "输出信息": "Output information", + "输出变声": "Output converted voice", + "输出设备": "Output device", + "输出降噪": "Output noise reduction", + "输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)", + "选择.index文件": "Select the .index file", + "选择.pth文件": "Select the .pth file", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 
'rmvpe' has the best results and consumes less CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Sample length", + "重载设备列表": "Reload device list", + "音调设置": "Pitch settings", + "音频设备(请使用同种类驱动)": "Audio device (please use the same type of driver)", + "音高算法": "pitch detection algorithm", + "额外推理时长": "Extra inference time" +} diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json new file mode 100644 index 0000000..08b8176 --- /dev/null +++ b/i18n/locale/es_ES.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si es >=3, entonces use el resultado del reconocimiento de tono de 'harvest' con filtro de mediana, el valor es el radio del filtro, su uso puede debilitar el sonido sordo", + "A模型权重": "Un peso modelo para el modelo A.", + "A模型路径": "Modelo A ruta.", + "B模型路径": "Modelo B ruta.", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Archivo de curva F0, opcional, un tono por línea, en lugar de F0 predeterminado y cambio de tono", + "Index Rate": "Tasa de índice", + "Onnx导出": "Exportar Onnx", + "Onnx输出路径": "Ruta de salida Onnx", + "RVC模型路径": "Ruta del modelo RVC", + "ckpt处理": "Procesamiento de recibos", + "harvest进程数": "Número de procesos", + "index文件路径不可包含中文": "La ruta del archivo .index no debe contener caracteres chinos.", + "pth文件路径不可包含中文": "La ruta del archivo .pth no debe contener caracteres chinos.", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Separe los números de identificación de la GPU con '-' al ingresarlos. Por ejemplo, '0-1-2' significa usar GPU 0, GPU 1 y GPU 2.", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Paso 1: Complete la configuración del experimento. Los datos del experimento se almacenan en el directorio 'logs', con cada experimento en una carpeta separada. 
La ruta del nombre del experimento debe ingresarse manualmente y debe contener la configuración del experimento, los registros y los archivos del modelo entrenado.", + "step1:正在处理数据": "Paso 1: Procesando datos", + "step2:正在提取音高&正在提取特征": "Paso 2: Extracción del tono y extracción de características", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Paso 2a: Recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. Actualmente, solo se admite la capacitación de una sola persona.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Paso 2b: Use la CPU para extraer el tono (si el modelo tiene guía de tono) y la GPU para extraer características (seleccione el número de tarjeta).", + "step3: 填写训练设置, 开始训练模型和索引": "Paso 3: Complete la configuración de entrenamiento y comience a entrenar el modelo y el índice.", + "step3a:正在训练模型": "Paso 3a: Entrenando el modelo", + "一键训练": "Entrenamiento con un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden importar varios archivos de audio. Si existe una ruta de carpeta, esta entrada se ignora.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Procesamiento por lotes para la separación de acompañamiento vocal utilizando el modelo UVR5.
Ejemplo de formato de ruta de carpeta válido: D:\\ruta\\a\\la\\carpeta\\de\\entrada (copiar desde la barra de direcciones del administrador de archivos).
El modelo se divide en tres categorías:
1. Preservar voces: Elija esta opción para audio sin armonías. Preserva las voces mejor que HP5. Incluye dos modelos incorporados: HP2 y HP3. HP3 puede filtrar ligeramente el acompañamiento pero conserva las voces un poco mejor que HP2.
2. Preservar solo voces principales: Elija esta opción para audio con armonías. Puede debilitar las voces principales. Incluye un modelo incorporado: HP5.
3. Modelos de des-reverberación y des-retardo (por FoxJoy):
  (1) MDX-Net: La mejor opción para la eliminación de reverberación estéreo pero no puede eliminar la reverberación mono;
 (234) DeEcho: Elimina efectos de retardo. El modo Agresivo elimina más a fondo que el modo Normal. DeReverb adicionalmente elimina la reverberación y puede eliminar la reverberación mono, pero no muy efectivamente para contenido de alta frecuencia fuertemente reverberado.
Notas de des-reverberación/des-retardo:
1. El tiempo de procesamiento para el modelo DeEcho-DeReverb es aproximadamente el doble que los otros dos modelos DeEcho.
2. El modelo MDX-Net-Dereverb es bastante lento.
3. La configuración más limpia recomendada es aplicar primero MDX-Net y luego DeEcho-Agresivo.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Separe los números de identificación de la GPU con '-' al ingresarlos. Por ejemplo, '0-1-2' significa usar GPU 0, GPU 1 y GPU 2.", + "伴奏人声分离&去混响&去回声": "Separación de voz acompañante & eliminación de reverberación & eco", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Guardar nombre", + "保存的文件名, 默认空为和源文件同名": "Nombre del archivo que se guardará, el valor predeterminado es el mismo que el nombre del archivo de origen", + "保存的模型名不带后缀": "Nombre del modelo guardado sin extensión.", + "保存频率save_every_epoch": "Frecuencia de guardado (save_every_epoch)", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteger las consonantes claras y la respiración, prevenir artefactos como la distorsión de sonido electrónico, 0.5 no está activado, reducir aumentará la protección pero puede reducir el efecto del índice", + "修改": "Modificar", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modificar la información del modelo (solo admite archivos de modelos pequeños extraídos en la carpeta weights)", + "停止音频转换": "Detener la conversión de audio", + "全流程结束!": "¡Todo el proceso ha terminado!", + "刷新音色列表和索引路径": "Actualizar la lista de modelos e índice de rutas", + "加载模型": "Cargar modelo", + "加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.", + "加载预训练底模G路径": "Cargue la ruta del modelo G base pre-entrenada.", + "单次推理": "单次推理", + "卸载音色省显存": "Descargue la voz para ahorrar memoria GPU", + "变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)", + "后处理重采样至最终采样率,0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear", + "否": "No", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Umbral de respuesta", + "响度因子": "factor de sonoridad", + "处理数据": "Procesar datos", + "导出Onnx模型": "Exportar modelo Onnx", + "导出文件格式": "Formato de archivo de exportación", + 
"常见问题解答": "Preguntas frecuentes", + "常规设置": "Configuración general", + "开始音频转换": "Iniciar conversión de audio", + "很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento", + "性能设置": "Configuración de rendimiento", + "总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (opción predeterminada).", + "指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal", + "指定输出文件夹": "Especificar carpeta de salida", + "指定输出非主人声文件夹": "Especifique la carpeta de salida para las voces no principales", + "推理时间(ms):": "Inferir tiempo (ms):", + "推理音色": "inferencia de voz", + "提取": "Extraer", + "提取音高和处理数据使用的CPU进程数": "Número de procesos de CPU utilizados para extraer el tono y procesar los datos", + "是": "Sí", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Guardar solo el archivo ckpt más reciente para ahorrar espacio en disco", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Guardar pequeño modelo final en la carpeta 'weights' en cada punto de guardado", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Si almacenar en caché todos los conjuntos de entrenamiento en la memoria de la GPU. Los conjuntos de datos pequeños (menos de 10 minutos) se pueden almacenar en caché para acelerar el entrenamiento, pero el almacenamiento en caché de conjuntos de datos grandes puede causar errores de memoria en la GPU y no aumenta la velocidad de manera significativa.", + "显卡信息": "información de la GPU", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Este software es de código abierto bajo la licencia MIT, el autor no tiene ningún control sobre el software, y aquellos que usan el software y difunden los sonidos exportados por el software son los únicos responsables.
Si no está de acuerdo con esta cláusula , no puede utilizar ni citar ningún código ni archivo del paquete de software Consulte el directorio raíz Agreement-LICENSE.txt para obtener más información.", + "查看": "Ver", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Ver información del modelo (solo aplicable a archivos de modelos pequeños extraídos de la carpeta 'pesos')", + "检索特征占比": "Proporción de función de búsqueda", + "模型": "Modelo", + "模型推理": "inferencia del modelo", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extracción de modelo (ingrese la ruta de un archivo de modelo grande en la carpeta 'logs'), aplicable cuando desea extraer un archivo de modelo pequeño después de entrenar a mitad de camino y no se guardó automáticamente, o cuando desea probar un modelo intermedio", + "模型是否带音高指导": "Si el modelo tiene guía de tono.", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Si el modelo tiene guía de tono (necesaria para cantar, pero no para hablar)", + "模型是否带音高指导,1是0否": "Si el modelo tiene guía de tono, 1 para sí, 0 para no", + "模型版本型号": "Versión y modelo del modelo", + "模型融合, 可用于测试音色融合": "Fusión de modelos, se puede utilizar para fusionar diferentes voces", + "模型路径": "Ruta del modelo", + "每张显卡的batch_size": "Tamaño del lote (batch_size) por tarjeta gráfica", + "淡入淡出长度": "Duración del fundido de entrada/salida", + "版本": "Versión", + "特征提取": "Extracción de características", + "特征检索库文件路径,为空则使用下拉的选择结果": "Ruta del archivo de la biblioteca de características, si está vacío, se utilizará el resultado de la selección desplegable", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tecla +12 recomendada para conversión de voz de hombre a mujer, tecla -12 para conversión de voz de mujer a hombre. 
Si el rango de tono es demasiado amplio y causa distorsión, ajústelo usted mismo a un rango adecuado.", + "目标采样率": "Tasa de muestreo objetivo", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Detección automática de la ruta del índice, selección desplegable (dropdown)", + "融合": "Fusión", + "要改的模型信息": "Información del modelo a modificar", + "要置入的模型信息": "Información del modelo a colocar.", + "训练": "Entrenamiento", + "训练模型": "Entrenar Modelo", + "训练特征索引": "Índice de características", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entrenamiento finalizado, puede ver el registro de entrenamiento en la consola o en el archivo train.log en la carpeta del experimento", + "请指定说话人id": "ID del modelo", + "请选择index文件": "Seleccione el archivo .index", + "请选择pth文件": "Seleccione el archivo .pth", + "请选择说话人id": "Seleccione una identificación de altavoz", + "转换": "Conversión", + "输入实验名": "Ingrese el nombre del modelo", + "输入待处理音频文件夹路径": "Ingrese la ruta a la carpeta de audio que se procesará", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Ingrese la ruta a la carpeta de audio que se procesará (simplemente cópiela desde la barra de direcciones del administrador de archivos)", + "输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Proporción de fusión para reemplazar el sobre de volumen de entrada con el sobre de volumen de salida, cuanto más cerca de 1, más se utiliza el sobre de salida", + "输入监听": "输入监听", + "输入训练文件夹路径": "Introduzca la ruta de la carpeta de entrenamiento", + "输入设备": "Dispositivo de entrada", + "输入降噪": "Reducción de ruido de entrada", + "输出信息": "Información de salida", + "输出变声": "输出变声", + "输出设备": "Dispositivo de salida", + "输出降噪": "Reducción de ruido de salida", + "输出音频(右下角三个点,点了可以下载)": "Salida de audio (haga clic en los tres puntos en la esquina inferior derecha para descargar)", + "选择.index文件": "Seleccione el archivo .index", + "选择.pth文件": "Seleccione 
el archivo .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Seleccione el algoritmo de extracción de tono, las voces de entrada se pueden acelerar con pm, harvest tiene buenos graves pero es muy lento, crepe es bueno pero se come las GPUs", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Seleccione el algoritmo de extracción de tono, use 'pm' para acelerar la entrada de canto, 'harvest' es bueno para los graves pero extremadamente lento, 'crepe' tiene buenos resultados pero consume GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Seleccione el algoritmo de extracción de tono: la canción de entrada se puede acelerar con pm, la voz de alta calidad pero CPU pobre se puede acelerar con dio, harvest es mejor pero más lento, rmvpe es el mejor y se come ligeramente la CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Longitud de muestreo", + "重载设备列表": "Actualizar lista de dispositivos", + "音调设置": "Ajuste de tono", + "音频设备(请使用同种类驱动)": "Dispositivo de audio (utilice el mismo tipo de controlador)", + "音高算法": "Algoritmo de tono", + "额外推理时长": "Tiempo de inferencia adicional" +} diff --git a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json new file mode 100644 index 0000000..db93e9a --- /dev/null +++ b/i18n/locale/fr_FR.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si >=3 : appliquer un filtrage médian aux résultats de la reconnaissance de la hauteur de récolte. La valeur représente le rayon du filtre et peut réduire la respiration.", + "A模型权重": "Poids (w) pour le modèle A :", + "A模型路径": "Chemin d'accès au modèle A :", + "B模型路径": "Chemin d'accès au modèle B :", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Fichier de courbe F0 (facultatif). Une hauteur par ligne. 
Remplace la fréquence fondamentale par défaut et la modulation de la hauteur :", + "Index Rate": "Taux d'indexation", + "Onnx导出": "Exporter en ONNX", + "Onnx输出路径": "Chemin d'exportation ONNX :", + "RVC模型路径": "Chemin du modèle RVC :", + "ckpt处理": "Traitement des fichiers .ckpt", + "harvest进程数": "Nombre de processus CPU utilisés pour l'algorithme de reconnaissance de la hauteur (pitch) dans le cadre de la récolte (harvest).", + "index文件路径不可包含中文": "Le chemin du fichier d'index ne doit pas contenir de caractères chinois.", + "pth文件路径不可包含中文": "Le chemin du fichier .pth ne doit pas contenir de caractères chinois.", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Configuration des numéros de carte RMVPE : séparez les index GPU par des tirets \"-\", par exemple, 0-0-1 pour utiliser 2 processus sur GPU0 et 1 processus sur GPU1.", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Étape 1 : Remplissez la configuration expérimentale. Les données expérimentales sont stockées dans le dossier 'logs', avec chaque expérience ayant un dossier distinct. Entrez manuellement le chemin du nom de l'expérience, qui contient la configuration expérimentale, les journaux et les fichiers de modèle entraînés.", + "step1:正在处理数据": "Étape 1 : Traitement des données en cours.", + "step2:正在提取音高&正在提取特征": "Étape 2 : Extraction de la hauteur et extraction des caractéristiques en cours.", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Étape 2a : Parcours automatique de tous les fichiers du dossier d'entraînement qui peuvent être décodés en fichiers audio et réalisation d'une normalisation par tranches. Génère 2 dossiers wav dans le répertoire de l'expérience. 
Actuellement, seule la formation avec un seul chanteur/locuteur est prise en charge.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Étape 2b : Utilisez le CPU pour extraire la hauteur (si le modèle le permet), utilisez le GPU pour extraire les caractéristiques (sélectionnez l'index du GPU) :", + "step3: 填写训练设置, 开始训练模型和索引": "Étape 3 : Remplissez les paramètres d'entraînement et démarrez l'entraînement du modèle ainsi que l'indexation.", + "step3a:正在训练模型": "Étape 3a : L'entraînement du modèle a commencé.", + "一键训练": "Entraînement en un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Il est également possible d'importer plusieurs fichiers audio. Si un chemin de dossier existe, cette entrée est ignorée.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Traitement en lot pour la séparation de la voix et de l'accompagnement vocal à l'aide du modèle UVR5.
Exemple d'un format de chemin de dossier valide : D:\\chemin\\vers\\dossier\\d'entrée (copiez-le depuis la barre d'adresse du gestionnaire de fichiers).
Le modèle est divisé en trois catégories :
1. Préserver la voix : Choisissez cette option pour l'audio sans harmonies. Elle préserve la voix mieux que HP5. Il comprend deux modèles intégrés : HP2 et HP3. HP3 peut légèrement laisser passer l'accompagnement mais préserve légèrement mieux la voix que HP2.
2. Préserver uniquement la voix principale : Choisissez cette option pour l'audio avec harmonies. Cela peut affaiblir la voix principale. Il comprend un modèle intégré : HP5.
3. Modèles de suppression de la réverbération et du délai (par FoxJoy) :
  (1) MDX-Net : Le meilleur choix pour la suppression de la réverbération stéréo, mais ne peut pas supprimer la réverbération mono.
  (234) DeEcho : Supprime les effets de délai. Le mode Aggressive supprime plus efficacement que le mode Normal. DeReverb supprime également la réverbération et peut supprimer la réverbération mono, mais pas très efficacement pour les contenus à haute fréquence fortement réverbérés.
Notes sur la suppression de la réverbération et du délai :
1. Le temps de traitement pour le modèle DeEcho-DeReverb est environ deux fois plus long que pour les deux autres modèles DeEcho.
2. Le modèle MDX-Net-Dereverb est assez lent.
3. La configuration la plus propre recommandée est d'appliquer d'abord MDX-Net, puis DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Entrez le(s) index GPU séparé(s) par '-', par exemple, 0-1-2 pour utiliser les GPU 0, 1 et 2 :", + "伴奏人声分离&去混响&去回声": "Séparation des voix/accompagnement et suppression de la réverbération", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Nom de sauvegarde :", + "保存的文件名, 默认空为和源文件同名": "Nom du fichier de sauvegarde (par défaut : identique au nom du fichier source) :", + "保存的模型名不带后缀": "Nom du modèle enregistré (sans extension) :", + "保存频率save_every_epoch": "Fréquence de sauvegarde (save_every_epoch) :", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protéger les consonnes sourdes et les bruits de respiration pour éviter les artefacts tels que le déchirement dans la musique électronique. Réglez à 0,5 pour désactiver. Diminuez la valeur pour renforcer la protection, mais cela peut réduire la précision de l'indexation :", + "修改": "Modifier", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifier les informations du modèle (uniquement pris en charge pour les petits fichiers de modèle extraits du dossier 'weights')", + "停止音频转换": "Arrêter la conversion audio", + "全流程结束!": "Toutes les étapes ont été terminées !", + "刷新音色列表和索引路径": "Actualiser la liste des voix et le vers l'index.", + "加载模型": "Charger le modèle.", + "加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :", + "加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :", + "单次推理": "单次推理", + "卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :", + "后处理重采样至最终采样率,0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. 
Réglez sur 0 pour ne pas effectuer de rééchantillonnage :", + "否": "Non", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Seuil de réponse", + "响度因子": "Facteur de volume sonore", + "处理数据": "Traitement des données", + "导出Onnx模型": "Exporter le modèle au format ONNX.", + "导出文件格式": "Format de fichier d'exportation", + "常见问题解答": "FAQ (Foire Aux Questions)", + "常规设置": "Paramètres généraux", + "开始音频转换": "Démarrer la conversion audio.", + "很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.", + "性能设置": "Paramètres de performance", + "总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').", + "指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :", + "指定输出文件夹": "Spécifiez le dossier de sortie :", + "指定输出非主人声文件夹": "Spécifiez le dossier de sortie pour l'accompagnement :", + "推理时间(ms):": "Temps d'inférence (ms) :", + "推理音色": "Voix pour l'inférence", + "提取": "Extraire", + "提取音高和处理数据使用的CPU进程数": "Nombre de processus CPU utilisés pour l'extraction de la hauteur et le traitement des données :", + "是": "Oui", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Enregistrer uniquement le dernier fichier '.ckpt' pour économiser de l'espace disque :", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Enregistrer un petit modèle final dans le dossier 'weights' à chaque point de sauvegarde :", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Mettre en cache tous les ensembles d'entrainement dans la mémoire GPU. 
Mettre en cache de petits ensembles de données (moins de 10 minutes) peut accélérer l'entrainement, mais mettre en cache de grands ensembles de données consommera beaucoup de mémoire GPU et peut ne pas apporter beaucoup d'amélioration de vitesse :", + "显卡信息": "Informations sur la carte graphique (GPU)", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Ce logiciel est open source sous la licence MIT. L'auteur n'a aucun contrôle sur le logiciel. Les utilisateurs qui utilisent le logiciel et distribuent les sons exportés par le logiciel en sont entièrement responsables.
Si vous n'acceptez pas cette clause, vous ne pouvez pas utiliser ou faire référence à aucun code ni fichier contenu dans le package logiciel. Consultez le fichier Agreement-LICENSE.txt dans le répertoire racine pour plus de détails.", + "查看": "Voir", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Afficher les informations sur le modèle (uniquement pour les petits fichiers de modèle extraits du dossier \"weights\")", + "检索特征占比": "Rapport de recherche de caractéristiques (contrôle l'intensité de l'accent, un rapport trop élevé provoque des artefacts) :", + "模型": "Modèle", + "模型推理": "Inférence du modèle", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extraction du modèle (saisissez le chemin d'accès au modèle du grand fichier dans le dossier \"logs\"). Cette fonction est utile si vous souhaitez arrêter l'entrainement à mi-chemin et extraire et enregistrer manuellement un petit fichier de modèle, ou si vous souhaitez tester un modèle intermédiaire :", + "模型是否带音高指导": "Indique si le modèle dispose d'un guidage en hauteur :", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Indique si le modèle dispose d'un système de guidage de la hauteur (obligatoire pour le chant, facultatif pour la parole) :", + "模型是否带音高指导,1是0否": "Le modèle dispose-t-il d'un guide de hauteur (1 : oui, 0 : non) ?", + "模型版本型号": "Version de l'architecture du modèle :", + "模型融合, 可用于测试音色融合": "Fusion de modèles, peut être utilisée pour tester la fusion de timbres", + "模型路径": "Le chemin vers le modèle :", + "每张显卡的batch_size": "Taille du batch par GPU :", + "淡入淡出长度": "Longueur de la transition", + "版本": "Version", + "特征提取": "Extraction des caractéristiques", + "特征检索库文件路径,为空则使用下拉的选择结果": "Chemin d'accès au fichier d'index des caractéristiques. Laisser vide pour utiliser le résultat sélectionné dans la liste déroulante :", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Il est recommandé d'utiliser la clé +12 pour la conversion homme-femme et la clé -12 pour la conversion femme-homme. 
Si la plage sonore est trop large et que la voix est déformée, vous pouvez également l'ajuster vous-même à la plage appropriée.", + "目标采样率": "Taux d'échantillonnage cible :", + "算法延迟(ms):": "Délais algorithmiques (ms):", + "自动检测index路径,下拉式选择(dropdown)": "Détecter automatiquement le chemin d'accès à l'index et le sélectionner dans la liste déroulante :", + "融合": "Fusion", + "要改的模型信息": "Informations sur le modèle à modifier :", + "要置入的模型信息": "Informations sur le modèle à placer :", + "训练": "Entraîner", + "训练模型": "Entraîner le modèle", + "训练特征索引": "Entraîner l'index des caractéristiques", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entraînement terminé. Vous pouvez consulter les rapports d'entraînement dans la console ou dans le fichier 'train.log' situé dans le dossier de l'expérience.", + "请指定说话人id": "Veuillez spécifier l'ID de l'orateur ou du chanteur :", + "请选择index文件": "Veuillez sélectionner le fichier d'index", + "请选择pth文件": "Veuillez sélectionner le fichier pth", + "请选择说话人id": "Sélectionner l'ID de l'orateur ou du chanteur :", + "转换": "Convertir", + "输入实验名": "Saisissez le nom de l'expérience :", + "输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter :", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Entrez le chemin du dossier audio à traiter (copiez-le depuis la barre d'adresse du gestionnaire de fichiers) :", + "输入待处理音频文件路径(默认是正确格式示例)": "Entrez le chemin d'accès du fichier audio à traiter (par défaut, l'exemple de format correct) :", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Ajustez l'échelle de l'enveloppe de volume. Plus il est proche de 0, plus il imite le volume des voix originales. Cela peut aider à masquer les bruits et à rendre le volume plus naturel lorsqu'il est réglé relativement bas. 
Plus le volume est proche de 1, plus le volume sera fort et constant :", + "输入监听": "Moniteur vocal d'entrée", + "输入训练文件夹路径": "Indiquez le chemin d'accès au dossier d'entraînement :", + "输入设备": "Dispositif d'entrée", + "输入降噪": "Réduction du bruit d'entrée", + "输出信息": "Informations sur la sortie", + "输出变声": "Sortie voix convertie", + "输出设备": "Dispositif de sortie", + "输出降噪": "Réduction du bruit de sortie", + "输出音频(右下角三个点,点了可以下载)": "Exporter l'audio (cliquer sur les trois points dans le coin inférieur droit pour télécharger)", + "选择.index文件": "Sélectionner le fichier .index", + "选择.pth文件": "Sélectionner le fichier .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Sélection de l'algorithme d'extraction de la hauteur, les voix d'entrée peuvent être accélérées avec pm, harvest a de bonnes basses mais est très lent, crepe est bon mais consomme beaucoup de ressources GPU.", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Sélectionnez l'algorithme d'extraction de la hauteur de ton (\"pm\" : extraction plus rapide mais parole de moindre qualité ; \"harvest\" : meilleure basse mais extrêmement lente ; \"crepe\" : meilleure qualité mais utilisation intensive du GPU), \"rmvpe\" : meilleure qualité et peu d'utilisation du GPU.", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Sélection de l'algorithme d'extraction de la hauteur : la chanson d'entrée peut être traitée plus rapidement par pm, avec une voix de haute qualité mais un CPU médiocre, par dio, harvest est meilleur mais plus lent, rmvpe est le meilleur, mais consomme légèrement le CPU/GPU.", + "采样率:": "采样率:", + "采样长度": "Longueur de l'échantillon", + "重载设备列表": "Recharger la liste des dispositifs", + "音调设置": "Réglages de la hauteur", + "音频设备(请使用同种类驱动)": "Périphérique audio (veuillez utiliser le même type de pilote)", + "音高算法": "algorithme de détection de la hauteur", + "额外推理时长": "Temps d'inférence supplémentaire" +} diff --git a/i18n/locale/it_IT.json 
b/i18n/locale/it_IT.json new file mode 100644 index 0000000..dc089be --- /dev/null +++ b/i18n/locale/it_IT.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Se >=3: applica il filtro mediano ai risultati del pitch raccolto. ", + "A模型权重": "Peso (w) per il modello A:", + "A模型路径": "Percorso per il modello A:", + "B模型路径": "Percorso per il modello B:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "File curva F0 (opzionale). ", + "Index Rate": "Tasso di indice", + "Onnx导出": "Esporta Onnx", + "Onnx输出路径": "Percorso di esportazione Onnx:", + "RVC模型路径": "Percorso modello RVC:", + "ckpt处理": "Elaborazione ckpt", + "harvest进程数": "harvest进程数", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth è un'app per il futuro", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. ", + "step1:正在处理数据": "Passaggio 1: elaborazione dei dati", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. ", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):", + "step3: 填写训练设置, 开始训练模型和索引": "Passaggio 3: compilare le impostazioni di addestramento e avviare l'addestramento del modello e dell'indice", + "step3a:正在训练模型": "Passaggio 3a: è iniziato l'addestramento del modello", + "一键训练": "Addestramento con un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Elaborazione batch per la separazione dell'accompagnamento vocale utilizzando il modello UVR5.
Esempio di un formato di percorso di cartella valido: D:\\path\\to\\input\\folder (copialo dalla barra degli indirizzi del file manager).
Il modello è suddiviso in tre categorie:
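1. Conserva la voce: scegli questa opzione per l'audio senza armonie. Preserva la voce meglio di HP5. Include due modelli integrati: HP2 e HP3. HP3 può lasciar passare leggermente l'accompagnamento, ma preserva la voce leggermente meglio di HP2.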
2. Mantieni solo la voce principale: scegli questa opzione per l'audio con armonie. Può indebolire la voce principale. Include un modello integrato: HP5.
3. Modelli di de-riverbero e de-delay (di FoxJoy):
  (1) MDX-Net: la scelta migliore per la rimozione del riverbero stereo ma non può rimuovere il riverbero mono;

Note di de-riverbero/de-delay:
1. Il tempo di elaborazione per il modello DeEcho-DeReverb è circa il doppio rispetto agli altri due modelli DeEcho.
2. Il modello MDX-Net-Dereverb è piuttosto lento.
3. La configurazione più pulita consigliata consiste nell'applicare prima MDX-Net e poi DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Inserisci gli indici GPU separati da '-', ad esempio 0-1-2 per utilizzare GPU 0, 1 e 2:", + "伴奏人声分离&去混响&去回声": "Separazione voce/accompagnamento", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Salva nome:", + "保存的文件名, 默认空为和源文件同名": "Salva il nome del file (predefinito: uguale al file di origine):", + "保存的模型名不带后缀": "Nome del modello salvato (senza estensione):", + "保存频率save_every_epoch": "Frequenza di salvataggio (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteggi le consonanti senza voce e i suoni del respiro per evitare artefatti come il tearing nella musica elettronica. ", + "修改": "Modificare", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifica le informazioni sul modello (supportato solo per i file di modello di piccole dimensioni estratti dalla cartella 'weights')", + "停止音频转换": "Arresta la conversione audio", + "全流程结束!": "Tutti i processi sono stati completati!", + "刷新音色列表和索引路径": "Aggiorna l'elenco delle voci e il percorso dell'indice", + "加载模型": "Carica modello", + "加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:", + "加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:", + "单次推理": "单次推理", + "卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:", + "变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. 
", + "否": "NO", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Soglia di risposta", + "响度因子": "fattore di sonorità", + "处理数据": "Processa dati", + "导出Onnx模型": "Esporta modello Onnx", + "导出文件格式": "Formato file di esportazione", + "常见问题解答": "FAQ (Domande frequenti)", + "常规设置": "Impostazioni generali", + "开始音频转换": "Avvia la conversione audio", + "很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.", + "性能设置": "Impostazioni delle prestazioni", + "总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ", + "指定输出主人声文件夹": "Specifica la cartella di output per le voci:", + "指定输出文件夹": "Specifica la cartella di output:", + "指定输出非主人声文件夹": "Specificare la cartella di output per l'accompagnamento:", + "推理时间(ms):": "Tempo di inferenza (ms):", + "推理音色": "Voce di inferenza:", + "提取": "Estrai", + "提取音高和处理数据使用的CPU进程数": "Numero di processi CPU utilizzati per l'estrazione del tono e l'elaborazione dei dati:", + "是": "SÌ", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Salva solo l'ultimo file '.ckpt' per risparmiare spazio su disco:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Salva un piccolo modello finale nella cartella \"weights\" in ogni punto di salvataggio:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Memorizza nella cache tutti i set di addestramento nella memoria della GPU. ", + "显卡信息": "Informazioni GPU", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
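如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Questo software è open source con licenza MIT. L'autore non ha alcun controllo sul software. Gli utenti che utilizzano il software e diffondono i suoni esportati dal software ne sono pienamente responsabili.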
Se non si accetta questa clausola, non è possibile utilizzare o fare riferimento a codici e file all'interno del pacchetto software. Contratto-LICENZA.txt per dettagli.", + "查看": "Visualizzazione", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Visualizza le informazioni sul modello (supportato solo per file di modello piccoli estratti dalla cartella 'weights')", + "检索特征占比": "Rapporto funzionalità di ricerca (controlla la forza dell'accento, troppo alto ha artefatti):", + "模型": "Modello", + "模型推理": "Inferenza del modello", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Estrazione del modello (inserire il percorso del modello di file di grandi dimensioni nella cartella \"logs\"). ", + "模型是否带音高指导": "Se il modello ha una guida del tono:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Se il modello ha una guida del tono (necessario per il canto, facoltativo per il parlato):", + "模型是否带音高指导,1是0否": "Se il modello ha una guida del tono (1: sì, 0: no):", + "模型版本型号": "Versione dell'architettura del modello:", + "模型融合, 可用于测试音色融合": "Model fusion, può essere utilizzato per testare la fusione timbrica", + "模型路径": "Percorso al modello:", + "每张显卡的batch_size": "Dimensione batch per GPU:", + "淡入淡出长度": "Lunghezza dissolvenza", + "版本": "Versione", + "特征提取": "Estrazione delle caratteristiche", + "特征检索库文件路径,为空则使用下拉的选择结果": "Percorso del file di indice delle caratteristiche. ", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tonalità +12 consigliata per la conversione da maschio a femmina e tonalità -12 per la conversione da femmina a maschio. 
", + "目标采样率": "Frequenza di campionamento target:", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Rileva automaticamente il percorso dell'indice e seleziona dal menu a tendina:", + "融合": "Fusione", + "要改的模型信息": "Informazioni sul modello da modificare:", + "要置入的模型信息": "Informazioni sul modello da posizionare:", + "训练": "Addestramento", + "训练模型": "Addestra modello", + "训练特征索引": "Addestra indice delle caratteristiche", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Addestramento completato. ", + "请指定说话人id": "Si prega di specificare l'ID del locutore/cantante:", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth 文件", + "请选择说话人id": "Seleziona ID locutore/cantante:", + "转换": "Convertire", + "输入实验名": "Inserisci il nome dell'esperimento:", + "输入待处理音频文件夹路径": "Immettere il percorso della cartella audio da elaborare:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Immettere il percorso della cartella audio da elaborare (copiarlo dalla barra degli indirizzi del file manager):", + "输入待处理音频文件路径(默认是正确格式示例)": "Immettere il percorso del file audio da elaborare (l'impostazione predefinita è l'esempio di formato corretto):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Regola il ridimensionamento dell'inviluppo del volume. 
", + "输入监听": "输入监听", + "输入训练文件夹路径": "Inserisci il percorso della cartella di addestramento:", + "输入设备": "Dispositivo di input", + "输入降噪": "Riduzione del rumore in ingresso", + "输出信息": "Informazioni sull'uscita", + "输出变声": "输出变声", + "输出设备": "Dispositivo di uscita", + "输出降噪": "Riduzione del rumore in uscita", + "输出音频(右下角三个点,点了可以下载)": "Esporta audio (clicca sui tre puntini in basso a destra per scaricarlo)", + "选择.index文件": "Seleziona il file .index", + "选择.pth文件": "Seleziona il file .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più veloce ma risultato di qualità inferiore; \"harvest\": bassi migliori ma estremamente lenti; \"crepe\": qualità migliore ma utilizzo intensivo della GPU):", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Lunghezza del campione", + "重载设备列表": "Ricaricare l'elenco dei dispositivi", + "音调设置": "Impostazioni del tono", + "音频设备(请使用同种类驱动)": "Dispositivo audio (utilizzare lo stesso tipo di driver)", + "音高算法": "音高算法", + "额外推理时长": "Tempo di inferenza extra" +} diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json new file mode 100644 index 0000000..c5b33ff --- /dev/null +++ b/i18n/locale/ja_JP.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3 次に、harvestピッチの認識結果に対してメディアンフィルタを使用します。値はフィルター半径で、ミュートを減衰させるために使用します。", + "A模型权重": "Aモデルの重み", + "A模型路径": "Aモデルのパス", + "B模型路径": "Bモデルのパス", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0(最低共振周波数)カーブファイル(オプション、1行に1ピッチ、デフォルトのF0(最低共振周波数)とエレベーションを置き換えます。)", + "Index Rate": "Index Rate", + "Onnx导出": "Onnxエクスポート", + "Onnx输出路径": "Onnx出力パス", + "RVC模型路径": "RVCモデルパス", + "ckpt处理": "ckptファイルの処理", + 
"harvest进程数": "harvestプロセス数", + "index文件路径不可包含中文": "indexファイルのパスに漢字を含んではいけません", + "pth文件路径不可包含中文": "pthファイルのパスに漢字を含んではいけません", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpeカード番号設定:異なるプロセスに使用するカード番号を入力する。例えば、0-0-1でカード0に2つのプロセス、カード1に1つのプロセスを実行する。", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "ステップ1:実験設定を入力します。実験データはlogsに保存され、各実験にはフォルダーがあります。実験名のパスを手動で入力する必要があり、実験設定、ログ、トレーニングされたモデルファイルが含まれます。", + "step1:正在处理数据": "step1:処理中のデータ", + "step2:正在提取音高&正在提取特征": "step2:ピッチ抽出と特徴抽出", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "ステップ2b: CPUを使用して音高を抽出する(モデルに音高がある場合)、GPUを使用して特徴を抽出する(GPUの番号を選択する)", + "step3: 填写训练设置, 开始训练模型和索引": "ステップ3: トレーニング設定を入力して、モデルとインデックスのトレーニングを開始します", + "step3a:正在训练模型": "step3a:トレーニング中のモデル", + "一键训练": "ワンクリックトレーニング", + "也可批量输入音频文件, 二选一, 优先读文件夹": "複数のオーディオファイルをインポートすることもできます。フォルダパスが存在する場合、この入力は無視されます。", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "UVR5モデルを使用したボーカル伴奏の分離バッチ処理。
有効なフォルダーパスフォーマットの例: D:\\path\\to\\input\\folder (エクスプローラーのアドレスバーからコピーします)。
モデルは三つのカテゴリに分かれています:
1. ボーカルを保持: ハーモニーのないオーディオに対してこれを選択します。HP5よりもボーカルをより良く保持します。HP2とHP3の二つの内蔵モデルが含まれています。HP3は伴奏をわずかに漏らす可能性がありますが、HP2よりもわずかにボーカルをより良く保持します。
2. 主なボーカルのみを保持: ハーモニーのあるオーディオに対してこれを選択します。主なボーカルを弱める可能性があります。HP5の一つの内蔵モデルが含まれています。
3. ディリバーブとディレイモデル (by FoxJoy):
  (1) MDX-Net: ステレオリバーブの除去に最適な選択肢ですが、モノリバーブは除去できません;
 (234) DeEcho: ディレイ効果を除去します。AggressiveモードはNormalモードよりも徹底的に除去します。DeReverbはさらにリバーブを除去し、モノリバーブを除去することができますが、高周波のリバーブが強い内容に対しては非常に効果的ではありません。
ディリバーブ/ディレイに関する注意点:
1. DeEcho-DeReverbモデルの処理時間は、他の二つのDeEchoモデルの約二倍です。
2. MDX-Net-Dereverbモデルは非常に遅いです。
3. 推奨される最もクリーンな設定は、最初にMDX-Netを適用し、その後にDeEcho-Aggressiveを適用することです。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "ハイフンで区切って使用するGPUの番号を入力します。例えば0-1-2はGPU0、GPU1、GPU2を使用します", + "伴奏人声分离&去混响&去回声": "伴奏ボーカル分離&残響除去&エコー除去", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "保存ファイル名", + "保存的文件名, 默认空为和源文件同名": "保存するファイル名、デフォルトでは空欄で元のファイル名と同じ名前になります", + "保存的模型名不带后缀": "拡張子のない保存するモデル名", + "保存频率save_every_epoch": "エポックごとの保存頻度", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "明確な子音と呼吸音を保護し、電子音の途切れやその他のアーティファクトを防止します。0.5でオフになります。下げると保護が強化されますが、indexの効果が低下する可能性があります。", + "修改": "変更", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報の修正(weightsフォルダから抽出された小さなモデルファイルのみ対応)", + "停止音频转换": "音声変換を停止", + "全流程结束!": "全工程が完了!", + "刷新音色列表和索引路径": "音源リストとインデックスパスの更新", + "加载模型": "モデルをロード", + "加载预训练底模D路径": "事前学習済みのDモデルのパス", + "加载预训练底模G路径": "事前学習済みのGモデルのパス", + "单次推理": "单次推理", + "卸载音色省显存": "音源を削除してメモリを節約", + "变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)", + "后处理重采样至最终采样率,0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0", + "否": "いいえ", + "启用相位声码器": "启用相位声码器", + "响应阈值": "反応閾値", + "响度因子": "ラウドネス係数", + "处理数据": "データ処理", + "导出Onnx模型": "Onnxに変換", + "导出文件格式": "エクスポート形式", + "常见问题解答": "よくある質問", + "常规设置": "一般設定", + "开始音频转换": "音声変換を開始", + "很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。", + "性能设置": "パフォーマンス設定", + "总训练轮数total_epoch": "総エポック数", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。", + "指定输出主人声文件夹": "マスターの出力音声フォルダーを指定する", + "指定输出文件夹": "出力フォルダを指定してください", + "指定输出非主人声文件夹": "マスター以外の出力音声フォルダーを指定する", + "推理时间(ms):": "推論時間(ms):", + "推理音色": "音源推論", + "提取": "抽出", + "提取音高和处理数据使用的CPU进程数": "ピッチの抽出やデータ処理に使用するCPUスレッド数", + "是": "はい", + "是否仅保存最新的ckpt文件以节省硬盘空间": "ハードディスク容量を節約するため、最新のckptファイルのみを保存しますか?", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "各保存時点の小モデルを全部weightsフォルダに保存するかどうか", + "是否缓存所有训练集至显存. 
10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "すべてのトレーニングデータをメモリにキャッシュするかどうか。10分以下の小さなデータはキャッシュしてトレーニングを高速化できますが、大きなデータをキャッシュするとメモリが破裂し、あまり速度が上がりません。", + "显卡信息": "GPU情報", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本ソフトウェアはMITライセンスに基づくオープンソースであり、製作者は本ソフトウェアに対していかなる責任を持ちません。本ソフトウェアの利用者および本ソフトウェアから派生した音源(成果物)を配布する者は、本ソフトウェアに対して自身で責任を負うものとします。
この条項に同意しない場合、パッケージ内のコードやファイルを使用や参照を禁じます。詳しくはLICENSEをご覧ください。", + "查看": "表示", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報を表示する(小さいモデルファイルはweightsフォルダーからのみサポートされています)", + "检索特征占比": "検索特徴率", + "模型": "モデル", + "模型推理": "モデル推論", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "モデル抽出(ログフォルダー内の大きなファイルのモデルパスを入力)、モデルを半分までトレーニングし、自動的に小さいファイルモデルを保存しなかったり、中間モデルをテストしたい場合に適用されます。", + "模型是否带音高指导": "モデルに音高ガイドを付けるかどうか", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "モデルに音高ガイドがあるかどうか(歌唱には必要ですが、音声には必要ありません)", + "模型是否带音高指导,1是0否": "モデルに音高ガイドを付けるかどうか、1は付ける、0は付けない", + "模型版本型号": "モデルのバージョン", + "模型融合, 可用于测试音色融合": "モデルのマージ、音源のマージテストに使用できます", + "模型路径": "モデルパス", + "每张显卡的batch_size": "GPUごとのバッチサイズ", + "淡入淡出长度": "フェードイン/フェードアウト長", + "版本": "バージョン", + "特征提取": "特徴抽出", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徴検索ライブラリへのパス 空の場合はドロップダウンで選択", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性から女性へは+12キーをお勧めします。女性から男性へは-12キーをお勧めします。音域が広すぎて音質が劣化した場合は、適切な音域に自分で調整してください。", + "目标采样率": "目標サンプリングレート", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "インデックスパスの自動検出 ドロップダウンで選択", + "融合": "マージ", + "要改的模型信息": "変更するモデル情報", + "要置入的模型信息": "挿入するモデル情報", + "训练": "トレーニング", + "训练模型": "モデルのトレーニング", + "训练特征索引": "特徴インデックスのトレーニング", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "トレーニング終了時に、トレーニングログやフォルダ内のtrain.logを確認することができます", + "请指定说话人id": "話者IDを指定してください", + "请选择index文件": "indexファイルを選択してください", + "请选择pth文件": "pthファイルを選択してください", + "请选择说话人id": "話者IDを選択してください", + "转换": "変換", + "输入实验名": "モデル名", + "输入待处理音频文件夹路径": "処理するオーディオファイルのフォルダパスを入力してください", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "処理対象音声フォルダーのパスを入力してください(エクスプローラーのアドレスバーからコピーしてください)", + "输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "入力ソースの音量エンベロープと出力音量エンベロープの融合率 1に近づくほど、出力音量エンベロープの割合が高くなる", + "输入监听": "输入监听", + "输入训练文件夹路径": "トレーニング用フォルダのパスを入力してください", + "输入设备": "入力デバイス", + "输入降噪": "入力ノイズの低減", + "输出信息": "出力情報", + "输出变声": "输出变声", + "输出设备": "出力デバイス", + "输出降噪": "出力ノイズの低減", + "输出音频(右下角三个点,点了可以下载)": 
"出力音声(右下の三点をクリックしてダウンロードできます)", + "选择.index文件": ".indexファイルを選択", + "选择.pth文件": ".pthファイルを選択", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "ピッチ抽出アルゴリズムの選択、歌声はpmで高速化でき、harvestは低音が良いが信じられないほど遅く、crepeは良く動くがGPUを食います。", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "ピッチ抽出アルゴリズムの選択、歌声はpmで高速化でき、harvestは低音が良いが信じられないほど遅く、crepeは良く動くがGPUを喰います", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "ピッチ抽出アルゴリズムの選択:歌声はpmで高速化でき、入力した音声が高音質でCPUが貧弱な場合はdioで高速化でき、harvestの方が良いが遅く、rmvpeがベストだがCPU/GPUを若干食います。", + "采样率:": "采样率:", + "采样长度": "サンプル長", + "重载设备列表": "デバイスリストをリロードする", + "音调设置": "音程設定", + "音频设备(请使用同种类驱动)": "オーディオデバイス(同じ種類のドライバーを使用してください)", + "音高算法": "ピッチアルゴリズム", + "额外推理时长": "追加推論時間" +} diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json new file mode 100644 index 0000000..f01bc8f --- /dev/null +++ b/i18n/locale/ru_RU.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Если значение больше 3: применить медианную фильтрацию к вытащенным тональностям. Значение контролирует радиус фильтра и может уменьшить излишнее дыхание.", + "A模型权重": "Весы (w) модели А:", + "A模型路径": "Путь к модели А:", + "B模型路径": "Путь к модели Б:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Файл дуги F0 (не обязательно). Одна тональность на каждую строчку. Заменяет обычный F0 и модуляцию тональности:", + "Index Rate": "Темп индекса", + "Onnx导出": "Экспорт ONNX", + "Onnx输出路径": "Путь для сохранения модели в формате ONNX:", + "RVC模型路径": "Путь к модели RVC:", + "ckpt处理": "Обработка ckpt", + "harvest进程数": "Количество процессор harvest", + "index文件路径不可包含中文": "Путь к файлу индекса", + "pth文件路径不可包含中文": "Путь к файлу pth", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Введите номера графических процессоров, разделенные символом «-», например, 0-0-1, чтобы запустить два процесса на GPU 0 и один процесс на GPU 1:", + "step1: 填写实验配置. 
实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1. Конфигурирование модели. Данные обучения модели сохраняются в папку 'logs', и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.", + "step1:正在处理数据": "Шаг 1. Переработка данных", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2А. Автоматическая обработка исходных аудиозаписей для обучения и выполнение нормализации среза. Создаст 2 папки wav в папке модели. В данный момент поддерживается обучение только на одноголосных записях.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2Б. Оценка и извлечение тональности в аудиофайлах с помощью процессора (если включена поддержка изменения высоты звука), извлечение черт с помощью GPU (выберите номер GPU):", + "step3: 填写训练设置, 开始训练模型和索引": "Шаг 3. Заполнение дополнительных настроек обучения и запуск обучения модели и индекса", + "step3a:正在训练模型": "Шаг 3. Запуск обучения модели", + "一键训练": "Обучение в одно нажатие", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Можно также импортировать несколько аудиофайлов. Если путь к папке существует, то этот ввод игнорируется.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Пакетная обработка для разделения вокального сопровождения с использованием модели UVR5.
Пример допустимого формата пути к папке: D:\\path\\to\\input\\folder
Модель разделена на три категории:
1. Сохранить вокал: выберите этот вариант для звука без гармоний. Он сохраняет вокал лучше, чем HP5. Он включает в себя две встроенные модели: HP2 и HP3. HP3 может немного пропускать инструментал, но сохраняет вокал немного лучше, чем HP2.
2. Сохранить только основной вокал: выберите этот вариант для звука с гармониями. Это может ослабить основной вокал. Он включает одну встроенную модель: HP5.
3. Модели удаления реверберации и задержки (от FoxJoy):
  (1) MDX-Net: лучший выбор для удаления стереореверберации, но он не может удалить монореверберацию;
 (234) DeEcho: удаляет эффекты задержки. Агрессивный режим удаляет более тщательно, чем Нормальный режим. DeReverb дополнительно удаляет реверберацию и может удалять монореверберацию, но не очень эффективно для сильно реверберированного высокочастотного контента.
Примечания по удалению реверберации/задержки:
1. Время обработки для модели DeEcho-DeReverb примерно в два раза больше, чем для двух других моделей DeEcho.
2. Модель MDX-Net-Dereverb довольно медленная.
3. Рекомендуемая самая чистая конфигурация — сначала применить MDX-Net, а затем DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Введите, какие(-ую) GPU(-у) хотите использовать через '-', например 0-1-2, чтобы использовать GPU с номерами 0, 1 и 2:", + "伴奏人声分离&去混响&去回声": "Разделение вокала/аккомпанемента и удаление эхо", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Имя файла для сохранения:", + "保存的文件名, 默认空为和源文件同名": "Название сохранённого файла (по умолчанию: такое же, как и у входного):", + "保存的模型名不带后缀": "Имя файла модели для сохранения (без расширения):", + "保存频率save_every_epoch": "Частота сохранения (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Защитить глухие согласные и звуки дыхания для предотвращения артефактов, например, разрывания в электронной музыке. Поставьте на 0.5, чтобы выключить. Уменьшите значение для повышения защиты, но учтите, что при этом может ухудшиться точность индексирования:", + "修改": "Изменить", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Изменить информацию о модели (работает только с маленькими моделями, взятыми из папки 'weights')", + "停止音频转换": "Закончить конвертацию аудио", + "全流程结束!": "Все процессы завершены!", + "刷新音色列表和索引路径": "Обновить список голосов и индексов", + "加载模型": "Загрузить модель", + "加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:", + "加载预训练底模G路径": "Путь к предварительно обученной базовой модели G:", + "单次推理": "单次推理", + "卸载音色省显存": "Выгрузить модель из памяти GPU для освобождения ресурсов", + "变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. 
Поставьте 0, чтобы ничего не изменялось:", + "否": "Нет", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Порог ответа", + "响度因子": "коэффициент громкости", + "处理数据": "Обработать данные", + "导出Onnx模型": "Экспортировать модель", + "导出文件格式": "Формат выходных файлов", + "常见问题解答": "ЧаВо (часто задаваемые вопросы)", + "常规设置": "Основные настройки", + "开始音频转换": "Начать конвертацию аудио", + "很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нету графического процессора, который поддерживает обучение моделей.", + "性能设置": "Настройки быстроты", + "总训练轮数total_epoch": "Полное количество эпох (total_epoch):", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Массовое преобразование. Введите путь к папке, в которой находятся файлы для преобразования голоса или выгрузите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию: 'opt').", + "指定输出主人声文件夹": "Путь к папке для сохранения вокала:", + "指定输出文件夹": "Папка для результатов:", + "指定输出非主人声文件夹": "Путь к папке для сохранения аккомпанемента:", + "推理时间(ms):": "Время переработки (мс):", + "推理音色": "Желаемый голос:", + "提取": "Создать модель", + "提取音高和处理数据使用的CPU进程数": "Число процессов ЦП, используемое для оценки высоты голоса и обработки данных:", + "是": "Да", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Сохранять только последний файл '.ckpt', чтобы сохранить место на диске:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Сохранять маленькую финальную модель в папку 'weights' на каждой точке сохранения:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Кэшировать все тренировочные сеты в видеопамять. Кэширование маленький датасетов (меньше 10 минут) может ускорить тренировку, но кэширование больших, наоборот, займёт много видеопамяти и не сильно ускорит тренировку:", + "显卡信息": "Информация о графических процессорах (GPUs):", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Это программное обеспечение с открытым исходным кодом распространяется по лицензии MIT. Автор никак не контролирует это программное обеспечение. Пользователи, которые используют эту программу и распространяют аудиозаписи, полученные с помощью этой программы, несут полную ответственность за это. Если вы не согласны с этим, вы не можете использовать какие-либо коды и файлы в рамках этой программы или ссылаться на них. Подробнее в файле Agreement-LICENSE.txt в корневом каталоге программы.", + "查看": "Просмотреть информацию", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Просмотреть информацию о модели (работает только с маленькими моделями, взятыми из папки 'weights')", + "检索特征占比": "Соотношение поиска черт:", + "模型": "Модели", + "模型推理": "Изменение голоса", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Создание модели из данных, полученных в процессе обучения (введите путь к большому файлу модели в папке 'logs'). Может пригодиться, если вам нужно завершить обучение и получить маленький файл готовой модели, или если вам нужно проверить недообученную модель:", + "模型是否带音高指导": "Поддерживает ли модель изменение высоты голоса (1: да, 0: нет):", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Поддержка изменения высоты звука (обязательно для пения, необязательно для речи):", + "模型是否带音高指导,1是0否": "Поддерживает ли модель изменение высоты голоса (1: да, 0: нет):", + "模型版本型号": "Версия архитектуры модели:", + "模型融合, 可用于测试音色融合": "Слияние моделей, может быть использовано для проверки слияния тембра", + "模型路径": "Путь к папке:", + "每张显卡的batch_size": "Размер пачки для GPU:", + "淡入淡出长度": "Длина затухания", + "版本": "Версия архитектуры модели:", + "特征提取": "Извлечь черты", + "特征检索库文件路径,为空则使用下拉的选择结果": "Путь к файлу индекса черт. Оставьте пустым, чтобы использовать выбранный вариант из списка ниже:", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "Рекомендуется выбрать +12 для конвертирования мужского голоса в женский и -12 для конвертирования женского в мужской. Если диапазон голоса слишком велик, и голос искажается, можно выбрать значение на свой вкус.", + "目标采样率": "Частота дискретизации аудио:", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Автоматически найденные файлы индексов черт (выберите вариант из списка):", + "融合": "Запустить слияние", + "要改的模型信息": "Информация, которая будет изменена:", + "要置入的模型信息": "Информация о модели:", + "训练": "Обучение модели", + "训练模型": "Обучить модель", + "训练特征索引": "Обучить индекс черт", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Обучение модели завершено. Журнал обучения можно просмотреть в консоли или в файле 'train.log' в папке с моделью.", + "请指定说话人id": "Номер говорящего/поющего:", + "请选择index文件": "Пожалуйста, выберите файл индекса", + "请选择pth文件": "Пожалуйста, выберите файл pth", + "请选择说话人id": "Номер говорящего:", + "转换": "Преобразовать", + "输入实验名": "Название модели:", + "输入待处理音频文件夹路径": "Путь к папке с аудиофайлами для обработки:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Путь к папке с аудиофайлами для переработки (можно скопировать путь из адресной строки файлового менеджера):", + "输入待处理音频文件路径(默认是正确格式示例)": "Путь к аудиофайлу, который хотите обработать (ниже указан пример пути к файлу):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Использовать громкость входного файла для замены или перемешивания с громкостью выходного файла. 
Чем ближе соотношение к 1, тем больше используется звука из выходного файла:", + "输入监听": "输入监听", + "输入训练文件夹路径": "Путь к папке с аудиозаписями, на которых будет обучаться модель:", + "输入设备": "Входное устройство", + "输入降噪": "Уменьшение входного шума", + "输出信息": "Статистика", + "输出变声": "输出变声", + "输出设备": "Выходное устройство", + "输出降噪": "Уменьшение выходного шума", + "输出音频(右下角三个点,点了可以下载)": "Аудиофайл (чтобы скачать, нажмите на три точки справа в плеере)", + "选择.index文件": "Выбрать файл .index", + "选择.pth文件": "Выбрать файл .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Выберите алгоритм оценки высоты голоса ('pm': работает быстро, но даёт низкое качество речи; 'harvest': басы лучше, но работает очень медленно; 'crepe': лучшее качество, но сильно нагружает GPU; 'rmvpe': лучшее качество и минимальная нагрузка на GPU):", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Длина сэмпла", + "重载设备列表": "Обновить список устройств", + "音调设置": "Настройка высоты звука", + "音频设备(请使用同种类驱动)": "Аудиоустройство (пожалуйста, используйте такой же тип драйвера)", + "音高算法": "Алгоритм оценки высоты звука", + "额外推理时长": "Доп. время переработки" +} diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json new file mode 100644 index 0000000..bd1c17b --- /dev/null +++ b/i18n/locale/tr_TR.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Eğer >=3 ise, elde edilen pitch sonuçlarına median filtreleme uygula. Bu değer, filtre yarıçapını temsil eder ve nefesliliği azaltabilir.", + "A模型权重": "A Modeli Ağırlığı:", + "A模型路径": "A Modeli Yolu:", + "B模型路径": "B Modeli Yolu:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 eğrisi dosyası (isteğe bağlı). 
Her satırda bir pitch değeri bulunur. Varsayılan F0 ve pitch modülasyonunu değiştirir:", + "Index Rate": "Index Oranı", + "Onnx导出": "Onnx Dışa Aktar", + "Onnx输出路径": "Onnx Dışa Aktarım Yolu:", + "RVC模型路径": "RVC Model Yolu:", + "ckpt处理": "ckpt İşleme", + "harvest进程数": "harvest进程数", + "index文件路径不可包含中文": ".index dosya yolu Çince karakter içeremez", + "pth文件路径不可包含中文": ".pth dosya yolu Çince karakter içeremez", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.", + "step1:正在处理数据": "Adım 1: Veri işleme", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):", + "step3: 填写训练设置, 开始训练模型和索引": "Adım 3: Eğitim ayarlarını doldurun ve modeli ve dizini eğitmeye başlayın", + "step3a:正在训练模型": "Adım 3a: Model eğitimi başladı", + "一键训练": "Tek Tuşla Eğit", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Ses dosyaları ayrıca toplu olarak, iki seçimle, öncelikli okuma klasörüyle içe aktarılabilir", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch işleme kullanarak vokal eşlik ayrımı için UVR5 modeli kullanılır.
Geçerli bir klasör yol formatı örneği: D:\\path\\to\\input\\folder (dosya yöneticisi adres çubuğundan kopyalanır).
Model üç kategoriye ayrılır:
1. Vokalleri koru: Bu seçeneği, harmoni içermeyen sesler için kullanın. HP5'ten daha iyi bir şekilde vokalleri korur. İki dahili model içerir: HP2 ve HP3. HP3, eşlik sesini hafifçe sızdırabilir, ancak vokalleri HP2'den biraz daha iyi korur.
2. Sadece ana vokalleri koru: Bu seçeneği, harmoni içeren sesler için kullanın. Ana vokalleri zayıflatabilir. Bir dahili model içerir: HP5.
3. Reverb ve gecikme modelleri (FoxJoy tarafından):
  (1) MDX-Net: Stereo reverb'i kaldırmak için en iyi seçenek, ancak mono reverb'i kaldıramaz;
 (234) DeEcho: Gecikme efektlerini kaldırır. Agresif mod, Normal moda göre daha kapsamlı bir şekilde kaldırma yapar. DeReverb ayrıca reverb'i kaldırır ve mono reverb'i kaldırabilir, ancak yoğun yankılı yüksek frekanslı içerikler için çok etkili değildir.
Reverb/gecikme notları:
1. DeEcho-DeReverb modelinin işleme süresi diğer iki DeEcho modeline göre yaklaşık olarak iki kat daha uzundur.
2. MDX-Net-Dereverb modeli oldukça yavaştır.
3. Tavsiye edilen en temiz yapılandırma önce MDX-Net'i uygulamak ve ardından DeEcho-Aggressive uygulamaktır.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "GPU indekslerini '-' ile ayırarak girin, örneğin 0-1-2, GPU 0, 1 ve 2'yi kullanmak için:", + "伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Kaydetme Adı:", + "保存的文件名, 默认空为和源文件同名": "Kaydedilecek dosya adı (varsayılan: kaynak dosya ile aynı):", + "保存的模型名不带后缀": "Kaydedilecek model adı (uzantı olmadan):", + "保存频率save_every_epoch": "Kaydetme sıklığı (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Sessiz ünsüzleri ve nefes seslerini koruyarak elektronik müzikte yırtılma gibi sanal hataların oluşmasını engeller. 0.5 olarak ayarlandığında devre dışı kalır. Değerin azaltılması korumayı artırabilir, ancak indeksleme doğruluğunu azaltabilir:", + "修改": "Düzenle", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini düzenle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", + "停止音频转换": "Ses dönüştürmeyi durdur", + "全流程结束!": "Tüm işlemler tamamlandı!", + "刷新音色列表和索引路径": "Ses listesini ve indeks yolunu yenile", + "加载模型": "Model yükle", + "加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:", + "加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:", + "单次推理": "单次推理", + "卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 
0 değeri için yeniden örnekleme yapılmaz:", + "否": "Hayır", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Tepki eşiği", + "响度因子": "ses yüksekliği faktörü", + "处理数据": "Verileri işle", + "导出Onnx模型": "Onnx Modeli Dışa Aktar", + "导出文件格式": "Dışa aktarma dosya formatı", + "常见问题解答": "Sıkça Sorulan Sorular (SSS)", + "常规设置": "Genel ayarlar", + "开始音频转换": "Ses dönüştürmeyi başlat", + "很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.", + "性能设置": "Performans ayarları", + "总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir", + "指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:", + "指定输出文件夹": "Çıkış klasörünü belirt:", + "指定输出非主人声文件夹": "Müzik ve diğer sesler için çıkış klasörünü belirtin:", + "推理时间(ms):": "Çıkarsama süresi (ms):", + "推理音色": "Ses çıkartma (Inference):", + "提取": "Çıkart", + "提取音高和处理数据使用的CPU进程数": "Ses yüksekliği çıkartmak (Pitch) ve verileri işlemek için kullanılacak CPU işlemci sayısı:", + "是": "Evet", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Sadece en son '.ckpt' dosyasını kaydet:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Her kaydetme noktasında son küçük bir modeli 'weights' klasörüne kaydetmek için:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Tüm eğitim verilerini GPU belleğine önbelleğe alıp almayacağınızı belirtin. Küçük veri setlerini (10 dakikadan az) önbelleğe almak eğitimi hızlandırabilir, ancak büyük veri setlerini önbelleğe almak çok fazla GPU belleği tüketir ve çok fazla hız artışı sağlamaz:", + "显卡信息": "GPU Bilgisi", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Bu yazılım, MIT lisansı altında açık kaynaklıdır. Yazarın yazılım üzerinde herhangi bir kontrolü yoktur. Yazılımı kullanan ve yazılım tarafından dışa aktarılan sesleri dağıtan kullanıcılar sorumludur.
Eğer bu maddeyle aynı fikirde değilseniz, yazılım paketi içindeki herhangi bir kod veya dosyayı kullanamaz veya referans göremezsiniz. Detaylar için kök dizindeki Agreement-LICENSE.txt dosyasına bakınız.", + "查看": "Görüntüle", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini görüntüle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", + "检索特征占比": "Arama özelliği oranı (vurgu gücünü kontrol eder, çok yüksek olması sanal etkilere neden olur)", + "模型": "Model", + "模型推理": "Model çıkartma (Inference)", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model çıkartma (büyük dosya modeli yolunu 'logs' klasöründe girin). Bu, eğitimi yarıda bırakmak istediğinizde ve manuel olarak küçük bir model dosyası çıkartmak ve kaydetmek istediğinizde veya bir ara modeli test etmek istediğinizde kullanışlıdır:", + "模型是否带音高指导": "Modelin ses yüksekliği rehberi içerip içermediği:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Modelin ses yüksekliği (Pitch) rehberliği içerip içermediği (şarkı söyleme için şarttır, konuşma için isteğe bağlıdır):", + "模型是否带音高指导,1是0否": "Modelin ses yüksekliği rehberi içerip içermediği (1: evet, 0: hayır):", + "模型版本型号": "Model mimari versiyonu:", + "模型融合, 可用于测试音色融合": "Model birleştirme, ses rengi birleştirmesi için kullanılabilir", + "模型路径": "Model Yolu:", + "每张显卡的batch_size": "Her GPU için yığın boyutu (batch_size):", + "淡入淡出长度": "Geçiş (Fade) uzunluğu", + "版本": "Sürüm", + "特征提取": "Özellik çıkartma", + "特征检索库文件路径,为空则使用下拉的选择结果": "Özellik indeksi dosyasının yolunu belirtin. Seçilen sonucu kullanmak için boş bırakın veya açılır menüden seçim yapın.", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Erkekten kadına çevirmek için +12 tuş önerilir, kadından erkeğe çevirmek için ise -12 tuş önerilir. 
Eğer ses aralığı çok fazla genişler ve ses bozulursa, isteğe bağlı olarak uygun aralığa kendiniz de ayarlayabilirsiniz.", + "目标采样率": "Hedef örnekleme oranı:", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "İndeks yolunu otomatik olarak tespit et ve açılır menüden seçim yap.", + "融合": "Birleştir", + "要改的模型信息": "Düzenlenecek model bilgileri:", + "要置入的模型信息": "Eklemek için model bilgileri:", + "训练": "Eğitim", + "训练模型": "Modeli Eğit", + "训练特征索引": "Özellik Dizinini Eğit", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Eğitim tamamlandı. Eğitim günlüklerini konsolda veya deney klasörü altındaki train.log dosyasında kontrol edebilirsiniz.", + "请指定说话人id": "Lütfen konuşmacı/sanatçı no belirtin:", + "请选择index文件": "Lütfen .index dosyası seçin", + "请选择pth文件": "Lütfen .pth dosyası seçin", + "请选择说话人id": "Konuşmacı/Şarkıcı No seçin:", + "转换": "Dönüştür", + "输入实验名": "Deneysel adı girin:", + "输入待处理音频文件夹路径": "İşlenecek ses klasörünün yolunu girin:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "İşlenecek ses klasörünün yolunu girin (dosya yöneticisinin adres çubuğundan kopyalayın):", + "输入待处理音频文件路径(默认是正确格式示例)": "İşlenecek ses dosyasının yolunu girin (varsayılan doğru format örneğidir):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Sesin hacim zarfını ayarlayın. 0'a yakın değerler, sesin orijinal vokallerin hacmine benzer olmasını sağlar. Düşük bir değerle ses gürültüsünü maskeleyebilir ve hacmi daha doğal bir şekilde duyulabilir hale getirebilirsiniz. 
1'e yaklaştıkça sürekli bir yüksek ses seviyesi elde edilir:", + "输入监听": "输入监听", + "输入训练文件夹路径": "Eğitim klasörünün yolunu girin:", + "输入设备": "Giriş cihazı", + "输入降噪": "Giriş gürültü azaltma", + "输出信息": "Çıkış bilgisi", + "输出变声": "输出变声", + "输出设备": "Çıkış cihazı", + "输出降噪": "Çıkış gürültü azaltma", + "输出音频(右下角三个点,点了可以下载)": "Ses dosyasını dışa aktar (indirmek için sağ alt köşedeki üç noktaya tıklayın)", + "选择.index文件": ".index dosyası seç", + "选择.pth文件": ".pth dosyası seç", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Pitch algoritmasını seçin ('pm': daha hızlı çıkarır ancak daha düşük kaliteli konuşma; 'harvest': daha iyi konuşma sesi ancak son derece yavaş; 'crepe': daha da iyi kalite ancak GPU yoğunluğu gerektirir):", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Örnekleme uzunluğu", + "重载设备列表": "Cihaz listesini yeniden yükle", + "音调设置": "Pitch ayarları", + "音频设备(请使用同种类驱动)": "Ses cihazı (aynı tür sürücüyü kullanın)", + "音高算法": "音高算法", + "额外推理时长": "Ekstra çıkartma süresi" +} diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json new file mode 100644 index 0000000..32ca5ef --- /dev/null +++ b/i18n/locale/zh_CN.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音", + "A模型权重": "A模型权重", + "A模型路径": "A模型路径", + "B模型路径": "B模型路径", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt处理", + "harvest进程数": "harvest进程数", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": 
"rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", + "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一键训练", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "保存名", + "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", + "保存的模型名不带后缀": "保存的模型名不带后缀", + "保存频率save_every_epoch": "保存频率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", + "停止音频转换": "停止音频转换", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路径", + "加载模型": "加载模型", + "加载预训练底模D路径": "加载预训练底模D路径", + "加载预训练底模G路径": "加载预训练底模G路径", + "单次推理": "单次推理", + "卸载音色省显存": "卸载音色省显存", + "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "响应阈值", + "响度因子": "响度因子", + "处理数据": "处理数据", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "导出文件格式", + "常见问题解答": "常见问题解答", + "常规设置": "常规设置", + "开始音频转换": "开始音频转换", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "性能设置", + "总训练轮数total_epoch": "总训练轮数total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定输出文件夹", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理时间(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", + "显卡信息": "显卡信息", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", + "检索特征占比": "检索特征占比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", + "模型是否带音高指导": "模型是否带音高指导", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", + "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否", + "模型版本型号": "模型版本型号", + "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", + "模型路径": "模型路径", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出长度", + "版本": "版本", + "特征提取": "特征提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", + "目标采样率": "目标采样率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型信息", + "要置入的模型信息": "要置入的模型信息", + "训练": "训练", + "训练模型": "训练模型", + "训练特征索引": "训练特征索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "请指定说话人id": "请指定说话人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "请选择说话人id", + "转换": "转换", + "输入实验名": "输入实验名", + "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", + "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", + "输入监听": "输入监听", + "输入训练文件夹路径": "输入训练文件夹路径", + "输入设备": "输入设备", + "输入降噪": "输入降噪", + "输出信息": "输出信息", + "输出变声": "输出变声", + "输出设备": "输出设备", + "输出降噪": "输出降噪", + "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", + "选择.index文件": "选择.index文件", + "选择.pth文件": "选择.pth文件", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + 
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "采样长度", + "重载设备列表": "重载设备列表", + "音调设置": "音调设置", + "音频设备(请使用同种类驱动)": "音频设备(请使用同种类驱动)", + "音高算法": "音高算法", + "额外推理时长": "额外推理时长" +} diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json new file mode 100644 index 0000000..93aaff3 --- /dev/null +++ b/i18n/locale/zh_HK.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", + "A模型权重": "A模型權重", + "A模型路径": "A模型路徑", + "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt處理", + "harvest进程数": "harvest進程數", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", + "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一鍵訓練", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "儲存名", + "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", + "保存的模型名不带后缀": "儲存的模型名不帶副檔名", + "保存频率save_every_epoch": "保存頻率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "停止音频转换": "停止音訊轉換", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路徑", + "加载模型": "載入模型", + "加载预训练底模D路径": "加載預訓練底模D路徑", + "加载预训练底模G路径": "加載預訓練底模G路徑", + "单次推理": "单次推理", + "卸载音色省显存": "卸載音色節省 VRAM", + "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "響應閾值", + "响度因子": "響度因子", + "处理数据": "處理資料", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "導出檔格式", + "常见问题解答": "常見問題解答", + "常规设置": "一般設定", + "开始音频转换": "開始音訊轉換", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "效能設定", + "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定輸出資料夾", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理時間(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", + "显卡信息": "顯示卡資訊", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "检索特征占比": "檢索特徵佔比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", + "模型是否带音高指导": "模型是否帶音高指導", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", + "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", + "模型版本型号": "模型版本型號", + "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", + "模型路径": "模型路徑", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出長度", + "版本": "版本", + "特征提取": "特徵提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", + "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型資訊", + "要置入的模型信息": "要置入的模型資訊", + "训练": "訓練", + "训练模型": "訓練模型", + "训练特征索引": "訓練特徵索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "请指定说话人id": "請指定說話人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "請選擇說話人ID", + "转换": "轉換", + "输入实验名": "輸入實驗名稱", + "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", + "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", + "输入训练文件夹路径": "輸入訓練檔案夾路徑", + "输入设备": "輸入設備", + "输入降噪": "輸入降噪", + "输出信息": "輸出訊息", + "输出变声": "输出变声", + "输出设备": "輸出設備", + "输出降噪": "輸出降噪", + "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", + "选择.index文件": "選擇 .index 檔案", + "选择.pth文件": "選擇 .pth 檔案", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + 
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "取樣長度", + "重载设备列表": "重載設備列表", + "音调设置": "音調設定", + "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", + "音高算法": "音高演算法", + "额外推理时长": "額外推理時長" +} diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json new file mode 100644 index 0000000..93aaff3 --- /dev/null +++ b/i18n/locale/zh_SG.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", + "A模型权重": "A模型權重", + "A模型路径": "A模型路徑", + "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt處理", + "harvest进程数": "harvest進程數", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", + "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一鍵訓練", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "儲存名", + "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", + "保存的模型名不带后缀": "儲存的模型名不帶副檔名", + "保存频率save_every_epoch": "保存頻率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "停止音频转换": "停止音訊轉換", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路徑", + "加载模型": "載入模型", + "加载预训练底模D路径": "加載預訓練底模D路徑", + "加载预训练底模G路径": "加載預訓練底模G路徑", + "单次推理": "单次推理", + "卸载音色省显存": "卸載音色節省 VRAM", + "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "響應閾值", + "响度因子": "響度因子", + "处理数据": "處理資料", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "導出檔格式", + "常见问题解答": "常見問題解答", + "常规设置": "一般設定", + "开始音频转换": "開始音訊轉換", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "效能設定", + "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定輸出資料夾", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理時間(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", + "显卡信息": "顯示卡資訊", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "检索特征占比": "檢索特徵佔比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", + "模型是否带音高指导": "模型是否帶音高指導", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", + "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", + "模型版本型号": "模型版本型號", + "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", + "模型路径": "模型路徑", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出長度", + "版本": "版本", + "特征提取": "特徵提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", + "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型資訊", + "要置入的模型信息": "要置入的模型資訊", + "训练": "訓練", + "训练模型": "訓練模型", + "训练特征索引": "訓練特徵索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "请指定说话人id": "請指定說話人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "請選擇說話人ID", + "转换": "轉換", + "输入实验名": "輸入實驗名稱", + "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", + "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", + "输入训练文件夹路径": "輸入訓練檔案夾路徑", + "输入设备": "輸入設備", + "输入降噪": "輸入降噪", + "输出信息": "輸出訊息", + "输出变声": "输出变声", + "输出设备": "輸出設備", + "输出降噪": "輸出降噪", + "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", + "选择.index文件": "選擇 .index 檔案", + "选择.pth文件": "選擇 .pth 檔案", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + 
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "取樣長度", + "重载设备列表": "重載設備列表", + "音调设置": "音調設定", + "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", + "音高算法": "音高演算法", + "额外推理时长": "額外推理時長" +} diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json new file mode 100644 index 0000000..93aaff3 --- /dev/null +++ b/i18n/locale/zh_TW.json @@ -0,0 +1,135 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", + "A模型权重": "A模型權重", + "A模型路径": "A模型路徑", + "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt處理", + "harvest进程数": "harvest進程數", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", + "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一鍵訓練", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "儲存名", + "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", + "保存的模型名不带后缀": "儲存的模型名不帶副檔名", + "保存频率save_every_epoch": "保存頻率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "停止音频转换": "停止音訊轉換", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路徑", + "加载模型": "載入模型", + "加载预训练底模D路径": "加載預訓練底模D路徑", + "加载预训练底模G路径": "加載預訓練底模G路徑", + "单次推理": "单次推理", + "卸载音色省显存": "卸載音色節省 VRAM", + "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "響應閾值", + "响度因子": "響度因子", + "处理数据": "處理資料", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "導出檔格式", + "常见问题解答": "常見問題解答", + "常规设置": "一般設定", + "开始音频转换": "開始音訊轉換", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "效能設定", + "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定輸出資料夾", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理時間(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", + "显卡信息": "顯示卡資訊", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "检索特征占比": "檢索特徵佔比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", + "模型是否带音高指导": "模型是否帶音高指導", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", + "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", + "模型版本型号": "模型版本型號", + "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", + "模型路径": "模型路徑", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出長度", + "版本": "版本", + "特征提取": "特徵提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", + "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型資訊", + "要置入的模型信息": "要置入的模型資訊", + "训练": "訓練", + "训练模型": "訓練模型", + "训练特征索引": "訓練特徵索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "请指定说话人id": "請指定說話人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "請選擇說話人ID", + "转换": "轉換", + "输入实验名": "輸入實驗名稱", + "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", + "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", + "输入训练文件夹路径": "輸入訓練檔案夾路徑", + "输入设备": "輸入設備", + "输入降噪": "輸入降噪", + "输出信息": "輸出訊息", + "输出变声": "输出变声", + "输出设备": "輸出設備", + "输出降噪": "輸出降噪", + "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", + "选择.index文件": "選擇 .index 檔案", + "选择.pth文件": "選擇 .pth 檔案", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + 
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "取樣長度", + "重载设备列表": "重載設備列表", + "音调设置": "音調設定", + "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", + "音高算法": "音高演算法", + "额外推理时长": "額外推理時長" +} From c16d1f50cdbcd42d842112e3756ad163a8dd218a Mon Sep 17 00:00:00 2001 From: DW <147780325+D3lik@users.noreply.github.com> Date: Wed, 17 Jan 2024 23:24:56 +1100 Subject: [PATCH 33/58] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60b1f46..25ab373 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ## Environment Preparation -If you are windows users (tested with win>=10), you don't need read this part. Just download the [Integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main), unzip it and double-click go-webui.bat to start GPT-SoVITS-WebUI. +If you are a **Windows** user (tested with win>=10) you can install directly via the prezip. Just download the [prezip](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true), unzip it and double-click **go-webui.bat** to start GPT-SoVITS-WebUI. ### Python and PyTorch Version From 8fd49410d13f646ec9b7d417f3cb6bf989d236d2 Mon Sep 17 00:00:00 2001 From: DW <147780325+D3lik@users.noreply.github.com> Date: Wed, 17 Jan 2024 23:25:29 +1100 Subject: [PATCH 34/58] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 25ab373..821dda5 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ## Environment Preparation -If you are a **Windows** user (tested with win>=10) you can install directly via the prezip. 
Just download the [prezip](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true), unzip it and double-click **go-webui.bat** to start GPT-SoVITS-WebUI. +If you are a Windows user (tested with win>=10) you can install directly via the prezip. Just download the [prezip](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true), unzip it and double-click go-webui.bat to start GPT-SoVITS-WebUI. ### Python and PyTorch Version From f2f3d1786773688cc98e1aaecc02fdf0d0bf192f Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 00:30:37 +0800 Subject: [PATCH 35/58] Add files via upload --- webui.py | 1317 +++++++++++++++--------------------------------------- 1 file changed, 354 insertions(+), 963 deletions(-) diff --git a/webui.py b/webui.py index e51b9e6..dbccba7 100644 --- a/webui.py +++ b/webui.py @@ -1,48 +1,35 @@ -import json, yaml, warnings, torch +import json,yaml,warnings,torch import platform warnings.filterwarnings("ignore") torch.manual_seed(233333) -import os, sys - +import os,pdb,sys now_dir = os.getcwd() tmp = os.path.join(now_dir, "TEMP") os.makedirs(tmp, exist_ok=True) os.environ["TEMP"] = tmp import site - -site_packages_root = "%s/runtime/Lib/site-packages" % now_dir +site_packages_root="%s/runtime/Lib/site-packages"%now_dir for path in site.getsitepackages(): - if "site-packages" in path: - site_packages_root = path + if("site-packages"in path):site_packages_root=path os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" -with open("%s/users.pth" % (site_packages_root), "w") as f: - f.write( - "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5" - % (now_dir, now_dir, now_dir, now_dir, now_dir) - ) +with open("%s/users.pth"%(site_packages_root),"w")as f: + 
f.write("%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"%(now_dir,now_dir,now_dir,now_dir,now_dir)) import traceback - sys.path.append(now_dir) +import shutil +import pdb import gradio as gr from subprocess import Popen -from config import ( - python_exec, - infer_device, - is_half, - exp_root, - webui_port_main, - webui_port_infer_tts, - webui_port_uvr5, - webui_port_subfix, -) -from tools.i18n.i18n import I18nAuto - +import signal +from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix +from i18n.i18n import I18nAuto i18n = I18nAuto() +from scipy.io import wavfile +from tools.my_utils import load_audio from multiprocessing import cpu_count - -n_cpu = cpu_count() +n_cpu=cpu_count() # 判断是否有能用来训练和加速推理的N卡 ngpu = torch.cuda.device_count() @@ -53,42 +40,11 @@ if_gpu_ok = False if torch.cuda.is_available() or ngpu != 0: for i in range(ngpu): gpu_name = torch.cuda.get_device_name(i) - if any( - value in gpu_name.upper() - for value in [ - "10", - "16", - "20", - "30", - "40", - "A2", - "A3", - "A4", - "P4", - "A50", - "500", - "A60", - "70", - "80", - "90", - "M4", - "T4", - "TITAN", - "L", - ] - ): + if any(value in gpu_name.upper()for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L"]): # A10#A100#V100#A40#P40#M40#K80#A4500 if_gpu_ok = True # 至少有一张能用的N卡 gpu_infos.append("%s\t%s" % (i, gpu_name)) - mem.append( - int( - torch.cuda.get_device_properties(i).total_memory - / 1024 - / 1024 - / 1024 - + 0.4 - ) - ) + mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4)) if if_gpu_ok and len(gpu_infos) > 0: gpu_info = "\n".join(gpu_infos) default_batch_size = min(mem) // 2 @@ -97,395 +53,230 @@ else: default_batch_size = 1 gpus = "-".join([i[0] for i in gpu_infos]) -pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth" -pretrained_gpt_name = ( - 
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" -) - - +pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth" +pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" def get_weights_names(): SoVITS_names = [pretrained_sovits_name] for name in os.listdir(SoVITS_weight_root): - if name.endswith(".pth"): - SoVITS_names.append(name) + if name.endswith(".pth"):SoVITS_names.append(name) GPT_names = [pretrained_gpt_name] for name in os.listdir(GPT_weight_root): - if name.endswith(".ckpt"): - GPT_names.append(name) - return SoVITS_names, GPT_names - - -SoVITS_weight_root = "SoVITS_weights" -GPT_weight_root = "GPT_weights" -os.makedirs(SoVITS_weight_root, exist_ok=True) -os.makedirs(GPT_weight_root, exist_ok=True) -SoVITS_names, GPT_names = get_weights_names() - + if name.endswith(".ckpt"): GPT_names.append(name) + return SoVITS_names,GPT_names +SoVITS_weight_root="SoVITS_weights" +GPT_weight_root="GPT_weights" +os.makedirs(SoVITS_weight_root,exist_ok=True) +os.makedirs(GPT_weight_root,exist_ok=True) +SoVITS_names,GPT_names = get_weights_names() def change_choices(): SoVITS_names, GPT_names = get_weights_names() - return {"choices": sorted(SoVITS_names), "__type__": "update"}, { - "choices": sorted(GPT_names), - "__type__": "update", - } - - -p_label = None -p_uvr5 = None -p_asr = None -p_tts_inference = None - -system = platform.system() + return {"choices": sorted(SoVITS_names), "__type__": "update"}, {"choices": sorted(GPT_names), "__type__": "update"} +p_label=None +p_uvr5=None +p_asr=None +p_tts_inference=None +system=platform.system() def kill_process(pid): - if system == "Windows": + if(system=="Windows"): cmd = "taskkill /t /f /pid %s" % pid else: - cmd = "kill -9 %s" % pid + cmd = "kill -9 %s"%pid print(cmd) - os.system(cmd) ###linux上杀了webui,可能还会没杀干净。。。 + os.system(cmd)###linux上杀了webui,可能还会没杀干净。。。 # os.kill(p_label.pid,19)#主进程#控制台进程#python子进程###不好使,连主进程的webui一起关了,辣鸡 - -def 
change_label(if_label, path_list): +def change_label(if_label,path_list): global p_label - if if_label == True and p_label == None: - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s' % ( - python_exec, - path_list, - webui_port_subfix, - ) + if(if_label==True and p_label==None): + cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s'%(python_exec,path_list,webui_port_subfix) yield "打标工具WebUI已开启" print(cmd) p_label = Popen(cmd, shell=True) - elif if_label == False and p_label != None: + elif(if_label==False and p_label!=None): kill_process(p_label.pid) - p_label = None + p_label=None yield "打标工具WebUI已关闭" - def change_uvr5(if_uvr5): global p_uvr5 - if if_uvr5 == True and p_uvr5 == None: - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s' % ( - python_exec, - infer_device, - is_half, - webui_port_uvr5, - ) + if(if_uvr5==True and p_uvr5==None): + cmd = '"%s" tools/uvr5/webui.py "%s" %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5) yield "UVR5已开启" print(cmd) p_uvr5 = Popen(cmd, shell=True) - elif if_uvr5 == False and p_uvr5 != None: + elif(if_uvr5==False and p_uvr5!=None): kill_process(p_uvr5.pid) - p_uvr5 = None + p_uvr5=None yield "UVR5已关闭" - -def change_tts_inference( - if_tts, bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path -): +def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path): global p_tts_inference - if if_tts == True and p_tts_inference == None: - os.environ["gpt_path"] = ( - gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) - ) - os.environ["sovits_path"] = ( - sovits_path - if "/" in sovits_path - else "%s/%s" % (SoVITS_weight_root, sovits_path) - ) - os.environ["cnhubert_base_path"] = cnhubert_base_path - os.environ["bert_path"] = bert_path - os.environ["_CUDA_VISIBLE_DEVICES"] = gpu_number - os.environ["is_half"] = str(is_half) - os.environ["infer_ttswebui"] = str(webui_port_infer_tts) - cmd = '"%s" GPT_SoVITS/inference_webui.py' % 
(python_exec) + if(if_tts==True and p_tts_inference==None): + os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path) + os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path) + os.environ["cnhubert_base_path"]=cnhubert_base_path + os.environ["bert_path"]=bert_path + os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number + os.environ["is_half"]=str(is_half) + os.environ["infer_ttswebui"]=str(webui_port_infer_tts) + cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec) yield "TTS推理进程已开启" print(cmd) p_tts_inference = Popen(cmd, shell=True) - elif if_tts == False and p_tts_inference != None: + elif(if_tts==False and p_tts_inference!=None): kill_process(p_tts_inference.pid) - p_tts_inference = None + p_tts_inference=None yield "TTS推理进程已关闭" def open_asr(asr_inp_dir): global p_asr - if p_asr == None: - cmd = '"%s" tools/damo_asr/cmd-asr.py "%s"' % (python_exec, asr_inp_dir) - yield "ASR任务开启:%s" % cmd, {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + if(p_asr==None): + cmd = '"%s" tools/damo_asr/cmd-asr.py "%s"'%(python_exec,asr_inp_dir) + yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} print(cmd) p_asr = Popen(cmd, shell=True) p_asr.wait() - p_asr = None - yield "ASR任务完成", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + p_asr=None + yield "ASR任务完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的ASR任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - + yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} def close_asr(): global p_asr - if p_asr != None: + if(p_asr!=None): kill_process(p_asr.pid) - p_asr = None - return ( - "已终止ASR进程", - {"__type__": "update", "visible": True}, - {"__type__": 
"update", "visible": False}, - ) + p_asr=None + return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False} - -""" +''' button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Bb,button1Ba_open,button1Ba_close]) button1Ba_close.click(close1Ba, [], [info1Bb,button1Ba_open,button1Ba_close]) -""" -p_train_SoVITS = None - - -def open1Ba( - batch_size, - total_epoch, - exp_name, - text_low_lr_rate, - if_save_latest, - if_save_every_weights, - save_every_epoch, - gpu_numbers1Ba, - pretrained_s2G, - pretrained_s2D, -): +''' +p_train_SoVITS=None +def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D): global p_train_SoVITS - if p_train_SoVITS == None: - with open("GPT_SoVITS/configs/s2.json") as f: - data = f.read() - data = json.loads(data) - s2_dir = "%s/%s" % (exp_root, exp_name) - os.makedirs("%s/logs_s2" % (s2_dir), exist_ok=True) - data["train"]["batch_size"] = batch_size - data["train"]["epochs"] = total_epoch - data["train"]["text_low_lr_rate"] = text_low_lr_rate - data["train"]["pretrained_s2G"] = pretrained_s2G - data["train"]["pretrained_s2D"] = pretrained_s2D - data["train"]["if_save_latest"] = if_save_latest - data["train"]["if_save_every_weights"] = if_save_every_weights - data["train"]["save_every_epoch"] = save_every_epoch - data["train"]["gpu_numbers"] = gpu_numbers1Ba - data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir - data["save_weight_dir"] = SoVITS_weight_root - data["name"] = exp_name - tmp_config_path = "TEMP/tmp_s2.json" - with open(tmp_config_path, "w") as f: - f.write(json.dumps(data)) + if(p_train_SoVITS==None): + with open("GPT_SoVITS/configs/s2.json")as f: + data=f.read() + data=json.loads(data) + s2_dir="%s/%s"%(exp_root,exp_name) + os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True) + 
data["train"]["batch_size"]=batch_size + data["train"]["epochs"]=total_epoch + data["train"]["text_low_lr_rate"]=text_low_lr_rate + data["train"]["pretrained_s2G"]=pretrained_s2G + data["train"]["pretrained_s2D"]=pretrained_s2D + data["train"]["if_save_latest"]=if_save_latest + data["train"]["if_save_every_weights"]=if_save_every_weights + data["train"]["save_every_epoch"]=save_every_epoch + data["train"]["gpu_numbers"]=gpu_numbers1Ba + data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir + data["save_weight_dir"]=SoVITS_weight_root + data["name"]=exp_name + tmp_config_path="TEMP/tmp_s2.json" + with open(tmp_config_path,"w")as f:f.write(json.dumps(data)) - cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % ( - python_exec, - tmp_config_path, - ) - yield "SoVITS训练开始:%s" % cmd, {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path) + yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} print(cmd) p_train_SoVITS = Popen(cmd, shell=True) p_train_SoVITS.wait() - p_train_SoVITS = None - yield "SoVITS训练完成", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + p_train_SoVITS=None + yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务", { - "__type__": "update", - "visible": False, - }, {"__type__": "update", "visible": True} - + yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} def close1Ba(): global p_train_SoVITS - if p_train_SoVITS != None: + if(p_train_SoVITS!=None): kill_process(p_train_SoVITS.pid) - p_train_SoVITS = None - return ( - "已终止SoVITS训练", - {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False}, - ) + p_train_SoVITS=None + return 
"已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False} - -p_train_GPT = None - - -def open1Bb( - batch_size, - total_epoch, - exp_name, - if_save_latest, - if_save_every_weights, - save_every_epoch, - gpu_numbers, - pretrained_s1, -): +p_train_GPT=None +def open1Bb(batch_size,total_epoch,exp_name,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1): global p_train_GPT - if p_train_GPT == None: - with open("GPT_SoVITS/configs/s1longer.yaml") as f: - data = f.read() - data = yaml.load(data, Loader=yaml.FullLoader) - s1_dir = "%s/%s" % (exp_root, exp_name) - os.makedirs("%s/logs_s1" % (s1_dir), exist_ok=True) - data["train"]["batch_size"] = batch_size - data["train"]["epochs"] = total_epoch - data["pretrained_s1"] = pretrained_s1 - data["train"]["save_every_n_epoch"] = save_every_epoch - data["train"]["if_save_every_weights"] = if_save_every_weights - data["train"]["if_save_latest"] = if_save_latest - data["train"]["half_weights_save_dir"] = GPT_weight_root - data["train"]["exp_name"] = exp_name - data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir - data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir - data["output_dir"] = "%s/logs_s1" % s1_dir + if(p_train_GPT==None): + with open("GPT_SoVITS/configs/s1longer.yaml")as f: + data=f.read() + data=yaml.load(data, Loader=yaml.FullLoader) + s1_dir="%s/%s"%(exp_root,exp_name) + os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True) + data["train"]["batch_size"]=batch_size + data["train"]["epochs"]=total_epoch + data["pretrained_s1"]=pretrained_s1 + data["train"]["save_every_n_epoch"]=save_every_epoch + data["train"]["if_save_every_weights"]=if_save_every_weights + data["train"]["if_save_latest"]=if_save_latest + data["train"]["half_weights_save_dir"]=GPT_weight_root + data["train"]["exp_name"]=exp_name + data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir + data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir + 
data["output_dir"]="%s/logs_s1"%s1_dir - os.environ["_CUDA_VISIBLE_DEVICES"] = gpu_numbers.replace("-", ",") - os.environ["hz"] = "25hz" - tmp_config_path = "TEMP/tmp_s1.yaml" - with open(tmp_config_path, "w") as f: - f.write(yaml.dump(data, default_flow_style=False)) + os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",") + os.environ["hz"]="25hz" + tmp_config_path="TEMP/tmp_s1.yaml" + with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False)) # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) - cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" ' % ( - python_exec, - tmp_config_path, - ) - yield "GPT训练开始:%s" % cmd, {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path) + yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} print(cmd) p_train_GPT = Popen(cmd, shell=True) p_train_GPT.wait() - p_train_GPT = None - yield "GPT训练完成", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + p_train_GPT=None + yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务", { - "__type__": "update", - "visible": False, - }, {"__type__": "update", "visible": True} - + yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} def close1Bb(): global p_train_GPT - if p_train_GPT != None: + if(p_train_GPT!=None): kill_process(p_train_GPT.pid) - p_train_GPT = None - return ( - "已终止GPT训练", - {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False}, - ) + p_train_GPT=None + return 
"已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False} - -ps_slice = [] - - -def open_slice( - inp, - opt_root, - threshold, - min_length, - min_interval, - hop_size, - max_sil_kept, - _max, - alpha, - n_parts, -): +ps_slice=[] +def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts): global ps_slice - if os.path.exists(inp) == False: - yield "输入路径不存在", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + if(os.path.exists(inp)==False): + yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False} return - if os.path.isfile(inp): - n_parts = 1 - elif os.path.isdir(inp): - pass + if os.path.isfile(inp):n_parts=1 + elif os.path.isdir(inp):pass else: - yield "输入路径存在但既不是文件也不是文件夹", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False} return - if ps_slice == []: + if (ps_slice == []): for i_part in range(n_parts): - cmd = ( - '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' - "" - % ( - python_exec, - inp, - opt_root, - threshold, - min_length, - min_interval, - hop_size, - max_sil_kept, - _max, - alpha, - i_part, - n_parts, - ) - ) + cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts) print(cmd) p = Popen(cmd, shell=True) ps_slice.append(p) - yield "切割执行中", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} for p in ps_slice: p.wait() - ps_slice = [] - yield "切割结束", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + ps_slice=[] + yield 
"切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - + yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} def close_slice(): global ps_slice - if ps_slice != []: + if (ps_slice != []): for p_slice in ps_slice: try: kill_process(p_slice.pid) except: traceback.print_exc() - ps_slice = [] - return ( - "已终止所有切割进程", - {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False}, - ) + ps_slice=[] + return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} - -""" +''' inp_text= os.environ.get("inp_text") inp_wav_dir= os.environ.get("inp_wav_dir") exp_name= os.environ.get("exp_name") @@ -494,71 +285,53 @@ all_parts= os.environ.get("all_parts") os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") opt_dir= os.environ.get("opt_dir")#"/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name bert_pretrained_dir= os.environ.get("bert_pretrained_dir")#"/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" -""" -ps1a = [] - - -def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): +''' +ps1a=[] +def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): global ps1a - if ps1a == []: - config = { - "inp_text": inp_text, - "inp_wav_dir": inp_wav_dir, - "exp_name": exp_name, - "opt_dir": "%s/%s" % (exp_root, exp_name), - "bert_pretrained_dir": bert_pretrained_dir, + if (ps1a == []): + config={ + "inp_text":inp_text, + "inp_wav_dir":inp_wav_dir, + "exp_name":exp_name, + "opt_dir":"%s/%s"%(exp_root,exp_name), + "bert_pretrained_dir":bert_pretrained_dir, } - gpu_names = gpu_numbers.split("-") - all_parts = len(gpu_names) + gpu_names=gpu_numbers.split("-") + all_parts=len(gpu_names) for 
i_part in range(all_parts): config.update( { "i_part": str(i_part), "all_parts": str(all_parts), "_CUDA_VISIBLE_DEVICES": gpu_names[i_part], - "is_half": str(is_half), + "is_half": str(is_half) } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec print(cmd) p = Popen(cmd, shell=True) ps1a.append(p) - yield "文本进程执行中", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} for p in ps1a: p.wait() - ps1a = [] - yield "文本进程结束", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + ps1a=[] + yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - + yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} def close1a(): global ps1a - if ps1a != []: + if (ps1a != []): for p1a in ps1a: try: kill_process(p1a.pid) except: traceback.print_exc() - ps1a = [] - return ( - "已终止所有1a进程", - {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False}, - ) - - -""" + ps1a=[] + return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} +''' inp_text= os.environ.get("inp_text") inp_wav_dir= os.environ.get("inp_wav_dir") exp_name= os.environ.get("exp_name") @@ -567,23 +340,21 @@ all_parts= os.environ.get("all_parts") os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") opt_dir= os.environ.get("opt_dir") cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") -""" -ps1b = [] - - -def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): +''' +ps1b=[] +def 
open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): global ps1b - if ps1b == []: - config = { - "inp_text": inp_text, - "inp_wav_dir": inp_wav_dir, - "exp_name": exp_name, - "opt_dir": "%s/%s" % (exp_root, exp_name), - "cnhubert_base_dir": ssl_pretrained_dir, - "is_half": str(is_half), + if (ps1b == []): + config={ + "inp_text":inp_text, + "inp_wav_dir":inp_wav_dir, + "exp_name":exp_name, + "opt_dir":"%s/%s"%(exp_root,exp_name), + "cnhubert_base_dir":ssl_pretrained_dir, + "is_half": str(is_half) } - gpu_names = gpu_numbers.split("-") - all_parts = len(gpu_names) + gpu_names=gpu_numbers.split("-") + all_parts=len(gpu_names) for i_part in range(all_parts): config.update( { @@ -593,47 +364,29 @@ def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): } ) os.environ.update(config) - cmd = ( - '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec - ) + cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec print(cmd) p = Popen(cmd, shell=True) ps1b.append(p) - yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} for p in ps1b: p.wait() - ps1b = [] - yield "SSL提取进程结束", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + ps1b=[] + yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", { - "__type__": "update", - "visible": False, - }, {"__type__": "update", "visible": True} - + yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} def close1b(): global ps1b - if ps1b != []: + if (ps1b != []): for p1b in ps1b: try: kill_process(p1b.pid) except: traceback.print_exc() - ps1b = [] - return ( - "已终止所有1b进程", - {"__type__": "update", "visible": True}, - 
{"__type__": "update", "visible": False}, - ) - - -""" + ps1b=[] + return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} +''' inp_text= os.environ.get("inp_text") exp_name= os.environ.get("exp_name") i_part= os.environ.get("i_part") @@ -641,23 +394,21 @@ all_parts= os.environ.get("all_parts") os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") opt_dir= os.environ.get("opt_dir") pretrained_s2G= os.environ.get("pretrained_s2G") -""" -ps1c = [] - - -def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): +''' +ps1c=[] +def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): global ps1c - if ps1c == []: - config = { - "inp_text": inp_text, - "exp_name": exp_name, - "opt_dir": "%s/%s" % (exp_root, exp_name), - "pretrained_s2G": pretrained_s2G_path, - "s2config_path": "GPT_SoVITS/configs/s2.json", - "is_half": str(is_half), + if (ps1c == []): + config={ + "inp_text":inp_text, + "exp_name":exp_name, + "opt_dir":"%s/%s"%(exp_root,exp_name), + "pretrained_s2G":pretrained_s2G_path, + "s2config_path":"GPT_SoVITS/configs/s2.json", + "is_half": str(is_half) } - gpu_names = gpu_numbers.split("-") - all_parts = len(gpu_names) + gpu_names=gpu_numbers.split("-") + all_parts=len(gpu_names) for i_part in range(all_parts): config.update( { @@ -667,76 +418,48 @@ def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec print(cmd) p = Popen(cmd, shell=True) ps1c.append(p) - yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} for p in ps1c: p.wait() - ps1c = [] - yield "语义token提取进程结束", {"__type__": "update", "visible": True}, { - "__type__": 
"update", - "visible": False, - } + ps1c=[] + yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: - yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", { - "__type__": "update", - "visible": False, - }, {"__type__": "update", "visible": True} - + yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} def close1c(): global ps1c - if ps1c != []: + if (ps1c != []): for p1c in ps1c: try: kill_process(p1c.pid) except: traceback.print_exc() - ps1c = [] - return ( - "已终止所有语义token进程", - {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False}, - ) - - + ps1c=[] + return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G -ps1abc = [] - - -def open1abc( - inp_text, - inp_wav_dir, - exp_name, - gpu_numbers1a, - gpu_numbers1Ba, - gpu_numbers1c, - bert_pretrained_dir, - ssl_pretrained_dir, - pretrained_s2G_path, -): +ps1abc=[] +def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path): global ps1abc - if ps1abc == []: - opt_dir = "%s/%s" % (exp_root, exp_name) + if (ps1abc == []): + opt_dir="%s/%s"%(exp_root,exp_name) try: #############################1a - path_text = "%s/2-name2text.txt" % opt_dir - if os.path.exists(path_text) == False: - config = { - "inp_text": inp_text, - "inp_wav_dir": inp_wav_dir, - "exp_name": exp_name, - "opt_dir": opt_dir, - "bert_pretrained_dir": bert_pretrained_dir, - "is_half": str(is_half), + path_text="%s/2-name2text.txt" % opt_dir + if(os.path.exists(path_text)==False): + config={ + "inp_text":inp_text, + "inp_wav_dir":inp_wav_dir, + "exp_name":exp_name, + "opt_dir":opt_dir, + "bert_pretrained_dir":bert_pretrained_dir, + "is_half": str(is_half) } - 
gpu_names = gpu_numbers1a.split("-") - all_parts = len(gpu_names) + gpu_names=gpu_numbers1a.split("-") + all_parts=len(gpu_names) for i_part in range(all_parts): config.update( { @@ -746,43 +469,34 @@ def open1abc( } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield "进度:1a-ing", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - for p in ps1abc: - p.wait() + yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + for p in ps1abc:p.wait() opt = [] - for i_part in range( - all_parts - ): # txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) + for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) - with open(txt_path, "r", encoding="utf8") as f: + with open(txt_path, "r",encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(txt_path) - with open(path_text, "w", encoding="utf8") as f: + with open(path_text, "w",encoding="utf8") as f: f.write("\n".join(opt) + "\n") - yield "进度:1a-done", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - ps1abc = [] + yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + ps1abc=[] #############################1b - config = { - "inp_text": inp_text, - "inp_wav_dir": inp_wav_dir, - "exp_name": exp_name, - "opt_dir": opt_dir, - "cnhubert_base_dir": ssl_pretrained_dir, + config={ + "inp_text":inp_text, + "inp_wav_dir":inp_wav_dir, + "exp_name":exp_name, + "opt_dir":opt_dir, + "cnhubert_base_dir":ssl_pretrained_dir, } - gpu_names = gpu_numbers1Ba.split("-") - all_parts = len(gpu_names) + gpu_names=gpu_numbers1Ba.split("-") + all_parts=len(gpu_names) for i_part in range(all_parts): 
config.update( { @@ -792,36 +506,26 @@ def open1abc( } ) os.environ.update(config) - cmd = ( - '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' - % python_exec - ) + cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - for p in ps1abc: - p.wait() - yield "进度:1a1b-done", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - ps1abc = [] + yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + for p in ps1abc:p.wait() + yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + ps1abc=[] #############################1c path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if os.path.exists(path_semantic) == False: - config = { - "inp_text": inp_text, - "exp_name": exp_name, - "opt_dir": opt_dir, - "pretrained_s2G": pretrained_s2G_path, - "s2config_path": "GPT_SoVITS/configs/s2.json", + if(os.path.exists(path_semantic)==False): + config={ + "inp_text":inp_text, + "exp_name":exp_name, + "opt_dir":opt_dir, + "pretrained_s2G":pretrained_s2G_path, + "s2config_path":"GPT_SoVITS/configs/s2.json", } - gpu_names = gpu_numbers1c.split("-") - all_parts = len(gpu_names) + gpu_names=gpu_numbers1c.split("-") + all_parts=len(gpu_names) for i_part in range(all_parts): config.update( { @@ -831,137 +535,74 @@ def open1abc( } ) os.environ.update(config) - cmd = ( - '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' - % python_exec - ) + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - for p in ps1abc: - p.wait() + yield "进度:1a1b-done, 1cing", 
{"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + for p in ps1abc:p.wait() opt = ["item_name semantic_audio"] for i_part in range(all_parts): semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) - with open(semantic_path, "r", encoding="utf8") as f: + with open(semantic_path, "r",encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(semantic_path) - with open(path_semantic, "w", encoding="utf8") as f: + with open(path_semantic, "w",encoding="utf8") as f: f.write("\n".join(opt) + "\n") - yield "进度:all-done", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } + yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} ps1abc = [] - yield "一键三连进程结束", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} except: traceback.print_exc() close1abc() - yield "一键三连中途报错", {"__type__": "update", "visible": True}, { - "__type__": "update", - "visible": False, - } + yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} else: - yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, { - "__type__": "update", - "visible": True, - } - + yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} def close1abc(): global ps1abc - if ps1abc != []: + if (ps1abc != []): for p1abc in ps1abc: try: kill_process(p1abc.pid) except: traceback.print_exc() - ps1abc = [] - return ( - "已终止所有一键三连进程", - {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False}, - ) - + ps1abc=[] + return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( - value="本软件以MIT协议开源, 
作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." + value= + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." ) with gr.Tabs(): - with gr.TabItem("0-前置数据集获取工具"): # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 + with gr.TabItem("0-前置数据集获取工具"):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 gr.Markdown(value="0a-UVR5人声伴奏分离&去混响去延迟工具") with gr.Row(): - if_uvr5 = gr.Checkbox(label="是否开启UVR5-WebUI", show_label=True) + if_uvr5 = gr.Checkbox(label="是否开启UVR5-WebUI",show_label=True) uvr5_info = gr.Textbox(label="UVR5进程输出信息") gr.Markdown(value="0b-语音切分工具") with gr.Row(): with gr.Row(): - slice_inp_path = gr.Textbox(label="音频自动切分输入路径,可文件可文件夹", value="") - slice_opt_root = gr.Textbox( - label="切分后的子音频的输出根目录", value="output/slicer_opt" - ) - threshold = gr.Textbox( - label="threshold:音量小于这个值视作静音的备选切割点", value="-34" - ) - min_length = gr.Textbox( - label="min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值", value="4000" - ) - min_interval = gr.Textbox(label="min_interval:最短切割间隔", value="300") - hop_size = gr.Textbox( - label="hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)", value="10" - ) - max_sil_kept = gr.Textbox( - label="max_sil_kept:切完后静音最多留多长", value="500" - ) + slice_inp_path=gr.Textbox(label="音频自动切分输入路径,可文件可文件夹",value="") + slice_opt_root=gr.Textbox(label="切分后的子音频的输出根目录",value="output/slicer_opt") + threshold=gr.Textbox(label="threshold:音量小于这个值视作静音的备选切割点",value="-34") + min_length=gr.Textbox(label="min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值",value="4000") + min_interval=gr.Textbox(label="min_interval:最短切割间隔",value="300") + hop_size=gr.Textbox(label="hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",value="10") + max_sil_kept=gr.Textbox(label="max_sil_kept:切完后静音最多留多长",value="500") with gr.Row(): - open_slicer_button = gr.Button( - "开启语音切割", variant="primary", visible=True - ) - close_slicer_button = gr.Button( - "终止语音切割", variant="primary", visible=False - ) - _max = gr.Slider( - minimum=0, - maximum=1, - step=0.05, - label="max:归一化后最大值多少", - value=0.9, - interactive=True, - ) - alpha = gr.Slider( - minimum=0, - maximum=1, - step=0.05, - label="alpha_mix:混多少比例归一化后音频进来", - 
value=0.25, - interactive=True, - ) - n_process = gr.Slider( - minimum=1, - maximum=n_cpu, - step=1, - label="切割使用的进程数", - value=4, - interactive=True, - ) + open_slicer_button=gr.Button("开启语音切割", variant="primary",visible=True) + close_slicer_button=gr.Button("终止语音切割", variant="primary",visible=False) + _max=gr.Slider(minimum=0,maximum=1,step=0.05,label="max:归一化后最大值多少",value=0.9,interactive=True) + alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label="alpha_mix:混多少比例归一化后音频进来",value=0.25,interactive=True) + n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label="切割使用的进程数",value=4,interactive=True) slicer_info = gr.Textbox(label="语音切割进程输出信息") gr.Markdown(value="0c-中文批量离线ASR工具") with gr.Row(): - open_asr_button = gr.Button( - "开启离线批量ASR", variant="primary", visible=True - ) - close_asr_button = gr.Button( - "终止ASR进程", variant="primary", visible=False - ) + open_asr_button = gr.Button("开启离线批量ASR", variant="primary",visible=True) + close_asr_button = gr.Button("终止ASR进程", variant="primary",visible=False) asr_inp_dir = gr.Textbox( label="批量ASR(中文only)输入文件夹路径", value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx", @@ -970,365 +611,115 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: asr_info = gr.Textbox(label="ASR进程输出信息") gr.Markdown(value="0d-语音文本校对标注工具") with gr.Row(): - if_label = gr.Checkbox(label="是否开启打标WebUI", show_label=True) + if_label = gr.Checkbox(label="是否开启打标WebUI",show_label=True) path_list = gr.Textbox( label="打标数据标注文件路径", value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", interactive=True, ) label_info = gr.Textbox(label="打标工具进程输出信息") - if_label.change(change_label, [if_label, path_list], [label_info]) + if_label.change(change_label, [if_label,path_list], [label_info]) if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info]) - open_asr_button.click( - open_asr, [asr_inp_dir], [asr_info, open_asr_button, close_asr_button] - ) - close_asr_button.click( - close_asr, [], [asr_info, open_asr_button, close_asr_button] - ) - open_slicer_button.click( - open_slice, - [ - 
slice_inp_path, - slice_opt_root, - threshold, - min_length, - min_interval, - hop_size, - max_sil_kept, - _max, - alpha, - n_process, - ], - [slicer_info, open_slicer_button, close_slicer_button], - ) - close_slicer_button.click( - close_slice, [], [slicer_info, open_slicer_button, close_slicer_button] - ) + open_asr_button.click(open_asr, [asr_inp_dir], [asr_info,open_asr_button,close_asr_button]) + close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button]) + open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button]) + close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button]) with gr.TabItem("1-GPT-SoVITS-TTS"): with gr.Row(): exp_name = gr.Textbox(label="*实验/模型名", value="xxx", interactive=True) - gpu_info = gr.Textbox( - label="显卡信息", value=gpu_info, visible=True, interactive=False - ) - pretrained_s2G = gr.Textbox( - label="预训练的SoVITS-G模型路径", - value="GPT_SoVITS/pretrained_models/s2G488k.pth", - interactive=True, - ) - pretrained_s2D = gr.Textbox( - label="预训练的SoVITS-D模型路径", - value="GPT_SoVITS/pretrained_models/s2D488k.pth", - interactive=True, - ) - pretrained_s1 = gr.Textbox( - label="预训练的GPT模型路径", - value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", - interactive=True, - ) + gpu_info = gr.Textbox(label="显卡信息", value=gpu_info, visible=True, interactive=False) + pretrained_s2G = gr.Textbox(label="预训练的SoVITS-G模型路径", value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True) + pretrained_s2D = gr.Textbox(label="预训练的SoVITS-D模型路径", value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True) + pretrained_s1 = gr.Textbox(label="预训练的GPT模型路径", value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True) with gr.TabItem("1A-训练集格式化工具"): 
gr.Markdown(value="输出logs/实验名目录下应有23456开头的文件和文件夹") with gr.Row(): - inp_text = gr.Textbox( - label="*文本标注文件", - value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list", - interactive=True, - ) - inp_wav_dir = gr.Textbox( - label="*训练集音频文件目录", - value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", - interactive=True, - ) + inp_text = gr.Textbox(label="*文本标注文件",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True) + inp_wav_dir = gr.Textbox(label="*训练集音频文件目录",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",interactive=True) gr.Markdown(value="1Aa-文本内容") with gr.Row(): - gpu_numbers1a = gr.Textbox( - label="GPU卡号以-分割,每个卡号一个进程", - value="%s-%s" % (gpus, gpus), - interactive=True, - ) - bert_pretrained_dir = gr.Textbox( - label="预训练的中文BERT模型路径", - value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - interactive=False, - ) - button1a_open = gr.Button("开启文本获取", variant="primary", visible=True) - button1a_close = gr.Button( - "终止文本获取进程", variant="primary", visible=False - ) - info1a = gr.Textbox(label="文本进程输出信息") + gpu_numbers1a = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) + bert_pretrained_dir = gr.Textbox(label="预训练的中文BERT模型路径",value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False) + button1a_open = gr.Button("开启文本获取", variant="primary",visible=True) + button1a_close = gr.Button("终止文本获取进程", variant="primary",visible=False) + info1a=gr.Textbox(label="文本进程输出信息") gr.Markdown(value="1Ab-SSL自监督特征提取") with gr.Row(): - gpu_numbers1Ba = gr.Textbox( - label="GPU卡号以-分割,每个卡号一个进程", - value="%s-%s" % (gpus, gpus), - interactive=True, - ) - cnhubert_base_dir = gr.Textbox( - label="预训练的SSL模型路径", - value="GPT_SoVITS/pretrained_models/chinese-hubert-base", - interactive=False, - ) - button1b_open = gr.Button( - "开启SSL提取", variant="primary", visible=True - ) - button1b_close = gr.Button( - "终止SSL提取进程", variant="primary", visible=False - ) - info1b = gr.Textbox(label="SSL进程输出信息") + gpu_numbers1Ba = 
gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) + cnhubert_base_dir = gr.Textbox(label="预训练的SSL模型路径",value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False) + button1b_open = gr.Button("开启SSL提取", variant="primary",visible=True) + button1b_close = gr.Button("终止SSL提取进程", variant="primary",visible=False) + info1b=gr.Textbox(label="SSL进程输出信息") gr.Markdown(value="1Ac-语义token提取") with gr.Row(): - gpu_numbers1c = gr.Textbox( - label="GPU卡号以-分割,每个卡号一个进程", - value="%s-%s" % (gpus, gpus), - interactive=True, - ) - button1c_open = gr.Button( - "开启语义token提取", variant="primary", visible=True - ) - button1c_close = gr.Button( - "终止语义token提取进程", variant="primary", visible=False - ) - info1c = gr.Textbox(label="语义token提取进程输出信息") + gpu_numbers1c = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) + button1c_open = gr.Button("开启语义token提取", variant="primary",visible=True) + button1c_close = gr.Button("终止语义token提取进程", variant="primary",visible=False) + info1c=gr.Textbox(label="语义token提取进程输出信息") gr.Markdown(value="1Aabc-训练集格式化一键三连") with gr.Row(): - button1abc_open = gr.Button( - "开启一键三连", variant="primary", visible=True - ) - button1abc_close = gr.Button( - "终止一键三连", variant="primary", visible=False - ) - info1abc = gr.Textbox(label="一键三连进程输出信息") - button1a_open.click( - open1a, - [inp_text, inp_wav_dir, exp_name, gpu_numbers1a, bert_pretrained_dir], - [info1a, button1a_open, button1a_close], - ) - button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close]) - button1b_open.click( - open1b, - [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], - [info1b, button1b_open, button1b_close], - ) - button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close]) - button1c_open.click( - open1c, - [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], - [info1c, button1c_open, button1c_close], - ) - button1c_close.click(close1c, [], [info1c, button1c_open, 
button1c_close]) - button1abc_open.click( - open1abc, - [ - inp_text, - inp_wav_dir, - exp_name, - gpu_numbers1a, - gpu_numbers1Ba, - gpu_numbers1c, - bert_pretrained_dir, - cnhubert_base_dir, - pretrained_s2G, - ], - [info1abc, button1abc_open, button1abc_close], - ) - button1abc_close.click( - close1abc, [], [info1abc, button1abc_open, button1abc_close] - ) + button1abc_open = gr.Button("开启一键三连", variant="primary",visible=True) + button1abc_close = gr.Button("终止一键三连", variant="primary",visible=False) + info1abc=gr.Textbox(label="一键三连进程输出信息") + button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close]) + button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close]) + button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close]) + button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close]) + button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close]) + button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close]) + button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close]) + button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close]) with gr.TabItem("1B-微调训练"): gr.Markdown(value="1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。") with gr.Row(): - batch_size = gr.Slider( - minimum=1, - maximum=40, - step=1, - label=i18n("每张显卡的batch_size"), - value=default_batch_size, - interactive=True, - ) - total_epoch = gr.Slider( - minimum=1, - maximum=20, - step=1, - label=i18n("总训练轮数total_epoch,不建议太高"), - value=8, - interactive=True, - ) - text_low_lr_rate = gr.Slider( - minimum=0.2, - maximum=0.6, - step=0.05, - label="文本模块学习率权重", - value=0.4, - interactive=True, - ) - 
save_every_epoch = gr.Slider( - minimum=1, - maximum=50, - step=1, - label=i18n("保存频率save_every_epoch"), - value=4, - interactive=True, - ) - if_save_latest = gr.Checkbox( - label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), - value=True, - interactive=True, - show_label=True, - ) - if_save_every_weights = gr.Checkbox( - label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), - value=True, - interactive=True, - show_label=True, - ) - gpu_numbers1Ba = gr.Textbox( - label="GPU卡号以-分割,每个卡号一个进程", - value="%s" % (gpus), - interactive=True, - ) + batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) + total_epoch = gr.Slider(minimum=1,maximum=20,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True) + text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label="文本模块学习率权重",value=0.4,interactive=True) + save_every_epoch = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True) + if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True) + if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) + gpu_numbers1Ba = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程", value="%s" % (gpus), interactive=True) with gr.Row(): - button1Ba_open = gr.Button( - "开启SoVITS训练", variant="primary", visible=True - ) - button1Ba_close = gr.Button( - "终止SoVITS训练", variant="primary", visible=False - ) - info1Ba = gr.Textbox(label="SoVITS训练进程输出信息") + button1Ba_open = gr.Button("开启SoVITS训练", variant="primary",visible=True) + button1Ba_close = gr.Button("终止SoVITS训练", variant="primary",visible=False) + info1Ba=gr.Textbox(label="SoVITS训练进程输出信息") gr.Markdown(value="1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。") with gr.Row(): - batch_size1Bb = gr.Slider( - minimum=1, - maximum=40, - step=1, - label=i18n("每张显卡的batch_size"), - value=default_batch_size, - interactive=True, - ) - total_epoch1Bb 
= gr.Slider( - minimum=2, - maximum=100, - step=1, - label=i18n("总训练轮数total_epoch"), - value=15, - interactive=True, - ) - if_save_latest1Bb = gr.Checkbox( - label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), - value=True, - interactive=True, - show_label=True, - ) - if_save_every_weights1Bb = gr.Checkbox( - label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), - value=True, - interactive=True, - show_label=True, - ) - save_every_epoch1Bb = gr.Slider( - minimum=1, - maximum=50, - step=1, - label=i18n("保存频率save_every_epoch"), - value=5, - interactive=True, - ) - gpu_numbers1Bb = gr.Textbox( - label="GPU卡号以-分割,每个卡号一个进程", - value="%s" % (gpus), - interactive=True, - ) + batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) + total_epoch1Bb = gr.Slider(minimum=2,maximum=100,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True) + if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True) + if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) + save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True) + gpu_numbers1Bb = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程", value="%s" % (gpus), interactive=True) with gr.Row(): - button1Bb_open = gr.Button( - "开启GPT训练", variant="primary", visible=True - ) - button1Bb_close = gr.Button( - "终止GPT训练", variant="primary", visible=False - ) - info1Bb = gr.Textbox(label="GPT训练进程输出信息") - button1Ba_open.click( - open1Ba, - [ - batch_size, - total_epoch, - exp_name, - text_low_lr_rate, - if_save_latest, - if_save_every_weights, - save_every_epoch, - gpu_numbers1Ba, - pretrained_s2G, - pretrained_s2D, - ], - [info1Ba, button1Ba_open, button1Ba_close], - ) - button1Ba_close.click( - close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close] - ) - button1Bb_open.click( - open1Bb, - [ - 
batch_size1Bb, - total_epoch1Bb, - exp_name, - if_save_latest1Bb, - if_save_every_weights1Bb, - save_every_epoch1Bb, - gpu_numbers1Bb, - pretrained_s1, - ], - [info1Bb, button1Bb_open, button1Bb_close], - ) - button1Bb_close.click( - close1Bb, [], [info1Bb, button1Bb_open, button1Bb_close] - ) + button1Bb_open = gr.Button("开启GPT训练", variant="primary",visible=True) + button1Bb_close = gr.Button("终止GPT训练", variant="primary",visible=False) + info1Bb=gr.Textbox(label="GPT训练进程输出信息") + button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close]) + button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close]) + button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close]) + button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close]) with gr.TabItem("1C-推理"): - gr.Markdown( - value="选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。" - ) + gr.Markdown(value="选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。") with gr.Row(): - GPT_dropdown = gr.Dropdown( - label="*GPT模型列表", - choices=sorted(GPT_names), - value=pretrained_gpt_name, - ) - SoVITS_dropdown = gr.Dropdown( - label="*SoVITS模型列表", - choices=sorted(SoVITS_names), - value=pretrained_sovits_name, - ) - gpu_number_1C = gr.Textbox( - label="GPU卡号,只能填1个整数", value=gpus, interactive=True - ) + GPT_dropdown = gr.Dropdown(label="*GPT模型列表", choices=sorted(GPT_names),value=pretrained_gpt_name) + SoVITS_dropdown = gr.Dropdown(label="*SoVITS模型列表", choices=sorted(SoVITS_names),value=pretrained_sovits_name) + gpu_number_1C=gr.Textbox(label="GPU卡号,只能填1个整数", value=gpus, interactive=True) refresh_button = gr.Button("刷新模型路径", variant="primary") - refresh_button.click( 
- fn=change_choices, - inputs=[], - outputs=[SoVITS_dropdown, GPT_dropdown], - ) + refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown]) with gr.Row(): if_tts = gr.Checkbox(label="是否开启TTS推理WebUI", show_label=True) tts_info = gr.Textbox(label="TTS推理WebUI进程输出信息") - if_tts.change( - change_tts_inference, - [ - if_tts, - bert_pretrained_dir, - cnhubert_base_dir, - gpu_number_1C, - GPT_dropdown, - SoVITS_dropdown, - ], - [tts_info], - ) - with gr.TabItem("2-GPT-SoVITS-变声"): - gr.Markdown(value="施工中,请静候佳音") + if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info]) + with gr.TabItem("2-GPT-SoVITS-变声"):gr.Markdown(value="施工中,请静候佳音") - """ + ''' os.environ["gpt_path"]=gpt_path os.environ["sovits_path"]=sovits_path#bert_pretrained_dir os.environ["cnhubert_base_path"]=cnhubert_base_path#cnhubert_base_dir os.environ["bert_path"]=bert_path os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number - """ + ''' app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", From d2d43437a8e95343055088b0a32ad04b23119d18 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 00:31:02 +0800 Subject: [PATCH 36/58] Add files via upload --- .../prepare_datasets/2-get-hubert-wav32k.py | 110 ++++++++---------- 1 file changed, 48 insertions(+), 62 deletions(-) diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 25cb4a8..1a5de8c 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -1,23 +1,20 @@ # -*- coding: utf-8 -*- -import sys, os - -inp_text = os.environ.get("inp_text") -inp_wav_dir = os.environ.get("inp_wav_dir") -exp_name = os.environ.get("exp_name") -i_part = os.environ.get("i_part") -all_parts = os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"] = 
os.environ.get("_CUDA_VISIBLE_DEVICES") +import sys,os +inp_text= os.environ.get("inp_text") +inp_wav_dir= os.environ.get("inp_wav_dir") +exp_name= os.environ.get("exp_name") +i_part= os.environ.get("i_part") +all_parts= os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") from feature_extractor import cnhubert +opt_dir= os.environ.get("opt_dir") +cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") +is_half=eval(os.environ.get("is_half","True")) -opt_dir = os.environ.get("opt_dir") -cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir") -is_half = eval(os.environ.get("is_half", "True")) - -import pdb, traceback, numpy as np, logging +import pdb,traceback,numpy as np,logging from scipy.io import wavfile -import librosa, torch - +import librosa,torch now_dir = os.getcwd() sys.path.append(now_dir) from my_utils import load_audio @@ -35,75 +32,64 @@ from my_utils import load_audio from time import time as ttime import shutil +def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path + dir=os.path.dirname(path) + name=os.path.basename(path) + tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) + torch.save(fea,tmp_path) + shutil.move(tmp_path,"%s/%s"%(dir,name)) +hubert_dir="%s/4-cnhubert"%(opt_dir) +wav32dir="%s/5-wav32k"%(opt_dir) +os.makedirs(opt_dir,exist_ok=True) +os.makedirs(hubert_dir,exist_ok=True) +os.makedirs(wav32dir,exist_ok=True) -def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path - dir = os.path.dirname(path) - name = os.path.basename(path) - tmp_path = "%s/%s%s.pth" % (dir, ttime(), i_part) - torch.save(fea, tmp_path) - shutil.move(tmp_path, "%s/%s" % (dir, name)) - - -hubert_dir = "%s/4-cnhubert" % (opt_dir) -wav32dir = "%s/5-wav32k" % (opt_dir) -os.makedirs(opt_dir, exist_ok=True) -os.makedirs(hubert_dir, exist_ok=True) -os.makedirs(wav32dir, exist_ok=True) - -maxx = 0.95 -alpha = 0.5 -device = "cuda:0" -model = cnhubert.get_model() -if 
is_half == True: - model = model.half().to(device) +maxx=0.95 +alpha=0.5 +device="cuda:0" +model=cnhubert.get_model() +if(is_half==True): + model=model.half().to(device) else: model = model.to(device) - - def name2go(wav_name): - hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) - if os.path.exists(hubert_path): - return - wav_path = "%s/%s" % (inp_wav_dir, wav_name) + hubert_path="%s/%s.pt"%(hubert_dir,wav_name) + if(os.path.exists(hubert_path)):return + if(inp_wav_dir!=""): + wav_path="%s/%s"%(inp_wav_dir,wav_name) tmp_audio = load_audio(wav_path, 32000) tmp_max = np.abs(tmp_audio).max() if tmp_max > 2.2: print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) return - tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ( - (1 - alpha) * 32768 - ) * tmp_audio - tmp_audio = librosa.resample(tmp_audio32, orig_sr=32000, target_sr=16000) + tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio + tmp_audio = librosa.resample( + tmp_audio32, orig_sr=32000, target_sr=16000 + ) tensor_wav16 = torch.from_numpy(tmp_audio) - if is_half == True: - tensor_wav16 = tensor_wav16.half().to(device) + if (is_half == True): + tensor_wav16=tensor_wav16.half().to(device) else: tensor_wav16 = tensor_wav16.to(device) - ssl = ( - model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"] - .transpose(1, 2) - .cpu() - ) # torch.Size([1, 768, 215]) - if np.isnan(ssl.detach().numpy()).sum() != 0: - return + ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) + if np.isnan(ssl.detach().numpy()).sum()!= 0:return wavfile.write( - "%s/%s" % (wav32dir, wav_name), + "%s/%s"%(wav32dir,wav_name), 32000, tmp_audio32.astype("int16"), ) # torch.save(ssl,hubert_path ) - my_save(ssl, hubert_path) + my_save(ssl,hubert_path ) +with open(inp_text,"r",encoding="utf8")as f: + lines=f.read().strip("\n").split("\n") -with open(inp_text, "r", encoding="utf8") as f: - lines = f.read().strip("\n").split("\n") - 
-for line in lines[int(i_part) :: int(all_parts)]: +for line in lines[int(i_part)::int(all_parts)]: try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name = os.path.basename(wav_name) + wav_name=os.path.basename(wav_name) name2go(wav_name) except: - print(line, traceback.format_exc()) + print(line,traceback.format_exc()) From 9619223bc38f31933eaa9eb50d6ca055a6317ee3 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 00:58:08 +0800 Subject: [PATCH 37/58] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 821dda5..2adecf0 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. - [ ] **High Priority:** - [ ] Localization in Japanese and English. - [ ] User guide. + - [ ] Japanese and English dataset fine tune training. - [ ] **Features:** - [ ] Zero-shot voice conversion (5s) / few-shot voice conversion (1min). @@ -131,7 +132,9 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. - [ ] Develop tiny and larger-sized TTS models. - [ ] Colab scripts. - [ ] Expand training dataset (2k -> 10k). - + - [ ] better sovits base model (enhanced audio quality) + - [ ] model mix + ## Credits Special thanks to the following projects and contributors: From ee1d99ab3caeda97ba2a3a2d4d913fcd2d3c0ecc Mon Sep 17 00:00:00 2001 From: Ilaria <108286953+TheStingerX@users.noreply.github.com> Date: Thu, 18 Jan 2024 01:36:11 +0100 Subject: [PATCH 38/58] Fixed i18n error Line 27 of webui.py contains an import from i18n.i18n but there was no module in the requirements. I added the module i18n. 
Line for context: "from i18n.i18n import I18nAuto" --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7c613b3..d4b6303 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ sentencepiece transformers chardet PyYAML +i18n From 0d9a04cf60f55c2d0b79d43e7e9916ce54a343ee Mon Sep 17 00:00:00 2001 From: Yuan-Man <68322456+Yuan-ManX@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:51:54 +0800 Subject: [PATCH 39/58] Change i18n folder --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index dbccba7..3dd1bfd 100644 --- a/webui.py +++ b/webui.py @@ -24,7 +24,7 @@ import gradio as gr from subprocess import Popen import signal from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix -from i18n.i18n import I18nAuto +from tools.i18n.i18n import I18nAuto i18n = I18nAuto() from scipy.io import wavfile from tools.my_utils import load_audio From 678616c0edfde97b0c249bb75c4c4b1ef84de206 Mon Sep 17 00:00:00 2001 From: Erythrocyte3803 <2544390577@qq.com> Date: Thu, 18 Jan 2024 14:04:43 +0900 Subject: [PATCH 40/58] fixed i18n.i18n not found error --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index dbccba7..3dd1bfd 100644 --- a/webui.py +++ b/webui.py @@ -24,7 +24,7 @@ import gradio as gr from subprocess import Popen import signal from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix -from i18n.i18n import I18nAuto +from tools.i18n.i18n import I18nAuto i18n = I18nAuto() from scipy.io import wavfile from tools.my_utils import load_audio From 3a167888e2f6199f8089208e60d2bb80f797a355 Mon Sep 17 00:00:00 2001 From: Ke Date: Thu, 18 Jan 2024 14:55:38 +0800 Subject: [PATCH 41/58] Kill process in Linux platform Add a function to kill process and its children 
recusively in Linux platform. --- webui.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/webui.py b/webui.py index dbccba7..838e98b 100644 --- a/webui.py +++ b/webui.py @@ -1,5 +1,8 @@ import json,yaml,warnings,torch import platform +import psutil +import os +import signal warnings.filterwarnings("ignore") torch.manual_seed(233333) @@ -30,7 +33,7 @@ from scipy.io import wavfile from tools.my_utils import load_audio from multiprocessing import cpu_count n_cpu=cpu_count() - + # 判断是否有能用来训练和加速推理的N卡 ngpu = torch.cuda.device_count() gpu_infos = [] @@ -78,15 +81,33 @@ p_uvr5=None p_asr=None p_tts_inference=None +def kill_proc_tree(pid, including_parent=True): + try: + parent = psutil.Process(pid) + except psutil.NoSuchProcess: + # Process already terminated + return + + children = parent.children(recursive=True) + for child in children: + try: + os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL + except OSError: + pass + if including_parent: + try: + os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL + except OSError: + pass + system=platform.system() def kill_process(pid): if(system=="Windows"): cmd = "taskkill /t /f /pid %s" % pid + os.system(cmd) else: - cmd = "kill -9 %s"%pid - print(cmd) - os.system(cmd)###linux上杀了webui,可能还会没杀干净。。。 - # os.kill(p_label.pid,19)#主进程#控制台进程#python子进程###不好使,连主进程的webui一起关了,辣鸡 + kill_proc_tree(pid) + def change_label(if_label,path_list): global p_label From 47c7c45e78a1ba140e09e9e500811bc0f706dbea Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 15:02:24 +0800 Subject: [PATCH 42/58] Update requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4b6303..7c613b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,3 @@ sentencepiece transformers chardet PyYAML -i18n From dbbf616762350ed043ca1e699fd8b9abee22e709 Mon Sep 17 00:00:00 2001 
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:58:10 +0800 Subject: [PATCH 43/58] Update webui.py --- webui.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/webui.py b/webui.py index 94a83cd..d799dcf 100644 --- a/webui.py +++ b/webui.py @@ -12,13 +12,19 @@ tmp = os.path.join(now_dir, "TEMP") os.makedirs(tmp, exist_ok=True) os.environ["TEMP"] = tmp import site -site_packages_root="%s/runtime/Lib/site-packages"%now_dir +site_packages_roots = [] for path in site.getsitepackages(): - if("site-packages"in path):site_packages_root=path -os.environ["OPENBLAS_NUM_THREADS"] = "4" + if "packages" in path: + site_packages_roots.append(path) +if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] +#os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" -with open("%s/users.pth"%(site_packages_root),"w")as f: - f.write("%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"%(now_dir,now_dir,now_dir,now_dir,now_dir)) +for site_packages_root in site_packages_roots: + with open("%s/users.pth" % (site_packages_root), "w") as f: + f.write( + "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5" + % (now_dir, now_dir, now_dir, now_dir, now_dir) + ) import traceback sys.path.append(now_dir) import shutil From e2ddf97c313f672f14a83562e8fed8f7934bff4b Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 18:46:11 +0800 Subject: [PATCH 44/58] Add files via upload --- webui.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/webui.py b/webui.py index d799dcf..8f23854 100644 --- a/webui.py +++ b/webui.py @@ -317,11 +317,12 @@ ps1a=[] def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): global ps1a if (ps1a == []): + opt_dir="%s/%s"%(exp_root,exp_name) config={ "inp_text":inp_text, 
"inp_wav_dir":inp_wav_dir, "exp_name":exp_name, - "opt_dir":"%s/%s"%(exp_root,exp_name), + "opt_dir":opt_dir, "bert_pretrained_dir":bert_pretrained_dir, } gpu_names=gpu_numbers.split("-") @@ -335,7 +336,7 @@ def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): "is_half": str(is_half) } ) - os.environ.update(config) + os.environ.update(config)# cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec print(cmd) p = Popen(cmd, shell=True) @@ -343,6 +344,15 @@ def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} for p in ps1a: p.wait() + opt = [] + for i_part in range(all_parts): + txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) + with open(txt_path, "r", encoding="utf8") as f: + opt += f.read().strip("\n").split("\n") + os.remove(txt_path) + path_text = "%s/2-name2text.txt" % opt_dir + with open(path_text, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") ps1a=[] yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: @@ -426,10 +436,11 @@ ps1c=[] def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): global ps1c if (ps1c == []): + opt_dir="%s/%s"%(exp_root,exp_name) config={ "inp_text":inp_text, "exp_name":exp_name, - "opt_dir":"%s/%s"%(exp_root,exp_name), + "opt_dir":opt_dir, "pretrained_s2G":pretrained_s2G_path, "s2config_path":"GPT_SoVITS/configs/s2.json", "is_half": str(is_half) @@ -452,6 +463,15 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} for p in ps1c: p.wait() + opt = ["item_name semantic_audio"] + path_semantic = "%s/6-name2semantic.tsv" % opt_dir + for i_part in range(all_parts): + semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) + with open(semantic_path, "r", encoding="utf8") as f: + opt += 
f.read().strip("\n").split("\n") + os.remove(semantic_path) + with open(path_semantic, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") ps1c=[] yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} else: @@ -476,7 +496,7 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb try: #############################1a path_text="%s/2-name2text.txt" % opt_dir - if(os.path.exists(path_text)==False): + if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and os.path.getsize(path_text)<10)): config={ "inp_text":inp_text, "inp_wav_dir":inp_wav_dir, @@ -543,7 +563,7 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb ps1abc=[] #############################1c path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if(os.path.exists(path_semantic)==False): + if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<28)): config={ "inp_text":inp_text, "exp_name":exp_name, From 54bd2b796163732bc2b31fa51ee98dc4e75025c8 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 20:03:17 +0800 Subject: [PATCH 45/58] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7c613b3..2e64033 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ sentencepiece transformers chardet PyYAML +psutil From 230bf5bae6f85ec1b235b48fefe6ea6dba87324b Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 20:22:42 +0800 Subject: [PATCH 46/58] Add files via upload --- webui.py | 51 ++++++++------------------------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/webui.py b/webui.py index 8f23854..cdc87b7 100644 --- a/webui.py +++ b/webui.py @@ -179,10 +179,6 @@ def close_asr(): 
p_asr=None return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False} -''' - button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Bb,button1Ba_open,button1Ba_close]) - button1Ba_close.click(close1Ba, [], [info1Bb,button1Ba_open,button1Ba_close]) -''' p_train_SoVITS=None def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D): global p_train_SoVITS @@ -303,16 +299,6 @@ def close_slice(): ps_slice=[] return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} -''' -inp_text= os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir= os.environ.get("opt_dir")#"/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name -bert_pretrained_dir= os.environ.get("bert_pretrained_dir")#"/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" -''' ps1a=[] def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): global ps1a @@ -368,16 +354,7 @@ def close1a(): traceback.print_exc() ps1a=[] return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} -''' -inp_text= os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir= os.environ.get("opt_dir") -cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") -''' + ps1b=[] def 
open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): global ps1b @@ -423,15 +400,7 @@ def close1b(): traceback.print_exc() ps1b=[] return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} -''' -inp_text= os.environ.get("inp_text") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir= os.environ.get("opt_dir") -pretrained_s2G= os.environ.get("pretrained_s2G") -''' + ps1c=[] def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): global ps1c @@ -682,7 +651,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown(value="输出logs/实验名目录下应有23456开头的文件和文件夹") with gr.Row(): inp_text = gr.Textbox(label="*文本标注文件",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True) - inp_wav_dir = gr.Textbox(label="*训练集音频文件目录",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",interactive=True) + inp_wav_dir = gr.Textbox( + label="*训练集音频文件目录", + # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", + interactive=True, + placeholder="训练集音频文件目录拼list文件的目录。如果list文件已经是绝对路径,这里应该为空。" + ) gr.Markdown(value="1Aa-文本内容") with gr.Row(): gpu_numbers1a = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True) @@ -759,15 +733,6 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: tts_info = gr.Textbox(label="TTS推理WebUI进程输出信息") if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info]) with gr.TabItem("2-GPT-SoVITS-变声"):gr.Markdown(value="施工中,请静候佳音") - - ''' - os.environ["gpt_path"]=gpt_path - os.environ["sovits_path"]=sovits_path#bert_pretrained_dir - os.environ["cnhubert_base_path"]=cnhubert_base_path#cnhubert_base_dir - os.environ["bert_path"]=bert_path - os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number - ''' - app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", 
inbrowser=True, From 48509304992f541fda734e3671942366a87f9f8f Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 20:23:04 +0800 Subject: [PATCH 47/58] Update 2-get-hubert-wav32k.py --- GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 1a5de8c..a5075ff 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -56,8 +56,7 @@ else: def name2go(wav_name): hubert_path="%s/%s.pt"%(hubert_dir,wav_name) if(os.path.exists(hubert_path)):return - if(inp_wav_dir!=""): - wav_path="%s/%s"%(inp_wav_dir,wav_name) + wav_path="%s/%s"%(inp_wav_dir,wav_name) tmp_audio = load_audio(wav_path, 32000) tmp_max = np.abs(tmp_audio).max() if tmp_max > 2.2: From cc33a767ebc7f96c91f8222037b662b3b138fe2a Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 20:24:19 +0800 Subject: [PATCH 48/58] Update webui.py --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index cdc87b7..58821ce 100644 --- a/webui.py +++ b/webui.py @@ -655,7 +655,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: label="*训练集音频文件目录", # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", interactive=True, - placeholder="训练集音频文件目录拼list文件的目录。如果list文件已经是绝对路径,这里应该为空。" + placeholder="训练集音频文件目录 拼接 list文件里波形对应的文件名。" ) gr.Markdown(value="1Aa-文本内容") with gr.Row(): From 1afdb42295fc40de1328564f49755ceb8692f65b Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 18 Jan 2024 22:12:16 +0800 Subject: [PATCH 49/58] Update webui.py --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index 58821ce..f958a07 100644 --- a/webui.py +++ b/webui.py @@ -532,7 +532,7 @@ 
def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb ps1abc=[] #############################1c path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<28)): + if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)): config={ "inp_text":inp_text, "exp_name":exp_name, From 6dcaf262addba7c2309192152e9794574b59bef3 Mon Sep 17 00:00:00 2001 From: DW <147780325+D3lik@users.noreply.github.com> Date: Fri, 19 Jan 2024 07:34:10 +1100 Subject: [PATCH 50/58] Update webui.py --- webui.py | 1 + 1 file changed, 1 insertion(+) diff --git a/webui.py b/webui.py index f958a07..6f3391c 100644 --- a/webui.py +++ b/webui.py @@ -736,6 +736,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, + share=True, server_port=webui_port_main, quiet=True, ) From 79708faed48458c7d045a3682f3d3c92ff6a2bfe Mon Sep 17 00:00:00 2001 From: Ke Date: Fri, 19 Jan 2024 10:13:17 +0800 Subject: [PATCH 51/58] Disable debug level logging When using `inference_webui.py`, it produces debug level info for http requests, for example: ``` DEBUG:httpcore.http11:response_closed.started ``` Here I changed it to warning level. 
--- GPT_SoVITS/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py index 0ce03b3..e1a66ea 100644 --- a/GPT_SoVITS/utils.py +++ b/GPT_SoVITS/utils.py @@ -18,7 +18,7 @@ logging.getLogger("matplotlib").setLevel(logging.ERROR) MATPLOTLIB_FLAG = False -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logging.basicConfig(stream=sys.stdout, level=logging.WARNING) logger = logging @@ -310,13 +310,13 @@ def check_git_hash(model_dir): def get_logger(model_dir, filename="train.log"): global logger logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.WARNING) formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") if not os.path.exists(model_dir): os.makedirs(model_dir) h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) + h.setLevel(logging.WARNING) h.setFormatter(formatter) logger.addHandler(h) return logger From 76164a07749538bb251b3279992b77b4c1ae4fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AC=AC=E7=B4=97=E7=89=B9?= <66856838+Miuzarte@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:08:31 +0800 Subject: [PATCH 52/58] Add api.py --- api.py | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 api.py diff --git a/api.py b/api.py new file mode 100644 index 0000000..41ddbf6 --- /dev/null +++ b/api.py @@ -0,0 +1,324 @@ +import argparse +import os +import signal +import sys +from time import time as ttime +import torch +import librosa +import soundfile as sf +from fastapi import FastAPI, Request, HTTPException +from fastapi.responses import StreamingResponse +import uvicorn +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np +from feature_extractor import cnhubert +from io import BytesIO +from module.models import SynthesizerTrn +from AR.models.t2s_lightning_module import 
Text2SemanticLightningModule +from text import cleaned_text_to_sequence +from text.cleaner import clean_text +from module.mel_processing import spectrogram_torch +from my_utils import load_audio + +DEFAULT_PORT = 9880 +DEFAULT_CNHUBERT = "GPT_SoVITS/pretrained_models/chinese-hubert-base" +DEFAULT_BERT = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" +DEFAULT_HALF = True + +DEFAULT_GPT = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +DEFAULT_SOVITS = "GPT_SoVITS/pretrained_models/s2G488k.pth" + +AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu" + +parser = argparse.ArgumentParser(description="GPT-SoVITS api") + +parser.add_argument("-g", "--gpt_path", type=str, default="", help="GPT模型路径") +parser.add_argument("-s", "--sovits_path", type=str, default="", help="SoVITS模型路径") + +parser.add_argument("-dr", "--default_refer_path", type=str, default="", + help="默认参考音频路径, 请求缺少参考音频时调用") +parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本") +parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种") + +parser.add_argument("-d", "--device", type=str, default=AVAILABLE_COMPUTE, help="cuda / cpu") +parser.add_argument("-p", "--port", type=int, default=DEFAULT_PORT, help="default: 9880") +parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") +parser.add_argument("-hp", "--half_precision", action='store_true', default=False) + +parser.add_argument("-hb", "--hubert_path", type=str, default=DEFAULT_CNHUBERT) +parser.add_argument("-b", "--bert_path", type=str, default=DEFAULT_BERT) + +args = parser.parse_args() + +gpt_path = args.gpt_path +sovits_path = args.sovits_path + +default_refer_path = args.default_refer_path +default_refer_text = args.default_refer_text +default_refer_language = args.default_refer_language +has_preset = False + +device = args.device +port = args.port +host = args.bind_addr 
+is_half = args.half_precision + +cnhubert_base_path = args.hubert_path +bert_path = args.bert_path + +if gpt_path == "": + gpt_path = DEFAULT_GPT + print("[WARN] 未指定GPT模型路径") +if sovits_path == "": + sovits_path = DEFAULT_SOVITS + print("[WARN] 未指定SoVITS模型路径") + +if default_refer_path == "" or default_refer_text == "" or default_refer_language == "": + default_refer_path, default_refer_text, default_refer_language = "", "", "" + print("[INFO] 未指定默认参考音频") + has_preset = False +else: + print(f"[INFO] 默认参考音频路径: {default_refer_path}") + print(f"[INFO] 默认参考音频文本: {default_refer_text}") + print(f"[INFO] 默认参考音频语种: {default_refer_language}") + has_preset = True + +cnhubert.cnhubert_base_path = cnhubert_base_path +tokenizer = AutoTokenizer.from_pretrained(bert_path) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +# bert_model = AutoModelForSequenceClassification.from_pretrained(bert_path, config=bert_path+"/config.json") +if (is_half == True): + bert_model = bert_model.half().to(device) +else: + bert_model = bert_model.to(device) + + +# bert_model=bert_model.to(device) +def get_bert_feature(text, word2ph): + with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) #####输入是long不用管精度问题,精度随bert_model + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + phone_level_feature = torch.cat(phone_level_feature, dim=0) + # if(is_half==True):phone_level_feature=phone_level_feature.half() + return phone_level_feature.T + + +n_semantic = 1024 +dict_s2 = torch.load(sovits_path, map_location="cpu") +hps = dict_s2["config"] + + +class DictToAttrRecursive: + def __init__(self, input_dict): + for key, value in input_dict.items(): + if isinstance(value, dict): 
+ # 如果值是字典,递归调用构造函数 + setattr(self, key, DictToAttrRecursive(value)) + else: + setattr(self, key, value) + + +hps = DictToAttrRecursive(hps) +hps.model.semantic_frame_rate = "25hz" +dict_s1 = torch.load(gpt_path, map_location="cpu") +config = dict_s1["config"] +ssl_model = cnhubert.get_model() +if is_half: + ssl_model = ssl_model.half().to(device) +else: + ssl_model = ssl_model.to(device) + +vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model) +if is_half: + vq_model = vq_model.half().to(device) +else: + vq_model = vq_model.to(device) +vq_model.eval() +print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) +hz = 50 +max_sec = config['data']['max_sec'] +t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False) +t2s_model.load_state_dict(dict_s1["weight"]) +if is_half: + t2s_model = t2s_model.half() +t2s_model = t2s_model.to(device) +t2s_model.eval() +total = sum([param.nelement() for param in t2s_model.parameters()]) +print("Number of parameter: %.2fM" % (total / 1e6)) + + +def get_spepc(hps, filename): + audio = load_audio(filename, int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length, + hps.data.win_length, center=False) + return spec + + +dict_language = { + "中文": "zh", + "英文": "en", + "日文": "ja", + "ZH": "zh", + "EN": "en", + "JA": "ja", + "zh": "zh", + "en": "en", + "ja": "ja" +} + + +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language): + t0 = ttime() + prompt_text = prompt_text.strip("\n") + prompt_language, text = prompt_language, text.strip("\n") + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙 + wav16k = torch.from_numpy(wav16k) + if (is_half == True): + wav16k = wav16k.half().to(device) + 
else: + wav16k = wav16k.to(device) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() + codes = vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + t1 = ttime() + prompt_language = dict_language[prompt_language] + text_language = dict_language[text_language] + phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language) + phones1 = cleaned_text_to_sequence(phones1) + texts = text.split("\n") + audio_opt = [] + zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32) + for text in texts: + phones2, word2ph2, norm_text2 = clean_text(text, text_language) + phones2 = cleaned_text_to_sequence(phones2) + if (prompt_language == "zh"): + bert1 = get_bert_feature(norm_text1, word2ph1).to(device) + else: + bert1 = torch.zeros((1024, len(phones1)), dtype=torch.float16 if is_half == True else torch.float32).to( + device) + if (text_language == "zh"): + bert2 = get_bert_feature(norm_text2, word2ph2).to(device) + else: + bert2 = torch.zeros((1024, len(phones2))).to(bert1) + bert = torch.cat([bert1, bert2], 1) + + all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) + bert = bert.to(device).unsqueeze(0) + all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) + prompt = prompt_semantic.unsqueeze(0).to(device) + t2 = ttime() + with torch.no_grad(): + # pred_semantic = t2s_model.model.infer( + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=config['inference']['top_k'], + early_stop_num=hz * max_sec) + t3 = ttime() + # print(pred_semantic.shape,idx) + pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次 + refer = get_spepc(hps, ref_wav_path) # .to(device) + if (is_half == True): + refer = refer.half().to(device) + else: + refer = refer.to(device) + # audio = 
vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] + audio = \ + vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), + refer).detach().cpu().numpy()[ + 0, 0] ###试试重建不带上prompt部分 + audio_opt.append(audio) + audio_opt.append(zero_wav) + t4 = ttime() + print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16) + + +def restart(): + python = sys.executable + os.execl(python, python, *sys.argv) + + +def handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language): + if command == "/restart": + restart() + elif command == "/exit": + os.kill(os.getpid(), signal.SIGTERM) + exit(0) + + if ( + refer_wav_path == "" or refer_wav_path is None + or prompt_text == "" or prompt_text is None + or prompt_language == "" or prompt_language is None + ): + refer_wav_path, prompt_text, prompt_language = ( + default_refer_path, + default_refer_text, + default_refer_language, + ) + if not has_preset: + raise HTTPException(status_code=400, detail="未指定参考音频且接口无预设") + + with torch.no_grad(): + gen = get_tts_wav( + refer_wav_path, prompt_text, prompt_language, text, text_language + ) + sampling_rate, audio_data = next(gen) + + wav = BytesIO() + sf.write(wav, audio_data, sampling_rate, format="wav") + wav.seek(0) + + torch.cuda.empty_cache() + return StreamingResponse(wav, media_type="audio/wav") + + +app = FastAPI() + + +@app.post("/") +async def tts_endpoint(request: Request): + json_post_raw = await request.json() + return handle( + json_post_raw.get("command"), + json_post_raw.get("refer_wav_path"), + json_post_raw.get("prompt_text"), + json_post_raw.get("prompt_language"), + json_post_raw.get("text"), + json_post_raw.get("text_language"), + ) + + +@app.get("/") +async def tts_endpoint( + command: str = None, + refer_wav_path: str = None, + prompt_text: str = None, + prompt_language: str = None, + text: str = 
None, + text_language: str = None, +): + return handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language) + + +if __name__ == "__main__": + uvicorn.run(app, host=host, port=port, workers=1) From 192668435b36502bef3bce259e5543266bf5f45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AC=AC=E7=B4=97=E7=89=B9?= <66856838+Miuzarte@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:15:35 +0800 Subject: [PATCH 53/58] Match config.py --- api.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/api.py b/api.py index 41ddbf6..8a476cd 100644 --- a/api.py +++ b/api.py @@ -19,16 +19,17 @@ from text import cleaned_text_to_sequence from text.cleaner import clean_text from module.mel_processing import spectrogram_torch from my_utils import load_audio +from config import python_exec, infer_device, is_half, api_port -DEFAULT_PORT = 9880 +DEFAULT_PORT = api_port DEFAULT_CNHUBERT = "GPT_SoVITS/pretrained_models/chinese-hubert-base" DEFAULT_BERT = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" -DEFAULT_HALF = True +DEFAULT_HALF = is_half DEFAULT_GPT = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" DEFAULT_SOVITS = "GPT_SoVITS/pretrained_models/s2G488k.pth" -AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu" +# AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu" parser = argparse.ArgumentParser(description="GPT-SoVITS api") @@ -40,7 +41,7 @@ parser.add_argument("-dr", "--default_refer_path", type=str, default="", parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本") parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种") -parser.add_argument("-d", "--device", type=str, default=AVAILABLE_COMPUTE, help="cuda / cpu") +parser.add_argument("-d", "--device", type=str, default=infer_device, help="cuda / cpu") parser.add_argument("-p", "--port", type=int, default=DEFAULT_PORT, 
help="default: 9880") parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") parser.add_argument("-hp", "--half_precision", action='store_true', default=False) @@ -253,14 +254,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16) -def restart(): - python = sys.executable - os.execl(python, python, *sys.argv) - - def handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language): if command == "/restart": - restart() + os.execl(python_exec, python_exec, *sys.argv) elif command == "/exit": os.kill(os.getpid(), signal.SIGTERM) exit(0) From 18c390768298d5950c6f1679e391a32f372b0d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AC=AC=E7=B4=97=E7=89=B9?= <66856838+Miuzarte@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:25:15 +0800 Subject: [PATCH 54/58] Update config.py --- config.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/config.py b/config.py index 8e9721a..aeeffe5 100644 --- a/config.py +++ b/config.py @@ -1,10 +1,13 @@ import sys -is_half=True -exp_root="logs" -python_exec=sys.executable or "python" -infer_device="cuda" -webui_port_main=9874 -webui_port_uvr5=9873 -webui_port_infer_tts=9872 -webui_port_subfix=9871 +is_half = True +exp_root = "logs" +python_exec = sys.executable or "python" +infer_device = "cuda" + +webui_port_main = 9874 +webui_port_uvr5 = 9873 +webui_port_infer_tts = 9872 +webui_port_subfix = 9871 + +api_port = 9880 From e0590b9c2659b6aa1ffd468babad52b7b6fdaba2 Mon Sep 17 00:00:00 2001 From: Yongzheng Lai Date: Fri, 19 Jan 2024 07:15:11 +0000 Subject: [PATCH 55/58] fix: users.pth path check --- webui.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/webui.py b/webui.py index 6f3391c..3b93e15 100644 --- a/webui.py +++ b/webui.py @@ -20,11 +20,12 @@ 
if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" #os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" for site_packages_root in site_packages_roots: - with open("%s/users.pth" % (site_packages_root), "w") as f: - f.write( - "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5" - % (now_dir, now_dir, now_dir, now_dir, now_dir) - ) + if os.path.exists("%s/users.pth" % (site_packages_root)): + with open("%s/users.pth" % (site_packages_root), "w") as f: + f.write( + "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5" + % (now_dir, now_dir, now_dir, now_dir, now_dir) + ) import traceback sys.path.append(now_dir) import shutil From d2c2d4eb34a6dcbd8f0127b212ad4cedd434a2a0 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:23:14 +0800 Subject: [PATCH 56/58] Update webui.py --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index 3b93e15..02ba03d 100644 --- a/webui.py +++ b/webui.py @@ -20,7 +20,7 @@ if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" #os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" for site_packages_root in site_packages_roots: - if os.path.exists("%s/users.pth" % (site_packages_root)): + if os.path.exists(site_packages_root): with open("%s/users.pth" % (site_packages_root), "w") as f: f.write( "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5" From 95bb2c921e857971d72b0118fd4f65691fd594a6 Mon Sep 17 00:00:00 2001 From: c4fun Date: Fri, 19 Jan 2024 20:49:03 +0800 Subject: [PATCH 57/58] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E7=9B=AE=E6=A0=87=E6=96=87=E6=9C=AC=E7=9A=84=E7=A9=BA=E8=A1=8C?= =?UTF-8?q?=E5=AF=BC=E8=87=B4=E6=8A=A5=E9=94=99=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit --- GPT_SoVITS/inference_webui.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 7920d60..e5e604f 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -175,6 +175,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) dtype=np.float16 if is_half == True else np.float32, ) for text in texts: + # 解决输入目标文本的空行导致报错的问题 + if (len(text.strip()) == 0): + continue phones2, word2ph2, norm_text2 = clean_text(text, text_language) phones2 = cleaned_text_to_sequence(phones2) if prompt_language == "zh": From 426cc32258fb094e50097c3833c2b1da316e0a8e Mon Sep 17 00:00:00 2001 From: http-404-usernotfound <107795857+http-404-usernotfound@users.noreply.github.com> Date: Fri, 19 Jan 2024 21:47:42 +0530 Subject: [PATCH 58/58] Update README.md Some users were facing problems while installing the PIP packages because distutils.cmd module was missing in their Python environment. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 2adecf0..2c88c3b 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,12 @@ conda activate GPTSoVits bash install.sh ``` ### Install Manually +#### Make sure you have the distutils for python3.9 installed + +```bash +sudo apt-get install python3.9-distutils +``` + #### Pip Packages ```bash