Merge 60414d25a39f3786a392523297734c144d1c59a9 into ea2d2a81667239d37615697e8f0056e35bab2db6

This commit is contained in:
__kaning123__ 2026-04-19 14:16:06 +01:00 committed by GitHub
commit a94fd5a14a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 1459 additions and 35 deletions

View File

@ -0,0 +1,178 @@
import zipfile
from . import file_lib as fl
from . import time_lib as tl
from . import info_lib as il
import os
from typing import Union
import numpy as np
import torch
# Names currently reserved by live archives (shared by the whole process).
POOL:set = set()


def get_unique_name(name,MySet:set=set()):
    """Reserve and return a name that is in neither ``POOL`` nor ``MySet``.

    If ``name`` is free it is returned unchanged; otherwise ``name_2``,
    ``name_3``, ... are tried in turn. The chosen name is added to ``POOL``.

    Args:
        name: Preferred name.
        MySet: Extra names to treat as taken. Only read, never modified, so
            the shared default set is harmless here.

    Returns:
        The reserved name.
    """
    candidate = name
    suffix = 1
    while candidate in POOL or candidate in MySet:
        suffix += 1
        # Always build from the original base so repeated collisions give
        # name_2, name_3, ... rather than name_2_3, name_2_3_4, ...
        candidate = f'{name}_{suffix}'
    POOL.add(candidate)
    return candidate
# Staging root: a "Temp" folder under the directory reported by
# fl.get_my_dir() (presumably the package directory — confirm in file_lib).
TEMP_DIR = fl.merge_dir_txt2(fl.get_my_dir(), "Temp")
# Every ZIP_File stages its contents in its own subfolder of this directory.
TEMP_ZIP_DIR = fl.merge_dir_txt2(TEMP_DIR, "ZipTemp")
def _tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray:
    """Return the data of ``tensor`` as a NumPy array on the CPU.

    The tensor is cloned and detached first, so the array does not share
    storage with the caller's tensor.
    """
    return tensor.clone().detach().cpu().numpy()
def save_np(path: str, np_array: np.ndarray) -> None:
    """Write ``np_array`` to ``path`` using ``numpy.save``."""
    np.save(file=path, arr=np_array)
class ZIP_File:
    """Edit a zip archive through a private staging directory.

    Files are created and modified under ``self.temp_write`` (a subfolder of
    ``TEMP_ZIP_DIR`` named after a unique name from ``get_unique_name``).
    ``release()`` unpacks the archive into that folder, ``save_zip()`` packs
    the folder back into the archive, and ``close()`` saves, removes the
    folder and frees the reserved name.
    """

    def __init__(self, path: str,name:str,MySet:set=set()):
        """Open (creating an empty archive if needed) the zip at ``path``.

        Args:
            path: Location of the zip archive.
            name: Preferred staging-folder name; made unique if already taken.
            MySet: Extra names to treat as taken, so staging folders cannot
                be confused with each other. Only read, never modified.
        """
        self.path = path
        if not os.path.exists(self.path):
            with zipfile.ZipFile(self.path, 'w'):
                pass
        self.name = get_unique_name(name,MySet=MySet)
        self.temp_write = fl.merge_dir_txt2(TEMP_ZIP_DIR, self.name)
        if not os.path.exists(self.temp_write):
            os.makedirs(self.temp_write)
        # Set by close(); a second close() must not touch the archive again.
        self._closed = False

    def _resolve(self, file_name:str,location:str=''):
        """Return the staging path for ``file_name`` under optional ``location``."""
        if location == '':
            return fl.merge_dir_txt2(self.temp_write,file_name)
        return fl.merge_dir_txt2(self.temp_write, location, file_name)

    def release(self):
        '''Release the zip file: extract it into a fresh staging directory.'''
        if os.path.exists(self.temp_write):
            fl.delete_dir(self.temp_write)
        fl.create_dir(self.temp_write)
        with zipfile.ZipFile(self.path, 'r') as zipf:
            zipf.extractall(self.temp_write)

    def create_dir(self, dir_:str):
        """Create directory ``dir_`` inside the staging folder if it is missing."""
        dir_path = fl.merge_dir_txt2(self.temp_write, dir_)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path,exist_ok=True)

    def create_file(self, file_name:str,location:str=''):
        """Create an empty staged file; an existing file is left untouched."""
        file_path = self._resolve(file_name, location)
        if not os.path.exists(file_path):
            os.makedirs(os.path.dirname(file_path),exist_ok=True)
            with open(file_path, 'w'):
                pass

    def get_file_path(self, file_name:str,location:str=''):
        """Return the staging path of an existing file.

        Raises:
            FileNotFoundError: If the file is not present in the staging folder.
        """
        file_path = self._resolve(file_name, location)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File {file_path} does not exist.")
        return file_path

    def get_file_obj(self, file_name:str,location:str='',mode:str='r'):
        """Open an existing staged file with ``mode`` and return the file object.

        The caller owns the returned object and must close it (for example
        with ``save_file``).

        Raises:
            FileNotFoundError: If the file is not present in the staging folder.
        """
        return open(self.get_file_path(file_name, location), mode)

    def save_file(self, obj):
        """Close a file object previously returned by ``get_file_obj``."""
        obj.close()

    def save_zip(self):
        """Rewrite the archive from the current contents of the staging folder.

        Only files are stored; empty directories are not written.
        """
        with zipfile.ZipFile(self.path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(self.temp_write):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, self.temp_write)
                    zipf.write(file_path, arcname)

    def close(self):
        """Save the archive, delete the staging folder and free the name.

        Calling ``close()`` again is a no-op. Without this guard a second
        call would repack the already-deleted staging folder, overwriting
        the archive with an empty zip, and then fail on the name removal.
        """
        if getattr(self, '_closed', False):
            return
        self.save_zip()
        fl.delete_dir(self.temp_write)
        POOL.discard(self.name)
        self._closed = True
def save_tensor(path: str,
                tensors: Union[torch.Tensor, list],
                name:str,
                MySet:set=set(),
                file_names:Union[str,list,None]=None,
                **info_save,) -> None:
    """Store tensors as ``.npy`` members of the zip archive at ``path``.

    A ``voice.json`` member holding ``{'name': name, **info_save}`` is
    written next to the arrays.

    Args:
        path: Archive location.
        tensors: One tensor or a list of tensors.
        name: Staging name for the archive; also stored in ``voice.json``.
        MySet: Extra reserved names forwarded to ``ZIP_File``.
        file_names: One member name or a list, matched to ``tensors`` by
            position. When empty or ``None`` nothing is written.
        **info_save: Extra key/value pairs stored in ``voice.json``.

    Raises:
        ValueError: If the numbers of tensors and file names differ.
    """
    tensor_list = [tensors] if isinstance(tensors, torch.Tensor) else tensors
    if not file_names:
        return
    files = [file_names] if isinstance(file_names, str) else file_names
    print(f"length of tensors: {len(tensor_list)}, length of files: {len(files)}")
    if len(tensor_list) != len(files):
        raise ValueError("The number of tensors and files must be the same.")
    arrays = [_tensor_to_numpy(item) for item in tensor_list]

    archive = ZIP_File(path, name, MySet=MySet)
    archive.create_file("voice.json")
    info = {'name': name, **info_save}
    il.save_info(info, str(archive.get_file_path("voice.json")))
    for member, array in zip(files, arrays):
        archive.create_file(member)
        save_np(str(archive.get_file_path(member)), array)
    archive.close()
def load_tensor(path: str,
                name:str,
                find_func,
                MySet:set=set(),) -> list[torch.Tensor]:
    """Load the tensors stored in the zip archive at ``path``.

    Args:
        path: Archive location.
        name: Staging name for the archive.
        find_func: Callable ``find_func(zip_file, info_lib)`` returning the
            staged paths of the ``.npy`` members to load, or ``None``.
        MySet: Extra reserved names forwarded to ``ZIP_File``.

    Returns:
        One tensor per path returned by ``find_func``, in the same order;
        an empty list when ``find_func`` returns ``None`` or nothing.
    """
    zf = ZIP_File(path, name, MySet=MySet)
    zf.release()
    # find_func may return None (e.g. an empty manifest); treat that as
    # "nothing to load" instead of failing and leaving the archive unclosed.
    voice_paths = find_func(zf,il) or []
    # SECURITY NOTE(review): allow_pickle=True lets a crafted .npy member run
    # arbitrary code while loading. Arrays written by save_np from numeric
    # tensors should not need it — consider allow_pickle=False for archives
    # that come from untrusted sources.
    tensors = [
        torch.from_numpy(np.load(member, allow_pickle=True))
        for member in voice_paths
    ]
    zf.close()
    return tensors
def add_tensor(add:list[torch.Tensor],
               path: str,
               name:str,
               find_func,
               MySet:set=set(),
               file_names:Union[str,list,None]=None,
               **info_save,):
    """Append ``add`` to the tensors already stored at ``path`` and re-save.

    The existing tensors are read with ``load_tensor``; the combined list is
    then written with ``save_tensor``, so ``file_names`` must name every
    tensor (old and new), not only the added ones.
    """
    existing = load_tensor(path,name,find_func,MySet=MySet)
    combined = [*existing, *add]
    save_tensor(path,combined,name,MySet=MySet,file_names=file_names,**info_save)
def __find_func__(zf,il):
    """Default locator used with ``load_tensor``.

    Reads ``voice.json`` from the staged archive and returns the staging
    paths of the members named in its ``"access_list"``. Names that are
    missing from the archive are skipped. Returns ``None`` when the manifest
    loads as ``None``.
    """
    manifest = il.load_info(zf.get_file_path("voice.json"))
    if manifest is None:
        return None
    found = []
    for member in manifest["access_list"]:
        try:
            found.append(zf.get_file_path(member))
        except FileNotFoundError:
            # Listed in the manifest but absent from the archive.
            pass
    return found

View File

@ -0,0 +1,35 @@
import os
import shutil
from pathlib import Path
def get_my_dir():
    """Return the absolute path of the directory that contains this module."""
    directory, _filename = os.path.split(os.path.abspath(__file__))
    return directory
def get_parent_dir(dir_path,depth=1):
    """Return the ``Path`` found ``depth`` levels above ``dir_path``.

    ``depth=0`` returns ``dir_path`` itself as a ``Path``.
    """
    current = Path(dir_path)
    remaining = depth
    while remaining > 0:
        current = current.parent
        remaining -= 1
    return current
def merge_dir_txt(a,b):
    """Join two path segments with ``os.path.join`` and return the result."""
    return os.path.join(a,b)
def merge_dir_txt2(*TXT):
    """Join any number of path segments and return them as a ``Path``."""
    joined = os.path.join(*TXT)
    return Path(joined)
def create_dir(path: Path, overwrite=False) -> bool:
    """Create directory ``path`` (with parents) and report whether it exists.

    Args:
        path: Directory to create; a ``str`` is accepted as well as a ``Path``.
        overwrite: When true, an existing directory tree at ``path`` is
            deleted first so the result is empty.

    Returns:
        ``True`` if the directory exists afterwards.
    """
    # Coerce before the first use so a plain string works with overwrite=True.
    path = Path(path)
    if overwrite and path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)
    return path.exists()
def get_dir_children_dirs(path: Path):
    """Return the immediate subdirectories of ``path`` as ``Path`` objects.

    ``path`` may be a ``str`` or a ``Path``, as in ``create_dir`` and
    ``file_exists``.
    """
    return [item for item in Path(path).iterdir() if item.is_dir()]
def get_dir_children_files(path: Path):
    """Return the regular files directly inside ``path`` as ``Path`` objects.

    ``path`` may be a ``str`` or a ``Path``, as in ``create_dir`` and
    ``file_exists``.
    """
    return [item for item in Path(path).iterdir() if item.is_file()]
def delete_dir(path: Path):
return shutil.rmtree(path)
def delete_file(path: Path):
return os.remove(path)
def file_exists(path: Path):
    """Return whether ``path`` (a ``str`` or ``Path``) exists on disk."""
    return Path(path).exists()

View File

@ -0,0 +1,10 @@
import json
def load_info(info_path):
    """Read the UTF-8 JSON file at ``info_path`` and return the decoded value."""
    with open(info_path, 'r', encoding='utf-8') as handle:
        return json.loads(handle.read())
def save_info(info, info_path):
    """Write ``info`` to ``info_path`` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(info_path, 'w', encoding='utf-8') as handle:
        json.dump(info, handle, indent=4, ensure_ascii=False)

View File

@ -0,0 +1,38 @@
import time
# strftime patterns: one per time component, plus a full timestamp
STYLE_Y = "%Y"
STYLE_M = "%m"
STYLE_D = "%d"
STYLE_H = "%H"
STYLE_MIN = "%M"
STYLE_S = "%S"
STYLE_FULL = "%Y-%m-%d_%H.%M.%S"


def _format_now(style):
    """Format the current local time with the strftime pattern ``style``."""
    return time.strftime(style, time.localtime())


# quick calls
def get_time_y(STYLE = STYLE_Y):
    """Return the current local time formatted with ``STYLE`` (default: year)."""
    return _format_now(STYLE)


def get_time_m(STYLE = STYLE_M):
    """Return the current local time formatted with ``STYLE`` (default: month)."""
    return _format_now(STYLE)


def get_time_d(STYLE = STYLE_D):
    """Return the current local time formatted with ``STYLE`` (default: day)."""
    return _format_now(STYLE)


def get_time_h(STYLE = STYLE_H):
    """Return the current local time formatted with ``STYLE`` (default: hour)."""
    return _format_now(STYLE)


def get_time_min(STYLE = STYLE_MIN):
    """Return the current local time formatted with ``STYLE`` (default: minute)."""
    return _format_now(STYLE)


def get_time_s(STYLE = STYLE_S):
    """Return the current local time formatted with ``STYLE`` (default: second)."""
    return _format_now(STYLE)


def get_time_full(STYLE = STYLE_FULL):
    """Return the current local time formatted with ``STYLE`` (default: full stamp)."""
    return _format_now(STYLE)


def s(t:float):
    """Sleep for ``t`` seconds."""
    time.sleep(t)
###
if __name__ == '__main__':
    # Manual smoke test: print each component once, then the full timestamp.
    for _getter in (get_time_y, get_time_m, get_time_d, get_time_h,
                    get_time_min, get_time_s, get_time_full):
        print(_getter())

7
GPT_SoVITS/config.json Normal file
View File

@ -0,0 +1,7 @@
{
"running_on" : "local",
"Default":{
"GPT_Path": "不训练直接推v3底模",
"SoVITS_Path": "不训练直接推v2ProPlus底模"
}
}

View File

@ -24,6 +24,7 @@ class CNHubert(nn.Module):
super().__init__() super().__init__()
if base_path is None: if base_path is None:
base_path = cnhubert_base_path base_path = cnhubert_base_path
print(f"Loading CN-Hubert from \"{base_path}\"")
if os.path.exists(base_path): if os.path.exists(base_path):
... ...
else: else:
@ -69,6 +70,7 @@ class CNHubert(nn.Module):
def get_model(): def get_model():
print("cnhubert_base_path:", cnhubert_base_path)
model = CNHubert() model = CNHubert()
model.eval() model.eval()
return model return model

View File

@ -8,6 +8,62 @@
""" """
import psutil import psutil
import os import os
import sys
import json
from pathlib import Path
import uuid
from scipy.io.wavfile import write
def get_my_dir():
    """Return the absolute path of the directory that contains this script."""
    script_path = os.path.abspath(__file__)
    return os.path.dirname(script_path)
def get_parent_dir(dir_path,depth=1):
    """Return the ``Path`` found ``depth`` levels above ``dir_path``.

    ``depth=0`` returns ``dir_path`` itself as a ``Path``.
    """
    current = Path(dir_path)
    remaining = depth
    while remaining > 0:
        current = current.parent
        remaining -= 1
    return current
def merge_dir_txt2(*TXT):
    """Join any number of path segments and return them as a ``Path``."""
    joined = os.path.join(*TXT)
    return Path(joined)
# Load the settings from the config.json that sits next to this script.
with open(merge_dir_txt2(get_my_dir(), "config.json"), "r", encoding="utf-8") as f:
    config_json = json.load(f)
running_on = config_json["running_on"]
Default = config_json["Default"]
# Parent of this script's directory, as a string.
ROOT_DIR = str(get_parent_dir(get_my_dir()))
# Make modules located beside this script importable by bare name.
sys.path.append(get_my_dir())
import VoiceSave
# Names already handed out in this process.
POOL:set = set()


def _get_unique_name(name,MySet:set=set()):
    """Reserve and return a name that is in neither ``POOL`` nor ``MySet``.

    If ``name`` is free it is returned unchanged; otherwise ``name_2``,
    ``name_3``, ... are tried in turn. The chosen name is added to ``POOL``.
    ``MySet`` is only read, never modified.
    """
    candidate = name
    suffix = 1
    while candidate in POOL or candidate in MySet:
        suffix += 1
        # Always build from the original base so repeated collisions give
        # name_2, name_3, ... rather than name_2_3, name_2_3_4, ...
        candidate = f'{name}_{suffix}'
    POOL.add(candidate)
    return candidate
def find_func(zf,il):
    """Locate the stored arrays listed in an archive's ``voice.json``.

    Returns the staging paths of the members named in ``"access_list"``,
    skipping names that are missing from the archive, or ``None`` when the
    manifest loads as ``None``.
    """
    manifest = il.load_info(zf.get_file_path("voice.json"))
    if manifest is None:
        return None
    found = []
    for member in manifest["access_list"]:
        try:
            found.append(zf.get_file_path(member))
        except FileNotFoundError:
            # Listed in the manifest but absent from the archive.
            pass
    return found
def set_high_priority(): def set_high_priority():
"""把当前 Python 进程设为 HIGH_PRIORITY_CLASS""" """把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
@ -70,6 +126,7 @@ with open("./weight.json", "r", encoding="utf-8") as file:
if isinstance(sovits_path, list): if isinstance(sovits_path, list):
sovits_path = sovits_path[0] sovits_path = sovits_path[0]
# print(2333333) # print(2333333)
# print(os.environ["gpt_path"]) # print(os.environ["gpt_path"])
# print(gpt_path) # print(gpt_path)
@ -96,7 +153,7 @@ import numpy as np
from feature_extractor import cnhubert from feature_extractor import cnhubert
from transformers import AutoModelForMaskedLM, AutoTokenizer from transformers import AutoModelForMaskedLM, AutoTokenizer
cnhubert.cnhubert_base_path = cnhubert_base_path cnhubert.cnhubert_base_path = merge_dir_txt2(ROOT_DIR, cnhubert_base_path)
import random import random
@ -130,6 +187,12 @@ language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
i18n = I18nAuto(language=language) i18n = I18nAuto(language=language)
if gpt_path in [None, "",]:
gpt_path = str(merge_dir_txt2(ROOT_DIR, name2gpt_path[i18n(Default["GPT_Path"])]))
if sovits_path in [None, "",]:
sovits_path = str(merge_dir_txt2(ROOT_DIR, name2sovits_path[i18n(Default["SoVITS_Path"])]))
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
if torch.cuda.is_available(): if torch.cuda.is_available():
@ -160,8 +223,8 @@ dict_language_v2 = {
} }
dict_language = dict_language_v1 if version == "v1" else dict_language_v2 dict_language = dict_language_v1 if version == "v1" else dict_language_v2
tokenizer = AutoTokenizer.from_pretrained(bert_path) tokenizer = AutoTokenizer.from_pretrained(str(merge_dir_txt2(ROOT_DIR,bert_path)))
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) bert_model = AutoModelForMaskedLM.from_pretrained(str(merge_dir_txt2(ROOT_DIR,bert_path)))
if is_half == True: if is_half == True:
bert_model = bert_model.half().to(device) bert_model = bert_model.half().to(device)
else: else:
@ -374,6 +437,7 @@ except:
def change_gpt_weights(gpt_path): def change_gpt_weights(gpt_path):
print("gpt_path:", gpt_path)
if "" in gpt_path or "!" in gpt_path: if "" in gpt_path or "!" in gpt_path:
gpt_path = name2gpt_path[gpt_path] gpt_path = name2gpt_path[gpt_path]
global hz, max_sec, t2s_model, config global hz, max_sec, t2s_model, config
@ -765,6 +829,27 @@ def get_tts_wav(
sample_steps=8, sample_steps=8,
if_sr=False, if_sr=False,
pause_second=0.3, pause_second=0.3,
SaveSvEmb=False,
SaveRefers=False,
SaveSvEmbName="sv_emb.voice",
SaveRefersName="refers.voice",
SaveGE=False,
SaveGEName="ge.voice",
InjectSvEmb=False,
InjectRefers=False,
InjectSvEmbName="sv_emb.voice",
InjectRefersName="refers.voice",
EnableAudioLoad=True,
SaveOutputAsUndecoded=False,
SaveOutputAsUndecodedName="output.voice",
AddRandomSaltToSaveOutputAsUndecodedName=False,
ReturnWay = "yield", # "yield" or "return"
): ):
global cache global cache
if ref_wav_path: if ref_wav_path:
@ -898,20 +983,146 @@ def get_tts_wav(
sv_emb = [] sv_emb = []
if sv_cn_model == None: if sv_cn_model == None:
init_sv_cn() init_sv_cn()
if inp_refs:
for path in inp_refs: try:
try: #####这里加上提取sv的逻辑要么一堆sv一堆refer要么单个sv单个refer if EnableAudioLoad:
refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro) if inp_refs:
refers.append(refer) for path in inp_refs:
try: #####这里加上提取sv的逻辑要么一堆sv一堆refer要么单个sv单个refer
refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
refers.append(refer)
if is_v2pro:
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
#print("refer:", refer.shape)
except:
traceback.print_exc()
if len(refers) == 0:
refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
refers = [refers]
if is_v2pro: if is_v2pro:
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor)) sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
except: else:
traceback.print_exc() refers = []
if len(refers) == 0: sv_emb = []
refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro) except:
refers = [refers] traceback.print_exc()
if is_v2pro:
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)] try:
if SaveSvEmb and is_v2pro:
names = []
for i in sv_emb:
names.append(_get_unique_name(str(i.shape))+".npy")
sv_path = merge_dir_txt2(ROOT_DIR,"output","sv_emb_opt")
if not os.path.exists(sv_path):
os.makedirs(sv_path,exist_ok=True)
if not os.path.exists(SaveSvEmbName):
_pth_ = str(merge_dir_txt2(ROOT_DIR,"output","sv_emb_opt",SaveSvEmbName))
else:
_pth_ = SaveSvEmbName
VoiceSave.save_tensor(_pth_,sv_emb,SaveSvEmbName,file_names=names,access_list=names)
except:
traceback.print_exc()
try:
if SaveRefers:
names = []
for i in refers:
names.append(_get_unique_name(str(i.shape))+".npy")
refers_path = merge_dir_txt2(ROOT_DIR,"output","refers_opt")
if not os.path.exists(refers_path):
os.makedirs(refers_path,exist_ok=True)
if not os.path.exists(SaveRefersName):
_pth_ = str(merge_dir_txt2(ROOT_DIR,"output","refers_opt",SaveRefersName))
else:
_pth_ = SaveRefersName
VoiceSave.save_tensor(_pth_,refers,SaveRefersName,file_names=names,access_list=names)
except:
traceback.print_exc()
#print("refers数量:", len(refers))
#print("sv_emb数量:", len(sv_emb) if is_v2pro else "无sv_emb")
try:
if InjectSvEmb and is_v2pro:
if not os.path.exists(InjectSvEmbName):
_pth_ = str(merge_dir_txt2(ROOT_DIR,"output","sv_emb_opt",InjectSvEmbName))
else:
_pth_ = InjectSvEmbName
_sv_emb = VoiceSave.load_tensor(_pth_,InjectSvEmbName,find_func)
for i in range(len(_sv_emb)):
sv_emb.append(_sv_emb[i].to(device))
except:
traceback.print_exc()
try:
if InjectRefers:
if not os.path.exists(InjectRefersName):
_pth_ = str(merge_dir_txt2(ROOT_DIR,"output","refers_opt",InjectRefersName))
else:
_pth_ = InjectRefersName
_refers = VoiceSave.load_tensor(_pth_,InjectRefersName,find_func)
for i in range(len(_refers)):
refers.append(_refers[i].to(device))
except:
traceback.print_exc()
#print("注入后refers数量:", len(refers))
#print("注入后sv_emb数量:", len(sv_emb) if is_v2pro else "无sv_emb")
try:
ges = []
for i in range(len(refers)):
if is_v2pro:
ge_ = vq_model.ge_(refers[i],sv_emb[i])
else:
ge_ = vq_model.ge_(refers[i])
ges.append(ge_)
if SaveGE:
names = []
for i in ges:
names.append(_get_unique_name(str(i.shape))+".npy")
ge_path = merge_dir_txt2(ROOT_DIR,"output","ge_opt")
if not os.path.exists(ge_path):
os.makedirs(ge_path,exist_ok=True)
if not os.path.exists(SaveGEName):
_pth_ = str(merge_dir_txt2(ROOT_DIR,"output","ge_opt",SaveGEName))
else:
_pth_ = SaveGEName
VoiceSave.save_tensor(_pth_,ges,SaveGEName,file_names=names,access_list=names)
except:
traceback.print_exc()
if AddRandomSaltToSaveOutputAsUndecodedName:
ranA = uuid.uuid4()
ranB = uuid.uuid4()
SaveOutputAsUndecodedName = f"{SaveOutputAsUndecodedName}_{ranA}_{ranB}.voice"
try:
if SaveOutputAsUndecoded:
if is_v2pro:
z_p,mask,ge = vq_model.decode2(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
refers, speed=speed, sv_emb=sv_emb)
else:
z_p,mask,ge = vq_model.decode2(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
refers, speed=speed)
ret = [z_p.cpu().detach(),
mask.cpu().detach(),
ge.cpu().detach()]
names = [f"z_p_{str(ret[0].shape)}",
f"mask_{str(ret[1].shape)}",
f"ge_{str(ret[2].shape)}"]
undecoded_path = merge_dir_txt2(ROOT_DIR,"output","undecoded_opt")
if not os.path.exists(undecoded_path):
os.makedirs(undecoded_path,exist_ok=True)
if not os.path.exists(SaveOutputAsUndecodedName):
_pth_ = str(merge_dir_txt2(ROOT_DIR,"output","undecoded_opt",SaveOutputAsUndecodedName))
else:
_pth_ = SaveOutputAsUndecodedName
VoiceSave.save_tensor(_pth_,ret,SaveOutputAsUndecodedName,file_names=names,access_list=names)
except:
traceback.print_exc()
if is_v2pro: if is_v2pro:
audio = vq_model.decode( audio = vq_model.decode(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb
@ -998,8 +1209,215 @@ def get_tts_wav(
audio_opt /= max_audio audio_opt /= max_audio
else: else:
audio_opt = audio_opt.cpu().detach().numpy() audio_opt = audio_opt.cpu().detach().numpy()
yield opt_sr, (audio_opt * 32767).astype(np.int16)
if ReturnWay == "yield":
yield opt_sr, (audio_opt * 32767).astype(np.int16)
else:
return opt_sr, (audio_opt * 32767).astype(np.int16)
def batched_tts_wav(
    ref_wav_path,
    prompt_text,
    prompt_language,
    texts,
    text_language,
    how_to_cut=i18n("不切"),
    top_k=20,
    top_p=0.6,
    temperature=0.6,
    ref_free=False,
    speed=1,
    if_freeze=False,
    inp_refs=None,
    sample_steps=8,
    if_sr=False,
    pause_second=0.3,

    SaveSvEmb=False,
    SaveRefers=False,
    SaveSvEmbName="sv_emb.voice",
    SaveRefersName="refers.voice",

    SaveGE=False,
    SaveGEName="ge.voice",

    InjectSvEmb=False,
    InjectRefers=False,
    InjectSvEmbName="sv_emb.voice",
    InjectRefersName="refers.voice",

    EnableAudioLoad=True,

    SaveOutputAsUndecoded=False,
    SaveOutputAsUndecodedName="output.voice",
    AddRandomSaltToSaveOutputAsUndecodedName=False,

    ReturnWay = "yield", # "yield" or "return"
):
    """Synthesize every entry of ``texts`` and write one WAV file per entry.

    Each non-blank text is passed to ``get_tts_wav`` with the given
    parameters; the first ``(sample_rate, audio)`` pair it produces is
    written into a fresh ``output/tts_output/batch_<uuid>`` folder under
    ``ROOT_DIR``. Blank entries are skipped with a UI warning.

    Args:
        texts: Iterable of texts to synthesize.
        ReturnWay: ``"yield"`` yields the output folder. NOTE: this function
            contains ``yield`` and is therefore always a generator; with any
            other value the folder is only carried as the generator's return
            value, so callers iterating it receive nothing.
        (all other arguments are forwarded unchanged to ``get_tts_wav``)

    Yields:
        The ``Path`` of the folder holding the generated WAV files.
    """
    import re  # local import: only needed to build safe file names

    SaveDir = merge_dir_txt2(ROOT_DIR,"output","tts_output",f"batch_{uuid.uuid4()}")
    if not os.path.exists(SaveDir):
        os.makedirs(SaveDir,exist_ok=True)
    # enumerate gives the 1-based position of the entry in ``texts``; the
    # previous counter did not advance on skipped entries, so the warning
    # reported the wrong position.
    for line_no, text in enumerate(texts, start=1):
        if text is None or not text.strip():
            gr.Warning(i18n(f"输入文本第{line_no}行中有空行,已跳过"))
            continue
        unparsed = get_tts_wav(
            ref_wav_path,
            prompt_text,
            prompt_language,
            text,
            text_language,
            how_to_cut,
            top_k,
            top_p,
            temperature,
            ref_free,
            speed,
            if_freeze,
            inp_refs,
            sample_steps,
            if_sr,
            pause_second,

            SaveSvEmb,
            SaveRefers,
            SaveSvEmbName,
            SaveRefersName,

            SaveGE,
            SaveGEName,

            InjectSvEmb,
            InjectRefers,
            InjectSvEmbName,
            InjectRefersName,

            EnableAudioLoad,

            SaveOutputAsUndecoded,
            SaveOutputAsUndecodedName,
            AddRandomSaltToSaveOutputAsUndecodedName,

            "yield",
        )
        unparsed = list(unparsed)
        print(unparsed)
        # Keep only word characters and hyphens, and cap the length: raw text
        # can contain path separators or other characters that are invalid in
        # file names, and long sentences exceed file-name length limits.
        a = re.sub(r'[^\w\-]+', '_', text.strip())[:50]
        wav_path = os.path.join(SaveDir,f"tts_output_{a}_{str(uuid.uuid4())}.wav")
        write(wav_path, unparsed[0][0], unparsed[0][1])

    if ReturnWay == "yield":
        yield SaveDir
    else:
        return SaveDir
def read_tts_batch_file(file_path):
    """Return the non-blank lines of the UTF-8 text file at ``file_path``.

    Lines are returned exactly as read (line endings included); lines that
    are empty after stripping whitespace are dropped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line for line in f.readlines() if line.strip() != ""]
def batch_tts(
    ref_wav_path,
    prompt_text,
    prompt_language,
    text_paths,
    text_language,
    how_to_cut=i18n("不切"),
    top_k=20,
    top_p=0.6,
    temperature=0.6,
    ref_free=False,
    speed=1,
    if_freeze=False,
    inp_refs=None,
    sample_steps=8,
    if_sr=False,
    pause_second=0.3,

    SaveSvEmb=False,
    SaveRefers=False,
    SaveSvEmbName="sv_emb.voice",
    SaveRefersName="refers.voice",

    SaveGE=False,
    SaveGEName="ge.voice",

    InjectSvEmb=False,
    InjectRefers=False,
    InjectSvEmbName="sv_emb.voice",
    InjectRefersName="refers.voice",

    EnableAudioLoad=True,

    SaveOutputAsUndecoded=False,
    SaveOutputAsUndecodedName="output.voice",
    AddRandomSaltToSaveOutputAsUndecodedName=False,

    ReturnWay = "yield", # "yield" or "return"
):
    """Synthesize every non-blank line of the given text files.

    The lines of all files are collected with ``read_tts_batch_file`` and
    passed to ``batched_tts_wav`` together with the remaining parameters.

    Args:
        text_paths: Uploaded text files: paths, or objects exposing the path
            as ``.name``. ``None`` (nothing uploaded) is treated as no files.
        ReturnWay: ``"yield"`` yields the result. NOTE: this function
            contains ``yield`` and is therefore always a generator; with any
            other value the result is only carried as the generator's return
            value.
        (all other arguments are forwarded unchanged to ``batched_tts_wav``)

    Yields:
        A list with everything ``batched_tts_wav`` yielded (the output folder).
    """
    print(text_paths)
    text_list = []
    for entry in text_paths or []:
        # Plain paths are used directly; upload wrappers carry their path in
        # ``.name`` (the same convention used for ``inp_refs`` elsewhere).
        if isinstance(entry, (str, os.PathLike)):
            source = entry
        else:
            source = getattr(entry, "name", entry)
        text_list.extend(read_tts_batch_file(source))
    out = batched_tts_wav(
        ref_wav_path,
        prompt_text,
        prompt_language,
        text_list,
        text_language,
        how_to_cut,
        top_k,
        top_p,
        temperature,
        ref_free,
        speed,
        if_freeze,
        inp_refs,
        sample_steps,
        if_sr,
        pause_second,

        SaveSvEmb,
        SaveRefers,
        SaveSvEmbName,
        SaveRefersName,

        SaveGE,
        SaveGEName,

        InjectSvEmb,
        InjectRefers,
        InjectSvEmbName,
        InjectRefersName,

        EnableAudioLoad,

        SaveOutputAsUndecoded,
        SaveOutputAsUndecodedName,
        AddRandomSaltToSaveOutputAsUndecodedName,
        "yield"
    )
    out = list(out)
    if ReturnWay == "yield":
        yield out
    else:
        return out
def close_serv():
    """Exit the process when running locally; otherwise show a UI warning.

    Uses the module-level ``running_on`` value read from ``config.json``.
    """
    if running_on != "local":
        gr.Warning(i18n("服务器环境下该功能不可用"))
        return
    sys.exit(0)
def split(todo_text): def split(todo_text):
todo_text = todo_text.replace("……", "").replace("——", "") todo_text = todo_text.replace("……", "").replace("——", "")
@ -1178,6 +1596,112 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
) )
) )
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1) prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1)
SaveSvEmb = gr.Checkbox(
label=i18n("保存参考音频的语义向量"),
interactive=True,
show_label=True,
value = False,
visible=False if model_version not in {"v2Pro","v2ProPlus"} else True
)
SaveRefers = gr.Checkbox(
label=i18n("保存参考音频的声纹特征"),
interactive=True,
show_label=True,
value = False,
visible=True
)
SaveSvEmbName = gr.Textbox(
label=i18n("保存的语义向量文件名默认保存在output/sv_emb_opt目录下"),
value="sv_emb.voice",
interactive=True,
visible=True,
)
SaveRefersName = gr.Textbox(
label=i18n("保存的声纹特征文件名默认保存在output/refers_opt目录下"),
value="refers.voice",
interactive=True,
visible=True,
)
InjectSvEmb = gr.Checkbox(
label=i18n("注入参考音频的语义向量"),
interactive=True,
show_label=True,
value = False,
visible=False if model_version not in {"v2Pro","v2ProPlus"} else True
)
InjectRefers = gr.Checkbox(
label=i18n("注入参考音频的声纹特征"),
interactive=True,
show_label=True,
value = False,
visible=True
)
InjectSvEmbName = gr.Textbox(
label=i18n("注入的语义向量文件名默认保存在output/sv_emb_opt目录下"),
value="sv_emb.voice",
interactive=True,
visible=True,
)
InjectRefersName = gr.Textbox(
label=i18n("注入的声纹特征文件名默认保存在output/refers_opt目录下"),
value="refers.voice",
interactive=True,
visible=True,
)
EnableAudioLoad = gr.Checkbox(
label=i18n("启用音频加载。开启后会加载参考音频"),
value=True,
interactive=True,
show_label=True,
visible=True,
)
SaveGE = gr.Checkbox(
label = i18n("保存GE"),
value = True,
interactive = True,
show_label = True,
visible = True,
)
SaveGEName = gr.Textbox(
label = i18n("保存的GE文件名默认保存在output/ge_opt目录下"),
value = "ge.voice",
interactive = True,
show_label = True,
visible = True,
)
SaveOutputAsUndecoded = gr.Checkbox(
label = i18n("保存未解码的输出"),
value = False,
interactive = True,
show_label = True,
visible = True,
)
SaveOutputAsUndecodedName = gr.Textbox(
label = i18n("保存的未解码输出文件名默认保存在output/undecoded_opt目录下"),
value = "output.voice",
interactive = True,
show_label = True,
visible = True,
)
AddRandomSaltToSaveOutputAsUndecodedName = gr.Checkbox(
label = i18n("给未解码输出文件名添加随机盐,防止覆盖"),
value = False,
interactive = True,
show_label = True,
visible = True,
)
with gr.Column(scale=14): with gr.Column(scale=14):
prompt_language = gr.Dropdown( prompt_language = gr.Dropdown(
label=i18n("参考音频的语种"), label=i18n("参考音频的语种"),
@ -1200,6 +1724,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
visible=False, visible=False,
) )
) )
sample_steps = ( sample_steps = (
gr.Radio( gr.Radio(
label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
@ -1222,6 +1747,25 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
show_label=True, show_label=True,
visible=False if model_version != "v3" else True, visible=False if model_version != "v3" else True,
) )
with gr.Row():
gr.Markdown(html_center(i18n("批量语音合成参数"), "h3"))
with gr.Column(scale=13):
txt_paths = gr.File(label=i18n("批量语音合成文本文件,每行一个文本"),
file_types=[".txt"],
interactive=True,
file_count="multiple",
scale=13)
with gr.Column(scale=7):
out = gr.File(label=i18n("批量合成输出的语音文件"),
file_types=[".wav"],
file_count="directory",)
start_batch_btn = gr.Button(i18n("开始批量合成"),
variant="primary",
size="lg",
interactive=True,
scale=25)
gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
with gr.Row(): with gr.Row():
with gr.Column(scale=13): with gr.Column(scale=13):
@ -1286,6 +1830,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25)
output = gr.Audio(label=i18n("输出的语音"), scale=14) output = gr.Audio(label=i18n("输出的语音"), scale=14)
with gr.Row():
close_button = gr.Button(value=i18n("关闭服务器"), variant="danger", size="lg", scale=25)
close_button.click(close_serv)
inference_button.click( inference_button.click(
get_tts_wav, get_tts_wav,
[ [
@ -1305,9 +1854,71 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
sample_steps, sample_steps,
if_sr_Checkbox, if_sr_Checkbox,
pause_second_slider, pause_second_slider,
SaveSvEmb,
SaveRefers,
SaveSvEmbName,
SaveRefersName,
SaveGE,
SaveGEName,
InjectSvEmb,
InjectRefers,
InjectSvEmbName,
InjectRefersName,
EnableAudioLoad,
SaveOutputAsUndecoded,
SaveOutputAsUndecodedName,
AddRandomSaltToSaveOutputAsUndecodedName,
], ],
[output], [output],
api_name="get_tts_wav",
) )
start_batch_btn.click(
batch_tts,
[
inp_ref,
prompt_text,
prompt_language,
txt_paths,
text_language,
how_to_cut,
top_k,
top_p,
temperature,
ref_text_free,
speed,
if_freeze,
inp_refs,
sample_steps,
if_sr_Checkbox,
pause_second_slider,
SaveSvEmb,
SaveRefers,
SaveSvEmbName,
SaveRefersName,
SaveGE,
SaveGEName,
InjectSvEmb,
InjectRefers,
InjectSvEmbName,
InjectRefersName,
EnableAudioLoad,
SaveOutputAsUndecoded,
SaveOutputAsUndecodedName,
AddRandomSaltToSaveOutputAsUndecodedName,
],
[out],
api_name="batch_tts",
)
SoVITS_dropdown.change( SoVITS_dropdown.change(
change_sovits_weights, change_sovits_weights,
[SoVITS_dropdown, prompt_language, text_language], [SoVITS_dropdown, prompt_language, text_language],

View File

@ -0,0 +1,175 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchaudio
import math
from torchaudio.transforms import Resample
import VoiceSave
import uuid
def get_train_set(voice_file_path):
    """Load the tensors stored in one or more voice archives.

    Args:
        voice_file_path: A single archive path (``str`` or path-like) or an
            iterable of archive paths.

    Returns:
        A list with one entry per archive; each entry is the list of tensors
        returned by ``VoiceSave.load_tensor`` for that archive.
    """
    import os  # local import: only needed for the PathLike check

    # isinstance also covers str subclasses and path objects, which the
    # previous exact type comparison would have tried to iterate.
    if isinstance(voice_file_path, (str, os.PathLike)):
        voice_file_path = [voice_file_path]
    return [
        VoiceSave.load_tensor(
            archive,
            # A random name keeps concurrent loads in separate staging folders.
            f"get_{uuid.uuid4()}",
            find_func=VoiceSave.__find_func__,
            MySet=set(),
        )
        for archive in voice_file_path
    ]
class MelSpectrogram(nn.Module):
    """Log-magnitude mel spectrogram extractor configured from ``hps.data``."""

    def __init__(self, hps):
        super().__init__()
        data = hps.data
        self.filter_length = data.filter_length
        self.hop_length = data.hop_length
        self.win_length = data.win_length
        self.sampling_rate = data.sampling_rate
        self.n_mel_channels = data.n_mel_channels
        # Optional settings: fall back to 0 / None when hps.data lacks them.
        self.mel_fmin = getattr(data, 'mel_fmin', 0)
        self.mel_fmax = getattr(data, 'mel_fmax', None)

        # Build the mel spectrogram transform.
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.sampling_rate,
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            f_min=self.mel_fmin,
            f_max=self.mel_fmax,
            n_mels=192,  # fixed at 192; self.n_mel_channels is not used here
            window_fn=torch.hann_window,
            center=False,
            power=1.0,
        )

    def forward(self, audio):
        """Return the log mel spectrogram of ``audio``.

        Input: audio shaped [B, 1, T] or [1, T] (mono).
        Output: mel_spec shaped [B, n_mel, T'].
        """
        if audio.dim() == 2:
            audio = audio.unsqueeze(0)  # [1, T] -> [1, 1, T]
        # Drop the channel axis before extracting the mel spectrogram.
        mel_spec = self.mel_transform(audio.squeeze(1))
        # Log scaling; clamp first so zeros do not produce -inf.
        return torch.log(torch.clamp(mel_spec, min=1e-5))
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding.

    ``forward`` slices the table by ``x.size(1)``, i.e. axis 1 of the input
    is treated as the sequence axis: ``x`` is ``[B, T, d_model]``.

    Args:
        d_model: Feature dimension of the inputs.
        max_seq_length: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_seq_length=5000):
        super().__init__()
        # Build the table in a local variable: assigning it to self.pe first
        # makes register_buffer('pe', ...) fail with "attribute 'pe' already
        # exists".
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even feature indices: sine
        # Odd feature indices: cosine. For an odd d_model there is one cosine
        # column fewer than sine columns, so trim to fit.
        pe[:, 1::2] = torch.cos(position * div_term)[:, : d_model // 2]
        # Buffer: moves with .to()/.cuda() and is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional encoding to ``x`` and return the sum."""
        return x + self.pe[:, :x.size(1)]
class Spliter(nn.Module):
    '''Transformer that maps an audio file plus a GE vector to (z_p, y_mask, ge).

    output: z_p shape: torch.Size([1, 192, x]), y_mask shape: torch.Size([1, 1, x]), ge shape: torch.Size([1, 1024, 1])
    '''
    def __init__(self,
                 hps,
                 ge,
                 device):
        """Build the projection layers and the Transformer encoder.

        Args:
            hps: Hyper-parameters; ``hps.model.nhead``, ``hps.model.ffn_dim``
                and ``hps.model.num_layers`` configure the encoder, and
                ``hps.data`` configures mel extraction.
            ge: Stored as ``self.ge``; ``forward`` uses its own ``ge`` argument.
            device: Device on which all sub-modules are placed.
        """
        super().__init__()
        self.hps = hps
        self.ge = ge
        self.device = device
        #TODO: feed mel_spec and ge into the Transformer model
        self.mel_dim = 192
        self.ge_dim = 1024
        self.transformer_dim = 512
        self.ge_proj = nn.Linear(self.ge_dim, self.transformer_dim).to(self.device)
        self.mel_proj = nn.Linear(self.mel_dim, self.transformer_dim).to(self.device)
        self.pos_encoder = PositionalEncoding(self.transformer_dim).to(self.device)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=self.transformer_dim,
                nhead=hps.model.nhead,
                dim_feedforward=hps.model.ffn_dim,
                batch_first=False,
                dropout=0.1
            ),
            num_layers=hps.model.num_layers
        ).to(self.device)
        self.out_proj = nn.Linear(self.transformer_dim, self.mel_dim).to(self.device)

    @torch.no_grad()
    def mel_(self,audio_path, hps, device, dtype):
        """Load ``audio_path`` and return its log mel spectrogram.

        The audio is mixed down to mono, resampled to
        ``hps.data.sampling_rate`` when needed, and scaled down when its peak
        exceeds 1.0. The result is cast to ``dtype``.
        """
        sr_target = int(hps.data.sampling_rate)
        audio, sr_origin = torchaudio.load(audio_path)
        if audio.shape[0] > 1:
            audio = audio.mean(0, keepdim=True)
        if sr_origin != sr_target:
            resampler = Resample(sr_origin, sr_target).to(device)
            audio = resampler(audio.to(device))
        else:
            audio = audio.to(device)
        max_audio = audio.abs().max()
        if max_audio > 1.0:
            audio = audio / max_audio
        mel_extractor = MelSpectrogram(hps).to(device)
        mel_spec = mel_extractor(audio).to(dtype)
        return mel_spec

    def forward(self, audio_path, ge,device,dtype):
        """Encode the audio at ``audio_path`` conditioned on ``ge``.

        Args:
            audio_path: Audio file to encode.
            ge: GE tensor, expected as [1, 1024, 1].
            device: Device for the intermediate tensors.
            dtype: dtype for the mel features, ``ge`` and the mask.

        Returns:
            ``(z_p, y_mask, ge)``: ``z_p`` is [1, 192, T], ``y_mask`` is an
            all-ones [1, 1, T] mask, and ``ge`` is the argument as passed in.
        """
        ge_ = ge
        mel = self.mel_(audio_path, self.hps, device, dtype)
        mel = mel.permute(2, 0, 1)
        # Project the mel frames to the Transformer width: [T, 1, 512].
        mel_feat = self.mel_proj(mel)
        # GE: [1, 1024, 1] -> [1, 1024] -> [1, 1, 512].
        ge = ge.to(device, dtype=dtype)
        ge_squeeze = ge.squeeze(-1)  # [1, 1024]
        ge_feat = self.ge_proj(ge_squeeze).unsqueeze(0)  # [1, 1, 512]
        # Prepend the GE token to the mel sequence: [T+1, 1, 512].
        transformer_input = torch.cat([ge_feat, mel_feat], dim=0)
        # The positional encoder treats axis 1 as the sequence axis, while
        # this tensor is sequence-first. Swap the axes around the call so each
        # step gets its own position rather than all sharing position 0.
        # NOTE(review): sequences longer than the encoder's table (default
        # 5000 positions) are not handled.
        transformer_input = self.pos_encoder(
            transformer_input.transpose(0, 1)
        ).transpose(0, 1)
        transformer_out = self.transformer(transformer_input)  # [T+1, 1, 512]
        # Drop the leading GE token, keeping the mel positions: [T, 1, 512].
        mel_out = transformer_out[1:, :, :]
        # Project back to the mel width: [T, 1, 192].
        mel_out = self.out_proj(mel_out)
        # Rearrange to the target layout [1, 192, T].
        z_p = mel_out.permute(1, 2, 0)
        T = z_p.shape[-1]  # number of time steps
        y_mask = torch.ones(1, 1, T, device=device, dtype=dtype)  # all-ones mask
        return z_p, y_mask, ge_
class SpliterDataset(torch.utils.data.Dataset):
    """Dataset whose items are the tensor lists loaded from voice archives."""

    def __init__(self, voice_file_paths):
        """Load every archive in ``voice_file_paths`` through ``get_train_set``."""
        self.voice_file_paths = voice_file_paths
        self.datas = get_train_set(voice_file_paths)

    def __len__(self):
        """Return the number of loaded archives."""
        return len(self.datas)

    def __getitem__(self, idx):
        """Return the loaded entry at position ``idx``."""
        sample = self.datas[idx]
        return sample

View File

@ -25,6 +25,53 @@ import contextlib
import random import random
import torchaudio
from torchaudio.transforms import Resample
import os
from pathlib import Path
def merge_dir_txt2(*TXT):
    """Join the given path fragments with ``os.path.join`` and wrap the result in ``Path``."""
    joined = os.path.join(*TXT)
    return Path(joined)
def get_my_dir():
    """Return the absolute directory that contains this module, as a string."""
    module_file = os.path.abspath(__file__)
    return os.path.dirname(module_file)
def get_parent_dir(dir_path, depth=1):
    """Return the ``Path`` reached by stepping ``depth`` levels up from ``dir_path``."""
    current = Path(dir_path)
    for _level in range(depth):
        current = current.parent
    return current
# Names that have already been handed out by _get_unique_name (other code in
# this module may also add names to it).
POOL: set = set()


def _get_unique_name(name, MySet: set = None):
    """Reserve and return a name that is not yet in ``POOL`` nor in ``MySet``.

    If ``name`` itself is free it is returned unchanged; otherwise a numeric
    suffix is appended to the *original* name (``name_2``, ``name_3``, ...)
    until an unused candidate is found.

    Args:
        name: Desired base name.
        MySet: Optional extra collection of names that must also be avoided.
            It is only read, never modified.

    Returns:
        The reserved name; it has been added to ``POOL``.
    """
    reserved = MySet if MySet is not None else ()
    candidate = name
    suffix = 1
    while candidate in POOL or candidate in reserved:
        suffix += 1
        # Build from the base name so suffixes do not pile up ("a_2_3").
        candidate = f'{name}_{suffix}'
    POOL.add(candidate)
    return candidate
def find_func(zf, il):
    """Resolve the paths of the entries listed in the archive's ``voice.json``.

    Args:
        zf: Object exposing ``get_file_path(name)``; it is asked for
            ``"voice.json"`` first and then for every listed entry.
        il: Object exposing ``load_info(path)``, used to load ``voice.json``.

    Returns:
        ``None`` when ``load_info`` returns ``None``; otherwise the list of
        paths for the entries of ``info["access_list"]`` that could be
        resolved. Entries whose lookup raises ``FileNotFoundError`` are left out.
    """
    info = il.load_info(zf.get_file_path("voice.json"))
    if info is None:
        return None

    listed_names = info["access_list"]
    # Register every listed name in the module-wide pool, resolvable or not.
    POOL.update(listed_names)

    resolved = []
    for entry in listed_names:
        try:
            entry_path = zf.get_file_path(entry)
        except FileNotFoundError:
            continue
        else:
            resolved.append(entry_path)
    return resolved
# Parent of the directory that contains this module, stored as a string.
ROOT_DIR = str(get_parent_dir(get_my_dir()))
class StochasticDurationPredictor(nn.Module): class StochasticDurationPredictor(nn.Module):
def __init__( def __init__(
self, self,
@ -153,7 +200,7 @@ class DurationPredictor(nn.Module):
WINDOW = {} WINDOW = {}
class TextEncoder(nn.Module): class TextEncoder(nn.Module):
def __init__( def __init__(
self, self,
out_channels, out_channels,
@ -989,10 +1036,8 @@ class SynthesizerTrn(nn.Module):
o = self.dec((z * y_mask)[:, :, :], g=ge) o = self.dec((z * y_mask)[:, :, :], g=ge)
return o, y_mask, (z, z_p, m_p, logs_p) return o, y_mask, (z, z_p, m_p, logs_p)
@torch.no_grad() @torch.no_grad()
def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None): def ge_(self, refer, sv_emb=None, InjectGE=False, GE=None, LoadGE=True):
def get_ge(refer, sv_emb): def get_ge(refer, sv_emb):
ge = None ge = None
if refer is not None: if refer is not None:
@ -1006,16 +1051,36 @@ class SynthesizerTrn(nn.Module):
sv_emb = self.sv_emb(sv_emb) # B*20480->B*512 sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
ge += sv_emb.unsqueeze(-1) ge += sv_emb.unsqueeze(-1)
ge = self.prelu(ge) ge = self.prelu(ge)
print(f"ge.shape : {ge.shape}")
return ge return ge
if type(refer) == list: if LoadGE:
ges = [] if type(refer) == list:
for idx, _refer in enumerate(refer): ges = []
ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) for idx, _refer in enumerate(refer):
ges.append(ge) ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None)
ge = torch.stack(ges, 0).mean(0) ges.append(ge)
ge = torch.stack(ges, 0).mean(0)
else:
ge = get_ge(refer, sv_emb)
else: else:
ge = get_ge(refer, sv_emb) if InjectGE:
if type(GE) == list:
GE = torch.stack(GE, 0).mean(0)
ge = GE
else:
raise ValueError("No GE stream provided!")
return ge
@torch.no_grad()
def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None,
InjectGE=False,GE=None,LoadGE=True,
InjectZP=False,ZP=None,LoadZP=True,
OverWrite_Mask=False,Mask=None,
SaveGE=False,SaveZP=False,SaveMask=False,
GE_Name=None, ZP_Name=None, Mask_Name=None,
VoiceSave=None):
ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE)
y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
@ -1031,14 +1096,75 @@ class SynthesizerTrn(nn.Module):
self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge, self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
speed, speed,
) )
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
if InjectZP:
if type(ZP) == list:
ZP = torch.stack(ZP, 0).mean(0)
else:
ZP = ZP
z_p = ZP
else:
if LoadZP:
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
else:
raise ValueError("No z_p stream provided!")
if OverWrite_Mask:
if type(Mask) == list:
Mask = torch.stack(Mask, 0).mean(0)
if Mask is None:
raise ValueError("No mask stream provided!")
y_mask = Mask
print(f"z_p shape: {z_p.shape}, y_mask shape: {y_mask.shape}, ge shape: {ge.shape}")
z = self.flow(z_p, y_mask, g=ge, reverse=True) z = self.flow(z_p, y_mask, g=ge, reverse=True)
o = self.dec((z * y_mask)[:, :, :], g=ge) o = self.dec((z * y_mask)[:, :, :], g=ge)
return o return o
@torch.no_grad()
def decode2(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None,
            InjectGE=False, GE=None, LoadGE=True,
            InjectZP=False, ZP=None, LoadZP=True,
            OverWrite_Mask=False, Mask=None,):
    """Compute the decoder inputs ``(z_p, y_mask, ge)`` without running flow/decoder.

    Args:
        codes: Semantic codes; ``codes.size(2) * 2`` is used as the target length.
        text: Text/phoneme tensor; its last dimension is used as the text length.
        refer: Reference input forwarded to ``self.ge_``.
        noise_scale: Scale of the Gaussian noise added when ``z_p`` is sampled.
        speed: Forwarded to ``self.enc_p``.
        sv_emb: Speaker-verification embedding forwarded to ``self.ge_``.
        InjectGE, GE, LoadGE: Forwarded unchanged to ``self.ge_``.
        InjectZP: When true, use ``ZP`` as ``z_p`` instead of sampling it.
        ZP: Tensor, or list/tuple of tensors that is stacked and averaged.
        LoadZP: When ``InjectZP`` is false, must be true for ``z_p`` to be sampled.
        OverWrite_Mask: When true, replace the encoder mask with ``Mask``.
        Mask: Tensor, or list/tuple of tensors that is stacked and averaged.

    Returns:
        Tuple ``(z_p, y_mask, ge)``.

    Raises:
        ValueError: If a stream is requested but not supplied (``InjectZP``
            without ``ZP``, neither ``InjectZP`` nor ``LoadZP``, or
            ``OverWrite_Mask`` without ``Mask``).
    """
    ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE)

    y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
    text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)

    quantized = self.quantizer.decode(codes)
    if self.semantic_frame_rate == "25hz":
        # Double the frame count so it matches y_lengths computed above.
        quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")

    # The prior encoder always runs: its mask is needed even when z_p is injected.
    _, m_p, logs_p, y_mask, _, _ = self.enc_p(
        quantized,
        y_lengths,
        text,
        text_lengths,
        self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
        speed,
    )

    if InjectZP:
        if ZP is None:
            # Fail with a clear message instead of crashing later on None.shape.
            raise ValueError("No z_p stream provided!")
        if isinstance(ZP, (list, tuple)):
            ZP = torch.stack(list(ZP), 0).mean(0)
        z_p = ZP
    elif LoadZP:
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
    else:
        raise ValueError("No z_p stream provided!")

    if OverWrite_Mask:
        if Mask is None:
            raise ValueError("No mask stream provided!")
        if isinstance(Mask, (list, tuple)):
            Mask = torch.stack(list(Mask), 0).mean(0)
        y_mask = Mask

    print(f"z_p shape: {z_p.shape}, y_mask shape: {y_mask.shape}, ge shape: {ge.shape}")
    return z_p, y_mask, ge
@torch.no_grad() @torch.no_grad()
def decode_streaming(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, result_length:int=None, overlap_frames:torch.Tensor=None, padding_length:int=None): def decode_streaming(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, result_length:int=None, overlap_frames:torch.Tensor=None, padding_length:int=None):
def get_ge(refer, sv_emb): def get_ge(refer, sv_emb):

View File

@ -432,6 +432,8 @@ class ResidualCouplingLayer(nn.Module):
self.post.bias.data.zero_() self.post.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False): def forward(self, x, x_mask, g=None, reverse=False):
print(f"x.shape: {x.shape}, x_mask.shape: {x_mask.shape}")
x0, x1 = torch.split(x, [self.half_channels] * 2, 1) x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) * x_mask h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g) h = self.enc(h, x_mask, g=g)

View File

@ -1,9 +1,10 @@
import sys import sys
import os import os
import torch import torch
from pathlib import Path
sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net") sys.path.append(f"{str(Path(os.path.dirname(os.path.abspath(__file__))).parent)}/GPT_SoVITS/eres2net")
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" sv_path = f"{str(Path(os.path.dirname(os.path.abspath(__file__))).parent)}/GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
from ERes2NetV2 import ERes2NetV2 from ERes2NetV2 import ERes2NetV2
import kaldi as Kaldi import kaldi as Kaldi

View File

@ -82,6 +82,15 @@ conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5] pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
``` ```
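If `install.ps1` fails, you can re-run it, or install manually with the commands below (`inst.bat` installs FFmpeg and CMake through conda; `inst2.ps1` downloads the models and installs the Python dependencies):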
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
.\inst.bat
pwsh -F inst2.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
```
### Linux ### Linux
```bash ```bash

3
conda-go-webui.bat Normal file
View File

@ -0,0 +1,3 @@
rem Launch the WebUI (zh_CN) inside the conda environment named by the first argument.
chcp 65001
cd /d %~dp0
rem Activate in THIS shell: piping "conda activate" into python would run the
rem activation in a separate child cmd, so python would start without it.
call conda activate %1
python -I webui.py zh_CN

5
config.json Normal file
View File

@ -0,0 +1,5 @@
{
"GPU_CHECK":{
"DisableGPUMemCheck":false
}
}

View File

@ -1,11 +1,20 @@
import os import os
import re import re
import sys import sys
import json
from pathlib import Path
import torch import torch
from tools.i18n.i18n import I18nAuto from tools.i18n.i18n import I18nAuto
# Directory that contains this module, as a string.
current_dir = str(Path(__file__).parent)


def merge_dir_txt2(*TXT):
    """Join the given path fragments with ``os.path.join`` and return a ``Path``."""
    return Path(os.path.join(*TXT))


# Optional settings file that lives next to this module.
config_json_location = merge_dir_txt2(current_dir, "config.json")

# Settings used when config.json is absent; mirrors the shipped file.
_DEFAULT_INFO = {"GPU_CHECK": {"DisableGPUMemCheck": False}}

try:
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(str(config_json_location), "r", encoding="utf-8") as f:
        __info__ = json.load(f)
except FileNotFoundError:
    # A missing settings file should not prevent this module from importing.
    __info__ = _DEFAULT_INFO
i18n = I18nAuto(language=os.environ.get("language", "Auto")) i18n = I18nAuto(language=os.environ.get("language", "Auto"))
@ -159,8 +168,9 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
major, minor = capability major, minor = capability
sm_version = major + minor / 10.0 sm_version = major + minor / 10.0
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5 is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
if mem_gb < 4 or sm_version < 5.3: if not __info__["GPU_CHECK"]["DisableGPUMemCheck"]:
return cpu, torch.float32, 0.0, 0.0 if mem_gb < 4 or sm_version < 5.3:
return cpu, torch.float32, 0.0, 0.0
if sm_version == 6.1 or is_16_series == True: if sm_version == 6.1 or is_16_series == True:
return cuda, torch.float32, sm_version, mem_gb return cuda, torch.float32, sm_version, mem_gb
if sm_version > 6.1: if sm_version > 6.1:

3
inst.bat Normal file
View File

@ -0,0 +1,3 @@
rem Install the native build/runtime tools (FFmpeg, CMake) from conda-forge
rem into the currently active conda environment.
chcp 65001
rem "call" is required: conda is itself a batch script, and invoking a batch
rem script without "call" never returns control to the lines that follow.
call conda install -y -c conda-forge ffmpeg
call conda install -y -c conda-forge cmake

209
inst2.ps1 Normal file
View File

@ -0,0 +1,209 @@
# Script parameters:
#   -Device        which PyTorch build to install (CUDA 12.6, CUDA 12.8 or CPU)
#   -Source        which mirror the model archives are downloaded from
#   -DownloadUVR5  also download the UVR5 weights
Param (
[Parameter(Mandatory=$true)][ValidateSet("CU126", "CU128", "CPU")][string]$Device,
[Parameter(Mandatory=$true)][ValidateSet("HF", "HF-Mirror", "ModelScope")][string]$Source,
[switch]$DownloadUVR5
)
# Turn every error into a terminating error so the trap below sees it.
$global:ErrorActionPreference = 'Stop'
# Any terminating error is handed to Write-ErrorLog, which prints a report and exits with code 1.
trap {
Write-ErrorLog $_
}
function Write-ErrorLog {
    # Print a formatted report (message, failing command, location, call stack)
    # for the given error record, then terminate the script with exit code 1.
    param (
        [System.Management.Automation.ErrorRecord]$ErrorRecord
    )

    Write-Host "`n[ERROR] Command failed:" -ForegroundColor Red
    if ($ErrorRecord.Exception.Message) {
        Write-Host "Message:" -ForegroundColor Red
        $ErrorRecord.Exception.Message -split "`n" | ForEach-Object {
            Write-Host " $_"
        }
    }

    Write-Host "Command:" -ForegroundColor Red -NoNewline
    # Parenthesized so the .Replace() calls are evaluated as an expression;
    # as a bare command argument, "text".Replace(...) is not a method call and
    # the line breaks would not be stripped from the echoed command.
    Write-Host (" $($ErrorRecord.InvocationInfo.Line)".Replace("`r", "").Replace("`n", ""))

    Write-Host "Location:" -ForegroundColor Red -NoNewline
    Write-Host " $($ErrorRecord.InvocationInfo.ScriptName):$($ErrorRecord.InvocationInfo.ScriptLineNumber)"

    Write-Host "Call Stack:" -ForegroundColor DarkRed
    $ErrorRecord.ScriptStackTrace -split "`n" | ForEach-Object {
        Write-Host " $_" -ForegroundColor DarkRed
    }

    exit 1
}
function Write-Info {
    # Print a green "[INFO]:" tag followed by the message on the same line.
    param($msg)

    Write-Host -Object "[INFO]:" -ForegroundColor Green -NoNewline
    Write-Host -Object " $msg"
}
function Write-Success {
    # Print a blue "[SUCCESS]:" tag followed by the message on the same line.
    param($msg)

    Write-Host -Object "[SUCCESS]:" -ForegroundColor Blue -NoNewline
    Write-Host -Object " $msg"
}
function Invoke-Pip {
    # Run "pip install" with the given arguments. When pip exits with a
    # non-zero code, echo everything it printed and throw an exception whose
    # message is that output.
    param (
        [Parameter(ValueFromRemainingArguments = $true)]
        [string[]]$Args
    )

    $output = & pip install @Args 2>&1
    $exitCode = $LASTEXITCODE
    if ($exitCode -eq 0) {
        return
    }

    $errorMessages = @()
    Write-Host "Pip Install $Args Failed" -ForegroundColor Red
    foreach ($item in $output) {
        $isErrorRecord = $item -is [System.Management.Automation.ErrorRecord]
        if (-not $isErrorRecord) {
            # Ordinary stdout line: echo as-is.
            Write-Host $item
            $errorMessages += $item
            continue
        }

        # stderr line: show only the message text, in red.
        $msg = $item.Exception.Message
        Write-Host "$msg" -ForegroundColor Red
        $errorMessages += $msg
    }

    throw [System.Exception]::new(($errorMessages -join "`n"))
}
function Invoke-Download {
    # Fetch $Uri with Invoke-WebRequest, saving to $OutFile when one is given.
    # On failure, report the URI and rethrow the original error.
    param (
        [Parameter(Mandatory = $true)]
        [string]$Uri,

        [Parameter()]
        [string]$OutFile
    )

    try {
        $request = @{ Uri = $Uri }
        if ($OutFile) {
            $request.OutFile = $OutFile
        }

        $null = Invoke-WebRequest @request -ErrorAction Stop
    }
    catch {
        Write-Host "Failed to download:" -ForegroundColor Red
        Write-Host " $Uri"
        throw
    }
}
function Invoke-Unzip {
    # Extract the archive into $DestPath (overwriting existing files), then
    # delete the archive.
    param(
        $ZipPath,
        $DestPath
    )

    Expand-Archive -Path $ZipPath -DestinationPath $DestPath -Force
    Remove-Item -Path $ZipPath -Force
}
chcp 65001
Set-Location $PSScriptRoot

# --- Pick the download mirror -------------------------------------------------
# Each mirror serves the same file names under a different base URL.
$SourceTable = @{
    "HF"         = @{ Label = "HuggingFace";        Base = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main" }
    "HF-Mirror"  = @{ Label = "HuggingFace-Mirror"; Base = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main" }
    "ModelScope" = @{ Label = "ModelScope";         Base = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master" }
}
$Mirror = $SourceTable[$Source]
Write-Info "Download Model From $($Mirror.Label)"

$PretrainedURL = "$($Mirror.Base)/pretrained_models.zip"
$G2PWURL       = "$($Mirror.Base)/G2PWModel.zip"
$UVR5URL       = "$($Mirror.Base)/uvr5_weights.zip"
$NLTKURL       = "$($Mirror.Base)/nltk_data.zip"
$OpenJTalkURL  = "$($Mirror.Base)/open_jtalk_dic_utf_8-1.11.tar.gz"

# --- Model archives (skipped when the target directory already exists) --------
if (Test-Path "GPT_SoVITS/pretrained_models/sv") {
    Write-Info "Pretrained Model Exists"
    Write-Info "Skip Downloading Pretrained Models"
}
else {
    Write-Info "Downloading Pretrained Models..."
    Invoke-Download -Uri $PretrainedURL -OutFile "pretrained_models.zip"
    Invoke-Unzip "pretrained_models.zip" "GPT_SoVITS"
    Write-Success "Pretrained Models Downloaded"
}

if (Test-Path "GPT_SoVITS/text/G2PWModel") {
    Write-Info "G2PWModel Exists"
    Write-Info "Skip Downloading G2PWModel"
}
else {
    Write-Info "Downloading G2PWModel..."
    Invoke-Download -Uri $G2PWURL -OutFile "G2PWModel.zip"
    Invoke-Unzip "G2PWModel.zip" "GPT_SoVITS/text"
    Write-Success "G2PWModel Downloaded"
}

if ($DownloadUVR5) {
    if (Test-Path "tools/uvr5/uvr5_weights") {
        Write-Info "UVR5 Models Exists"
        Write-Info "Skip Downloading UVR5 Models"
    }
    else {
        Write-Info "Downloading UVR5 Models..."
        Invoke-Download -Uri $UVR5URL -OutFile "uvr5_weights.zip"
        Invoke-Unzip "uvr5_weights.zip" "tools/uvr5"
        Write-Success "UVR5 Models Downloaded"
    }
}

# --- PyTorch for the requested device -----------------------------------------
$TorchBuilds = @{
    "CU128" = @{ Label = "CUDA 12.8"; Index = "https://download.pytorch.org/whl/cu128" }
    "CU126" = @{ Label = "CUDA 12.6"; Index = "https://download.pytorch.org/whl/cu126" }
    "CPU"   = @{ Label = "CPU";       Index = "https://download.pytorch.org/whl/cpu" }
}
$Build = $TorchBuilds[$Device]
Write-Info "Installing PyTorch For $($Build.Label)..."
Invoke-Pip torch --index-url $Build.Index
Write-Success "PyTorch Installed"

# --- Python dependencies ------------------------------------------------------
Write-Info "Installing Python Dependencies From requirements.txt..."
Invoke-Pip -r extra-req.txt --no-deps
Invoke-Pip -r requirements.txt
Write-Success "Python Dependencies Installed"

# --- NLTK data: unpacked into the active Python prefix ------------------------
Write-Info "Downloading NLTK Data..."
Invoke-Download -Uri $NLTKURL -OutFile "nltk_data.zip"
$pythonPrefix = (python -c "import sys; print(sys.prefix)").Trim()
Invoke-Unzip "nltk_data.zip" $pythonPrefix

# --- Open JTalk dictionary: unpacked next to the pyopenjtalk package ----------
Write-Info "Downloading Open JTalk Dict..."
Invoke-Download -Uri $OpenJTalkURL -OutFile "open_jtalk_dic_utf_8-1.11.tar.gz"
$target = (python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))").Trim()
tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C $target
Remove-Item "open_jtalk_dic_utf_8-1.11.tar.gz" -Force
Write-Success "Open JTalk Dic Downloaded"

Write-Success "Installation Completed"

View File

@ -52,7 +52,7 @@ function Invoke-Conda {
[string[]]$Args [string[]]$Args
) )
$output = & conda install -y -q -c conda-forge @Args 2>&1 $output = & conda install -y -c conda-forge @Args 2>&1
$exitCode = $LASTEXITCODE $exitCode = $LASTEXITCODE
if ($exitCode -ne 0) { if ($exitCode -ne 0) {