support sovits v2Pro v2ProPlus

support sovits v2Pro v2ProPlus
This commit is contained in:
RVC-Boss 2025-06-04 15:15:54 +08:00 committed by GitHub
parent 663c3cc6fc
commit b7c0c5ca87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 640 additions and 133 deletions

2
api.py
View File

@ -374,7 +374,7 @@ hz = 50
def get_gpt_weights(gpt_path):
dict_s1 = torch.load(gpt_path, map_location="cpu")
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
config = dict_s1["config"]
max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)

442
batch_inference.py Normal file
View File

@ -0,0 +1,442 @@
import argparse
import os
import pdb
import signal
import sys
from time import time as ttime
import torch
import librosa
import soundfile as sf
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import StreamingResponse
import uvicorn
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
from feature_extractor import cnhubert
from io import BytesIO
from module.models import SynthesizerTrn
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from module.mel_processing import spectrogram_torch
from my_utils import load_audio
import config as global_config
g_config = global_config.Config()
# AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
# Command-line interface of the script. Model paths, device, port and the
# hubert/bert paths take their defaults from the global Config object
# (g_config); the default-reference options default to empty strings.
parser = argparse.ArgumentParser(description="GPT-SoVITS api")
parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
parser.add_argument("-dr", "--default_refer_path", type=str, default="",
help="默认参考音频路径, 请求缺少参考音频时调用")
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
# Usage of the boolean flags: `python ./api.py -fp ...`
# In that case full_precision==True and half_precision==False
parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
# NOTE: parsing happens at import time, so importing this module consumes sys.argv.
args = parser.parse_args()
# Copy the parsed CLI values into the module-level names used below.
sovits_path = args.sovits_path
gpt_path = args.gpt_path
device = args.device
port = args.port
host = args.bind_addr

# An empty model path falls back to the pretrained checkpoint from the config.
if sovits_path == "":
    sovits_path = g_config.pretrained_sovits_path
    print(f"[WARN] 未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
if gpt_path == "":
    gpt_path = g_config.pretrained_gpt_path
    print(f"[WARN] 未指定GPT模型路径, fallback后当前值: {gpt_path}")

# Default reference audio, used when a caller supplies none or only part of
# the reference parameters. The preset counts only if path, text and
# language are all given; otherwise all three are cleared.
default_refer_path = args.default_refer_path
default_refer_text = args.default_refer_text
default_refer_language = args.default_refer_language
has_preset = "" not in (default_refer_path, default_refer_text, default_refer_language)
if has_preset:
    print(f"[INFO] 默认参考音频路径: {default_refer_path}")
    print(f"[INFO] 默认参考音频文本: {default_refer_text}")
    print(f"[INFO] 默认参考音频语种: {default_refer_language}")
else:
    default_refer_path, default_refer_text, default_refer_language = "", "", ""
    print("[INFO] 未指定默认参考音频")
# Resolve the inference precision. Exactly one of -fp / -hp overrides the
# configured value; passing neither flag, or both at once, keeps
# g_config.is_half.
if args.full_precision == args.half_precision:
    is_half = g_config.is_half
else:
    is_half = args.half_precision
print(f"[INFO] 半精: {is_half}")
# Point the cnhubert module at the chosen checkpoint, then load the BERT
# tokenizer and masked-LM model used for text features.
cnhubert_base_path = args.hubert_path
bert_path = args.bert_path
cnhubert.cnhubert_base_path = cnhubert_base_path
tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
# Cast to half precision first when requested, then move to the target device.
if is_half:
    bert_model = bert_model.half()
bert_model = bert_model.to(device)
def get_bert_feature(text, word2ph):
    """Return phone-level BERT features for *text*.

    The text is run through the module-level ``tokenizer`` and ``bert_model``;
    the third-from-last hidden state is taken, and its first and last
    positions are dropped (presumably the special tokens -- confirm against
    the tokenizer). Row ``i`` of the remainder is repeated ``word2ph[i]``
    times, and the stacked result is returned transposed.

    ``word2ph`` must have exactly one entry per character of ``text``.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt")
        # Inputs are integer tensors, so only the device matters here; the
        # floating-point precision follows bert_model.
        for name in encoded:
            encoded[name] = encoded[name].to(device)
        outputs = bert_model(**encoded, output_hidden_states=True)
        hidden = torch.cat(outputs["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
    assert len(word2ph) == len(text)
    per_phone = [hidden[pos].repeat(count, 1) for pos, count in enumerate(word2ph)]
    return torch.cat(per_phone, dim=0).T
n_semantic = 1024
# Load the SoVITS checkpoint on CPU and read its "config" entry.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary
# objects -- only load checkpoints from a trusted source.
dict_s2 = torch.load(sovits_path, map_location="cpu", weights_only=False)
hps = dict_s2["config"]
print(hps)
class DictToAttrRecursive(dict):
    """Dictionary whose entries are also available as attributes.

    Every key is mirrored as an instance attribute, and nested dictionaries
    are wrapped recursively so that ``cfg.model.dim`` and
    ``cfg["model"]["dim"]`` address the same value.
    """

    def __init__(self, input_dict):
        """Copy *input_dict*, wrapping nested dicts and mirroring each key."""
        super().__init__(input_dict)
        for key, value in input_dict.items():
            # __setattr__ wraps nested dicts and stores the value both as a
            # mapping entry and as an attribute, so a single call is enough
            # (wrapping here as well would convert every sub-dict twice).
            setattr(self, key, value)

    def __getattr__(self, item):
        """Resolve attributes that are not instance attributes via the mapping.

        Raises:
            AttributeError: if *item* is not a key of the dictionary.
        """
        try:
            return self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None

    def __setattr__(self, key, value):
        """Store *value* as both mapping entry and attribute, wrapping dicts."""
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
        """Remove *item* from the mapping and from the instance attributes.

        Raises:
            AttributeError: if *item* is not a key of the dictionary.
        """
        try:
            del self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None
        # The value was also mirrored into the instance __dict__ by
        # __setattr__; drop it there too, otherwise ``obj.item`` would keep
        # resolving after the deletion.
        self.__dict__.pop(item, None)
# Make the SoVITS hyper-parameters attribute-accessible and force the
# semantic frame rate setting used by this script.
hps = DictToAttrRecursive(hps)
hps.model.semantic_frame_rate = "25hz"

# GPT checkpoint, loaded on CPU; its "config" entry configures the
# text-to-semantic model below.
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
config = dict_s1["config"]

# SSL feature extractor.
ssl_model = cnhubert.get_model()
if is_half:
    ssl_model = ssl_model.half()
ssl_model = ssl_model.to(device)

# SoVITS synthesizer: built from the checkpoint config, cast and moved to the
# device, switched to eval mode, and only then given its weights.
vq_model = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
if is_half:
    vq_model = vq_model.half()
vq_model = vq_model.to(device)
vq_model.eval()
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))

# Text-to-semantic (GPT) model.
hz = 50
max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
t2s_model.load_state_dict(dict_s1["weight"])
t2s_model = (t2s_model.half() if is_half else t2s_model).to(device)
t2s_model.eval()
total = sum(param.nelement() for param in t2s_model.parameters())
print("Number of parameter: %.2fM" % (total / 1e6))
def get_spepc(hps, filename):
    """Return the spectrogram of the audio file *filename*.

    The file is loaded with ``load_audio`` at ``hps.data.sampling_rate``,
    given a leading dimension, and passed to ``spectrogram_torch`` together
    with the filter, hop and window lengths from ``hps.data``
    (``center=False``).
    """
    data_cfg = hps.data
    waveform = torch.FloatTensor(load_audio(filename, int(data_cfg.sampling_rate)))
    return spectrogram_torch(
        waveform.unsqueeze(0),
        data_cfg.filter_length,
        data_cfg.sampling_rate,
        data_cfg.hop_length,
        data_cfg.win_length,
        center=False,
    )
# Maps every accepted spelling of a language (Chinese name, upper-case code,
# lower-case code) to the internal lower-case language code.
dict_language = {
    alias: code
    for aliases in (("中文", "英文", "日文"), ("ZH", "EN", "JA"), ("zh", "en", "ja"))
    for alias, code in zip(aliases, ("zh", "en", "ja"))
}
# Synthesize `text` in the voice of the reference recording `ref_wav_path`.
#
# Parameters:
#   ref_wav_path    - path of the reference audio file
#   prompt_text     - transcript of the reference audio
#   prompt_language - language of the prompt; must be a key of dict_language
#   text            - text to synthesize; split on "\n" and synthesized piece by piece
#   text_language   - language of `text`; must be a key of dict_language
#
# Returns a tuple (hps.data.sampling_rate, int16 numpy array) where the array
# is the concatenation of every synthesized piece, each followed by zero_wav.
#
# Relies on the module-level models (ssl_model, vq_model, t2s_model,
# bert_model) and settings (hps, config, is_half, device, hz, max_sec).
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
t0 = ttime()
prompt_text = prompt_text.strip("\n")
prompt_language, text = prompt_language, text.strip("\n")
# Block of int(sampling_rate * 0.3) zero samples; appended to the reference
# audio and after every synthesized piece.
zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
with torch.no_grad():
# The reference audio is loaded at 16 kHz for the SSL model.
wav16k, sr = librosa.load(ref_wav_path, sr=16000)
wav16k = torch.from_numpy(wav16k)
zero_wav_torch = torch.from_numpy(zero_wav)
if (is_half == True):
wav16k = wav16k.half().to(device)
zero_wav_torch = zero_wav_torch.half().to(device)
else:
wav16k = wav16k.to(device)
zero_wav_torch = zero_wav_torch.to(device)
wav16k=torch.cat([wav16k,zero_wav_torch])
ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float()
# Latent codes of the reference audio; codes[0, 0] becomes the prompt.
codes = vq_model.extract_latent(ssl_content)
prompt_semantic = codes[0, 0]
t1 = ttime()
# Raises KeyError for a language name that is not in dict_language.
prompt_language = dict_language[prompt_language]
text_language = dict_language[text_language]
phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
phones1 = cleaned_text_to_sequence(phones1)
texts = text.split("\n")
audio_opt = []
# NOTE: the loop variable reuses (and overwrites) the `text` parameter.
for text in texts:
phones2, word2ph2, norm_text2 = clean_text(text, text_language)
phones2 = cleaned_text_to_sequence(phones2)
# BERT features are computed only for "zh"; other languages get a zero
# tensor with 1024 rows and one column per phone.
# NOTE(review): bert1 depends only on the prompt but is rebuilt for every piece.
if (prompt_language == "zh"):
bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
else:
bert1 = torch.zeros((1024, len(phones1)), dtype=torch.float16 if is_half == True else torch.float32).to(
device)
if (text_language == "zh"):
bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
else:
bert2 = torch.zeros((1024, len(phones2))).to(bert1)
# Prompt and target are concatenated, for the features and for the phoneme ids.
bert = torch.cat([bert1, bert2], 1)
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
bert = bert.to(device).unsqueeze(0)
all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
prompt = prompt_semantic.unsqueeze(0).to(device)
t2 = ttime()
with torch.no_grad():
# pred_semantic = t2s_model.model.infer(
pred_semantic, idx = t2s_model.model.infer_panel(
all_phoneme_ids,
all_phoneme_len,
prompt,
bert,
# prompt_phone_len=ph_offset,
top_k=config['inference']['top_k'],
early_stop_num=hz * max_sec)
t3 = ttime()
# print(pred_semantic.shape,idx)
# Keep only the last `idx` predicted tokens.
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) # .unsqueeze(0)  # mq needs one more unsqueeze
# NOTE(review): the reference spectrogram is recomputed for every piece.
refer = get_spepc(hps, ref_wav_path) # .to(device)
if (is_half == True):
refer = refer.half().to(device)
else:
refer = refer.to(device)
# audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
# Decoding uses only the target phonemes (phones2), not the prompt's.
audio = \
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
refer).detach().cpu().numpy()[
0, 0] ### try reconstructing without the prompt part
audio_opt.append(audio)
audio_opt.append(zero_wav)
t4 = ttime()
# Durations between the t0..t4 timestamps taken above.
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
# yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
# Samples are scaled by 32768 and cast to int16.
return hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
# Batch variant of the single-text synthesis: the reference audio and prompt
# are processed once, then every entry of `textss` is synthesized.
#
# Parameters:
#   ref_wav_path    - path of the reference audio file
#   prompt_text     - transcript of the reference audio
#   prompt_language - language of the prompt; must be a key of dict_language
#   textss          - iterable of texts; each one is further split on "\n"
#   text_language   - language of the texts; must be a key of dict_language
#
# Returns a list of [original_text, int16 numpy array] pairs, one per entry
# of `textss`. Unlike get_tts_wav, no sampling rate is returned.
def get_tts_wavs(ref_wav_path, prompt_text, prompt_language, textss, text_language):
t0 = ttime()
prompt_text = prompt_text.strip("\n")
# Block of int(sampling_rate * 0.3) zero samples; appended to the reference
# audio and after every synthesized piece.
zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
with torch.no_grad():
# The reference audio is loaded at 16 kHz for the SSL model.
wav16k, sr = librosa.load(ref_wav_path, sr=16000)
wav16k = torch.from_numpy(wav16k)
zero_wav_torch = torch.from_numpy(zero_wav)
if (is_half == True):
wav16k = wav16k.half().to(device)
zero_wav_torch = zero_wav_torch.half().to(device)
else:
wav16k = wav16k.to(device)
zero_wav_torch = zero_wav_torch.to(device)
wav16k=torch.cat([wav16k,zero_wav_torch])
ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float()
# Latent codes of the reference audio; codes[0, 0] becomes the prompt.
codes = vq_model.extract_latent(ssl_content)
prompt_semantic = codes[0, 0]
t1 = ttime()
# Raises KeyError for a language name that is not in dict_language.
prompt_language = dict_language[prompt_language]
text_language = dict_language[text_language]
phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
phones1 = cleaned_text_to_sequence(phones1)
audios_opt=[]
for text0 in textss:
texts = text0.strip("\n").split("\n")
audio_opt = []
for text in texts:
# NOTE(review): strip("") removes nothing and +"" appends nothing, so this
# line is a no-op as written -- the intended characters may have been lost.
text=text.strip("")+""
phones2, word2ph2, norm_text2 = clean_text(text, text_language)
phones2 = cleaned_text_to_sequence(phones2)
# BERT features are computed only for "zh"; other languages get a zero
# tensor with 1024 rows and one column per phone.
# NOTE(review): bert1 depends only on the prompt but is rebuilt for every piece.
if (prompt_language == "zh"):
bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
else:
bert1 = torch.zeros((1024, len(phones1)), dtype=torch.float16 if is_half == True else torch.float32).to(
device)
if (text_language == "zh"):
bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
else:
bert2 = torch.zeros((1024, len(phones2))).to(bert1)
# Prompt and target are concatenated, for the features and for the phoneme ids.
bert = torch.cat([bert1, bert2], 1)
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
bert = bert.to(device).unsqueeze(0)
all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
prompt = prompt_semantic.unsqueeze(0).to(device)
t2 = ttime()
with torch.no_grad():
# pred_semantic = t2s_model.model.infer(
pred_semantic, idx = t2s_model.model.infer_panel(
all_phoneme_ids,
all_phoneme_len,
prompt,
bert,
# prompt_phone_len=ph_offset,
top_k=config['inference']['top_k'],
early_stop_num=hz * max_sec)
t3 = ttime()
# print(pred_semantic.shape,idx)
# Keep only the last `idx` predicted tokens.
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) # .unsqueeze(0)  # mq needs one more unsqueeze
# NOTE(review): the reference spectrogram is recomputed for every piece.
refer = get_spepc(hps, ref_wav_path) # .to(device)
if (is_half == True):
refer = refer.half().to(device)
else:
refer = refer.to(device)
# audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
# Decoding uses only the target phonemes (phones2), not the prompt's.
audio = \
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
refer).detach().cpu().numpy()[
0, 0] ### try reconstructing without the prompt part
audio_opt.append(audio)
audio_opt.append(zero_wav)
t4 = ttime()
# Durations between the t0..t4 timestamps taken above.
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
# One result per input text: the untouched text0 plus its audio, scaled by
# 32768 and cast to int16.
audios_opt.append([text0,(np.concatenate(audio_opt, 0) * 32768).astype(np.int16)])
return audios_opt
# get_tts_wav(r"D:\BaiduNetdiskDownload\gsv\speech\萧逸声音-你得先从滑雪的基本技巧学起.wav", "你得先从滑雪的基本技巧学起。", "中文", "我觉得还是该给喜欢的女孩子一场认真的告白。", "中文")
# with open(r"D:\BaiduNetdiskDownload\gsv\烟嗓-todo1.txt","r",encoding="utf8")as f:
# with open(r"D:\BaiduNetdiskDownload\gsv\年下-todo1.txt","r",encoding="utf8")as f:
# with open(r"D:\BaiduNetdiskDownload\gsv\萧逸3b.txt","r",encoding="utf8")as f:
# Batch driver: reads one text per line from a hard-coded file, synthesizes
# every line with a hard-coded reference recording, and writes one wav file
# per line, named "<index>-<text>.wav".
# NOTE(review): all paths are absolute Windows paths of the author's machine.
with open(r"D:\BaiduNetdiskDownload\gsv\萧逸4.txt","r",encoding="utf8")as f:
textss=f.read().split("\n")
for idx,(text,audio)in enumerate(get_tts_wavs(r"D:\BaiduNetdiskDownload\gsv\speech\萧逸声音-你得先从滑雪的基本技巧学起.wav", "你得先从滑雪的基本技巧学起。", "中文", textss, "中文")):
# for idx,(text,audio)in enumerate(get_tts_wavs(r"D:\BaiduNetdiskDownload\gsv\足够的能力,去制定好自己的生活规划。低沉烟嗓.MP3_1940480_2095360.wav", "足够的能力,去制定好自己的生活规划。", "中文", textss, "中文")):
# for idx,(text,audio)in enumerate(get_tts_wavs(r"D:\BaiduNetdiskDownload\gsv\不会呀!你前几天才吃过你还说好吃来着。年下少年音.MP3_537600_711040.wav", "不会呀!你前几天才吃过你还说好吃来着。", "中文", textss, "中文")):
print(idx,text)
# sf.write(r"D:\BaiduNetdiskDownload\gsv\output\烟嗓第一批\%04d-%s.wav"%(idx,text),audio,32000)
# sf.write(r"D:\BaiduNetdiskDownload\gsv\output\年下\%04d-%s.wav"%(idx,text),audio,32000)
# NOTE(review): the sample rate is hard-coded to 32000 instead of being read
# from hps.data.sampling_rate, and the raw text goes into the file name
# unsanitized -- confirm both are intended.
sf.write(r"D:\BaiduNetdiskDownload\gsv\output\萧逸第4批\%04d-%s.wav"%(idx,text),audio,32000)
# def handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language):
# if command == "/restart":
# os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
# elif command == "/exit":
# os.kill(os.getpid(), signal.SIGTERM)
# exit(0)
#
# if (
# refer_wav_path == "" or refer_wav_path is None
# or prompt_text == "" or prompt_text is None
# or prompt_language == "" or prompt_language is None
# ):
# refer_wav_path, prompt_text, prompt_language = (
# default_refer_path,
# default_refer_text,
# default_refer_language,
# )
# if not has_preset:
# raise HTTPException(status_code=400, detail="未指定参考音频且接口无预设")
#
# with torch.no_grad():
# gen = get_tts_wav(
# refer_wav_path, prompt_text, prompt_language, text, text_language
# )
# sampling_rate, audio_data = next(gen)
#
# wav = BytesIO()
# sf.write(wav, audio_data, sampling_rate, format="wav")
# wav.seek(0)
#
# torch.cuda.empty_cache()
# return StreamingResponse(wav, media_type="audio/wav")
# app = FastAPI()
#
#
# @app.post("/")
# async def tts_endpoint(request: Request):
# json_post_raw = await request.json()
# return handle(
# json_post_raw.get("command"),
# json_post_raw.get("refer_wav_path"),
# json_post_raw.get("prompt_text"),
# json_post_raw.get("prompt_language"),
# json_post_raw.get("text"),
# json_post_raw.get("text_language"),
# )
#
#
# @app.get("/")
# async def tts_endpoint(
# command: str = None,
# refer_wav_path: str = None,
# prompt_text: str = None,
# prompt_language: str = None,
# text: str = None,
# text_language: str = None,
# ):
# return handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language)
#
#
# if __name__ == "__main__":
# uvicorn.run(app, host=host, port=port, workers=1)

View File

@ -1,7 +1,93 @@
import sys
import os
import torch
import torch,re
from tools.i18n.i18n import I18nAuto, scan_language_list
i18n = I18nAuto(language=os.environ["language"])
# Path of the pretrained SoVITS generator checkpoint for each model version.
pretrained_sovits_name = {
"v1":"GPT_SoVITS/pretrained_models/s2G488k.pth",
"v2":"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"v3":"GPT_SoVITS/pretrained_models/s2Gv3.pth",### v3/v4 would also need a vocoder check; skipped
"v4":"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
"v2Pro":"GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro_pre1.pth",
"v2ProPlus":"GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus_pre1.pth",
}
# Path of the pretrained GPT checkpoint for each model version; v3, v4,
# v2Pro and v2ProPlus all point at the same s1v3.ckpt file.
pretrained_gpt_name = {
"v1":"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"v2":"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"v3":"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v4":"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2Pro":"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2ProPlus":"GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
name2sovits_path={
# i18n("不训练直接推v1底模"): "GPT_SoVITS/pretrained_models/s2G488k.pth",
i18n("不训练直接推v2底模"): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
# i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s2Gv3.pth",
# i18n("不训练直接推v4底模"): "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
i18n("不训练直接推v2Pro底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro_pre1.pth",
i18n("不训练直接推v2ProPlus底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus_pre1.pth",
}
name2gpt_path={
# i18n("不训练直接推v1底模"):"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
i18n("不训练直接推v2底模"):"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
i18n("不训练直接推v3底模"):"GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4", "SoVITS_weights_v2Pro", "SoVITS_weights_v2ProPlus"]
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4", "GPT_weights_v2Pro", "GPT_weights_v2ProPlus"]
SoVITS_weight_version2root={
"v1":"SoVITS_weights",
"v2":"SoVITS_weights_v2",
"v3":"SoVITS_weights_v3",
"v4":"SoVITS_weights_v4",
"v2Pro":"SoVITS_weights_v2Pro",
"v2ProPlus":"SoVITS_weights_v2ProPlus",
}
GPT_weight_version2root={
"v1":"GPT_weights",
"v2":"GPT_weights_v2",
"v3":"GPT_weights_v3",
"v4":"GPT_weights_v4",
"v2Pro":"GPT_weights_v2Pro",
"v2ProPlus":"GPT_weights_v2ProPlus",
}
def custom_sort_key(s):
    """Return a natural-sort key for the string *s*.

    The string is split into alternating non-digit and digit runs; digit runs
    are converted to ``int`` so that, for example, ``"e9"`` sorts before
    ``"e10"``.

    Returns:
        A list that starts with a (possibly empty) string and then alternates
        between ``int`` and ``str`` items.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    parts = re.split(r"(\d+)", s)
    # With a single capturing group, re.split places the captured digit runs
    # at the odd indices. Converting by position instead of str.isdigit()
    # avoids calling int() on characters such as "²", which isdigit() accepts
    # but int() rejects.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]
def _collect_weight_files(roots, suffix):
    """Return "root/name" for each entry in *roots* whose name ends with *suffix*."""
    found = []
    for root in roots:
        # A weights folder that has not been created yet contributes nothing;
        # os.listdir would raise on it and take the whole listing down.
        if not os.path.isdir(root):
            continue
        found.extend("%s/%s" % (root, name) for name in os.listdir(root) if name.endswith(suffix))
    return found


def get_weights_names():
    """List the selectable SoVITS and GPT weights.

    Each list holds the display names from ``name2sovits_path`` /
    ``name2gpt_path`` whose checkpoint file exists, followed by the ``.pth`` /
    ``.ckpt`` files found in the weight root folders. Both lists are sorted
    with ``custom_sort_key``.

    Returns:
        A tuple ``(SoVITS_names, GPT_names)``.
    """
    SoVITS_names = [key for key, path in name2sovits_path.items() if os.path.exists(path)]
    SoVITS_names += _collect_weight_files(SoVITS_weight_root, ".pth")
    GPT_names = [key for key, path in name2gpt_path.items() if os.path.exists(path)]
    GPT_names += _collect_weight_files(GPT_weight_root, ".ckpt")
    SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
    GPT_names = sorted(GPT_names, key=custom_sort_key)
    return SoVITS_names, GPT_names
def change_choices():
    """Return two "update" payloads with the current SoVITS and GPT choices.

    The choice lists come straight from ``get_weights_names()``.
    """
    sovits_choices, gpt_choices = get_weights_names()
    sovits_update = {"choices": sovits_choices, "__type__": "update"}
    gpt_update = {"choices": gpt_choices, "__type__": "update"}
    return sovits_update, gpt_update
# Model selected for inference
sovits_path = ""

241
webui.py
View File

@ -1,10 +1,6 @@
import os
import sys
if len(sys.argv) == 1:
sys.argv.append("v2")
version = "v1" if sys.argv[1] == "v1" else "v2"
os.environ["version"] = version
os.environ["version"] = version="v2Pro"
now_dir = os.getcwd()
sys.path.insert(0, now_dir)
import warnings
@ -63,7 +59,11 @@ for site_packages_root in site_packages_roots:
import shutil
import subprocess
from subprocess import Popen
from tools.i18n.i18n import I18nAuto, scan_language_list
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
os.environ["language"] = language
i18n = I18nAuto(language=language)
from config import (
exp_root,
infer_device,
@ -76,11 +76,6 @@ from config import (
webui_port_uvr5,
)
from tools import my_utils
from tools.i18n.i18n import I18nAuto, scan_language_list
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
os.environ["language"] = language
i18n = I18nAuto(language=language)
from multiprocessing import cpu_count
from tools.my_utils import check_details, check_for_existance
@ -232,86 +227,32 @@ def fix_gpu_numbers(inputs):
return inputs
pretrained_sovits_name = [
"GPT_SoVITS/pretrained_models/s2G488k.pth",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"GPT_SoVITS/pretrained_models/s2Gv3.pth",
"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
]
pretrained_gpt_name = [
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
]
from config import pretrained_sovits_name,pretrained_gpt_name
pretrained_model_list = (
pretrained_sovits_name[int(version[-1]) - 1],
pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"),
pretrained_gpt_name[int(version[-1]) - 1],
"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
"GPT_SoVITS/pretrained_models/chinese-hubert-base",
)
def check_pretrained_is_exist(version):
    """Print a warning that lists the pretrained files missing for *version*.

    Checked are the SoVITS generator checkpoint, its discriminator counterpart
    (same path with "s2G" replaced by "s2D"), the GPT checkpoint, and the
    roberta and hubert model folders. Discriminator paths containing "s2Dv3"
    or "s2Dv4" are not checked. Nothing is printed when everything exists.
    """
    sovits_generator = pretrained_sovits_name[version]
    required = (
        sovits_generator,
        sovits_generator.replace("s2G", "s2D"),
        pretrained_gpt_name[version],
        "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        "GPT_SoVITS/pretrained_models/chinese-hubert-base",
    )
    missing = [
        path
        for path in required
        if "s2Dv3" not in path and "s2Dv4" not in path and not os.path.exists(path)
    ]
    if missing:
        print("warning: ", i18n("以下模型不存在:") + "".join(f"\n {path}" for path in missing))
check_pretrained_is_exist(version)
# Blank out (in place) every pretrained path whose file does not exist, so
# later code can test for "" instead of touching the filesystem again.
for table in (pretrained_sovits_name, pretrained_gpt_name):
    for key, path in table.items():
        if not os.path.exists(path):
            table[key] = ""
_ = ""
for i in pretrained_model_list:
if "s2Dv3" not in i and os.path.exists(i) == False:
_ += f"\n {i}"
if _:
print("warning: ", i18n("以下模型不存在:") + _)
_ = [[], []]
for i in range(4):
if os.path.exists(pretrained_gpt_name[i]):
_[0].append(pretrained_gpt_name[i])
else:
_[0].append("") ##没有下pretrained模型的说不定他们是想自己从零训底模呢
if os.path.exists(pretrained_sovits_name[i]):
_[-1].append(pretrained_sovits_name[i])
else:
_[-1].append("")
pretrained_gpt_name, pretrained_sovits_name = _
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
from config import SoVITS_weight_root,GPT_weight_root,get_weights_names,change_choices,SoVITS_weight_version2root,GPT_weight_version2root
for root in SoVITS_weight_root + GPT_weight_root:
os.makedirs(root, exist_ok=True)
def get_weights_names():
    """List the selectable SoVITS and GPT weights.

    Each list starts with the non-empty entries of ``pretrained_sovits_name``
    / ``pretrained_gpt_name`` and continues with the ``.pth`` / ``.ckpt``
    files found in the weight root folders, as "root/name".

    Returns:
        A tuple ``(SoVITS_names, GPT_names)``.
    """
    SoVITS_names = list(filter(lambda name: name != "", pretrained_sovits_name))
    SoVITS_names += [
        "%s/%s" % (path, name)
        for path in SoVITS_weight_root
        for name in os.listdir(path)
        if name.endswith(".pth")
    ]
    GPT_names = list(filter(lambda name: name != "", pretrained_gpt_name))
    GPT_names += [
        "%s/%s" % (path, name)
        for path in GPT_weight_root
        for name in os.listdir(path)
        if name.endswith(".ckpt")
    ]
    return SoVITS_names, GPT_names
SoVITS_names, GPT_names = get_weights_names()
for path in SoVITS_weight_root + GPT_weight_root:
os.makedirs(path, exist_ok=True)
def custom_sort_key(s):
    """Return a natural-sort key for the string *s*.

    The string is split into alternating non-digit and digit runs; digit runs
    are converted to ``int`` so that, for example, ``"e9"`` sorts before
    ``"e10"``.

    Returns:
        A list that starts with a (possibly empty) string and then alternates
        between ``int`` and ``str`` items.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    parts = re.split(r"(\d+)", s)
    # With a single capturing group, re.split places the captured digit runs
    # at the odd indices. Converting by position instead of str.isdigit()
    # avoids calling int() on characters such as "²", which isdigit() accepts
    # but int() rejects.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]
def change_choices():
    """Return two "update" payloads with the naturally sorted weight choices.

    The names come from ``get_weights_names()`` and are ordered with
    ``custom_sort_key``.
    """
    sovits_found, gpt_found = get_weights_names()
    sovits_update = {"choices": sorted(sovits_found, key=custom_sort_key), "__type__": "update"}
    gpt_update = {"choices": sorted(gpt_found, key=custom_sort_key), "__type__": "update"}
    return sovits_update, gpt_update
p_label = None
p_uvr5 = None
@ -450,8 +391,8 @@ def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, so
# if version=="v3":
# cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
if p_tts_inference is None:
os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path)
os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path)
os.environ["gpt_path"] = gpt_path
os.environ["sovits_path"] = sovits_path
os.environ["cnhubert_base_path"] = cnhubert_base_path
os.environ["bert_path"] = bert_path
os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_number(gpu_number)
@ -599,6 +540,7 @@ process_name_sovits = i18n("SoVITS训练")
def open1Ba(
version,
batch_size,
total_epoch,
exp_name,
@ -614,7 +556,8 @@ def open1Ba(
):
global p_train_SoVITS
if p_train_SoVITS == None:
with open("GPT_SoVITS/configs/s2.json") as f:
config_file="GPT_SoVITS/configs/s2.json" if version not in {"v2Pro","v2ProPlus"}else f"GPT_SoVITS/configs/s2{version}.json"
with open(config_file) as f:
data = f.read()
data = json.loads(data)
s2_dir = "%s/%s" % (exp_root, exp_name)
@ -637,13 +580,13 @@ def open1Ba(
data["train"]["lora_rank"] = lora_rank
data["model"]["version"] = version
data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir
data["save_weight_dir"] = SoVITS_weight_root[int(version[-1]) - 1]
data["save_weight_dir"] = SoVITS_weight_version2root[version]
data["name"] = exp_name
data["version"] = version
tmp_config_path = "%s/tmp_s2.json" % tmp
with open(tmp_config_path, "w") as f:
f.write(json.dumps(data))
if version in ["v1", "v2"]:
if version in ["v1", "v2","v2Pro","v2ProPlus"]:
cmd = '"%s" -s GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path)
else:
cmd = '"%s" -s GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path)
@ -724,7 +667,7 @@ def open1Bb(
data["train"]["if_save_every_weights"] = if_save_every_weights
data["train"]["if_save_latest"] = if_save_latest
data["train"]["if_dpo"] = if_dpo
data["train"]["half_weights_save_dir"] = GPT_weight_root[int(version[-1]) - 1]
data["train"]["half_weights_save_dir"] = GPT_weight_version2root[version]
data["train"]["exp_name"] = exp_name
data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir
data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir
@ -964,12 +907,10 @@ def close1a():
{"__type__": "update", "visible": False},
)
# Checkpoint path exported to the dataset-preparation scripts as "sv_path".
# Forward slashes are used because they work on every platform; backslashes
# in a normal string literal are invalid escape sequences ("\p", "\s") and
# produce a path that cannot be resolved on POSIX systems.
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
ps1b = []
process_name_1b = i18n("语音自监督特征提取")
def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir):
def open1b(version,inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir):
global ps1b
inp_text = my_utils.clean_path(inp_text)
inp_wav_dir = my_utils.clean_path(inp_wav_dir)
@ -982,6 +923,7 @@ def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir):
"exp_name": exp_name,
"opt_dir": "%s/%s" % (exp_root, exp_name),
"cnhubert_base_dir": ssl_pretrained_dir,
"sv_path": sv_path,
"is_half": str(is_half),
}
gpu_names = gpu_numbers.split("-")
@ -1007,6 +949,23 @@ def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir):
for p in ps1b:
p.wait()
ps1b = []
if "Pro"in version:
for i_part in range(all_parts):
config.update(
{
"i_part": str(i_part),
"all_parts": str(all_parts),
"_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]),
}
)
os.environ.update(config)
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-sv.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1b.append(p)
for p in ps1b:
p.wait()
ps1b = []
yield (
process_info(process_name_1b, "finish"),
{"__type__": "update", "visible": True},
@ -1040,19 +999,20 @@ ps1c = []
process_name_1c = i18n("语义Token提取")
def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path):
def open1c(version,inp_text,inp_wav_dir, exp_name, gpu_numbers, pretrained_s2G_path):
global ps1c
inp_text = my_utils.clean_path(inp_text)
if check_for_existance([inp_text, ""], is_dataset_processing=True):
check_details([inp_text, ""], is_dataset_processing=True)
if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True):
check_details([inp_text, inp_wav_dir], is_dataset_processing=True)
if ps1c == []:
opt_dir = "%s/%s" % (exp_root, exp_name)
config_file="GPT_SoVITS/configs/s2.json" if version not in {"v2Pro","v2ProPlus"}else f"GPT_SoVITS/configs/s2{version}.json"
config = {
"inp_text": inp_text,
"exp_name": exp_name,
"opt_dir": opt_dir,
"pretrained_s2G": pretrained_s2G_path,
"s2config_path": "GPT_SoVITS/configs/s2.json",
"s2config_path": config_file,
"is_half": str(is_half),
}
gpu_names = gpu_numbers.split("-")
@ -1121,6 +1081,7 @@ process_name_1abc = i18n("训练集格式化一键三连")
def open1abc(
version,
inp_text,
inp_wav_dir,
exp_name,
@ -1198,6 +1159,7 @@ def open1abc(
"exp_name": exp_name,
"opt_dir": opt_dir,
"cnhubert_base_dir": ssl_pretrained_dir,
"sv_path": sv_path,
}
gpu_names = gpu_numbers1Ba.split("-")
all_parts = len(gpu_names)
@ -1221,23 +1183,41 @@ def open1abc(
)
for p in ps1abc:
p.wait()
ps1abc=[]
if "Pro" in version:
for i_part in range(all_parts):
config.update(
{
"i_part": str(i_part),
"all_parts": str(all_parts),
"_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]),
}
)
os.environ.update(config)
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-sv.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1abc.append(p)
for p in ps1abc:
p.wait()
ps1abc = []
yield (
i18n("进度") + ": 1A-Done, 1B-Done",
{"__type__": "update", "visible": False},
{"__type__": "update", "visible": True},
)
ps1abc = []
#############################1c
path_semantic = "%s/6-name2semantic.tsv" % opt_dir
if os.path.exists(path_semantic) == False or (
os.path.exists(path_semantic) == True and os.path.getsize(path_semantic) < 31
):
config_file = "GPT_SoVITS/configs/s2.json" if version not in {"v2Pro", "v2ProPlus"} else f"GPT_SoVITS/configs/s2{version}.json"
config = {
"inp_text": inp_text,
"exp_name": exp_name,
"opt_dir": opt_dir,
"pretrained_s2G": pretrained_s2G_path,
"s2config_path": "GPT_SoVITS/configs/s2.json",
"s2config_path": config_file,
}
gpu_names = gpu_numbers1c.split("-")
all_parts = len(gpu_names)
@ -1317,17 +1297,17 @@ def switch_version(version_):
os.environ["version"] = version_
global version
version = version_
if pretrained_sovits_name[int(version[-1]) - 1] != "" and pretrained_gpt_name[int(version[-1]) - 1] != "":
if pretrained_sovits_name[version] != "" and pretrained_gpt_name[version] != "":
...
else:
gr.Warning(i18n("未下载模型") + ": " + version.upper())
set_default()
return (
{"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]},
{"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D")},
{"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]},
{"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]},
{"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]},
{"__type__": "update", "value": pretrained_sovits_name[version]},
{"__type__": "update", "value": pretrained_sovits_name[version].replace("s2G", "s2D")},
{"__type__": "update", "value": pretrained_gpt_name[version]},
{"__type__": "update", "value": pretrained_gpt_name[version]},
{"__type__": "update", "value": pretrained_sovits_name[version]},
{"__type__": "update", "value": default_batch_size, "maximum": default_max_batch_size},
{"__type__": "update", "value": default_sovits_epoch, "maximum": max_sovits_epoch},
{"__type__": "update", "value": default_sovits_save_every_epoch, "maximum": max_sovits_save_every_epoch},
@ -1357,10 +1337,7 @@ def sync(text):
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
gr.Markdown(
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ "<br>"
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
)
gr.Markdown(value=i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")+ "<br>"+i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
with gr.Tabs():
with gr.TabItem("0-" + i18n("前置数据集获取工具")): # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
@ -1419,8 +1396,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
value=process_info(process_name_slice, "close"), variant="primary", visible=False
)
gr.Markdown(value="0bb-" + i18n("语音降噪工具")+i18n("(不稳定,先别用,可能劣化模型效果!)"))
with gr.Row():
# gr.Markdown(value="0bb-" + i18n("语音降噪工具")+i18n("(不稳定,先别用,可能劣化模型效果!)"))
with gr.Row(visible=False):
with gr.Column(scale=3):
with gr.Row():
denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="")
@ -1512,33 +1489,33 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
with gr.Row():
with gr.Row():
exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"]) # , "v3"
with gr.Row():
exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True,scale=3,)
gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False,scale=5,)
version_checkbox = gr.Radio(label=i18n("训练模型的版本"), value=version, choices=["v1","v2", "v4", "v2Pro", "v2ProPlus"],scale=5,)
# version_checkbox = gr.Radio(label=i18n("训练模型的版本"), value=version, choices=["v2", "v4", "v2Pro", "v2ProPlus", "v2ProMax"],scale=5,)
pretrained_s2G = gr.Textbox(
label=i18n("预训练SoVITS-G模型路径"),
value=pretrained_sovits_name[int(version[-1]) - 1],
value=pretrained_sovits_name[version],
interactive=True,
lines=2,
max_lines=3,
scale=9,
scale=5,
)
pretrained_s2D = gr.Textbox(
label=i18n("预训练SoVITS-D模型路径"),
value=pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"),
value=pretrained_sovits_name[version].replace("s2G", "s2D"),
interactive=True,
lines=2,
max_lines=3,
scale=9,
scale=5,
)
pretrained_s1 = gr.Textbox(
label=i18n("预训练GPT模型路径"),
value=pretrained_gpt_name[int(version[-1]) - 1],
value=pretrained_gpt_name[version],
interactive=True,
lines=2,
max_lines=3,
scale=10,
scale=5,
)
with gr.TabItem("1A-" + i18n("训练集格式化工具")):
@ -1623,7 +1600,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
with gr.Row():
pretrained_s2G_ = gr.Textbox(
label=i18n("预训练SoVITS-G模型路径"),
value=pretrained_sovits_name[int(version[-1]) - 1],
value=pretrained_sovits_name[version],
interactive=False,
lines=2,
)
@ -1688,17 +1665,18 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close])
button1b_open.click(
open1b,
[inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir],
[version_checkbox,inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir],
[info1b, button1b_open, button1b_close],
)
button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close])
button1c_open.click(
open1c, [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close]
open1c, [version_checkbox,inp_text, inp_wav_dir,exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close]
)
button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close])
button1abc_open.click(
open1abc,
[
version_checkbox,
inp_text,
inp_wav_dir,
exp_name,
@ -1862,21 +1840,21 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
with gr.TabItem("1C-" + i18n("推理")):
gr.Markdown(
value=i18n(
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。"
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。"
)
)
with gr.Row():
with gr.Row():
GPT_dropdown = gr.Dropdown(
label=i18n("GPT模型列表"),
choices=sorted(GPT_names, key=custom_sort_key),
value=pretrained_gpt_name[0],
choices=GPT_names,
value=GPT_names[-1],
interactive=True,
)
SoVITS_dropdown = gr.Dropdown(
label=i18n("SoVITS模型列表"),
choices=sorted(SoVITS_names, key=custom_sort_key),
value=pretrained_sovits_name[0],
choices=SoVITS_names,
value=SoVITS_names[0],
interactive=True,
)
with gr.Row():
@ -1924,6 +1902,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
button1Ba_open.click(
open1Ba,
[
version_checkbox,
batch_size,
total_epoch,
exp_name,