This commit is contained in:
林北1941783147 2024-03-01 18:54:40 +08:00
commit 4130c15d4d
8 changed files with 256 additions and 72 deletions

View File

@ -4,12 +4,30 @@ logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("asyncio").setLevel(logging.ERROR)
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
import pdb
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
gpt_path = os.environ.get(
"gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
)
sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
if os.path.exists("./gweight.txt"):
with open("./gweight.txt", 'r',encoding="utf-8") as file:
gweight_data = file.read()
gpt_path = os.environ.get(
"gpt_path", gweight_data)
else:
gpt_path = os.environ.get(
"gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
if os.path.exists("./sweight.txt"):
with open("./sweight.txt", 'r',encoding="utf-8") as file:
sweight_data = file.read()
sovits_path = os.environ.get("sovits_path", sweight_data)
else:
sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
# gpt_path = os.environ.get(
# "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
# )
# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
cnhubert_base_path = os.environ.get(
"cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base"
)
@ -62,7 +80,7 @@ def get_bert_feature(text, word2ph):
with torch.no_grad():
inputs = tokenizer(text, return_tensors="pt")
for i in inputs:
inputs[i] = inputs[i].to(device) #####输入是long不用管精度问题精度随bert_model
inputs[i] = inputs[i].to(device)
res = bert_model(**inputs, output_hidden_states=True)
res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
assert len(word2ph) == len(text)
@ -119,13 +137,15 @@ def change_sovits_weights(sovits_path):
n_speakers=hps.data.n_speakers,
**hps.model
)
del vq_model.enc_q
if("pretrained"not in sovits_path):
del vq_model.enc_q
if is_half == True:
vq_model = vq_model.half().to(device)
else:
vq_model = vq_model.to(device)
vq_model.eval()
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
with open("./sweight.txt","w",encoding="utf-8")as f:f.write(sovits_path)
change_sovits_weights(sovits_path)
def change_gpt_weights(gpt_path):
@ -142,6 +162,7 @@ def change_gpt_weights(gpt_path):
t2s_model.eval()
total = sum([param.nelement() for param in t2s_model.parameters()])
print("Number of parameter: %.2fM" % (total / 1e6))
with open("./gweight.txt","w",encoding="utf-8")as f:f.write(gpt_path)
change_gpt_weights(gpt_path)
def get_spepc(hps, filename):
@ -167,7 +188,84 @@ dict_language={
}
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
def splite_en_inf(sentence, language):
pattern = re.compile(r'[a-zA-Z. ]+')
textlist = []
langlist = []
pos = 0
for match in pattern.finditer(sentence):
start, end = match.span()
if start > pos:
textlist.append(sentence[pos:start])
langlist.append(language)
textlist.append(sentence[start:end])
langlist.append("en")
pos = end
if pos < len(sentence):
textlist.append(sentence[pos:])
langlist.append(language)
return textlist, langlist
def clean_text_inf(text, language):
phones, word2ph, norm_text = clean_text(text, language)
phones = cleaned_text_to_sequence(phones)
return phones, word2ph, norm_text
def get_bert_inf(phones, word2ph, norm_text, language):
if language == "zh":
bert = get_bert_feature(norm_text, word2ph).to(device)
else:
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
return bert
def nonen_clean_text_inf(text, language):
textlist, langlist = splite_en_inf(text, language)
phones_list = []
word2ph_list = []
norm_text_list = []
for i in range(len(textlist)):
lang = langlist[i]
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
phones_list.append(phones)
if lang == "en" or "ja":
pass
else:
word2ph_list.append(word2ph)
norm_text_list.append(norm_text)
print(word2ph_list)
phones = sum(phones_list, [])
word2ph = sum(word2ph_list, [])
norm_text = ' '.join(norm_text_list)
return phones, word2ph, norm_text
def nonen_get_bert_inf(text, language):
textlist, langlist = splite_en_inf(text, language)
print(textlist)
print(langlist)
bert_list = []
for i in range(len(textlist)):
text = textlist[i]
lang = langlist[i]
phones, word2ph, norm_text = clean_text_inf(text, lang)
bert = get_bert_inf(phones, word2ph, norm_text, lang)
bert_list.append(bert)
bert = torch.cat(bert_list, dim=1)
return bert
#i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切")
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,how_to_cut=i18n("不切")):
t0 = ttime()
prompt_text = prompt_text.strip("\n")
prompt_language, text = prompt_language, text.strip("\n")
@ -196,27 +294,38 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
t1 = ttime()
prompt_language = dict_language[prompt_language]
text_language = dict_language[text_language]
phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
phones1 = cleaned_text_to_sequence(phones1)
texts = text.split("\n")
if prompt_language == "en":
phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
else:
phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language)
if(how_to_cut==i18n("凑五句一切")):text=cut1(text)
elif(how_to_cut==i18n("凑50字一切")):text=cut2(text)
elif(how_to_cut==i18n("按中文句号。切")):text=cut3(text)
elif(how_to_cut==i18n("按英文句号.切")):text=cut4(text)
text = text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n")
if(text[-1]not in splits):text+=""if text_language!="en"else "."
texts=text.split("\n")
audio_opt = []
if prompt_language == "en":
bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
else:
bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
for text in texts:
# 解决输入目标文本的空行导致报错的问题
if (len(text.strip()) == 0):
continue
phones2, word2ph2, norm_text2 = clean_text(text, text_language)
phones2 = cleaned_text_to_sequence(phones2)
if prompt_language == "zh":
bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
if text_language == "en":
phones2, word2ph2, norm_text2 = clean_text_inf(text, text_language)
else:
bert1 = torch.zeros(
(1024, len(phones1)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
if text_language == "zh":
bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
phones2, word2ph2, norm_text2 = nonen_clean_text_inf(text, text_language)
if text_language == "en":
bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
else:
bert2 = torch.zeros((1024, len(phones2))).to(bert1)
bert2 = nonen_get_bert_inf(text, text_language)
bert = torch.cat([bert1, bert2], 1)
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
@ -339,6 +448,9 @@ def cut2(inp):
def cut3(inp):
inp = inp.strip("\n")
return "\n".join(["%s" % item for item in inp.strip("").split("")])
def cut4(inp):
inp = inp.strip("\n")
return "\n".join(["%s." % item for item in inp.strip(".").split(".")])
def custom_sort_key(s):
# 使用正则表达式提取字符串中的数字部分和非数字部分
@ -445,17 +557,24 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
prompt_language = gr.Dropdown(
label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
)
gr.Markdown(value=i18n("*请填写需要合成的目标文本"))
gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
with gr.Row():
text = gr.Textbox(label=i18n("需要合成的文本"), value="")
text_language = gr.Dropdown(
label=i18n("需要合成的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
)
how_to_cut = gr.Radio(
label=i18n("怎么切"),
choices=[i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切"),],
value=i18n("凑50字一切"),
interactive=True,
)
inference_button = gr.Button(i18n("合成语音"), variant="primary")
output = gr.Audio(label=i18n("输出的语音"))
inference_button.click(
get_tts_wav,
[inp_ref, prompt_text, prompt_language, text, text_language],
[inp_ref, prompt_text, prompt_language, text, text_language,how_to_cut],
[output],
)
history_audio = []
@ -485,10 +604,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
button1 = gr.Button(i18n("凑五句一切"), variant="primary")
button2 = gr.Button(i18n("凑50字一切"), variant="primary")
button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
button1.click(cut1, [text_inp], [text_opt])
button2.click(cut2, [text_inp], [text_opt])
button3.click(cut3, [text_inp], [text_opt])
button4.click(cut4, [text_inp], [text_opt])
gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。"))
app.queue(concurrency_count=511, max_size=1022).launch(

View File

@ -47,12 +47,12 @@ if os.path.exists(txt_path) == False:
bert_dir = "%s/3-bert" % (opt_dir)
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(bert_dir, exist_ok=True)
if torch.cuda.is_available():
device = "cuda:0"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
if torch.cuda.is_available():
device = "cuda:0"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
if is_half == True:

View File

@ -38,12 +38,12 @@ semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
if os.path.exists(semantic_path) == False:
os.makedirs(opt_dir, exist_ok=True)
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
hps = utils.get_hparams_from_file(s2config_path)
vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1,

View File

@ -455,6 +455,35 @@ class ToneSandhi:
"电子",
"人人",
"虎虎",
"幺幺",
"干嘛",
"学子",
"哈哈",
"数数",
"袅袅",
"局地",
"以下",
"娃哈哈",
"花花草草",
"留得",
"耕地",
"想想",
"熙熙",
"攘攘",
"卵子",
"死死",
"冉冉",
"恳恳",
"佼佼",
"吵吵",
"打打",
"考考",
"整整",
"莘莘",
"落地",
"算子",
"家家户户",
"青青",
}
self.punc = ":,;。?!“”‘’':,;.?!"

3
api.py
View File

@ -13,7 +13,7 @@
`-dt` - `默认参考音频文本`
`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"`
`-d` - `推理设备, "cuda","cpu"`
`-d` - `推理设备, "cuda","cpu","mps"`
`-a` - `绑定地址, 默认"127.0.0.1"`
`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`
`-fp` - `覆盖 config.py 使用全精度`
@ -139,7 +139,6 @@ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps")
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")

View File

@ -29,6 +29,18 @@ webui_port_subfix = 9871
api_port = 9880
gpu_name = torch.cuda.get_device_name(0)
if (
("16" in gpu_name and "V100" not in gpu_name.upper())
or "P40" in gpu_name.upper()
or "P10" in gpu_name.upper()
or "1060" in gpu_name
or "1070" in gpu_name
or "1080" in gpu_name
):
is_half=False
if(infer_device=="cpu"):is_half=False
class Config:
def __init__(self):

View File

@ -29,3 +29,22 @@
3-优化模型文件排序逻辑
4-中文分词使用jieba_fast代替jieba
### 20240126更新
1-支持输出文本中英混合、日英混合
2-输出可选切分模式
3-修复uvr5读取到目录自动跳出的问题
4-修复多个换行导致推理报错
5-去除推理界面大量冗余log
6-支持mac训练推理
7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。

View File

@ -52,39 +52,32 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
paths = [path.name for path in paths]
for path in paths:
inp_path = os.path.join(inp_root, path)
need_reformat = 1
done = 0
if(os.path.isfile(inp_path)==False):continue
try:
y, sr = librosa.load(inp_path, sr=None)
info = sf.info(inp_path)
channels = info.channels
if channels == 2 and sr == 44100:
need_reformat = 0
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
)
done = 1
else:
done = 0
try:
y, sr = librosa.load(inp_path, sr=None)
info = sf.info(inp_path)
channels = info.channels
if channels == 2 and sr == 44100:
need_reformat = 0
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
)
done = 1
else:
need_reformat = 1
except:
need_reformat = 1
except:
need_reformat = 1
traceback.print_exc()
if need_reformat == 1:
tmp_path = "%s/%s.reformatted.wav" % (
os.path.join(os.environ["TEMP"]),
os.path.basename(inp_path),
)
y_resampled = librosa.resample(y, sr, 44100)
sf.write(tmp_path, y_resampled, 44100, "PCM_16")
inp_path = tmp_path
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
traceback.print_exc()
if need_reformat == 1:
tmp_path = "%s/%s.reformatted.wav" % (
os.path.join(os.environ["TEMP"]),
os.path.basename(inp_path),
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
y_resampled = librosa.resample(y, sr, 44100)
sf.write(tmp_path, y_resampled, 44100, "PCM_16")
inp_path = tmp_path
try:
if done == 0:
pre_fun._path_audio_(
@ -93,10 +86,21 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
except:
infos.append("Oh my god. %s->%s"%(os.path.basename(inp_path), traceback.format_exc()))
yield "\n".join(infos)
except:
infos.append(traceback.format_exc())
yield "\n".join(infos)