mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-10-07 07:02:57 +08:00
Merge branch 'RVC-Boss:main' into dev
This commit is contained in:
commit
3c7a65c0bc
@ -3,4 +3,6 @@ logs
|
||||
output
|
||||
reference
|
||||
SoVITS_weights
|
||||
.git
|
||||
GPT_weights
|
||||
TEMP
|
||||
.git
|
||||
|
@ -72,8 +72,6 @@ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时
|
||||
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
elif torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
else:
|
||||
device = "cpu"
|
||||
|
||||
@ -209,54 +207,8 @@ dict_language = {
|
||||
}
|
||||
|
||||
|
||||
def splite_en_inf(sentence, language):
|
||||
pattern = re.compile(r'[a-zA-Z ]+')
|
||||
textlist = []
|
||||
langlist = []
|
||||
pos = 0
|
||||
for match in pattern.finditer(sentence):
|
||||
start, end = match.span()
|
||||
if start > pos:
|
||||
textlist.append(sentence[pos:start])
|
||||
langlist.append(language)
|
||||
textlist.append(sentence[start:end])
|
||||
langlist.append("en")
|
||||
pos = end
|
||||
if pos < len(sentence):
|
||||
textlist.append(sentence[pos:])
|
||||
langlist.append(language)
|
||||
# Merge punctuation into previous word
|
||||
for i in range(len(textlist)-1, 0, -1):
|
||||
if re.match(r'^[\W_]+$', textlist[i]):
|
||||
textlist[i-1] += textlist[i]
|
||||
del textlist[i]
|
||||
del langlist[i]
|
||||
# Merge consecutive words with the same language tag
|
||||
i = 0
|
||||
while i < len(langlist) - 1:
|
||||
if langlist[i] == langlist[i+1]:
|
||||
textlist[i] += textlist[i+1]
|
||||
del textlist[i+1]
|
||||
del langlist[i+1]
|
||||
else:
|
||||
i += 1
|
||||
|
||||
return textlist, langlist
|
||||
|
||||
|
||||
def clean_text_inf(text, language):
|
||||
formattext = ""
|
||||
language = language.replace("all_","")
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
if language == "ja":
|
||||
if tmp["lang"] == language or tmp["lang"] == "zh":
|
||||
formattext += tmp["text"] + " "
|
||||
continue
|
||||
if tmp["lang"] == language:
|
||||
formattext += tmp["text"] + " "
|
||||
while " " in formattext:
|
||||
formattext = formattext.replace(" ", " ")
|
||||
phones, word2ph, norm_text = clean_text(formattext, language)
|
||||
phones, word2ph, norm_text = clean_text(text, language)
|
||||
phones = cleaned_text_to_sequence(phones)
|
||||
return phones, word2ph, norm_text
|
||||
|
||||
@ -274,55 +226,6 @@ def get_bert_inf(phones, word2ph, norm_text, language):
|
||||
return bert
|
||||
|
||||
|
||||
def nonen_clean_text_inf(text, language):
|
||||
if(language!="auto"):
|
||||
textlist, langlist = splite_en_inf(text, language)
|
||||
else:
|
||||
textlist=[]
|
||||
langlist=[]
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
langlist.append(tmp["lang"])
|
||||
textlist.append(tmp["text"])
|
||||
phones_list = []
|
||||
word2ph_list = []
|
||||
norm_text_list = []
|
||||
for i in range(len(textlist)):
|
||||
lang = langlist[i]
|
||||
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
|
||||
phones_list.append(phones)
|
||||
if lang == "zh":
|
||||
word2ph_list.append(word2ph)
|
||||
norm_text_list.append(norm_text)
|
||||
print(word2ph_list)
|
||||
phones = sum(phones_list, [])
|
||||
word2ph = sum(word2ph_list, [])
|
||||
norm_text = ' '.join(norm_text_list)
|
||||
|
||||
return phones, word2ph, norm_text
|
||||
|
||||
|
||||
def nonen_get_bert_inf(text, language):
|
||||
if(language!="auto"):
|
||||
textlist, langlist = splite_en_inf(text, language)
|
||||
else:
|
||||
textlist=[]
|
||||
langlist=[]
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
langlist.append(tmp["lang"])
|
||||
textlist.append(tmp["text"])
|
||||
print(textlist)
|
||||
print(langlist)
|
||||
bert_list = []
|
||||
for i in range(len(textlist)):
|
||||
lang = langlist[i]
|
||||
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
|
||||
bert = get_bert_inf(phones, word2ph, norm_text, lang)
|
||||
bert_list.append(bert)
|
||||
bert = torch.cat(bert_list, dim=1)
|
||||
|
||||
return bert
|
||||
|
||||
|
||||
splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
|
||||
|
||||
|
||||
@ -332,23 +235,59 @@ def get_first(text):
|
||||
return text
|
||||
|
||||
|
||||
def get_cleaned_text_final(text,language):
|
||||
def get_phones_and_bert(text,language):
|
||||
if language in {"en","all_zh","all_ja"}:
|
||||
phones, word2ph, norm_text = clean_text_inf(text, language)
|
||||
language = language.replace("all_","")
|
||||
if language == "en":
|
||||
LangSegment.setfilters(["en"])
|
||||
formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
|
||||
else:
|
||||
# 因无法区别中日文汉字,以用户输入为准
|
||||
formattext = text
|
||||
while " " in formattext:
|
||||
formattext = formattext.replace(" ", " ")
|
||||
phones, word2ph, norm_text = clean_text_inf(formattext, language)
|
||||
if language == "zh":
|
||||
bert = get_bert_feature(norm_text, word2ph).to(device)
|
||||
else:
|
||||
bert = torch.zeros(
|
||||
(1024, len(phones)),
|
||||
dtype=torch.float16 if is_half == True else torch.float32,
|
||||
).to(device)
|
||||
elif language in {"zh", "ja","auto"}:
|
||||
phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
|
||||
return phones, word2ph, norm_text
|
||||
textlist=[]
|
||||
langlist=[]
|
||||
LangSegment.setfilters(["zh","ja","en"])
|
||||
if language == "auto":
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
langlist.append(tmp["lang"])
|
||||
textlist.append(tmp["text"])
|
||||
else:
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
if tmp["lang"] == "en":
|
||||
langlist.append(tmp["lang"])
|
||||
else:
|
||||
# 因无法区别中日文汉字,以用户输入为准
|
||||
langlist.append(language)
|
||||
textlist.append(tmp["text"])
|
||||
print(textlist)
|
||||
print(langlist)
|
||||
phones_list = []
|
||||
bert_list = []
|
||||
norm_text_list = []
|
||||
for i in range(len(textlist)):
|
||||
lang = langlist[i]
|
||||
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
|
||||
bert = get_bert_inf(phones, word2ph, norm_text, lang)
|
||||
phones_list.append(phones)
|
||||
norm_text_list.append(norm_text)
|
||||
bert_list.append(bert)
|
||||
bert = torch.cat(bert_list, dim=1)
|
||||
phones = sum(phones_list, [])
|
||||
norm_text = ''.join(norm_text_list)
|
||||
|
||||
return phones,bert.to(dtype),norm_text
|
||||
|
||||
def get_bert_final(phones, word2ph, text,language,device):
|
||||
if language == "en":
|
||||
bert = get_bert_inf(phones, word2ph, text, language)
|
||||
elif language in {"zh", "ja","auto"}:
|
||||
bert = nonen_get_bert_inf(text, language)
|
||||
elif language == "all_zh":
|
||||
bert = get_bert_feature(text, word2ph).to(device)
|
||||
else:
|
||||
bert = torch.zeros((1024, len(phones))).to(device)
|
||||
return bert
|
||||
|
||||
def merge_short_text_in_array(texts, threshold):
|
||||
if (len(texts)) < 2:
|
||||
@ -425,8 +364,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
texts = merge_short_text_in_array(texts, 5)
|
||||
audio_opt = []
|
||||
if not ref_free:
|
||||
phones1, word2ph1, norm_text1=get_cleaned_text_final(prompt_text, prompt_language)
|
||||
bert1=get_bert_final(phones1, word2ph1, norm_text1,prompt_language,device).to(dtype)
|
||||
phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language)
|
||||
|
||||
for text in texts:
|
||||
# 解决输入目标文本的空行导致报错的问题
|
||||
@ -434,8 +372,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
continue
|
||||
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
||||
print(i18n("实际输入的目标文本(每句):"), text)
|
||||
phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
|
||||
bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
|
||||
phones2,bert2,norm_text2=get_phones_and_bert(text, text_language)
|
||||
print(i18n("前端处理后的文本(每句):"), norm_text2)
|
||||
if not ref_free:
|
||||
bert = torch.cat([bert1, bert2], 1)
|
||||
all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
|
||||
@ -623,12 +561,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
|
||||
with gr.Column():
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
|
||||
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT"))
|
||||
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
|
||||
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
|
||||
prompt_language = gr.Dropdown(
|
||||
label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
|
||||
)
|
||||
gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
|
||||
gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
|
||||
with gr.Row():
|
||||
text = gr.Textbox(label=i18n("需要合成的文本"), value="")
|
||||
text_language = gr.Dropdown(
|
||||
|
@ -99,7 +99,7 @@ for line in lines[int(i_part)::int(all_parts)]:
|
||||
try:
|
||||
# wav_name,text=line.split("\t")
|
||||
wav_name, spk_name, language, text = line.split("|")
|
||||
if (inp_wav_dir !=None):
|
||||
if (inp_wav_dir != "" and inp_wav_dir != None):
|
||||
wav_name = os.path.basename(wav_name)
|
||||
wav_path = "%s/%s"%(inp_wav_dir, wav_name)
|
||||
|
||||
|
@ -30,10 +30,12 @@ rep_map = {
|
||||
"\n": ".",
|
||||
"·": ",",
|
||||
"、": ",",
|
||||
# "...": "…",
|
||||
"...": "…",
|
||||
"$": ".",
|
||||
"/": ",",
|
||||
"—": "-",
|
||||
"~": "…",
|
||||
"~":"…",
|
||||
}
|
||||
|
||||
tone_modifier = ToneSandhi()
|
||||
|
@ -172,6 +172,21 @@ def replace_range(match) -> str:
|
||||
return result
|
||||
|
||||
|
||||
# ~至表达式
|
||||
RE_TO_RANGE = re.compile(
|
||||
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')
|
||||
|
||||
def replace_to_range(match) -> str:
    """Rewrite the tilde of a matched range expression as '至'.

    Args:
        match (re.Match): match object whose full matched text
            (group 0) is the range expression to rewrite.
    Returns:
        str: the matched text with every ASCII '~' replaced by '至'.
    """
    matched_text = match.group(0)
    return matched_text.replace('~', '至')
|
||||
|
||||
|
||||
def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
|
||||
stripped = value_string.lstrip('0')
|
||||
if len(stripped) == 0:
|
||||
|
@ -33,6 +33,7 @@ from .num import RE_NUMBER
|
||||
from .num import RE_PERCENTAGE
|
||||
from .num import RE_POSITIVE_QUANTIFIERS
|
||||
from .num import RE_RANGE
|
||||
from .num import RE_TO_RANGE
|
||||
from .num import replace_default_num
|
||||
from .num import replace_frac
|
||||
from .num import replace_negative_num
|
||||
@ -40,6 +41,7 @@ from .num import replace_number
|
||||
from .num import replace_percentage
|
||||
from .num import replace_positive_quantifier
|
||||
from .num import replace_range
|
||||
from .num import replace_to_range
|
||||
from .phonecode import RE_MOBILE_PHONE
|
||||
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
|
||||
from .phonecode import RE_TELEPHONE
|
||||
@ -65,7 +67,7 @@ class TextNormalizer():
|
||||
if lang == "zh":
|
||||
text = text.replace(" ", "")
|
||||
# 过滤掉特殊字符
|
||||
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text)
|
||||
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|\\]', '', text)
|
||||
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
|
||||
text = text.strip()
|
||||
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
|
||||
@ -73,8 +75,8 @@ class TextNormalizer():
|
||||
|
||||
def _post_replace(self, sentence: str) -> str:
|
||||
sentence = sentence.replace('/', '每')
|
||||
sentence = sentence.replace('~', '至')
|
||||
sentence = sentence.replace('~', '至')
|
||||
# sentence = sentence.replace('~', '至')
|
||||
# sentence = sentence.replace('~', '至')
|
||||
sentence = sentence.replace('①', '一')
|
||||
sentence = sentence.replace('②', '二')
|
||||
sentence = sentence.replace('③', '三')
|
||||
@ -128,6 +130,8 @@ class TextNormalizer():
|
||||
sentence = RE_TIME_RANGE.sub(replace_time, sentence)
|
||||
sentence = RE_TIME.sub(replace_time, sentence)
|
||||
|
||||
# 处理~波浪号作为至的替换
|
||||
sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
|
||||
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
|
||||
sentence = replace_measure(sentence)
|
||||
sentence = RE_FRAC.sub(replace_frac, sentence)
|
||||
|
152
GPT_SoVITS_Inference.ipynb
Normal file
152
GPT_SoVITS_Inference.ipynb
Normal file
@ -0,0 +1,152 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "himHYZmra7ix"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "e9b7iFV3dm1f"
|
||||
},
|
||||
"source": [
|
||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||
"%cd GPT-SoVITS\n",
|
||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||
"!pip install -r requirements.txt"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# @title Download pretrained models 下载预训练模型\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/tools/uvr5\n",
|
||||
"%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||
"!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
|
||||
"%cd /content/GPT-SoVITS/tools/damo_asr/models\n",
|
||||
"!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
|
||||
"!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
|
||||
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
|
||||
"# @title UVR5 pretrains 安装uvr5模型\n",
|
||||
"%cd /content/GPT-SoVITS/tools/uvr5\n",
|
||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||
"!git config core.sparseCheckout true\n",
|
||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title Create folder models 创建文件夹模型\n",
|
||||
"import os\n",
|
||||
"base_directory = \"/content/GPT-SoVITS\"\n",
|
||||
"folder_names = [\"SoVITS_weights\", \"GPT_weights\"]\n",
|
||||
"\n",
|
||||
"for folder_name in folder_names:\n",
|
||||
" if os.path.exists(os.path.join(base_directory, folder_name)):\n",
|
||||
" print(f\"The folder '{folder_name}' already exists. (文件夹'{folder_name}'已经存在。)\")\n",
|
||||
" else:\n",
|
||||
" os.makedirs(os.path.join(base_directory, folder_name))\n",
|
||||
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
|
||||
"\n",
|
||||
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "cPDEH-9czOJF"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
"import shutil\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"#@title Import model 导入模型 (HuggingFace)\n",
|
||||
"hf_link = 'https://huggingface.co/modelloosrvcc/Nagisa_Shingetsu_GPT-SoVITS/resolve/main/Nagisa.zip' #@param {type: \"string\"}\n",
|
||||
"\n",
|
||||
"output_path = '/content/'\n",
|
||||
"\n",
|
||||
"response = requests.get(hf_link)\n",
|
||||
"with open(output_path + 'file.zip', 'wb') as file:\n",
|
||||
" file.write(response.content)\n",
|
||||
"\n",
|
||||
"with zipfile.ZipFile(output_path + 'file.zip', 'r') as zip_ref:\n",
|
||||
" zip_ref.extractall(output_path)\n",
|
||||
"\n",
|
||||
"os.remove(output_path + \"file.zip\")\n",
|
||||
"\n",
|
||||
"source_directory = output_path\n",
|
||||
"SoVITS_destination_directory = '/content/GPT-SoVITS/SoVITS_weights'\n",
|
||||
"GPT_destination_directory = '/content/GPT-SoVITS/GPT_weights'\n",
|
||||
"\n",
|
||||
"for filename in os.listdir(source_directory):\n",
|
||||
" if filename.endswith(\".pth\"):\n",
|
||||
" source_path = os.path.join(source_directory, filename)\n",
|
||||
" destination_path = os.path.join(SoVITS_destination_directory, filename)\n",
|
||||
" shutil.move(source_path, destination_path)\n",
|
||||
"\n",
|
||||
"for filename in os.listdir(source_directory):\n",
|
||||
" if filename.endswith(\".ckpt\"):\n",
|
||||
" source_path = os.path.join(source_directory, filename)\n",
|
||||
" destination_path = os.path.join(GPT_destination_directory, filename)\n",
|
||||
" shutil.move(source_path, destination_path)\n",
|
||||
"\n",
|
||||
"print(f'Model downloaded. (模型已下载。)')"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "vbZY-LnM0tzq"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# @title launch WebUI 启动WebUI\n",
|
||||
"!/usr/local/bin/pip install ipykernel\n",
|
||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||
"%cd /content/GPT-SoVITS/\n",
|
||||
"!/usr/local/bin/python webui.py"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
@ -107,7 +107,7 @@ For Chinese ASR (additionally), download models from [Damo ASR Model](https://mo
|
||||
|
||||
If you are a Mac user, make sure you meet the following conditions for training and inferencing with GPU:
|
||||
|
||||
- Mac computers with Apple silicon or AMD GPUs
|
||||
- Mac computers with Apple silicon
|
||||
- macOS 12.3 or later
|
||||
- Xcode command-line tools installed by running `xcode-select --install`
|
||||
|
||||
|
54
api.py
54
api.py
@ -144,7 +144,7 @@ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="
|
||||
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
|
||||
|
||||
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps")
|
||||
parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
|
||||
parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
|
||||
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
|
||||
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
|
||||
parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
|
||||
@ -227,6 +227,44 @@ def is_full(*items): # 任意一项为空返回False
|
||||
return False
|
||||
return True
|
||||
|
||||
def change_sovits_weights(sovits_path):
    """Load a SoVITS checkpoint and make it the active synthesis model.

    Rebinds the module-level ``vq_model`` and ``hps`` and writes the chosen
    path to ``./sweight.txt``.

    Args:
        sovits_path: path of a checkpoint file that holds a ``"config"``
            entry and a ``"weight"`` state dict.
    """
    global vq_model, hps
    # SECURITY NOTE(review): torch.load unpickles the file, and this path can
    # arrive straight from the /set_model HTTP request. Only point it at
    # checkpoints from trusted locations.
    dict_s2 = torch.load(sovits_path, map_location="cpu")
    hps = DictToAttrRecursive(dict_s2["config"])
    hps.model.semantic_frame_rate = "25hz"
    vq_model = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model
    )
    # enc_q is dropped for every checkpoint whose path does not contain
    # "pretrained".
    if "pretrained" not in sovits_path:
        del vq_model.enc_q
    if is_half:
        vq_model = vq_model.half().to(device)
    else:
        vq_model = vq_model.to(device)
    vq_model.eval()
    # strict=False: the printed result lists any missing/unexpected keys.
    print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
    with open("./sweight.txt", "w", encoding="utf-8") as f:
        f.write(sovits_path)
|
||||
def change_gpt_weights(gpt_path):
    """Load a GPT (text-to-semantic) checkpoint and make it the active model.

    Rebinds the module-level ``hz``, ``max_sec``, ``t2s_model`` and ``config``
    and writes the chosen path to ``./gweight.txt``.

    Args:
        gpt_path: path of a checkpoint file that holds a ``"config"`` entry
            and a ``"weight"`` state dict.
    """
    global hz, max_sec, t2s_model, config
    hz = 50
    # SECURITY NOTE(review): torch.load unpickles the file, and this path can
    # arrive straight from the /set_model HTTP request. Only point it at
    # checkpoints from trusted locations.
    dict_s1 = torch.load(gpt_path, map_location="cpu")
    config = dict_s1["config"]
    max_sec = config["data"]["max_sec"]
    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
    t2s_model.load_state_dict(dict_s1["weight"])
    if is_half:
        t2s_model = t2s_model.half()
    t2s_model = t2s_model.to(device)
    t2s_model.eval()
    total = sum(param.nelement() for param in t2s_model.parameters())
    print("Number of parameter: %.2fM" % (total / 1e6))
    with open("./gweight.txt", "w", encoding="utf-8") as f:
        f.write(gpt_path)
|
||||
|
||||
|
||||
def get_bert_feature(text, word2ph):
|
||||
with torch.no_grad():
|
||||
@ -452,6 +490,20 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Added by clark, 2024-02-21.
# Allows the models to be swapped after start-up so that a single API
# instance can serve requests for different speakers.
@app.post("/set_model")
async def set_model(request: Request):
    """Switch the active GPT and SoVITS checkpoints at runtime.

    Expects a JSON object with the string fields ``gpt_model_path`` and
    ``sovits_model_path``. On success the module-level ``gpt_path`` and
    ``sovits_path`` are updated, both models are reloaded and ``"ok"`` is
    returned.

    Raises:
        HTTPException: status 400 when the body is not a JSON object or
            either path is missing, empty or not a string. Nothing is
            changed in that case.
    """
    # Local import: only the names FastAPI/Request are known to be imported
    # at module level from the visible part of this file.
    from fastapi import HTTPException

    global gpt_path, sovits_path
    json_post_raw = await request.json()
    if not isinstance(json_post_raw, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")
    new_gpt_path = json_post_raw.get("gpt_model_path")
    new_sovits_path = json_post_raw.get("sovits_model_path")
    # Validate before touching the globals so a bad request cannot leave
    # them set to None.
    for field, value in (("gpt_model_path", new_gpt_path), ("sovits_model_path", new_sovits_path)):
        if not isinstance(value, str) or not value:
            raise HTTPException(status_code=400, detail="%s must be a non-empty string" % field)
    gpt_path = new_gpt_path
    sovits_path = new_sovits_path
    print("gptpath"+gpt_path+";vitspath"+sovits_path)
    # NOTE(review): both loaders unpickle the given files via torch.load;
    # this endpoint should only be reachable by trusted clients.
    change_sovits_weights(sovits_path)
    change_gpt_weights(gpt_path)
    return "ok"
# ----- end of addition -----
|
||||
|
||||
@app.post("/control")
|
||||
async def control(request: Request):
|
||||
|
@ -19,8 +19,6 @@ exp_root = "logs"
|
||||
python_exec = sys.executable or "python"
|
||||
if torch.cuda.is_available():
|
||||
infer_device = "cuda"
|
||||
elif torch.backends.mps.is_available():
|
||||
infer_device = "mps"
|
||||
else:
|
||||
infer_device = "cpu"
|
||||
|
||||
|
@ -2,13 +2,20 @@
|
||||
|
||||
# 获取当前日期,格式为 YYYYMMDD
|
||||
DATE=$(date +%Y%m%d)
|
||||
# 获取最新的 Git commit 哈希值的前 7 位
|
||||
COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7)
|
||||
|
||||
# 构建 full 版本的镜像
|
||||
docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest .
|
||||
# 为同一个镜像添加带日期的标签
|
||||
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE
|
||||
# 为同一个镜像添加带当前代码库Commit哈希值的标签
|
||||
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH
|
||||
|
||||
# 构建 elite 版本的镜像
|
||||
|
||||
# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器)
|
||||
docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite .
|
||||
# 为同一个镜像添加带日期的标签
|
||||
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite
|
||||
# 为同一个镜像添加带当前代码库Commit哈希值的标签
|
||||
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite
|
||||
|
@ -125,6 +125,16 @@
|
||||
|
||||
2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475
|
||||
|
||||
### 20240221更新
|
||||
|
||||
1-数据处理添加语音降噪选项
|
||||
|
||||
2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
|
||||
|
||||
3-mac CPU推理更快因此把推理设备从mps改到CPU
|
||||
|
||||
4-colab修复不开启公网url
|
||||
|
||||
todolist:
|
||||
|
||||
1-中文多音字推理优化
|
||||
|
@ -49,7 +49,7 @@ _注意: numba==0.56.4 需要 python<3.11_
|
||||
|
||||
如果你是 Mac 用户,请先确保满足以下条件以使用 GPU 进行训练和推理:
|
||||
|
||||
- 搭载 Apple 芯片或 AMD GPU 的 Mac
|
||||
- 搭载 Apple 芯片的 Mac
|
||||
- macOS 12.3 或更高版本
|
||||
- 已通过运行`xcode-select --install`安装 Xcode command-line tools
|
||||
|
||||
|
@ -47,7 +47,7 @@ _注記: numba==0.56.4 は py<3.11 が必要です_
|
||||
|
||||
もしあなたが Mac ユーザーである場合、GPU を使用してトレーニングおよび推論を行うために以下の条件を満たしていることを確認してください:
|
||||
|
||||
- Apple シリコンまたは AMD GPU を搭載した Mac コンピューター
|
||||
- Apple シリコンを搭載した Mac コンピューター
|
||||
- macOS 12.3 以降
|
||||
- `xcode-select --install`を実行してインストールされた Xcode コマンドラインツール
|
||||
|
||||
|
@ -49,7 +49,7 @@ _참고: numba==0.56.4 는 python<3.11 을 필요로 합니다._
|
||||
|
||||
MacOS 사용자는 GPU를 사용하여 훈련 및 추론을 하려면 다음 조건을 충족해야 합니다:
|
||||
|
||||
- Apple 칩 또는 AMD GPU가 장착된 Mac
|
||||
- Apple 칩이 장착된 Mac
|
||||
- macOS 12.3 이상
|
||||
- `xcode-select --install`을 실행하여 Xcode command-line tools를 설치했습니다.
|
||||
|
||||
|
@ -8,8 +8,16 @@
|
||||
"是否开启UVR5-WebUI": "¿Habilitar UVR5-WebUI?",
|
||||
"UVR5进程输出信息": "Información de salida del proceso UVR5",
|
||||
"0b-语音切分工具": "0b-Herramienta de división de voz",
|
||||
".list标注文件的路径": "Ruta del archivo de anotación .list",
|
||||
"GPT模型列表": "Lista de modelos GPT",
|
||||
"SoVITS模型列表": "Lista de modelos SoVITS",
|
||||
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。": "Directorio donde se guardan los archivos de audio después del corte! Ruta completa del archivo de audio a leer = este directorio - nombre de archivo correspondiente a la forma de onda en el archivo de lista (no la ruta completa).",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
|
||||
"切分后的子音频的输出根目录": "Directorio raíz de salida de los sub-audios después de la división",
|
||||
"怎么切": "Cómo cortar",
|
||||
"不切": "No cortar",
|
||||
"凑四句一切": "Completa cuatro oraciones para rellenar todo",
|
||||
"按英文句号.切": "Cortar por puntos en inglés.",
|
||||
"threshold:音量小于这个值视作静音的备选切割点": "umbral: puntos de corte alternativos considerados como silencio si el volumen es menor que este valor",
|
||||
"min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: duración mínima de cada segmento, si el primer segmento es demasiado corto, se conecta continuamente con los siguientes hasta que supera este valor",
|
||||
"min_interval:最短切割间隔": "min_interval: intervalo mínimo de corte",
|
||||
|
@ -23,5 +23,5 @@ PyYAML
|
||||
psutil
|
||||
jieba_fast
|
||||
jieba
|
||||
LangSegment
|
||||
LangSegment>=0.2.0
|
||||
Faster_Whisper
|
29
tools/cmd-denoise.py
Normal file
29
tools/cmd-denoise.py
Normal file
@ -0,0 +1,29 @@
|
||||
import os,argparse
|
||||
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from tqdm import tqdm
|
||||
|
||||
path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
|
||||
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
|
||||
ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
|
||||
def execute_denoise(input_folder,output_folder):
    """Run the noise-suppression pipeline over every file in a folder.

    Args:
        input_folder: directory whose files are passed to the pipeline.
        output_folder: directory (created if missing) that receives one
            output per input, under the same file name.
    """
    os.makedirs(output_folder,exist_ok=True)
    # Sorted so the processing order is reproducible. Sub-directories are
    # skipped because the pipeline is called with one file path at a time.
    names = sorted(
        name for name in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, name))
    )
    for name in tqdm(names):
        ans(os.path.join(input_folder, name), output_path=os.path.join(output_folder, name))
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: denoise every file of one folder into another.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_folder", type=str, required=True,
                        help="Path to the folder containing WAV files.")
    parser.add_argument("-o", "--output_folder", type=str, required=True,
                        help="Output folder to store the denoised audio files.")
    # Not wired in yet: the value is parsed but never passed on.
    parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
                        help="fp16 or fp32 (currently ignored)")

    cmd = parser.parse_args()
    execute_denoise(
        input_folder = cmd.input_folder,
        output_folder = cmd.output_folder,
    )
|
2
tools/denoise-model/.gitignore
vendored
Normal file
2
tools/denoise-model/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
34
webui.py
34
webui.py
@ -117,6 +117,7 @@ def change_choices():
|
||||
p_label=None
|
||||
p_uvr5=None
|
||||
p_asr=None
|
||||
p_denoise=None
|
||||
p_tts_inference=None
|
||||
|
||||
def kill_proc_tree(pid, including_parent=True):
|
||||
@ -220,6 +221,29 @@ def close_asr():
|
||||
kill_process(p_asr.pid)
|
||||
p_asr=None
|
||||
return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
||||
def open_denoise(denoise_inp_dir, denoise_opt_dir):
    """Start the denoise tool as a subprocess and report progress to the UI.

    Generator used as a click handler: each yielded tuple is
    (status text, update for the "open" button, update for the "close" button).
    Only one denoise process may run at a time; it is tracked in the
    module-level ``p_denoise``.

    Args:
        denoise_inp_dir: folder with the audio files to denoise.
        denoise_opt_dir: folder that receives the denoised files.
    """
    global p_denoise
    if p_denoise is not None:
        yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
        return

    denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
    denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
    precision = "float16" if is_half else "float32"
    # Human-readable form, used only for the status message and the log.
    cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,precision)
    # The folders come from free-text UI fields, so the process is started
    # from an argument list rather than a shell string: the paths can never
    # be interpreted as shell syntax.
    args = [python_exec, "tools/cmd-denoise.py", "-i", denoise_inp_dir, "-o", denoise_opt_dir, "-p", precision]

    yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
    print(cmd)
    p_denoise = Popen(args)
    p_denoise.wait()
    p_denoise=None
    yield "语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
||||
|
||||
def close_denoise():
    """Terminate the running denoise subprocess, if there is one.

    Returns:
        tuple: (status text, update that shows the "open" button,
        update that hides the "close" button).
    """
    global p_denoise
    if p_denoise is not None:
        kill_process(p_denoise.pid)
        p_denoise=None
    return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
|
||||
|
||||
p_train_SoVITS=None
|
||||
def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
|
||||
@ -678,6 +702,13 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
|
||||
n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
|
||||
slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息"))
|
||||
gr.Markdown(value=i18n("0bb-语音降噪工具"))
|
||||
with gr.Row():
|
||||
open_denoise_button = gr.Button(i18n("开启语音降噪"), variant="primary",visible=True)
|
||||
close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
|
||||
denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
|
||||
denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
|
||||
denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
|
||||
gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
|
||||
with gr.Row():
|
||||
open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
|
||||
@ -740,6 +771,9 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
|
||||
open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
|
||||
close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
|
||||
open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
|
||||
close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])
|
||||
|
||||
with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
|
||||
with gr.Row():
|
||||
exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user