Merge branch 'RVC-Boss:main' into dev

commit 3c7a65c0bc
Author: 刘洋, 2024-02-22 22:42:50 +08:00 (committed via GitHub)
20 changed files with 386 additions and 133 deletions

.gitignore

@@ -3,4 +3,6 @@ logs
 output
 reference
 SoVITS_weights
-.git
+GPT_weights
+TEMP
+.git

GPT_SoVITS/inference_webui.py

@@ -72,8 +72,6 @@ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时
 if torch.cuda.is_available():
     device = "cuda"
-elif torch.backends.mps.is_available():
-    device = "mps"
 else:
     device = "cpu"
@@ -209,54 +207,8 @@ dict_language = {
 }

-def splite_en_inf(sentence, language):
-    pattern = re.compile(r'[a-zA-Z ]+')
-    textlist = []
-    langlist = []
-    pos = 0
-    for match in pattern.finditer(sentence):
-        start, end = match.span()
-        if start > pos:
-            textlist.append(sentence[pos:start])
-            langlist.append(language)
-        textlist.append(sentence[start:end])
-        langlist.append("en")
-        pos = end
-    if pos < len(sentence):
-        textlist.append(sentence[pos:])
-        langlist.append(language)
-    # Merge punctuation into previous word
-    for i in range(len(textlist)-1, 0, -1):
-        if re.match(r'^[\W_]+$', textlist[i]):
-            textlist[i-1] += textlist[i]
-            del textlist[i]
-            del langlist[i]
-    # Merge consecutive words with the same language tag
-    i = 0
-    while i < len(langlist) - 1:
-        if langlist[i] == langlist[i+1]:
-            textlist[i] += textlist[i+1]
-            del textlist[i+1]
-            del langlist[i+1]
-        else:
-            i += 1
-    return textlist, langlist

 def clean_text_inf(text, language):
-    formattext = ""
-    language = language.replace("all_","")
-    for tmp in LangSegment.getTexts(text):
-        if language == "ja":
-            if tmp["lang"] == language or tmp["lang"] == "zh":
-                formattext += tmp["text"] + " "
-            continue
-        if tmp["lang"] == language:
-            formattext += tmp["text"] + " "
-    while "  " in formattext:
-        formattext = formattext.replace("  ", " ")
-    phones, word2ph, norm_text = clean_text(formattext, language)
+    phones, word2ph, norm_text = clean_text(text, language)
     phones = cleaned_text_to_sequence(phones)
     return phones, word2ph, norm_text
@@ -274,55 +226,6 @@ def get_bert_inf(phones, word2ph, norm_text, language):
     return bert

-def nonen_clean_text_inf(text, language):
-    if(language!="auto"):
-        textlist, langlist = splite_en_inf(text, language)
-    else:
-        textlist=[]
-        langlist=[]
-        for tmp in LangSegment.getTexts(text):
-            langlist.append(tmp["lang"])
-            textlist.append(tmp["text"])
-    phones_list = []
-    word2ph_list = []
-    norm_text_list = []
-    for i in range(len(textlist)):
-        lang = langlist[i]
-        phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
-        phones_list.append(phones)
-        if lang == "zh":
-            word2ph_list.append(word2ph)
-        norm_text_list.append(norm_text)
-    print(word2ph_list)
-    phones = sum(phones_list, [])
-    word2ph = sum(word2ph_list, [])
-    norm_text = ' '.join(norm_text_list)
-    return phones, word2ph, norm_text

-def nonen_get_bert_inf(text, language):
-    if(language!="auto"):
-        textlist, langlist = splite_en_inf(text, language)
-    else:
-        textlist=[]
-        langlist=[]
-        for tmp in LangSegment.getTexts(text):
-            langlist.append(tmp["lang"])
-            textlist.append(tmp["text"])
-    print(textlist)
-    print(langlist)
-    bert_list = []
-    for i in range(len(textlist)):
-        lang = langlist[i]
-        phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
-        bert = get_bert_inf(phones, word2ph, norm_text, lang)
-        bert_list.append(bert)
-    bert = torch.cat(bert_list, dim=1)
-    return bert

 splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
@@ -332,23 +235,59 @@ def get_first(text):
     return text

-def get_cleaned_text_final(text,language):
+def get_phones_and_bert(text,language):
     if language in {"en","all_zh","all_ja"}:
-        phones, word2ph, norm_text = clean_text_inf(text, language)
+        language = language.replace("all_","")
+        if language == "en":
+            LangSegment.setfilters(["en"])
+            formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+        else:
+            # 因无法区别中日文汉字,以用户输入为准
+            formattext = text
+        while "  " in formattext:
+            formattext = formattext.replace("  ", " ")
+        phones, word2ph, norm_text = clean_text_inf(formattext, language)
+        if language == "zh":
+            bert = get_bert_feature(norm_text, word2ph).to(device)
+        else:
+            bert = torch.zeros(
+                (1024, len(phones)),
+                dtype=torch.float16 if is_half == True else torch.float32,
+            ).to(device)
     elif language in {"zh", "ja","auto"}:
-        phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
-    return phones, word2ph, norm_text
+        textlist=[]
+        langlist=[]
+        LangSegment.setfilters(["zh","ja","en"])
+        if language == "auto":
+            for tmp in LangSegment.getTexts(text):
+                langlist.append(tmp["lang"])
+                textlist.append(tmp["text"])
+        else:
+            for tmp in LangSegment.getTexts(text):
+                if tmp["lang"] == "en":
+                    langlist.append(tmp["lang"])
+                else:
+                    # 因无法区别中日文汉字,以用户输入为准
+                    langlist.append(language)
+                textlist.append(tmp["text"])
+        print(textlist)
+        print(langlist)
+        phones_list = []
+        bert_list = []
+        norm_text_list = []
+        for i in range(len(textlist)):
+            lang = langlist[i]
+            phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+            bert = get_bert_inf(phones, word2ph, norm_text, lang)
+            phones_list.append(phones)
+            norm_text_list.append(norm_text)
+            bert_list.append(bert)
+        bert = torch.cat(bert_list, dim=1)
+        phones = sum(phones_list, [])
+        norm_text = ''.join(norm_text_list)
+
+    return phones,bert.to(dtype),norm_text

-def get_bert_final(phones, word2ph, text,language,device):
-    if language == "en":
-        bert = get_bert_inf(phones, word2ph, text, language)
-    elif language in {"zh", "ja","auto"}:
-        bert = nonen_get_bert_inf(text, language)
-    elif language == "all_zh":
-        bert = get_bert_feature(text, word2ph).to(device)
-    else:
-        bert = torch.zeros((1024, len(phones))).to(device)
-    return bert

 def merge_short_text_in_array(texts, threshold):
     if (len(texts)) < 2:
@@ -425,8 +364,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     texts = merge_short_text_in_array(texts, 5)
     audio_opt = []
     if not ref_free:
-        phones1, word2ph1, norm_text1=get_cleaned_text_final(prompt_text, prompt_language)
-        bert1=get_bert_final(phones1, word2ph1, norm_text1,prompt_language,device).to(dtype)
+        phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language)

     for text in texts:
         # 解决输入目标文本的空行导致报错的问题
@@ -434,8 +372,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             continue
         if (text[-1] not in splits): text += "。" if text_language != "en" else "."
         print(i18n("实际输入的目标文本(每句):"), text)
-        phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
-        bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
+        phones2,bert2,norm_text2=get_phones_and_bert(text, text_language)
         print(i18n("前端处理后的文本(每句):"), norm_text2)
         if not ref_free:
             bert = torch.cat([bert1, bert2], 1)
         all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
@@ -623,12 +561,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
         inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
         with gr.Column():
             ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
-            gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT"))
+            gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
             prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
             prompt_language = gr.Dropdown(
                 label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
             )
-    gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
+    gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
     with gr.Row():
         text = gr.Textbox(label=i18n("需要合成的文本"), value="")
         text_language = gr.Dropdown(

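Taken together, the inference_webui.py hunks above collapse the old two-step front end (get_cleaned_text_final plus get_bert_final) into a single get_phones_and_bert call that segments mixed-language input with LangSegment and returns the phoneme IDs, the BERT feature matrix, and the normalized text in one pass. A minimal sketch of how the new call might be exercised (hypothetical input string; assumes it runs inside the module's process after the models are loaded, with LangSegment >= 0.2.0 installed):

    # Sketch only: run in inference_webui.py's context once its models are loaded.
    phones, bert, norm_text = get_phones_and_bert("Hello,这是一段中英混合文本。", "auto")
    print(len(phones))   # flat phoneme-ID sequence for the whole sentence
    print(bert.shape)    # (1024, len(phones)); zero features for non-Chinese segments
    print(norm_text)     # per-segment normalized text, re-joined in order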
GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py

@@ -99,7 +99,7 @@ for line in lines[int(i_part)::int(all_parts)]:
     try:
         # wav_name,text=line.split("\t")
         wav_name, spk_name, language, text = line.split("|")
-        if (inp_wav_dir !=None):
+        if (inp_wav_dir != "" and inp_wav_dir != None):
            wav_name = os.path.basename(wav_name)
            wav_path = "%s/%s"%(inp_wav_dir, wav_name)

GPT_SoVITS/text/chinese.py

@@ -30,10 +30,12 @@ rep_map = {
     "\n": ".",
     "·": ",",
     "、": ",",
-    # "...": "…",
+    "...": "…",
     "$": ".",
     "/": ",",
     "—": "-",
+    "~": "…",
+    "~": "…",
 }

 tone_modifier = ToneSandhi()

GPT_SoVITS/text/zh_normalization/num.py

@@ -172,6 +172,21 @@ def replace_range(match) -> str:
     return result

+# ~至表达式
+RE_TO_RANGE = re.compile(
+    r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')
+
+def replace_to_range(match) -> str:
+    """
+    Args:
+        match (re.Match)
+    Returns:
+        str
+    """
+    result = match.group(0).replace('~', '至').replace('~', '至')
+    return result

 def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
     stripped = value_string.lstrip('0')
     if len(stripped) == 0:

GPT_SoVITS/text/zh_normalization/text_normalization.py

@@ -33,6 +33,7 @@ from .num import RE_NUMBER
 from .num import RE_PERCENTAGE
 from .num import RE_POSITIVE_QUANTIFIERS
 from .num import RE_RANGE
+from .num import RE_TO_RANGE
 from .num import replace_default_num
 from .num import replace_frac
 from .num import replace_negative_num
@@ -40,6 +41,7 @@ from .num import replace_number
 from .num import replace_percentage
 from .num import replace_positive_quantifier
 from .num import replace_range
+from .num import replace_to_range
 from .phonecode import RE_MOBILE_PHONE
 from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
 from .phonecode import RE_TELEPHONE
@@ -65,7 +67,7 @@ class TextNormalizer():
         if lang == "zh":
             text = text.replace(" ", "")
             # 过滤掉特殊字符
-            text = re.sub(r'[——《》【】<=>{}()#&@“”^_|\\]', '', text)
+            text = re.sub(r'[——《》【】<=>{}()#&@“”^_|\\]', '', text)
             text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
             text = text.strip()
             sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
@@ -73,8 +75,8 @@
     def _post_replace(self, sentence: str) -> str:
         sentence = sentence.replace('/', '每')
-        sentence = sentence.replace('~', '至')
-        sentence = sentence.replace('~', '至')
+        # sentence = sentence.replace('~', '至')
+        # sentence = sentence.replace('~', '至')
         sentence = sentence.replace('①', '一')
         sentence = sentence.replace('②', '二')
         sentence = sentence.replace('③', '三')
@@ -128,6 +130,8 @@
         sentence = RE_TIME_RANGE.sub(replace_time, sentence)
         sentence = RE_TIME.sub(replace_time, sentence)
+        # 处理~波浪号作为至的替换
+        sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
         sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
         sentence = replace_measure(sentence)
         sentence = RE_FRAC.sub(replace_frac, sentence)

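The effect of the new rule: a tilde between two unit-bearing numbers is rewritten as a spoken range before the temperature and measure rules fire. A quick sketch (the import path below is an assumption; it mirrors how normalize_sentence applies the rule):

    # Sketch: tilde ranges such as "-3℃~5℃" become "-3℃至5℃" ("至" = "to").
    from GPT_SoVITS.text.zh_normalization.num import RE_TO_RANGE, replace_to_range

    print(RE_TO_RANGE.sub(replace_to_range, "气温在-3℃~5℃之间"))
    # expected output: 气温在-3℃至5℃之间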
GPT_SoVITS_Inference.ipynb (new file, 152 lines)

@@ -0,0 +1,152 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
],
"metadata": {
"id": "himHYZmra7ix"
}
},
{
"cell_type": "code",
"metadata": {
"id": "e9b7iFV3dm1f"
},
"source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n",
"!mkdir -p /content/GPT-SoVITS/tools/uvr5\n",
"%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
"%cd /content/GPT-SoVITS/tools/damo_asr/models\n",
"!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
"# @title UVR5 pretrains 安装uvr5模型\n",
"%cd /content/GPT-SoVITS/tools/uvr5\n",
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Create folder models 创建文件夹模型\n",
"import os\n",
"base_directory = \"/content/GPT-SoVITS\"\n",
"folder_names = [\"SoVITS_weights\", \"GPT_weights\"]\n",
"\n",
"for folder_name in folder_names:\n",
" if os.path.exists(os.path.join(base_directory, folder_name)):\n",
" print(f\"The folder '{folder_name}' already exists. (文件夹'{folder_name}'已经存在。)\")\n",
" else:\n",
" os.makedirs(os.path.join(base_directory, folder_name))\n",
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
"\n",
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
],
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import requests\n",
"import zipfile\n",
"import shutil\n",
"import os\n",
"\n",
"#@title Import model 导入模型 (HuggingFace)\n",
"hf_link = 'https://huggingface.co/modelloosrvcc/Nagisa_Shingetsu_GPT-SoVITS/resolve/main/Nagisa.zip' #@param {type: \"string\"}\n",
"\n",
"output_path = '/content/'\n",
"\n",
"response = requests.get(hf_link)\n",
"with open(output_path + 'file.zip', 'wb') as file:\n",
" file.write(response.content)\n",
"\n",
"with zipfile.ZipFile(output_path + 'file.zip', 'r') as zip_ref:\n",
" zip_ref.extractall(output_path)\n",
"\n",
"os.remove(output_path + \"file.zip\")\n",
"\n",
"source_directory = output_path\n",
"SoVITS_destination_directory = '/content/GPT-SoVITS/SoVITS_weights'\n",
"GPT_destination_directory = '/content/GPT-SoVITS/GPT_weights'\n",
"\n",
"for filename in os.listdir(source_directory):\n",
" if filename.endswith(\".pth\"):\n",
" source_path = os.path.join(source_directory, filename)\n",
" destination_path = os.path.join(SoVITS_destination_directory, filename)\n",
" shutil.move(source_path, destination_path)\n",
"\n",
"for filename in os.listdir(source_directory):\n",
" if filename.endswith(\".ckpt\"):\n",
" source_path = os.path.join(source_directory, filename)\n",
" destination_path = os.path.join(GPT_destination_directory, filename)\n",
" shutil.move(source_path, destination_path)\n",
"\n",
"print(f'Model downloaded. (模型已下载。)')"
],
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7",
"cellView": "form"
},
"execution_count": null,
"outputs": []
}
]
}

README.md

@@ -107,7 +107,7 @@ For Chinese ASR (additionally), download models from [Damo ASR Model](https://mo
 If you are a Mac user, make sure you meet the following conditions for training and inferencing with GPU:

-- Mac computers with Apple silicon or AMD GPUs
+- Mac computers with Apple silicon
 - macOS 12.3 or later
 - Xcode command-line tools installed by running `xcode-select --install`

api.py (54 lines changed)

@@ -144,7 +144,7 @@ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="
 parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
 parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps")
-parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
+parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
 parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
 parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
 parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
@@ -227,6 +227,44 @@ def is_full(*items): # 任意一项为空返回False
             return False
     return True

+def change_sovits_weights(sovits_path):
+    global vq_model, hps
+    dict_s2 = torch.load(sovits_path, map_location="cpu")
+    hps = dict_s2["config"]
+    hps = DictToAttrRecursive(hps)
+    hps.model.semantic_frame_rate = "25hz"
+    vq_model = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        n_speakers=hps.data.n_speakers,
+        **hps.model
+    )
+    if ("pretrained" not in sovits_path):
+        del vq_model.enc_q
+    if is_half == True:
+        vq_model = vq_model.half().to(device)
+    else:
+        vq_model = vq_model.to(device)
+    vq_model.eval()
+    print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
+    with open("./sweight.txt", "w", encoding="utf-8") as f:
+        f.write(sovits_path)
+
+def change_gpt_weights(gpt_path):
+    global hz, max_sec, t2s_model, config
+    hz = 50
+    dict_s1 = torch.load(gpt_path, map_location="cpu")
+    config = dict_s1["config"]
+    max_sec = config["data"]["max_sec"]
+    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+    t2s_model.load_state_dict(dict_s1["weight"])
+    if is_half == True:
+        t2s_model = t2s_model.half()
+    t2s_model = t2s_model.to(device)
+    t2s_model.eval()
+    total = sum([param.nelement() for param in t2s_model.parameters()])
+    print("Number of parameter: %.2fM" % (total / 1e6))
+    with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path)

 def get_bert_feature(text, word2ph):
     with torch.no_grad():
@@ -452,6 +490,20 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
 app = FastAPI()

+#clark新增-----2024-02-21
+#可在启动后动态修改模型以此满足同一个api不同的朗读者请求
+@app.post("/set_model")
+async def set_model(request: Request):
+    json_post_raw = await request.json()
+    global gpt_path
+    gpt_path=json_post_raw.get("gpt_model_path")
+    global sovits_path
+    sovits_path=json_post_raw.get("sovits_model_path")
+    print("gptpath"+gpt_path+";vitspath"+sovits_path)
+    change_sovits_weights(sovits_path)
+    change_gpt_weights(gpt_path)
+    return "ok"
+# 新增-----end------

 @app.post("/control")
 async def control(request: Request):

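The new /set_model route lets one long-running API process switch speakers at runtime instead of being restarted with different weights. A minimal client sketch (hypothetical weight paths; the default port 9880 comes from the argument parser shown earlier):

    # Hypothetical client for /set_model; substitute real .ckpt/.pth weight paths.
    import requests

    resp = requests.post("http://127.0.0.1:9880/set_model", json={
        "gpt_model_path": "GPT_weights/my_speaker-e15.ckpt",
        "sovits_model_path": "SoVITS_weights/my_speaker_e8_s200.pth",
    })
    print(resp.text)  # "ok" once both models have been reloaded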
config.py

@@ -19,8 +19,6 @@ exp_root = "logs"
 python_exec = sys.executable or "python"
 if torch.cuda.is_available():
     infer_device = "cuda"
-elif torch.backends.mps.is_available():
-    infer_device = "mps"
 else:
     infer_device = "cpu"

dockerbuild.sh

@@ -2,13 +2,20 @@
 # 获取当前日期,格式为 YYYYMMDD
 DATE=$(date +%Y%m%d)
+# 获取最新的 Git commit 哈希值的前 7 位
+COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7)

 # 构建 full 版本的镜像
 docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest .
 # 为同一个镜像添加带日期的标签
 docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE
+# 为同一个镜像添加带当前代码库Commit哈希值的标签
+docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH

-# 构建 elite 版本的镜像
+# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器)
 docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite .
 # 为同一个镜像添加带日期的标签
 docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite
+# 为同一个镜像添加带当前代码库Commit哈希值的标签
+docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite

docs/cn/Changelog_CN.md

@@ -125,6 +125,16 @@
 2 - Fixed Chinese text front-end bugs https://github.com/RVC-Boss/GPT-SoVITS/issues/475

+### 20240221 update
+
+1 - Added a voice denoising option to data processing
+
+2 - Optimized the Chinese and Japanese text front end https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
+
+3 - Switched the Mac inference device from mps to CPU, since CPU inference is faster there
+
+4 - Fixed Colab not launching with a public URL
+
+todolist:
+
+1 - Optimize handling of Chinese polyphonic characters during inference

docs/cn/README.md

@@ -49,7 +49,7 @@ _Note: numba==0.56.4 requires python<3.11_
 If you are a Mac user, first make sure you meet the following conditions for training and inference with a GPU:

-- A Mac with an Apple silicon chip or an AMD GPU
+- A Mac with an Apple silicon chip
 - macOS 12.3 or later
 - Xcode command-line tools installed by running `xcode-select --install`

docs/ja/README.md

@@ -47,7 +47,7 @@ _Note: numba==0.56.4 requires py<3.11_
 If you are a Mac user, make sure you meet the following conditions for training and inference with a GPU:

-- A Mac with Apple silicon or an AMD GPU
+- A Mac with Apple silicon
 - macOS 12.3 or later
 - Xcode command-line tools installed by running `xcode-select --install`

docs/ko/README.md

@@ -49,7 +49,7 @@ _Note: numba==0.56.4 requires python<3.11._
 macOS users who want to train and infer with a GPU must meet the following conditions:

-- A Mac with an Apple chip or an AMD GPU
+- A Mac with an Apple chip
 - macOS 12.3 or later
 - Xcode command-line tools installed by running `xcode-select --install`

i18n/locale/es_ES.json

@@ -8,8 +8,16 @@
     "是否开启UVR5-WebUI": "¿Habilitar UVR5-WebUI?",
     "UVR5进程输出信息": "Información de salida del proceso UVR5",
     "0b-语音切分工具": "0b-Herramienta de división de voz",
+    ".list标注文件的路径": "Ruta del archivo de anotación .list",
+    "GPT模型列表": "Lista de modelos GPT",
+    "SoVITS模型列表": "Lista de modelos SoVITS",
+    "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。": "Directorio donde se guardan los archivos de audio después del corte! Ruta completa del archivo de audio a leer = este directorio - nombre de archivo correspondiente a la forma de onda en el archivo de lista (no la ruta completa).",
     "音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
     "切分后的子音频的输出根目录": "Directorio raíz de salida de los sub-audios después de la división",
+    "怎么切": "Cómo cortar",
+    "不切": "No cortar",
+    "凑四句一切": "Completa cuatro oraciones para rellenar todo",
+    "按英文句号.切": "Cortar por puntos en inglés.",
     "threshold:音量小于这个值视作静音的备选切割点": "umbral: puntos de corte alternativos considerados como silencio si el volumen es menor que este valor",
     "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: duración mínima de cada segmento, si el primer segmento es demasiado corto, se conecta continuamente con los siguientes hasta que supera este valor",
     "min_interval:最短切割间隔": "min_interval: intervalo mínimo de corte",

requirements.txt

@@ -23,5 +23,5 @@ PyYAML
 psutil
 jieba_fast
 jieba
-LangSegment
+LangSegment>=0.2.0
 Faster_Whisper

tools/cmd-denoise.py (new file, 29 lines)

@@ -0,0 +1,29 @@
import os,argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from tqdm import tqdm

path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)

def execute_denoise(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    # print(input_folder)
    # print(list(os.listdir(input_folder).sort()))
    for name in tqdm(os.listdir(input_folder)):
        ans("%s/%s"%(input_folder, name), output_path='%s/%s'%(output_folder, name))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_folder", type=str, required=True,
                        help="Path to the folder containing WAV files.")
    parser.add_argument("-o", "--output_folder", type=str, required=True,
                        help="Output folder to store transcriptions.")
    parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
                        help="fp16 or fp32")  # 还没接入
    cmd = parser.parse_args()
    execute_denoise(
        input_folder = cmd.input_folder,
        output_folder = cmd.output_folder,
    )

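Usage note: the script batch-denoises every file in the input folder with the ModelScope FRCRN pipeline, writing same-named outputs; the -p precision flag is parsed but, per the trailing comment (还没接入, "not hooked up yet"), not used. A hypothetical invocation with example paths:

    python tools/cmd-denoise.py -i output/slicer_opt -o output/denoise_opt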
tools/denoise-model/.gitignore (new file, vendored)

@@ -0,0 +1,2 @@
*
!.gitignore

webui.py

@@ -117,6 +117,7 @@ def change_choices():
 p_label=None
 p_uvr5=None
 p_asr=None
+p_denoise=None
 p_tts_inference=None
def kill_proc_tree(pid, including_parent=True):
@@ -220,6 +221,29 @@ def close_asr():
         kill_process(p_asr.pid)
         p_asr=None
     return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}

+def open_denoise(denoise_inp_dir, denoise_opt_dir):
+    global p_denoise
+    if(p_denoise==None):
+        denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
+        denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
+        cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
+        yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+        print(cmd)
+        p_denoise = Popen(cmd, shell=True)
+        p_denoise.wait()
+        p_denoise=None
+        yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+    else:
+        yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+        # return None
+
+def close_denoise():
+    global p_denoise
+    if(p_denoise!=None):
+        kill_process(p_denoise.pid)
+        p_denoise=None
+    return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}

 p_train_SoVITS=None
 def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
@@ -678,6 +702,13 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                 alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
                 n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
             slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息"))
+        gr.Markdown(value=i18n("0bb-语音降噪工具"))
+        with gr.Row():
+            open_denoise_button = gr.Button(i18n("开启语音降噪"), variant="primary",visible=True)
+            close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
+            denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
+            denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
+            denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
         gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
         with gr.Row():
             open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
@@ -740,6 +771,9 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
     close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
     open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
     close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
+    open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
+    close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])

     with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
         with gr.Row():
             exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)