From e106a5ee8848a65b5a8f242c57a50f4d90d5cf0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E9=9C=87=E4=B8=9C?= Date: Fri, 7 Jun 2024 22:03:27 +0800 Subject: [PATCH 1/7] Update s2_train.py (#1159) --- GPT_SoVITS/s2_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index 4f0ca4c..ddbe2ab 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -305,7 +305,7 @@ def train_and_evaluate( y_lengths, text, text_lengths, - ) in tqdm(enumerate(train_loader)): + ) in enumerate(tqdm(train_loader)): if torch.cuda.is_available(): spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( rank, non_blocking=True From 501a74ae96789a26b48932babed5eb4e9483a232 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:14:15 +0800 Subject: [PATCH 2/7] Add files via upload --- tools/uvr5/mdxnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py index 0d609c4..372db25 100644 --- a/tools/uvr5/mdxnet.py +++ b/tools/uvr5/mdxnet.py @@ -220,7 +220,7 @@ class Predictor: opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal) + "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal) ) if os.path.exists(opt_path_vocal): try: @@ -229,7 +229,7 @@ class Predictor: pass if os.path.exists(path_other): os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other) + "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other) ) if os.path.exists(opt_path_other): try: From bedb421adb4a01b3aa19cdbc9422f4ebec9a2858 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Mon, 10 Jun 2024 09:18:30 +0100 Subject: [PATCH 3/7] =?UTF-8?q?=E5=85=B3=E4=BA=8E=E6=A0=87=E7=82=B9?= =?UTF-8?q?=E7=AC=A6=E5=8F=B7=E5=AF=BC=E8=87=B4=E5=8F=82=E8=80=83=E6=B3=84?= =?UTF-8?q?=E6=BC=8F=E7=9A=84=E9=97=AE=E9=A2=98=20(#1168)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * punctuation * Update inference_webui.py * Update * update * update --- GPT_SoVITS/inference_webui.py | 36 ++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4fe8045..03440a3 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -50,6 +50,7 @@ is_share = eval(is_share) if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() +punctuation = set(['!', '?', '…', ',', '.', '-'," "]) import gradio as gr from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np @@ -322,6 +323,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) text = text.strip("\n") + text = replace_consecutive_punctuation(text) if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text print(i18n("实际输入的目标文本:"), text) @@ -366,6 +368,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, text = text.replace("\n\n", "\n") print(i18n("实际输入的目标文本(切句后):"), text) texts = text.split("\n") + texts = process_text(texts) texts = merge_short_text_in_array(texts, 5) audio_opt = [] if not ref_free: @@ -463,6 +466,7 @@ def cut1(inp): opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) else: opts = [inp] + opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) @@ -487,17 +491,21 @@ def cut2(inp): if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 opts[-2] = opts[-2] + opts[-1] opts = opts[:-1] + opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) def cut3(inp): inp = inp.strip("\n") - return "\n".join(["%s" % item for item in inp.strip("。").split("。")]) - + opts = ["%s" % item for item in inp.strip("。").split("。")] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) def cut4(inp): inp = inp.strip("\n") - return "\n".join(["%s" % item for item in inp.strip(".").split(".")]) + opts = ["%s" % item for item in inp.strip(".").split(".")] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py @@ -511,8 +519,8 @@ def cut5(inp): # 在句子不存在符号或句尾无符号的时候保证文本完整 if len(items)%2 == 1: mergeitems.append(items[-1]) - opt = "\n".join(mergeitems) - return opt + opt = [item for item in mergeitems if not set(item).issubset(punctuation)] + return "\n".join(opt) def custom_sort_key(s): @@ -522,6 +530,24 @@ def custom_sort_key(s): parts = [int(part) if part.isdigit() else part for part in parts] return parts +def process_text(texts): + _text=[] + if all(text in [None, " ", "\n",""] for text in texts): + raise ValueError(i18n("请输入有效文本")) + for text in texts: + if text in [None, " ", ""]: + pass + else: + _text.append(text) + return _text + + +def replace_consecutive_punctuation(text): + punctuations = ''.join(re.escape(p) for p in punctuation) + pattern = f'([{punctuations}])([{punctuations}])+' + result = re.sub(pattern, r'\1', text) + return result + def change_choices(): SoVITS_names, GPT_names = get_weights_names() From 5de60d53bd018d760e2be79de20960e0543a99bc Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:48:56 +0800 Subject: [PATCH 4/7] Update Changelog_CN.md --- docs/cn/Changelog_CN.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 36c1db4..be1c066 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -169,6 +169,21 @@ 6-nan自动转fp32阶段的hubert提取bug修复 +### 20240610 + +小问题修复: + +1-完善纯标点、多标点文本输入的判断逻辑; + +2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格; + +3-s2训练进度条逻辑修复 + +大问题修复: + +4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化。 + + todolist: 1-中文多音字推理优化(有没有人来测试的,欢迎把测试结果写在pr评论区里) https://github.com/RVC-Boss/GPT-SoVITS/pull/488 @@ -177,3 +192,5 @@ todolist: 2-正在尝试解决低音质参考音频导致音质较差的问题,v2再试试如果能解决就发了,节点暂定高考后吧 + + From 29d6e1533663d48bc6e99ff54b520d8c291648c5 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:56:40 +0800 Subject: [PATCH 5/7] Update Changelog_CN.md --- docs/cn/Changelog_CN.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index be1c066..abd7263 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -173,15 +173,15 @@ 小问题修复: -1-完善纯标点、多标点文本输入的判断逻辑; +1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169 -2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格; +2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) -3-s2训练进度条逻辑修复 +3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159 大问题修复: -4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化。 +4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) todolist: From db50670598f0236613eefa6f2d5a23a271d82041 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:44:19 +0800 Subject: [PATCH 6/7] Update webui.py --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index c71c1ca..a200a74 100644 --- a/webui.py +++ b/webui.py @@ -85,7 +85,7 @@ if if_gpu_ok and len(gpu_infos) > 0: else: gpu_info = ("%s\t%s" % ("0", "CPU")) gpu_infos.append("%s\t%s" % ("0", "CPU")) - default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2 + default_batch_size = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2) gpus = "-".join([i[0] for i in gpu_infos]) pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth" From d8bcc732d747e3e52ea20d6340e57dac18bef06d Mon Sep 17 00:00:00 2001 From: Spr_Aachen <51275522+Spr-Aachen@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:05:16 +0800 Subject: [PATCH 7/7] Update gui.py&webui.py (#1216) 1. Fix the issue that gui needs to reload models for each inference 2. Fix the issue that webui would be toggled by relevant import 3. Complement missing package import 4. Simplify GUI's code and address various inefficiencies, including: enabling direct input of ref text and target text (akin to the WebUI), facilitating file selection for ref audio uploads, adding language options for CH-EN/JA-EN/Multi (with Multi as the default), standardizing variable name to enhance readability. --- GPT_SoVITS/inference_gui.py | 118 +++++++++++++--------------------- GPT_SoVITS/inference_webui.py | 16 +++-- 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index f6cfdc5..2059155 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -1,3 +1,4 @@ +import os import sys from PyQt5.QtCore import QEvent from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit @@ -7,16 +8,16 @@ import soundfile as sf from tools.i18n.i18n import I18nAuto i18n = I18nAuto() -from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav +from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav class GPTSoVITSGUI(QMainWindow): + GPT_Path = gpt_path + SoVITS_Path = sovits_path + def __init__(self): super().__init__() - self.init_ui() - - def init_ui(self): self.setWindowTitle('GPT-SoVITS GUI') self.setGeometry(800, 450, 950, 850) @@ -71,6 +72,7 @@ class GPTSoVITSGUI(QMainWindow): self.GPT_model_label = QLabel("选择GPT模型:") self.GPT_model_input = QLineEdit() self.GPT_model_input.setPlaceholderText("拖拽或选择文件") + self.GPT_model_input.setText(self.GPT_Path) self.GPT_model_input.setReadOnly(True) self.GPT_model_button = QPushButton("选择GPT模型文件") self.GPT_model_button.clicked.connect(self.select_GPT_model) @@ -78,6 +80,7 @@ class GPTSoVITSGUI(QMainWindow): self.SoVITS_model_label = QLabel("选择SoVITS模型:") self.SoVITS_model_input = QLineEdit() self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") + self.SoVITS_model_input.setText(self.SoVITS_Path) self.SoVITS_model_input.setReadOnly(True) self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) @@ -91,25 +94,25 @@ class GPTSoVITSGUI(QMainWindow): self.ref_text_label = QLabel("参考音频文本:") self.ref_text_input = QLineEdit() - self.ref_text_input.setPlaceholderText("拖拽或选择文件") - self.ref_text_input.setReadOnly(True) + self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") self.ref_text_button = QPushButton("上传文本") self.ref_text_button.clicked.connect(self.upload_ref_text) - self.language_label = QLabel("参考音频语言:") - self.language_combobox = QComboBox() - self.language_combobox.addItems(["中文", "英文", "日文"]) + self.ref_language_label = QLabel("参考音频语言:") + self.ref_language_combobox = QComboBox() + self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.ref_language_combobox.setCurrentText("多语种混合") self.target_text_label = QLabel("合成目标文本:") self.target_text_input = QLineEdit() - self.target_text_input.setPlaceholderText("拖拽或选择文件") - self.target_text_input.setReadOnly(True) + self.target_text_input.setPlaceholderText("直接输入文字或上传文本") self.target_text_button = QPushButton("上传文本") self.target_text_button.clicked.connect(self.upload_target_text) - self.language_label_02 = QLabel("合成音频语言:") - self.language_combobox_02 = QComboBox() - self.language_combobox_02.addItems(["中文", "英文", "日文"]) + self.target_language_label = QLabel("合成音频语言:") + self.target_language_combobox = QComboBox() + self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.target_language_combobox.setCurrentText("多语种混合") self.output_label = QLabel("输出音频路径:") self.output_input = QLineEdit() @@ -140,10 +143,8 @@ class GPTSoVITSGUI(QMainWindow): main_layout = QVBoxLayout() - input_layout = QGridLayout() - input_layout.setSpacing(10) - - self.setLayout(input_layout) + input_layout = QGridLayout(self) + input_layout.setSpacing(10) input_layout.addWidget(license_label, 0, 0, 1, 3) @@ -159,22 +160,22 @@ class GPTSoVITSGUI(QMainWindow): input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) input_layout.addWidget(self.ref_audio_button, 6, 2) - input_layout.addWidget(self.language_label, 7, 0) - input_layout.addWidget(self.language_combobox, 8, 0, 1, 1) + input_layout.addWidget(self.ref_language_label, 7, 0) + input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) input_layout.addWidget(self.ref_text_label, 9, 0) input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) input_layout.addWidget(self.ref_text_button, 10, 2) - input_layout.addWidget(self.language_label_02, 11, 0) - input_layout.addWidget(self.language_combobox_02, 12, 0, 1, 1) + input_layout.addWidget(self.target_language_label, 11, 0) + input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) input_layout.addWidget(self.target_text_label, 13, 0) input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) input_layout.addWidget(self.target_text_button, 14, 2) - + input_layout.addWidget(self.output_label, 15, 0) input_layout.addWidget(self.output_input, 16, 0, 1, 2) input_layout.addWidget(self.output_button, 16, 2) - + main_layout.addLayout(input_layout) output_layout = QVBoxLayout() @@ -198,10 +199,8 @@ class GPTSoVITSGUI(QMainWindow): def dropEvent(self, event): if event.mimeData().hasUrls(): file_paths = [url.toLocalFile() for url in event.mimeData().urls()] - if len(file_paths) == 1: self.update_ref_audio(file_paths[0]) - self.update_input_paths(self.ref_audio_input, file_paths[0]) else: self.update_ref_audio(", ".join(file_paths)) @@ -211,23 +210,13 @@ class GPTSoVITSGUI(QMainWindow): widget.installEventFilter(self) def eventFilter(self, obj, event): - if event.type() == QEvent.DragEnter: + if event.type() in (QEvent.DragEnter, QEvent.Drop): mime_data = event.mimeData() if mime_data.hasUrls(): event.acceptProposedAction() - - elif event.type() == QEvent.Drop: - mime_data = event.mimeData() - if mime_data.hasUrls(): - file_paths = [url.toLocalFile() for url in mime_data.urls()] - if len(file_paths) == 1: - self.update_input_paths(obj, file_paths[0]) - else: - self.update_input_paths(obj, ", ".join(file_paths)) - event.acceptProposedAction() return super().eventFilter(obj, event) - + def select_GPT_model(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") if file_path: @@ -239,24 +228,9 @@ class GPTSoVITSGUI(QMainWindow): self.SoVITS_model_input.setText(file_path) def select_ref_audio(self): - options = QFileDialog.Options() - options |= QFileDialog.DontUseNativeDialog - options |= QFileDialog.ShowDirsOnly - - file_dialog = QFileDialog() - file_dialog.setOptions(options) - - file_dialog.setFileMode(QFileDialog.AnyFile) - file_dialog.setNameFilter("Audio Files (*.wav *.mp3)") - - if file_dialog.exec_(): - file_paths = file_dialog.selectedFiles() - - if len(file_paths) == 1: - self.update_ref_audio(file_paths[0]) - self.update_input_paths(self.ref_audio_input, file_paths[0]) - else: - self.update_ref_audio(", ".join(file_paths)) + file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") + if file_path: + self.update_ref_audio(file_path) def upload_ref_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") @@ -264,7 +238,6 @@ class GPTSoVITSGUI(QMainWindow): with open(file_path, 'r', encoding='utf-8') as file: content = file.read() self.ref_text_input.setText(content) - self.update_input_paths(self.ref_text_input, file_path) def upload_target_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") @@ -272,7 +245,6 @@ class GPTSoVITSGUI(QMainWindow): with open(file_path, 'r', encoding='utf-8') as file: content = file.read() self.target_text_input.setText(content) - self.update_input_paths(self.target_text_input, file_path) def select_output_path(self): options = QFileDialog.Options() @@ -290,9 +262,6 @@ class GPTSoVITSGUI(QMainWindow): def update_ref_audio(self, file_path): self.ref_audio_input.setText(file_path) - def update_input_paths(self, input_box, file_path): - input_box.setText(file_path) - def clear_output(self): self.output_text.clear() @@ -300,23 +269,27 @@ class GPTSoVITSGUI(QMainWindow): GPT_model_path = self.GPT_model_input.text() SoVITS_model_path = self.SoVITS_model_input.text() ref_audio_path = self.ref_audio_input.text() - language_combobox = self.language_combobox.currentText() + language_combobox = self.ref_language_combobox.currentText() language_combobox = i18n(language_combobox) ref_text = self.ref_text_input.text() - language_combobox_02 = self.language_combobox_02.currentText() - language_combobox_02 = i18n(language_combobox_02) + target_language_combobox = self.target_language_combobox.currentText() + target_language_combobox = i18n(target_language_combobox) target_text = self.target_text_input.text() output_path = self.output_input.text() - change_gpt_weights(gpt_path=GPT_model_path) - change_sovits_weights(sovits_path=SoVITS_model_path) + if GPT_model_path != self.GPT_Path: + change_gpt_weights(gpt_path=GPT_model_path) + self.GPT_Path = GPT_model_path + if SoVITS_model_path != self.SoVITS_Path: + change_sovits_weights(sovits_path=SoVITS_model_path) + self.SoVITS_Path = SoVITS_model_path synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, prompt_text=ref_text, prompt_language=language_combobox, text=target_text, - text_language=language_combobox_02) - + text_language=target_language_combobox) + result_list = list(synthesis_result) if result_list: @@ -329,12 +302,9 @@ class GPTSoVITSGUI(QMainWindow): self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) self.output_text.append("处理结果:\n" + result) -def main(): + +if __name__ == '__main__': app = QApplication(sys.argv) mainWin = GPTSoVITSGUI() mainWin.show() - sys.exit(app.exec_()) - - -if __name__ == '__main__': - main() + sys.exit(app.exec_()) \ No newline at end of file diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 03440a3..b21b954 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -639,10 +639,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: button5.click(cut5, [text_inp], [text_opt]) gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -app.queue(concurrency_count=511, max_size=1022).launch( - server_name="0.0.0.0", - inbrowser=True, - share=is_share, - server_port=infer_ttswebui, - quiet=True, -) + +if __name__ == '__main__': + app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=True, + share=is_share, + server_port=infer_ttswebui, + quiet=True, + )