From 501a74ae96789a26b48932babed5eb4e9483a232 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:14:15 +0800 Subject: [PATCH 01/12] Add files via upload --- tools/uvr5/mdxnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py index 0d609c41..372db25b 100644 --- a/tools/uvr5/mdxnet.py +++ b/tools/uvr5/mdxnet.py @@ -220,7 +220,7 @@ class Predictor: opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal) + "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal) ) if os.path.exists(opt_path_vocal): try: @@ -229,7 +229,7 @@ class Predictor: pass if os.path.exists(path_other): os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other) + "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other) ) if os.path.exists(opt_path_other): try: From bedb421adb4a01b3aa19cdbc9422f4ebec9a2858 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Mon, 10 Jun 2024 09:18:30 +0100 Subject: [PATCH 02/12] =?UTF-8?q?=E5=85=B3=E4=BA=8E=E6=A0=87=E7=82=B9?= =?UTF-8?q?=E7=AC=A6=E5=8F=B7=E5=AF=BC=E8=87=B4=E5=8F=82=E8=80=83=E6=B3=84?= =?UTF-8?q?=E6=BC=8F=E7=9A=84=E9=97=AE=E9=A2=98=20(#1168)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * punctuation * Update inference_webui.py * Update * update * update --- GPT_SoVITS/inference_webui.py | 36 ++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4fe8045d..03440a39 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -50,6 +50,7 @@ is_share = eval(is_share) if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() +punctuation = set(['!', '?', '…', ',', '.', '-'," "]) import gradio as gr from transformers import AutoModelForMaskedLM, AutoTokenizer import numpy as np @@ -322,6 +323,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) text = text.strip("\n") + text = replace_consecutive_punctuation(text) if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text print(i18n("实际输入的目标文本:"), text) @@ -366,6 +368,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, text = text.replace("\n\n", "\n") print(i18n("实际输入的目标文本(切句后):"), text) texts = text.split("\n") + texts = process_text(texts) texts = merge_short_text_in_array(texts, 5) audio_opt = [] if not ref_free: @@ -463,6 +466,7 @@ def cut1(inp): opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) else: opts = [inp] + opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) @@ -487,17 +491,21 @@ def cut2(inp): if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 opts[-2] = opts[-2] + opts[-1] opts = opts[:-1] + opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) def cut3(inp): inp = inp.strip("\n") - return "\n".join(["%s" % item for item in inp.strip("。").split("。")]) - + opts = ["%s" % item for item in inp.strip("。").split("。")] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) def cut4(inp): inp = inp.strip("\n") - return "\n".join(["%s" % item for item in inp.strip(".").split(".")]) + opts = ["%s" % item for item in inp.strip(".").split(".")] + opts = [item for item in opts if not set(item).issubset(punctuation)] + return "\n".join(opts) # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py @@ -511,8 +519,8 @@ def cut5(inp): # 在句子不存在符号或句尾无符号的时候保证文本完整 if len(items)%2 == 1: mergeitems.append(items[-1]) - opt = "\n".join(mergeitems) - return opt + opt = [item for item in mergeitems if not set(item).issubset(punctuation)] + return "\n".join(opt) def custom_sort_key(s): @@ -522,6 +530,24 @@ def custom_sort_key(s): parts = [int(part) if part.isdigit() else part for part in parts] return parts +def process_text(texts): + _text=[] + if all(text in [None, " ", "\n",""] for text in texts): + raise ValueError(i18n("请输入有效文本")) + for text in texts: + if text in [None, " ", ""]: + pass + else: + _text.append(text) + return _text + + +def replace_consecutive_punctuation(text): + punctuations = ''.join(re.escape(p) for p in punctuation) + pattern = f'([{punctuations}])([{punctuations}])+' + result = re.sub(pattern, r'\1', text) + return result + def change_choices(): SoVITS_names, GPT_names = get_weights_names() From 5de60d53bd018d760e2be79de20960e0543a99bc Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:48:56 +0800 Subject: [PATCH 03/12] Update Changelog_CN.md --- docs/cn/Changelog_CN.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 36c1db45..be1c066a 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -169,6 +169,21 @@ 6-nan自动转fp32阶段的hubert提取bug修复 +### 20240610 + +小问题修复: + +1-完善纯标点、多标点文本输入的判断逻辑; + +2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格; + +3-s2训练进度条逻辑修复 + +大问题修复: + +4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化。 + + todolist: 1-中文多音字推理优化(有没有人来测试的,欢迎把测试结果写在pr评论区里) https://github.com/RVC-Boss/GPT-SoVITS/pull/488 @@ -177,3 +192,5 @@ todolist: 2-正在尝试解决低音质参考音频导致音质较差的问题,v2再试试如果能解决就发了,节点暂定高考后吧 + + From 29d6e1533663d48bc6e99ff54b520d8c291648c5 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:56:40 +0800 Subject: [PATCH 04/12] Update Changelog_CN.md --- docs/cn/Changelog_CN.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index be1c066a..abd7263f 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -173,15 +173,15 @@ 小问题修复: -1-完善纯标点、多标点文本输入的判断逻辑; +1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169 -2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格; +2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) -3-s2训练进度条逻辑修复 +3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159 大问题修复: -4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化。 +4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) todolist: From db50670598f0236613eefa6f2d5a23a271d82041 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:44:19 +0800 Subject: [PATCH 05/12] Update webui.py --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index c71c1ca4..a200a747 100644 --- a/webui.py +++ b/webui.py @@ -85,7 +85,7 @@ if if_gpu_ok and len(gpu_infos) > 0: else: gpu_info = ("%s\t%s" % ("0", "CPU")) gpu_infos.append("%s\t%s" % ("0", "CPU")) - default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2 + default_batch_size = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2) gpus = "-".join([i[0] for i in gpu_infos]) pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth" From d8bcc732d747e3e52ea20d6340e57dac18bef06d Mon Sep 17 00:00:00 2001 From: Spr_Aachen <51275522+Spr-Aachen@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:05:16 +0800 Subject: [PATCH 06/12] Update gui.py&webui.py (#1216) 1. Fix the issue that gui needs to reload models for each inference 2. Fix the issue that webui would be toggled by relevant import 3. Complement missing package import 4. Simplify GUI's code and address various inefficiencies, including: enabling direct input of ref text and target text (akin to the WebUI), facilitating file selection for ref audio uploads, adding language options for CH-EN/JA-EN/Multi (with Multi as the default), standardizing variable name to enhance readability. --- GPT_SoVITS/inference_gui.py | 118 +++++++++++++--------------------- GPT_SoVITS/inference_webui.py | 16 +++-- 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index f6cfdc5e..2059155d 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -1,3 +1,4 @@ +import os import sys from PyQt5.QtCore import QEvent from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit @@ -7,16 +8,16 @@ import soundfile as sf from tools.i18n.i18n import I18nAuto i18n = I18nAuto() -from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav +from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav class GPTSoVITSGUI(QMainWindow): + GPT_Path = gpt_path + SoVITS_Path = sovits_path + def __init__(self): super().__init__() - self.init_ui() - - def init_ui(self): self.setWindowTitle('GPT-SoVITS GUI') self.setGeometry(800, 450, 950, 850) @@ -71,6 +72,7 @@ class GPTSoVITSGUI(QMainWindow): self.GPT_model_label = QLabel("选择GPT模型:") self.GPT_model_input = QLineEdit() self.GPT_model_input.setPlaceholderText("拖拽或选择文件") + self.GPT_model_input.setText(self.GPT_Path) self.GPT_model_input.setReadOnly(True) self.GPT_model_button = QPushButton("选择GPT模型文件") self.GPT_model_button.clicked.connect(self.select_GPT_model) @@ -78,6 +80,7 @@ class GPTSoVITSGUI(QMainWindow): self.SoVITS_model_label = QLabel("选择SoVITS模型:") self.SoVITS_model_input = QLineEdit() self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") + self.SoVITS_model_input.setText(self.SoVITS_Path) self.SoVITS_model_input.setReadOnly(True) self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) @@ -91,25 +94,25 @@ class GPTSoVITSGUI(QMainWindow): self.ref_text_label = QLabel("参考音频文本:") self.ref_text_input = QLineEdit() - self.ref_text_input.setPlaceholderText("拖拽或选择文件") - self.ref_text_input.setReadOnly(True) + self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") self.ref_text_button = QPushButton("上传文本") self.ref_text_button.clicked.connect(self.upload_ref_text) - self.language_label = QLabel("参考音频语言:") - self.language_combobox = QComboBox() - self.language_combobox.addItems(["中文", "英文", "日文"]) + self.ref_language_label = QLabel("参考音频语言:") + self.ref_language_combobox = QComboBox() + self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.ref_language_combobox.setCurrentText("多语种混合") self.target_text_label = QLabel("合成目标文本:") self.target_text_input = QLineEdit() - self.target_text_input.setPlaceholderText("拖拽或选择文件") - self.target_text_input.setReadOnly(True) + self.target_text_input.setPlaceholderText("直接输入文字或上传文本") self.target_text_button = QPushButton("上传文本") self.target_text_button.clicked.connect(self.upload_target_text) - self.language_label_02 = QLabel("合成音频语言:") - self.language_combobox_02 = QComboBox() - self.language_combobox_02.addItems(["中文", "英文", "日文"]) + self.target_language_label = QLabel("合成音频语言:") + self.target_language_combobox = QComboBox() + self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.target_language_combobox.setCurrentText("多语种混合") self.output_label = QLabel("输出音频路径:") self.output_input = QLineEdit() @@ -140,10 +143,8 @@ class GPTSoVITSGUI(QMainWindow): main_layout = QVBoxLayout() - input_layout = QGridLayout() - input_layout.setSpacing(10) - - self.setLayout(input_layout) + input_layout = QGridLayout(self) + input_layout.setSpacing(10) input_layout.addWidget(license_label, 0, 0, 1, 3) @@ -159,22 +160,22 @@ class GPTSoVITSGUI(QMainWindow): input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) input_layout.addWidget(self.ref_audio_button, 6, 2) - input_layout.addWidget(self.language_label, 7, 0) - input_layout.addWidget(self.language_combobox, 8, 0, 1, 1) + input_layout.addWidget(self.ref_language_label, 7, 0) + input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) input_layout.addWidget(self.ref_text_label, 9, 0) input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) input_layout.addWidget(self.ref_text_button, 10, 2) - input_layout.addWidget(self.language_label_02, 11, 0) - input_layout.addWidget(self.language_combobox_02, 12, 0, 1, 1) + input_layout.addWidget(self.target_language_label, 11, 0) + input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) input_layout.addWidget(self.target_text_label, 13, 0) input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) input_layout.addWidget(self.target_text_button, 14, 2) - + input_layout.addWidget(self.output_label, 15, 0) input_layout.addWidget(self.output_input, 16, 0, 1, 2) input_layout.addWidget(self.output_button, 16, 2) - + main_layout.addLayout(input_layout) output_layout = QVBoxLayout() @@ -198,10 +199,8 @@ class GPTSoVITSGUI(QMainWindow): def dropEvent(self, event): if event.mimeData().hasUrls(): file_paths = [url.toLocalFile() for url in event.mimeData().urls()] - if len(file_paths) == 1: self.update_ref_audio(file_paths[0]) - self.update_input_paths(self.ref_audio_input, file_paths[0]) else: self.update_ref_audio(", ".join(file_paths)) @@ -211,23 +210,13 @@ class GPTSoVITSGUI(QMainWindow): widget.installEventFilter(self) def eventFilter(self, obj, event): - if event.type() == QEvent.DragEnter: + if event.type() in (QEvent.DragEnter, QEvent.Drop): mime_data = event.mimeData() if mime_data.hasUrls(): event.acceptProposedAction() - - elif event.type() == QEvent.Drop: - mime_data = event.mimeData() - if mime_data.hasUrls(): - file_paths = [url.toLocalFile() for url in mime_data.urls()] - if len(file_paths) == 1: - self.update_input_paths(obj, file_paths[0]) - else: - self.update_input_paths(obj, ", ".join(file_paths)) - event.acceptProposedAction() return super().eventFilter(obj, event) - + def select_GPT_model(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") if file_path: @@ -239,24 +228,9 @@ class GPTSoVITSGUI(QMainWindow): self.SoVITS_model_input.setText(file_path) def select_ref_audio(self): - options = QFileDialog.Options() - options |= QFileDialog.DontUseNativeDialog - options |= QFileDialog.ShowDirsOnly - - file_dialog = QFileDialog() - file_dialog.setOptions(options) - - file_dialog.setFileMode(QFileDialog.AnyFile) - file_dialog.setNameFilter("Audio Files (*.wav *.mp3)") - - if file_dialog.exec_(): - file_paths = file_dialog.selectedFiles() - - if len(file_paths) == 1: - self.update_ref_audio(file_paths[0]) - self.update_input_paths(self.ref_audio_input, file_paths[0]) - else: - self.update_ref_audio(", ".join(file_paths)) + file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") + if file_path: + self.update_ref_audio(file_path) def upload_ref_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") @@ -264,7 +238,6 @@ class GPTSoVITSGUI(QMainWindow): with open(file_path, 'r', encoding='utf-8') as file: content = file.read() self.ref_text_input.setText(content) - self.update_input_paths(self.ref_text_input, file_path) def upload_target_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") @@ -272,7 +245,6 @@ class GPTSoVITSGUI(QMainWindow): with open(file_path, 'r', encoding='utf-8') as file: content = file.read() self.target_text_input.setText(content) - self.update_input_paths(self.target_text_input, file_path) def select_output_path(self): options = QFileDialog.Options() @@ -290,9 +262,6 @@ class GPTSoVITSGUI(QMainWindow): def update_ref_audio(self, file_path): self.ref_audio_input.setText(file_path) - def update_input_paths(self, input_box, file_path): - input_box.setText(file_path) - def clear_output(self): self.output_text.clear() @@ -300,23 +269,27 @@ class GPTSoVITSGUI(QMainWindow): GPT_model_path = self.GPT_model_input.text() SoVITS_model_path = self.SoVITS_model_input.text() ref_audio_path = self.ref_audio_input.text() - language_combobox = self.language_combobox.currentText() + language_combobox = self.ref_language_combobox.currentText() language_combobox = i18n(language_combobox) ref_text = self.ref_text_input.text() - language_combobox_02 = self.language_combobox_02.currentText() - language_combobox_02 = i18n(language_combobox_02) + target_language_combobox = self.target_language_combobox.currentText() + target_language_combobox = i18n(target_language_combobox) target_text = self.target_text_input.text() output_path = self.output_input.text() - change_gpt_weights(gpt_path=GPT_model_path) - change_sovits_weights(sovits_path=SoVITS_model_path) + if GPT_model_path != self.GPT_Path: + change_gpt_weights(gpt_path=GPT_model_path) + self.GPT_Path = GPT_model_path + if SoVITS_model_path != self.SoVITS_Path: + change_sovits_weights(sovits_path=SoVITS_model_path) + self.SoVITS_Path = SoVITS_model_path synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, prompt_text=ref_text, prompt_language=language_combobox, text=target_text, - text_language=language_combobox_02) - + text_language=target_language_combobox) + result_list = list(synthesis_result) if result_list: @@ -329,12 +302,9 @@ class GPTSoVITSGUI(QMainWindow): self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) self.output_text.append("处理结果:\n" + result) -def main(): + +if __name__ == '__main__': app = QApplication(sys.argv) mainWin = GPTSoVITSGUI() mainWin.show() - sys.exit(app.exec_()) - - -if __name__ == '__main__': - main() + sys.exit(app.exec_()) \ No newline at end of file diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 03440a39..b21b9540 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -639,10 +639,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: button5.click(cut5, [text_inp], [text_opt]) gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -app.queue(concurrency_count=511, max_size=1022).launch( - server_name="0.0.0.0", - inbrowser=True, - share=is_share, - server_port=infer_ttswebui, - quiet=True, -) + +if __name__ == '__main__': + app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=True, + share=is_share, + server_port=infer_ttswebui, + quiet=True, + ) From 3e81e215a5e542b6fb2340fc9e7eefaae62ad955 Mon Sep 17 00:00:00 2001 From: david l euler Date: Wed, 26 Jun 2024 22:17:05 +0800 Subject: [PATCH 07/12] inference cli tool (#1195) Co-authored-by: david Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> --- GPT_SoVITS/inference_cli.py | 55 +++++++++++++++++++++++++++++++++++ GPT_SoVITS/inference_webui.py | 1 - 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 GPT_SoVITS/inference_cli.py diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py new file mode 100644 index 00000000..bd987aaf --- /dev/null +++ b/GPT_SoVITS/inference_cli.py @@ -0,0 +1,55 @@ +import argparse +import os +import soundfile as sf + +from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav + +i18n = I18nAuto() + +def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path): + # Read reference text + with open(ref_text_path, 'r', encoding='utf-8') as file: + ref_text = file.read() + + # Read target text + with open(target_text_path, 'r', encoding='utf-8') as file: + target_text = file.read() + + # Change model weights + change_gpt_weights(gpt_path=GPT_model_path) + change_sovits_weights(sovits_path=SoVITS_model_path) + + # Synthesize audio + synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=i18n(ref_language), + text=target_text, + text_language=i18n(target_language), top_p=1, temperature=1) + + result_list = list(synthesis_result) + + if result_list: + last_sampling_rate, last_audio_data = result_list[-1] + output_wav_path = os.path.join(output_path, "output.wav") + sf.write(output_wav_path, last_audio_data, last_sampling_rate) + print(f"Audio saved to {output_wav_path}") + +def main(): + parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") + parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") + parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") + parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") + parser.add_argument('--ref_text', required=True, help="Path to the reference text file") + parser.add_argument('--ref_language', required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio") + parser.add_argument('--target_text', required=True, help="Path to the target text file") + parser.add_argument('--target_language', required=True, choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], help="Language of the target text") + parser.add_argument('--output_path', required=True, help="Path to the output directory") + + args = parser.parse_args() + + synthesize(args.gpt_model, args.sovits_model, args.ref_audio, args.ref_text, args.ref_language, args.target_text, args.target_language, args.output_path) + +if __name__ == '__main__': + main() + diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index b21b9540..4dc1040d 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -639,7 +639,6 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: button5.click(cut5, [text_inp], [text_opt]) gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) - if __name__ == '__main__': app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", From bce37f792f6da150b0f6de41437f4378f0437852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=A4=E5=B7=9D=E9=B6=B4=E9=B3=B4?= <104302166+AkagawaTsurunaki@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:17:27 +0800 Subject: [PATCH 08/12] fix: Fixed issue #1199 (#1200) --- api.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/api.py b/api.py index 041fa349..b5340715 100644 --- a/api.py +++ b/api.py @@ -339,8 +339,46 @@ def pack_audio(audio_bytes, data, rate): def pack_ogg(audio_bytes, data, rate): - with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file: - audio_file.write(data) + # Author: AkagawaTsurunaki + # Issue: + # Stack overflow probabilistically occurs + # when the function `sf_writef_short` of `libsndfile_64bit.dll` is called + # using the Python library `soundfile` + # Note: + # This is an issue related to `libsndfile`, not this project itself. + # It happens when you generate a large audio tensor (about 499804 frames in my PC) + # and try to convert it to an ogg file. + # Related: + # https://github.com/RVC-Boss/GPT-SoVITS/issues/1199 + # https://github.com/libsndfile/libsndfile/issues/1023 + # https://github.com/bastibe/python-soundfile/issues/396 + # Suggestion: + # Or split the whole audio data into smaller audio segment to avoid stack overflow? + + def handle_pack_ogg(): + with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file: + audio_file.write(data) + + import threading + # See: https://docs.python.org/3/library/threading.html + # The stack size of this thread is at least 32768 + # If stack overflow error still occurs, just modify the `stack_size`. + # stack_size = n * 4096, where n should be a positive integer. + # Here we chose n = 4096. + stack_size = 4096 * 4096 + try: + threading.stack_size(stack_size) + pack_ogg_thread = threading.Thread(target=handle_pack_ogg) + pack_ogg_thread.start() + pack_ogg_thread.join() + except RuntimeError as e: + # If changing the thread stack size is unsupported, a RuntimeError is raised. + print("RuntimeError: {}".format(e)) + print("Changing the thread stack size is unsupported.") + except ValueError as e: + # If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified. + print("ValueError: {}".format(e)) + print("The specified stack size is invalid.") return audio_bytes From d1d43661a992cc6782bf35d5edb700dd42b929a3 Mon Sep 17 00:00:00 2001 From: sam Date: Wed, 26 Jun 2024 22:18:55 +0800 Subject: [PATCH 09/12] =?UTF-8?q?=E4=BF=AE=E6=AD=A3numba=E4=B8=8Enumpy?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E9=97=AE=E9=A2=98=EF=BC=8C=E5=9C=A8ubuntu24.?= =?UTF-8?q?04=E4=B8=8B=E6=B5=8B=E8=AF=95=E9=80=9A=E8=BF=87=20(#1236)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 73912d01..bf2e28a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy +numpy==1.23.4 scipy tensorboard librosa==0.9.2 From 1b4c01e860168d25a286787a58f8720ea9039cb7 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:39:29 +0800 Subject: [PATCH 10/12] DS_STORE (#1258) --- tools/cmd-denoise.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmd-denoise.py b/tools/cmd-denoise.py index 69b51e66..1fdcab6d 100644 --- a/tools/cmd-denoise.py +++ b/tools/cmd-denoise.py @@ -1,4 +1,5 @@ import os,argparse +import traceback from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks @@ -12,7 +13,10 @@ def execute_denoise(input_folder,output_folder): # print(input_folder) # print(list(os.listdir(input_folder).sort())) for name in tqdm(os.listdir(input_folder)): - ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) + try: + ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) + except: + traceback.print_exc() if __name__ == '__main__': parser = argparse.ArgumentParser() From a208698e775155efc95b187b746d153d0f2847ca Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Sat, 29 Jun 2024 22:54:05 +0800 Subject: [PATCH 11/12] Update s1_train.py --- GPT_SoVITS/s1_train.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py index 43cfa19a..ece295d3 100644 --- a/GPT_SoVITS/s1_train.py +++ b/GPT_SoVITS/s1_train.py @@ -79,15 +79,17 @@ class my_model_ckpt(ModelCheckpoint): to_save_od["config"] = self.config to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) # torch.save( - my_save( - to_save_od, - "%s/%s-e%s.ckpt" - % ( - self.half_weights_save_dir, - self.exp_name, - trainer.current_epoch + 1, - ), - ) + # print(os.environ) + if(os.environ.get("LOCAL_RANK","0")=="0"): + my_save( + to_save_od, + "%s/%s-e%s.ckpt" + % ( + self.half_weights_save_dir, + self.exp_name, + trainer.current_epoch + 1, + ), + ) self._save_last_checkpoint(trainer, monitor_candidates) From 582ba7d5192bc4d12be75ef3a0bf2dd18e94d303 Mon Sep 17 00:00:00 2001 From: aoguai <34203474+aoguai@users.noreply.github.com> Date: Sat, 29 Jun 2024 22:57:01 +0800 Subject: [PATCH 12/12] =?UTF-8?q?fix:=20=E7=A7=BB=E9=99=A4=E5=A4=9A?= =?UTF-8?q?=E4=BD=99=20my=5Futils.py=20(#1189)=20(#1251)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 移除多余 my_utils.py (#1189) * feat: update my_utils --- GPT_SoVITS/inference_webui.py | 2 +- GPT_SoVITS/module/data_utils.py | 2 +- GPT_SoVITS/my_utils.py | 21 ------------------- GPT_SoVITS/onnx_export.py | 2 +- .../prepare_datasets/2-get-hubert-wav32k.py | 2 +- api.py | 2 +- tools/slice_audio.py | 2 +- 7 files changed, 6 insertions(+), 27 deletions(-) delete mode 100644 GPT_SoVITS/my_utils.py diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4dc1040d..44c6d0eb 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -65,7 +65,7 @@ from text import cleaned_text_to_sequence from text.cleaner import clean_text from time import time as ttime from module.mel_processing import spectrogram_torch -from my_utils import load_audio +from tools.my_utils import load_audio from tools.i18n.i18n import I18nAuto i18n = I18nAuto() diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index ff4c4f43..72c80555 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -17,7 +17,7 @@ from functools import lru_cache import requests from scipy.io import wavfile from io import BytesIO -from my_utils import load_audio +from tools.my_utils import load_audio # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): diff --git a/GPT_SoVITS/my_utils.py b/GPT_SoVITS/my_utils.py deleted file mode 100644 index 776939dd..00000000 --- a/GPT_SoVITS/my_utils.py +++ /dev/null @@ -1,21 +0,0 @@ -import ffmpeg -import numpy as np - - -def load_audio(file, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. - file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index b82e987f..ab457d75 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -9,7 +9,7 @@ cnhubert.cnhubert_base_path=cnhubert_base_path ssl_model = cnhubert.get_model() from text import cleaned_text_to_sequence import soundfile -from my_utils import load_audio +from tools.my_utils import load_audio import os import json diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 61c933a4..17394ee4 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -17,7 +17,7 @@ from scipy.io import wavfile import librosa,torch now_dir = os.getcwd() sys.path.append(now_dir) -from my_utils import load_audio +from tools.my_utils import load_audio # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path diff --git a/api.py b/api.py index b5340715..aa822ca7 100644 --- a/api.py +++ b/api.py @@ -143,7 +143,7 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence from text.cleaner import clean_text from module.mel_processing import spectrogram_torch -from my_utils import load_audio +from tools.my_utils import load_audio import config as global_config import logging import subprocess diff --git a/tools/slice_audio.py b/tools/slice_audio.py index 46ee408a..8a06292d 100644 --- a/tools/slice_audio.py +++ b/tools/slice_audio.py @@ -3,7 +3,7 @@ import traceback from scipy.io import wavfile # parent_directory = os.path.dirname(os.path.abspath(__file__)) # sys.path.append(parent_directory) -from my_utils import load_audio +from tools.my_utils import load_audio from slicer2 import Slicer def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):