Merge remote-tracking branch 'upstream/main' into DS_STORE

XXXXRT666 2024-07-01 20:05:23 +08:00
commit 7a35e4b482
14 changed files with 216 additions and 128 deletions

View File

@@ -0,0 +1,55 @@
import argparse
import os

import soundfile as sf

from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav

i18n = I18nAuto()


def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path):
    # Read reference text
    with open(ref_text_path, 'r', encoding='utf-8') as file:
        ref_text = file.read()

    # Read target text
    with open(target_text_path, 'r', encoding='utf-8') as file:
        target_text = file.read()

    # Change model weights
    change_gpt_weights(gpt_path=GPT_model_path)
    change_sovits_weights(sovits_path=SoVITS_model_path)

    # Synthesize audio
    synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                   prompt_text=ref_text,
                                   prompt_language=i18n(ref_language),
                                   text=target_text,
                                   text_language=i18n(target_language), top_p=1, temperature=1)

    result_list = list(synthesis_result)

    if result_list:
        last_sampling_rate, last_audio_data = result_list[-1]
        output_wav_path = os.path.join(output_path, "output.wav")
        sf.write(output_wav_path, last_audio_data, last_sampling_rate)
        print(f"Audio saved to {output_wav_path}")


def main():
    parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
    parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file")
    parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file")
    parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file")
    parser.add_argument('--ref_text', required=True, help="Path to the reference text file")
    parser.add_argument('--ref_language', required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio")
    parser.add_argument('--target_text', required=True, help="Path to the target text file")
    parser.add_argument('--target_language', required=True, choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], help="Language of the target text")
    parser.add_argument('--output_path', required=True, help="Path to the output directory")

    args = parser.parse_args()

    synthesize(args.gpt_model, args.sovits_model, args.ref_audio, args.ref_text, args.ref_language, args.target_text, args.target_language, args.output_path)


if __name__ == '__main__':
    main()
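For a quick check, the new synthesize() helper can also be called directly from Python instead of going through argparse. A minimal sketch, assuming the new file is importable as GPT_SoVITS.inference_cli (the diff view omits filenames; this is an assumption about its location) and with all paths below being placeholders:

from GPT_SoVITS.inference_cli import synthesize

synthesize(
    GPT_model_path="models/my_gpt.ckpt",        # placeholder checkpoint path
    SoVITS_model_path="models/my_sovits.pth",   # placeholder weights path
    ref_audio_path="ref.wav",                   # short reference recording
    ref_text_path="ref.txt",                    # transcript of ref.wav
    ref_language="中文",
    target_text_path="target.txt",              # text to synthesize
    target_language="中文",
    output_path="out",                          # output.wav is written here
)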

View File

@@ -1,3 +1,4 @@
+import os
import sys
from PyQt5.QtCore import QEvent
from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit
@@ -7,16 +8,16 @@ import soundfile as sf
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()

-from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
+from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav


class GPTSoVITSGUI(QMainWindow):
+    GPT_Path = gpt_path
+    SoVITS_Path = sovits_path

    def __init__(self):
        super().__init__()
-        self.init_ui()
-
-    def init_ui(self):
        self.setWindowTitle('GPT-SoVITS GUI')
        self.setGeometry(800, 450, 950, 850)
@@ -71,6 +72,7 @@ class GPTSoVITSGUI(QMainWindow):
        self.GPT_model_label = QLabel("选择GPT模型:")
        self.GPT_model_input = QLineEdit()
        self.GPT_model_input.setPlaceholderText("拖拽或选择文件")
+        self.GPT_model_input.setText(self.GPT_Path)
        self.GPT_model_input.setReadOnly(True)
        self.GPT_model_button = QPushButton("选择GPT模型文件")
        self.GPT_model_button.clicked.connect(self.select_GPT_model)
@@ -78,6 +80,7 @@ class GPTSoVITSGUI(QMainWindow):
        self.SoVITS_model_label = QLabel("选择SoVITS模型:")
        self.SoVITS_model_input = QLineEdit()
        self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件")
+        self.SoVITS_model_input.setText(self.SoVITS_Path)
        self.SoVITS_model_input.setReadOnly(True)
        self.SoVITS_model_button = QPushButton("选择SoVITS模型文件")
        self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model)
@@ -91,25 +94,25 @@ class GPTSoVITSGUI(QMainWindow):
        self.ref_text_label = QLabel("参考音频文本:")
        self.ref_text_input = QLineEdit()
-        self.ref_text_input.setPlaceholderText("拖拽或选择文件")
-        self.ref_text_input.setReadOnly(True)
+        self.ref_text_input.setPlaceholderText("直接输入文字或上传文本")
        self.ref_text_button = QPushButton("上传文本")
        self.ref_text_button.clicked.connect(self.upload_ref_text)

-        self.language_label = QLabel("参考音频语言:")
-        self.language_combobox = QComboBox()
-        self.language_combobox.addItems(["中文", "英文", "日文"])
+        self.ref_language_label = QLabel("参考音频语言:")
+        self.ref_language_combobox = QComboBox()
+        self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"])
+        self.ref_language_combobox.setCurrentText("多语种混合")

        self.target_text_label = QLabel("合成目标文本:")
        self.target_text_input = QLineEdit()
-        self.target_text_input.setPlaceholderText("拖拽或选择文件")
-        self.target_text_input.setReadOnly(True)
+        self.target_text_input.setPlaceholderText("直接输入文字或上传文本")
        self.target_text_button = QPushButton("上传文本")
        self.target_text_button.clicked.connect(self.upload_target_text)

-        self.language_label_02 = QLabel("合成音频语言:")
-        self.language_combobox_02 = QComboBox()
-        self.language_combobox_02.addItems(["中文", "英文", "日文"])
+        self.target_language_label = QLabel("合成音频语言:")
+        self.target_language_combobox = QComboBox()
+        self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"])
+        self.target_language_combobox.setCurrentText("多语种混合")

        self.output_label = QLabel("输出音频路径:")
        self.output_input = QLineEdit()
@@ -140,10 +143,8 @@ class GPTSoVITSGUI(QMainWindow):
        main_layout = QVBoxLayout()

-        input_layout = QGridLayout()
+        input_layout = QGridLayout(self)
        input_layout.setSpacing(10)
-        self.setLayout(input_layout)

        input_layout.addWidget(license_label, 0, 0, 1, 3)
@@ -159,22 +160,22 @@ class GPTSoVITSGUI(QMainWindow):
        input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2)
        input_layout.addWidget(self.ref_audio_button, 6, 2)

-        input_layout.addWidget(self.language_label, 7, 0)
-        input_layout.addWidget(self.language_combobox, 8, 0, 1, 1)
+        input_layout.addWidget(self.ref_language_label, 7, 0)
+        input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1)

        input_layout.addWidget(self.ref_text_label, 9, 0)
        input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2)
        input_layout.addWidget(self.ref_text_button, 10, 2)

-        input_layout.addWidget(self.language_label_02, 11, 0)
-        input_layout.addWidget(self.language_combobox_02, 12, 0, 1, 1)
+        input_layout.addWidget(self.target_language_label, 11, 0)
+        input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1)

        input_layout.addWidget(self.target_text_label, 13, 0)
        input_layout.addWidget(self.target_text_input, 14, 0, 1, 2)
        input_layout.addWidget(self.target_text_button, 14, 2)

        input_layout.addWidget(self.output_label, 15, 0)
        input_layout.addWidget(self.output_input, 16, 0, 1, 2)
        input_layout.addWidget(self.output_button, 16, 2)

        main_layout.addLayout(input_layout)

        output_layout = QVBoxLayout()
@@ -198,10 +199,8 @@ class GPTSoVITSGUI(QMainWindow):
    def dropEvent(self, event):
        if event.mimeData().hasUrls():
            file_paths = [url.toLocalFile() for url in event.mimeData().urls()]
            if len(file_paths) == 1:
                self.update_ref_audio(file_paths[0])
-                self.update_input_paths(self.ref_audio_input, file_paths[0])
            else:
                self.update_ref_audio(", ".join(file_paths))
@@ -211,23 +210,13 @@ class GPTSoVITSGUI(QMainWindow):
            widget.installEventFilter(self)

    def eventFilter(self, obj, event):
-        if event.type() == QEvent.DragEnter:
+        if event.type() in (QEvent.DragEnter, QEvent.Drop):
            mime_data = event.mimeData()
            if mime_data.hasUrls():
                event.acceptProposedAction()
-        elif event.type() == QEvent.Drop:
-            mime_data = event.mimeData()
-            if mime_data.hasUrls():
-                file_paths = [url.toLocalFile() for url in mime_data.urls()]
-                if len(file_paths) == 1:
-                    self.update_input_paths(obj, file_paths[0])
-                else:
-                    self.update_input_paths(obj, ", ".join(file_paths))
-                event.acceptProposedAction()

        return super().eventFilter(obj, event)

    def select_GPT_model(self):
        file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)")
        if file_path:
@@ -239,24 +228,9 @@ class GPTSoVITSGUI(QMainWindow):
            self.SoVITS_model_input.setText(file_path)

    def select_ref_audio(self):
-        options = QFileDialog.Options()
-        options |= QFileDialog.DontUseNativeDialog
-        options |= QFileDialog.ShowDirsOnly
-
-        file_dialog = QFileDialog()
-        file_dialog.setOptions(options)
-        file_dialog.setFileMode(QFileDialog.AnyFile)
-        file_dialog.setNameFilter("Audio Files (*.wav *.mp3)")
-
-        if file_dialog.exec_():
-            file_paths = file_dialog.selectedFiles()
-            if len(file_paths) == 1:
-                self.update_ref_audio(file_paths[0])
-                self.update_input_paths(self.ref_audio_input, file_paths[0])
-            else:
-                self.update_ref_audio(", ".join(file_paths))
+        file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)")
+        if file_path:
+            self.update_ref_audio(file_path)

    def upload_ref_text(self):
        file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
@@ -264,7 +238,6 @@ class GPTSoVITSGUI(QMainWindow):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                self.ref_text_input.setText(content)
-                self.update_input_paths(self.ref_text_input, file_path)

    def upload_target_text(self):
        file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
@@ -272,7 +245,6 @@ class GPTSoVITSGUI(QMainWindow):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                self.target_text_input.setText(content)
-                self.update_input_paths(self.target_text_input, file_path)

    def select_output_path(self):
        options = QFileDialog.Options()
@@ -290,9 +262,6 @@ class GPTSoVITSGUI(QMainWindow):
    def update_ref_audio(self, file_path):
        self.ref_audio_input.setText(file_path)

-    def update_input_paths(self, input_box, file_path):
-        input_box.setText(file_path)

    def clear_output(self):
        self.output_text.clear()
@@ -300,23 +269,27 @@ class GPTSoVITSGUI(QMainWindow):
        GPT_model_path = self.GPT_model_input.text()
        SoVITS_model_path = self.SoVITS_model_input.text()
        ref_audio_path = self.ref_audio_input.text()
-        language_combobox = self.language_combobox.currentText()
+        language_combobox = self.ref_language_combobox.currentText()
        language_combobox = i18n(language_combobox)
        ref_text = self.ref_text_input.text()
-        language_combobox_02 = self.language_combobox_02.currentText()
-        language_combobox_02 = i18n(language_combobox_02)
+        target_language_combobox = self.target_language_combobox.currentText()
+        target_language_combobox = i18n(target_language_combobox)
        target_text = self.target_text_input.text()
        output_path = self.output_input.text()

-        change_gpt_weights(gpt_path=GPT_model_path)
-        change_sovits_weights(sovits_path=SoVITS_model_path)
+        if GPT_model_path != self.GPT_Path:
+            change_gpt_weights(gpt_path=GPT_model_path)
+            self.GPT_Path = GPT_model_path
+        if SoVITS_model_path != self.SoVITS_Path:
+            change_sovits_weights(sovits_path=SoVITS_model_path)
+            self.SoVITS_Path = SoVITS_model_path

        synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                       prompt_text=ref_text,
                                       prompt_language=language_combobox,
                                       text=target_text,
-                                       text_language=language_combobox_02)
+                                       text_language=target_language_combobox)

        result_list = list(synthesis_result)

        if result_list:
@@ -329,12 +302,9 @@ class GPTSoVITSGUI(QMainWindow):
        self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000)
        self.output_text.append("处理结果:\n" + result)

-def main():
+if __name__ == '__main__':
    app = QApplication(sys.argv)
    mainWin = GPTSoVITSGUI()
    mainWin.show()
    sys.exit(app.exec_())
-
-if __name__ == '__main__':
-    main()

View File

@@ -50,6 +50,7 @@ is_share = eval(is_share)
if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
+punctuation = set(['!', '?', '…', ',', '.', '-', " "])
import gradio as gr
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
@@ -64,7 +65,7 @@ from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from time import time as ttime
from module.mel_processing import spectrogram_torch
-from my_utils import load_audio
+from tools.my_utils import load_audio
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()
@@ -322,6 +323,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
        if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
        print(i18n("实际输入的参考文本:"), prompt_text)
    text = text.strip("\n")
+    text = replace_consecutive_punctuation(text)
    if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
    print(i18n("实际输入的目标文本:"), text)
@@ -366,6 +368,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
    text = text.replace("\n\n", "\n")
    print(i18n("实际输入的目标文本(切句后):"), text)
    texts = text.split("\n")
+    texts = process_text(texts)
    texts = merge_short_text_in_array(texts, 5)
    audio_opt = []
    if not ref_free:
@@ -463,6 +466,7 @@ def cut1(inp):
            opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
    else:
        opts = [inp]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)
@@ -487,17 +491,21 @@ def cut2(inp):
    if len(opts) > 1 and len(opts[-1]) < 50:  ## if the last segment is too short, merge it with the previous one
        opts[-2] = opts[-2] + opts[-1]
        opts = opts[:-1]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)


def cut3(inp):
    inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip("。").split("。")])
+    opts = ["%s" % item for item in inp.strip("。").split("。")]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)


def cut4(inp):
    inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
+    opts = ["%s" % item for item in inp.strip(".").split(".")]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)


# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
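The issubset filter added to each cut function drops list entries that consist entirely of punctuation, which would otherwise become empty synthesis segments. A standalone illustration of the idiom (not part of the patch):

punctuation = set(['!', '?', '…', ',', '.', '-', " "])

segments = ["Hello world.", "...", "?!", "第二句。"]
segments = [item for item in segments if not set(item).issubset(punctuation)]
print(segments)  # ['Hello world.', '第二句。'] (punctuation-only entries are gone)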
@@ -511,8 +519,8 @@ def cut5(inp):
    # keep the text intact when the sentence has no punctuation or ends without one
    if len(items)%2 == 1:
        mergeitems.append(items[-1])
-    opt = "\n".join(mergeitems)
-    return opt
+    opt = [item for item in mergeitems if not set(item).issubset(punctuation)]
+    return "\n".join(opt)


def custom_sort_key(s):
@@ -522,6 +530,24 @@ def custom_sort_key(s):
    parts = [int(part) if part.isdigit() else part for part in parts]
    return parts


+def process_text(texts):
+    _text = []
+    if all(text in [None, " ", "\n", ""] for text in texts):
+        raise ValueError(i18n("请输入有效文本"))
+    for text in texts:
+        if text in [None, " ", ""]:
+            pass
+        else:
+            _text.append(text)
+    return _text
+
+
+def replace_consecutive_punctuation(text):
+    punctuations = ''.join(re.escape(p) for p in punctuation)
+    pattern = f'([{punctuations}])([{punctuations}])+'
+    result = re.sub(pattern, r'\1', text)
+    return result


def change_choices():
    SoVITS_names, GPT_names = get_weights_names()
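replace_consecutive_punctuation collapses any run of marks from the punctuation set down to its first character. A self-contained illustration (the function body is copied from the hunk above):

import re

punctuation = set(['!', '?', '…', ',', '.', '-', " "])

def replace_consecutive_punctuation(text):
    punctuations = ''.join(re.escape(p) for p in punctuation)
    pattern = f'([{punctuations}])([{punctuations}])+'
    return re.sub(pattern, r'\1', text)

print(replace_consecutive_punctuation("Really!!??"))  # prints "Really!"

Note that the space character is part of the set, so a sequence such as "! " collapses to "!" as well.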
@@ -613,10 +639,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
        button5.click(cut5, [text_inp], [text_opt])
        gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))

-app.queue(concurrency_count=511, max_size=1022).launch(
-    server_name="0.0.0.0",
-    inbrowser=True,
-    share=is_share,
-    server_port=infer_ttswebui,
-    quiet=True,
-)
+if __name__ == '__main__':
+    app.queue(concurrency_count=511, max_size=1022).launch(
+        server_name="0.0.0.0",
+        inbrowser=True,
+        share=is_share,
+        server_port=infer_ttswebui,
+        quiet=True,
+    )

View File

@@ -17,7 +17,7 @@ from functools import lru_cache
import requests
from scipy.io import wavfile
from io import BytesIO
-from my_utils import load_audio
+from tools.my_utils import load_audio

# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
class TextAudioSpeakerLoader(torch.utils.data.Dataset):

View File

@@ -1,21 +0,0 @@
import ffmpeg
import numpy as np


def load_audio(file, sr):
    try:
        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        file = (
            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        )  # guard against novice users copying paths with stray spaces, quotes, or newlines
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load audio: {e}")

    return np.frombuffer(out, np.float32).flatten()
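This helper now lives at tools/my_utils.py, which is what the import rewrites throughout this commit point to; its behavior is unchanged. A usage sketch, assuming the ffmpeg CLI is on PATH and sample.wav exists:

from tools.my_utils import load_audio

# Decodes any ffmpeg-readable file to a mono float32 numpy array at the requested rate.
audio = load_audio("sample.wav", 32000)
print(audio.shape, audio.dtype)  # e.g. (160000,) float32 for a 5-second clip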

View File

@@ -9,7 +9,7 @@ cnhubert.cnhubert_base_path=cnhubert_base_path
ssl_model = cnhubert.get_model()
from text import cleaned_text_to_sequence
import soundfile
-from my_utils import load_audio
+from tools.my_utils import load_audio
import os
import json

View File

@@ -17,7 +17,7 @@ from scipy.io import wavfile
import librosa,torch
now_dir = os.getcwd()
sys.path.append(now_dir)
-from my_utils import load_audio
+from tools.my_utils import load_audio

# from config import cnhubert_base_path
# cnhubert.cnhubert_base_path=cnhubert_base_path

View File

@@ -79,15 +79,17 @@ class my_model_ckpt(ModelCheckpoint):
                to_save_od["config"] = self.config
                to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1)
                # torch.save(
-                my_save(
-                    to_save_od,
-                    "%s/%s-e%s.ckpt"
-                    % (
-                        self.half_weights_save_dir,
-                        self.exp_name,
-                        trainer.current_epoch + 1,
-                    ),
-                )
+                # print(os.environ)
+                if(os.environ.get("LOCAL_RANK","0")=="0"):
+                    my_save(
+                        to_save_od,
+                        "%s/%s-e%s.ckpt"
+                        % (
+                            self.half_weights_save_dir,
+                            self.exp_name,
+                            trainer.current_epoch + 1,
+                        ),
+                    )
        self._save_last_checkpoint(trainer, monitor_candidates)
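Under a distributed launcher every worker process runs this checkpoint hook, so without the guard each rank would write the same .ckpt file concurrently. The pattern in isolation, as a sketch with a stand-in save routine:

import os

def save_checkpoint():
    # stand-in for the my_save(...) call above
    print("saving checkpoint")

# LOCAL_RANK is set by torch.distributed launchers; single-process runs
# fall back to "0", so a non-distributed run still saves.
if os.environ.get("LOCAL_RANK", "0") == "0":
    save_checkpoint()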

api.py
View File

@@ -143,7 +143,7 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from module.mel_processing import spectrogram_torch
-from my_utils import load_audio
+from tools.my_utils import load_audio
import config as global_config
import logging
import subprocess
@@ -339,8 +339,46 @@ def pack_audio(audio_bytes, data, rate):
def pack_ogg(audio_bytes, data, rate):
-    with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
-        audio_file.write(data)
+    # Author: AkagawaTsurunaki
+    # Issue:
+    #   Stack overflow probabilistically occurs
+    #   when the function `sf_writef_short` of `libsndfile_64bit.dll` is called
+    #   using the Python library `soundfile`
+    # Note:
+    #   This is an issue related to `libsndfile`, not this project itself.
+    #   It happens when you generate a large audio tensor (about 499804 frames in my PC)
+    #   and try to convert it to an ogg file.
+    # Related:
+    #   https://github.com/RVC-Boss/GPT-SoVITS/issues/1199
+    #   https://github.com/libsndfile/libsndfile/issues/1023
+    #   https://github.com/bastibe/python-soundfile/issues/396
+    # Suggestion:
+    #   Or split the whole audio data into smaller audio segments to avoid stack overflow?
+
+    def handle_pack_ogg():
+        with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+            audio_file.write(data)
+
+    import threading
+    # See: https://docs.python.org/3/library/threading.html
+    # The stack size of this thread is at least 32768.
+    # If a stack overflow error still occurs, just modify the `stack_size`.
+    # stack_size = n * 4096, where n should be a positive integer.
+    # Here we chose n = 4096.
+    stack_size = 4096 * 4096
+    try:
+        threading.stack_size(stack_size)
+        pack_ogg_thread = threading.Thread(target=handle_pack_ogg)
+        pack_ogg_thread.start()
+        pack_ogg_thread.join()
+    except RuntimeError as e:
+        # If changing the thread stack size is unsupported, a RuntimeError is raised.
+        print("RuntimeError: {}".format(e))
+        print("Changing the thread stack size is unsupported.")
+    except ValueError as e:
+        # If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
+        print("ValueError: {}".format(e))
+        print("The specified stack size is invalid.")

    return audio_bytes
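The "Suggestion" in the comment, writing the audio in smaller slices instead of enlarging the thread stack, could look roughly like the sketch below. This is a hypothetical alternative, not what the patch does, and the chunk size is arbitrary:

import soundfile as sf

def pack_ogg_chunked(audio_bytes, data, rate, chunk_frames=65536):
    # Feed libsndfile fixed-size slices so it never sees one huge buffer at once.
    with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
        for start in range(0, len(data), chunk_frames):
            audio_file.write(data[start:start + chunk_frames])
    return audio_bytes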

View File

@@ -169,6 +169,21 @@
6-Fixed a HuBERT-extraction bug in the stage that automatically falls back from NaN to fp32

+### 20240610
+
+Minor fixes:
+
+1-Improved the input-validation logic for punctuation-only and multi-punctuation text https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169
+2-Fixed the cmd format of MDXNet de-reverb in uvr5 to be compatible with paths containing spaces [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
+3-Fixed the progress-bar logic of s2 training https://github.com/RVC-Boss/GPT-SoVITS/pull/1159
+
+Major fixes:
+
+4-Fixed the issue where GPT Chinese fine-tuning in the WebUI did not load BERT features, making training inconsistent with inference; heavy fine-tuning could even degrade quality. If you fine-tuned on a large dataset, re-fine-tuning is recommended to regain quality [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)

todolist

1-Optimize inference for Chinese polyphonic characters (testers welcome; please post your results in the PR comment section) https://github.com/RVC-Boss/GPT-SoVITS/pull/488
@@ -177,3 +192,5 @@ todolist
2-Trying to solve the problem of low-quality reference audio leading to poor output quality; will give it another try in v2 and release it if solved, tentatively after the college entrance exams.

View File

@@ -1,4 +1,4 @@
-numpy
+numpy==1.23.4
scipy
tensorboard
librosa==0.9.2

View File

@@ -3,7 +3,7 @@ import traceback
from scipy.io import wavfile

# parent_directory = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(parent_directory)
-from my_utils import load_audio
+from tools.my_utils import load_audio
from slicer2 import Slicer


def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):

View File

@@ -220,7 +220,7 @@ class Predictor:
        opt_path_other = path_other[:-4] + ".%s" % format
        if os.path.exists(path_vocal):
            os.system(
-                "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal)
+                "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)
            )
            if os.path.exists(opt_path_vocal):
                try:
@@ -229,7 +229,7 @@ class Predictor:
                    pass
        if os.path.exists(path_other):
            os.system(
-                "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other)
+                "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)
            )
            if os.path.exists(opt_path_other):
                try:
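Quoting the interpolated paths keeps POSIX shells from splitting them on spaces, although single quotes are not honored by Windows cmd.exe and a path containing a single quote would still break. A shell-free alternative (sketch only; the paths below are placeholders for the variables used above) passes an argument list, so no quoting is needed:

import subprocess

path_vocal = "input vocal.wav"       # placeholder; spaces are harmless here
opt_path_vocal = "output vocal.mp3"  # placeholder

# Argument lists bypass the shell entirely, unlike os.system with a format string.
subprocess.run(
    ["ffmpeg", "-i", path_vocal, "-vn", opt_path_vocal, "-q:a", "2", "-y"],
    check=True,
)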

View File

@@ -85,7 +85,7 @@ if if_gpu_ok and len(gpu_infos) > 0:
else:
    gpu_info = ("%s\t%s" % ("0", "CPU"))
    gpu_infos.append("%s\t%s" % ("0", "CPU"))
-default_batch_size = psutil.virtual_memory().total / 1024 / 1024 / 1024 / 2
+default_batch_size = int(psutil.virtual_memory().total / 1024 / 1024 / 1024 / 2)
gpus = "-".join([i[0] for i in gpu_infos])

pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"