mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-10-07 15:19:59 +08:00)

Merge remote-tracking branch 'upstream/main' into DS_STORE
commit 7a35e4b482

GPT_SoVITS/inference_cli.py (new file, 55 lines)
@@ -0,0 +1,55 @@
import argparse
import os
import soundfile as sf

from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav

i18n = I18nAuto()


def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path):
    # Read reference text
    with open(ref_text_path, 'r', encoding='utf-8') as file:
        ref_text = file.read()

    # Read target text
    with open(target_text_path, 'r', encoding='utf-8') as file:
        target_text = file.read()

    # Change model weights
    change_gpt_weights(gpt_path=GPT_model_path)
    change_sovits_weights(sovits_path=SoVITS_model_path)

    # Synthesize audio
    synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                   prompt_text=ref_text,
                                   prompt_language=i18n(ref_language),
                                   text=target_text,
                                   text_language=i18n(target_language), top_p=1, temperature=1)

    result_list = list(synthesis_result)

    if result_list:
        last_sampling_rate, last_audio_data = result_list[-1]
        output_wav_path = os.path.join(output_path, "output.wav")
        sf.write(output_wav_path, last_audio_data, last_sampling_rate)
        print(f"Audio saved to {output_wav_path}")


def main():
    parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
    parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file")
    parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file")
    parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file")
    parser.add_argument('--ref_text', required=True, help="Path to the reference text file")
    parser.add_argument('--ref_language', required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio")
    parser.add_argument('--target_text', required=True, help="Path to the target text file")
    parser.add_argument('--target_language', required=True, choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], help="Language of the target text")
    parser.add_argument('--output_path', required=True, help="Path to the output directory")

    args = parser.parse_args()

    synthesize(args.gpt_model, args.sovits_model, args.ref_audio, args.ref_text, args.ref_language, args.target_text, args.target_language, args.output_path)


if __name__ == '__main__':
    main()
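The CLI reuses the same entry points as the WebUI, so the pipeline can also be driven programmatically. A minimal sketch: every path below is a placeholder, not a file shipped with the repo.

    from GPT_SoVITS.inference_cli import synthesize

    synthesize(
        GPT_model_path="pretrained/gpt.ckpt",       # placeholder checkpoint paths
        SoVITS_model_path="pretrained/sovits.pth",
        ref_audio_path="samples/ref.wav",           # short reference clip
        ref_text_path="samples/ref.txt",            # transcript of the reference clip
        ref_language="中文",
        target_text_path="samples/target.txt",      # text to synthesize
        target_language="中英混合",
        output_path="output",                       # output.wav is written here
    )

This is exactly the call that `python GPT_SoVITS/inference_cli.py --gpt_model ... --sovits_model ...` performs after argument parsing.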
@@ -1,3 +1,4 @@
+import os
 import sys
 from PyQt5.QtCore import QEvent
 from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit
@@ -7,16 +8,16 @@ import soundfile as sf
 from tools.i18n.i18n import I18nAuto
 i18n = I18nAuto()

-from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
+from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav


 class GPTSoVITSGUI(QMainWindow):
+    GPT_Path = gpt_path
+    SoVITS_Path = sovits_path
+
     def __init__(self):
         super().__init__()

-        self.init_ui()
-
-    def init_ui(self):
         self.setWindowTitle('GPT-SoVITS GUI')
         self.setGeometry(800, 450, 950, 850)
@@ -71,6 +72,7 @@ class GPTSoVITSGUI(QMainWindow):
         self.GPT_model_label = QLabel("选择GPT模型:")
         self.GPT_model_input = QLineEdit()
         self.GPT_model_input.setPlaceholderText("拖拽或选择文件")
+        self.GPT_model_input.setText(self.GPT_Path)
         self.GPT_model_input.setReadOnly(True)
         self.GPT_model_button = QPushButton("选择GPT模型文件")
         self.GPT_model_button.clicked.connect(self.select_GPT_model)
@@ -78,6 +80,7 @@ class GPTSoVITSGUI(QMainWindow):
         self.SoVITS_model_label = QLabel("选择SoVITS模型:")
         self.SoVITS_model_input = QLineEdit()
         self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件")
+        self.SoVITS_model_input.setText(self.SoVITS_Path)
         self.SoVITS_model_input.setReadOnly(True)
         self.SoVITS_model_button = QPushButton("选择SoVITS模型文件")
         self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model)
@@ -91,25 +94,25 @@ class GPTSoVITSGUI(QMainWindow):

         self.ref_text_label = QLabel("参考音频文本:")
         self.ref_text_input = QLineEdit()
-        self.ref_text_input.setPlaceholderText("拖拽或选择文件")
-        self.ref_text_input.setReadOnly(True)
+        self.ref_text_input.setPlaceholderText("直接输入文字或上传文本")
         self.ref_text_button = QPushButton("上传文本")
         self.ref_text_button.clicked.connect(self.upload_ref_text)

-        self.language_label = QLabel("参考音频语言:")
-        self.language_combobox = QComboBox()
-        self.language_combobox.addItems(["中文", "英文", "日文"])
+        self.ref_language_label = QLabel("参考音频语言:")
+        self.ref_language_combobox = QComboBox()
+        self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"])
+        self.ref_language_combobox.setCurrentText("多语种混合")

         self.target_text_label = QLabel("合成目标文本:")
         self.target_text_input = QLineEdit()
-        self.target_text_input.setPlaceholderText("拖拽或选择文件")
-        self.target_text_input.setReadOnly(True)
+        self.target_text_input.setPlaceholderText("直接输入文字或上传文本")
         self.target_text_button = QPushButton("上传文本")
         self.target_text_button.clicked.connect(self.upload_target_text)

-        self.language_label_02 = QLabel("合成音频语言:")
-        self.language_combobox_02 = QComboBox()
-        self.language_combobox_02.addItems(["中文", "英文", "日文"])
+        self.target_language_label = QLabel("合成音频语言:")
+        self.target_language_combobox = QComboBox()
+        self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"])
+        self.target_language_combobox.setCurrentText("多语种混合")

         self.output_label = QLabel("输出音频路径:")
         self.output_input = QLineEdit()
@@ -140,10 +143,8 @@ class GPTSoVITSGUI(QMainWindow):

         main_layout = QVBoxLayout()

-        input_layout = QGridLayout()
+        input_layout = QGridLayout(self)
         input_layout.setSpacing(10)

-        self.setLayout(input_layout)
-
         input_layout.addWidget(license_label, 0, 0, 1, 3)

@@ -159,22 +160,22 @@ class GPTSoVITSGUI(QMainWindow):
         input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2)
         input_layout.addWidget(self.ref_audio_button, 6, 2)

-        input_layout.addWidget(self.language_label, 7, 0)
-        input_layout.addWidget(self.language_combobox, 8, 0, 1, 1)
+        input_layout.addWidget(self.ref_language_label, 7, 0)
+        input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1)
         input_layout.addWidget(self.ref_text_label, 9, 0)
         input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2)
         input_layout.addWidget(self.ref_text_button, 10, 2)

-        input_layout.addWidget(self.language_label_02, 11, 0)
-        input_layout.addWidget(self.language_combobox_02, 12, 0, 1, 1)
+        input_layout.addWidget(self.target_language_label, 11, 0)
+        input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1)
         input_layout.addWidget(self.target_text_label, 13, 0)
         input_layout.addWidget(self.target_text_input, 14, 0, 1, 2)
         input_layout.addWidget(self.target_text_button, 14, 2)

         input_layout.addWidget(self.output_label, 15, 0)
         input_layout.addWidget(self.output_input, 16, 0, 1, 2)
         input_layout.addWidget(self.output_button, 16, 2)

         main_layout.addLayout(input_layout)

         output_layout = QVBoxLayout()
@@ -198,10 +199,8 @@ class GPTSoVITSGUI(QMainWindow):
     def dropEvent(self, event):
         if event.mimeData().hasUrls():
             file_paths = [url.toLocalFile() for url in event.mimeData().urls()]

             if len(file_paths) == 1:
                 self.update_ref_audio(file_paths[0])
-                self.update_input_paths(self.ref_audio_input, file_paths[0])
             else:
                 self.update_ref_audio(", ".join(file_paths))

@@ -211,23 +210,13 @@ class GPTSoVITSGUI(QMainWindow):
             widget.installEventFilter(self)

     def eventFilter(self, obj, event):
-        if event.type() == QEvent.DragEnter:
+        if event.type() in (QEvent.DragEnter, QEvent.Drop):
             mime_data = event.mimeData()
             if mime_data.hasUrls():
                 event.acceptProposedAction()

-        elif event.type() == QEvent.Drop:
-            mime_data = event.mimeData()
-            if mime_data.hasUrls():
-                file_paths = [url.toLocalFile() for url in mime_data.urls()]
-                if len(file_paths) == 1:
-                    self.update_input_paths(obj, file_paths[0])
-                else:
-                    self.update_input_paths(obj, ", ".join(file_paths))
-                event.acceptProposedAction()
-
         return super().eventFilter(obj, event)

     def select_GPT_model(self):
         file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)")
         if file_path:
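The two near-duplicate branches collapse into one membership test: drag-enter and drop are now accepted under the same URL check, and per-widget path updates no longer happen inside the filter. The merged condition in isolation, as a sketch assuming PyQt5:

    from PyQt5.QtCore import QEvent

    def should_accept(event):
        # Both drag-enter and drop are accepted iff the payload carries URLs.
        return event.type() in (QEvent.DragEnter, QEvent.Drop) and event.mimeData().hasUrls()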
@@ -239,24 +228,9 @@ class GPTSoVITSGUI(QMainWindow):
             self.SoVITS_model_input.setText(file_path)

     def select_ref_audio(self):
-        options = QFileDialog.Options()
-        options |= QFileDialog.DontUseNativeDialog
-        options |= QFileDialog.ShowDirsOnly
-
-        file_dialog = QFileDialog()
-        file_dialog.setOptions(options)
-
-        file_dialog.setFileMode(QFileDialog.AnyFile)
-        file_dialog.setNameFilter("Audio Files (*.wav *.mp3)")
-
-        if file_dialog.exec_():
-            file_paths = file_dialog.selectedFiles()
-
-            if len(file_paths) == 1:
-                self.update_ref_audio(file_paths[0])
-                self.update_input_paths(self.ref_audio_input, file_paths[0])
-            else:
-                self.update_ref_audio(", ".join(file_paths))
+        file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)")
+        if file_path:
+            self.update_ref_audio(file_path)

     def upload_ref_text(self):
         file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
@@ -264,7 +238,6 @@ class GPTSoVITSGUI(QMainWindow):
             with open(file_path, 'r', encoding='utf-8') as file:
                 content = file.read()
                 self.ref_text_input.setText(content)
-                self.update_input_paths(self.ref_text_input, file_path)

     def upload_target_text(self):
         file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
@@ -272,7 +245,6 @@ class GPTSoVITSGUI(QMainWindow):
             with open(file_path, 'r', encoding='utf-8') as file:
                 content = file.read()
                 self.target_text_input.setText(content)
-                self.update_input_paths(self.target_text_input, file_path)

     def select_output_path(self):
         options = QFileDialog.Options()
@@ -290,9 +262,6 @@ class GPTSoVITSGUI(QMainWindow):
     def update_ref_audio(self, file_path):
         self.ref_audio_input.setText(file_path)

-    def update_input_paths(self, input_box, file_path):
-        input_box.setText(file_path)
-
     def clear_output(self):
         self.output_text.clear()

@@ -300,23 +269,27 @@ class GPTSoVITSGUI(QMainWindow):
         GPT_model_path = self.GPT_model_input.text()
         SoVITS_model_path = self.SoVITS_model_input.text()
         ref_audio_path = self.ref_audio_input.text()
-        language_combobox = self.language_combobox.currentText()
+        language_combobox = self.ref_language_combobox.currentText()
         language_combobox = i18n(language_combobox)
         ref_text = self.ref_text_input.text()
-        language_combobox_02 = self.language_combobox_02.currentText()
-        language_combobox_02 = i18n(language_combobox_02)
+        target_language_combobox = self.target_language_combobox.currentText()
+        target_language_combobox = i18n(target_language_combobox)
         target_text = self.target_text_input.text()
         output_path = self.output_input.text()

-        change_gpt_weights(gpt_path=GPT_model_path)
-        change_sovits_weights(sovits_path=SoVITS_model_path)
+        if GPT_model_path != self.GPT_Path:
+            change_gpt_weights(gpt_path=GPT_model_path)
+            self.GPT_Path = GPT_model_path
+        if SoVITS_model_path != self.SoVITS_Path:
+            change_sovits_weights(sovits_path=SoVITS_model_path)
+            self.SoVITS_Path = SoVITS_model_path

         synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path,
                                        prompt_text=ref_text,
                                        prompt_language=language_combobox,
                                        text=target_text,
-                                       text_language=language_combobox_02)
+                                       text_language=target_language_combobox)

         result_list = list(synthesis_result)

         if result_list:
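Tracking GPT_Path and SoVITS_Path turns repeated synthesis runs with the same models into no-ops for weight loading. The guard pattern in isolation, as a sketch with a hypothetical helper name, assuming change_gpt_weights imported as above:

    _loaded_gpt = None

    def ensure_gpt_loaded(path):
        global _loaded_gpt
        if path != _loaded_gpt:                 # skip the expensive reload if unchanged
            change_gpt_weights(gpt_path=path)
            _loaded_gpt = path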
@@ -329,12 +302,9 @@ class GPTSoVITSGUI(QMainWindow):
         self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000)
         self.output_text.append("处理结果:\n" + result)

-def main():
+if __name__ == '__main__':
     app = QApplication(sys.argv)
     mainWin = GPTSoVITSGUI()
     mainWin.show()
     sys.exit(app.exec_())
-
-
-if __name__ == '__main__':
-    main()
@@ -50,6 +50,7 @@ is_share = eval(is_share)
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
     os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
+punctuation = set(['!', '?', '…', ',', '.', '-', " "])
 import gradio as gr
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 import numpy as np
@@ -64,7 +65,7 @@ from text import cleaned_text_to_sequence
 from text.cleaner import clean_text
 from time import time as ttime
 from module.mel_processing import spectrogram_torch
-from my_utils import load_audio
+from tools.my_utils import load_audio
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
@@ -322,6 +323,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
     print(i18n("实际输入的参考文本:"), prompt_text)
     text = text.strip("\n")
+    text = replace_consecutive_punctuation(text)
     if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text

     print(i18n("实际输入的目标文本:"), text)
@@ -366,6 +368,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     text = text.replace("\n\n", "\n")
     print(i18n("实际输入的目标文本(切句后):"), text)
     texts = text.split("\n")
+    texts = process_text(texts)
     texts = merge_short_text_in_array(texts, 5)
     audio_opt = []
     if not ref_free:
@@ -463,6 +466,7 @@ def cut1(inp):
         opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
     else:
         opts = [inp]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
     return "\n".join(opts)

@@ -487,17 +491,21 @@ def cut2(inp):
     if len(opts) > 1 and len(opts[-1]) < 50:  ## if the last segment is too short, merge it with the previous one
         opts[-2] = opts[-2] + opts[-1]
         opts = opts[:-1]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
     return "\n".join(opts)


 def cut3(inp):
     inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip("。").split("。")])
+    opts = ["%s" % item for item in inp.strip("。").split("。")]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)

 def cut4(inp):
     inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
+    opts = ["%s" % item for item in inp.strip(".").split(".")]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)


 # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
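Each cut function now drops segments consisting only of punctuation, which previously reached synthesis as empty content. A quick worked example of the issubset test, using the punctuation set added earlier in this commit:

    punctuation = set(['!', '?', '…', ',', '.', '-', " "])

    opts = ["hello world", "...", "!?", "ok."]
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    print(opts)  # ['hello world', 'ok.']; punctuation-only segments are gone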
@@ -511,8 +519,8 @@ def cut5(inp):
     # keep the text intact when the sentence has no punctuation, or none at the end
     if len(items)%2 == 1:
         mergeitems.append(items[-1])
-    opt = "\n".join(mergeitems)
-    return opt
+    opt = [item for item in mergeitems if not set(item).issubset(punctuation)]
+    return "\n".join(opt)


 def custom_sort_key(s):
@@ -522,6 +530,24 @@ def custom_sort_key(s):
     parts = [int(part) if part.isdigit() else part for part in parts]
     return parts

+def process_text(texts):
+    _text=[]
+    if all(text in [None, " ", "\n",""] for text in texts):
+        raise ValueError(i18n("请输入有效文本"))
+    for text in texts:
+        if text in [None, " ", ""]:
+            pass
+        else:
+            _text.append(text)
+    return _text
+
+
+def replace_consecutive_punctuation(text):
+    punctuations = ''.join(re.escape(p) for p in punctuation)
+    pattern = f'([{punctuations}])([{punctuations}])+'
+    result = re.sub(pattern, r'\1', text)
+    return result
+

 def change_choices():
     SoVITS_names, GPT_names = get_weights_names()
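replace_consecutive_punctuation collapses any run of punctuation down to its first character. A small self-contained check of the regex (same construction as above; note the space is part of the set, so ", " collapses as well):

    import re

    punctuation = set(['!', '?', '…', ',', '.', '-', " "])
    punctuations = ''.join(re.escape(p) for p in punctuation)
    pattern = f'([{punctuations}])([{punctuations}])+'

    print(re.sub(pattern, r'\1', "Wait...what?!"))  # prints: Wait.what?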
@@ -613,10 +639,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
     button5.click(cut5, [text_inp], [text_opt])
     gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))

-app.queue(concurrency_count=511, max_size=1022).launch(
-    server_name="0.0.0.0",
-    inbrowser=True,
-    share=is_share,
-    server_port=infer_ttswebui,
-    quiet=True,
-)
+if __name__ == '__main__':
+    app.queue(concurrency_count=511, max_size=1022).launch(
+        server_name="0.0.0.0",
+        inbrowser=True,
+        share=is_share,
+        server_port=infer_ttswebui,
+        quiet=True,
+    )
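With the launch call under a __main__ guard, the new CLI and the GUI can import this module for its functions without starting the Gradio server as an import side effect (model initialization at import time still happens). A sketch of the import this enables:

    # Safe now: app.queue(...).launch() only runs when the file is executed directly.
    from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav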
@@ -17,7 +17,7 @@ from functools import lru_cache
 import requests
 from scipy.io import wavfile
 from io import BytesIO
-from my_utils import load_audio
+from tools.my_utils import load_audio

 # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
 class TextAudioSpeakerLoader(torch.utils.data.Dataset):
@@ -1,21 +0,0 @@
-import ffmpeg
-import numpy as np
-
-
-def load_audio(file, sr):
-    try:
-        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        file = (
-            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
-        )  # guard against users pasting paths with stray spaces, quotes, or newlines
-        out, _ = (
-            ffmpeg.input(file, threads=0)
-            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
-        )
-    except Exception as e:
-        raise RuntimeError(f"Failed to load audio: {e}")
-
-    return np.frombuffer(out, np.float32).flatten()
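The module is deleted here because it moved to tools/my_utils.py, which the import rewrites throughout this commit point at; the function itself is unchanged. Usage sketch with a placeholder path, assuming a 16 kHz pipeline:

    from tools.my_utils import load_audio

    wav = load_audio("samples/ref.wav", 16000)  # mono float32 ndarray, resampled by ffmpeg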
@@ -9,7 +9,7 @@ cnhubert.cnhubert_base_path=cnhubert_base_path
 ssl_model = cnhubert.get_model()
 from text import cleaned_text_to_sequence
 import soundfile
-from my_utils import load_audio
+from tools.my_utils import load_audio
 import os
 import json

@@ -17,7 +17,7 @@ from scipy.io import wavfile
 import librosa,torch
 now_dir = os.getcwd()
 sys.path.append(now_dir)
-from my_utils import load_audio
+from tools.my_utils import load_audio

 # from config import cnhubert_base_path
 # cnhubert.cnhubert_base_path=cnhubert_base_path
@@ -79,15 +79,17 @@ class my_model_ckpt(ModelCheckpoint):
                     to_save_od["config"] = self.config
                     to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1)
                     # torch.save(
-                    my_save(
-                        to_save_od,
-                        "%s/%s-e%s.ckpt"
-                        % (
-                            self.half_weights_save_dir,
-                            self.exp_name,
-                            trainer.current_epoch + 1,
-                        ),
-                    )
+                    # print(os.environ)
+                    if(os.environ.get("LOCAL_RANK","0")=="0"):
+                        my_save(
+                            to_save_od,
+                            "%s/%s-e%s.ckpt"
+                            % (
+                                self.half_weights_save_dir,
+                                self.exp_name,
+                                trainer.current_epoch + 1,
+                            ),
+                        )
         self._save_last_checkpoint(trainer, monitor_candidates)
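The LOCAL_RANK guard makes the half-weights export a rank-0-only action, so under multi-GPU DDP training the worker processes do not all race to write the same checkpoint file. The pattern in isolation, as a sketch assuming a launcher that exports LOCAL_RANK per process:

    import os

    def save_on_local_rank_zero(save_fn, *args, **kwargs):
        # Every DDP process runs this; only local rank 0 performs the write.
        if os.environ.get("LOCAL_RANK", "0") == "0":
            save_fn(*args, **kwargs)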
api.py (44 lines changed)
@@ -143,7 +143,7 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from text import cleaned_text_to_sequence
 from text.cleaner import clean_text
 from module.mel_processing import spectrogram_torch
-from my_utils import load_audio
+from tools.my_utils import load_audio
 import config as global_config
 import logging
 import subprocess
@@ -339,8 +339,46 @@ def pack_audio(audio_bytes, data, rate):


 def pack_ogg(audio_bytes, data, rate):
-    with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
-        audio_file.write(data)
+    # Author: AkagawaTsurunaki
+    # Issue:
+    #   Stack overflow probabilistically occurs
+    #   when the function `sf_writef_short` of `libsndfile_64bit.dll` is called
+    #   using the Python library `soundfile`
+    # Note:
+    #   This is an issue related to `libsndfile`, not this project itself.
+    #   It happens when you generate a large audio tensor (about 499804 frames in my PC)
+    #   and try to convert it to an ogg file.
+    # Related:
+    #   https://github.com/RVC-Boss/GPT-SoVITS/issues/1199
+    #   https://github.com/libsndfile/libsndfile/issues/1023
+    #   https://github.com/bastibe/python-soundfile/issues/396
+    # Suggestion:
+    #   Or split the whole audio data into smaller audio segment to avoid stack overflow?
+
+    def handle_pack_ogg():
+        with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+            audio_file.write(data)
+
+    import threading
+    # See: https://docs.python.org/3/library/threading.html
+    # The stack size of this thread is at least 32768
+    # If stack overflow error still occurs, just modify the `stack_size`.
+    # stack_size = n * 4096, where n should be a positive integer.
+    # Here we chose n = 4096.
+    stack_size = 4096 * 4096
+    try:
+        threading.stack_size(stack_size)
+        pack_ogg_thread = threading.Thread(target=handle_pack_ogg)
+        pack_ogg_thread.start()
+        pack_ogg_thread.join()
+    except RuntimeError as e:
+        # If changing the thread stack size is unsupported, a RuntimeError is raised.
+        print("RuntimeError: {}".format(e))
+        print("Changing the thread stack size is unsupported.")
+    except ValueError as e:
+        # If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
+        print("ValueError: {}".format(e))
+        print("The specified stack size is invalid.")
+
     return audio_bytes
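The workaround gives the OGG encode its own thread with an enlarged stack, since threading.stack_size() only applies to threads created after the call and cannot resize the main thread. The generic pattern, as a sketch:

    import threading

    def run_with_big_stack(fn, stack_bytes=4096 * 4096):
        threading.stack_size(stack_bytes)     # affects threads created from here on
        worker = threading.Thread(target=fn)
        worker.start()
        worker.join()                         # waits for completion; does not propagate exceptions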
@@ -169,6 +169,21 @@

 6 - Fixed a HuBERT extraction bug in the automatic NaN-to-fp32 fallback stage

+### 20240610
+
+Minor fixes:
+
+1 - Improved the validation logic for text input that is pure punctuation or contains repeated punctuation https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169
+
+2 - Fixed the cmd format of MDXNet de-reverberation in uvr5 so that paths containing spaces work [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
+
+3 - Fixed the progress-bar logic of s2 training https://github.com/RVC-Boss/GPT-SoVITS/pull/1159
+
+Major fixes:
+
+4 - Fixed the issue where GPT fine-tuning on Chinese in the WebUI did not read the BERT features, making training inconsistent with inference and potentially degrading quality the longer training ran. If you fine-tuned on a large dataset, re-fine-tuning is recommended for a quality gain. [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
+
 todolist:

 1 - Chinese polyphonic-character inference optimization (testers welcome; please post test results in the PR comment section) https://github.com/RVC-Boss/GPT-SoVITS/pull/488

@@ -177,3 +192,5 @@ todolist:
 2 - Working on the issue where low-quality reference audio produces poor output quality; if v2 solves it, it will ship, tentatively some time after the gaokao (college entrance exam)
@@ -1,4 +1,4 @@
-numpy
+numpy==1.23.4
 scipy
 tensorboard
 librosa==0.9.2
@@ -3,7 +3,7 @@ import traceback
 from scipy.io import wavfile
 # parent_directory = os.path.dirname(os.path.abspath(__file__))
 # sys.path.append(parent_directory)
-from my_utils import load_audio
+from tools.my_utils import load_audio
 from slicer2 import Slicer

 def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):
@@ -220,7 +220,7 @@ class Predictor:
         opt_path_other = path_other[:-4] + ".%s" % format
         if os.path.exists(path_vocal):
             os.system(
-                "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal)
+                "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)
             )
             if os.path.exists(opt_path_vocal):
                 try:
@@ -229,7 +229,7 @@ class Predictor:
                     pass
         if os.path.exists(path_other):
             os.system(
-                "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other)
+                "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)
             )
             if os.path.exists(opt_path_other):
                 try:
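Single-quoting the interpolated paths fixes spaces for POSIX shells. A more portable alternative (a sketch of an option, not what this commit does) is to skip the shell entirely and hand ffmpeg an argument list:

    import subprocess

    def extract_audio(src, dst):
        # argv list form: no shell parsing, so spaces and quotes in paths are safe.
        subprocess.run(["ffmpeg", "-i", src, "-vn", dst, "-q:a", "2", "-y"], check=True)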
webui.py (2 lines changed)
@@ -85,7 +85,7 @@ if if_gpu_ok and len(gpu_infos) > 0:
 else:
     gpu_info = ("%s\t%s" % ("0", "CPU"))
     gpu_infos.append("%s\t%s" % ("0", "CPU"))
-    default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
+    default_batch_size = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2)
 gpus = "-".join([i[0] for i in gpu_infos])

 pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
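On CPU-only machines the default batch size is derived from system RAM; the int() wrapper truncates the halved GiB figure so downstream sliders and configs receive a whole number. What the changed line computes, as a sketch:

    import psutil

    total_gib = psutil.virtual_memory().total / 1024 / 1024 / 1024
    default_batch_size = int(total_gib / 2)   # e.g. 32 GiB of RAM gives batch size 16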