From 6a2ab63e18684cc46fae272eafaed81236776044 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Mon, 26 May 2025 12:43:14 +0800 Subject: [PATCH 1/8] Add new subfix webui, fix bugs in requirements --- GPT_SoVITS/text/g2pw/onnx_api.py | 5 +- requirements.txt | 2 +- tools/subfix.py | 544 +++++++++++++++++++++++++++++++ tools/subfix_webui.py | 422 ------------------------ tools/uvr5/webui.py | 20 +- webui.py | 19 +- 6 files changed, 563 insertions(+), 449 deletions(-) create mode 100644 tools/subfix.py delete mode 100644 tools/subfix_webui.py diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index a8268107..9d153745 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -23,8 +23,9 @@ from .utils import load_config onnxruntime.set_default_logger_severity(3) try: onnxruntime.preload_dlls() -except:pass - #traceback.print_exc() +except: + pass + # traceback.print_exc() warnings.filterwarnings("ignore") model_version = "1.1" diff --git a/requirements.txt b/requirements.txt index 90e4957d..4ad45b2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ pytorch-lightning>=2.4 gradio<5 ffmpeg-python onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64" -onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" +onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "amd64" tqdm funasr==1.0.27 cn2an diff --git a/tools/subfix.py b/tools/subfix.py new file mode 100644 index 00000000..fb4cfdd6 --- /dev/null +++ b/tools/subfix.py @@ -0,0 +1,544 @@ +import datetime +import os +import threading +import traceback +from dataclasses import dataclass +from functools import partial +from typing import List + +import click +import gradio as gr +import librosa +import numpy as np +import soundfile +from gradio.components.audio import WaveformOptions + +from tools.i18n.i18n import I18nAuto + +PARTIAL_EXIT = 
partial(os._exit, 0) + +LANGUAGE_MAP: dict = { + "ZH": "ZH", + "zh": "ZH", + "JP": "JA", + "jp": "JA", + "JA": "JA", + "ja": "JA", + "EN": "EN", + "en": "EN", + "KO": "KO", + "ko": "KO", + "yue": "YUE", + "YUE": "YUE", +} + +LOCK = threading.Lock() + +IS_CLI = True + + +@dataclass +class SubfixErr: + error: Exception + tracebacks: str + + +class Subfix: + batch_size: int = 2 + cur_idx: int = 0 + list_path: str + textboxes: List[gr.Textbox] = [] + audios: List[gr.Audio] = [] + languages: List[gr.Dropdown] = [] + selections: List[gr.Checkbox] = [] + transcriptions_list: List[List[str]] = [] + + merge_audio_button: gr.Button + delete_audio_button: gr.Button + previous_index_button1: gr.Button + next_index_button1: gr.Button + previous_index_button2: gr.Button + next_index_button2: gr.Button + index_slider: gr.Slider + batch_size_slider: gr.Slider + close_button: gr.Button + + def __init__(self, i18n: I18nAuto): + self.i18n = i18n + with gr.Row(equal_height=True): + with gr.Column(scale=2, min_width=160): + self.index_slider = gr.Slider(minimum=0, maximum=1, step=1, label=i18n("音频索引")) + with gr.Column(scale=1, min_width=160): + self.previous_index_button1 = gr.Button(value=i18n("上一页"), elem_id="btn_previous") + with gr.Column(scale=1, min_width=160): + self.next_index_button1 = gr.Button(value=i18n("下一页"), elem_id="btn_next") + with gr.Row(equal_height=True): + with gr.Column(scale=2, min_width=160): + self.batch_size_slider = gr.Slider( + minimum=4, maximum=20, step=2, value=self.batch_size, label=i18n("每页音频条数") + ) + with gr.Column(scale=1, min_width=160): + self.merge_audio_button = gr.Button(value=i18n("合并选中音频")) + with gr.Column(scale=1, min_width=160): + self.delete_audio_button = gr.Button(value=i18n("删除选中音频")) + gr.render( + inputs=[self.index_slider, self.batch_size_slider], + triggers=[self.batch_size_slider.change], + )(self._render_text_area) + + @property + def max_index(self): + return len(self.transcriptions_list) + + def load_list(self, list_path: 
str): + with open(list_path, mode="r", encoding="utf-8") as f: + list_data = f.readlines() + for idx, transcriptions in enumerate(list_data): + data = transcriptions.split("|") + if len(data) != 4: + print(f"Error Line {idx + 1}: {'|'.join(data)}") + continue + audio_name, audio_folder, text_language, text = data + self.transcriptions_list.append( + [ + audio_name, + audio_folder, + LANGUAGE_MAP.get(text_language.upper(), text_language.upper()), + text.strip("\n").strip(), + ] + ) + self.list_path = list_path + + def save_list(self): + data = [] + for transcriptions in self.transcriptions_list: + data.append("|".join(transcriptions)) + try: + with open(self.list_path, mode="w", encoding="utf-8") as f: + f.write("\n".join(data)) + except Exception as e: + return SubfixErr(e, traceback.format_exc()) + + def change_index(self, index: int): + audios = [] + texts = [] + languages = [] + checkboxs = [] + with LOCK: + for i in range(index, index + self.batch_size): + if i <= self.max_index - 1: + audios.append(gr.Audio(value=self.transcriptions_list[i][0])) + texts.append(gr.Textbox(value=self.transcriptions_list[i][3], label=self.i18n("Text") + f" {i}")) + languages.append(gr.Dropdown(value=self.transcriptions_list[i][2])) + else: + audios.append(gr.Audio(value=None, interactive=False)) + texts.append(gr.Textbox(value=None, label=self.i18n("Text") + f" {i}", interactive=False)) + languages.append(gr.Dropdown(value=None, interactive=False)) + checkboxs = [gr.Checkbox(False) for i in range(self.batch_size)] + self.cur_idx = index + return *audios, *texts, *languages, *checkboxs + + def next_page(self, index: int): + batch_size = self.batch_size + max_index = self.max_index - batch_size + if max_index <= 0: + max_index = 1 + index = min(index + batch_size, max_index - 1) + return gr.Slider(value=index), *self.change_index(index) + + def previous_page(self, index: int): + batch_size = self.batch_size + index = max(index - batch_size, 0) + return gr.Slider(value=index), 
*self.change_index(index) + + def delete_audio(self, index, *selected): + delete_index = [i + index for i, _ in enumerate(selected) if _] + delete_index = [i for i in delete_index if i < self.max_index - 1] + for idx in delete_index[::-1]: + self.transcriptions_list.pop(idx) + self.save_list() + return gr.Slider(value=index, maximum=self.max_index), *self.change_index(index) + + def submit(self, *input): + with LOCK: + index = self.cur_idx + batch_size = self.batch_size + texts = input[: len(input) // 2] + languages = input[len(input) // 2 :] + if texts is None or languages is None: + raise ValueError() + for idx in range(index, min(index + batch_size, self.max_index - 1)): + self.transcriptions_list[idx][3] = texts[idx - index].strip().strip("\n") + self.transcriptions_list[idx][2] = languages[idx - index] + result = self.save_list() + if isinstance(result, SubfixErr): + gr.Warning(str(result.error)) + print(result.tracebacks) + + def merge_audio(self, index, *selected): + batch_size = self.batch_size + merge_index = [i + index for i, _ in enumerate(selected) if _] + merge_index = [i for i in merge_index if i < self.max_index - 1] + if len(merge_index) < 2: + return *(gr.skip() for _ in range(batch_size * 3 + 1)), *(gr.Checkbox(False) for _ in range(batch_size)) + else: + merge_texts = [] + merge_audios = [] + first_itm_index = merge_index[0] + first_itm_path = f"{os.path.splitext(self.transcriptions_list[first_itm_index][0])[0]}_{str(datetime.datetime.now().strftime(r'%Y%m%d_%H%M%S'))}.wav" + final_audio_list = [] + for idx in merge_index: + merge_texts.append(self.transcriptions_list[idx][3]) + merge_audios.append(self.transcriptions_list[idx][0]) + for idx in merge_index[:0:-1]: + self.transcriptions_list.pop(idx) + for audio_path in merge_audios: + final_audio_list.append(librosa.load(audio_path, sr=32000, mono=True)[0]) + final_audio_list.append(np.zeros(int(32000 * 0.3))) + final_audio_list.pop() + final_audio = np.concatenate(final_audio_list) + 
soundfile.write(first_itm_path, final_audio, 32000) + self.transcriptions_list[first_itm_index][0] = first_itm_path + self.transcriptions_list[first_itm_index][3] = ",".join(merge_texts) + return gr.Slider(maximum=self.max_index), *self.change_index(index) + + def _render_text_area(self, index, batch_size): + i18n = self.i18n + self.textboxes = [] + self.audios = [] + self.languages = [] + self.selections = [] + self.batch_size = batch_size + for i in range(index, index + batch_size): + with gr.Row(equal_height=True): + if i <= self.max_index - 1: + with gr.Column(scale=2, min_width=160): + textbox_tmp = gr.Textbox( + value=self.transcriptions_list[i][3], + label=i18n("Text") + f" {i}", + lines=2, + max_lines=3, + interactive=True, + ) + with gr.Column(scale=1, min_width=160): + audio_tmp = gr.Audio( + value=self.transcriptions_list[i][0], + show_label=False, + show_download_button=False, + editable=False, + waveform_options={"show_recording_waveform": False, "show_controls": False}, + ) + with gr.Column(scale=1, min_width=160): + with gr.Group(): + with gr.Row(): + language_tmp = gr.Dropdown( + choices=["ZH", "EN", "JA", "KO", "YUE"], + value=self.transcriptions_list[i][2], + allow_custom_value=True, + label=i18n("文本语言"), + interactive=True, + ) + with gr.Row(): + selection_tmp = gr.Checkbox( + label=i18n("选择音频"), + ) + else: + with gr.Column(scale=2, min_width=160): + textbox_tmp = gr.Textbox( + label=i18n("Text") + f" {i}", + lines=2, + max_lines=3, + elem_id="subfix_textbox", + interactive=False, + ) + with gr.Column(scale=1, min_width=160): + audio_tmp = gr.Audio( + streaming=True, + show_label=False, + show_download_button=False, + interactive=False, + waveform_options=WaveformOptions(show_recording_waveform=False, show_controls=False), + ) + with gr.Column(scale=1, min_width=160): + with gr.Group(): + with gr.Row(): + language_tmp = gr.Dropdown( + choices=["ZH", "EN", "JA", "KO", "YUE"], + value=None, + allow_custom_value=True, + label=i18n("文本语言"), + 
interactive=False, + ) + with gr.Row(): + selection_tmp = gr.Checkbox( + label=i18n("选择音频"), + interactive=False, + ) + + self.textboxes.append(textbox_tmp) + self.audios.append(audio_tmp) + self.languages.append(language_tmp) + self.selections.append(selection_tmp) + with gr.Row(equal_height=True): + with gr.Column(scale=2, min_width=160): + self.close_button = gr.Button(value=i18n("关闭打标WebUI"), variant="stop") + with gr.Column(scale=1, min_width=160): + self.previous_index_button2 = gr.Button(value=i18n("上一页")) + with gr.Column(scale=1, min_width=160): + self.next_index_button2 = gr.Button(value=i18n("下一页")) + + # Event Trigger Binding + + self.index_slider.release( # Change Index Button + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + ).success( + fn=self.change_index, + inputs=[ + self.index_slider, + ], + outputs=[ + *self.audios, + *self.textboxes, + *self.languages, + *self.selections, + ], + max_batch_size=1, + trigger_mode="once", + ) + + self.next_index_button1.click( # Next Page Button on the Top + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + ).success( + fn=self.next_page, + inputs=[ + self.index_slider, + ], + outputs=[ + self.index_slider, + *self.audios, + *self.textboxes, + *self.languages, + *self.selections, + ], + scroll_to_output=True, + trigger_mode="once", + ) + + self.next_index_button2.click( # Next Page Button on the Bottom, Binding to Next Page Button on the Top + lambda: None, + [], + [], + js=""" + () => { + document.getElementById("btn_next").click(); + }""", + trigger_mode="once", + ) + + self.previous_index_button1.click( # Previous Page Button on the Top + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + ).success( + fn=self.previous_page, + inputs=[ + self.index_slider, + ], + outputs=[ + self.index_slider, + *self.audios, + *self.textboxes, + *self.languages, + *self.selections, + ], + scroll_to_output=True, + 
trigger_mode="once", + ) + + self.previous_index_button2.click( # Previous Page Button on the Bottom, Binding to Previous Page Button on the Top + lambda: None, + [], + [], + js=""" + () => { + document.getElementById("btn_previous").click(); + }""", + trigger_mode="once", + ) + + self.delete_audio_button.click( # Delete the Audio in the Transcription File + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + ).success( + fn=self.delete_audio, + inputs=[ + self.index_slider, + *self.selections, + ], + outputs=[ + self.index_slider, + *self.audios, + *self.textboxes, + *self.languages, + *self.selections, + ], + scroll_to_output=True, + ).success( + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + show_progress="hidden", + ) + + self.merge_audio_button.click( # Delete the Audio in the Transcription File + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + ).success( + fn=self.merge_audio, + inputs=[ + self.index_slider, + *self.selections, + ], + outputs=[ + self.index_slider, + *self.audios, + *self.textboxes, + *self.languages, + *self.selections, + ], + scroll_to_output=True, + ).success( + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + show_progress="hidden", + ) + if not IS_CLI: + self.close_button.click( # Close the Subfix Tab, Binding to Close Button on Audio Processing Tab + fn=lambda: None, + inputs=[], + outputs=[], + js=""" + () => { + document.getElementById("btn_close").click(); + }""", + trigger_mode="once", + ) + else: + self.close_button.click( # Close the Subfix Tab, Binding to Close Button on Audio Processing Tab + fn=self.submit, + inputs=[ + *self.textboxes, + *self.languages, + ], + outputs=[], + trigger_mode="once", + ).then( + fn=PARTIAL_EXIT, + inputs=[], + outputs=[], + ) + + def render(self, list_path: str, batch_size: int = 10): + self.batch_size = batch_size + self.transcriptions_list = [] + 
self.load_list(list_path=list_path) + + +@click.command(name="subfix") +@click.argument( + "list-path", + metavar="", + type=click.Path(exists=True, dir_okay=False, readable=True, writable=True), + required=True, +) +@click.option( + "--i18n-lang", + type=str, + default="Auto", + help="Languages for internationalisation", + show_default=True, +) +@click.option( + "--port", + type=int, + default="9871", + show_default=True, +) +@click.option( + "--share", + type=bool, + default=False, + show_default=True, +) +def main(list_path: str = "", i18n_lang="Auto", port=9871, share=False): + """Web-Based audio subtitle editing and multilingual annotation Tool + + Accept a transcription list path to launch a Gradio WebUI for text editing + """ + + with gr.Blocks(analytics_enabled=False) as app: + subfix = Subfix(I18nAuto(i18n_lang)) + subfix.render(list_path=list_path) + if subfix.max_index > 0: + timer = gr.Timer(0.1) + + timer.tick( + fn=lambda: ( + gr.Slider(value=0, maximum=subfix.max_index), + gr.Slider(value=10), + gr.Timer(active=False), + ), + inputs=[], + outputs=[ + subfix.index_slider, + subfix.batch_size_slider, + timer, + ], + ) + else: + timer = gr.Timer(2) + + timer.tick( + fn=lambda x: (_ for _ in ()).throw(gr.Error("Invalid List")) if x is None else None, + inputs=[], + outputs=[], + ) + app.queue().launch( + server_name="0.0.0.0", + inbrowser=True, + share=share, + server_port=port, + quiet=False, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py deleted file mode 100644 index 3f2fd03e..00000000 --- a/tools/subfix_webui.py +++ /dev/null @@ -1,422 +0,0 @@ -import sys -from tools.i18n.i18n import I18nAuto, scan_language_list -language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" -i18n = I18nAuto(language=language) -import argparse -import copy -import json -import os -import uuid - -try: - import gradio.analytics as analytics - - analytics.version_check = lambda: None -except: - ... 
- -import gradio as gr -import librosa -import numpy as np -import soundfile - -g_json_key_text = "" -g_json_key_path = "" -g_load_file = "" -g_load_format = "" - -g_max_json_index = 0 -g_index = 0 -g_batch = 10 -g_text_list = [] -g_audio_list = [] -g_checkbox_list = [] -g_data_json = [] - - -def reload_data(index, batch): - global g_index - g_index = index - global g_batch - g_batch = batch - datas = g_data_json[index : index + batch] - output = [] - for d in datas: - output.append({g_json_key_text: d[g_json_key_text], g_json_key_path: d[g_json_key_path]}) - return output - - -def b_change_index(index, batch): - global g_index, g_batch - g_index, g_batch = index, batch - datas = reload_data(index, batch) - output = [] - for i, _ in enumerate(datas): - output.append( - # gr.Textbox( - # label=f"Text {i+index}", - # value=_[g_json_key_text]#text - # ) - {"__type__": "update", "label": f"Text {i + index}", "value": _[g_json_key_text]} - ) - for _ in range(g_batch - len(datas)): - output.append( - # gr.Textbox( - # label=f"Text", - # value="" - # ) - {"__type__": "update", "label": "Text", "value": ""} - ) - for _ in datas: - output.append(_[g_json_key_path]) - for _ in range(g_batch - len(datas)): - output.append(None) - for _ in range(g_batch): - output.append(False) - return output - - -def b_next_index(index, batch): - b_save_file() - if (index + batch) <= g_max_json_index: - return index + batch, *b_change_index(index + batch, batch) - else: - return index, *b_change_index(index, batch) - - -def b_previous_index(index, batch): - b_save_file() - if (index - batch) >= 0: - return index - batch, *b_change_index(index - batch, batch) - else: - return 0, *b_change_index(0, batch) - - -def b_submit_change(*text_list): - global g_data_json - change = False - for i, new_text in enumerate(text_list): - if g_index + i <= g_max_json_index: - new_text = new_text.strip() + " " - if g_data_json[g_index + i][g_json_key_text] != new_text: - g_data_json[g_index + 
i][g_json_key_text] = new_text - change = True - if change: - b_save_file() - return g_index, *b_change_index(g_index, g_batch) - - -def b_delete_audio(*checkbox_list): - global g_data_json, g_index, g_max_json_index - b_save_file() - change = False - for i, checkbox in reversed(list(enumerate(checkbox_list))): - if g_index + i < len(g_data_json): - if checkbox == True: - g_data_json.pop(g_index + i) - change = True - - g_max_json_index = len(g_data_json) - 1 - if g_index > g_max_json_index: - g_index = g_max_json_index - g_index = g_index if g_index >= 0 else 0 - if change: - b_save_file() - # return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) - return { - "value": g_index, - "__type__": "update", - "maximum": (g_max_json_index if g_max_json_index >= 0 else 0), - }, *b_change_index(g_index, g_batch) - - -def b_invert_selection(*checkbox_list): - new_list = [not item if item is True else True for item in checkbox_list] - return new_list - - -def get_next_path(filename): - base_dir = os.path.dirname(filename) - base_name = os.path.splitext(os.path.basename(filename))[0] - for i in range(100): - new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") - if not os.path.exists(new_path): - return new_path - return os.path.join(base_dir, f"{str(uuid.uuid4())}.wav") - - -def b_audio_split(audio_breakpoint, *checkbox_list): - global g_data_json, g_max_json_index - checked_index = [] - for i, checkbox in enumerate(checkbox_list): - if checkbox == True and g_index + i < len(g_data_json): - checked_index.append(g_index + i) - if len(checked_index) == 1: - index = checked_index[0] - audio_json = copy.deepcopy(g_data_json[index]) - path = audio_json[g_json_key_path] - data, sample_rate = librosa.load(path, sr=None, mono=True) - audio_maxframe = len(data) - break_frame = int(audio_breakpoint * sample_rate) - - if break_frame >= 1 and break_frame < audio_maxframe: - audio_first = 
data[0:break_frame] - audio_second = data[break_frame:] - nextpath = get_next_path(path) - soundfile.write(nextpath, audio_second, sample_rate) - soundfile.write(path, audio_first, sample_rate) - g_data_json.insert(index + 1, audio_json) - g_data_json[index + 1][g_json_key_path] = nextpath - b_save_file() - - g_max_json_index = len(g_data_json) - 1 - # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch) - - -def b_merge_audio(interval_r, *checkbox_list): - global g_data_json, g_max_json_index - b_save_file() - checked_index = [] - audios_path = [] - audios_text = [] - for i, checkbox in enumerate(checkbox_list): - if checkbox == True and g_index + i < len(g_data_json): - checked_index.append(g_index + i) - - if len(checked_index) > 1: - for i in checked_index: - audios_path.append(g_data_json[i][g_json_key_path]) - audios_text.append(g_data_json[i][g_json_key_text]) - for i in reversed(checked_index[1:]): - g_data_json.pop(i) - - base_index = checked_index[0] - base_path = audios_path[0] - g_data_json[base_index][g_json_key_text] = "".join(audios_text) - - audio_list = [] - l_sample_rate = None - for i, path in enumerate(audios_path): - data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) - l_sample_rate = sample_rate - if i > 0: - silence = np.zeros(int(l_sample_rate * interval_r)) - audio_list.append(silence) - - audio_list.append(data) - - audio_concat = np.concatenate(audio_list) - - soundfile.write(base_path, audio_concat, l_sample_rate) - - b_save_file() - - g_max_json_index = len(g_data_json) - 1 - - # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch) - - -def b_save_json(): - with open(g_load_file, "w", encoding="utf-8") as file: - for 
data in g_data_json: - file.write(f"{json.dumps(data, ensure_ascii=False)}\n") - - -def b_save_list(): - with open(g_load_file, "w", encoding="utf-8") as file: - for data in g_data_json: - wav_path = data["wav_path"] - speaker_name = data["speaker_name"] - language = data["language"] - text = data["text"] - file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip() + "\n") - - -def b_load_json(): - global g_data_json, g_max_json_index - with open(g_load_file, "r", encoding="utf-8") as file: - g_data_json = file.readlines() - g_data_json = [json.loads(line) for line in g_data_json] - g_max_json_index = len(g_data_json) - 1 - - -def b_load_list(): - global g_data_json, g_max_json_index - with open(g_load_file, "r", encoding="utf-8") as source: - data_list = source.readlines() - for _ in data_list: - data = _.split("|") - if len(data) == 4: - wav_path, speaker_name, language, text = data - g_data_json.append( - {"wav_path": wav_path, "speaker_name": speaker_name, "language": language, "text": text.strip()} - ) - else: - print("error line:", data) - g_max_json_index = len(g_data_json) - 1 - - -def b_save_file(): - if g_load_format == "json": - b_save_json() - elif g_load_format == "list": - b_save_list() - - -def b_load_file(): - if g_load_format == "json": - b_load_json() - elif g_load_format == "list": - b_load_list() - - -def set_global(load_json, load_list, json_key_text, json_key_path, batch): - global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch - - g_batch = int(batch) - - if load_json != "None": - g_load_format = "json" - g_load_file = load_json - elif load_list != "None": - g_load_format = "list" - g_load_file = load_list - else: - g_load_format = "list" - g_load_file = "demo.list" - - g_json_key_text = json_key_text - g_json_key_path = json_key_path - - b_load_file() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process some integers.") - parser.add_argument("--load_json", default="None", 
help="source file, like demo.json") - parser.add_argument("--is_share", default="False", help="whether webui is_share=True") - parser.add_argument("--load_list", default="None", help="source file, like demo.list") - parser.add_argument("--webui_port_subfix", default=9871, help="source file, like demo.list") - parser.add_argument("--json_key_text", default="text", help="the text key name in json, Default: text") - parser.add_argument("--json_key_path", default="wav_path", help="the path key name in json, Default: wav_path") - parser.add_argument("--g_batch", default=10, help="max number g_batch wav to display, Default: 10") - - args = parser.parse_args() - - set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch) - - with gr.Blocks(analytics_enabled=False) as demo: - gr.Markdown( - value=i18n("Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)") - ) - with gr.Row(): - btn_change_index = gr.Button("Change Index") - btn_submit_change = gr.Button("Submit Text") - btn_merge_audio = gr.Button("Merge Audio") - btn_delete_audio = gr.Button("Delete Audio") - btn_previous_index = gr.Button("Previous Index") - btn_next_index = gr.Button("Next Index") - - with gr.Row(): - index_slider = gr.Slider(minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3) - splitpoint_slider = gr.Slider( - minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3 - ) - btn_audio_split = gr.Button("Split Audio", scale=1) - btn_save_json = gr.Button("Save File", visible=True, scale=1) - btn_invert_selection = gr.Button("Invert Selection", scale=1) - - with gr.Row(): - with gr.Column(): - for _ in range(0, g_batch): - with gr.Row(): - text = gr.Textbox(label="Text", visible=True, scale=5) - audio_output = gr.Audio(label="Output Audio", visible=True, scale=5) - audio_check = gr.Checkbox(label="Yes", show_label=True, info="Choose Audio", scale=1) - g_text_list.append(text) - 
g_audio_list.append(audio_output) - g_checkbox_list.append(audio_check) - - with gr.Row(): - batchsize_slider = gr.Slider( - minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False - ) - interval_slider = gr.Slider(minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3) - btn_theme_dark = gr.Button("Light Theme", link="?__theme=light", scale=1) - btn_theme_light = gr.Button("Dark Theme", link="?__theme=dark", scale=1) - - btn_change_index.click( - b_change_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[*g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_submit_change.click( - b_submit_change, - inputs=[ - *g_text_list, - ], - outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_previous_index.click( - b_previous_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_next_index.click( - b_next_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_delete_audio.click( - b_delete_audio, - inputs=[*g_checkbox_list], - outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_merge_audio.click( - b_merge_audio, - inputs=[interval_slider, *g_checkbox_list], - outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_audio_split.click( - b_audio_split, - inputs=[splitpoint_slider, *g_checkbox_list], - outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], - ) - - btn_invert_selection.click(b_invert_selection, inputs=[*g_checkbox_list], outputs=[*g_checkbox_list]) - - btn_save_json.click(b_save_file) - - demo.load( - b_change_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[*g_text_list, *g_audio_list, *g_checkbox_list], - ) - - demo.launch( - server_name="0.0.0.0", - inbrowser=True, - # quiet=True, 
- share=eval(args.is_share), - server_port=int(args.webui_port_subfix), - ) diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index f5f8d3f6..0112a1aa 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -1,23 +1,22 @@ import logging import os +import sys import traceback -import gradio as gr - -from tools.i18n.i18n import I18nAuto -from tools.my_utils import clean_path - -i18n = I18nAuto() - -logger = logging.getLogger(__name__) -import sys - import ffmpeg +import gradio as gr import torch from bsroformer import Roformer_Loader from mdxnet import MDXNetDereverb from vr import AudioPre, AudioPreDeEcho +from tools.i18n.i18n import I18nAuto +from tools.my_utils import clean_path, load_cudnn + +i18n = I18nAuto() + +logger = logging.getLogger(__name__) + weight_uvr5_root = "tools/uvr5/uvr5_weights" uvr5_names = [] for name in os.listdir(weight_uvr5_root): @@ -44,6 +43,7 @@ def html_center(text, label="p"): def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] + load_cudnn() try: inp_root = clean_path(inp_root) save_root_vocal = clean_path(save_root_vocal) diff --git a/webui.py b/webui.py index 0e34987a..dd9a8eb8 100644 --- a/webui.py +++ b/webui.py @@ -58,6 +58,7 @@ for site_packages_root in site_packages_roots: traceback.print_exc() import shutil import subprocess +from multiprocessing import cpu_count from subprocess import Popen from tools.assets import css, js, top_html @@ -86,14 +87,9 @@ from config import ( from tools import my_utils from tools.my_utils import check_details, check_for_existance -# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu -try: - import gradio.analytics as analytics - - analytics.version_check = lambda: None -except: - ... 
-import gradio as gr +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" +os.environ["language"] = language +i18n = I18nAuto(language=language) n_cpu = cpu_count() @@ -276,12 +272,7 @@ def change_label(path_list): if p_label is None: check_for_existance([path_list]) path_list = my_utils.clean_path(path_list) - cmd = '"%s" -s tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % ( - python_exec, - path_list, - webui_port_subfix, - is_share, - ) + cmd = f'"{python_exec}" -s tools/subfix.py --i18n-lang {language} --port {webui_port_subfix} --share {is_share} "{path_list}"' yield ( process_info(process_name_subfix, "opened"), {"__type__": "update", "visible": False}, From 89438d6001e2919c0515af6e4b25ee46a598deb7 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Tue, 27 May 2025 00:23:44 +0800 Subject: [PATCH 2/8] Add Docker Build pwsh in windows --- docker_build.ps1 | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ docker_build.sh | 2 +- install.sh | 5 ++-- tools/subfix.py | 39 +++++++++++++------------- 4 files changed, 95 insertions(+), 24 deletions(-) create mode 100644 docker_build.ps1 diff --git a/docker_build.ps1 b/docker_build.ps1 new file mode 100644 index 00000000..c0508fa6 --- /dev/null +++ b/docker_build.ps1 @@ -0,0 +1,73 @@ +$ErrorActionPreference = "Stop" + +$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Definition +Set-Location $ScriptDir + +if (-not (Get-Command "docker" -ErrorAction SilentlyContinue)) { + Write-Host "Docker Not Found" + exit 1 +} + +$Lite = $false +$CudaVersion = "12.6" + +function Write-Help { + Write-Host @" +Usage: powershell -File docker_build.ps1 [OPTIONS] + +Options: + --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED) + --lite Build a Lite Image + -h, --help Show this help message and exit + +Examples: + powershell -File docker_build.ps1 --cuda 12.6 --lite +"@ +} + +if ($args.Count -eq 0) { + Write-Help + exit 0 +} + +for ($i = 
0; $i -lt $args.Count; $i++) { + switch ($args[$i]) { + '--cuda' { + $i++ + $val = $args[$i] + if ($val -ne "12.6" -and $val -ne "12.8") { + Write-Host "Error: Invalid CUDA_VERSION: $val" + Write-Host "Choose From: [12.6, 12.8]" + exit 1 + } + $CudaVersion = $val + } + '--lite' { + $Lite = $true + } + '-h' { Write-Help; exit 0 } + '--help' { Write-Help; exit 0 } + default { + Write-Host "Unknown Argument: $($args[$i])" + Write-Host "Use -h or --help to see available options." + exit 1 + } + } +} + +$arch = (Get-CimInstance Win32_Processor).Architecture +$TargetPlatform = if ($arch -eq 9) { "linux/amd64" } else { "linux/arm64" } + +if ($Lite) { + $TorchBase = "lite" +} else { + $TorchBase = "full" +} + +docker build ` + --build-arg CUDA_VERSION=$CudaVersion ` + --build-arg LITE=$Lite ` + --build-arg TARGETPLATFORM=$TargetPlatform ` + --build-arg TORCH_BASE=$TorchBase ` + -t "$env:USERNAME/gpt-sovits:local" ` + . \ No newline at end of file diff --git a/docker_build.sh b/docker_build.sh index 354599d2..b6a803fb 100644 --- a/docker_build.sh +++ b/docker_build.sh @@ -25,7 +25,7 @@ print_help() { echo " -h, --help Show this help message and exit" echo "" echo "Examples:" - echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper" + echo " bash docker_build.sh --cuda 12.6" } # Show help if no arguments provided diff --git a/install.sh b/install.sh index eba18683..187d07e5 100644 --- a/install.sh +++ b/install.sh @@ -34,8 +34,8 @@ print_help() { echo " -h, --help Show this help message and exit" echo "" echo "Examples:" - echo " bash install.sh --source HF --download-uvr5" - echo " bash install.sh --source ModelScope" + echo " bash install.sh --device CU128 --source HF --download-uvr5" + echo " bash install.sh --device MPS --source ModelScope" } # Show help if no arguments provided @@ -149,7 +149,6 @@ else echo "Installing,Please Wait..." fi done - conda install -c conda-forge -q -y fi echo "Installing ffmpeg and cmake..." 
diff --git a/tools/subfix.py b/tools/subfix.py index fb4cfdd6..6fcf4ddf 100644 --- a/tools/subfix.py +++ b/tools/subfix.py @@ -88,7 +88,7 @@ class Subfix: @property def max_index(self): - return len(self.transcriptions_list) + return len(self.transcriptions_list) - 1 def load_list(self, list_path: str): with open(list_path, mode="r", encoding="utf-8") as f: @@ -126,7 +126,7 @@ class Subfix: checkboxs = [] with LOCK: for i in range(index, index + self.batch_size): - if i <= self.max_index - 1: + if i <= self.max_index: audios.append(gr.Audio(value=self.transcriptions_list[i][0])) texts.append(gr.Textbox(value=self.transcriptions_list[i][3], label=self.i18n("Text") + f" {i}")) languages.append(gr.Dropdown(value=self.transcriptions_list[i][2])) @@ -140,10 +140,8 @@ class Subfix: def next_page(self, index: int): batch_size = self.batch_size - max_index = self.max_index - batch_size - if max_index <= 0: - max_index = 1 - index = min(index + batch_size, max_index - 1) + max_index = max(self.max_index - batch_size + 1, 0) + index = min(index + batch_size, max_index) return gr.Slider(value=index), *self.change_index(index) def previous_page(self, index: int): @@ -153,7 +151,7 @@ class Subfix: def delete_audio(self, index, *selected): delete_index = [i + index for i, _ in enumerate(selected) if _] - delete_index = [i for i in delete_index if i < self.max_index - 1] + delete_index = [i for i in delete_index if i < self.max_index] for idx in delete_index[::-1]: self.transcriptions_list.pop(idx) self.save_list() @@ -167,7 +165,8 @@ class Subfix: languages = input[len(input) // 2 :] if texts is None or languages is None: raise ValueError() - for idx in range(index, min(index + batch_size, self.max_index - 1)): + print(index, min(index + batch_size, self.max_index)) + for idx in range(index, min(index + batch_size, self.max_index + 1)): self.transcriptions_list[idx][3] = texts[idx - index].strip().strip("\n") self.transcriptions_list[idx][2] = languages[idx - index] result = 
self.save_list() @@ -178,7 +177,7 @@ class Subfix: def merge_audio(self, index, *selected): batch_size = self.batch_size merge_index = [i + index for i, _ in enumerate(selected) if _] - merge_index = [i for i in merge_index if i < self.max_index - 1] + merge_index = [i for i in merge_index if i < self.max_index] if len(merge_index) < 2: return *(gr.skip() for _ in range(batch_size * 3 + 1)), *(gr.Checkbox(False) for _ in range(batch_size)) else: @@ -211,7 +210,7 @@ class Subfix: self.batch_size = batch_size for i in range(index, index + batch_size): with gr.Row(equal_height=True): - if i <= self.max_index - 1: + if i <= self.max_index: with gr.Column(scale=2, min_width=160): textbox_tmp = gr.Textbox( value=self.transcriptions_list[i][3], @@ -281,7 +280,7 @@ class Subfix: self.selections.append(selection_tmp) with gr.Row(equal_height=True): with gr.Column(scale=2, min_width=160): - self.close_button = gr.Button(value=i18n("关闭打标WebUI"), variant="stop") + self.close_button = gr.Button(value=i18n("保存并关闭打标WebUI"), variant="stop") with gr.Column(scale=1, min_width=160): self.previous_index_button2 = gr.Button(value=i18n("上一页")) with gr.Column(scale=1, min_width=160): @@ -507,12 +506,12 @@ def main(list_path: str = "", i18n_lang="Auto", port=9871, share=False): with gr.Blocks(analytics_enabled=False) as app: subfix = Subfix(I18nAuto(i18n_lang)) subfix.render(list_path=list_path) - if subfix.max_index > 0: + if subfix.max_index >= 0: timer = gr.Timer(0.1) timer.tick( fn=lambda: ( - gr.Slider(value=0, maximum=subfix.max_index), + gr.Slider(value=0, maximum=subfix.max_index, step=1), gr.Slider(value=10), gr.Timer(active=False), ), @@ -531,13 +530,13 @@ def main(list_path: str = "", i18n_lang="Auto", port=9871, share=False): inputs=[], outputs=[], ) - app.queue().launch( - server_name="0.0.0.0", - inbrowser=True, - share=share, - server_port=port, - quiet=False, - ) + app.queue().launch( + server_name="0.0.0.0", + inbrowser=True, + share=share, + server_port=port, + 
quiet=False, + ) if __name__ == "__main__": From 6b63929809ab6015c6f630479e43df40f8aca7cf Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Sun, 1 Jun 2025 18:12:23 +0800 Subject: [PATCH 3/8] Disable Gradio API --- GPT_SoVITS/inference_webui.py | 1 + GPT_SoVITS/inference_webui_fast.py | 1 + tools/subfix.py | 1 + tools/uvr5/webui.py | 1 + webui.py | 1 + 5 files changed, 5 insertions(+) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 5c7d0103..6b9b35f7 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1337,5 +1337,6 @@ if __name__ == "__main__": inbrowser=True, share=is_share, server_port=infer_ttswebui, + show_api=False, # quiet=True, ) diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 470b7bbd..608297d0 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -505,5 +505,6 @@ if __name__ == "__main__": inbrowser=True, share=is_share, server_port=infer_ttswebui, + show_api=False, # quiet=True, ) diff --git a/tools/subfix.py b/tools/subfix.py index 6fcf4ddf..6ef91423 100644 --- a/tools/subfix.py +++ b/tools/subfix.py @@ -536,6 +536,7 @@ def main(list_path: str = "", i18n_lang="Auto", port=9871, share=False): share=share, server_port=port, quiet=False, + show_api=False, ) diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index 0112a1aa..2ca642aa 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -220,5 +220,6 @@ app.queue().launch( # concurrency_count=511, max_size=1022 inbrowser=True, share=is_share, server_port=webui_port_uvr5, + show_api=False, # quiet=True, ) diff --git a/webui.py b/webui.py index dd9a8eb8..08a7ef45 100644 --- a/webui.py +++ b/webui.py @@ -1972,5 +1972,6 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css inbrowser=True, share=is_share, server_port=webui_port_main, + show_api=False, # quiet=True, ) From 
cbbc2f09136e762c59ca3d447da624b7316794ec Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Sun, 1 Jun 2025 18:48:06 +0800 Subject: [PATCH 4/8] Accelerate GitHub Actions --- .github/build_windows_packages.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/build_windows_packages.ps1 b/.github/build_windows_packages.ps1 index 2e4acb2a..75918d35 100644 --- a/.github/build_windows_packages.ps1 +++ b/.github/build_windows_packages.ps1 @@ -162,7 +162,7 @@ Copy-Item -Path $curr -Destination $pkgName -Recurse $7zPath = "$pkgName.7z" $start = Get-Date Write-Host "Compress Starting at $start" -& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1 +& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -mmt=on -bsp1 $end = Get-Date Write-Host "Elapsed time: $($end - $start)" Get-ChildItem . From 473514c8812deef4e1337ca1114b4943fa88c86e Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Sun, 1 Jun 2025 22:55:28 +0800 Subject: [PATCH 5/8] Update README Update README Remove Outdated Content in README --- README.md | 57 +++++++++++++++++++------------------------ docs/cn/README.md | 54 +++++++++++++++++++++-------------------- docs/ja/README.md | 61 ++++++++++++++++++++++------------------------- docs/ko/README.md | 52 +++++++++++++++++++++------------------- docs/tr/README.md | 59 ++++++++++++++++++++++----------------------- 5 files changed, 137 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index d03514f0..5d902c23 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +# +

GPT-SoVITS-WebUI

@@ -7,12 +9,21 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

RVC-Boss%2FGPT-SoVITS | Trendshift - +[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) +[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) +[![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md) + +[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![GitHub 
Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + +[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) **English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md) @@ -20,7 +31,7 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

--- -## Features: +## Features 1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion. @@ -34,13 +45,13 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

Unseen speakers few-shot fine-tuning demo: -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + -**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + ## Installation -For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online. +For users in China, you can use [AutoDL Cloud Docker](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to experience the full functionality online. ### Tested Environments @@ -171,7 +182,7 @@ docker exec -it ``` -if you want to switch to V1,then - -```bash -python webui.py v1 -``` - -Or maunally switch version in WebUI - ### Finetune #### Path Auto-filling is now supported @@ -253,7 +251,7 @@ Or maunally switch version in WebUI #### Integrated Package Users -Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference` +Double-click `go-webui.bat` or use `go-webui.ps1` , then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference` #### Others @@ -373,11 +371,6 @@ Use the command line to open the WebUI for UVR5 python tools/uvr5/webui.py "" ``` - - This is how the audio segmentation of the dataset is done using the command line ```bash @@ -453,5 +446,5 @@ Thankful to @Naozumi520 for providing the Cantonese training set and for the gui ## Thanks to all contributors for their efforts - + Contributors diff --git a/docs/cn/README.md b/docs/cn/README.md index e674d9cf..57d441d5 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -1,18 +1,29 @@ +# +

GPT-SoVITS-WebUI

-强大的少样本语音转换与语音合成Web用户界面.

+强大的少样本语音转换与语音合成Web用户界面.

[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) RVC-Boss%2FGPT-SoVITS | Trendshift - +[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) +[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) +[![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/更新日志-查看更新-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md) + +[![GitHub 
release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + +[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md) @@ -36,7 +47,7 @@ -**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + ## 安装 @@ -171,7 +182,7 @@ docker exec -it ``` -若想使用 V1,则 - -```bash -python webui.py v1 -``` - -或者在 webUI 内动态切换 - ### 微调 #### 现已支持自动填充路径 @@ -243,13 +245,13 @@ python webui.py v1 3. 进行降噪(可选) 4. 进行 ASR 5. 校对标注 -6. 前往下一个窗口,点击训练 +6. 前往下一个窗口, 点击训练 ### 打开推理 WebUI #### 整合包用户 -双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI +双击 `go-webui.bat` 或者使用 `go-webui.ps1` , 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI #### 其他 @@ -287,7 +289,7 @@ python webui.py 3. 
需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下 - 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) + 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型, 解压并重命名为`G2PWModel`, 将其放到`GPT_SoVITS/text`目录下) ## V3 更新说明 @@ -449,5 +451,5 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ## 感谢所有贡献者的努力 - + Contributors diff --git a/docs/ja/README.md b/docs/ja/README.md index d4d3081e..efb9012f 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -1,16 +1,27 @@ +# +

GPT-SoVITS-WebUI

-パワフルなFew-Shot音声変換・音声合成 WebUI.

+パワフルなFew-Shot音声変換・音声合成 WebUI.

[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) -
+[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) +[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) +[![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ja/Changelog_JA.md) + +[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![GitHub 
Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + +[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md) @@ -18,7 +29,7 @@ --- -## 機能: +## 機能 1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます. @@ -32,9 +43,9 @@ 声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ: -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + -**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + ## インストール @@ -165,7 +176,7 @@ docker exec -it ``` -V1 に切り替えたい場合は - -```bash -python webui.py v1 <言語(オプション)> -``` - -または WebUI で手動でバージョンを切り替えてください. - ### 微調整 #### パス自動補完のサポート @@ -239,7 +241,7 @@ python webui.py v1 <言語(オプション)> #### 統合パッケージ利用者 -`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます. +`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます. #### その他 @@ -359,11 +361,6 @@ V1/V2/V3/V4 環境から V2Pro への移行方法: python tools/uvr5/webui.py "" ``` - - コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです. 
```bash @@ -384,7 +381,7 @@ python tools/asr/funasr_asr.py -i -o ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング) -(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります) +(進行状況バーは表示されません. GPU のパフォーマンスにより時間遅延が発生する可能性があります) ```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p @@ -439,5 +436,5 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ## すべてのコントリビューターに感謝します - + Contributors diff --git a/docs/ko/README.md b/docs/ko/README.md index c4c7c794..91057fd3 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -1,3 +1,5 @@ +# +

GPT-SoVITS-WebUI

@@ -5,12 +7,21 @@ [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) -
+[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) +[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) +[![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ko/Changelog_KO.md) + +[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![GitHub 
Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + +[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md) @@ -18,7 +29,7 @@ --- -## 기능: +## 기능 1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다. @@ -32,9 +43,9 @@ 보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모: -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + -**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + ## 설치 @@ -165,7 +176,7 @@ docker exec -it ``` -V1으로 전환하려면, - -```bash -python webui.py v1 <언어(옵션)> -``` - -또는 WebUI에서 수동으로 버전을 전환하십시오. - ### 미세 조정 #### 경로 자동 채우기가 지원됩니다 @@ -239,7 +241,7 @@ python webui.py v1 <언어(옵션)> #### 통합 패키지 사용자 -`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다. +`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다. #### 기타 @@ -277,13 +279,13 @@ V1 환경에서 V2를 사용하려면: 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오. 
- 중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) + 중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다) ## V3 릴리스 노트 새로운 기능: -1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.) +1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다) 2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다. @@ -437,8 +439,8 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p @Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다. -## 모든 기여자들에게 감사드립니다 ;) +## 모든 기여자들에게 감사드립니다 - + Contributors diff --git a/docs/tr/README.md b/docs/tr/README.md index cc32691f..a8576c17 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -1,3 +1,5 @@ +# +

GPT-SoVITS-WebUI

@@ -7,12 +9,21 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz RVC-Boss%2FGPT-SoVITS | Trendshift - +[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) +[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) +[![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/tr/Changelog_TR.md) + +[![GitHub 
release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + +[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe** @@ -20,7 +31,7 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz --- -## Özellikler: +## Özellikler 1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin. @@ -34,9 +45,9 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu: -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + -**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** + ## Kurulum @@ -167,7 +178,7 @@ docker exec -it ``` -V1'e geçmek istiyorsanız, - -```bash -python webui.py v1 -``` - -veya WebUI'de manuel olarak sürüm değiştirin. - ### İnce Ayar #### Yol Otomatik Doldurma artık destekleniyor @@ -243,7 +245,7 @@ veya WebUI'de manuel olarak sürüm değiştirin. 
#### Entegre Paket Kullanıcıları -`go-webui-v2.bat` dosyasına çift tıklayın veya `go-webui-v2.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın. +`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın. #### Diğerleri @@ -281,11 +283,11 @@ V1 ortamından V2'yi kullanmak için: 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin. - Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) + Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin) ## V3 Sürüm Notları -Yeni Özellikler: +### Yeni Özellikler 1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir). @@ -293,7 +295,7 @@ Yeni Özellikler: [daha fazla detay]() -V2 ortamında V3 kullanımı: +### v2 ortamında v3 kullanımı 1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin. @@ -323,7 +325,7 @@ V1/V2/V3 ortamından V4'e geçiş: Yeni Özellikler: 1. 
**V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**. - [Daha fazla bilgi](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)) + [Daha fazla bilgi]() 2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir. @@ -363,11 +365,6 @@ UVR5 için Web Arayüzünü açmak için komut satırını kullanın python tools/uvr5/webui.py "" ``` - - Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır ```bash @@ -443,5 +440,5 @@ python ./tools/asr/fasterwhisper_asr.py -i -o <çıktı> -l ## Tüm katkıda bulunanlara çabaları için teşekkürler - + Contributors From 4f16ea188f3d3395367195692285cb3bad221172 Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Wed, 4 Jun 2025 18:27:56 +0800 Subject: [PATCH 6/8] Update ChangeLogs --- docs/cn/Changelog_CN.md | 10 ++- docs/en/Changelog_EN.md | 9 +- docs/ja/Changelog_JA.md | 179 ++++++++++++++++++++-------------------- docs/ko/Changelog_KO.md | 9 +- docs/tr/Changelog_TR.md | 9 +- 5 files changed, 109 insertions(+), 107 deletions(-) diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 6c6dcdbe..b3d620e9 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -157,7 +157,7 @@ - 类型: 修复 - 提交: StaryLan - 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) - - 内容: 修复 UVR5 `inf everywhere` 报错的问题 (`is_half` 传参未转换布尔类型导致恒定半精度推理, 16系显卡会 `inf`). + - 内容: 修复 UVR5 `inf everywhere` 报错的问题 (`is_half` 传参未转换布尔类型导致恒定半精度推理, 16 系显卡会 `inf`). 
- 类型: 修复 - 提交: RVC-Boss - 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2) @@ -230,6 +230,7 @@ ## 202403 - 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) + - 内容: Faster Whisper 在没有 CUDA 可用时自动使用 CPU 推理. - 类型: 优化 - 提交: ShiroDoMain @@ -347,7 +348,7 @@ - 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340) - 内容: 支持合成语速调节, 支持冻结随机性只调节语速, 并将其更新到`api.py` 上. - 类型: 新功能 - - 提交: RVC-Boss, 红血球AE3803 + - 提交: RVC-Boss, 红血球 AE3803 - 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) - 内容: 增加 BS-Roformer 人声伴奏分离模型支持. - 类型: 新功能 @@ -409,7 +410,7 @@ - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) - 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.** - - 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))) + - 类型: 新功能 (特性参阅 [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)) - 提交: RVC-Boss - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) - 内容: 更新项目多语言文档. @@ -453,7 +454,7 @@ - 类型: 文档 - 提交: StaryLan - 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122) - - 内容: 对于模型无法判断的CJK短字符采用规则判断. + - 内容: 对于模型无法判断的 CJK 短字符采用规则判断. - 类型: 修复 - 提交: KamioRinn - 关联: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116) @@ -475,6 +476,7 @@ - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239). - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241) + - 内容: **为 SoVITS v3 适配并行推理**. - 类型: 新功能 - 提交: ChasonJiang diff --git a/docs/en/Changelog_EN.md b/docs/en/Changelog_EN.md index bde48f2b..d5dddfc7 100644 --- a/docs/en/Changelog_EN.md +++ b/docs/en/Changelog_EN.md @@ -285,7 +285,7 @@ - 2024.06.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) - Content: Fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation. - **Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.** + **Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.** - Type: Fix - Contributor: RVC-Boss - 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) @@ -347,7 +347,7 @@ - 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340) - Content: Support adjusting speech speed during synthesis, including an option to freeze randomness and only control speed. This feature has been updated to `api.py`. - Type: Feature - - Contributor: RVC-Boss, 红血球AE3803 + - Contributor: RVC-Boss, 红血球 AE3803 - 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) - Content: Added support for the BS-RoFormer vocal accompaniment separation model. 
- Type: Feature @@ -409,7 +409,7 @@ - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) - Content: **Added GPT-SoVITS V3 model, which requires 14GB VRAM for fine-tuning.** - - Type: Feature (Refer to [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))) + - Type: Feature (Refer to [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)) - Contributor: RVC-Boss - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) - Content: Updated multilingual project documentation. @@ -478,9 +478,6 @@ - Content: **Enabled parallel inference for SoVITS v3.** - Type: Feature - Contributor: ChasonJiang - -- Fixed other minor bugs. - - Integrated package fixes for ONNX runtime GPU inference support: - Type: Fix - Details: diff --git a/docs/ja/Changelog_JA.md b/docs/ja/Changelog_JA.md index 229d836a..e6c9b87c 100644 --- a/docs/ja/Changelog_JA.md +++ b/docs/ja/Changelog_JA.md @@ -3,11 +3,11 @@ ## 202401 - 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108) - - 内容: WebUIに英語システム翻訳サポートを追加。 + - 内容: WebUI に英語システム翻訳サポートを追加。 - タイプ: ドキュメンテーション - 貢献者: D3lik - 2024.01.21 [Commit#7b89c9ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b89c9ed5669f63c4ed6ae791408969640bdcf3e) - - 内容: SoVITSトレーニングのZeroDivisionError修正を試みた。 + - 内容: SoVITS トレーニングの ZeroDivisionError 修正を試みた。 - タイプ: 修正 - 貢献者: RVC-Boss, Tybost - 関連: [Issue#79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) @@ -16,11 +16,11 @@ - タイプ: 最適化 - 貢献者: RVC-Boss - 2024.01.21 [Commit#a87ad522](https://github.com/RVC-Boss/GPT-SoVITS/commit/a87ad5228ed2d729da42019ae1b93171f6a745ef) - - 内容: `cmd-asr.py`がデフォルトディレクトリにFunASRモデルが含まれているか確認し、ない場合はModelScopeからダウンロードするようになった。 + - 内容: `cmd-asr.py`がデフォルトディレクトリに FunASR モデルが含まれているか確認し、ない場合は ModelScope からダウンロードするようになった。 - タイプ: 機能 - 貢献者: RVC-Boss - 2024.01.21
[Commit#f6147116](https://github.com/RVC-Boss/GPT-SoVITS/commit/f61471166c107ba56ccb7a5137fa9d7c09b2830d) - - 内容: `Config.py`に`is_share`パラメータを追加、`True`に設定するとWebUIを公開ネットワークにマッピング可能。 + - 内容: `Config.py`に`is_share`パラメータを追加、`True`に設定すると WebUI を公開ネットワークにマッピング可能。 - タイプ: 機能 - 貢献者: RVC-Boss - 2024.01.21 [Commit#102d5081](https://github.com/RVC-Boss/GPT-SoVITS/commit/102d50819e5d24580d6e96085b636b25533ecc7f) @@ -33,11 +33,11 @@ - 貢献者: RVC-Boss - 2024.01.22 英語と日本語トレーニングのネイティブサポートをテスト(日本語トレーニングはルートディレクトリに非英語特殊文字がないことが必要)。 - 2024.01.22 [PR#124](https://github.com/RVC-Boss/GPT-SoVITS/pull/124) - - 内容: 音声パスチェックを改善。不正な入力パスから読み取ろうとすると、ffmpegエラーではなくパスが存在しないと報告するようになった。 + - 内容: 音声パスチェックを改善。不正な入力パスから読み取ろうとすると、ffmpeg エラーではなくパスが存在しないと報告するようになった。 - タイプ: 最適化 - 貢献者: xmimu - 2024.01.23 [Commit#93c47cd9](https://github.com/RVC-Boss/GPT-SoVITS/commit/93c47cd9f0c53439536eada18879b4ec5a812ae1) - - 内容: Hubert抽出がNaNエラーを引き起こし、SoVITS/GPTトレーニングでZeroDivisionErrorが発生する問題を解決。 + - 内容: Hubert 抽出が NaN エラーを引き起こし、SoVITS/GPT トレーニングで ZeroDivisionError が発生する問題を解決。 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.01.23 [Commit#80fffb0a](https://github.com/RVC-Boss/GPT-SoVITS/commit/80fffb0ad46e4e7f27948d5a57c88cf342088d50) @@ -49,19 +49,19 @@ - タイプ: 最適化 - 貢献者: RVC-Boss - 2024.01.23 [Commit#0c691191](https://github.com/RVC-Boss/GPT-SoVITS/commit/0c691191e894c15686e88279745712b3c6dc232f) - - 内容: 推論WebUIでクイックモデル切り替えをサポート追加。 + - 内容: 推論 WebUI でクイックモデル切り替えをサポート追加。 - タイプ: 機能 - 貢献者: RVC-Boss - 2024.01.25 [Commit#249561e5](https://github.com/RVC-Boss/GPT-SoVITS/commit/249561e5a18576010df6587c274d38cbd9e18b4b) - - 内容: 推論WebUIの冗長なログを削除。 + - 内容: 推論 WebUI の冗長なログを削除。 - タイプ: 最適化 - 貢献者: RVC-Boss - 2024.01.25 [PR#183](https://github.com/RVC-Boss/GPT-SoVITS/pull/183), [PR#200](https://github.com/RVC-Boss/GPT-SoVITS/pull/200) - - 内容: Macでのトレーニングと推論をサポート。 + - 内容: Mac でのトレーニングと推論をサポート。 - タイプ: 機能 - 貢献者: Lion-Wu - 2024.01.26 [Commit#813cf96e](https://github.com/RVC-Boss/GPT-SoVITS/commit/813cf96e508ba1bb2c658f38c7cc77b797fb4082), 
[Commit#2d1ddeca](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d1ddeca42db90c3fe2d0cd79480fd544d87f02b) - - 内容: UVR5の読み取り時にディレクトリが自動的に飛び出す問題を修正。 + - 内容: UVR5 の読み取り時にディレクトリが自動的に飛び出す問題を修正。 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.01.26 [PR#204](https://github.com/RVC-Boss/GPT-SoVITS/pull/204) @@ -77,11 +77,11 @@ - タイプ: 修正 - 貢献者: RVC-Boss - 2024.01.26 [Commit#84ee4719](https://github.com/RVC-Boss/GPT-SoVITS/commit/84ee471936b332bc2ccee024d6dfdedab4f0dc7b) - - 内容: 半精度をサポートしないGPU向けに自動的に単精度を強制。CPU推論時も単精度を強制。 + - 内容: 半精度をサポートしない GPU 向けに自動的に単精度を強制。CPU 推論時も単精度を強制。 - タイプ: 最適化 - 貢献者: RVC-Boss - 2024.01.28 [PR#238](https://github.com/RVC-Boss/GPT-SoVITS/pull/238) - - 内容: Dockerfile内のモデルダウンロードプロセスを完了。 + - 内容: Dockerfile 内のモデルダウンロードプロセスを完了。 - タイプ: 修正 - 貢献者: breakstring - 2024.01.28 [PR#257](https://github.com/RVC-Boss/GPT-SoVITS/pull/257) @@ -89,7 +89,7 @@ - タイプ: 修正 - 貢献者: duliangang - 2024.01.28 [Commit#f0cfe397](https://github.com/RVC-Boss/GPT-SoVITS/commit/f0cfe397089a6fd507d678c71adeaab5e7ed0683) - - 内容: GPTトレーニングがチェックポイントを保存しない問題を修正。 + - 内容: GPT トレーニングがチェックポイントを保存しない問題を修正。 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.01.28 [Commit#b8ae5a27](https://github.com/RVC-Boss/GPT-SoVITS/commit/b8ae5a2761e2654fc0c905498009d3de9de745a8) @@ -101,15 +101,15 @@ - タイプ: 修正 - 貢献者: RVC-Boss - 2024.01.29 [Commit#ff977a5f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff977a5f5dc547e0ad82b9e0f1cd95fbc830b2b0) - - 内容: 16シリーズのような半精度トレーニングに問題があるGPU向けに、トレーニング設定を単精度に変更。 + - 内容: 16 シリーズのような半精度トレーニングに問題がある GPU 向けに、トレーニング設定を単精度に変更。 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.01.29 [Commit#172e139f](https://github.com/RVC-Boss/GPT-SoVITS/commit/172e139f45ac26723bc2cf7fac0112f69d6b46ec) - - 内容: 利用可能なColabバージョンをテストして更新。 + - 内容: 利用可能な Colab バージョンをテストして更新。 - タイプ: 機能 - 貢献者: RVC-Boss - 2024.01.29 [PR#135](https://github.com/RVC-Boss/GPT-SoVITS/pull/135) - - 内容: FunASRをバージョン1.0に更新し、インターフェース不一致によるエラーを修正。 + - 内容: FunASR をバージョン 1.0 に更新し、インターフェース不一致によるエラーを修正。 - タイプ: 修正 - 貢献者: LauraGPT - 2024.01.30 
[Commit#1c2fa98c](https://github.com/RVC-Boss/GPT-SoVITS/commit/1c2fa98ca8c325dcfb32797d22ff1c2a726d1cb4) @@ -128,11 +128,11 @@ ## 202402 - 2024.02.01 [Commit#45f73519](https://github.com/RVC-Boss/GPT-SoVITS/commit/45f73519cc41cd17cf816d8b997a9dcb0bee04b6) - - 内容: ASRパス末尾のスラッシュによるファイル名保存エラーの修正 + - 内容: ASR パス末尾のスラッシュによるファイル名保存エラーの修正 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.02.03 [Commit#dba1a74c](https://github.com/RVC-Boss/GPT-SoVITS/commit/dba1a74ccb0cf19a1b4eb93faf11d4ec2b1fc5d7) - - 内容: UVR5のフォーマット読み取りエラーによる音声分離失敗の修正 + - 内容: UVR5 のフォーマット読み取りエラーによる音声分離失敗の修正 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.02.03 [Commit#3ebff70b](https://github.com/RVC-Boss/GPT-SoVITS/commit/3ebff70b71580ee1f97b3238c9442cbc5aef47c7) @@ -140,7 +140,7 @@ - タイプ: 機能改善 - 貢献者: RVC-Boss - 2024.02.03 [PR#377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) - - 内容: PaddleSpeechのテキスト正規化を導入(例: xx.xx%表記、「元/吨」を「元每吨」と正確に読む、アンダースコア問題の解消) + - 内容: PaddleSpeech のテキスト正規化を導入(例: xx.xx%表記、「元/吨」を「元每吨」と正確に読む、アンダースコア問題の解消) - タイプ: 機能改善 - 貢献者: KamioRinn - 2024.02.05 [PR#395](https://github.com/RVC-Boss/GPT-SoVITS/pull/395) @@ -153,36 +153,36 @@ - 貢献者: RVC-Boss - 関連: [Issue#391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) - 2024.02.06 [PR#403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) - - 内容: UVR5の高バージョンLibrosaへの対応 + - 内容: UVR5 の高バージョン Librosa への対応 - タイプ: 修正 - 貢献者: StaryLan - 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) - - 内容: UVR5の「inf everywhere」エラーの修正(ブール型変換不足による半精度推論問題、16シリーズGPUで発生) + - 内容: UVR5 の「inf everywhere」エラーの修正(ブール型変換不足による半精度推論問題、16 シリーズ GPU で発生) - タイプ: 修正 - 貢献者: RVC-Boss - 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2) - - 内容: Gradio依存関係の修正 + - 内容: Gradio 依存関係の修正 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.02.07 [PR#400](https://github.com/RVC-Boss/GPT-SoVITS/pull/400) - - 内容: Faster Whisperの統合による日本語・英語音声認識機能の追加 + - 内容: Faster Whisper の統合による日本語・英語音声認識機能の追加 - タイプ: 新機能 - 貢献者: 
Shadow - 2024.02.07 [Commit#6469048d](https://github.com/RVC-Boss/GPT-SoVITS/commit/6469048de12a8d6f0bd05d07f031309e61575a38)~[Commit#94ee71d9](https://github.com/RVC-Boss/GPT-SoVITS/commit/94ee71d9d562d10c9a1b96e745c6a6575aa66a10) - - 内容: 3連ルートディレクトリ空欄時の自動.listファイルパス読み込み機能 + - 内容: 3 連ルートディレクトリ空欄時の自動.list ファイルパス読み込み機能 - タイプ: 機能改善 - 貢献者: RVC-Boss - 2024.02.08 [Commit#59f35ada](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) - - 内容: GPTトレーニングのフリーズ問題(Windows10 1909)と繁体字システム言語時のエラー修正 + - 内容: GPT トレーニングのフリーズ問題(Windows10 1909)と繁体字システム言語時のエラー修正 - タイプ: 修正 - 貢献者: RVC-Boss - 関連: [Issue#232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) - 2024.02.12 [PR#457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) - - 内容: DPO損失実験的トレーニングオプションの追加(ネガティブサンプル構築によるGPTの繰り返し・文字抜け問題改善)、推論インターフェースの複数パラメータ公開 + - 内容: DPO 損失実験的トレーニングオプションの追加(ネガティブサンプル構築による GPT の繰り返し・文字抜け問題改善)、推論インターフェースの複数パラメータ公開 - タイプ: 新機能 - 貢献者: liufenghua - 2024.02.12 [Commit#2fa74ecb](https://github.com/RVC-Boss/GPT-SoVITS/commit/2fa74ecb941db27d9015583a9be6962898d66730), [Commit#d82f6bbb](https://github.com/RVC-Boss/GPT-SoVITS/commit/d82f6bbb98ba725e6725dcee99b80ce71fb0bf28) - - 内容: 音声認識ロジックの最適化、Faster Whisperのミラーサイトダウンロード対応(HuggingFace接続問題回避) + - 内容: 音声認識ロジックの最適化、Faster Whisper のミラーサイトダウンロード対応(HuggingFace 接続問題回避) - タイプ: 機能改善 - 貢献者: RVC-Boss - 2024.02.15 [Commit#dd2c4d6d](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2c4d6d7121bf82d29d0f0e4d788f3b231997c8) @@ -190,7 +190,7 @@ - タイプ: 修正 - 貢献者: RVC-Boss - 2024.02.15 [Commit#ccb9b08b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ccb9b08be3c58e102defcc94ff4fd609da9e27ee)~[Commit#895fde46](https://github.com/RVC-Boss/GPT-SoVITS/commit/895fde46e420040ed26aaf0c5b7e99359d9b199b) - - 内容: DPOトレーニングを必須からオプションに変更(選択時は自動的にバッチサイズ半減)、推論インターフェースの新パラメータ未伝達問題の修正 + - 内容: DPO トレーニングを必須からオプションに変更(選択時は自動的にバッチサイズ半減)、推論インターフェースの新パラメータ未伝達問題の修正 - タイプ: 機能改善 - 貢献者: RVC-Boss - 2024.02.15 
[Commit#7b0c3c67](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b0c3c676495c64b2064aa472bff14b5c06206a5) @@ -207,38 +207,38 @@ - タイプ: 機能改善 - 貢献者: KamioRinn, v3cun - 2024.02.17 [PR#510](https://github.com/RVC-Boss/GPT-SoVITS/pull/511), [PR#511](https://github.com/RVC-Boss/GPT-SoVITS/pull/511) - - 内容: Colabの公開URL未開始問題の修正 + - 内容: Colab の公開 URL 未開始問題の修正 - タイプ: 修正 - 貢献者: ChanningWang2018, RVC-Boss - 2024.02.21 [PR#557](https://github.com/RVC-Boss/GPT-SoVITS/pull/557) - - 内容: MacOS推論デバイスをMPSからCPUに変更(CPU推論の方が高速) + - 内容: MacOS 推論デバイスを MPS から CPU に変更(CPU 推論の方が高速) - タイプ: 機能改善 - 貢献者: XXXXRT666 - 2024.02.21 [Commit#6da486c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/6da486c15d09e3d99fa42c5e560aaac56b6b4ce1), [Commit#5a171773](https://github.com/RVC-Boss/GPT-SoVITS/commit/5a17177342d2df1e11369f2f4f58d34a3feb1a35) - - 内容: データ前処理に音声ノイズ除去オプション追加(16Kサンプルレートにダウンサンプリング、高ノイズ時以外は非推奨) + - 内容: データ前処理に音声ノイズ除去オプション追加(16K サンプルレートにダウンサンプリング、高ノイズ時以外は非推奨) - タイプ: 新機能 - 貢献者: RVC-Boss - 2024.02.28 [PR#573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) - - 内容: is_half判定の修正によるMacOSの正常なCPU推論対応 + - 内容: is_half 判定の修正による MacOS の正常な CPU 推論対応 - タイプ: 修正 - 貢献者: XXXXRT666 - 2024.02.28 [PR#610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) - - 内容: UVR5 MDXNetパラメータ順序エラーによる出力フォルダ逆転問題の修正 + - 内容: UVR5 MDXNet パラメータ順序エラーによる出力フォルダ逆転問題の修正 - タイプ: 修正 - 貢献者: Yuze Wang ## 202403 - 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) - - 内容: CUDAが利用できない場合、Faster Whisperの自動CPU推論を有効化 + - 内容: CUDA が利用できない場合、Faster Whisper の自動 CPU 推論を有効化 - タイプ: 機能改善 - 貢献者: ShiroDoMain - 2024.03.06 [Commit#616be20d](https://github.com/RVC-Boss/GPT-SoVITS/commit/616be20db3cf94f1cd663782fea61b2370704193) - - 内容: Faster Whisper非中国語ASR使用時、中国語FunASRモデルの事前ダウンロードが不要に + - 内容: Faster Whisper 非中国語 ASR 使用時、中国語 FunASR モデルの事前ダウンロードが不要に - タイプ: 機能改善 - 貢献者: RVC-Boss - 2024.03.09 [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) - - 内容: 推論速度を50%向上(RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39環境で検証) + - 内容: 推論速度を 
50%向上(RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 環境で検証) - タイプ: 機能改善 - 貢献者: GoHomeToMacDonal - 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721) @@ -246,7 +246,7 @@ - タイプ: 新機能 - 貢献者: ChasonJiang - 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761) - - 内容: CPUトレーニングをサポート(macOSでCPUを使用したトレーニングが可能に) + - 内容: CPU トレーニングをサポート(macOS で CPU を使用したトレーニングが可能に) - タイプ: 新機能 - 貢献者: Lion-Wu - 2024.03.19 [PR#804](https://github.com/RVC-Boss/GPT-SoVITS/pull/804), [PR#812](https://github.com/RVC-Boss/GPT-SoVITS/pull/812), [PR#821](https://github.com/RVC-Boss/GPT-SoVITS/pull/821) @@ -254,21 +254,21 @@ - タイプ: 機能改善 - 貢献者: KamioRinn - 2024.03.30 [PR#894](https://github.com/RVC-Boss/GPT-SoVITS/pull/894) - - 内容: APIフォーマットの改善 + - 内容: API フォーマットの改善 - タイプ: 機能改善 - 貢献者: KamioRinn ## 202404 - 2024.04.03 [PR#917](https://github.com/RVC-Boss/GPT-SoVITS/pull/917) - - 内容: UVR5 WebUIにおけるFFmpegコマンド文字列フォーマットの修正 + - 内容: UVR5 WebUI における FFmpeg コマンド文字列フォーマットの修正 - タイプ: 修正 - 貢献者: StaryLan ## 202405 - 2024.05.02 [PR#953](https://github.com/RVC-Boss/GPT-SoVITS/pull/953) - - 内容: SoVITSトレーニング時のVQ凍結漏れ問題を修正(品質劣化の原因となる) + - 内容: SoVITS トレーニング時の VQ 凍結漏れ問題を修正(品質劣化の原因となる) - タイプ: 修正 - 貢献者: hcwu1993 - 関連: [Issue#747](https://github.com/RVC-Boss/GPT-SoVITS/issues/747) @@ -277,23 +277,23 @@ - タイプ: 機能改善 - 貢献者: StaryLan - 2024.05.27 [PR#1132](https://github.com/RVC-Boss/GPT-SoVITS/pull/1132) - - 内容: Hubert抽出におけるバグ修正 + - 内容: Hubert 抽出におけるバグ修正 - タイプ: 修正 - 貢献者: XXXXRT666 ## 202406 - 2024.06.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) - - 内容: WebUIのGPTファインチューニング時に中国語入力テキストのBERT特徴量を読み取れない問題を修正(推論時との不一致や品質劣化の原因となる) - **注意: 既に大量データでファインチューニング済みの場合は、品質向上のためモデルの再チューニングを推奨** + - 内容: WebUI の GPT ファインチューニング時に中国語入力テキストの BERT 特徴量を読み取れない問題を修正(推論時との不一致や品質劣化の原因となる) + **注意: 既に大量データでファインチューニング済みの場合は、品質向上のためモデルの再チューニングを推奨** - タイプ: 修正 - 貢献者: RVC-Boss - 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) - - 内容: 
`s2_train.py`におけるSoVITSトレーニングの進捗バー処理を修正 + - 内容: `s2_train.py`における SoVITS トレーニングの進捗バー処理を修正 - タイプ: 修正 - 貢献者: pengzhendong - 2024.06.10 [Commit#501a74ae](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) - - 内容: UVR5 MDXNetがFFmpegを呼び出す際の文字列フォーマットを修正(スペースを含むパスに対応) + - 内容: UVR5 MDXNet が FFmpeg を呼び出す際の文字列フォーマットを修正(スペースを含むパスに対応) - タイプ: 修正 - 貢献者: RVC-Boss - 2024.06.10 [PR#1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168), [PR#1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) @@ -302,15 +302,15 @@ - 貢献者: XXXXRT666 - 関連: [Issue#1165](https://github.com/RVC-Boss/GPT-SoVITS/issues/1165) - 2024.06.13 [Commit#db506705](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) - - 内容: CPU推論におけるデフォルトバッチサイズの小数点問題を修正 + - 内容: CPU 推論におけるデフォルトバッチサイズの小数点問題を修正 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.06.28 [PR#1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR#1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR#1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) - - 内容: ノイズ除去やASRで例外が発生した場合に保留中の全オーディオファイル処理が終了してしまう問題を修正 + - 内容: ノイズ除去や ASR で例外が発生した場合に保留中の全オーディオファイル処理が終了してしまう問題を修正 - タイプ: 修正 - 貢献者: XXXXRT666 - 2024.06.29 [Commit#a208698e](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) - - 内容: マルチGPUトレーニング時のマルチプロセス保存ロジックを修正 + - 内容: マルチ GPU トレーニング時のマルチプロセス保存ロジックを修正 - タイプ: 修正 - 貢献者: RVC-Boss - 2024.06.29 [PR#1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) @@ -326,14 +326,14 @@ - タイプ: 修正 - 貢献者: aoguai - 2024.07.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/b0786f2998f1b2fce6678434524b4e0e8cc716f5) - - 内容: 高速化推論コードが検証済みでmainブランチにマージされ、ベースと同等の推論効果を保証。テキスト未参照モードでも高速推論をサポート + - 内容: 高速化推論コードが検証済みで main ブランチにマージされ、ベースと同等の推論効果を保証。テキスト未参照モードでも高速推論をサポート - タイプ: 最適化 - 貢献者: RVC-Boss, GoHomeToMacDonal - 関連: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) - 今後も`fast_inference`ブランチでの変更整合性を継続検証 - 2024.07.13 
[PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) - - 内容: i18nスキャンのリファクタリングと多言語設定ファイルの更新 + - 内容: i18n スキャンのリファクタリングと多言語設定ファイルの更新 - タイプ: ドキュメンテーション - 貢献者: StaryLan - 2024.07.13 [PR#1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) @@ -341,15 +341,15 @@ - タイプ: 修正 - 貢献者: XXXXRT666 - 2024.07.19 [PR#756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) - - 内容: GPTトレーニング時、カスタム`bucket_sampler`使用時のステップ数不一致を修正 + - 内容: GPT トレーニング時、カスタム`bucket_sampler`使用時のステップ数不一致を修正 - タイプ: 修正 - 貢献者: huangxu1991 - 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340) - 内容: 合成時の話速調整をサポート(ランダム性を固定して速度のみ制御するオプション含む)。`api.py`に更新済み - タイプ: 新機能 - - 貢献者: RVC-Boss, 红血球AE3803 + - 貢献者: RVC-Boss, 红血球 AE3803 - 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) - - 内容: BS-RoFormerボーカル・伴奏分離モデルのサポートを追加。 + - 内容: BS-RoFormer ボーカル・伴奏分離モデルのサポートを追加。 - タイプ: 新機能 - 貢献者: KamioRinn - 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) @@ -360,56 +360,56 @@ ## 202408 (V2 バージョン) - 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) - - 内容: WebUIでファイル処理時にパスを自動入力するように変更。 + - 内容: WebUI でファイル処理時にパスを自動入力するように変更。 - タイプ: 雑務 - 貢献者: XXXXRT666 - 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) - - 内容: BS-RoformerのFP16推論サポートを有効化。 + - 内容: BS-Roformer の FP16 推論サポートを有効化。 - タイプ: パフォーマンス最適化 - 貢献者: RVC-Boss - 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) - - 内容: GPU認識ロジックを最適化、ユーザーが入力した任意のGPUインデックスを処理するユーザーフレンドリーなロジックを追加。 + - 内容: GPU 認識ロジックを最適化、ユーザーが入力した任意の GPU 
インデックスを処理するユーザーフレンドリーなロジックを追加。 - タイプ: 雑務 - 貢献者: RVC-Boss - 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758) - - 内容: **GPT-SoVITS V2モデルを追加。** + - 内容: **GPT-SoVITS V2 モデルを追加。** - タイプ: 新機能 - 貢献者: RVC-Boss - 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) - - 内容: FunASRを使用して広東語ASRをサポート。 + - 内容: FunASR を使用して広東語 ASR をサポート。 - タイプ: 新機能 - 貢献者: RVC-Boss - 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388) - - 内容: UIとタイミングロジックを最適化。 + - 内容: UI とタイミングロジックを最適化。 - タイプ: 雑務 - 貢献者: XXXXRT666 - 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) - - 内容: 多音字処理ロジックを最適化(V2のみ)。 + - 内容: 多音字処理ロジックを最適化(V2 のみ)。 - タイプ: 修正、新機能 - 貢献者: KamioRinn、RVC-Boss - 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) - - 内容: 参照音声が1つしかアップロードできないバグを修正。欠損ファイルがある場合に警告ポップアップを表示するデータセット検証を追加。 + - 内容: 参照音声が 1 つしかアップロードできないバグを修正。欠損ファイルがある場合に警告ポップアップを表示するデータセット検証を追加。 - タイプ: 修正、雑務 - 貢献者: XXXXRT666 - 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) - - 内容: 上流のLangSegmentライブラリがSSMLタグを使用した数字、電話番号、日付、時刻の最適化をサポート。 + - 内容: 上流の LangSegment ライブラリが SSML タグを使用した数字、電話番号、日付、時刻の最適化をサポート。 - タイプ: 新機能 - 貢献者: juntaosun - 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) - - 内容: APIを修正・最適化。 + - 内容: API を修正・最適化。 - タイプ: 修正 - 貢献者: KamioRinn - 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) - 内容: `fast_inference`ブランチをメインブランチにマージ。 - タイプ: リファクタリング - 貢献者: ChasonJiang -- 2024.08.21 **GPT-SoVITS V2バージョンを正式リリース。** +- 2024.08.21 **GPT-SoVITS V2 バージョンを正式リリース。** ## 202502 (V3 バージョン) - 2025.02.11 
[Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) - - 内容: **GPT-SoVITS V3モデルを追加。ファインチューニングには14GBのVRAMが必要。** - - タイプ: 新機能([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))参照) + - 内容: **GPT-SoVITS V3 モデルを追加。ファインチューニングには 14GB の VRAM が必要。** + - タイプ: 新機能([Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)参照) - 貢献者: RVC-Boss - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) - 内容: 多言語プロジェクトドキュメントを更新。 @@ -424,7 +424,7 @@ - タイプ: パフォーマンス最適化 - 貢献者: wzy3650 - 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) - - 内容: ファインチューニング用に勾配チェックポイントサポートを追加。12GB VRAMが必要。 + - 内容: ファインチューニング用に勾配チェックポイントサポートを追加。12GB VRAM が必要。 - タイプ: 新機能 - 貢献者: Kakaru Hayate - 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) @@ -432,19 +432,19 @@ - タイプ: 新機能 - 貢献者: KamioRinn - 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e) - - 内容: **GPT-SoVITS V3モデルがLoRAトレーニングをサポート。ファインチューニングに8GB GPUメモリが必要。** + - 内容: **GPT-SoVITS V3 モデルが LoRA トレーニングをサポート。ファインチューニングに 8GB GPU メモリが必要。** - タイプ: 新機能 - 貢献者: RVC-Boss - 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) - - 内容: ボーカルと楽器分離のためのMel Band Roformerモデルサポートを追加。 + - 内容: ボーカルと楽器分離のための Mel Band Roformer モデルサポートを追加。 - タイプ: 新機能 - 貢献者: Sucial - 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) - - 内容: 中国語パス下でのMeCabエラーを修正(日本語/韓国語または多言語テキスト分割用)。 + - 内容: 中国語パス下での MeCab エラーを修正(日本語/韓国語または多言語テキスト分割用)。 - タイプ: 修正 - 貢献者: KamioRinn -
2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc) - - 内容: **24kHzから48kHzへのオーディオ超解像モデルを追加**。V3モデルで24Kオーディオを生成する際の「こもった」オーディオ問題を緩和。 + - 内容: **24kHz から 48kHz へのオーディオ超解像モデルを追加**。V3 モデルで 24K オーディオを生成する際の「こもった」オーディオ問題を緩和。 - タイプ: 新機能 - 貢献者: RVC-Boss - 関連: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117) @@ -453,7 +453,7 @@ - タイプ: ドキュメント - 貢献者: StaryLan - 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122) - - 内容: モデルが識別できない短いCJK文字に対してルールベースの検出を適用。 + - 内容: モデルが識別できない短い CJK 文字に対してルールベースの検出を適用。 - タイプ: 修正 - 貢献者: KamioRinn - 関連: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116) @@ -461,7 +461,7 @@ - 内容: 合成速度を制御するための発話速度パラメータを追加。 - タイプ: 修正 - 貢献者: RVC-Boss -- 2025.02.28 **GPT-SoVITS V3を正式リリース**。 +- 2025.02.28 **GPT-SoVITS V3 を正式リリース**。 ## 202503 @@ -475,30 +475,31 @@ - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239). - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174). 
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241) - - 内容: **SoVITS v3の並列推論を有効化。** + + - 内容: **SoVITS v3 の並列推論を有効化。** - タイプ: 新機能 - 貢献者: ChasonJiang - その他の軽微なバグを修正。 -- ONNXランタイムGPU推論サポートのための統合パッケージ修正: +- ONNX ランタイム GPU 推論サポートのための統合パッケージ修正: - タイプ: 修正 - 詳細: - - G2PW内のONNXモデルをCPUからGPU推論に切り替え、CPUボトルネックを大幅に削減; - - foxjoy dereverberationモデルがGPU推論をサポート。 + - G2PW 内の ONNX モデルを CPU から GPU 推論に切り替え、CPU ボトルネックを大幅に削減; + - foxjoy dereverberation モデルが GPU 推論をサポート。 ## 202504 (V4 バージョン) - 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f) - - 内容: SoVITS v3並列推論のロックを解除。非同期モデル読み込みロジックを修正。 + - 内容: SoVITS v3 並列推論のロックを解除。非同期モデル読み込みロジックを修正。 - タイプ: 修正 - 貢献者: RVC-Boss - 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255) - - 内容: Ruffを使用したコードフォーマット。G2PWリンクを更新。 + - 内容: Ruff を使用したコードフォーマット。G2PW リンクを更新。 - タイプ: スタイル - 貢献者: XXXXRT666 - 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290) - - 内容: ドキュメントを整理。Python 3.11サポートを追加。インストーラーを更新。 + - 内容: ドキュメントを整理。Python 3.11 サポートを追加。インストーラーを更新。 - タイプ: 雑務 - 貢献者: XXXXRT666 - 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300) @@ -506,11 +507,11 @@ - タイプ: 雑務 - 貢献者: XXXXRT666 - 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf) - - 内容: **GPT-SoVITS V4モデルを追加。** + - 内容: **GPT-SoVITS V4 モデルを追加。** - タイプ: 新機能 - 貢献者: RVC-Boss - 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307) - - 内容: V4の並列推論を有効化。 + - 内容: V4 の並列推論を有効化。 - タイプ: 新機能 - 貢献者: RVC-Boss、ChasonJiang - 2025.04.22 
[Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309) @@ -518,24 +519,24 @@ - タイプ: 修正 - 貢献者: RVC-Boss、ChasonJiang - 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310) - - 内容: NumpyとNumbaのバージョン不一致問題を修正。librosaバージョンを更新。 + - 内容: Numpy と Numba のバージョン不一致問題を修正。librosa バージョンを更新。 - タイプ: 修正 - 貢献者: RVC-Boss、XXXXRT666 - 関連: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308) -- **2024.04.22 GPT-SoVITS V4を正式リリース**。 +- **2024.04.22 GPT-SoVITS V4 を正式リリース**。 - 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311) - - 内容: Gradioパラメータを更新。 + - 内容: Gradio パラメータを更新。 - タイプ: 雑務 - 貢献者: XXXXRT666 - 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322) - - 内容: Colab/Kaggleノートブックスクリプトを改善。 + - 内容: Colab/Kaggle ノートブックスクリプトを改善。 - タイプ: 雑務 - 貢献者: XXXXRT666 ## 202505 - 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351) - - 内容: DockerとWindows自動ビルドスクリプトを改善。pre-commitフォーマットを追加。 + - 内容: Docker と Windows 自動ビルドスクリプトを改善。pre-commit フォーマットを追加。 - タイプ: 雑務 - 貢献者: XXXXRT666 - 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408) @@ -544,7 +545,7 @@ - 貢献者: KamioRinn - 関連: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404) - 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377) - - 内容: キャッシュ戦略を実装し、SoVITS V3/V4推論速度を10%向上。 + - 内容: キャッシュ戦略を実装し、SoVITS V3/V4 推論速度を 10%向上。 - タイプ: パフォーマンス最適化 - 貢献者: Kakaru Hayate - 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), 
[Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba) @@ -552,7 +553,7 @@ - タイプ: 修正 - 貢献者: RVC-Boss - 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc) - - 内容: UVR5およびONNX dereverberationモデルのエラーを修正。FFmpegが元のパスにスペースを含むMP3/M4Aファイルをエンコードする場合の問題を解決。 + - 内容: UVR5 および ONNX dereverberation モデルのエラーを修正。FFmpeg が元のパスにスペースを含む MP3/M4A ファイルをエンコードする場合の問題を解決。 - タイプ: 修正 - 貢献者: RVC-Boss diff --git a/docs/ko/Changelog_KO.md b/docs/ko/Changelog_KO.md index f22a830d..b9b91209 100644 --- a/docs/ko/Changelog_KO.md +++ b/docs/ko/Changelog_KO.md @@ -242,7 +242,7 @@ - 유형: 최적화 - 기여자: GoHomeToMacDonal - 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721) - - 내용: 빠른 추론 브랜치 'fast_inference_' 추가 + - 내용: 빠른 추론 브랜치 'fast_inference_' 추가 - 유형: 기능 - 기여자: ChasonJiang - 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761) @@ -285,7 +285,7 @@ - 2024.06.06 [Commit#99f09c8b](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) - 내용: WebUI의 GPT 미세조정 시 중국어 입력 텍스트의 BERT 특징을 읽지 않아 추론과 불일치 및 품질 저하가 발생하는 문제 수정 - **주의: 이전에 대량의 데이터로 미세조정을 한 경우 품질 향상을 위해 모델 재조정 권장** + **주의: 이전에 대량의 데이터로 미세조정을 한 경우 품질 향상을 위해 모델 재조정 권장** - 유형: 수정 - 기여자: RVC-Boss - 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) @@ -347,7 +347,7 @@ - 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340) - 내용: 합성 중 음성 속도 조절 기능 추가(무작위성 고정 및 속도만 제어 옵션 포함). 
이 기능은 `api.py`에 업데이트됨 - 유형: 기능 - - 기여자: RVC-Boss, 红血球AE3803 + - 기여자: RVC-Boss, 红血球 AE3803 - 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) - 내용: BS-RoFormer 보컬 분리 모델 지원 추가 - 유형: 기능 @@ -409,7 +409,7 @@ - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) - 내용: **GPT-SoVITS V3 모델 추가, 파인튜닝 시 14GB VRAM 필요.** - - 유형: 신규 기능 ([위키 참조](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))) + - 유형: 신규 기능 ([위키 참조](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))) - 기여자: RVC-Boss - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) - 내용: 다국어 프로젝트 문서 업데이트. @@ -475,6 +475,7 @@ - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239). - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174). 
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241) + - 내용: **SoVITS v3 병렬 추론 지원 활성화.** - 유형: 신규 기능 - 기여자: ChasonJiang diff --git a/docs/tr/Changelog_TR.md b/docs/tr/Changelog_TR.md index 5612aeed..e477afd9 100644 --- a/docs/tr/Changelog_TR.md +++ b/docs/tr/Changelog_TR.md @@ -244,7 +244,7 @@ - Tür: Optimizasyon - Katkıda Bulunan: GoHomeToMacDonal - 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721) - - İçerik: Hızlı çıkarım dalı 'fast_inference_' eklendi + - İçerik: Hızlı çıkarım dalı 'fast_inference_' eklendi - Tür: Özellik - Katkıda Bulunan: ChasonJiang - 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761) @@ -287,7 +287,7 @@ - 2024.06.06 [Commit#99f09c8b](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) - İçerik: WebUI'da GPT ince ayarında Çince metinlerin BERT özelliklerinin okunmaması nedeniyle çıkarım tutarsızlığı ve kalite düşüşü sorunu düzeltildi - **Uyarı: Daha önce büyük miktarda veriyle ince ayar yaptıysanız, kaliteyi artırmak için modeli yeniden ayarlamanız önerilir** + **Uyarı: Daha önce büyük miktarda veriyle ince ayar yaptıysanız, kaliteyi artırmak için modeli yeniden ayarlamanız önerilir** - Tür: Düzeltme - Katkıda Bulunan: RVC-Boss - 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) @@ -347,7 +347,7 @@ - 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340) - İçerik: Sentez sırasında konuşma hızı ayarlama özelliği eklendi (rastgeleliği sabitleme ve sadece hızı kontrol etme seçeneği dahil). 
Bu özellik `api.py` dosyasına eklendi - Tür: Özellik - - Katkıda Bulunan: RVC-Boss, 红血球AE3803 + - Katkıda Bulunan: RVC-Boss, 红血球 AE3803 - 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) - İçerik: BS-RoFormer vokal eşlik ayırma modeli desteği eklendi. - Tür: Yeni Özellik @@ -409,7 +409,7 @@ - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) - İçerik: **İnce ayar için 14GB VRAM gerektiren GPT-SoVITS V3 modeli eklendi.** - - Tür: Yeni Özellik ([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) referans) + - Tür: Yeni Özellik ([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) referans) - Katkıda Bulunan: RVC-Boss - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) - İçerik: Çok dilli proje dokümantasyonu güncellendi. @@ -475,6 +475,7 @@ - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239). - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174). 
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241) + - İçerik: **SoVITS v3 için paralel çıkarım etkinleştirildi.** - Tür: Yeni Özellik - Katkıda Bulunan: ChasonJiang From a8f366ac148a58893b3ba076b16cfd54dfc5a1be Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Thu, 5 Jun 2025 19:58:35 +0800 Subject: [PATCH 7/8] Update Badges --- README.md | 14 +++++--------- docs/cn/README.md | 16 ++++++---------- docs/ja/README.md | 12 ++++-------- docs/ko/README.md | 12 ++++-------- docs/tr/README.md | 12 ++++-------- webui.py | 2 +- 6 files changed, 24 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 5d902c23..7c845800 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,14 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) +[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md) -[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) -[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) - 
-[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) -[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) -[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) - **English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
@@ -331,7 +327,7 @@ Use v4 from v1/v2/v3 environment: New Features: 1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed. - [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)) + [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)) 2.v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timebre of v3/v4 lean more toward the reference audio rather than the overall training set. diff --git a/docs/cn/README.md b/docs/cn/README.md index 57d441d5..21ee3567 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -9,22 +9,18 @@ RVC-Boss%2FGPT-SoVITS | Trendshift -[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) -[![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) +[![Train In Colab](https://img.shields.io/badge/Colab-训练-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) +[![Infer In Colab](https://img.shields.io/badge/Colab-推理-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) +[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) +[![Image 
Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) +[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) + [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) [![Change Log](https://img.shields.io/badge/更新日志-查看更新-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md) -[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) -[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) -[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) - -[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) -[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) -[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) - [**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
diff --git a/docs/ja/README.md b/docs/ja/README.md index efb9012f..242222f4 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -11,17 +11,13 @@ [![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) -[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) -[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ja/Changelog_JA.md) - [![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) -[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) -[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) -[![Image 
Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md) diff --git a/docs/ko/README.md b/docs/ko/README.md index 91057fd3..af7d79a4 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -11,17 +11,13 @@ [![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) -[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) -[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ko/Changelog_KO.md) - [![GitHub 
release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) -[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) -[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) -[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) +[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md) diff --git a/docs/tr/README.md b/docs/tr/README.md index a8576c17..7d6fc93f 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -13,17 +13,13 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz [![Infer In 
Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb) [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2) -[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) -[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) -[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/tr/Changelog_TR.md) - [![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases) -[![GitHub Stars](https://img.shields.io/github/stars/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/stargazers) +[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) -[![Python](https://img.shields.io/badge/Python-3.9%2B-blue.svg?style=for-the-badge&logo=python)](https://www.python.org/downloads/) -[![PyTorch](https://img.shields.io/badge/PyTorch-2.5.1+-ee4c2c?style=for-the-badge&logo=pytorch)](https://pytorch.org/) -[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 
+[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) +[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/) +[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe** diff --git a/webui.py b/webui.py index 08a7ef45..b8b3bb22 100644 --- a/webui.py +++ b/webui.py @@ -12,6 +12,7 @@ import platform import shutil import signal +import gradio as gr import psutil import torch import yaml @@ -67,7 +68,6 @@ from tools.i18n.i18n import I18nAuto, scan_language_list language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" os.environ["language"] = language i18n = I18nAuto(language=language) -from multiprocessing import cpu_count from config import ( GPU_INDEX, From 1e59f757a29e7e9eff73fb65b5740af098fe064e Mon Sep 17 00:00:00 2001 From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:18:57 +0800 Subject: [PATCH 8/8] Format --- GPT_SoVITS/AR/models/t2s_lightning_module.py | 3 +- GPT_SoVITS/TTS_infer_pack/TTS.py | 74 ++++--- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 4 +- GPT_SoVITS/eres2net/ERes2Net.py | 162 +++++++------- GPT_SoVITS/eres2net/ERes2NetV2.py | 208 ++++++++---------- GPT_SoVITS/eres2net/ERes2Net_huge.py | 163 +++++++------- GPT_SoVITS/eres2net/fusion.py | 4 +- GPT_SoVITS/eres2net/kaldi.py | 49 ++++- GPT_SoVITS/eres2net/pooling_layers.py | 27 +-- GPT_SoVITS/export_torch_script_v3v4.py | 24 +- GPT_SoVITS/f5_tts/model/backbones/dit.py | 8 +- GPT_SoVITS/module/data_utils.py | 41 +++- GPT_SoVITS/module/models.py | 71 ++++-- 
GPT_SoVITS/module/modules.py | 7 +- GPT_SoVITS/prepare_datasets/2-get-sv.py | 50 +++-- GPT_SoVITS/process_ckpt.py | 16 +- GPT_SoVITS/s2_train.py | 93 ++++++-- GPT_SoVITS/sv.py | 30 ++- .../text/LangSegmenter/langsegmenter.py | 107 ++++----- GPT_SoVITS/text/g2pw/onnx_api.py | 1 - GPT_SoVITS/text/tone_sandhi.py | 6 +- api.py | 84 ++++--- config.py | 14 +- tools/my_utils.py | 2 +- tools/uvr5/mdxnet.py | 4 +- tools/uvr5/vr.py | 8 +- webui.py | 4 +- 27 files changed, 719 insertions(+), 545 deletions(-) diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 97f3a084..fd357b94 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -28,7 +28,8 @@ class Text2SemanticLightningModule(LightningModule): self.load_state_dict( torch.load( pretrained_s1, - map_location="cpu", weights_only=False, + map_location="cpu", + weights_only=False, )["weight"], ) ) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index be936005..795b55dd 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -32,19 +32,21 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer from tools.audio_sr import AP_BWE from tools.i18n.i18n import I18nAuto, scan_language_list -from tools.my_utils import load_audio from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.TextPreprocessor import TextPreprocessor from sv import SV -resample_transform_dict={} -def resample(audio_tensor, sr0,sr1,device): + +resample_transform_dict = {} + + +def resample(audio_tensor, sr0, sr1, device): global resample_transform_dict - key="%s-%s-%s"%(sr0,sr1,str(device)) + key = "%s-%s-%s" % (sr0, sr1, str(device)) if key not in resample_transform_dict: - resample_transform_dict[key] = torchaudio.transforms.Resample( - sr0, sr1 - ).to(device) + resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) return 
resample_transform_dict[key](audio_tensor) + + language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) @@ -111,6 +113,7 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int): return processed_audio + class DictToAttrRecursive(dict): def __init__(self, input_dict): super().__init__(input_dict) @@ -479,7 +482,7 @@ class TTS: def init_vits_weights(self, weights_path: str): self.configs.vits_weights_path = weights_path version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path) - if "Pro"in model_version: + if "Pro" in model_version: self.init_sv_model() path_sovits = self.configs.default_configs[model_version]["vits_weights_path"] @@ -498,9 +501,9 @@ class TTS: else: hps["model"]["version"] = "v2" version = hps["model"]["version"] - v3v4set={"v3", "v4"} + v3v4set = {"v3", "v4"} if model_version not in v3v4set: - if "Pro"not in model_version: + if "Pro" not in model_version: model_version = version else: hps["model"]["version"] = model_version @@ -542,7 +545,7 @@ class TTS: if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"): del vits_model.enc_q - self.is_v2pro=model_version in {"v2Pro","v2ProPlus"} + self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"} if if_lora_v3 == False: print( @@ -632,7 +635,9 @@ class TTS: ) self.vocoder.remove_weight_norm() state_dict_g = torch.load( - "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False + "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), + map_location="cpu", + weights_only=False, ) print("loading vocoder", self.vocoder.load_state_dict(state_dict_g)) @@ -752,11 +757,13 @@ class TTS: if raw_sr != self.configs.sampling_rate: audio = raw_audio.to(self.configs.device) - if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0) + if audio.shape[0] == 2: + audio = 
audio.mean(0).unsqueeze(0) audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device) else: audio = raw_audio.to(self.configs.device) - if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0) + if audio.shape[0] == 2: + audio = audio.mean(0).unsqueeze(0) maxx = audio.abs().max() if maxx > 1: @@ -775,8 +782,9 @@ class TTS: audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device) if self.configs.is_half: audio = audio.half() - else:audio=None - return spec,audio + else: + audio = None + return spec, audio def _set_prompt_semantic(self, ref_wav_path: str): zero_wav = np.zeros( @@ -1073,7 +1081,10 @@ class TTS: ###### setting reference audio and prompt text preprocessing ######## t0 = time.perf_counter() - if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"] or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)): + if (ref_audio_path is not None) and ( + ref_audio_path != self.prompt_cache["ref_audio_path"] + or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None) + ): if not os.path.exists(ref_audio_path): raise ValueError(f"{ref_audio_path} not exists") self.set_ref_audio(ref_audio_path) @@ -1212,9 +1223,10 @@ class TTS: t_34 += t4 - t3 refer_audio_spec = [] - if self.is_v2pro:sv_emb=[] - for spec,audio_tensor in self.prompt_cache["refer_spec"]: - spec=spec.to(dtype=self.precision, device=self.configs.device) + if self.is_v2pro: + sv_emb = [] + for spec, audio_tensor in self.prompt_cache["refer_spec"]: + spec = spec.to(dtype=self.precision, device=self.configs.device) refer_audio_spec.append(spec) if self.is_v2pro: sv_emb.append(self.sv_model.compute_embedding3(audio_tensor)) @@ -1249,10 +1261,14 @@ class TTS: torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) ) _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device) - if self.is_v2pro!=True: - _batch_audio_fragment = 
self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :] + if self.is_v2pro != True: + _batch_audio_fragment = self.vits_model.decode( + all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] else: - _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :] + _batch_audio_fragment = self.vits_model.decode( + all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb + ).detach()[0, 0, :] audio_frag_end_idx.insert(0, 0) batch_audio_fragment = [ _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]] @@ -1266,9 +1282,13 @@ class TTS: pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0) ) # .unsqueeze(0)#mq要多unsqueeze一次 if self.is_v2pro != True: - audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :] + audio_fragment = self.vits_model.decode( + _pred_semantic, phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] else: - audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :] + audio_fragment = self.vits_model.decode( + _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb + ).detach()[0, 0, :] batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分 else: if parallel_infer: @@ -1410,7 +1430,7 @@ class TTS: raw_entry = self.prompt_cache["refer_spec"][0] if isinstance(raw_entry, tuple): raw_entry = raw_entry[0] - refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device) + refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device) fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) ref_audio: torch.Tensor = self.prompt_cache["raw_audio"] @@ -1480,7 +1500,7 @@ class 
TTS: raw_entry = self.prompt_cache["refer_spec"][0] if isinstance(raw_entry, tuple): raw_entry = raw_entry[0] - refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device) + refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device) fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) ref_audio: torch.Tensor = self.prompt_cache["raw_audio"] diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index f03183a1..9a478d43 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -160,7 +160,9 @@ class TextPreprocessor: else: for tmp in LangSegmenter.getTexts(text): if langlist: - if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"): + if (tmp["lang"] == "en" and langlist[-1] == "en") or ( + tmp["lang"] != "en" and langlist[-1] != "en" + ): textlist[-1] += tmp["text"] continue if tmp["lang"] == "en": diff --git a/GPT_SoVITS/eres2net/ERes2Net.py b/GPT_SoVITS/eres2net/ERes2Net.py index f728742d..1618c813 100644 --- a/GPT_SoVITS/eres2net/ERes2Net.py +++ b/GPT_SoVITS/eres2net/ERes2Net.py @@ -1,13 +1,12 @@ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker. - ERes2Net incorporates both local and global feature fusion techniques to improve the performance. - The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal. - The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal. """ - +Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker. 
+ERes2Net incorporates both local and global feature fusion techniques to improve the performance. +The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal. +The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal. +""" import torch import math @@ -16,15 +15,14 @@ import torch.nn.functional as F import pooling_layers as pooling_layers from fusion import AFF -class ReLU(nn.Hardtanh): +class ReLU(nn.Hardtanh): def __init__(self, inplace=False): super(ReLU, self).__init__(0, 20, inplace) def __repr__(self): - inplace_str = 'inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + inplace_str + ')' + inplace_str = "inplace" if self.inplace else "" + return self.__class__.__name__ + " (" + inplace_str + ")" class BasicBlockERes2Net(nn.Module): @@ -32,28 +30,28 @@ class BasicBlockERes2Net(nn.Module): def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2): super(BasicBlockERes2Net, self).__init__() - width = int(math.floor(planes*(baseWidth/64.0))) - self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False) - self.bn1 = nn.BatchNorm2d(width*scale) + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) self.nums = scale - convs=[] - bns=[] + convs = [] + bns = [] for i in range(self.nums): - convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) - bns.append(nn.BatchNorm2d(width)) + convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList(bns) self.relu = ReLU(inplace=True) - - self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes*self.expansion) + + self.conv3 = 
nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, - stride=stride, bias=False), - nn.BatchNorm2d(self.expansion * planes)) + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion * planes), + ) self.stride = stride self.width = width self.scale = scale @@ -64,18 +62,18 @@ class BasicBlockERes2Net(nn.Module): out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - spx = torch.split(out,self.width,1) + spx = torch.split(out, self.width, 1) for i in range(self.nums): - if i==0: - sp = spx[i] - else: - sp = sp + spx[i] - sp = self.convs[i](sp) - sp = self.relu(self.bns[i](sp)) - if i==0: - out = sp - else: - out = torch.cat((out,sp),1) + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) out = self.conv3(out) out = self.bn3(out) @@ -86,22 +84,23 @@ class BasicBlockERes2Net(nn.Module): return out + class BasicBlockERes2Net_diff_AFF(nn.Module): expansion = 2 def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2): super(BasicBlockERes2Net_diff_AFF, self).__init__() - width = int(math.floor(planes*(baseWidth/64.0))) - self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False) - self.bn1 = nn.BatchNorm2d(width*scale) + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) self.nums = scale - convs=[] - fuse_models=[] - bns=[] + convs = [] + fuse_models = [] + bns = [] for i in range(self.nums): - convs.append(nn.Conv2d(width, width, 
kernel_size=3, padding=1, bias=False)) - bns.append(nn.BatchNorm2d(width)) + convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) for j in range(self.nums - 1): fuse_models.append(AFF(channels=width)) @@ -109,15 +108,15 @@ class BasicBlockERes2Net_diff_AFF(nn.Module): self.bns = nn.ModuleList(bns) self.fuse_models = nn.ModuleList(fuse_models) self.relu = ReLU(inplace=True) - - self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes*self.expansion) + + self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, - stride=stride, bias=False), - nn.BatchNorm2d(self.expansion * planes)) + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion * planes), + ) self.stride = stride self.width = width self.scale = scale @@ -128,19 +127,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module): out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - spx = torch.split(out,self.width,1) + spx = torch.split(out, self.width, 1) for i in range(self.nums): - if i==0: + if i == 0: sp = spx[i] else: - sp = self.fuse_models[i-1](sp, spx[i]) - + sp = self.fuse_models[i - 1](sp, spx[i]) + sp = self.convs[i](sp) sp = self.relu(self.bns[i](sp)) - if i==0: + if i == 0: out = sp else: - out = torch.cat((out,sp),1) + out = torch.cat((out, sp), 1) out = self.conv3(out) out = self.bn3(out) @@ -151,16 +150,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module): return out + class ERes2Net(nn.Module): - def __init__(self, - block=BasicBlockERes2Net, - block_fuse=BasicBlockERes2Net_diff_AFF, - num_blocks=[3, 4, 6, 3], - m_channels=32, - feat_dim=80, - 
embedding_size=192, - pooling_func='TSTP', - two_emb_layer=False): + def __init__( + self, + block=BasicBlockERes2Net, + block_fuse=BasicBlockERes2Net_diff_AFF, + num_blocks=[3, 4, 6, 3], + m_channels=32, + feat_dim=80, + embedding_size=192, + pooling_func="TSTP", + two_emb_layer=False, + ): super(ERes2Net, self).__init__() self.in_planes = m_channels self.feat_dim = feat_dim @@ -176,20 +178,24 @@ class ERes2Net(nn.Module): self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2) # Downsampling module for each layer - self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False) - self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False) - self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False) + self.layer1_downsample = nn.Conv2d( + m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False + ) + self.layer2_downsample = nn.Conv2d( + m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False + ) + self.layer3_downsample = nn.Conv2d( + m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False + ) # Bottom-up fusion module self.fuse_mode12 = AFF(channels=m_channels * 4) self.fuse_mode123 = AFF(channels=m_channels * 8) self.fuse_mode1234 = AFF(channels=m_channels * 16) - self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2 - self.pool = getattr(pooling_layers, pooling_func)( - in_dim=self.stats_dim * block.expansion) - self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, - embedding_size) + self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2 + self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion) + self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size) if self.two_emb_layer: 
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) self.seg_2 = nn.Linear(embedding_size, embedding_size) @@ -212,7 +218,7 @@ class ERes2Net(nn.Module): out1 = self.layer1(out) out2 = self.layer2(out1) out1_downsample = self.layer1_downsample(out1) - fuse_out12 = self.fuse_mode12(out2, out1_downsample) + fuse_out12 = self.fuse_mode12(out2, out1_downsample) out3 = self.layer3(out2) fuse_out12_downsample = self.layer2_downsample(fuse_out12) fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample) @@ -243,18 +249,16 @@ class ERes2Net(nn.Module): fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample) out4 = self.layer4(out3) fuse_out123_downsample = self.layer3_downsample(fuse_out123) - fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1) + fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1) return fuse_out1234 -if __name__ == '__main__': - +if __name__ == "__main__": x = torch.zeros(10, 300, 80) - model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP') + model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func="TSTP") model.eval() out = model(x) - print(out.shape) # torch.Size([10, 192]) + print(out.shape) # torch.Size([10, 192]) num_params = sum(param.numel() for param in model.parameters()) - print("{} M".format(num_params / 1e6)) # 6.61M - + print("{} M".format(num_params / 1e6)) # 6.61M diff --git a/GPT_SoVITS/eres2net/ERes2NetV2.py b/GPT_SoVITS/eres2net/ERes2NetV2.py index fdfd6db4..2e152a41 100644 --- a/GPT_SoVITS/eres2net/ERes2NetV2.py +++ b/GPT_SoVITS/eres2net/ERes2NetV2.py @@ -1,14 +1,12 @@ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension - within each stage. 
However, this modification also increases the number of model parameters and computational complexity. - To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing - both the model parameters and its computational cost. """ - - +To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension +within each stage. However, this modification also increases the number of model parameters and computational complexity. +To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing +both the model parameters and its computational cost. +""" import torch import math @@ -17,47 +15,42 @@ import torch.nn.functional as F import pooling_layers as pooling_layers from fusion import AFF -class ReLU(nn.Hardtanh): +class ReLU(nn.Hardtanh): def __init__(self, inplace=False): super(ReLU, self).__init__(0, 20, inplace) def __repr__(self): - inplace_str = 'inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + inplace_str + ')' + inplace_str = "inplace" if self.inplace else "" + return self.__class__.__name__ + " (" + inplace_str + ")" class BasicBlockERes2NetV2(nn.Module): - def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2): super(BasicBlockERes2NetV2, self).__init__() - width = int(math.floor(planes*(baseWidth/64.0))) - self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False) - self.bn1 = nn.BatchNorm2d(width*scale) + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) self.nums = scale self.expansion = expansion - convs=[] - bns=[] + convs = [] + bns = [] for i in range(self.nums): - convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) - bns.append(nn.BatchNorm2d(width)) + 
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList(bns) self.relu = ReLU(inplace=True) - - self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes*self.expansion) + + self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, - self.expansion * planes, - kernel_size=1, - stride=stride, - bias=False), - nn.BatchNorm2d(self.expansion * planes)) + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion * planes), + ) self.stride = stride self.width = width self.scale = scale @@ -68,18 +61,18 @@ class BasicBlockERes2NetV2(nn.Module): out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - spx = torch.split(out,self.width,1) + spx = torch.split(out, self.width, 1) for i in range(self.nums): - if i==0: - sp = spx[i] - else: - sp = sp + spx[i] - sp = self.convs[i](sp) - sp = self.relu(self.bns[i](sp)) - if i==0: - out = sp - else: - out = torch.cat((out,sp),1) + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) out = self.conv3(out) out = self.bn3(out) @@ -90,22 +83,22 @@ class BasicBlockERes2NetV2(nn.Module): return out -class BasicBlockERes2NetV2AFF(nn.Module): +class BasicBlockERes2NetV2AFF(nn.Module): def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2): super(BasicBlockERes2NetV2AFF, self).__init__() - width = int(math.floor(planes*(baseWidth/64.0))) - self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False) 
- self.bn1 = nn.BatchNorm2d(width*scale) + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) self.nums = scale self.expansion = expansion - convs=[] - fuse_models=[] - bns=[] + convs = [] + fuse_models = [] + bns = [] for i in range(self.nums): - convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) - bns.append(nn.BatchNorm2d(width)) + convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) for j in range(self.nums - 1): fuse_models.append(AFF(channels=width, r=4)) @@ -113,18 +106,15 @@ class BasicBlockERes2NetV2AFF(nn.Module): self.bns = nn.ModuleList(bns) self.fuse_models = nn.ModuleList(fuse_models) self.relu = ReLU(inplace=True) - - self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes*self.expansion) + + self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, - self.expansion * planes, - kernel_size=1, - stride=stride, - bias=False), - nn.BatchNorm2d(self.expansion * planes)) + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion * planes), + ) self.stride = stride self.width = width self.scale = scale @@ -135,19 +125,19 @@ class BasicBlockERes2NetV2AFF(nn.Module): out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - spx = torch.split(out,self.width,1) + spx = torch.split(out, self.width, 1) for i in range(self.nums): - if i==0: + if i == 0: sp = spx[i] else: - sp = self.fuse_models[i-1](sp, spx[i]) - + sp = self.fuse_models[i - 1](sp, spx[i]) + sp = self.convs[i](sp) sp = 
self.relu(self.bns[i](sp)) - if i==0: + if i == 0: out = sp else: - out = torch.cat((out,sp),1) + out = torch.cat((out, sp), 1) out = self.conv3(out) out = self.bn3(out) @@ -158,19 +148,22 @@ class BasicBlockERes2NetV2AFF(nn.Module): return out + class ERes2NetV2(nn.Module): - def __init__(self, - block=BasicBlockERes2NetV2, - block_fuse=BasicBlockERes2NetV2AFF, - num_blocks=[3, 4, 6, 3], - m_channels=64, - feat_dim=80, - embedding_size=192, - baseWidth=26, - scale=2, - expansion=2, - pooling_func='TSTP', - two_emb_layer=False): + def __init__( + self, + block=BasicBlockERes2NetV2, + block_fuse=BasicBlockERes2NetV2AFF, + num_blocks=[3, 4, 6, 3], + m_channels=64, + feat_dim=80, + embedding_size=192, + baseWidth=26, + scale=2, + expansion=2, + pooling_func="TSTP", + two_emb_layer=False, + ): super(ERes2NetV2, self).__init__() self.in_planes = m_channels self.feat_dim = feat_dim @@ -181,42 +174,29 @@ class ERes2NetV2(nn.Module): self.scale = scale self.expansion = expansion - self.conv1 = nn.Conv2d(1, - m_channels, - kernel_size=3, - stride=1, - padding=1, - bias=False) + self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(m_channels) - self.layer1 = self._make_layer(block, - m_channels, - num_blocks[0], - stride=1) - self.layer2 = self._make_layer(block, - m_channels * 2, - num_blocks[1], - stride=2) - self.layer3 = self._make_layer(block_fuse, - m_channels * 4, - num_blocks[2], - stride=2) - self.layer4 = self._make_layer(block_fuse, - m_channels * 8, - num_blocks[3], - stride=2) + self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2) # Downsampling module - self.layer3_ds = nn.Conv2d(m_channels * 4 * self.expansion, m_channels * 8 * 
self.expansion, kernel_size=3, \ - padding=1, stride=2, bias=False) + self.layer3_ds = nn.Conv2d( + m_channels * 4 * self.expansion, + m_channels * 8 * self.expansion, + kernel_size=3, + padding=1, + stride=2, + bias=False, + ) # Bottom-up fusion module self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4) - self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2 - self.pool = getattr(pooling_layers, pooling_func)( - in_dim=self.stats_dim * self.expansion) - self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, - embedding_size) + self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2 + self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * self.expansion) + self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embedding_size) if self.two_emb_layer: self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) self.seg_2 = nn.Linear(embedding_size, embedding_size) @@ -228,7 +208,11 @@ class ERes2NetV2(nn.Module): strides = [stride] + [1] * (num_blocks - 1) layers = [] for stride in strides: - layers.append(block(self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion)) + layers.append( + block( + self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion + ) + ) self.in_planes = planes * self.expansion return nn.Sequential(*layers) @@ -264,7 +248,7 @@ class ERes2NetV2(nn.Module): out3_ds = self.layer3_ds(out3) fuse_out34 = self.fuse34(out4, out3_ds) # print(111111111,fuse_out34.shape)#111111111 torch.Size([16, 2048, 10, 72]) - return fuse_out34.flatten(start_dim=1,end_dim=2).mean(-1) + return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1) # stats = self.pool(fuse_out34) # # embed_a = self.seg_1(stats) @@ -276,17 +260,13 @@ class ERes2NetV2(nn.Module): # else: # return embed_a -if __name__ == '__main__': +if __name__ == "__main__": x = torch.randn(1, 300, 80) model = 
ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2) model.eval() y = model(x) print(y.size()) - macs, num_params = profile(model, inputs=(x, )) - print("Params: {} M".format(num_params / 1e6)) # 17.86 M - print("MACs: {} G".format(macs / 1e9)) # 12.69 G - - - - + macs, num_params = profile(model, inputs=(x,)) + print("Params: {} M".format(num_params / 1e6)) # 17.86 M + print("MACs: {} G".format(macs / 1e9)) # 12.69 G diff --git a/GPT_SoVITS/eres2net/ERes2Net_huge.py b/GPT_SoVITS/eres2net/ERes2Net_huge.py index 0ea82d39..0f04236b 100644 --- a/GPT_SoVITS/eres2net/ERes2Net_huge.py +++ b/GPT_SoVITS/eres2net/ERes2Net_huge.py @@ -1,14 +1,13 @@ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker. - ERes2Net incorporates both local and global feature fusion techniques to improve the performance. - The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal. - The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal. - ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better - recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance. +"""Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker. +ERes2Net incorporates both local and global feature fusion techniques to improve the performance. +The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal. +The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal. 
+ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better +recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance. """ -import pdb import torch import math @@ -17,15 +16,14 @@ import torch.nn.functional as F import pooling_layers as pooling_layers from fusion import AFF -class ReLU(nn.Hardtanh): +class ReLU(nn.Hardtanh): def __init__(self, inplace=False): super(ReLU, self).__init__(0, 20, inplace) def __repr__(self): - inplace_str = 'inplace' if self.inplace else '' - return self.__class__.__name__ + ' (' \ - + inplace_str + ')' + inplace_str = "inplace" if self.inplace else "" + return self.__class__.__name__ + " (" + inplace_str + ")" class BasicBlockERes2Net(nn.Module): @@ -33,27 +31,28 @@ class BasicBlockERes2Net(nn.Module): def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3): super(BasicBlockERes2Net, self).__init__() - width = int(math.floor(planes*(baseWidth/64.0))) - self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False) - self.bn1 = nn.BatchNorm2d(width*scale) + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) self.nums = scale - convs=[] - bns=[] + convs = [] + bns = [] for i in range(self.nums): - convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) - bns.append(nn.BatchNorm2d(width)) + convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList(bns) self.relu = ReLU(inplace=True) - - self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes*self.expansion) + + self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = 
nn.BatchNorm2d(planes * self.expansion) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion * planes)) + nn.BatchNorm2d(self.expansion * planes), + ) self.stride = stride self.width = width self.scale = scale @@ -64,18 +63,18 @@ class BasicBlockERes2Net(nn.Module): out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - spx = torch.split(out,self.width,1) + spx = torch.split(out, self.width, 1) for i in range(self.nums): - if i==0: - sp = spx[i] - else: - sp = sp + spx[i] - sp = self.convs[i](sp) - sp = self.relu(self.bns[i](sp)) - if i==0: - out = sp - else: - out = torch.cat((out,sp),1) + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) out = self.conv3(out) out = self.bn3(out) @@ -86,22 +85,23 @@ class BasicBlockERes2Net(nn.Module): return out + class BasicBlockERes2Net_diff_AFF(nn.Module): expansion = 4 def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3): super(BasicBlockERes2Net_diff_AFF, self).__init__() - width = int(math.floor(planes*(baseWidth/64.0))) - self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False) - self.bn1 = nn.BatchNorm2d(width*scale) + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) self.nums = scale - convs=[] - fuse_models=[] - bns=[] + convs = [] + fuse_models = [] + bns = [] for i in range(self.nums): - convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) - bns.append(nn.BatchNorm2d(width)) + convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) for j in 
range(self.nums - 1): fuse_models.append(AFF(channels=width)) @@ -109,14 +109,15 @@ class BasicBlockERes2Net_diff_AFF(nn.Module): self.bns = nn.ModuleList(bns) self.fuse_models = nn.ModuleList(fuse_models) self.relu = ReLU(inplace=True) - - self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes*self.expansion) + + self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.shortcut = nn.Sequential() if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion * planes)) + nn.BatchNorm2d(self.expansion * planes), + ) self.stride = stride self.width = width self.scale = scale @@ -127,20 +128,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module): out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - spx = torch.split(out,self.width,1) + spx = torch.split(out, self.width, 1) for i in range(self.nums): - if i==0: + if i == 0: sp = spx[i] else: - sp = self.fuse_models[i-1](sp, spx[i]) - + sp = self.fuse_models[i - 1](sp, spx[i]) + sp = self.convs[i](sp) sp = self.relu(self.bns[i](sp)) - if i==0: + if i == 0: out = sp else: - out = torch.cat((out,sp),1) - + out = torch.cat((out, sp), 1) out = self.conv3(out) out = self.bn3(out) @@ -151,16 +151,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module): return out + class ERes2Net(nn.Module): - def __init__(self, - block=BasicBlockERes2Net, - block_fuse=BasicBlockERes2Net_diff_AFF, - num_blocks=[3, 4, 6, 3], - m_channels=64, - feat_dim=80, - embedding_size=192, - pooling_func='TSTP', - two_emb_layer=False): + def __init__( + self, + block=BasicBlockERes2Net, + block_fuse=BasicBlockERes2Net_diff_AFF, + num_blocks=[3, 4, 6, 3], + m_channels=64, + feat_dim=80, + embedding_size=192, + pooling_func="TSTP", + two_emb_layer=False, + ): 
super(ERes2Net, self).__init__() self.in_planes = m_channels self.feat_dim = feat_dim @@ -176,17 +179,22 @@ class ERes2Net(nn.Module): self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2) self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2) - self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False) - self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False) - self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False) + self.layer1_downsample = nn.Conv2d( + m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False + ) + self.layer2_downsample = nn.Conv2d( + m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False + ) + self.layer3_downsample = nn.Conv2d( + m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False + ) self.fuse_mode12 = AFF(channels=m_channels * 8) self.fuse_mode123 = AFF(channels=m_channels * 16) self.fuse_mode1234 = AFF(channels=m_channels * 32) - self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2 - self.pool = getattr(pooling_layers, pooling_func)( - in_dim=self.stats_dim * block.expansion) + self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2 + self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion) self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size) if self.two_emb_layer: self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) @@ -229,7 +237,7 @@ class ERes2Net(nn.Module): else: return embed_a - def forward2(self, x,if_mean): + def forward2(self, x, if_mean): x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) x = x.unsqueeze_(1) @@ -243,14 +251,13 @@ class ERes2Net(nn.Module): fuse_out123 = self.fuse_mode123(out3, 
fuse_out12_downsample) out4 = self.layer4(out3) fuse_out123_downsample = self.layer3_downsample(fuse_out123) - fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2)#bs,20480,T - if(if_mean==False): - mean=fuse_out1234[0].transpose(1,0)#(T,20480),bs=T + fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2) # bs,20480,T + if if_mean == False: + mean = fuse_out1234[0].transpose(1, 0) # (T,20480),bs=T else: - mean = fuse_out1234.mean(2)#bs,20480 - mean_std=torch.cat([mean,torch.zeros_like(mean)],1) - return self.seg_1(mean_std)#(T,192) - + mean = fuse_out1234.mean(2) # bs,20480 + mean_std = torch.cat([mean, torch.zeros_like(mean)], 1) + return self.seg_1(mean_std) # (T,192) # stats = self.pool(fuse_out1234) # if self.two_emb_layer: @@ -275,12 +282,8 @@ class ERes2Net(nn.Module): fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample) out4 = self.layer4(out3) fuse_out123_downsample = self.layer3_downsample(fuse_out123) - fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1) + fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1) return fuse_out1234 # print(fuse_out1234.shape) # print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape) # pdb.set_trace() - - - - diff --git a/GPT_SoVITS/eres2net/fusion.py b/GPT_SoVITS/eres2net/fusion.py index 2aff7a72..d156a55c 100644 --- a/GPT_SoVITS/eres2net/fusion.py +++ b/GPT_SoVITS/eres2net/fusion.py @@ -6,7 +6,6 @@ import torch.nn as nn class AFF(nn.Module): - def __init__(self, channels=64, r=4): super(AFF, self).__init__() inter_channels = int(channels // r) @@ -23,7 +22,6 @@ class AFF(nn.Module): xa = torch.cat((x, ds_y), dim=1) x_att = self.local_att(xa) x_att = 1.0 + torch.tanh(x_att) - xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0-x_att) + xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att) return xo - diff --git 
a/GPT_SoVITS/eres2net/kaldi.py b/GPT_SoVITS/eres2net/kaldi.py index 58664c0c..a80e5e6b 100644 --- a/GPT_SoVITS/eres2net/kaldi.py +++ b/GPT_SoVITS/eres2net/kaldi.py @@ -144,7 +144,7 @@ def _get_waveform_and_window_properties( ) assert 0 < window_shift, "`window_shift` must be greater than 0" assert padded_window_size % 2 == 0, ( - "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`" + "the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`" ) assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]" assert sample_frequency > 0, "`sample_frequency` must be greater than zero" @@ -441,7 +441,9 @@ def get_mel_banks( high_freq: float, vtln_low: float, vtln_high: float, - vtln_warp_factor: float,device=None,dtype=None + vtln_warp_factor: float, + device=None, + dtype=None, ) -> Tuple[Tensor, Tensor]: """ Returns: @@ -457,9 +459,9 @@ def get_mel_banks( if high_freq <= 0.0: high_freq += nyquist - assert ( - (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq) - ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist) + assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), ( + "Bad values in options: low-freq {} and high-freq {} vs. 
nyquist {}".format(low_freq, high_freq, nyquist) + ) # fft-bin width [think of it as Nyquist-freq / half-window-length] fft_bin_width = sample_freq / window_length_padded @@ -475,7 +477,7 @@ def get_mel_banks( assert vtln_warp_factor == 1.0 or ( (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high) - ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format( + ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format( vtln_low, vtln_high, low_freq, high_freq ) @@ -508,9 +510,12 @@ def get_mel_banks( bins[up_idx] = up_slope[up_idx] bins[down_idx] = down_slope[down_idx] - return bins.to(device=device,dtype=dtype)#, center_freqs + return bins.to(device=device, dtype=dtype) # , center_freqs + + +cache = {} + -cache={} def fbank( waveform: Tensor, blackman_coeff: float = 0.42, @@ -620,14 +625,34 @@ def fbank( # size (num_mel_bins, padded_window_size // 2) # print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp) - cache_key="%s-%s-%s-%s-%s-%s-%s-%s-%s-%s"%(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype) + cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % ( + num_mel_bins, + padded_window_size, + sample_frequency, + low_freq, + high_freq, + vtln_low, + vtln_high, + vtln_warp, + device, + dtype, + ) if cache_key not in cache: mel_energies = get_mel_banks( - num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype + num_mel_bins, + padded_window_size, + sample_frequency, + low_freq, + high_freq, + vtln_low, + vtln_high, + vtln_warp, + device, + dtype, ) - cache[cache_key]=mel_energies + cache[cache_key] = mel_energies else: - mel_energies=cache[cache_key] + mel_energies = cache[cache_key] # pad right column with zeros and add dimension, size (num_mel_bins, 
padded_window_size // 2 + 1) mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0) diff --git a/GPT_SoVITS/eres2net/pooling_layers.py b/GPT_SoVITS/eres2net/pooling_layers.py index 1fa0e7d6..c3e0eab6 100644 --- a/GPT_SoVITS/eres2net/pooling_layers.py +++ b/GPT_SoVITS/eres2net/pooling_layers.py @@ -1,7 +1,7 @@ # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker.""" +"""This implementation is adapted from https://github.com/wenet-e2e/wespeaker.""" import torch import torch.nn as nn @@ -11,6 +11,7 @@ class TAP(nn.Module): """ Temporal average pooling, only first-order mean is considered """ + def __init__(self, **kwargs): super(TAP, self).__init__() @@ -25,6 +26,7 @@ class TSDP(nn.Module): """ Temporal standard deviation pooling, only second-order std is considered """ + def __init__(self, **kwargs): super(TSDP, self).__init__() @@ -41,6 +43,7 @@ class TSTP(nn.Module): x-vector Comment: simple concatenation can not make full use of both statistics """ + def __init__(self, **kwargs): super(TSTP, self).__init__() @@ -56,9 +59,10 @@ class TSTP(nn.Module): class ASTP(nn.Module): - """ Attentive statistics pooling: Channel- and context-dependent - statistics pooling, first used in ECAPA_TDNN. + """Attentive statistics pooling: Channel- and context-dependent + statistics pooling, first used in ECAPA_TDNN. """ + def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False): super(ASTP, self).__init__() self.global_context_att = global_context_att @@ -66,15 +70,10 @@ class ASTP(nn.Module): # Use Conv1d with stride == 1 rather than Linear, then we don't # need to transpose inputs. 
if global_context_att: - self.linear1 = nn.Conv1d( - in_dim * 3, bottleneck_dim, - kernel_size=1) # equals W and b in the paper + self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1) # equals W and b in the paper else: - self.linear1 = nn.Conv1d( - in_dim, bottleneck_dim, - kernel_size=1) # equals W and b in the paper - self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, - kernel_size=1) # equals V and k in the paper + self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper + self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper def forward(self, x): """ @@ -88,15 +87,13 @@ class ASTP(nn.Module): if self.global_context_att: context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x) - context_std = torch.sqrt( - torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x) + context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x) x_in = torch.cat((x, context_mean, context_std), dim=1) else: x_in = x # DON'T use ReLU here! ReLU may be hard to converge. 
- alpha = torch.tanh( - self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in)) + alpha = torch.tanh(self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in)) alpha = torch.softmax(self.linear2(alpha), dim=2) mean = torch.sum(alpha * x, dim=2) var = torch.sum(alpha * (x**2), dim=2) - mean**2 diff --git a/GPT_SoVITS/export_torch_script_v3v4.py b/GPT_SoVITS/export_torch_script_v3v4.py index 55d27282..89cb4b03 100644 --- a/GPT_SoVITS/export_torch_script_v3v4.py +++ b/GPT_SoVITS/export_torch_script_v3v4.py @@ -402,7 +402,7 @@ class GPTSoVITSV3(torch.nn.Module): chunk_len = 934 - fea_ref.shape[2] wav_gen_list = [] idx = 0 - fea_todo = fea_todo[:,:,:-5] + fea_todo = fea_todo[:, :, :-5] wav_gen_length = fea_todo.shape[2] * 256 while 1: # current_time = datetime.now() @@ -434,7 +434,8 @@ class GPTSoVITSV3(torch.nn.Module): wav_gen = torch.cat(wav_gen_list, 2) return wav_gen[0][0][:wav_gen_length] - + + class GPTSoVITSV4(torch.nn.Module): def __init__(self, gpt_sovits_half, cfm, hifigan): super().__init__() @@ -461,7 +462,7 @@ class GPTSoVITSV4(torch.nn.Module): chunk_len = 1000 - fea_ref.shape[2] wav_gen_list = [] idx = 0 - fea_todo = fea_todo[:,:,:-10] + fea_todo = fea_todo[:, :, :-10] wav_gen_length = fea_todo.shape[2] * 480 while 1: # current_time = datetime.now() @@ -577,6 +578,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new v3v4set = {"v3", "v4"} + def get_sovits_weights(sovits_path): path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" is_exist_s2gv3 = os.path.exists(path_sovits_v3) @@ -699,14 +701,13 @@ def export_cfm( return export_cfm -def export_1(ref_wav_path,ref_wav_text,version="v3"): +def export_1(ref_wav_path, ref_wav_text, version="v3"): if version == "v3": sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") init_bigvgan() else: sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth") init_hifigan() - dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") raw_t2s 
= get_raw_t2s_model(dict_s1).to(device) @@ -751,9 +752,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"): # phones1, bert1, norm_text1 = get_phones_and_bert( # "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" # ) - phones1, bert1, norm_text1 = get_phones_and_bert( - ref_wav_text, "auto", "v3" - ) + phones1, bert1, norm_text1 = get_phones_and_bert(ref_wav_text, "auto", "v3") phones2, bert2, norm_text2 = get_phones_and_bert( "这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", "auto", @@ -914,7 +913,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"): hifigan_model_ = torch.jit.trace(hifigan_model, optimize=True, example_inputs=(cmf_res_rand,)) hifigan_model_.save("onnx/ad/hifigan_model.pt") wav_gen = hifigan_model(cmf_res) - + print("wav_gen:", wav_gen.shape, wav_gen.dtype) audio = wav_gen[0][0].cpu().detach().numpy() @@ -1201,7 +1200,6 @@ def export_2(version="v3"): gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4 sr = 24000 if version == "v3" else 48000 - time.sleep(5) # print("thread:", torch.get_num_threads()) # print("thread:", torch.get_num_interop_threads()) @@ -1212,14 +1210,14 @@ def export_2(version="v3"): "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", gpt_sovits_v3v4, "out.wav", - sr + sr, ) test_export( "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 
最后还是我得了 MVP!", gpt_sovits_v3v4, "out2.wav", - sr + sr, ) # test_export( @@ -1251,6 +1249,6 @@ def test_export_gpt_sovits_v3(): with torch.no_grad(): - export_1("onnx/ad/ref.wav","你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。","v4") + export_1("onnx/ad/ref.wav", "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "v4") # export_2("v4") # test_export_gpt_sovits_v3() diff --git a/GPT_SoVITS/f5_tts/model/backbones/dit.py b/GPT_SoVITS/f5_tts/model/backbones/dit.py index f64a3c39..4aa3b9ac 100644 --- a/GPT_SoVITS/f5_tts/model/backbones/dit.py +++ b/GPT_SoVITS/f5_tts/model/backbones/dit.py @@ -143,9 +143,9 @@ class DiT(nn.Module): drop_audio_cond=False, # cfg for cond audio drop_text=False, # cfg for text # mask: bool["b n"] | None = None, # noqa: F722 - infer=False, # bool - text_cache=None, # torch tensor as text_embed - dt_cache=None, # torch tensor as dt + infer=False, # bool + text_cache=None, # torch tensor as text_embed + dt_cache=None, # torch tensor as dt ): x = x0.transpose(2, 1) cond = cond0.transpose(2, 1) @@ -191,4 +191,4 @@ class DiT(nn.Module): if infer: return output, text_embed, dt else: - return output \ No newline at end of file + return output diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 81829683..46eff5fb 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -21,7 +21,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): 3) computes spectrograms from audio files. 
""" - def __init__(self, hparams, version=None,val=False): + def __init__(self, hparams, version=None, val=False): exp_dir = hparams.exp_dir self.path2 = "%s/2-name2text.txt" % exp_dir self.path4 = "%s/4-cnhubert" % exp_dir @@ -29,7 +29,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): assert os.path.exists(self.path2) assert os.path.exists(self.path4) assert os.path.exists(self.path5) - self.is_v2Pro=version in {"v2Pro","v2ProPlus"} + self.is_v2Pro = version in {"v2Pro", "v2ProPlus"} if self.is_v2Pro: self.path7 = "%s/7-sv_cn" % exp_dir assert os.path.exists(self.path7) @@ -118,7 +118,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False if self.is_v2Pro: - sv_emb=torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu") + sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu") except: traceback.print_exc() spec = torch.zeros(1025, 100) @@ -126,10 +126,10 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): ssl = torch.zeros(1, 768, 100) text = text[-1:] if self.is_v2Pro: - sv_emb=torch.zeros(1,20480) + sv_emb = torch.zeros(1, 20480) print("load audio or ssl error!!!!!!", audiopath) if self.is_v2Pro: - return (ssl, spec, wav, text,sv_emb) + return (ssl, spec, wav, text, sv_emb) else: return (ssl, spec, wav, text) @@ -192,9 +192,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): class TextAudioSpeakerCollate: """Zero-pads model inputs and targets""" - def __init__(self, return_ids=False,version=None): + def __init__(self, return_ids=False, version=None): self.return_ids = return_ids - self.is_v2Pro=version in {"v2Pro","v2ProPlus"} + self.is_v2Pro = version in {"v2Pro", "v2ProPlus"} def __call__(self, batch): """Collate's training batch from normalized text, audio and speaker identities @@ -228,7 +228,7 @@ class TextAudioSpeakerCollate: text_padded.zero_() if self.is_v2Pro: - 
sv_embs=torch.FloatTensor(len(batch),20480) + sv_embs = torch.FloatTensor(len(batch), 20480) for i in range(len(ids_sorted_decreasing)): row = batch[ids_sorted_decreasing[i]] @@ -250,11 +250,30 @@ class TextAudioSpeakerCollate: text_lengths[i] = text.size(0) if self.is_v2Pro: - sv_embs[i]=row[4] + sv_embs[i] = row[4] if self.is_v2Pro: - return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths,sv_embs + return ( + ssl_padded, + ssl_lengths, + spec_padded, + spec_lengths, + wav_padded, + wav_lengths, + text_padded, + text_lengths, + sv_embs, + ) else: - return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths + return ( + ssl_padded, + ssl_lengths, + spec_padded, + spec_lengths, + wav_padded, + wav_lengths, + text_padded, + text_lengths, + ) class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 4fbec59f..1c8e662f 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -586,12 +586,17 @@ class DiscriminatorS(torch.nn.Module): return x, fmap -v2pro_set={"v2Pro","v2ProPlus"} + +v2pro_set = {"v2Pro", "v2ProPlus"} + + class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, use_spectral_norm=False,version=None): + def __init__(self, use_spectral_norm=False, version=None): super(MultiPeriodDiscriminator, self).__init__() - if version in v2pro_set:periods = [2, 3, 5, 7, 11,17,23] - else:periods = [2, 3, 5, 7, 11] + if version in v2pro_set: + periods = [2, 3, 5, 7, 11, 17, 23] + else: + periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] @@ -787,6 +792,7 @@ class CodePredictor(nn.Module): return pred_codes.transpose(0, 1) + class SynthesizerTrn(nn.Module): """ Synthesizer for Training @@ -886,13 +892,13 @@ class 
SynthesizerTrn(nn.Module): self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) self.freeze_quantizer = freeze_quantizer - self.is_v2pro=self.version in v2pro_set + self.is_v2pro = self.version in v2pro_set if self.is_v2pro: self.sv_emb = nn.Linear(20480, gin_channels) self.ge_to512 = nn.Linear(gin_channels, 512) self.prelu = nn.PReLU(num_parameters=gin_channels) - def forward(self, ssl, y, y_lengths, text, text_lengths,sv_emb=None): + def forward(self, ssl, y, y_lengths, text, text_lengths, sv_emb=None): y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) if self.version == "v1": ge = self.ref_enc(y * y_mask, y_mask) @@ -952,7 +958,7 @@ class SynthesizerTrn(nn.Module): return o, y_mask, (z, z_p, m_p, logs_p) @torch.no_grad() - def decode(self, codes, text, refer,noise_scale=0.5, speed=1, sv_emb=None): + def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None): def get_ge(refer, sv_emb): ge = None if refer is not None: @@ -970,8 +976,8 @@ class SynthesizerTrn(nn.Module): if type(refer) == list: ges = [] - for idx,_refer in enumerate(refer): - ge = get_ge(_refer, sv_emb[idx]if self.is_v2pro else None) + for idx, _refer in enumerate(refer): + ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) ges.append(ge) ge = torch.stack(ges, 0).mean(0) else: @@ -983,7 +989,14 @@ class SynthesizerTrn(nn.Module): quantized = self.quantizer.decode(codes) if self.semantic_frame_rate == "25hz": quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") - x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, self.ge_to512(ge.transpose(2,1)).transpose(2,1)if self.is_v2pro else ge, speed) + x, m_p, logs_p, y_mask = self.enc_p( + quantized, + y_lengths, + text, + text_lengths, + self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge, + speed, + ) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, 
y_mask, g=ge, reverse=True) @@ -996,6 +1009,7 @@ class SynthesizerTrn(nn.Module): quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) return codes.transpose(0, 1) + class CFM(torch.nn.Module): def __init__(self, in_channels, dit): super().__init__() @@ -1029,7 +1043,18 @@ class CFM(torch.nn.Module): t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t # v_pred = model(x, t_tensor, d_tensor, **extra_args) v_pred, text_emb, dt = self.estimator( - x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False, infer=True, text_cache=text_cache, dt_cache=dt_cache + x, + prompt_x, + x_lens, + t_tensor, + d_tensor, + mu, + use_grad_ckpt=False, + drop_audio_cond=False, + drop_text=False, + infer=True, + text_cache=text_cache, + dt_cache=dt_cache, ) v_pred = v_pred.transpose(2, 1) if self.use_conditioner_cache: @@ -1037,18 +1062,18 @@ class CFM(torch.nn.Module): dt_cache = dt if inference_cfg_rate > 1e-5: neg, text_cfg_emb, _ = self.estimator( - x, - prompt_x, - x_lens, - t_tensor, - d_tensor, - mu, - use_grad_ckpt=False, - drop_audio_cond=True, - drop_text=True, - infer=True, - text_cache=text_cfg_cache, - dt_cache=dt_cache + x, + prompt_x, + x_lens, + t_tensor, + d_tensor, + mu, + use_grad_ckpt=False, + drop_audio_cond=True, + drop_text=True, + infer=True, + text_cache=text_cfg_cache, + dt_cache=dt_cache, ) neg = neg.transpose(2, 1) if self.use_conditioner_cache: diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index 9a94898f..6fa84a43 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -1,5 +1,4 @@ import math -import pdb import numpy as np import torch @@ -720,10 +719,10 @@ class MelStyleEncoder(nn.Module): else: len_ = (~mask).sum(dim=1).unsqueeze(1) x = x.masked_fill(mask.unsqueeze(-1), 0) - dtype=x.dtype + dtype = x.dtype x = x.float() - x=torch.div(x,len_.unsqueeze(1)) - out=x.sum(dim=1).to(dtype) + x = torch.div(x, len_.unsqueeze(1)) + 
out = x.sum(dim=1).to(dtype) return out def forward(self, x, mask=None): diff --git a/GPT_SoVITS/prepare_datasets/2-get-sv.py b/GPT_SoVITS/prepare_datasets/2-get-sv.py index 8980833f..80b0ad69 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-sv.py +++ b/GPT_SoVITS/prepare_datasets/2-get-sv.py @@ -10,7 +10,6 @@ i_part = os.environ.get("i_part") all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] -from feature_extractor import cnhubert opt_dir = os.environ.get("opt_dir") sv_path = os.environ.get("sv_path") @@ -19,19 +18,18 @@ import torch is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() import traceback -import numpy as np -from scipy.io import wavfile import torchaudio now_dir = os.getcwd() sys.path.append(now_dir) sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net") -from tools.my_utils import load_audio, clean_path +from tools.my_utils import clean_path from time import time as ttime import shutil from ERes2NetV2 import ERes2NetV2 import kaldi as Kaldi + def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path dir = os.path.dirname(path) name = os.path.basename(path) @@ -56,37 +54,45 @@ if torch.cuda.is_available(): else: device = "cpu" + class SV: - def __init__(self,device,is_half): - pretrained_state = torch.load(sv_path, map_location='cpu') - embedding_model = ERes2NetV2(baseWidth=24,scale=4,expansion=4) + def __init__(self, device, is_half): + pretrained_state = torch.load(sv_path, map_location="cpu") + embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4) embedding_model.load_state_dict(pretrained_state) embedding_model.eval() - self.embedding_model=embedding_model - self.res=torchaudio.transforms.Resample(32000, 16000).to(device) + self.embedding_model = embedding_model + self.res = torchaudio.transforms.Resample(32000, 16000).to(device) if is_half == False: - 
self.embedding_model=self.embedding_model.to(device) + self.embedding_model = self.embedding_model.to(device) else: - self.embedding_model=self.embedding_model.half().to(device) - self.is_half=is_half + self.embedding_model = self.embedding_model.half().to(device) + self.is_half = is_half - def compute_embedding3(self,wav):#(1,x)#-1~1 + def compute_embedding3(self, wav): # (1,x)#-1~1 with torch.no_grad(): - wav=self.res(wav) - if self.is_half==True:wav=wav.half() - feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]) + wav = self.res(wav) + if self.is_half == True: + wav = wav.half() + feat = torch.stack( + [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav] + ) sv_emb = self.embedding_model.forward3(feat) return sv_emb -sv=SV(device,is_half) + +sv = SV(device, is_half) + + def name2go(wav_name, wav_path): sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name) - if os.path.exists(sv_cn_path):return - wav_path="%s/%s" % (wav32dir, wav_name) - wav32k,sr0 = torchaudio.load(wav_path) - assert sr0==32000 + if os.path.exists(sv_cn_path): + return + wav_path = "%s/%s" % (wav32dir, wav_name) + wav32k, sr0 = torchaudio.load(wav_path) + assert sr0 == 32000 wav32k = wav32k.to(device) - emb=sv.compute_embedding3(wav32k).cpu() # torch.Size([1, 20480]) + emb = sv.compute_embedding3(wav32k).cpu() # torch.Size([1, 20480]) my_save(emb, sv_cn_path) diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index ca30359c..20db9b19 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -17,15 +17,16 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path shutil.move(tmp_path, "%s/%s" % (dir, name)) - from io import BytesIO -model_version2byte={ - "v3":b"03", - "v4":b"04", - "v2Pro":b"05", - "v2ProPlus":b"06", +model_version2byte = { + "v3": b"03", + "v4": b"04", + "v2Pro": b"05", + "v2ProPlus": b"06", } + + def 
my_save2(fea, path, model_version): bio = BytesIO() torch.save(fea, bio) @@ -50,7 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None): if lora_rank: opt["lora_rank"] = lora_rank my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version) - elif (model_version!=None and "Pro"in model_version): + elif model_version != None and "Pro" in model_version: my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version) else: my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) @@ -58,6 +59,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None): except: return traceback.format_exc() + """ 00:v1 01:v2 diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index 0a046049..4b9f6488 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -36,7 +36,7 @@ from module.models import ( MultiPeriodDiscriminator, SynthesizerTrn, ) -from process_ckpt import savee,my_save2 +from process_ckpt import savee torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = False @@ -87,11 +87,30 @@ def run(rank, n_gpus, hps): if torch.cuda.is_available(): torch.cuda.set_device(rank) - train_dataset = TextAudioSpeakerLoader(hps.data,version=hps.model.version) + train_dataset = TextAudioSpeakerLoader(hps.data, version=hps.model.version) train_sampler = DistributedBucketSampler( train_dataset, hps.train.batch_size, - [32,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,], + [ + 32, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1100, + 1200, + 1300, + 1400, + 1500, + 1600, + 1700, + 1800, + 1900, + ], num_replicas=n_gpus, rank=rank, shuffle=True, @@ -130,9 +149,9 @@ def run(rank, n_gpus, hps): ) net_d = ( - MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).cuda(rank) + MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).cuda(rank) if torch.cuda.is_available() - else 
MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).to(device) + else MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).to(device) ) for name, param in net_g.named_parameters(): if not param.requires_grad: @@ -235,7 +254,7 @@ def run(rank, n_gpus, hps): print( "loaded pretrained %s" % hps.train.pretrained_s2D, net_d.module.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],strict=False + torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"], strict=False ) if torch.cuda.is_available() else net_d.load_state_dict( @@ -310,17 +329,44 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade net_g.train() net_d.train() for batch_idx, data in enumerate(tqdm(train_loader)): - if hps.model.version in {"v2Pro","v2ProPlus"}: - ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths,sv_emb=data + if hps.model.version in {"v2Pro", "v2ProPlus"}: + ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths, sv_emb = data else: - ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths=data + ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths = data if torch.cuda.is_available(): - spec, spec_lengths = (spec.cuda(rank,non_blocking=True,),spec_lengths.cuda(rank,non_blocking=True,),) - y, y_lengths = (y.cuda(rank,non_blocking=True,),y_lengths.cuda(rank,non_blocking=True,),) + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), + ) + y, y_lengths = ( + y.cuda( + rank, + non_blocking=True, + ), + y_lengths.cuda( + rank, + non_blocking=True, + ), + ) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = (text.cuda(rank,non_blocking=True,),text_lengths.cuda(rank,non_blocking=True,),) + text, 
text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), + ) if hps.model.version in {"v2Pro", "v2ProPlus"}: sv_emb = sv_emb.cuda(rank, non_blocking=True) else: @@ -334,9 +380,19 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade sv_emb = sv_emb.to(device) with autocast(enabled=hps.train.fp16_run): if hps.model.version in {"v2Pro", "v2ProPlus"}: - (y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl) = net_g(ssl, spec, spec_lengths, text, text_lengths,sv_emb) + (y_hat, kl_ssl, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), stats_ssl) = net_g( + ssl, spec, spec_lengths, text, text_lengths, sv_emb + ) else: - (y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl,) = net_g(ssl, spec, spec_lengths, text, text_lengths) + ( + y_hat, + kl_ssl, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + stats_ssl, + ) = net_g(ssl, spec, spec_lengths, text, text_lengths) mel = spec_to_mel_torch( spec, @@ -508,7 +564,14 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade % ( hps.name, epoch, - savee(ckpt,hps.name + "_e%s_s%s" % (epoch, global_step),epoch,global_step,hps,model_version=None if hps.model.version not in {"v2Pro","v2ProPlus"}else hps.model.version), + savee( + ckpt, + hps.name + "_e%s_s%s" % (epoch, global_step), + epoch, + global_step, + hps, + model_version=None if hps.model.version not in {"v2Pro", "v2ProPlus"} else hps.model.version, + ), ) ) diff --git a/GPT_SoVITS/sv.py b/GPT_SoVITS/sv.py index fb5806ff..22e70369 100644 --- a/GPT_SoVITS/sv.py +++ b/GPT_SoVITS/sv.py @@ -1,24 +1,32 @@ -import sys,os,torch +import sys +import os +import torch + sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net") sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" from ERes2NetV2 import ERes2NetV2 import kaldi as Kaldi + + class SV: - def 
__init__(self,device,is_half): - pretrained_state = torch.load(sv_path, map_location='cpu', weights_only=False) - embedding_model = ERes2NetV2(baseWidth=24,scale=4,expansion=4) + def __init__(self, device, is_half): + pretrained_state = torch.load(sv_path, map_location="cpu", weights_only=False) + embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4) embedding_model.load_state_dict(pretrained_state) embedding_model.eval() - self.embedding_model=embedding_model + self.embedding_model = embedding_model if is_half == False: - self.embedding_model=self.embedding_model.to(device) + self.embedding_model = self.embedding_model.to(device) else: - self.embedding_model=self.embedding_model.half().to(device) - self.is_half=is_half + self.embedding_model = self.embedding_model.half().to(device) + self.is_half = is_half - def compute_embedding3(self,wav): + def compute_embedding3(self, wav): with torch.no_grad(): - if self.is_half==True:wav=wav.half() - feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]) + if self.is_half == True: + wav = wav.half() + feat = torch.stack( + [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav] + ) sv_emb = self.embedding_model.forward3(feat) return sv_emb diff --git a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py index 88b93794..0187ea69 100644 --- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py +++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py @@ -3,38 +3,44 @@ import re # jieba静音 import jieba + jieba.setLogLevel(logging.CRITICAL) # 更改fast_langdetect大模型位置 from pathlib import Path import fast_langdetect -fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect")) + +fast_langdetect.infer._default_detector = 
fast_langdetect.infer.LangDetector( + fast_langdetect.infer.LangDetectConfig( + cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect" + ) +) from split_lang import LangSplitter def full_en(text): - pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$' + pattern = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$" return bool(re.match(pattern, text)) def full_cjk(text): # 来自wiki cjk_ranges = [ - (0x4E00, 0x9FFF), # CJK Unified Ideographs - (0x3400, 0x4DB5), # CJK Extension A - (0x20000, 0x2A6DD), # CJK Extension B - (0x2A700, 0x2B73F), # CJK Extension C - (0x2B740, 0x2B81F), # CJK Extension D - (0x2B820, 0x2CEAF), # CJK Extension E - (0x2CEB0, 0x2EBEF), # CJK Extension F - (0x30000, 0x3134A), # CJK Extension G - (0x31350, 0x323AF), # CJK Extension H - (0x2EBF0, 0x2EE5D), # CJK Extension H + (0x4E00, 0x9FFF), # CJK Unified Ideographs + (0x3400, 0x4DB5), # CJK Extension A + (0x20000, 0x2A6DD), # CJK Extension B + (0x2A700, 0x2B73F), # CJK Extension C + (0x2B740, 0x2B81F), # CJK Extension D + (0x2B820, 0x2CEAF), # CJK Extension E + (0x2CEB0, 0x2EBEF), # CJK Extension F + (0x30000, 0x3134A), # CJK Extension G + (0x31350, 0x323AF), # CJK Extension H + (0x2EBF0, 0x2EE5D), # CJK Extension H ] - pattern = r'[0-9、-〜。!?.!?… /]+$' + pattern = r"[0-9、-〜。!?.!?… /]+$" cjk_text = "" for char in text: @@ -45,7 +51,7 @@ def full_cjk(text): return cjk_text -def split_jako(tag_lang,item): +def split_jako(tag_lang, item): if tag_lang == "ja": pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)" else: @@ -53,41 +59,40 @@ def split_jako(tag_lang,item): lang_list: list[dict] = [] tag = 0 - for match in re.finditer(pattern, item['text']): + for match in re.finditer(pattern, item["text"]): if match.start() > tag: - lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]}) + 
lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]}) tag = match.end() - lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]}) + lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]}) - if tag < len(item['text']): - lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]}) + if tag < len(item["text"]): + lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]}) return lang_list def merge_lang(lang_list, item): - if lang_list and item['lang'] == lang_list[-1]['lang']: - lang_list[-1]['text'] += item['text'] + if lang_list and item["lang"] == lang_list[-1]["lang"]: + lang_list[-1]["text"] += item["text"] else: lang_list.append(item) return lang_list -class LangSegmenter(): +class LangSegmenter: # 默认过滤器, 基于gsv目前四种语言 DEFAULT_LANG_MAP = { "zh": "zh", "yue": "zh", # 粤语 "wuu": "zh", # 吴语 "zh-cn": "zh", - "zh-tw": "x", # 繁体设置为x + "zh-tw": "x", # 繁体设置为x "ko": "ko", "ja": "ja", "en": "en", } - def getTexts(text): lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP) substr = lang_splitter.split_by_lang(text=text) @@ -95,18 +100,18 @@ class LangSegmenter(): lang_list: list[dict] = [] for _, item in enumerate(substr): - dict_item = {'lang':item.lang,'text':item.text} + dict_item = {"lang": item.lang, "text": item.text} # 处理短英文被识别为其他语言的问题 - if full_en(dict_item['text']): - dict_item['lang'] = 'en' - lang_list = merge_lang(lang_list,dict_item) + if full_en(dict_item["text"]): + dict_item["lang"] = "en" + lang_list = merge_lang(lang_list, dict_item) continue # 处理非日语夹日文的问题(不包含CJK) ja_list: list[dict] = [] - if dict_item['lang'] != 'ja': - ja_list = split_jako('ja',dict_item) + if dict_item["lang"] != "ja": + ja_list = split_jako("ja", dict_item) if not ja_list: ja_list.append(dict_item) @@ -115,8 +120,8 @@ class LangSegmenter(): ko_list: list[dict] = [] temp_list: list[dict] = [] for _, ko_item in enumerate(ja_list): - 
if ko_item["lang"] != 'ko': - ko_list = split_jako('ko',ko_item) + if ko_item["lang"] != "ko": + ko_list = split_jako("ko", ko_item) if ko_list: temp_list.extend(ko_list) @@ -126,50 +131,50 @@ class LangSegmenter(): # 未存在非日韩文夹日韩文 if len(temp_list) == 1: # 未知语言检查是否为CJK - if dict_item['lang'] == 'x': - cjk_text = full_cjk(dict_item['text']) + if dict_item["lang"] == "x": + cjk_text = full_cjk(dict_item["text"]) if cjk_text: - dict_item = {'lang':'zh','text':cjk_text} - lang_list = merge_lang(lang_list,dict_item) + dict_item = {"lang": "zh", "text": cjk_text} + lang_list = merge_lang(lang_list, dict_item) else: - lang_list = merge_lang(lang_list,dict_item) + lang_list = merge_lang(lang_list, dict_item) continue else: - lang_list = merge_lang(lang_list,dict_item) + lang_list = merge_lang(lang_list, dict_item) continue # 存在非日韩文夹日韩文 for _, temp_item in enumerate(temp_list): # 未知语言检查是否为CJK - if temp_item['lang'] == 'x': - cjk_text = full_cjk(dict_item['text']) + if temp_item["lang"] == "x": + cjk_text = full_cjk(dict_item["text"]) if cjk_text: - dict_item = {'lang':'zh','text':cjk_text} - lang_list = merge_lang(lang_list,dict_item) + dict_item = {"lang": "zh", "text": cjk_text} + lang_list = merge_lang(lang_list, dict_item) else: - lang_list = merge_lang(lang_list,dict_item) + lang_list = merge_lang(lang_list, dict_item) else: - lang_list = merge_lang(lang_list,temp_item) + lang_list = merge_lang(lang_list, temp_item) temp_list = lang_list lang_list = [] for _, temp_item in enumerate(temp_list): - if temp_item['lang'] == 'x': + if temp_item["lang"] == "x": if lang_list: - temp_item['lang'] = lang_list[-1]['lang'] + temp_item["lang"] = lang_list[-1]["lang"] elif len(temp_list) > 1: - temp_item['lang'] = temp_list[1]['lang'] + temp_item["lang"] = temp_list[1]["lang"] else: - temp_item['lang'] = 'zh' + temp_item["lang"] = "zh" - lang_list = merge_lang(lang_list,temp_item) + lang_list = merge_lang(lang_list, temp_item) return lang_list - + if __name__ == "__main__": text = 
"MyGO?,你也喜欢まいご吗?" print(LangSegmenter.getTexts(text)) text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。" - print(LangSegmenter.getTexts(text)) \ No newline at end of file + print(LangSegmenter.getTexts(text)) diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index 9d153745..52eed443 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -3,7 +3,6 @@ import json import os -import traceback import warnings import zipfile from typing import Any, Dict, List, Tuple diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index e9a279c6..4ed73781 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -655,11 +655,7 @@ class ToneSandhi: while i < len(seg): word, pos = seg[i] merged = False - if ( - i - 1 >= 0 - and word == "一" - and i + 1 < len(seg) - ): + if i - 1 >= 0 and word == "一" and i + 1 < len(seg): last = new_seg[-1] if new_seg else seg[i - 1] if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v": combined = last[0] + "一" + seg[i + 1][0] diff --git a/api.py b/api.py index b7e94e77..dc2e8826 100644 --- a/api.py +++ b/api.py @@ -199,6 +199,8 @@ def is_full(*items): # 任意一项为空返回False bigvgan_model = hifigan_model = sv_cn_model = None + + def clean_hifigan_model(): global hifigan_model if hifigan_model: @@ -208,6 +210,8 @@ def clean_hifigan_model(): torch.cuda.empty_cache() except: pass + + def clean_bigvgan_model(): global bigvgan_model if bigvgan_model: @@ -217,6 +221,8 @@ def clean_bigvgan_model(): torch.cuda.empty_cache() except: pass + + def clean_sv_cn_model(): global sv_cn_model if sv_cn_model: @@ -229,7 +235,7 @@ def clean_sv_cn_model(): def init_bigvgan(): - global bigvgan_model, hifigan_model,sv_cn_model + global bigvgan_model, hifigan_model, sv_cn_model from BigVGAN import bigvgan bigvgan_model = bigvgan.BigVGAN.from_pretrained( @@ -247,7 +253,7 @@ def init_bigvgan(): def init_hifigan(): - global hifigan_model, 
bigvgan_model,sv_cn_model + global hifigan_model, bigvgan_model, sv_cn_model hifigan_model = Generator( initial_channel=100, resblock="1", @@ -262,7 +268,9 @@ def init_hifigan(): hifigan_model.eval() hifigan_model.remove_weight_norm() state_dict_g = torch.load( - "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False + "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), + map_location="cpu", + weights_only=False, ) print("loading vocoder", hifigan_model.load_state_dict(state_dict_g)) if is_half == True: @@ -272,19 +280,21 @@ def init_hifigan(): from sv import SV + + def init_sv_cn(): global hifigan_model, bigvgan_model, sv_cn_model sv_cn_model = SV(device, is_half) -resample_transform_dict={} -def resample(audio_tensor, sr0,sr1,device): +resample_transform_dict = {} + + +def resample(audio_tensor, sr0, sr1, device): global resample_transform_dict - key="%s-%s-%s"%(sr0,sr1,str(device)) + key = "%s-%s-%s" % (sr0, sr1, str(device)) if key not in resample_transform_dict: - resample_transform_dict[key] = torchaudio.transforms.Resample( - sr0, sr1 - ).to(device) + resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) return resample_transform_dict[key](audio_tensor) @@ -370,6 +380,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new def get_sovits_weights(sovits_path): from config import pretrained_sovits_name + path_sovits_v3 = pretrained_sovits_name["v3"] path_sovits_v4 = pretrained_sovits_name["v4"] is_exist_s2gv3 = os.path.exists(path_sovits_v3) @@ -628,15 +639,17 @@ class DictToAttrRecursive(dict): def get_spepc(hps, filename, dtype, device, is_v2pro=False): - sr1=int(hps.data.sampling_rate) - audio, sr0=torchaudio.load(filename) - if sr0!=sr1: - audio=audio.to(device) - if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0) - audio=resample(audio,sr0,sr1,device) + sr1 = int(hps.data.sampling_rate) + audio, sr0 = 
torchaudio.load(filename) + if sr0 != sr1: + audio = audio.to(device) + if audio.shape[0] == 2: + audio = audio.mean(0).unsqueeze(0) + audio = resample(audio, sr0, sr1, device) else: - audio=audio.to(device) - if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0) + audio = audio.to(device) + if audio.shape[0] == 2: + audio = audio.mean(0).unsqueeze(0) maxx = audio.abs().max() if maxx > 1: @@ -649,10 +662,10 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False): hps.data.win_length, center=False, ) - spec=spec.to(dtype) - if is_v2pro==True: - audio=resample(audio,sr1,16000,device).to(dtype) - return spec,audio + spec = spec.to(dtype) + if is_v2pro == True: + audio = resample(audio, sr1, 16000, device).to(dtype) + return spec, audio def pack_audio(audio_bytes, data, rate): @@ -872,29 +885,29 @@ def get_tts_wav( prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0).to(device) - is_v2pro = version in {"v2Pro","v2ProPlus"} + is_v2pro = version in {"v2Pro", "v2ProPlus"} if version not in {"v3", "v4"}: refers = [] if is_v2pro: - sv_emb= [] + sv_emb = [] if sv_cn_model == None: init_sv_cn() if inp_refs: for path in inp_refs: - try:#####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer - refer,audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro) + try: #####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer + refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro) refers.append(refer) if is_v2pro: sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor)) except Exception as e: logger.error(e) if len(refers) == 0: - refers,audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro) - refers=[refers] + refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro) + refers = [refers] if is_v2pro: - sv_emb=[sv_cn_model.compute_embedding3(audio_tensor)] + sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)] else: - refer,audio_tensor = get_spepc(hps, ref_wav_path, dtype, device) + refer, audio_tensor = get_spepc(hps, 
ref_wav_path, dtype, device) t1 = ttime() # os.environ['version'] = version @@ -937,14 +950,22 @@ def get_tts_wav( if version not in {"v3", "v4"}: if is_v2pro: audio = ( - vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed,sv_emb=sv_emb) + vq_model.decode( + pred_semantic, + torch.LongTensor(phones2).to(device).unsqueeze(0), + refers, + speed=speed, + sv_emb=sv_emb, + ) .detach() .cpu() .numpy()[0, 0] ) else: audio = ( - vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed) + vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed + ) .detach() .cpu() .numpy()[0, 0] @@ -1108,7 +1129,6 @@ def handle( if not default_refer.is_ready(): return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400) - if cut_punc == None: text = cut_text(text, default_cut_punc) else: diff --git a/config.py b/config.py index 8f4be146..fdc11c0a 100644 --- a/config.py +++ b/config.py @@ -144,7 +144,8 @@ webui_port_subfix = 9871 api_port = 9880 -#Thanks to the contribution of @Karasukaigan and @XXXXRT666 + +# Thanks to the contribution of @Karasukaigan and @XXXXRT666 def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]: cpu = torch.device("cpu") cuda = torch.device(f"cuda:{idx}") @@ -157,10 +158,13 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo mem_gb = mem_bytes / (1024**3) + 0.4 major, minor = capability sm_version = major + minor / 10.0 - is_16_series = bool(re.search(r"16\d{2}", name))and sm_version == 7.5 - if mem_gb < 4 or sm_version < 5.3:return cpu, torch.float32, 0.0, 0.0 - if sm_version == 6.1 or is_16_series==True:return cuda, torch.float32, sm_version, mem_gb - if sm_version > 6.1:return cuda, torch.float16, sm_version, mem_gb + is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5 + if mem_gb < 4 or sm_version < 5.3: + return cpu, 
torch.float32, 0.0, 0.0 + if sm_version == 6.1 or is_16_series == True: + return cuda, torch.float32, sm_version, mem_gb + if sm_version > 6.1: + return cuda, torch.float16, sm_version, mem_gb return cpu, torch.float32, 0.0, 0.0 diff --git a/tools/my_utils.py b/tools/my_utils.py index d26a372e..04f1a98a 100644 --- a/tools/my_utils.py +++ b/tools/my_utils.py @@ -109,7 +109,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False): if os.path.exists(wav_path): ... else: - gr.Warning(wav_path+i18n("路径错误")) + gr.Warning(wav_path + i18n("路径错误")) return if is_train: path_list.append(os.path.join(path_list[0], "2-name2text.txt")) diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py index 6548ce24..98c75c1f 100644 --- a/tools/uvr5/mdxnet.py +++ b/tools/uvr5/mdxnet.py @@ -190,14 +190,14 @@ class Predictor: opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): - os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_vocal, opt_path_vocal)) + os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_vocal, opt_path_vocal)) if os.path.exists(opt_path_vocal): try: os.remove(path_vocal) except: pass if os.path.exists(path_other): - os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_other, opt_path_other)) + os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_other, opt_path_other)) if os.path.exists(opt_path_other): try: os.remove(path_other) diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py index 8f24ca6a..45429cca 100644 --- a/tools/uvr5/vr.py +++ b/tools/uvr5/vr.py @@ -140,7 +140,7 @@ class AudioPre: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path) + cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path) print(cmd) os.system(cmd) if os.path.exists(opt_format_path): @@ -177,7 +177,7 @@ class AudioPre: ) if os.path.exists(path): opt_format_path = path[:-4] + 
".%s" % format - cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path) + cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path) print(cmd) os.system(cmd) if os.path.exists(opt_format_path): @@ -307,7 +307,7 @@ class AudioPreDeEcho: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path) + cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path) print(cmd) os.system(cmd) if os.path.exists(opt_format_path): @@ -340,7 +340,7 @@ class AudioPreDeEcho: ) if os.path.exists(path): opt_format_path = path[:-4] + ".%s" % format - cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path) + cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path) print(cmd) os.system(cmd) if os.path.exists(opt_format_path): diff --git a/webui.py b/webui.py index b8b3bb22..a171739b 100644 --- a/webui.py +++ b/webui.py @@ -498,7 +498,7 @@ def open1Ba( ): global p_train_SoVITS if p_train_SoVITS == None: - exp_name=exp_name.rstrip(" ") + exp_name = exp_name.rstrip(" ") config_file = ( "GPT_SoVITS/configs/s2.json" if version not in {"v2Pro", "v2ProPlus"} @@ -595,7 +595,7 @@ def open1Bb( ): global p_train_GPT if p_train_GPT == None: - exp_name=exp_name.rstrip(" ") + exp_name = exp_name.rstrip(" ") with open( "GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml" ) as f: