diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 40a1b2e..1b7ad11 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1,38 +1,40 @@ -from copy import deepcopy +import gc import math import os -import sys -import gc import random -import traceback +import sys import time +import traceback +from copy import deepcopy + import torchaudio from tqdm import tqdm now_dir = os.getcwd() sys.path.append(now_dir) -import ffmpeg import os from typing import List, Tuple, Union + +import ffmpeg +import librosa import numpy as np import torch import torch.nn.functional as F import yaml -from transformers import AutoModelForMaskedLM, AutoTokenizer -from tools.audio_sr import AP_BWE from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from BigVGAN.bigvgan import BigVGAN from feature_extractor.cnhubert import CNHubert +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch from module.models import SynthesizerTrn, SynthesizerTrnV3 from peft import LoraConfig, get_peft_model -import librosa +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new +from transformers import AutoModelForMaskedLM, AutoTokenizer + +from tools.audio_sr import AP_BWE from tools.i18n.i18n import I18nAuto, scan_language_list from tools.my_utils import load_audio -from module.mel_processing import spectrogram_torch from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.TextPreprocessor import TextPreprocessor -from BigVGAN.bigvgan import BigVGAN -from module.mel_processing import mel_spectrogram_torch -from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language @@ -461,8 +463,6 @@ class TTS: n_speakers=self.configs.n_speakers, **kwargs, ) - if hasattr(vits_model, "enc_q"): - del vits_model.enc_q self.configs.is_v3_synthesizer = False else: vits_model = SynthesizerTrnV3( @@ -473,6 +473,8 @@ class TTS: ) self.configs.is_v3_synthesizer = True self.init_bigvgan() + if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"): + del vits_model.enc_q if if_lora_v3 == False: print( diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index c112f1a..3f9750a 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -9,9 +9,10 @@ import logging import traceback -import torchaudio import warnings +import torchaudio + logging.getLogger("markdown_it").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.ERROR) logging.getLogger("httpcore").setLevel(logging.ERROR) @@ -22,10 +23,11 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR) warnings.simplefilter(action="ignore", category=FutureWarning) +import json import os import re import sys -import json + import torch from text.LangSegmenter import LangSegmenter @@ -91,16 +93,17 @@ is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() # is_half=False punctuation = set(["!", "?", "…", ",", ".", "-", " "]) import gradio as gr -from transformers import AutoModelForMaskedLM, AutoTokenizer -import numpy as np import librosa +import numpy as np from feature_extractor import cnhubert +from transformers import AutoModelForMaskedLM, AutoTokenizer cnhubert.cnhubert_base_path = cnhubert_base_path -from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3 import random +from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3 + def set_seed(seed): if seed == -1: @@ -115,12 +118,14 @@ def set_seed(seed): # set_seed(42) +from time import time as ttime + from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from peft import LoraConfig, get_peft_model from text import cleaned_text_to_sequence from text.cleaner import clean_text -from time import time as ttime + from tools.i18n.i18n import I18nAuto, scan_language_list -from peft import LoraConfig, get_peft_model language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language @@ -265,10 +270,11 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) prompt_language_update, text_update, text_language_update, - {"__type__": "update", "visible": visible_sample_steps}, + {"__type__": "update", "visible": visible_sample_steps, "value": 32}, {"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, ) dict_s2 = load_sovits_new(sovits_path) @@ -329,6 +335,19 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) # torch.save(vq_model.state_dict(),"merge_win.pth") vq_model.eval() + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "visible": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) with open("./weight.json") as f: data = f.read() data = json.loads(data) @@ -530,7 +549,7 @@ def get_phones_and_bert(text, language, version, final=False): return phones, bert.to(dtype), norm_text -from module.mel_processing import spectrogram_torch, mel_spectrogram_torch +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch spec_min = -12 spec_max = 2 @@ -1020,7 +1039,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。") + i18n("v3暂不支持该模式,使用了会报错。"), value=False, - interactive=True, + interactive=True if model_version != "v3" else False, show_label=True, scale=1, ) @@ -1137,7 +1156,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: # phoneme=gr.Textbox(label=i18n("音素框"), value="") # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") with gr.Row(): - inference_button = gr.Button(i18n("合成语音"), variant="primary", size="lg", scale=25) + inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) output = gr.Audio(label=i18n("输出的语音"), scale=14) inference_button.click( @@ -1176,6 +1195,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: inp_refs, ref_text_free, if_sr_Checkbox, + inference_button, ], ) GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 360cb9c..934cbe2 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -7,11 +7,11 @@ 全部按日文识别 """ -import random -import os -import re -import logging import json +import logging +import os +import random +import re import sys now_dir = os.getcwd() @@ -47,11 +47,13 @@ gpt_path = os.environ.get("gpt_path", None) sovits_path = os.environ.get("sovits_path", None) cnhubert_base_path = os.environ.get("cnhubert_base_path", None) bert_path = os.environ.get("bert_path", None) -version = os.environ.get("version", "v2") +version = model_version = os.environ.get("version", "v2") import gradio as gr -from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR +from inference_webui import DictToAttrRecursive from TTS_infer_pack.text_segmentation_method import get_method +from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config + from tools.i18n.i18n import I18nAuto, scan_language_list language = os.environ.get("language", "Auto") @@ -254,21 +256,18 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) -from process_ckpt import get_sovits_version_from_path_fast +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): - global version, dict_language + global version, model_version, dict_language, if_lora_v3 version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) - - if if_lora_v3 and not os.path.exists(path_sovits_v3): - info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + # print(sovits_path,version, model_version, if_lora_v3) + if if_lora_v3 == True and is_exist_s2gv3 == False: # + info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") gr.Warning(info) raise FileExistsError(info) - - tts_pipeline.init_vits_weights(sovits_path) - - dict_language = dict_language_v1 if tts_pipeline.configs.version == "v1" else dict_language_v2 + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 if prompt_language is not None and text_language is not None: if prompt_language in list(dict_language.keys()): prompt_text_update, prompt_language_update = ( @@ -289,6 +288,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) else: visible_sample_steps = False visible_inp_refs = True + # prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free, yield ( {"__type__": "update", "choices": list(dict_language.keys())}, {"__type__": "update", "choices": list(dict_language.keys())}, @@ -296,12 +296,25 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) prompt_language_update, text_update, text_language_update, - {"__type__": "update", "visible": visible_sample_steps}, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, {"__type__": "update", "visible": visible_inp_refs}, - {"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, - {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, ) + tts_pipeline.init_vits_weights(sovits_path) + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) with open("./weight.json") as f: data = f.read() data = json.loads(data) @@ -341,7 +354,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown(value=i18n("*请上传并填写参考信息")) with gr.Row(): inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频,超过会报错!)"), type="filepath") - inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"), file_count="multiple") + inp_refs = gr.File( + label=i18n("辅参考音频(可选多个,或不选)"), + file_count="multiple", + visible=True if model_version != "v3" else False, + ) prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2) with gr.Row(): prompt_language = gr.Dropdown( @@ -351,7 +368,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: ref_text_free = gr.Checkbox( label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, - interactive=True, + interactive=True if model_version != "v3" else False, show_label=True, ) gr.Markdown( @@ -465,8 +482,19 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: SoVITS_dropdown.change( change_sovits_weights, [SoVITS_dropdown, prompt_language, text_language], - [prompt_language, text_language, prompt_text, prompt_language, text, text_language], - ) + [ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + inference_button, + ], + ) # GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) with gr.Group(): diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index 0195511..c175ec6 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -63,7 +63,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"): extract_dir = os.path.join(parent_directory, "G2PWModel_1.1") extract_dir_new = os.path.join(parent_directory, "G2PWModel") print("Downloading g2pw model...") - modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" + modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" with requests.get(modelscope_url, stream=True) as r: r.raise_for_status() with open(zip_dir, "wb") as f: diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 4666f6e..58a67d3 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT 修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122 修复v3sovits未传参以支持调节语速 + +### 202503 + +修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8 + +修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa + +修复其他若干bug + +重点更新: + +1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa + +2-整合包修复onnxruntime GPU推理的支持,影响:(1)g2pw有个onnx模型原先是CPU推理现在用GPU,显著降低推理的CPU瓶颈 (2)foxjoy去混响模型现在可使用GPU推理 diff --git a/webui.py b/webui.py index 6ce107e..bdc9441 100644 --- a/webui.py +++ b/webui.py @@ -435,9 +435,9 @@ def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, so cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) else: cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) - #####v3暂不支持加速推理 - if version == "v3": - cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) + # #####v3暂不支持加速推理 + # if version=="v3": + # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) if p_tts_inference is None: os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path) @@ -1312,9 +1312,9 @@ def switch_version(version_): "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False, }, - {"__type__": "update", "interactive": False if version == "v3" else True, "value": False}, + {"__type__": "update", "interactive": True, "value": False}, {"__type__": "update", "visible": True if version == "v3" else False}, - ) + ) # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer if os.path.exists("GPT_SoVITS/text/G2PWModel"):