Merge remote-tracking branch 'upstream/main' into Ruff-Format

This commit is contained in:
XXXXRT666 2025-04-01 11:19:53 +01:00
commit 27ee75e47b
6 changed files with 117 additions and 53 deletions

View File

@ -1,38 +1,40 @@
from copy import deepcopy import gc
import math import math
import os import os
import sys
import gc
import random import random
import traceback import sys
import time import time
import traceback
from copy import deepcopy
import torchaudio import torchaudio
from tqdm import tqdm from tqdm import tqdm
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
import ffmpeg
import os import os
from typing import List, Tuple, Union from typing import List, Tuple, Union
import ffmpeg
import librosa
import numpy as np import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import yaml import yaml
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.audio_sr import AP_BWE
from AR.models.t2s_lightning_module import Text2SemanticLightningModule from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from BigVGAN.bigvgan import BigVGAN
from feature_extractor.cnhubert import CNHubert from feature_extractor.cnhubert import CNHubert
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
from module.models import SynthesizerTrn, SynthesizerTrnV3 from module.models import SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model from peft import LoraConfig, get_peft_model
import librosa from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.audio_sr import AP_BWE
from tools.i18n.i18n import I18nAuto, scan_language_list from tools.i18n.i18n import I18nAuto, scan_language_list
from tools.my_utils import load_audio from tools.my_utils import load_audio
from module.mel_processing import spectrogram_torch
from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor from TTS_infer_pack.TextPreprocessor import TextPreprocessor
from BigVGAN.bigvgan import BigVGAN
from module.mel_processing import mel_spectrogram_torch
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
language = os.environ.get("language", "Auto") language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -461,8 +463,6 @@ class TTS:
n_speakers=self.configs.n_speakers, n_speakers=self.configs.n_speakers,
**kwargs, **kwargs,
) )
if hasattr(vits_model, "enc_q"):
del vits_model.enc_q
self.configs.is_v3_synthesizer = False self.configs.is_v3_synthesizer = False
else: else:
vits_model = SynthesizerTrnV3( vits_model = SynthesizerTrnV3(
@ -473,6 +473,8 @@ class TTS:
) )
self.configs.is_v3_synthesizer = True self.configs.is_v3_synthesizer = True
self.init_bigvgan() self.init_bigvgan()
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
del vits_model.enc_q
if if_lora_v3 == False: if if_lora_v3 == False:
print( print(

View File

@ -9,9 +9,10 @@
import logging import logging
import traceback import traceback
import torchaudio
import warnings import warnings
import torchaudio
logging.getLogger("markdown_it").setLevel(logging.ERROR) logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR) logging.getLogger("httpcore").setLevel(logging.ERROR)
@ -22,10 +23,11 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
logging.getLogger("multipart.multipart").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
warnings.simplefilter(action="ignore", category=FutureWarning) warnings.simplefilter(action="ignore", category=FutureWarning)
import json
import os import os
import re import re
import sys import sys
import json
import torch import torch
from text.LangSegmenter import LangSegmenter from text.LangSegmenter import LangSegmenter
@ -91,16 +93,17 @@ is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
# is_half=False # is_half=False
punctuation = set(["!", "?", "", ",", ".", "-", " "]) punctuation = set(["!", "?", "", ",", ".", "-", " "])
import gradio as gr import gradio as gr
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
import librosa import librosa
import numpy as np
from feature_extractor import cnhubert from feature_extractor import cnhubert
from transformers import AutoModelForMaskedLM, AutoTokenizer
cnhubert.cnhubert_base_path = cnhubert_base_path cnhubert.cnhubert_base_path = cnhubert_base_path
from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3
import random import random
from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3
def set_seed(seed): def set_seed(seed):
if seed == -1: if seed == -1:
@ -115,12 +118,14 @@ def set_seed(seed):
# set_seed(42) # set_seed(42)
from time import time as ttime
from AR.models.t2s_lightning_module import Text2SemanticLightningModule from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from peft import LoraConfig, get_peft_model
from text import cleaned_text_to_sequence from text import cleaned_text_to_sequence
from text.cleaner import clean_text from text.cleaner import clean_text
from time import time as ttime
from tools.i18n.i18n import I18nAuto, scan_language_list from tools.i18n.i18n import I18nAuto, scan_language_list
from peft import LoraConfig, get_peft_model
language = os.environ.get("language", "Auto") language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -265,10 +270,11 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
prompt_language_update, prompt_language_update,
text_update, text_update,
text_language_update, text_language_update,
{"__type__": "update", "visible": visible_sample_steps}, {"__type__": "update", "visible": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, {"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False},
{"__type__": "update", "visible": True if model_version == "v3" else False}, {"__type__": "update", "visible": True if model_version == "v3" else False},
{"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False},
) )
dict_s2 = load_sovits_new(sovits_path) dict_s2 = load_sovits_new(sovits_path)
@ -329,6 +335,19 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
# torch.save(vq_model.state_dict(),"merge_win.pth") # torch.save(vq_model.state_dict(),"merge_win.pth")
vq_model.eval() vq_model.eval()
yield (
{"__type__": "update", "choices": list(dict_language.keys())},
{"__type__": "update", "choices": list(dict_language.keys())},
prompt_text_update,
prompt_language_update,
text_update,
text_language_update,
{"__type__": "update", "visible": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False},
{"__type__": "update", "visible": True if model_version == "v3" else False},
{"__type__": "update", "value": i18n("合成语音"), "interactive": True},
)
with open("./weight.json") as f: with open("./weight.json") as f:
data = f.read() data = f.read()
data = json.loads(data) data = json.loads(data)
@ -530,7 +549,7 @@ def get_phones_and_bert(text, language, version, final=False):
return phones, bert.to(dtype), norm_text return phones, bert.to(dtype), norm_text
from module.mel_processing import spectrogram_torch, mel_spectrogram_torch from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
spec_min = -12 spec_min = -12
spec_max = 2 spec_max = 2
@ -1020,7 +1039,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。") label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")
+ i18n("v3暂不支持该模式使用了会报错。"), + i18n("v3暂不支持该模式使用了会报错。"),
value=False, value=False,
interactive=True, interactive=True if model_version != "v3" else False,
show_label=True, show_label=True,
scale=1, scale=1,
) )
@ -1137,7 +1156,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
# phoneme=gr.Textbox(label=i18n("音素框"), value="") # phoneme=gr.Textbox(label=i18n("音素框"), value="")
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
with gr.Row(): with gr.Row():
inference_button = gr.Button(i18n("合成语音"), variant="primary", size="lg", scale=25) inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25)
output = gr.Audio(label=i18n("输出的语音"), scale=14) output = gr.Audio(label=i18n("输出的语音"), scale=14)
inference_button.click( inference_button.click(
@ -1176,6 +1195,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
inp_refs, inp_refs,
ref_text_free, ref_text_free,
if_sr_Checkbox, if_sr_Checkbox,
inference_button,
], ],
) )
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])

View File

@ -7,11 +7,11 @@
全部按日文识别 全部按日文识别
""" """
import random
import os
import re
import logging
import json import json
import logging
import os
import random
import re
import sys import sys
now_dir = os.getcwd() now_dir = os.getcwd()
@ -47,11 +47,13 @@ gpt_path = os.environ.get("gpt_path", None)
sovits_path = os.environ.get("sovits_path", None) sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None) cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
bert_path = os.environ.get("bert_path", None) bert_path = os.environ.get("bert_path", None)
version = os.environ.get("version", "v2") version = model_version = os.environ.get("version", "v2")
import gradio as gr import gradio as gr
from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR from inference_webui import DictToAttrRecursive
from TTS_infer_pack.text_segmentation_method import get_method from TTS_infer_pack.text_segmentation_method import get_method
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
from tools.i18n.i18n import I18nAuto, scan_language_list from tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto") language = os.environ.get("language", "Auto")
@ -254,21 +256,18 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
from process_ckpt import get_sovits_version_from_path_fast from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
global version, dict_language global version, model_version, dict_language, if_lora_v3
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
# print(sovits_path,version, model_version, if_lora_v3)
if if_lora_v3 and not os.path.exists(path_sovits_v3): if if_lora_v3 == True and is_exist_s2gv3 == False: #
info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
gr.Warning(info) gr.Warning(info)
raise FileExistsError(info) raise FileExistsError(info)
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
tts_pipeline.init_vits_weights(sovits_path)
dict_language = dict_language_v1 if tts_pipeline.configs.version == "v1" else dict_language_v2
if prompt_language is not None and text_language is not None: if prompt_language is not None and text_language is not None:
if prompt_language in list(dict_language.keys()): if prompt_language in list(dict_language.keys()):
prompt_text_update, prompt_language_update = ( prompt_text_update, prompt_language_update = (
@ -289,6 +288,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
else: else:
visible_sample_steps = False visible_sample_steps = False
visible_inp_refs = True visible_inp_refs = True
# prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
yield ( yield (
{"__type__": "update", "choices": list(dict_language.keys())}, {"__type__": "update", "choices": list(dict_language.keys())},
{"__type__": "update", "choices": list(dict_language.keys())}, {"__type__": "update", "choices": list(dict_language.keys())},
@ -296,12 +296,25 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
prompt_language_update, prompt_language_update,
text_update, text_update,
text_language_update, text_language_update,
{"__type__": "update", "visible": visible_sample_steps}, {"__type__": "update", "interactive": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, {"__type__": "update", "interactive": True if model_version != "v3" else False},
{"__type__": "update", "visible": True if model_version == "v3" else False}, {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False},
) )
tts_pipeline.init_vits_weights(sovits_path)
yield (
{"__type__": "update", "choices": list(dict_language.keys())},
{"__type__": "update", "choices": list(dict_language.keys())},
prompt_text_update,
prompt_language_update,
text_update,
text_language_update,
{"__type__": "update", "interactive": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "interactive": True if model_version != "v3" else False},
{"__type__": "update", "value": i18n("合成语音"), "interactive": True},
)
with open("./weight.json") as f: with open("./weight.json") as f:
data = f.read() data = f.read()
data = json.loads(data) data = json.loads(data)
@ -341,7 +354,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(value=i18n("*请上传并填写参考信息")) gr.Markdown(value=i18n("*请上传并填写参考信息"))
with gr.Row(): with gr.Row():
inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频超过会报错)"), type="filepath") inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频超过会报错)"), type="filepath")
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"), file_count="multiple") inp_refs = gr.File(
label=i18n("辅参考音频(可选多个,或不选)"),
file_count="multiple",
visible=True if model_version != "v3" else False,
)
prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2) prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
with gr.Row(): with gr.Row():
prompt_language = gr.Dropdown( prompt_language = gr.Dropdown(
@ -351,7 +368,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
ref_text_free = gr.Checkbox( ref_text_free = gr.Checkbox(
label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"),
value=False, value=False,
interactive=True, interactive=True if model_version != "v3" else False,
show_label=True, show_label=True,
) )
gr.Markdown( gr.Markdown(
@ -465,8 +482,19 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
SoVITS_dropdown.change( SoVITS_dropdown.change(
change_sovits_weights, change_sovits_weights,
[SoVITS_dropdown, prompt_language, text_language], [SoVITS_dropdown, prompt_language, text_language],
[prompt_language, text_language, prompt_text, prompt_language, text, text_language], [
) prompt_language,
text_language,
prompt_text,
prompt_language,
text,
text_language,
sample_steps,
inp_refs,
ref_text_free,
inference_button,
],
) #
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
with gr.Group(): with gr.Group():

View File

@ -63,7 +63,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1") extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory, "G2PWModel") extract_dir_new = os.path.join(parent_directory, "G2PWModel")
print("Downloading g2pw model...") print("Downloading g2pw model...")
modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r: with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status() r.raise_for_status()
with open(zip_dir, "wb") as f: with open(zip_dir, "wb") as f:

View File

@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122 修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
修复v3sovits未传参以支持调节语速 修复v3sovits未传参以支持调节语速
### 202503
修复一批由依赖的库版本不对导致的问题 https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
修复模型加载异步逻辑 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
修复其他若干bug
重点更新:
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
2-整合包修复onnxruntime GPU推理的支持,影响:(1)g2pw有个onnx模型原先是CPU推理现在用GPU,显著降低推理的CPU瓶颈 (2)foxjoy去混响模型现在可使用GPU推理

View File

@ -435,9 +435,9 @@ def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, so
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language)
else: else:
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language)
#####v3暂不支持加速推理 # #####v3暂不支持加速推理
if version == "v3": # if version=="v3":
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
if p_tts_inference is None: if p_tts_inference is None:
os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path)
os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path) os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path)
@ -1312,9 +1312,9 @@ def switch_version(version_):
"value": False if not if_force_ckpt else True, "value": False if not if_force_ckpt else True,
"interactive": True if not if_force_ckpt else False, "interactive": True if not if_force_ckpt else False,
}, },
{"__type__": "update", "interactive": False if version == "v3" else True, "value": False}, {"__type__": "update", "interactive": True, "value": False},
{"__type__": "update", "visible": True if version == "v3" else False}, {"__type__": "update", "visible": True if version == "v3" else False},
) ) # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer
if os.path.exists("GPT_SoVITS/text/G2PWModel"): if os.path.exists("GPT_SoVITS/text/G2PWModel"):