Merge remote-tracking branch 'upstream/main' into Ruff-Format

This commit is contained in:
XXXXRT666 2025-04-01 11:19:53 +01:00
commit 27ee75e47b
6 changed files with 117 additions and 53 deletions

View File

@ -1,38 +1,40 @@
from copy import deepcopy
import gc
import math
import os
import sys
import gc
import random
import traceback
import sys
import time
import traceback
from copy import deepcopy
import torchaudio
from tqdm import tqdm
now_dir = os.getcwd()
sys.path.append(now_dir)
import ffmpeg
import os
from typing import List, Tuple, Union
import ffmpeg
import librosa
import numpy as np
import torch
import torch.nn.functional as F
import yaml
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.audio_sr import AP_BWE
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from BigVGAN.bigvgan import BigVGAN
from feature_extractor.cnhubert import CNHubert
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
from module.models import SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model
import librosa
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.audio_sr import AP_BWE
from tools.i18n.i18n import I18nAuto, scan_language_list
from tools.my_utils import load_audio
from module.mel_processing import spectrogram_torch
from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
from BigVGAN.bigvgan import BigVGAN
from module.mel_processing import mel_spectrogram_torch
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -461,8 +463,6 @@ class TTS:
n_speakers=self.configs.n_speakers,
**kwargs,
)
if hasattr(vits_model, "enc_q"):
del vits_model.enc_q
self.configs.is_v3_synthesizer = False
else:
vits_model = SynthesizerTrnV3(
@ -473,6 +473,8 @@ class TTS:
)
self.configs.is_v3_synthesizer = True
self.init_bigvgan()
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
del vits_model.enc_q
if if_lora_v3 == False:
print(

View File

@ -9,9 +9,10 @@
import logging
import traceback
import torchaudio
import warnings
import torchaudio
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
@ -22,10 +23,11 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
warnings.simplefilter(action="ignore", category=FutureWarning)
import json
import os
import re
import sys
import json
import torch
from text.LangSegmenter import LangSegmenter
@ -91,16 +93,17 @@ is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
# is_half=False
punctuation = set(["!", "?", "", ",", ".", "-", " "])
import gradio as gr
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
import librosa
import numpy as np
from feature_extractor import cnhubert
from transformers import AutoModelForMaskedLM, AutoTokenizer
cnhubert.cnhubert_base_path = cnhubert_base_path
from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3
import random
from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3
def set_seed(seed):
if seed == -1:
@ -115,12 +118,14 @@ def set_seed(seed):
# set_seed(42)
from time import time as ttime
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from peft import LoraConfig, get_peft_model
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from time import time as ttime
from tools.i18n.i18n import I18nAuto, scan_language_list
from peft import LoraConfig, get_peft_model
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -265,10 +270,11 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
prompt_language_update,
text_update,
text_language_update,
{"__type__": "update", "visible": visible_sample_steps},
{"__type__": "update", "visible": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False},
{"__type__": "update", "visible": True if model_version == "v3" else False},
{"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False},
)
dict_s2 = load_sovits_new(sovits_path)
@ -329,6 +335,19 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
# torch.save(vq_model.state_dict(),"merge_win.pth")
vq_model.eval()
yield (
{"__type__": "update", "choices": list(dict_language.keys())},
{"__type__": "update", "choices": list(dict_language.keys())},
prompt_text_update,
prompt_language_update,
text_update,
text_language_update,
{"__type__": "update", "visible": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False},
{"__type__": "update", "visible": True if model_version == "v3" else False},
{"__type__": "update", "value": i18n("合成语音"), "interactive": True},
)
with open("./weight.json") as f:
data = f.read()
data = json.loads(data)
@ -530,7 +549,7 @@ def get_phones_and_bert(text, language, version, final=False):
return phones, bert.to(dtype), norm_text
from module.mel_processing import spectrogram_torch, mel_spectrogram_torch
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
spec_min = -12
spec_max = 2
@ -1020,7 +1039,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")
+ i18n("v3暂不支持该模式使用了会报错。"),
value=False,
interactive=True,
interactive=True if model_version != "v3" else False,
show_label=True,
scale=1,
)
@ -1137,7 +1156,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
with gr.Row():
inference_button = gr.Button(i18n("合成语音"), variant="primary", size="lg", scale=25)
inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25)
output = gr.Audio(label=i18n("输出的语音"), scale=14)
inference_button.click(
@ -1176,6 +1195,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
inp_refs,
ref_text_free,
if_sr_Checkbox,
inference_button,
],
)
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])

View File

@ -7,11 +7,11 @@
全部按日文识别
"""
import random
import os
import re
import logging
import json
import logging
import os
import random
import re
import sys
now_dir = os.getcwd()
@ -47,11 +47,13 @@ gpt_path = os.environ.get("gpt_path", None)
sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
bert_path = os.environ.get("bert_path", None)
version = os.environ.get("version", "v2")
version = model_version = os.environ.get("version", "v2")
import gradio as gr
from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR
from inference_webui import DictToAttrRecursive
from TTS_infer_pack.text_segmentation_method import get_method
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
from tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto")
@ -254,21 +256,18 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
from process_ckpt import get_sovits_version_from_path_fast
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
global version, dict_language
global version, model_version, dict_language, if_lora_v3
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
if if_lora_v3 and not os.path.exists(path_sovits_v3):
info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
# print(sovits_path,version, model_version, if_lora_v3)
if if_lora_v3 == True and is_exist_s2gv3 == False: #
info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
gr.Warning(info)
raise FileExistsError(info)
tts_pipeline.init_vits_weights(sovits_path)
dict_language = dict_language_v1 if tts_pipeline.configs.version == "v1" else dict_language_v2
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
if prompt_language is not None and text_language is not None:
if prompt_language in list(dict_language.keys()):
prompt_text_update, prompt_language_update = (
@ -289,6 +288,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
else:
visible_sample_steps = False
visible_inp_refs = True
# prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
yield (
{"__type__": "update", "choices": list(dict_language.keys())},
{"__type__": "update", "choices": list(dict_language.keys())},
@ -296,12 +296,25 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
prompt_language_update,
text_update,
text_language_update,
{"__type__": "update", "visible": visible_sample_steps},
{"__type__": "update", "interactive": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False},
{"__type__": "update", "visible": True if model_version == "v3" else False},
{"__type__": "update", "interactive": True if model_version != "v3" else False},
{"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False},
)
tts_pipeline.init_vits_weights(sovits_path)
yield (
{"__type__": "update", "choices": list(dict_language.keys())},
{"__type__": "update", "choices": list(dict_language.keys())},
prompt_text_update,
prompt_language_update,
text_update,
text_language_update,
{"__type__": "update", "interactive": visible_sample_steps, "value": 32},
{"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "interactive": True if model_version != "v3" else False},
{"__type__": "update", "value": i18n("合成语音"), "interactive": True},
)
with open("./weight.json") as f:
data = f.read()
data = json.loads(data)
@ -341,7 +354,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(value=i18n("*请上传并填写参考信息"))
with gr.Row():
inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频超过会报错)"), type="filepath")
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"), file_count="multiple")
inp_refs = gr.File(
label=i18n("辅参考音频(可选多个,或不选)"),
file_count="multiple",
visible=True if model_version != "v3" else False,
)
prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
with gr.Row():
prompt_language = gr.Dropdown(
@ -351,7 +368,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
ref_text_free = gr.Checkbox(
label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"),
value=False,
interactive=True,
interactive=True if model_version != "v3" else False,
show_label=True,
)
gr.Markdown(
@ -465,8 +482,19 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
SoVITS_dropdown.change(
change_sovits_weights,
[SoVITS_dropdown, prompt_language, text_language],
[prompt_language, text_language, prompt_text, prompt_language, text, text_language],
)
[
prompt_language,
text_language,
prompt_text,
prompt_language,
text,
text_language,
sample_steps,
inp_refs,
ref_text_free,
inference_button,
],
) #
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
with gr.Group():

View File

@ -63,7 +63,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
print("Downloading g2pw model...")
modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status()
with open(zip_dir, "wb") as f:

View File

@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
修复v3sovits未传参以支持调节语速
### 202503
修复一批由依赖的库版本不对导致的问题:https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
修复模型加载异步逻辑:https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
修复其他若干bug
重点更新:
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
2-整合包修复onnxruntime GPU推理的支持,影响:(1) g2pw有个onnx模型,原先是CPU推理,现在用GPU,显著降低推理的CPU瓶颈;(2) foxjoy去混响模型现在可使用GPU推理

View File

@ -435,9 +435,9 @@ def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, so
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language)
else:
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language)
#####v3暂不支持加速推理
if version == "v3":
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language)
# #####v3暂不支持加速推理
# if version=="v3":
# cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
if p_tts_inference is None:
os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path)
os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path)
@ -1312,9 +1312,9 @@ def switch_version(version_):
"value": False if not if_force_ckpt else True,
"interactive": True if not if_force_ckpt else False,
},
{"__type__": "update", "interactive": False if version == "v3" else True, "value": False},
{"__type__": "update", "interactive": True, "value": False},
{"__type__": "update", "visible": True if version == "v3" else False},
)
) # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer
if os.path.exists("GPT_SoVITS/text/G2PWModel"):