From 7394dc7b0c9e5012b614f8d7b48404a1d6c5ad38 Mon Sep 17 00:00:00 2001 From: ChasonJiang <46401978+ChasonJiang@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:34:51 +0800 Subject: [PATCH 01/14] =?UTF-8?q?=E4=B8=BAapi=5Fv2=E5=92=8Cinference=5Fweb?= =?UTF-8?q?ui=5Ffast=E9=80=82=E9=85=8DV3=E7=89=88=E6=9C=AC=20(#2188)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modified: GPT_SoVITS/TTS_infer_pack/TTS.py modified: GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py modified: GPT_SoVITS/inference_webui_fast.py * 适配V3版本 * api_v2.py和inference_webui_fast.py的v3适配 * 修改了个远古bug,增加了更友好的提示信息 * 优化webui * 修改为正确的path * 修复v3 lora模型的载入问题 * 修复读取tts_infer.yaml文件时遇到的编码不匹配的问题 --- .gitignore | 4 +- .../alias_free_activation/torch/act.py | 2 +- .../alias_free_activation/torch/resample.py | 4 +- GPT_SoVITS/BigVGAN/bigvgan.py | 14 +- GPT_SoVITS/BigVGAN/meldataset.py | 2 +- GPT_SoVITS/BigVGAN/utils0.py | 2 +- GPT_SoVITS/TTS_infer_pack/TTS.py | 447 ++++++++++++++---- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 7 +- GPT_SoVITS/configs/tts_infer.yaml | 12 +- GPT_SoVITS/inference_webui_fast.py | 109 +++-- api_v2.py | 24 +- tools/audio_sr.py | 5 + 12 files changed, 486 insertions(+), 146 deletions(-) diff --git a/.gitignore b/.gitignore index b7fec30..e5cedbf 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,6 @@ SoVITS_weights_v3 TEMP weight.json ffmpeg* -ffprobe* \ No newline at end of file +ffprobe* +tools/AP_BWE_main/24kto48k/* +!tools/AP_BWE_main/24kto48k/readme.txt \ No newline at end of file diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py index cc6e9f8..a6693aa 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py @@ -2,7 +2,7 @@ # LICENSE is in incl_licenses directory. import torch.nn as nn -from alias_free_activation.torch.resample import UpSample1d, DownSample1d +from .resample import UpSample1d, DownSample1d class Activation1d(nn.Module): diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py index f321150..a35380f 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py @@ -3,8 +3,8 @@ import torch.nn as nn from torch.nn import functional as F -from alias_free_activation.torch.filter import LowPassFilter1d -from alias_free_activation.torch.filter import kaiser_sinc_filter1d +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d class UpSample1d(nn.Module): diff --git a/GPT_SoVITS/BigVGAN/bigvgan.py b/GPT_SoVITS/BigVGAN/bigvgan.py index 214672e..6c4a223 100644 --- a/GPT_SoVITS/BigVGAN/bigvgan.py +++ b/GPT_SoVITS/BigVGAN/bigvgan.py @@ -14,10 +14,10 @@ import torch.nn as nn from torch.nn import Conv1d, ConvTranspose1d from torch.nn.utils import weight_norm, remove_weight_norm -import activations -from utils0 import init_weights, get_padding -from alias_free_activation.torch.act import Activation1d as TorchActivation1d -from env import AttrDict +from . import activations +from .utils0 import init_weights, get_padding +from .alias_free_activation.torch.act import Activation1d as TorchActivation1d +from .env import AttrDict from huggingface_hub import PyTorchModelHubMixin, hf_hub_download @@ -93,7 +93,7 @@ class AMPBlock1(torch.nn.Module): # Select which Activation1d, lazy-load cuda version to ensure backward compatibility if self.h.get("use_cuda_kernel", False): - from alias_free_activation.cuda.activation1d import ( + from .alias_free_activation.cuda.activation1d import ( Activation1d as CudaActivation1d, ) @@ -193,7 +193,7 @@ class AMPBlock2(torch.nn.Module): # Select which Activation1d, lazy-load cuda version to ensure backward compatibility if self.h.get("use_cuda_kernel", False): - from alias_free_activation.cuda.activation1d import ( + from .alias_free_activation.cuda.activation1d import ( Activation1d as CudaActivation1d, ) @@ -271,7 +271,7 @@ class BigVGAN( # Select which Activation1d, lazy-load cuda version to ensure backward compatibility if self.h.get("use_cuda_kernel", False): - from alias_free_activation.cuda.activation1d import ( + from .alias_free_activation.cuda.activation1d import ( Activation1d as CudaActivation1d, ) diff --git a/GPT_SoVITS/BigVGAN/meldataset.py b/GPT_SoVITS/BigVGAN/meldataset.py index bfbd4b6..a5859b9 100644 --- a/GPT_SoVITS/BigVGAN/meldataset.py +++ b/GPT_SoVITS/BigVGAN/meldataset.py @@ -15,7 +15,7 @@ from librosa.filters import mel as librosa_mel_fn import pathlib from tqdm import tqdm from typing import List, Tuple, Optional -from env import AttrDict +from .env import AttrDict MAX_WAV_VALUE = 32767.0 # NOTE: 32768.0 -1 to prevent int16 overflow (results in popping sound in corner cases) diff --git a/GPT_SoVITS/BigVGAN/utils0.py b/GPT_SoVITS/BigVGAN/utils0.py index 888ea89..da98a24 100644 --- a/GPT_SoVITS/BigVGAN/utils0.py +++ b/GPT_SoVITS/BigVGAN/utils0.py @@ -9,7 +9,7 @@ from torch.nn.utils import weight_norm matplotlib.use("Agg") import matplotlib.pylab as plt -from meldataset import MAX_WAV_VALUE +from .meldataset import MAX_WAV_VALUE from scipy.io.wavfile import write diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 012cbf8..52402e9 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -4,6 +4,7 @@ import os, sys, gc import random import traceback +import torchaudio from tqdm import tqdm now_dir = os.getcwd() sys.path.append(now_dir) @@ -15,10 +16,11 @@ import torch import torch.nn.functional as F import yaml from transformers import AutoModelForMaskedLM, AutoTokenizer - +from tools.audio_sr import AP_BWE from AR.models.t2s_lightning_module import Text2SemanticLightningModule from feature_extractor.cnhubert import CNHubert -from module.models import SynthesizerTrn +from module.models import SynthesizerTrn, SynthesizerTrnV3 +from peft import LoraConfig, get_peft_model import librosa from time import time as ttime from tools.i18n.i18n import I18nAuto, scan_language_list @@ -26,10 +28,98 @@ from tools.my_utils import load_audio from module.mel_processing import spectrogram_torch from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.TextPreprocessor import TextPreprocessor +from BigVGAN.bigvgan import BigVGAN +from module.mel_processing import spectrogram_torch,mel_spectrogram_torch +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) + + +spec_min = -12 +spec_max = 2 +def norm_spec(x): + return (x - spec_min) / (spec_max - spec_min) * 2 - 1 +def denorm_spec(x): + return (x + 1) / 2 * (spec_max - spec_min) + spec_min +mel_fn=lambda x: mel_spectrogram_torch(x, **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False +}) + + +def speed_change(input_audio:np.ndarray, speed:float, sr:int): + # 将 NumPy 数组转换为原始 PCM 流 + raw_audio = input_audio.astype(np.int16).tobytes() + + # 设置 ffmpeg 输入流 + input_stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1) + + # 变速处理 + output_stream = input_stream.filter('atempo', speed) + + # 输出流到管道 + out, _ = ( + output_stream.output('pipe:', format='s16le', acodec='pcm_s16le') + .run(input=raw_audio, capture_stdout=True, capture_stderr=True) + ) + + # 将管道输出解码为 NumPy 数组 + processed_audio = np.frombuffer(out, np.int16) + + return processed_audio + + + +resample_transform_dict={} +def resample(audio_tensor, sr0, device): + global resample_transform_dict + if sr0 not in resample_transform_dict: + resample_transform_dict[sr0] = torchaudio.transforms.Resample( + sr0, 24000 + ).to(device) + return resample_transform_dict[sr0](audio_tensor) + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +class NO_PROMPT_ERROR(Exception): + pass + + # configs/tts_infer.yaml """ custom: @@ -56,11 +146,19 @@ default_v2: t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 +default_v3: + bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + device: cpu + is_half: false + t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt + vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth + version: v3 """ def set_seed(seed:int): seed = int(seed) - seed = seed if seed != -1 else random.randrange(1 << 32) + seed = seed if seed != -1 else random.randint(0, 2**32 - 1) print(f"Set seed to {seed}") os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) @@ -82,7 +180,7 @@ def set_seed(seed:int): class TTS_Config: default_configs={ - "default":{ + "v1":{ "device": "cpu", "is_half": False, "version": "v1", @@ -91,7 +189,7 @@ class TTS_Config: "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", }, - "default_v2":{ + "v2":{ "device": "cpu", "is_half": False, "version": "v2", @@ -100,6 +198,15 @@ class TTS_Config: "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", }, + "v3":{ + "device": "cpu", + "is_half": False, + "version": "v3", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, } configs:dict = None v1_languages:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] @@ -136,12 +243,9 @@ class TTS_Config: assert isinstance(configs, dict) version = configs.get("version", "v2").lower() - assert version in ["v1", "v2"] - self.default_configs["default"] = configs.get("default", self.default_configs["default"]) - self.default_configs["default_v2"] = configs.get("default_v2", self.default_configs["default_v2"]) - - default_config_key = "default"if version=="v1" else "default_v2" - self.configs:dict = configs.get("custom", deepcopy(self.default_configs[default_config_key])) + assert version in ["v1", "v2", "v3"] + self.default_configs[version] = configs.get(version, self.default_configs[version]) + self.configs:dict = configs.get("custom", deepcopy(self.default_configs[version])) self.device = self.configs.get("device", torch.device("cpu")) @@ -159,20 +263,22 @@ class TTS_Config: self.vits_weights_path = self.configs.get("vits_weights_path", None) self.bert_base_path = self.configs.get("bert_base_path", None) self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None) - self.languages = self.v2_languages if self.version=="v2" else self.v1_languages + self.languages = self.v1_languages if self.version=="v1" else self.v2_languages + + self.is_v3_synthesizer:bool = False if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)): - self.t2s_weights_path = self.default_configs[default_config_key]['t2s_weights_path'] + self.t2s_weights_path = self.default_configs[version]['t2s_weights_path'] print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}") if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)): - self.vits_weights_path = self.default_configs[default_config_key]['vits_weights_path'] + self.vits_weights_path = self.default_configs[version]['vits_weights_path'] print(f"fall back to default vits_weights_path: {self.vits_weights_path}") if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)): - self.bert_base_path = self.default_configs[default_config_key]['bert_base_path'] + self.bert_base_path = self.default_configs[version]['bert_base_path'] print(f"fall back to default bert_base_path: {self.bert_base_path}") if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)): - self.cnhuhbert_base_path = self.default_configs[default_config_key]['cnhuhbert_base_path'] + self.cnhuhbert_base_path = self.default_configs[version]['cnhuhbert_base_path'] print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}") self.update_configs() @@ -195,7 +301,7 @@ class TTS_Config: else: print(i18n("路径不存在,使用默认配置")) self.save_configs(configs_path) - with open(configs_path, 'r') as f: + with open(configs_path, 'r', encoding='utf-8') as f: configs = yaml.load(f, Loader=yaml.FullLoader) return configs @@ -224,7 +330,7 @@ class TTS_Config: def update_version(self, version:str)->None: self.version = version - self.languages = self.v2_languages if self.version=="v2" else self.v1_languages + self.languages = self.v1_languages if self.version=="v1" else self.v2_languages def __str__(self): self.configs = self.update_configs() @@ -252,10 +358,13 @@ class TTS: self.configs:TTS_Config = TTS_Config(configs) self.t2s_model:Text2SemanticLightningModule = None - self.vits_model:SynthesizerTrn = None + self.vits_model:Union[SynthesizerTrn, SynthesizerTrnV3] = None self.bert_tokenizer:AutoTokenizer = None self.bert_model:AutoModelForMaskedLM = None self.cnhuhbert_model:CNHubert = None + self.bigvgan_model:BigVGAN = None + self.sr_model:AP_BWE = None + self.sr_model_not_exist:bool = False self._init_models() @@ -310,38 +419,83 @@ class TTS: self.bert_model = self.bert_model.half() def init_vits_weights(self, weights_path: str): - print(f"Loading VITS weights from {weights_path}") + self.configs.vits_weights_path = weights_path - dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False) - hps = dict_s2["config"] - if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: - self.configs.update_version("v1") - else: - self.configs.update_version("v2") - self.configs.save_configs() + version, model_version, if_lora_v3=get_sovits_version_from_path_fast(weights_path) + path_sovits_v3=self.configs.default_configs["v3"]["vits_weights_path"] + + if if_lora_v3==True and os.path.exists(path_sovits_v3)==False: + info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + raise FileExistsError(info) + + # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False) + dict_s2 = load_sovits_new(weights_path) + hps = dict_s2["config"] + + hps["model"]["semantic_frame_rate"] = "25hz" + if 'enc_p.text_embedding.weight'not in dict_s2['weight']: + hps["model"]["version"] = "v2"#v3model,v2sybomls + elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + hps["model"]["version"] = "v1" + else: + hps["model"]["version"] = "v2" + # version = hps["model"]["version"] - hps["model"]["version"] = self.configs.version self.configs.filter_length = hps["data"]["filter_length"] self.configs.segment_size = hps["train"]["segment_size"] self.configs.sampling_rate = hps["data"]["sampling_rate"] self.configs.hop_length = hps["data"]["hop_length"] self.configs.win_length = hps["data"]["win_length"] self.configs.n_speakers = hps["data"]["n_speakers"] - self.configs.semantic_frame_rate = "25hz" + self.configs.semantic_frame_rate = hps["model"]["semantic_frame_rate"] kwargs = hps["model"] - vits_model = SynthesizerTrn( - self.configs.filter_length // 2 + 1, - self.configs.segment_size // self.configs.hop_length, - n_speakers=self.configs.n_speakers, - **kwargs - ) + # print(f"self.configs.sampling_rate:{self.configs.sampling_rate}") + + self.configs.update_version(model_version) + + # print(f"model_version:{model_version}") + # print(f'hps["model"]["version"]:{hps["model"]["version"]}') + if model_version!="v3": + vits_model = SynthesizerTrn( + self.configs.filter_length // 2 + 1, + self.configs.segment_size // self.configs.hop_length, + n_speakers=self.configs.n_speakers, + **kwargs + ) + if hasattr(vits_model, "enc_q"): + del vits_model.enc_q + self.configs.is_v3_synthesizer = False + else: + vits_model = SynthesizerTrnV3( + self.configs.filter_length // 2 + 1, + self.configs.segment_size // self.configs.hop_length, + n_speakers=self.configs.n_speakers, + **kwargs + ) + self.configs.is_v3_synthesizer = True + self.init_bigvgan() + + + if if_lora_v3==False: + print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") + else: + print(f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}") + lora_rank=dict_s2["lora_rank"] + lora_config = LoraConfig( + target_modules=["to_k", "to_q", "to_v", "to_out.0"], + r=lora_rank, + lora_alpha=lora_rank, + init_lora_weights=True, + ) + vits_model.cfm = get_peft_model(vits_model.cfm, lora_config) + print(f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") + + vits_model.cfm = vits_model.cfm.merge_and_unload() - if hasattr(vits_model, "enc_q"): - del vits_model.enc_q vits_model = vits_model.to(self.configs.device) vits_model = vits_model.eval() - vits_model.load_state_dict(dict_s2["weight"], strict=False) + self.vits_model = vits_model if self.configs.is_half and str(self.configs.device)!="cpu": self.vits_model = self.vits_model.half() @@ -363,6 +517,30 @@ class TTS: if self.configs.is_half and str(self.configs.device)!="cpu": self.t2s_model = self.t2s_model.half() + + def init_bigvgan(self): + if self.bigvgan_model is not None: + return + self.bigvgan_model = BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions + # remove weight norm in the model and set to eval mode + self.bigvgan_model.remove_weight_norm() + self.bigvgan_model = self.bigvgan_model.eval() + if self.configs.is_half == True: + self.bigvgan_model = self.bigvgan_model.half().to(self.configs.device) + else: + self.bigvgan_model = self.bigvgan_model.to(self.configs.device) + + def init_sr_model(self): + if self.sr_model is not None: + return + try: + self.sr_model:AP_BWE=AP_BWE(self.configs.device,DictToAttrRecursive) + self.sr_model_not_exist = False + except FileNotFoundError: + print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好")) + self.sr_model_not_exist = True + + def enable_half_precision(self, enable: bool = True, save: bool = True): ''' To enable half precision for the TTS model. @@ -387,6 +565,8 @@ class TTS: self.bert_model =self.bert_model.half() if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.half() + if self.bigvgan_model is not None: + self.bigvgan_model = self.bigvgan_model.half() else: if self.t2s_model is not None: self.t2s_model = self.t2s_model.float() @@ -396,6 +576,8 @@ class TTS: self.bert_model = self.bert_model.float() if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.float() + if self.bigvgan_model is not None: + self.bigvgan_model = self.bigvgan_model.float() def set_device(self, device: torch.device, save: bool = True): ''' @@ -414,6 +596,11 @@ class TTS: self.bert_model = self.bert_model.to(device) if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.to(device) + if self.bigvgan_model is not None: + self.bigvgan_model = self.bigvgan_model.to(device) + if self.sr_model is not None: + self.sr_model = self.sr_model.to(device) + def set_ref_audio(self, ref_audio_path:str): ''' @@ -437,6 +624,11 @@ class TTS: self.prompt_cache["refer_spec"][0] = spec def _get_ref_spec(self, ref_audio_path): + raw_audio, raw_sr = torchaudio.load(ref_audio_path) + raw_audio=raw_audio.to(self.configs.device).float() + self.prompt_cache["raw_audio"] = raw_audio + self.prompt_cache["raw_sr"] = raw_sr + audio = load_audio(ref_audio_path, int(self.configs.sampling_rate)) audio = torch.FloatTensor(audio) maxx=audio.abs().max() @@ -625,11 +817,11 @@ class TTS: Recovery the order of the audio according to the batch_index_list. Args: - data (List[list(np.ndarray)]): the out of order audio . + data (List[list(torch.Tensor)]): the out of order audio . batch_index_list (List[list[int]]): the batch index list. Returns: - list (List[np.ndarray]): the data in the original order. + list (List[torch.Tensor]): the data in the original order. ''' length = len(sum(batch_index_list, [])) _data = [None]*length @@ -671,6 +863,8 @@ class TTS: "seed": -1, # int. random seed for reproducibility. "parallel_infer": True, # bool. whether to use parallel inference. "repetition_penalty": 1.35 # float. repetition penalty for T2S model. + "sample_steps": 32, # int. number of sampling steps for VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. } returns: Tuple[int, np.ndarray]: sampling rate and audio data. @@ -698,6 +892,8 @@ class TTS: actual_seed = set_seed(seed) parallel_infer = inputs.get("parallel_infer", True) repetition_penalty = inputs.get("repetition_penalty", 1.35) + sample_steps = inputs.get("sample_steps", 32) + super_sampling = inputs.get("super_sampling", False) if parallel_infer: print(i18n("并行推理模式已开启")) @@ -732,6 +928,9 @@ class TTS: if not no_prompt_text: assert prompt_lang in self.configs.languages + if no_prompt_text and self.configs.is_v3_synthesizer: + raise NO_PROMPT_ERROR("prompt_text cannot be empty when using SoVITS_V3") + if ref_audio_path in [None, ""] and \ ((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []])): raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()") @@ -761,13 +960,13 @@ class TTS: if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_lang != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) if self.prompt_cache["prompt_text"] != prompt_text: - self.prompt_cache["prompt_text"] = prompt_text - self.prompt_cache["prompt_lang"] = prompt_lang phones, bert_features, norm_text = \ self.text_preprocessor.segment_and_extract_feature_for_text( prompt_text, prompt_lang, self.configs.version) + self.prompt_cache["prompt_text"] = prompt_text + self.prompt_cache["prompt_lang"] = prompt_lang self.prompt_cache["phones"] = phones self.prompt_cache["bert_features"] = bert_features self.prompt_cache["norm_text"] = norm_text @@ -781,8 +980,7 @@ class TTS: if not return_fragment: data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version) if len(data) == 0: - yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate), - dtype=np.int16) + yield 16000, np.zeros(int(16000), dtype=np.int16) return batch_index_list:list = None @@ -836,6 +1034,7 @@ class TTS: t_34 = 0.0 t_45 = 0.0 audio = [] + output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000 for item in data: t3 = ttime() if return_fragment: @@ -858,7 +1057,7 @@ class TTS: else: prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device) - + print(f"############ {i18n('预测语义Token')} ############") pred_semantic_list, idx_list = self.t2s_model.model.infer_panel( all_phoneme_ids, all_phoneme_lens, @@ -892,70 +1091,80 @@ class TTS: # batch_audio_fragment = (self.vits_model.batched_decode( # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec # )) - - if speed_factor == 1.0: - # ## vits并行推理 method 2 - pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] - upsample_rate = math.prod(self.vits_model.upsample_rates) - audio_frag_idx = [pred_semantic_list[i].shape[0]*2*upsample_rate for i in range(0, len(pred_semantic_list))] - audio_frag_end_idx = [ sum(audio_frag_idx[:i+1]) for i in range(0, len(audio_frag_idx))] - all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) - _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device) - _batch_audio_fragment = (self.vits_model.decode( - all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor - ).detach()[0, 0, :]) - audio_frag_end_idx.insert(0, 0) - batch_audio_fragment= [_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))] + print(f"############ {i18n('合成音频')} ############") + if not self.configs.is_v3_synthesizer: + if speed_factor == 1.0: + # ## vits并行推理 method 2 + pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] + upsample_rate = math.prod(self.vits_model.upsample_rates) + audio_frag_idx = [pred_semantic_list[i].shape[0]*2*upsample_rate for i in range(0, len(pred_semantic_list))] + audio_frag_end_idx = [ sum(audio_frag_idx[:i+1]) for i in range(0, len(audio_frag_idx))] + all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) + _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device) + _batch_audio_fragment = (self.vits_model.decode( + all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :]) + audio_frag_end_idx.insert(0, 0) + batch_audio_fragment= [_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))] + else: + # ## vits串行推理 + for i, idx in enumerate(tqdm(idx_list)): + phones = batch_phones[i].unsqueeze(0).to(self.configs.device) + _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 + audio_fragment =(self.vits_model.decode( + _pred_semantic, phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :]) + batch_audio_fragment.append( + audio_fragment + ) ###试试重建不带上prompt部分 else: - # ## vits串行推理 - for i, idx in enumerate(idx_list): + for i, idx in enumerate(tqdm(idx_list)): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 - audio_fragment =(self.vits_model.decode( - _pred_semantic, phones, refer_audio_spec, speed=speed_factor - ).detach()[0, 0, :]) + audio_fragment = self.v3_synthesis( + _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps + ) batch_audio_fragment.append( audio_fragment - ) ###试试重建不带上prompt部分 + ) t5 = ttime() t_45 += t5 - t4 if return_fragment: print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4)) yield self.audio_postprocess([batch_audio_fragment], - self.configs.sampling_rate, + output_sr, None, speed_factor, False, - fragment_interval + fragment_interval, + super_sampling if self.configs.is_v3_synthesizer else False ) else: audio.append(batch_audio_fragment) if self.stop_flag: - yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate), - dtype=np.int16) + yield 16000, np.zeros(int(16000), dtype=np.int16) return if not return_fragment: print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45)) if len(audio) == 0: - yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate), - dtype=np.int16) + yield 16000, np.zeros(int(16000), dtype=np.int16) return yield self.audio_postprocess(audio, - self.configs.sampling_rate, + output_sr, batch_index_list, speed_factor, split_bucket, - fragment_interval + fragment_interval, + super_sampling if self.configs.is_v3_synthesizer else False ) except Exception as e: traceback.print_exc() # 必须返回一个空音频, 否则会导致显存不释放。 - yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate), - dtype=np.int16) + yield 16000, np.zeros(int(16000), dtype=np.int16) # 重置模型, 否则会导致显存释放不完全。 del self.t2s_model del self.vits_model @@ -983,7 +1192,8 @@ class TTS: batch_index_list:list=None, speed_factor:float=1.0, split_bucket:bool=True, - fragment_interval:float=0.3 + fragment_interval:float=0.3, + super_sampling:bool=False, )->Tuple[int, np.ndarray]: zero_wav = torch.zeros( int(self.configs.sampling_rate * fragment_interval), @@ -996,7 +1206,7 @@ class TTS: max_audio=torch.abs(audio_fragment).max()#简单防止16bit爆音 if max_audio>1: audio_fragment/=max_audio audio_fragment:torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) - audio[i][j] = audio_fragment.cpu().numpy() + audio[i][j] = audio_fragment if split_bucket: @@ -1005,8 +1215,21 @@ class TTS: # audio = [item for batch in audio for item in batch] audio = sum(audio, []) + audio = torch.cat(audio, dim=0) + + if super_sampling: + print(f"############ {i18n('音频超采样')} ############") + t1 = ttime() + self.init_sr_model() + if not self.sr_model_not_exist: + audio,sr=self.sr_model(audio.unsqueeze(0),sr) + max_audio=np.abs(audio).max() + if max_audio > 1: audio /= max_audio + t2 = ttime() + print(f"超采样用时:{t2-t1:.3f}s") + else: + audio = audio.cpu().numpy() - audio = np.concatenate(audio, 0) audio = (audio * 32768).astype(np.int16) # try: @@ -1018,25 +1241,59 @@ class TTS: return sr, audio + def v3_synthesis(self, + semantic_tokens:torch.Tensor, + phones:torch.Tensor, + speed:float=1.0, + sample_steps:int=32 + ): + + prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device) + prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device) + refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device) + fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) + ref_audio:torch.Tensor = self.prompt_cache["raw_audio"] + ref_sr = self.prompt_cache["raw_sr"] + ref_audio=ref_audio.to(self.configs.device).float() + if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.mean(0).unsqueeze(0) + if ref_sr!=24000: + ref_audio=resample(ref_audio, ref_sr, self.configs.device) + # print("ref_audio",ref_audio.abs().mean()) + mel2 = mel_fn(ref_audio) + mel2 = norm_spec(mel2) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + mel2 = mel2[:, :, :T_min] + fea_ref = fea_ref[:, :, :T_min] + if (T_min > 468): + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + chunk_len = 934 - T_min -def speed_change(input_audio:np.ndarray, speed:float, sr:int): - # 将 NumPy 数组转换为原始 PCM 流 - raw_audio = input_audio.astype(np.int16).tobytes() + mel2=mel2.to(self.precision) + fea_todo, ge = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed) - # 设置 ffmpeg 输入流 - input_stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1) + cfm_resss = [] + idx = 0 + while (1): + fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] + if (fea_todo_chunk.shape[-1] == 0): break + idx += chunk_len + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) - # 变速处理 - output_stream = input_stream.filter('atempo', speed) + cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) + cfm_res = cfm_res[:, :, mel2.shape[2]:] + mel2 = cfm_res[:, :, -T_min:] - # 输出流到管道 - out, _ = ( - output_stream.output('pipe:', format='s16le', acodec='pcm_s16le') - .run(input=raw_audio, capture_stdout=True, capture_stderr=True) - ) - - # 将管道输出解码为 NumPy 数组 - processed_audio = np.frombuffer(out, np.int16) - - return processed_audio + fea_ref = fea_todo_chunk[:, :, -T_min:] + cfm_resss.append(cfm_res) + cmf_res = torch.cat(cfm_resss, 2) + cmf_res = denorm_spec(cmf_res) + + with torch.inference_mode(): + wav_gen = self.bigvgan_model(cmf_res) + audio=wav_gen[0][0]#.cpu().detach().numpy() + + return audio diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 9def3da..653656a 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -118,11 +118,11 @@ class TextPreprocessor: def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: - language = language.replace("all_","") + # language = language.replace("all_","") formattext = text while " " in formattext: formattext = formattext.replace(" ", " ") - if language == "zh": + if language == "all_zh": if re.search(r'[A-Za-z]', formattext): formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) formattext = chinese.mix_text_normalize(formattext) @@ -130,7 +130,7 @@ class TextPreprocessor: else: phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) bert = self.get_bert_feature(norm_text, word2ph).to(self.device) - elif language == "yue" and re.search(r'[A-Za-z]', formattext): + elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) formattext = chinese.mix_text_normalize(formattext) return self.get_phones_and_bert(formattext,"yue",version) @@ -199,6 +199,7 @@ class TextPreprocessor: return phone_level_feature.T def clean_text_inf(self, text:str, language:str, version:str="v2"): + language = language.replace("all_","") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml index 66f1193..344aae4 100644 --- a/GPT_SoVITS/configs/tts_infer.yaml +++ b/GPT_SoVITS/configs/tts_infer.yaml @@ -6,7 +6,7 @@ custom: t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt version: v2 vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth -default: +v1: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu @@ -14,7 +14,7 @@ default: t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt version: v1 vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth -default_v2: +v2: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu @@ -22,3 +22,11 @@ default_v2: t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt version: v2 vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth +v3: + bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + device: cpu + is_half: false + t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt + version: v3 + vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 5a6910d..9017aa4 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -7,7 +7,7 @@ 全部按日文识别 ''' import random -import os, re, logging +import os, re, logging, json import sys now_dir = os.getcwd() sys.path.append(now_dir) @@ -44,7 +44,7 @@ bert_path = os.environ.get("bert_path", None) version=os.environ.get("version","v2") import gradio as gr -from TTS_infer_pack.TTS import TTS, TTS_Config +from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR from TTS_infer_pack.text_segmentation_method import get_method from tools.i18n.i18n import I18nAuto, scan_language_list @@ -62,6 +62,9 @@ if torch.cuda.is_available(): else: device = "cpu" +# is_half = False +# device = "cpu" + dict_language_v1 = { i18n("中文"): "all_zh",#全部按中文识别 i18n("英文"): "en",#全部按英文识别#######不变 @@ -123,11 +126,11 @@ def inference(text, text_lang, speed_factor, ref_text_free, split_bucket,fragment_interval, seed, keep_random, parallel_infer, - repetition_penalty + repetition_penalty, sample_steps, super_sampling, ): seed = -1 if keep_random else seed - actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32) + actual_seed = seed if seed not in [-1, "", None] else random.randint(0, 2**32 - 1) inputs={ "text": text, "text_lang": dict_language[text_lang], @@ -147,9 +150,14 @@ def inference(text, text_lang, "seed":actual_seed, "parallel_infer": parallel_infer, "repetition_penalty": repetition_penalty, + "sample_steps": int(sample_steps), + "super_sampling": super_sampling, } - for item in tts_pipeline.run(inputs): - yield item, actual_seed + try: + for item in tts_pipeline.run(inputs): + yield item, actual_seed + except NO_PROMPT_ERROR: + gr.Warning(i18n('V3不支持无参考文本模式,请填写参考文本!')) def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 @@ -163,19 +171,38 @@ def change_choices(): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} +path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" +pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",path_sovits_v3] +pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "GPT_SoVITS/pretrained_models/s2G488k.pth"] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"] _ =[[],[]] -for i in range(2): - if os.path.exists(pretrained_gpt_name[i]): - _[0].append(pretrained_gpt_name[i]) - if os.path.exists(pretrained_sovits_name[i]): - _[-1].append(pretrained_sovits_name[i]) +for i in range(3): + if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) pretrained_gpt_name,pretrained_sovits_name = _ -SoVITS_weight_root=["SoVITS_weights_v2","SoVITS_weights"] -GPT_weight_root=["GPT_weights_v2","GPT_weights"] + +if os.path.exists(f"./weight.json"): + pass +else: + with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file) + +with open(f"./weight.json", 'r', encoding="utf-8") as file: + weight_data = file.read() + weight_data=json.loads(weight_data) + gpt_path = os.environ.get( + "gpt_path", weight_data.get('GPT',{}).get(version,pretrained_gpt_name)) + sovits_path = os.environ.get( + "sovits_path", weight_data.get('SoVITS',{}).get(version,pretrained_sovits_name)) + if isinstance(gpt_path,list): + gpt_path = gpt_path[0] + if isinstance(sovits_path,list): + sovits_path = sovits_path[0] + + + +SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] +GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] for path in SoVITS_weight_root+GPT_weight_root: os.makedirs(path,exist_ok=True) @@ -194,10 +221,18 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) - +from process_ckpt import get_sovits_version_from_path_fast def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): - tts_pipeline.init_vits_weights(sovits_path) global version, dict_language + version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) + + if if_lora_v3 and not os.path.exists(path_sovits_v3): + info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + gr.Warning(info) + raise FileExistsError(info) + + tts_pipeline.init_vits_weights(sovits_path) + dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2 if prompt_language is not None and text_language is not None: if prompt_language in list(dict_language.keys()): @@ -210,9 +245,19 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): else: text_update = {'__type__':'update', 'value':''} text_language_update = {'__type__':'update', 'value':i18n("中文")} - return {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update - + if model_version=="v3": + visible_sample_steps=True + visible_inp_refs=False + else: + visible_sample_steps=False + visible_inp_refs=True + yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False} + with open("./weight.json")as f: + data=f.read() + data=json.loads(data) + data["SoVITS"][version]=sovits_path + with open("./weight.json","w")as f:f.write(json.dumps(data)) with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( @@ -257,13 +302,19 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: with gr.Row(): with gr.Column(): - batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True) - fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True) - speed_factor = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label="speed_factor",value=1.0,interactive=True) - top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) - top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) - temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) - repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True) + with gr.Row(): + batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True) + sample_steps = gr.Radio(label=i18n("采样步数(仅对V3生效)"),value=32,choices=[4,8,16,32],visible=True) + with gr.Row(): + fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True) + speed_factor = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label="语速",value=1.0,interactive=True) + with gr.Row(): + top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) + top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) + with gr.Row(): + temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) + repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True) + with gr.Column(): with gr.Row(): how_to_cut = gr.Dropdown( @@ -272,10 +323,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: value=i18n("凑四句一切"), interactive=True, scale=1 ) + super_sampling = gr.Checkbox(label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True) + + with gr.Row(): parallel_infer = gr.Checkbox(label=i18n("并行推理"), value=True, interactive=True, show_label=True) split_bucket = gr.Checkbox(label=i18n("数据分桶(并行推理时会降低一点计算量)"), value=True, interactive=True, show_label=True) with gr.Row(): + seed = gr.Number(label=i18n("随机种子"),value=-1) keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True) @@ -295,7 +350,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: speed_factor, ref_text_free, split_bucket,fragment_interval, seed, keep_random, parallel_infer, - repetition_penalty + repetition_penalty, sample_steps, super_sampling, ], [output, seed], ) diff --git a/api_v2.py b/api_v2.py index 92a18f3..3a8566a 100644 --- a/api_v2.py +++ b/api_v2.py @@ -39,6 +39,8 @@ POST: "seed": -1, # int. random seed for reproducibility. "parallel_infer": True, # bool. whether to use parallel inference. "repetition_penalty": 1.35 # float. repetition penalty for T2S model. + "sample_steps": 32, # int. number of sampling steps for VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. } ``` @@ -164,6 +166,8 @@ class TTS_Request(BaseModel): streaming_mode:bool = False parallel_infer:bool = True repetition_penalty:float = 1.35 + sample_steps:int = 32 + super_sampling:bool = False ### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files def pack_ogg(io_buffer:BytesIO, data:np.ndarray, rate:int): @@ -294,7 +298,9 @@ async def tts_handle(req:dict): "media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac". "streaming_mode": False, # bool. whether to return a streaming response. "parallel_infer": True, # bool.(optional) whether to use parallel inference. - "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. + "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. + "sample_steps": 32, # int. number of sampling steps for VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. } returns: StreamingResponse: audio stream response. @@ -316,10 +322,12 @@ async def tts_handle(req:dict): if streaming_mode: def streaming_generator(tts_generator:Generator, media_type:str): - if media_type == "wav": - yield wave_header_chunk() - media_type = "raw" + if_frist_chunk = True for sr, chunk in tts_generator: + if if_frist_chunk and media_type == "wav": + yield wave_header_chunk(sample_rate=sr) + media_type = "raw" + if_frist_chunk = False yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue() # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}" return StreamingResponse(streaming_generator(tts_generator, media_type, ), media_type=f"audio/{media_type}") @@ -365,7 +373,9 @@ async def tts_get_endpoint( media_type:str = "wav", streaming_mode:bool = False, parallel_infer:bool = True, - repetition_penalty:float = 1.35 + repetition_penalty:float = 1.35, + sample_steps:int =32, + super_sampling:bool = False ): req = { "text": text, @@ -387,7 +397,9 @@ async def tts_get_endpoint( "media_type":media_type, "streaming_mode":streaming_mode, "parallel_infer":parallel_infer, - "repetition_penalty":float(repetition_penalty) + "repetition_penalty":float(repetition_penalty), + "sample_steps":int(sample_steps), + "super_sampling":super_sampling } return await tts_handle(req) diff --git a/tools/audio_sr.py b/tools/audio_sr.py index d51f055..009ad26 100644 --- a/tools/audio_sr.py +++ b/tools/audio_sr.py @@ -39,6 +39,11 @@ class AP_BWE(): self.model=model self.h=h + def to(self, *arg, **kwargs): + self.model.to(*arg, **kwargs) + self.device = self.model.conv_pre_mag.weight.device + return self + def __call__(self, audio,orig_sampling_rate): with torch.no_grad(): # audio, orig_sampling_rate = torchaudio.load(inp_path) From 265586990cb0c23faa17640bf095bbf9bbc6136b Mon Sep 17 00:00:00 2001 From: KamioRinn <63162909+KamioRinn@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:35:52 +0800 Subject: [PATCH 02/14] =?UTF-8?q?=E6=9B=B4=E6=96=B0G2PWModel=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E9=93=BE=E6=8E=A5=20(#2219)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update G2PWModel download url * update G2PWModel download url --- GPT_SoVITS/text/g2pw/onnx_api.py | 2 +- README.md | 4 ++-- docs/cn/README.md | 4 ++-- docs/ja/README.md | 4 ++-- docs/ko/README.md | 4 ++-- docs/tr/README.md | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index 32fc2c0..dcb4604 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -58,7 +58,7 @@ def download_and_decompress(model_dir: str='G2PWModel/'): extract_dir = os.path.join(parent_directory,"G2PWModel_1.1") extract_dir_new = os.path.join(parent_directory,"G2PWModel") print("Downloading g2pw model...") - modelscope_url = "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" + modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" with requests.get(modelscope_url, stream=True) as r: r.raise_for_status() with open(zip_dir, 'wb') as f: diff --git a/README.md b/README.md index adc1344..31567a6 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`. -2. Download G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only) +2. Download G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only) 3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. @@ -253,7 +253,7 @@ Use v2 from v1 environment: 3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`. - Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. + Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. ## V3 Release Notes diff --git a/docs/cn/README.md b/docs/cn/README.md index 6196099..3594e43 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -147,7 +147,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。 -2. 从 [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文TTS) +2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文TTS) 3. 对于 UVR5(人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。 @@ -255,7 +255,7 @@ python webui.py 3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下 - 中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) + 中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) ## V3更新说明 diff --git a/docs/ja/README.md b/docs/ja/README.md index 8c815e8..ee7d819 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -138,7 +138,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。 -2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語TTSのみ) +2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語TTSのみ) 3. UVR5(ボーカル/伴奏(BGM等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。 @@ -242,7 +242,7 @@ V1環境からV2を使用するには: 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)からV2の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置 - 中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) + 中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) ## V3 リリースノート diff --git a/docs/ko/README.md b/docs/ko/README.md index 271ea57..5bf51ec 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -143,7 +143,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요. -2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용) +2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용) 3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요. @@ -248,7 +248,7 @@ V1 환경에서 V2를 사용하려면: 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오. - 중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) + 중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) ## V3 릴리스 노트 diff --git a/docs/tr/README.md b/docs/tr/README.md index 8b45663..d83bbab 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -138,7 +138,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin. -2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için) +2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için) 3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin. @@ -245,7 +245,7 @@ V1 ortamından V2'yi kullanmak için: 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin. - Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) + Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) ## V3 Sürüm Notları From 6c1c1bb72a1aa318e3b7368426d1469b148def87 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:45:06 +0800 Subject: [PATCH 03/14] Update README.md huggingface url Update README.md huggingface url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 31567a6..04bfff2 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ _Note: numba==0.56.4 requires py<3.11_ ### Windows -If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI. +If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI. **Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).** From d7c24e9ac9785fda038f41dd3bf29c93e4a73c3b Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:46:04 +0800 Subject: [PATCH 04/14] Update README.md --- docs/cn/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cn/README.md b/docs/cn/README.md index 3594e43..7698c76 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -53,7 +53,7 @@ _注: numba==0.56.4 需要 python<3.11_ ### Windows -如果你是 Windows 用户(已在 win>=10 上测试),可以下载[下载整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。 +如果你是 Windows 用户(已在 win>=10 上测试),可以下载[下载整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。 **中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO)。** From 86e6dea6944f444a15f116373e55922bac9952a1 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:46:14 +0800 Subject: [PATCH 05/14] Update README.md --- docs/ja/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ja/README.md b/docs/ja/README.md index ee7d819..cafb07b 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -49,7 +49,7 @@ _注記: numba==0.56.4 は py<3.11 が必要です_ ### Windows -Windows ユーザー:(Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。 +Windows ユーザー:(Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。 ### Linux From 4635cb4293ee4f1721b4a5f636a27df8f05b1b85 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:46:21 +0800 Subject: [PATCH 06/14] Update README.md --- docs/ko/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ko/README.md b/docs/ko/README.md index 5bf51ec..92f0e75 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -49,7 +49,7 @@ _참고: numba==0.56.4 는 python<3.11 을 필요로 합니다._ ### Windows -Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다. +Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다. ### Linux From b88bd391fc650a5066ffc2eccf89f057f76cf76d Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:46:29 +0800 Subject: [PATCH 07/14] Update README.md --- docs/tr/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tr/README.md b/docs/tr/README.md index d83bbab..6a13b2a 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -51,7 +51,7 @@ _Not: numba==0.56.4, py<3.11 gerektirir_ ### Windows -Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın. +Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın. ### Linux From f1332ff53a39ff792199d7a04c78d72f03129efe Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:49:48 +0800 Subject: [PATCH 08/14] Update README.md --- docs/cn/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cn/README.md b/docs/cn/README.md index 7698c76..62f8401 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -53,7 +53,7 @@ _注: numba==0.56.4 需要 python<3.11_ ### Windows -如果你是 Windows 用户(已在 win>=10 上测试),可以下载[下载整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。 +如果你是 Windows 用户(已在 win>=10 上测试),可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。 **中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO)。** From b0e465eb721f5c007324096690e8b5d87de4a86a Mon Sep 17 00:00:00 2001 From: zzz <458761603@qq.com> Date: Wed, 26 Mar 2025 14:50:55 +0800 Subject: [PATCH 09/14] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=BC?= =?UTF-8?q?=E5=87=BA=20v3=20=E7=9A=84=20script=20(#2208)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: 添加导出 v3 的 script * Fix: 由于 export_torch_script_v3 的改动,v2 现在需要传入 top_k --- GPT_SoVITS/export_torch_script.py | 31 +- GPT_SoVITS/export_torch_script_v3.py | 1045 ++++++++++++++++++++++ GPT_SoVITS/f5_tts/model/backbones/dit.py | 2 +- GPT_SoVITS/module/models_onnx.py | 206 ++++- 4 files changed, 1275 insertions(+), 9 deletions(-) create mode 100644 GPT_SoVITS/export_torch_script_v3.py diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py index f7bef13..3f2c296 100644 --- a/GPT_SoVITS/export_torch_script.py +++ b/GPT_SoVITS/export_torch_script.py @@ -427,7 +427,7 @@ class T2SModel(nn.Module): self.top_k = int(raw_t2s.config["inference"]["top_k"]) self.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) - def forward(self,prompts:LongTensor, ref_seq:LongTensor, text_seq:LongTensor, ref_bert:torch.Tensor, text_bert:torch.Tensor): + def forward(self,prompts:LongTensor, ref_seq:LongTensor, text_seq:LongTensor, ref_bert:torch.Tensor, text_bert:torch.Tensor,top_k:LongTensor): bert = torch.cat([ref_bert.T, text_bert.T], 1) all_phoneme_ids = torch.cat([ref_seq, text_seq], 1) bert = bert.unsqueeze(0) @@ -472,12 +472,13 @@ class T2SModel(nn.Module): .to(device=x.device, dtype=torch.bool) idx = 0 + top_k = int(top_k) xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) logits = self.ar_predict_layer(xy_dec[:, -1]) logits = logits[:, :-1] - samples = sample(logits, y, top_k=self.top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] + samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] y = torch.concat([y, samples], dim=1) y_emb = self.ar_audio_embedding(y[:, -1:]) xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) @@ -493,7 +494,7 @@ class T2SModel(nn.Module): if(idx<11):###至少预测出10个token不然不给停止(0.4s) logits = logits[:, :-1] - samples = sample(logits, y, top_k=self.top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] + samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] y = torch.concat([y, samples], dim=1) @@ -653,6 +654,8 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be torch._dynamo.mark_dynamic(ref_bert, 0) torch._dynamo.mark_dynamic(text_bert, 0) + top_k = torch.LongTensor([5]).to(device) + with torch.no_grad(): gpt_sovits_export = torch.jit.trace( gpt_sovits, @@ -662,7 +665,8 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be ref_seq, text_seq, ref_bert, - text_bert)) + text_bert, + top_k)) gpt_sovits_path = os.path.join(output_path, "gpt_sovits_model.pt") gpt_sovits_export.save(gpt_sovits_path) @@ -684,15 +688,26 @@ class GPT_SoVITS(nn.Module): self.t2s = t2s self.vits = vits - def forward(self, ssl_content:torch.Tensor, ref_audio_sr:torch.Tensor, ref_seq:Tensor, text_seq:Tensor, ref_bert:Tensor, text_bert:Tensor, speed=1.0): + def forward( + self, + ssl_content: torch.Tensor, + ref_audio_sr: torch.Tensor, + ref_seq: Tensor, + text_seq: Tensor, + ref_bert: Tensor, + text_bert: Tensor, + top_k: LongTensor, + speed=1.0, + ): codes = self.vits.vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompts = prompt_semantic.unsqueeze(0) - pred_semantic = self.t2s(prompts, ref_seq, text_seq, ref_bert, text_bert) + pred_semantic = self.t2s(prompts, ref_seq, text_seq, ref_bert, text_bert, top_k) audio = self.vits(text_seq, pred_semantic, ref_audio_sr, speed) return audio + def test(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") @@ -784,8 +799,10 @@ def test(): print('text_bert:',text_bert.shape) text_bert=text_bert.to('cuda') + top_k = torch.LongTensor([5]).to('cuda') + with torch.no_grad(): - audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert) + audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert, top_k) print('start write wav') soundfile.write("out.wav", audio.detach().cpu().numpy(), 32000) diff --git a/GPT_SoVITS/export_torch_script_v3.py b/GPT_SoVITS/export_torch_script_v3.py new file mode 100644 index 0000000..8b73d30 --- /dev/null +++ b/GPT_SoVITS/export_torch_script_v3.py @@ -0,0 +1,1045 @@ +import os +from export_torch_script import ( + T2SModel, + get_raw_t2s_model, + resamplex, + spectrogram_torch, +) +from f5_tts.model.backbones.dit import DiT +from feature_extractor import cnhubert +from inference_webui import get_phones_and_bert +import librosa +from module import commons +from module.mel_processing import mel_spectrogram_torch, spectral_normalize_torch +from module.models_onnx import CFM, SynthesizerTrnV3 +import numpy as np +import torch._dynamo.config +import torchaudio +import logging, uvicorn +import torch +import soundfile +from librosa.filters import mel as librosa_mel_fn + + +from inference_webui import get_spepc, norm_spec, resample, ssl_model + +logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG) +logger = logging.getLogger("uvicorn") + +is_half = True +device = "cuda" if torch.cuda.is_available() else "cpu" +now_dir = os.getcwd() + + +class MelSpectrgram(torch.nn.Module): + + def __init__( + self, + dtype, + device, + n_fft, + num_mels, + sampling_rate, + hop_size, + win_size, + fmin, + fmax, + center=False, + ): + super().__init__() + self.hann_window = torch.hann_window(1024).to(device=device, dtype=dtype) + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device) + self.n_fft:int = n_fft + self.hop_size:int = hop_size + self.win_size:int = win_size + self.center:bool = center + + def forward(self, y): + y = torch.nn.functional.pad( + y.unsqueeze(1), + ( + int((self.n_fft - self.hop_size) / 2), + int((self.n_fft - self.hop_size) / 2), + ), + mode="reflect", + ) + y = y.squeeze(1) + spec = torch.stft( + y, + self.n_fft, + hop_length=self.hop_size, + win_length=self.win_size, + window=self.hann_window, + center=self.center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9) + spec = torch.matmul(self.mel_basis, spec) + # spec = spectral_normalize_torch(spec) + spec = torch.log(torch.clamp(spec, min=1e-5)) + return spec + + +class ExportDitBlocks(torch.nn.Module): + def __init__(self, dit: DiT): + super().__init__() + self.transformer_blocks = dit.transformer_blocks + self.norm_out = dit.norm_out + self.proj_out = dit.proj_out + self.depth = dit.depth + + def forward(self, x, t, mask, rope): + for block in self.transformer_blocks: + x = block(x, t, mask=mask, rope=(rope, 1.0)) + x = self.norm_out(x, t) + output = self.proj_out(x) + return output + + +class ExportDitEmbed(torch.nn.Module): + def __init__(self, dit: DiT): + super().__init__() + self.time_embed = dit.time_embed + self.d_embed = dit.d_embed + self.text_embed = dit.text_embed + self.input_embed = dit.input_embed + self.rotary_embed = dit.rotary_embed + self.rotary_embed.inv_freq.to(device) + + def forward( + self, + x0: torch.Tensor, # nosied input audio # noqa: F722 + cond0: torch.Tensor, # masked cond audio # noqa: F722 + x_lens: torch.Tensor, + time: torch.Tensor, # time step # noqa: F821 F722 + dt_base_bootstrap: torch.Tensor, + text0: torch.Tensor, # noqa: F722#####condition feature + ): + x = x0.transpose(2, 1) + cond = cond0.transpose(2, 1) + text = text0.transpose(2, 1) + mask = commons.sequence_mask(x_lens, max_length=x.size(1)).to(x.device) + + t = self.time_embed(time) + self.d_embed(dt_base_bootstrap) + text_embed = self.text_embed(text, x.shape[1]) + rope_t = torch.arange(x.shape[1], device=device) + rope, _ = self.rotary_embed(rope_t) + x = self.input_embed(x, cond, text_embed) + return x, t, mask, rope + + +class ExportDiT(torch.nn.Module): + def __init__(self, dit: DiT): + super().__init__() + if dit != None: + self.embed = ExportDitEmbed(dit) + self.blocks = ExportDitBlocks(dit) + else: + self.embed = None + self.blocks = None + + def forward( # x, prompt_x, x_lens, t, style,cond + self, # d is channel,n is T + x0: torch.Tensor, # nosied input audio # noqa: F722 + cond0: torch.Tensor, # masked cond audio # noqa: F722 + x_lens: torch.Tensor, + time: torch.Tensor, # time step # noqa: F821 F722 + dt_base_bootstrap: torch.Tensor, + text0: torch.Tensor, # noqa: F722#####condition feature + ): + x, t, mask, rope = self.embed(x0, cond0, x_lens, time, dt_base_bootstrap, text0) + output = self.blocks(x, t, mask, rope) + return output + + +class ExportCFM(torch.nn.Module): + def __init__(self, cfm: CFM): + super().__init__() + self.cfm = cfm + + def forward( + self, + fea_ref: torch.Tensor, + fea_todo_chunk: torch.Tensor, + mel2: torch.Tensor, + sample_steps: torch.LongTensor, + ): + T_min = fea_ref.size(2) + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + cfm_res = self.cfm( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] + mel2 = cfm_res[:, :, -T_min:] + fea_ref = fea_todo_chunk[:, :, -T_min:] + return cfm_res, fea_ref, mel2 + + +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + +spec_min = -12 +spec_max = 2 + +@torch.jit.script +def norm_spec(x): + spec_min = -12 + spec_max = 2 + return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + +def denorm_spec(x): + spec_min = -12 + spec_max = 2 + return (x + 1) / 2 * (spec_max - spec_min) + spec_min + + +class ExportGPTSovitsHalf(torch.nn.Module): + + def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3): + super().__init__() + self.hps = hps + self.t2s_m = t2s_m + self.vq_model = vq_model + self.mel2 = MelSpectrgram( + dtype=torch.float32, + device=device, + n_fft=1024, + num_mels=100, + sampling_rate=24000, + hop_size=256, + win_size=1024, + fmin=0, + fmax=None, + center=False, + ) + # self.dtype = dtype + self.filter_length:int = hps.data.filter_length + self.sampling_rate:int = hps.data.sampling_rate + self.hop_length:int = hps.data.hop_length + self.win_length:int = hps.data.win_length + + def forward( + self, + ssl_content, + ref_audio_32k:torch.FloatTensor, + phoneme_ids0, + phoneme_ids1, + bert1, + bert2, + top_k, + ): + refer = spectrogram_torch( + ref_audio_32k, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ).to(ssl_content.dtype) + + + codes = self.vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompt = prompt_semantic.unsqueeze(0) + # print('extract_latent',codes.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + pred_semantic = self.t2s_m( + prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) + # print('t2s_m',pred_semantic.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + + ge = self.vq_model.create_ge(refer) + # print('create_ge',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + prompt_ = prompt.unsqueeze(0) + fea_ref = self.vq_model(prompt_, phoneme_ids0, ge) + # print('fea_ref',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + # print(prompt_.shape, phoneme_ids0.shape, ge.shape) + # print(fea_ref.shape) + + ref_24k = resamplex(ref_audio_32k, 32000, 24000) + mel2 = norm_spec(self.mel2(ref_24k)).to(ssl_content.dtype) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + mel2 = mel2[:, :, :T_min] + fea_ref = fea_ref[:, :, :T_min] + if T_min > 468: + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + + fea_todo = self.vq_model(pred_semantic, phoneme_ids1, ge) + # print('fea_todo',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + # print(pred_semantic.shape, phoneme_ids1.shape, ge.shape) + # print(fea_todo.shape) + + return fea_ref, fea_todo, mel2 + +class GPTSoVITSV3(torch.nn.Module): + def __init__(self, gpt_sovits_half, cfm, bigvgan): + super().__init__() + self.gpt_sovits_half = gpt_sovits_half + self.cfm = cfm + self.bigvgan = bigvgan + + def forward( + self, + ssl_content, + ref_audio_32k:torch.FloatTensor, + phoneme_ids0:torch.LongTensor, + phoneme_ids1:torch.LongTensor, + bert1, + bert2, + top_k: torch.LongTensor, + sample_steps: torch.LongTensor, + ): + # current_time = datetime.now() + # print("gpt_sovits_half",current_time.strftime("%Y-%m-%d %H:%M:%S")) + fea_ref, fea_todo, mel2 = self.gpt_sovits_half(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + chunk_len = 934 - fea_ref.shape[2] + wav_gen_list = [] + idx = 0 + wav_gen_length = fea_todo.shape[2] * 256 + while 1: + # current_time = datetime.now() + # print("idx:",idx,current_time.strftime("%Y-%m-%d %H:%M:%S")) + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + + # 因为导出的模型在不同shape时会重新编译还是怎么的,会卡顿10s这样, + # 所以在这里补0让他shape维持不变 + # 但是这样会导致生成的音频长度不对,所以在最后截取一下。 + # 经过 bigvgan 之后音频长度就是 fea_todo.shape[2] * 256 + complete_len = chunk_len - fea_todo_chunk.shape[-1] + if complete_len != 0: + fea_todo_chunk = torch.cat([fea_todo_chunk, torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype)], 2) + + cfm_res, fea_ref, mel2 = self.cfm(fea_ref, fea_todo_chunk, mel2, sample_steps) + idx += chunk_len + + cfm_res = denorm_spec(cfm_res) + bigvgan_res = self.bigvgan(cfm_res) + wav_gen_list.append(bigvgan_res) + + wav_gen = torch.cat(wav_gen_list, 2) + return wav_gen[0][0][:wav_gen_length] + +def init_bigvgan(): + global bigvgan_model + from BigVGAN import bigvgan + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" + % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions + # remove weight norm in the model and set to eval mode + bigvgan_model.remove_weight_norm() + bigvgan_model = bigvgan_model.eval() + if is_half == True: + bigvgan_model = bigvgan_model.half().to(device) + else: + bigvgan_model = bigvgan_model.to(device) + + +class Sovits: + def __init__(self, vq_model: SynthesizerTrnV3, cfm: CFM, hps): + self.vq_model = vq_model + self.hps = hps + cfm.estimator = ExportDiT(cfm.estimator) + self.cfm = cfm + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + + +def get_sovits_weights(sovits_path): + path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + is_exist_s2gv3 = os.path.exists(path_sovits_v3) + + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + if if_lora_v3 == True and is_exist_s2gv3 == False: + logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + + dict_s2 = load_sovits_new(sovits_path) + hps = dict_s2["config"] + hps = DictToAttrRecursive(hps) + hps.model.semantic_frame_rate = "25hz" + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: + hps.model.version = "v1" + else: + hps.model.version = "v2" + + if model_version == "v3": + hps.model.version = "v3" + + logger.info(f"hps: {hps}") + + vq_model = SynthesizerTrnV3( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + # init_bigvgan() + model_version = hps.model.version + logger.info(f"模型版本: {model_version}") + + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.load_state_dict(dict_s2["weight"], strict=False) + vq_model.eval() + + cfm = vq_model.cfm + del vq_model.cfm + + sovits = Sovits(vq_model, cfm, hps) + return sovits + + +logger.info(f"torch version {torch.__version__}") +# ssl_model = cnhubert.get_model() +# if is_half: +# ssl_model = ssl_model.half().to(device) +# else: +# ssl_model = ssl_model.to(device) + + +def export_cfm( + e_cfm: ExportCFM, + mu: torch.Tensor, + x_lens: torch.LongTensor, + prompt: torch.Tensor, + n_timesteps: torch.IntTensor, + temperature=1.0, +): + cfm = e_cfm.cfm + + B, T = mu.size(0), mu.size(1) + x = ( + torch.randn([B, cfm.in_channels, T], device=mu.device, dtype=mu.dtype) + * temperature + ) + print("x:", x.shape, x.dtype) + prompt_len = prompt.size(-1) + prompt_x = torch.zeros_like(x, dtype=mu.dtype) + prompt_x[..., :prompt_len] = prompt[..., :prompt_len] + x[..., :prompt_len] = 0.0 + mu = mu.transpose(2, 1) + + ntimestep = int(n_timesteps) + + t = torch.tensor(0.0, dtype=x.dtype, device=x.device) + d = torch.tensor(1.0 / ntimestep, dtype=x.dtype, device=x.device) + + t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t + d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d + + print( + "cfm input shapes:", + x.shape, + prompt_x.shape, + x_lens.shape, + t_tensor.shape, + d_tensor.shape, + mu.shape, + ) + + print("cfm input dtypes:", x.dtype, prompt_x.dtype, x_lens.dtype, t_tensor.dtype, d_tensor.dtype, mu.dtype) + + estimator: ExportDiT = torch.jit.trace( + cfm.estimator, + optimize=True, + example_inputs=(x, prompt_x, x_lens, t_tensor, d_tensor, mu), + ) + estimator.save("onnx/ad/estimator.pt") + # torch.onnx.export( + # cfm.estimator, + # (x, prompt_x, x_lens, t_tensor, d_tensor, mu), + # "onnx/ad/dit.onnx", + # input_names=["x", "prompt_x", "x_lens", "t", "d", "mu"], + # output_names=["output"], + # dynamic_axes={ + # "x": [2], + # "prompt_x": [2], + # "mu": [2], + # }, + # ) + print("save estimator ok") + cfm.estimator = estimator + export_cfm = torch.jit.script(e_cfm) + export_cfm.save("onnx/ad/cfm.pt") + # sovits.cfm = cfm + # cfm.save("onnx/ad/cfm.pt") + return export_cfm + + +def export(): + sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") + + init_bigvgan() + + dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") + raw_t2s = get_raw_t2s_model(dict_s1).to(device) + print("#### get_raw_t2s_model ####") + print(raw_t2s.config) + + if is_half: + raw_t2s = raw_t2s.half().to(device) + + t2s_m = T2SModel(raw_t2s) + t2s_m.eval() + script_t2s = torch.jit.script(t2s_m).to(device) + + hps = sovits.hps + ref_wav_path = "onnx/ad/ref.wav" + speed = 1.0 + sample_steps = 32 + dtype = torch.float16 if is_half == True else torch.float32 + refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ + "last_hidden_state" + ].transpose( + 1, 2 + ) # .float() + codes = sovits.vq_model.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + prompt = prompt_semantic.unsqueeze(0).to(device) + + phones1, bert1, norm_text1 = get_phones_and_bert( + "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" + ) + phones2, bert2, norm_text2 = get_phones_and_bert( + "这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", + "auto", + "v3", + ) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + + # codes = sovits.vq_model.extract_latent(ssl_content) + # prompt_semantic = codes[0, 0] + # prompts = prompt_semantic.unsqueeze(0) + + top_k = torch.LongTensor([15]).to(device) + print("topk", top_k) + + bert1 = bert1.T.to(device) + bert2 = bert2.T.to(device) + print( + prompt.dtype, + phoneme_ids0.dtype, + phoneme_ids1.dtype, + bert1.dtype, + bert2.dtype, + top_k.dtype, + ) + print( + prompt.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) + pred_semantic = t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + + ge = sovits.vq_model.create_ge(refer) + prompt_ = prompt.unsqueeze(0) + + torch._dynamo.mark_dynamic(prompt_, 2) + torch._dynamo.mark_dynamic(phoneme_ids0, 1) + + fea_ref = sovits.vq_model(prompt_, phoneme_ids0, ge) + + inputs = { + "forward": (prompt_, phoneme_ids0, ge), + "extract_latent": ssl_content, + "create_ge": refer, + } + + + trace_vq_model = torch.jit.trace_module( + sovits.vq_model, inputs, optimize=True + ) + trace_vq_model.save("onnx/ad/vq_model.pt") + + print(fea_ref.shape, fea_ref.dtype, ge.shape) + print(prompt_.shape, phoneme_ids0.shape, ge.shape) + + # vq_model = torch.jit.trace( + # sovits.vq_model, + # optimize=True, + # # strict=False, + # example_inputs=(prompt_, phoneme_ids0, ge), + # ) + # vq_model = sovits.vq_model + vq_model = trace_vq_model + + gpt_sovits_half = ExportGPTSovitsHalf(sovits.hps, script_t2s, trace_vq_model) + torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v3_half.pt") + + ref_audio, sr = torchaudio.load(ref_wav_path) + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: + ref_audio = ref_audio.mean(0).unsqueeze(0) + if sr != 24000: + ref_audio = resample(ref_audio, sr) + # mel2 = mel_fn(ref_audio) + mel2 = norm_spec(mel_fn(ref_audio)) + T_min = min(mel2.shape[2], fea_ref.shape[2]) + fea_ref = fea_ref[:, :, :T_min] + print("fea_ref:", fea_ref.shape, T_min) + if T_min > 468: + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + chunk_len = 934 - T_min + mel2 = mel2.to(dtype) + + # fea_todo, ge = sovits.vq_model(pred_semantic,y_lengths, phoneme_ids1, ge) + fea_todo = vq_model(pred_semantic, phoneme_ids1, ge) + + cfm_resss = [] + idx = 0 + sample_steps = torch.LongTensor([sample_steps]).to(device) + export_cfm_ = ExportCFM(sovits.cfm) + while 1: + print("idx:", idx) + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + + print( + "export_cfm:", + fea_ref.shape, + fea_todo_chunk.shape, + mel2.shape, + sample_steps.shape, + ) + if idx == 0: + fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + export_cfm_ = export_cfm( + export_cfm_, + fea, + torch.LongTensor([fea.size(1)]).to(fea.device), + mel2, + sample_steps, + ) + # torch.onnx.export( + # export_cfm_, + # ( + # fea_ref, + # fea_todo_chunk, + # mel2, + # sample_steps, + # ), + # "onnx/ad/cfm.onnx", + # input_names=["fea_ref", "fea_todo_chunk", "mel2", "sample_steps"], + # output_names=["cfm_res", "fea_ref_", "mel2_"], + # dynamic_axes={ + # "fea_ref": [2], + # "fea_todo_chunk": [2], + # "mel2": [2], + # }, + # ) + + idx += chunk_len + + cfm_res, fea_ref, mel2 = export_cfm_( + fea_ref, fea_todo_chunk, mel2, sample_steps + ) + cfm_resss.append(cfm_res) + continue + + cmf_res = torch.cat(cfm_resss, 2) + cmf_res = denorm_spec(cmf_res).to(device) + print("cmf_res:", cmf_res.shape, cmf_res.dtype) + with torch.inference_mode(): + cmf_res_rand = torch.randn(1, 100, 934).to(device).to(dtype) + torch._dynamo.mark_dynamic(cmf_res_rand, 2) + bigvgan_model_ = torch.jit.trace( + bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,) + ) + bigvgan_model_.save("onnx/ad/bigvgan_model.pt") + wav_gen = bigvgan_model(cmf_res) + print("wav_gen:", wav_gen.shape, wav_gen.dtype) + audio = wav_gen[0][0].cpu().detach().numpy() + + sr = 24000 + soundfile.write("out.export.wav", (audio * 32768).astype(np.int16), sr) + + +from datetime import datetime + + +def test_export( + todo_text, + gpt_sovits_v3_half, + cfm, + bigvgan, + output, +): + + # hps = sovits.hps + ref_wav_path = "onnx/ad/ref.wav" + speed = 1.0 + sample_steps = 8 + + dtype = torch.float16 if is_half == True else torch.float32 + + zero_wav = np.zeros( + int(16000 * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ + "last_hidden_state" + ].transpose( + 1, 2 + ) # .float() + + ref_audio_32k,_ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() + + phones1, bert1, norm_text1 = get_phones_and_bert( + "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" + ) + phones2, bert2, norm_text2 = get_phones_and_bert( + todo_text, + "zh", + "v3", + ) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert1 = bert1.T.to(device) + bert2 = bert2.T.to(device) + top_k = torch.LongTensor([15]).to(device) + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start inference %s", current_time) + print(ssl_content.shape, ref_audio_32k.shape, phoneme_ids0.shape, phoneme_ids1.shape, bert1.shape, bert2.shape, top_k.shape) + fea_ref, fea_todo, mel2 = gpt_sovits_v3_half(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + chunk_len = 934 - fea_ref.shape[2] + print(fea_ref.shape, fea_todo.shape, mel2.shape) + + cfm_resss = [] + sample_steps = torch.LongTensor([sample_steps]) + idx = 0 + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start cfm %s", current_time) + wav_gen_length = fea_todo.shape[2] * 256 + + while 1: + + current_time = datetime.now() + print("idx:", idx, current_time.strftime("%Y-%m-%d %H:%M:%S")) + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break + + complete_len = chunk_len - fea_todo_chunk.shape[-1] + if complete_len != 0: + fea_todo_chunk = torch.cat([fea_todo_chunk, torch.zeros(1, 512, complete_len).to(device).to(dtype)], 2) + + cfm_res, fea_ref, mel2 = cfm(fea_ref, fea_todo_chunk, mel2, sample_steps) + # if complete_len > 0 : + # cfm_res = cfm_res[:, :, :-complete_len] + # fea_ref = fea_ref[:, :, :-complete_len] + # mel2 = mel2[:, :, :-complete_len] + + idx += chunk_len + + current_time = datetime.now() + print("cfm end", current_time.strftime("%Y-%m-%d %H:%M:%S")) + cfm_res = denorm_spec(cfm_res).to(device) + bigvgan_res = bigvgan(cfm_res) + cfm_resss.append(bigvgan_res) + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start bigvgan %s", current_time) + wav_gen = torch.cat(cfm_resss, 2) + # cmf_res = denorm_spec(cmf_res) + # cmf_res = cmf_res.to(device) + # print("cmf_res:", cmf_res.shape) + + # cmf_res = torch.cat([cmf_res,torch.zeros([1,100,2000-cmf_res.size(2)],device=device,dtype=cmf_res.dtype)], 2) + + # wav_gen = bigvgan(cmf_res) + print("wav_gen:", wav_gen.shape, wav_gen.dtype) + wav_gen = wav_gen[:, :, :wav_gen_length] + + audio = wav_gen[0][0].cpu().detach().numpy() + logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + sr = 24000 + soundfile.write(output, (audio * 32768).astype(np.int16), sr) + + +def test_export1( + todo_text, + gpt_sovits_v3, + output, +): + + # hps = sovits.hps + ref_wav_path = "onnx/ad/ref.wav" + speed = 1.0 + sample_steps = torch.LongTensor([16]) + + dtype = torch.float16 if is_half == True else torch.float32 + + zero_wav = np.zeros( + int(24000 * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ + "last_hidden_state" + ].transpose( + 1, 2 + ) # .float() + print("ssl_content:", ssl_content.shape, ssl_content.dtype) + + ref_audio_32k,_ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() + + phones1, bert1, norm_text1 = get_phones_and_bert( + "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" + ) + phones2, bert2, norm_text2 = get_phones_and_bert( + todo_text, + "zh", + "v3", + ) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert1 = bert1.T.to(device) + bert2 = bert2.T.to(device) + top_k = torch.LongTensor([15]).to(device) + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info("start inference %s", current_time) + print(ssl_content.shape, ref_audio_32k.shape, phoneme_ids0.shape, phoneme_ids1.shape, bert1.shape, bert2.shape, top_k.shape) + wav_gen = gpt_sovits_v3(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps) + print("wav_gen:", wav_gen.shape, wav_gen.dtype) + + wav_gen = torch.cat([wav_gen,zero_wav_torch],0) + + audio = wav_gen.cpu().detach().numpy() + logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + sr = 24000 + soundfile.write(output, (audio * 32768).astype(np.int16), sr) + + +import time + + +def test_(): + + sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") + + # cfm = ExportCFM(sovits.cfm) + # cfm.cfm.estimator = dit + sovits.cfm = None + + cfm = torch.jit.load("onnx/ad/cfm.pt", map_location=device) + # cfm = torch.jit.optimize_for_inference(cfm) + cfm = cfm.half().to(device) + + cfm.eval() + + logger.info(f"cfm ok") + + dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") + # v2 的 gpt 也可以用 + # dict_s1 = torch.load("GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt") + raw_t2s = get_raw_t2s_model(dict_s1).to(device) + print("#### get_raw_t2s_model ####") + print(raw_t2s.config) + if is_half: + raw_t2s = raw_t2s.half().to(device) + t2s_m = T2SModel(raw_t2s).half().to(device) + t2s_m.eval() + t2s_m = torch.jit.script(t2s_m) + t2s_m.eval() + # t2s_m.top_k = 15 + logger.info(f"t2s_m ok") + + + vq_model: torch.jit.ScriptModule = torch.jit.load( + "onnx/ad/vq_model.pt", map_location=device + ) + # vq_model = torch.jit.optimize_for_inference(vq_model) + # vq_model = vq_model.half().to(device) + vq_model.eval() + # vq_model = sovits.vq_model + logger.info(f"vq_model ok") + + # gpt_sovits_v3_half = torch.jit.load("onnx/ad/gpt_sovits_v3_half.pt") + # gpt_sovits_v3_half = torch.jit.optimize_for_inference(gpt_sovits_v3_half) + # gpt_sovits_v3_half = gpt_sovits_v3_half.half() + # gpt_sovits_v3_half = gpt_sovits_v3_half.cuda() + # gpt_sovits_v3_half.eval() + gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model) + logger.info(f"gpt_sovits_v3_half ok") + + # init_bigvgan() + # global bigvgan_model + bigvgan_model = torch.jit.load("onnx/ad/bigvgan_model.pt") + # bigvgan_model = torch.jit.optimize_for_inference(bigvgan_model) + bigvgan_model = bigvgan_model.half() + bigvgan_model = bigvgan_model.cuda() + bigvgan_model.eval() + + logger.info(f"bigvgan ok") + + gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model) + gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3) + gpt_sovits_v3.save("onnx/ad/gpt_sovits_v3.pt") + gpt_sovits_v3 = gpt_sovits_v3.half().to(device) + gpt_sovits_v3.eval() + print("save gpt_sovits_v3 ok") + + time.sleep(5) + # print("thread:", torch.get_num_threads()) + # print("thread:", torch.get_num_interop_threads()) + # torch.set_num_interop_threads(1) + # torch.set_num_threads(1) + + test_export1( + "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", + gpt_sovits_v3, + "out.wav", + ) + + test_export1( + "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!", + gpt_sovits_v3, + "out2.wav", + ) + + # test_export( + # "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP. 哈哈哈...", + # gpt_sovits_v3_half, + # cfm, + # bigvgan_model, + # "out2.wav", + # ) + +def test_export_gpt_sovits_v3(): + gpt_sovits_v3 = torch.jit.load("onnx/ad/gpt_sovits_v3.pt",map_location=device) + # test_export1( + # "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", + # gpt_sovits_v3, + # "out3.wav", + # ) + # test_export1( + # "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!", + # gpt_sovits_v3, + # "out4.wav", + # ) + test_export1( + "风萧萧兮易水寒,壮士一去兮不复还.", + gpt_sovits_v3, + "out5.wav", + ) + + +with torch.no_grad(): + # export() + test_() + # test_export_gpt_sovits_v3() diff --git a/GPT_SoVITS/f5_tts/model/backbones/dit.py b/GPT_SoVITS/f5_tts/model/backbones/dit.py index 8546fc3..ac32fa5 100644 --- a/GPT_SoVITS/f5_tts/model/backbones/dit.py +++ b/GPT_SoVITS/f5_tts/model/backbones/dit.py @@ -138,7 +138,7 @@ class DiT(nn.Module): time: float["b"] | float[""], # time step # noqa: F821 F722 dt_base_bootstrap, text0, # : int["b nt"] # noqa: F722#####condition feature - use_grad_ckpt, # bool + use_grad_ckpt=False, # bool ###no-use drop_audio_cond=False, # cfg for cond audio drop_text=False, # cfg for text diff --git a/GPT_SoVITS/module/models_onnx.py b/GPT_SoVITS/module/models_onnx.py index abe2a3c..1c24056 100644 --- a/GPT_SoVITS/module/models_onnx.py +++ b/GPT_SoVITS/module/models_onnx.py @@ -9,6 +9,8 @@ from module import commons from module import modules from module import attentions_onnx as attentions +from f5_tts.model import DiT + from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from module.commons import init_weights, get_padding @@ -342,6 +344,37 @@ class PosteriorEncoder(nn.Module): return z, m, logs, x_mask +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + + def forward(self, x, x_lengths, g=None): + if(g!=None): + g = g.detach() + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + return stats, x_mask + class WNEncoder(nn.Module): def __init__( self, @@ -916,4 +949,175 @@ class SynthesizerTrn(nn.Module): def extract_latent(self, x): ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0, 1) \ No newline at end of file + return codes.transpose(0, 1) + +class CFM(torch.nn.Module): + def __init__( + self, + in_channels,dit + ): + super().__init__() + # self.sigma_min = 1e-6 + + self.estimator = dit + + self.in_channels = in_channels + + # self.criterion = torch.nn.MSELoss() + + def forward(self, mu:torch.Tensor, x_lens:torch.LongTensor, prompt:torch.Tensor, n_timesteps:torch.LongTensor, temperature:float=1.0): + """Forward diffusion""" + B, T = mu.size(0), mu.size(1) + x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) + + ntimesteps = int(n_timesteps) + + prompt_len = prompt.size(-1) + prompt_x = torch.zeros_like(x,dtype=mu.dtype) + prompt_x[..., :prompt_len] = prompt[..., :prompt_len] + x[..., :prompt_len] = 0.0 + mu=mu.transpose(2,1) + t = torch.tensor(0.0,dtype=x.dtype,device=x.device) + d = torch.tensor(1.0/ntimesteps,dtype=x.dtype,device=x.device) + d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d + + for j in range(ntimesteps): + t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t + # d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d + # v_pred = model(x, t_tensor, d_tensor, **extra_args) + v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu).transpose(2, 1) + # if inference_cfg_rate>1e-5: + # neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1) + # v_pred=v_pred+(v_pred-neg)*inference_cfg_rate + x = x + d * v_pred + t = t + d + x[:, :, :prompt_len] = 0.0 + return x + + +def set_no_grad(net_g): + for name, param in net_g.named_parameters(): + param.requires_grad=False + +@torch.jit.script_if_tracing +def compile_codes_length(codes): + y_lengths1 = torch.LongTensor([codes.size(2)]).to(codes.device) + return y_lengths1 * 2.5 * 1.5 + +@torch.jit.script_if_tracing +def compile_ref_length(refer): + refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) + return refer_lengths + +class SynthesizerTrnV3(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + version="v3", + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + self.version = version + + self.model_dim=512 + self.use_sdp = use_sdp + self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback + # self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, + # upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + # self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, + # gin_channels=gin_channels) + # self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + + ssl_dim = 768 + assert semantic_frame_rate in ['25hz', "50hz"] + self.semantic_frame_rate = semantic_frame_rate + if semantic_frame_rate == '25hz': + self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) + else: + self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) + + self.quantizer = ResidualVectorQuantizer( + dimension=ssl_dim, + n_q=1, + bins=1024 + ) + freeze_quantizer + inter_channels2=512 + self.bridge=nn.Sequential( + nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), + nn.LeakyReLU() + ) + self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) + self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) + self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim + if freeze_quantizer==True: + set_no_grad(self.ssl_proj) + set_no_grad(self.quantizer) + set_no_grad(self.enc_p) + + def create_ge(self, refer): + refer_lengths = compile_ref_length(refer) + refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) + ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) + return ge + + def forward(self, codes, text,ge,speed=1): + + y_lengths1=compile_codes_length(codes) + + quantized = self.quantizer.decode(codes) + if self.semantic_frame_rate == '25hz': + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge,speed) + fea=self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + ####more wn paramter to learn mel + fea, y_mask_ = self.wns1(fea, y_lengths1, ge) + return fea + + def extract_latent(self, x): + ssl = self.ssl_proj(x) + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) + return codes.transpose(0,1) \ No newline at end of file From fef65d40feee6fa5c72d2b6626ed3fbc75ab9192 Mon Sep 17 00:00:00 2001 From: lishq Date: Wed, 26 Mar 2025 15:03:36 +0800 Subject: [PATCH 10/14] fix: prevent concurrent access to BERT model with thread lock (#2165) Added thread lock to protect get_phones_and_bert method from potential race conditions during concurrent access. This addresses issue #1844 where multiple threads accessing the BERT model simultaneously could cause data inconsistency or crashes. Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> --- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 127 +++++++++--------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 653656a..0ebe553 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -1,5 +1,6 @@ import os, sys +import threading from tqdm import tqdm now_dir = os.getcwd() @@ -54,6 +55,7 @@ class TextPreprocessor: self.bert_model = bert_model self.tokenizer = tokenizer self.device = device + self.bert_lock = threading.RLock() def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v2")->List[Dict]: print(f'############ {i18n("切分文本")} ############') @@ -117,70 +119,71 @@ class TextPreprocessor: return self.get_phones_and_bert(text, language, version) def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False): - if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: - # language = language.replace("all_","") - formattext = text - while " " in formattext: - formattext = formattext.replace(" ", " ") - if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return self.get_phones_and_bert(formattext,"zh",version) - else: - phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) - bert = self.get_bert_feature(norm_text, word2ph).to(self.device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return self.get_phones_and_bert(formattext,"yue",version) - else: - phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) - bert = torch.zeros( - (1024, len(phones)), - dtype=torch.float32, - ).to(self.device) - elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] - if language == "auto": - for tmp in LangSegmenter.getTexts(text): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "auto_yue": - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "zh": - tmp["lang"] = "yue" - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - else: - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "en": - langlist.append(tmp["lang"]) - else: - # 因无法区别中日韩文汉字,以用户输入为准 - langlist.append(language) - textlist.append(tmp["text"]) - # print(textlist) - # print(langlist) - phones_list = [] - bert_list = [] - norm_text_list = [] - for i in range(len(textlist)): - lang = langlist[i] - phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version) - bert = self.get_bert_inf(phones, word2ph, norm_text, lang) - phones_list.append(phones) - norm_text_list.append(norm_text) - bert_list.append(bert) - bert = torch.cat(bert_list, dim=1) - phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + with self.bert_lock: + if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: + # language = language.replace("all_","") + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + if language == "all_zh": + if re.search(r'[A-Za-z]', formattext): + formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return self.get_phones_and_bert(formattext,"zh",version) + else: + phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) + bert = self.get_bert_feature(norm_text, word2ph).to(self.device) + elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): + formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return self.get_phones_and_bert(formattext,"yue",version) + else: + phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float32, + ).to(self.device) + elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: + textlist=[] + langlist=[] + if language == "auto": + for tmp in LangSegmenter.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + elif language == "auto_yue": + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "zh": + tmp["lang"] = "yue" + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日韩文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + # print(textlist) + # print(langlist) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version) + bert = self.get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = ''.join(norm_text_list) - if not final and len(phones) < 6: - return self.get_phones_and_bert("." + text,language,version,final=True) + if not final and len(phones) < 6: + return self.get_phones_and_bert("." + text,language,version,final=True) - return phones, bert, norm_text + return phones, bert, norm_text def get_bert_feature(self, text:str, word2ph:list)->torch.Tensor: From 13573a1b06515973de5de7435d5cc3881bc575d4 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:22:01 +0800 Subject: [PATCH 11/14] fix torch.load --- GPT_SoVITS/prepare_datasets/3-get-semantic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index dbffa0e..b213a8a 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -81,7 +81,7 @@ if os.path.exists(semantic_path) == False: # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) print( vq_model.load_state_dict( - torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False + torch.load(pretrained_s2G, map_location="cpu", weights_only=False)["weight"], strict=False ) ) From c0ce55a132472cac536e9e30f0bb1fa07bad8521 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:32:43 +0800 Subject: [PATCH 12/14] Update my_utils.py --- tools/my_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/my_utils.py b/tools/my_utils.py index cdb95e0..3369248 100644 --- a/tools/my_utils.py +++ b/tools/my_utils.py @@ -32,7 +32,7 @@ def clean_path(path_str:str): if path_str.endswith(('\\','/')): return clean_path(path_str[0:-1]) path_str = path_str.replace('/', os.sep).replace('\\', os.sep) - return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") + return path_str.strip(" \'\n\"\u202a")#path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False): From b65ea9181e7b6bf219be23dc3f624f67bf799c2f Mon Sep 17 00:00:00 2001 From: C3EZ <96614352+luckykevvv@users.noreply.github.com> Date: Wed, 26 Mar 2025 19:04:13 +1100 Subject: [PATCH 13/14] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=AF=B9amd=E6=98=BE?= =?UTF-8?q?=E5=8D=A1=E7=9A=84=E6=94=AF=E6=8C=81=20(#2076)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added the instruction for AMD GPU in English * Added the instruction for AMD GPU in Chinese * Update install.sh, now it will check wether user are using cuda or rocm * 恢复原来的readme,已经更新了install.sh * 恢复中文readme * 将n卡的判断条件由nvcc改成nvidia-smi --- install.sh | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/install.sh b/install.sh index 63be260..d4d7349 100644 --- a/install.sh +++ b/install.sh @@ -2,8 +2,13 @@ # 安装构建工具 # Install build tools +echo "Installing GCC..." conda install -c conda-forge gcc=14 + +echo "Installing G++..." conda install -c conda-forge gxx + +echo "Installing ffmpeg and cmake..." conda install ffmpeg cmake # 设置编译环境 @@ -12,10 +17,60 @@ export CMAKE_MAKE_PROGRAM="$CONDA_PREFIX/bin/cmake" export CC="$CONDA_PREFIX/bin/gcc" export CXX="$CONDA_PREFIX/bin/g++" -conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia +echo "Checking for CUDA installation..." +if command -v nvidia-smi &> /dev/null; then + USE_CUDA=true + echo "CUDA found." +else + echo "CUDA not found." + USE_CUDA=false +fi + + +if [ "$USE_CUDA" = false ]; then + echo "Checking for ROCm installation..." + if [ -d "/opt/rocm" ]; then + USE_ROCM=true + echo "ROCm found." + if grep -qi "microsoft" /proc/version; then + echo "You are running WSL." + IS_WSL=true + else + echo "You are NOT running WSL." + IS_WSL=false + fi + else + echo "ROCm not found." + USE_ROCM=false + fi +fi + +if [ "$USE_CUDA" = true ]; then + echo "Installing PyTorch with CUDA support..." + conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia +elif [ "$USE_ROCM" = true ] ; then + echo "Installing PyTorch with ROCm support..." + pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 +else + echo "Installing PyTorch for CPU..." + conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch +fi + + +echo "Installing Python dependencies from requirements.txt..." # 刷新环境 # Refresh environment hash -r +pip install -r requirements.txt + +if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then + echo "Update to WSL compatible runtime lib..." + location=`pip show torch | grep Location | awk -F ": " '{print $2}'` + cd ${location}/torch/lib/ + rm libhsa-runtime64.so* + cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so +fi + +echo "Installation completed successfully!" -pip install -r requirements.txt \ No newline at end of file From ee4a466f79b4f643251aa2f873f541f85df11d91 Mon Sep 17 00:00:00 2001 From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:39:19 +0800 Subject: [PATCH 14/14] Update patched_mha_with_cache.py --- .../AR/modules/patched_mha_with_cache.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py index 7be241d..cab6afe 100644 --- a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py @@ -12,33 +12,33 @@ import torch def multi_head_attention_forward_patched( - query: Tensor, - key: Tensor, - value: Tensor, - embed_dim_to_check: int, - num_heads: int, - in_proj_weight: Optional[Tensor], - in_proj_bias: Optional[Tensor], - bias_k: Optional[Tensor], - bias_v: Optional[Tensor], - add_zero_attn: bool, + query, + key, + value, + embed_dim_to_check, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, dropout_p: float, - out_proj_weight: Tensor, - out_proj_bias: Optional[Tensor], - training: bool = True, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - use_separate_proj_weight: bool = False, - q_proj_weight: Optional[Tensor] = None, - k_proj_weight: Optional[Tensor] = None, - v_proj_weight: Optional[Tensor] = None, - static_k: Optional[Tensor] = None, - static_v: Optional[Tensor] = None, - average_attn_weights: bool = True, - is_causal: bool = False, + out_proj_weight, + out_proj_bias, + training = True, + key_padding_mask = None, + need_weights = True, + attn_mask = None, + use_separate_proj_weight = False, + q_proj_weight = None, + k_proj_weight = None, + v_proj_weight = None, + static_k = None, + static_v = None, + average_attn_weights = True, + is_causal = False, cache=None, -) -> Tuple[Tensor, Optional[Tensor]]: +): r""" Args: query, key, value: map a query and a set of key-value pairs to an output.