From 0ba0ebcf6534d12b0c63b981249d62657cc1eb7c Mon Sep 17 00:00:00 2001 From: ChasonJiang <1440499136@qq.com> Date: Mon, 21 Apr 2025 22:10:39 +0800 Subject: [PATCH] =?UTF-8?q?=E9=80=82=E9=85=8Dv4=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/TTS_infer_pack/TTS.py | 23 ++++++++++++----------- GPT_SoVITS/configs/tts_infer.yaml | 6 +++--- GPT_SoVITS/inference_webui_fast.py | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 19b01116..13cc513c 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -106,11 +106,12 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int): resample_transform_dict = {} -def resample(audio_tensor, sr0, device): +def resample(audio_tensor, sr0, sr1, device): global resample_transform_dict - if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) - return resample_transform_dict[sr0](audio_tensor) + key="%s-%s"%(sr0,sr1) + if key not in resample_transform_dict: + resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) + return resample_transform_dict[key](audio_tensor) class DictToAttrRecursive(dict): @@ -1372,9 +1373,10 @@ class TTS: if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - tgt_sr = self.vocoder_configs["sr"] + # tgt_sr = self.vocoder_configs["sr"] + tgt_sr = 24000 if self.configs.version == "v3" else 32000 if ref_sr != tgt_sr: - ref_audio = resample(ref_audio, ref_sr, self.configs.device) + ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device) mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel2) @@ -1437,12 +1439,11 @@ class TTS: ref_audio = ref_audio.to(self.configs.device).float() if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if ref_sr != 24000: - ref_audio = resample(ref_audio, ref_sr, self.configs.device) - - tgt_sr = self.vocoder_configs["sr"] + + # tgt_sr = self.vocoder_configs["sr"] + tgt_sr = 24000 if self.configs.version == "v3" else 32000 if ref_sr != tgt_sr: - ref_audio = resample(ref_audio, ref_sr, self.configs.device) + ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device) mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel2) diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml index 20c41a20..85e8f9d7 100644 --- a/GPT_SoVITS/configs/tts_infer.yaml +++ b/GPT_SoVITS/configs/tts_infer.yaml @@ -3,9 +3,9 @@ custom: cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cuda is_half: true - t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt - version: v2 - vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth + t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt + version: v3 + vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth v1: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 99cd44d0..547b1e04 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -397,7 +397,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True ) sample_steps = gr.Radio( - label=i18n("采样步数(仅对V3生效)"), value=32, choices=[4, 8, 16, 32], visible=True + label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32], visible=True ) with gr.Row(): fragment_interval = gr.Slider(