Adapt to the v4 version (适配v4版本)

ChasonJiang 2025-04-21 22:10:39 +08:00
parent 30fdb60295
commit 0ba0ebcf65
3 changed files with 16 additions and 15 deletions

View File

@@ -106,11 +106,12 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
 resample_transform_dict = {}
-def resample(audio_tensor, sr0, device):
+def resample(audio_tensor, sr0, sr1, device):
     global resample_transform_dict
-    if sr0 not in resample_transform_dict:
-        resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
-    return resample_transform_dict[sr0](audio_tensor)
+    key="%s-%s"%(sr0,sr1)
+    if key not in resample_transform_dict:
+        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
+    return resample_transform_dict[key](audio_tensor)


 class DictToAttrRecursive(dict):
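To see the new caching behaviour in isolation: the cache is now keyed by the (sr0, sr1) pair instead of the source rate alone, so one process can hold transforms for both the 24 kHz (v3) and 32 kHz (v4) targets. A minimal, self-contained sketch of the changed helper follows; the demo tensor and printed shape are illustrative and not part of the commit.

import torch
import torchaudio

resample_transform_dict = {}

def resample(audio_tensor, sr0, sr1, device):
    # Cache one Resample transform per (source, target) pair, not per source rate only.
    global resample_transform_dict
    key = "%s-%s" % (sr0, sr1)
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)

# Illustrative call: one second of 48 kHz audio resampled to the 32 kHz v4 target.
audio = torch.zeros(1, 48000)
print(resample(audio, 48000, 32000, "cpu").shape)  # torch.Size([1, 32000])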
@@ -1372,9 +1373,10 @@ class TTS:
         if ref_audio.shape[0] == 2:
             ref_audio = ref_audio.mean(0).unsqueeze(0)
-        tgt_sr = self.vocoder_configs["sr"]
+        # tgt_sr = self.vocoder_configs["sr"]
+        tgt_sr = 24000 if self.configs.version == "v3" else 32000
         if ref_sr != tgt_sr:
-            ref_audio = resample(ref_audio, ref_sr, self.configs.device)
+            ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device)
         mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
         mel2 = norm_spec(mel2)
@@ -1437,12 +1439,11 @@ class TTS:
         ref_audio = ref_audio.to(self.configs.device).float()
         if ref_audio.shape[0] == 2:
             ref_audio = ref_audio.mean(0).unsqueeze(0)
-        if ref_sr != 24000:
-            ref_audio = resample(ref_audio, ref_sr, self.configs.device)
-        tgt_sr = self.vocoder_configs["sr"]
+        # tgt_sr = self.vocoder_configs["sr"]
+        tgt_sr = 24000 if self.configs.version == "v3" else 32000
         if ref_sr != tgt_sr:
-            ref_audio = resample(ref_audio, ref_sr, self.configs.device)
+            ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device)
         mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
         mel2 = norm_spec(mel2)

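Both hunks above replace the lookup of the vocoder sample rate with a hard-coded target: 24 kHz for the v3 path and 32 kHz otherwise (i.e. v4), and the reference audio is resampled to that target before mel extraction. A tiny sketch of the selection logic; the helper name pick_tgt_sr is hypothetical and does not appear in the commit.

def pick_tgt_sr(version: str) -> int:
    # v3 synthesizes through a 24 kHz mel/vocoder path; the v4 path runs at 32 kHz.
    return 24000 if version == "v3" else 32000

assert pick_tgt_sr("v3") == 24000
assert pick_tgt_sr("v4") == 32000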
View File

@@ -3,9 +3,9 @@ custom:
   cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
   device: cuda
   is_half: true
-  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
-  version: v2
-  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+  t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
+  version: v3
+  vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
 v1:
   bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
   cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base

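The config change above switches the custom preset from the v2 pretrained weights to the v3 ones (s1v3.ckpt / s2Gv3.pth) and bumps the version field accordingly. A minimal sketch of reading that preset with PyYAML; the filename used here is an assumption, since the diff view does not show the file path.

import yaml

with open("tts_infer.yaml", "r", encoding="utf-8") as f:  # assumed filename; not shown in the diff view
    cfg = yaml.safe_load(f)

custom = cfg["custom"]
print(custom["version"])            # "v3" after this commit
print(custom["t2s_weights_path"])   # GPT_SoVITS/pretrained_models/s1v3.ckpt
print(custom["vits_weights_path"])  # GPT_SoVITS/pretrained_models/s2Gv3.pth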
View File

@@ -397,7 +397,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                     minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True
                 )
                 sample_steps = gr.Radio(
-                    label=i18n("采样步数(仅对V3生效)"), value=32, choices=[4, 8, 16, 32], visible=True
+                    label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32], visible=True
                 )
             with gr.Row():
                 fragment_interval = gr.Slider(
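The WebUI hunk only widens the label of the sampling-steps radio from "仅对V3生效" (only effective for V3) to "仅对V3/4生效" (only effective for V3/4). A stripped-down sketch of that control outside the project's layout, with a plain string standing in for the i18n call:

import gradio as gr

with gr.Blocks(title="GPT-SoVITS WebUI") as app:
    sample_steps = gr.Radio(
        label="采样步数(仅对V3/4生效)",  # "sampling steps (only effective for V3/4)"
        value=32,
        choices=[4, 8, 16, 32],
        visible=True,
    )

# app.launch()  # uncomment to try the control locally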