mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-10-05 22:20:01 +08:00
适配v4版本
This commit is contained in:
parent
30fdb60295
commit
0ba0ebcf65
@ -106,11 +106,12 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
|
|||||||
resample_transform_dict = {}
|
resample_transform_dict = {}
|
||||||
|
|
||||||
|
|
||||||
def resample(audio_tensor, sr0, device):
|
def resample(audio_tensor, sr0, sr1, device):
|
||||||
global resample_transform_dict
|
global resample_transform_dict
|
||||||
if sr0 not in resample_transform_dict:
|
key="%s-%s"%(sr0,sr1)
|
||||||
resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
|
if key not in resample_transform_dict:
|
||||||
return resample_transform_dict[sr0](audio_tensor)
|
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||||
|
return resample_transform_dict[key](audio_tensor)
|
||||||
|
|
||||||
|
|
||||||
class DictToAttrRecursive(dict):
|
class DictToAttrRecursive(dict):
|
||||||
@ -1372,9 +1373,10 @@ class TTS:
|
|||||||
if ref_audio.shape[0] == 2:
|
if ref_audio.shape[0] == 2:
|
||||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||||
|
|
||||||
tgt_sr = self.vocoder_configs["sr"]
|
# tgt_sr = self.vocoder_configs["sr"]
|
||||||
|
tgt_sr = 24000 if self.configs.version == "v3" else 32000
|
||||||
if ref_sr != tgt_sr:
|
if ref_sr != tgt_sr:
|
||||||
ref_audio = resample(ref_audio, ref_sr, self.configs.device)
|
ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device)
|
||||||
|
|
||||||
mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
|
mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
|
||||||
mel2 = norm_spec(mel2)
|
mel2 = norm_spec(mel2)
|
||||||
@ -1437,12 +1439,11 @@ class TTS:
|
|||||||
ref_audio = ref_audio.to(self.configs.device).float()
|
ref_audio = ref_audio.to(self.configs.device).float()
|
||||||
if ref_audio.shape[0] == 2:
|
if ref_audio.shape[0] == 2:
|
||||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||||
if ref_sr != 24000:
|
|
||||||
ref_audio = resample(ref_audio, ref_sr, self.configs.device)
|
# tgt_sr = self.vocoder_configs["sr"]
|
||||||
|
tgt_sr = 24000 if self.configs.version == "v3" else 32000
|
||||||
tgt_sr = self.vocoder_configs["sr"]
|
|
||||||
if ref_sr != tgt_sr:
|
if ref_sr != tgt_sr:
|
||||||
ref_audio = resample(ref_audio, ref_sr, self.configs.device)
|
ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device)
|
||||||
|
|
||||||
mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
|
mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio)
|
||||||
mel2 = norm_spec(mel2)
|
mel2 = norm_spec(mel2)
|
||||||
|
@ -3,9 +3,9 @@ custom:
|
|||||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||||
device: cuda
|
device: cuda
|
||||||
is_half: true
|
is_half: true
|
||||||
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
|
t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
|
||||||
version: v2
|
version: v3
|
||||||
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
|
vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
|
||||||
v1:
|
v1:
|
||||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||||
|
@ -397,7 +397,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
|||||||
minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True
|
minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True
|
||||||
)
|
)
|
||||||
sample_steps = gr.Radio(
|
sample_steps = gr.Radio(
|
||||||
label=i18n("采样步数(仅对V3生效)"), value=32, choices=[4, 8, 16, 32], visible=True
|
label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32], visible=True
|
||||||
)
|
)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
fragment_interval = gr.Slider(
|
fragment_interval = gr.Slider(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user