Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2025-09-29 00:30:15 +08:00

Commit 63cbb6efa7 (parent e8fdf472c0)

    verified audio, text, synthesizer all working; todo: dig into t2s for error
@@ -264,17 +264,9 @@ class GptSoVits(nn.Module):
 class AudioPreprocess(nn.Module):
     def __init__(self):
         super().__init__()
-        self.config = HubertConfig.from_pretrained(cnhubert_base_path)
-        self.config._attn_implementation = "eager"  # Use standard attention
-        self.config.apply_spec_augment = False  # Disable masking for inference
-        self.config.layerdrop = 0.0  # Disable layer dropout
-
         # Load the model
-        self.model = HubertModel.from_pretrained(
-            cnhubert_base_path,
-            config=self.config,
-            local_files_only=True
-        )
+        self.model = HubertModel.from_pretrained(cnhubert_base_path, local_files_only=True)
         self.model.eval()
 
         self.sv_model = SV("cpu", False)
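The explicit HubertConfig overrides are dropped here, so the checkpoint's saved config is used as-is. If the export later misbehaves (for example, the traced attention path), the removed overrides can be restored; a minimal sketch, assuming `cnhubert_base_path` points at a local chinese-hubert-base checkout:

```python
from transformers import HubertConfig, HubertModel

cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"  # assumed layout

config = HubertConfig.from_pretrained(cnhubert_base_path)
config._attn_implementation = "eager"  # standard attention traces more reliably than SDPA
config.apply_spec_augment = False      # no SpecAugment masking at inference
config.layerdrop = 0.0                 # keep every layer so the graph is deterministic

model = HubertModel.from_pretrained(cnhubert_base_path, config=config, local_files_only=True)
model.eval()
```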
@@ -292,7 +284,7 @@ class AudioPreprocess(nn.Module):
 
         sv_emb = self.sv_model.compute_embedding3_onnx(ref_audio_16k)
 
-        zero_tensor = torch.zeros((1, 4800), dtype=torch.float32)
+        zero_tensor = torch.zeros((1, 9600), dtype=torch.float32)
         ref_audio_16k = ref_audio_16k.unsqueeze(0)
         # concatenate zero_tensor with waveform
         ref_audio_16k = torch.cat([ref_audio_16k, zero_tensor], dim=1)
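The trailing zero pad doubles from 4800 to 9600 samples; at 16 kHz that is 0.3 s to 0.6 s of silence appended after the reference clip before the HuBERT pass. A minimal sketch of the padding arithmetic, with a stand-in reference tensor:

```python
import torch

SR = 16000
pad = torch.zeros((1, 9600), dtype=torch.float32)  # 9600 / 16000 = 0.6 s of silence

ref_audio_16k = torch.randn(SR * 3)                # stand-in 3 s mono reference at 16 kHz
ref_audio_16k = ref_audio_16k.unsqueeze(0)         # [T] -> [1, T], as in the code above
padded = torch.cat([ref_audio_16k, pad], dim=1)    # [1, 48000 + 9600]
print(padded.shape)                                # torch.Size([1, 57600])
```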
@@ -452,11 +444,12 @@ if __name__ == "__main__":
     # version = "v1"
     # export(vits_path, gpt_path, exp_path, version)
 
-    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
-    # vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
-    # exp_path = "v2_export"
-    # version = "v2"
-    # export(vits_path, gpt_path, exp_path, version)
+    gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
+    exp_path = "v2_export"
+    version = "v2"
+    export(vits_path, gpt_path, exp_path, version)
+    combineInitStepAndStageStep('onnx/v2_export/v2_export_t2s_init_step.onnx', 'onnx/v2_export/v2_export_t2s_sdec.onnx', 'onnx/v2_export/v2_export_t2s_combined.onnx')
 
     # gpt_path = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
     # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth"
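The v2 export is switched on, and the new `combineInitStepAndStageStep` call merges the prefill and decode graphs into one ONNX file. The file names follow `onnx/<exp_path>/<exp_path>_t2s_*.onnx`; a sketch of that convention with a hypothetical helper, assuming the script's own `combineInitStepAndStageStep` is in scope:

```python
import os

def t2s_onnx_paths(exp_path: str):
    # Naming convention inferred from the calls above (assumption, not a library API).
    base = os.path.join("onnx", exp_path, exp_path)
    return (base + "_t2s_init_step.onnx",
            base + "_t2s_sdec.onnx",
            base + "_t2s_combined.onnx")

init_path, sdec_path, combined_path = t2s_onnx_paths("v2_export")
combineInitStepAndStageStep(init_path, sdec_path, combined_path)  # same call as above
```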
@@ -464,11 +457,11 @@ if __name__ == "__main__":
     # version = "v2Pro"
     # export(vits_path, gpt_path, exp_path, version)
 
-    gpt_path = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
-    vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
-    exp_path = "v2proplus_export"
-    version = "v2ProPlus"
-    export(vits_path, gpt_path, exp_path, version)
-    combineInitStepAndStageStep('onnx/v2proplus_export/v2proplus_export_t2s_init_step.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_sdec.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_combined.onnx')
+    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
+    # exp_path = "v2proplus_export"
+    # version = "v2ProPlus"
+    # export(vits_path, gpt_path, exp_path, version)
+    # combineInitStepAndStageStep('onnx/v2proplus_export/v2proplus_export_t2s_init_step.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_sdec.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_combined.onnx')
 
 
@@ -7,7 +7,7 @@ import torch
 from TTS_infer_pack.TextPreprocessor_onnx import TextPreprocessorOnnx
 
 
-MODEL_PATH = "onnx/v2proplus_export/v2proplus"
+MODEL_PATH = "onnx/v2_export/v2"
 
 def audio_postprocess(
     audios,
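`MODEL_PATH` now points at the v2 export, matching the `exp_path = "v2_export"` set in the export script above: the prefix plus the suffixes used by the session constructors below resolves to the files the export wrote. A small sketch that fails early if the two scripts drift apart:

```python
import os

MODEL_PATH = "onnx/v2_export/v2"

# Suffixes match the InferenceSession constructors further down; the prefix must
# line up with the exp_path used at export time ("v2_export" -> "onnx/v2_export/v2_...").
for suffix in ("_export_t2s_combined.onnx", "_export_vits.onnx"):
    path = MODEL_PATH + suffix
    assert os.path.exists(path), f"missing export: {path}"  # fail before ORT load
```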
@@ -63,7 +63,7 @@ def preprocess_text(text:str):
 
 # input_phones_saved = np.load("playground/ref/input_phones.npy")
 # input_bert_saved = np.load("playground/ref/input_bert.npy").T.astype(np.float32)
-[input_phones, input_bert] = preprocess_text("地上的人儿吵吵闹闹在地上追")
+[input_phones, input_bert] = preprocess_text("天上的风筝在天上飞,地上的人儿在地上追。")
 
 
 # ref_phones = np.load("playground/ref/ref_phones.npy")
@@ -73,8 +73,12 @@ def preprocess_text(text:str):
 
 [audio_prompt_hubert, spectrum, sv_emb] = audio_preprocess("playground/ref/audio.wav")
 
+np.save("playground/ref/audio_prompt_hubert.npy", audio_prompt_hubert.astype(np.float16))
+
+# audio_prompt_hubert_saved = np.load("playground/ref/audio_prompt_hubert.npy").astype(np.float32)
 
 t2s_combined = ort.InferenceSession(MODEL_PATH+"_export_t2s_combined.onnx")
+# t2s_init_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_init_step.onnx")
 
 [y, k, v, y_emb, x_example, fake_logits, fake_samples] = t2s_combined.run(None, {
     "if_init_step": np.array(True, dtype=bool),
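The HuBERT features are now cached to disk as float16, with a commented-out load path for later runs. A round-trip sketch; the tolerance is an assumption about feature scale, meant only to illustrate that the fp16 quantization error is small:

```python
import numpy as np

feats = np.asarray(audio_prompt_hubert, dtype=np.float32)  # session output from above

# Save at half precision to halve the file size...
np.save("playground/ref/audio_prompt_hubert.npy", feats.astype(np.float16))

# ...and cast back to float32 on reload, since the t2s graph is fed fp32.
reloaded = np.load("playground/ref/audio_prompt_hubert.npy").astype(np.float32)
assert np.allclose(feats, reloaded, atol=1e-2)  # fp16 round-trip is lossy but close
```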
@@ -90,7 +94,7 @@ t2s_combined = ort.InferenceSession(MODEL_PATH+"_export_t2s_combined.onnx")
     "ix_example": np.empty((1, 0), dtype=np.float32)
 })
 
+# t2s_stage_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_sdec.onnx")
 
 for idx in tqdm(range(1, 1500)):
     # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N]
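The commented-out `t2s_sdec` session shows the pre-merge flow; with the combined graph, each later step reruns the same session with `if_init_step=False`, feeding back the KV cache from the previous step. A sketch of one stage step, assuming the input names mirror the outputs with an `i` prefix (as `ix_example` suggests); only `if_init_step` and `ix_example` are confirmed by this script:

```python
import numpy as np

# Hypothetical stage step: names other than "if_init_step"/"ix_example" are
# assumptions, and the real call must feed the full input set shown in the
# init step above.
y, k, v, y_emb, x_example, logits, samples = t2s_combined.run(None, {
    "if_init_step": np.array(False, dtype=bool),
    "iy": y, "ik": k, "iv": v, "iy_emb": y_emb,
    "ix_example": x_example,
})
# On the stage path logits/samples are live (the init step returns placeholders,
# hence "fake_logits"/"fake_samples"), so a stop test would check samples
# against the EOS id before looping.
```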
@@ -113,13 +117,6 @@ y[0, -1] = 0
 
 
 pred_semantic = np.expand_dims(y[:, -idx:], axis=0)
-# Read and resample reference audio
-waveform, sample_rate = torchaudio.load("playground/ref/audio.wav")
-if sample_rate != 32000:
-    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=32000)
-    waveform = resampler(waveform)
-print(f"Waveform shape: {waveform.shape}")
-ref_audio = waveform.numpy().astype(np.float32)
 
 vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
 
@@ -127,7 +124,7 @@ vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
     "input_text_phones": input_phones,
     "pred_semantic": pred_semantic,
     "spectrum": spectrum.astype(np.float32),
-    "sv_emb": sv_emb.astype(np.float32)
+    # "sv_emb": sv_emb.astype(np.float32)
 })
 
 audio_postprocess([audio])
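`sv_emb` is commented out here because the plain v2 synthesizer graph, unlike the v2Pro/v2ProPlus exports, declares no speaker-verification input. Rather than toggling the line per export, the feed could be keyed off the graph's declared inputs via onnxruntime's `get_inputs()`; a sketch assuming the variables from this script and a single audio output, as the postprocess call implies:

```python
import numpy as np
import onnxruntime as ort

vits = ort.InferenceSession(MODEL_PATH + "_export_vits.onnx")

feeds = {
    "input_text_phones": input_phones,
    "pred_semantic": pred_semantic,
    "spectrum": spectrum.astype(np.float32),
}
# Only exports whose graph declares sv_emb get it; plain v2 skips it.
if any(inp.name == "sv_emb" for inp in vits.get_inputs()):
    feeds["sv_emb"] = sv_emb.astype(np.float32)

[audio] = vits.run(None, feeds)
audio_postprocess([audio])
```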