From 63cbb6efa761a633d971c2af66a9e4bf60da3f31 Mon Sep 17 00:00:00 2001
From: zpeng11
Date: Fri, 22 Aug 2025 01:36:58 -0400
Subject: [PATCH] verified audio, text, and synthesizer all working; TODO: dig into t2s for error

---
 GPT_SoVITS/onnx_export.py | 35 ++++++++++++++---------------------
 playground/freerun.py     | 19 ++++++++-----------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py
index c0b535e2..87ea5e70 100644
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@@ -264,17 +264,9 @@ class GptSoVits(nn.Module):
 class AudioPreprocess(nn.Module):
     def __init__(self):
         super().__init__()
-        self.config = HubertConfig.from_pretrained(cnhubert_base_path)
-        self.config._attn_implementation = "eager"  # Use standard attention
-        self.config.apply_spec_augment = False  # Disable masking for inference
-        self.config.layerdrop = 0.0  # Disable layer dropout
 
         # Load the model
-        self.model = HubertModel.from_pretrained(
-            cnhubert_base_path,
-            config=self.config,
-            local_files_only=True
-        )
+        self.model = HubertModel.from_pretrained(cnhubert_base_path, local_files_only=True)
         self.model.eval()
 
         self.sv_model = SV("cpu", False)
@@ -292,7 +284,7 @@ class AudioPreprocess(nn.Module):
 
         sv_emb = self.sv_model.compute_embedding3_onnx(ref_audio_16k)
 
-        zero_tensor = torch.zeros((1, 4800), dtype=torch.float32)
+        zero_tensor = torch.zeros((1, 9600), dtype=torch.float32)
         ref_audio_16k = ref_audio_16k.unsqueeze(0)
         # concate zero_tensor with waveform
         ref_audio_16k = torch.cat([ref_audio_16k, zero_tensor], dim=1)
@@ -452,11 +444,12 @@ if __name__ == "__main__":
     # version = "v1"
     # export(vits_path, gpt_path, exp_path, version)
 
-    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
-    # vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
-    # exp_path = "v2_export"
-    # version = "v2"
-    # export(vits_path, gpt_path, exp_path, version)
+    gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
+    exp_path = "v2_export"
+    version = "v2"
+    export(vits_path, gpt_path, exp_path, version)
+    combineInitStepAndStageStep('onnx/v2_export/v2_export_t2s_init_step.onnx', 'onnx/v2_export/v2_export_t2s_sdec.onnx', 'onnx/v2_export/v2_export_t2s_combined.onnx')
 
     # gpt_path = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
     # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth"
@@ -464,11 +457,11 @@ if __name__ == "__main__":
     # version = "v2Pro"
     # export(vits_path, gpt_path, exp_path, version)
 
-    gpt_path = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
-    vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
-    exp_path = "v2proplus_export"
-    version = "v2ProPlus"
-    export(vits_path, gpt_path, exp_path, version)
-    combineInitStepAndStageStep('onnx/v2proplus_export/v2proplus_export_t2s_init_step.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_sdec.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_combined.onnx')
+    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
+    # exp_path = "v2proplus_export"
+    # version = "v2ProPlus"
+    # export(vits_path, gpt_path, exp_path, version)
+    # combineInitStepAndStageStep('onnx/v2proplus_export/v2proplus_export_t2s_init_step.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_sdec.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_combined.onnx')
diff --git a/playground/freerun.py b/playground/freerun.py
index ffd80028..f914e6dc 100644
--- a/playground/freerun.py
+++ b/playground/freerun.py
@@ -7,7 +7,7 @@ import torch
 
 from TTS_infer_pack.TextPreprocessor_onnx import TextPreprocessorOnnx
 
-MODEL_PATH = "onnx/v2proplus_export/v2proplus"
+MODEL_PATH = "onnx/v2_export/v2"
 
 def audio_postprocess(
     audios,
@@ -63,7 +63,7 @@ def preprocess_text(text:str):
 
 # input_phones_saved = np.load("playground/ref/input_phones.npy")
 # input_bert_saved = np.load("playground/ref/input_bert.npy").T.astype(np.float32)
-[input_phones, input_bert] = preprocess_text("地上的人儿吵吵闹闹在地上追")
+[input_phones, input_bert] = preprocess_text("天上的风筝在天上飞,地上的人儿在地上追。")
 
 
 # ref_phones = np.load("playground/ref/ref_phones.npy")
@@ -73,8 +73,12 @@ def preprocess_text(text:str):
 
 [audio_prompt_hubert, spectrum, sv_emb] = audio_preprocess("playground/ref/audio.wav")
 
+np.save("playground/ref/audio_prompt_hubert.npy", audio_prompt_hubert.astype(np.float16))
+
+# audio_prompt_hubert_saved = np.load("playground/ref/audio_prompt_hubert.npy").astype(np.float32)
 
 t2s_combined = ort.InferenceSession(MODEL_PATH+"_export_t2s_combined.onnx")
+# t2s_init_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_init_step.onnx")
 
 [y, k, v, y_emb, x_example, fake_logits, fake_samples] = t2s_combined.run(None, {
     "if_init_step": np.array(True, dtype=bool),
@@ -90,7 +94,7 @@ t2s_combined = ort.InferenceSession(MODEL_PATH+"_export_t2s_combined.onnx")
     "ix_example":np.empty((1, 0), dtype=np.float32)
 })
 
-
+# t2s_stage_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_sdec.onnx")
 
 for idx in tqdm(range(1, 1500)):
     # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N]
@@ -113,13 +117,6 @@
 y[0, -1] = 0
 
 pred_semantic = np.expand_dims(y[:, -idx:], axis=0)
 
-# Read and resample reference audio
-waveform, sample_rate = torchaudio.load("playground/ref/audio.wav")
-if sample_rate != 32000:
-    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=32000)
-    waveform = resampler(waveform)
-print(f"Waveform shape: {waveform.shape}")
-ref_audio = waveform.numpy().astype(np.float32)
 
 vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
 
@@ -127,7 +124,7 @@ vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
     "input_text_phones": input_phones,
     "pred_semantic": pred_semantic,
     "spectrum": spectrum.astype(np.float32),
-    "sv_emb": sv_emb.astype(np.float32)
+    # "sv_emb": sv_emb.astype(np.float32)
 })
 
 audio_postprocess([audio])
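Note on the 4800 -> 9600 change in AudioPreprocess: the zero tail is concatenated onto the 16 kHz reference inside AudioPreprocess.forward, so 9600 samples doubles the trailing silence from 0.3 s to 0.6 s. A minimal sketch of just those tensor ops, assuming a stand-in 3-second mono clip in place of playground/ref/audio.wav:

import torch

# Stand-in for the loaded reference audio: 3 s of 16 kHz mono
# (hypothetical; the real script reads playground/ref/audio.wav).
ref_audio_16k = torch.randn(16000 * 3, dtype=torch.float32)

# The padding steps from AudioPreprocess.forward after this patch:
zero_tensor = torch.zeros((1, 9600), dtype=torch.float32)       # 9600 / 16000 Hz = 0.6 s of silence
ref_audio_16k = ref_audio_16k.unsqueeze(0)                      # [T] -> [1, T]
ref_audio_16k = torch.cat([ref_audio_16k, zero_tensor], dim=1)  # [1, T + 9600]
print(ref_audio_16k.shape)                                      # torch.Size([1, 57600])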