From 63cbb6efa761a633d971c2af66a9e4bf60da3f31 Mon Sep 17 00:00:00 2001
From: zpeng11
Date: Fri, 22 Aug 2025 01:36:58 -0400
Subject: [PATCH] verified audio, text, and synthesizer all working; TODO: dig into t2s for error

---
 GPT_SoVITS/onnx_export.py | 35 ++++++++++++++---------------------
 playground/freerun.py     | 19 ++++++++-----------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py
index c0b535e2..87ea5e70 100644
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@@ -264,17 +264,9 @@ class GptSoVits(nn.Module):
 class AudioPreprocess(nn.Module):
     def __init__(self):
         super().__init__()
-        self.config = HubertConfig.from_pretrained(cnhubert_base_path)
-        self.config._attn_implementation = "eager"  # Use standard attention
-        self.config.apply_spec_augment = False  # Disable masking for inference
-        self.config.layerdrop = 0.0  # Disable layer dropout
 
         # Load the model
-        self.model = HubertModel.from_pretrained(
-            cnhubert_base_path,
-            config=self.config,
-            local_files_only=True
-        )
+        self.model = HubertModel.from_pretrained(cnhubert_base_path, local_files_only=True)
         self.model.eval()
 
         self.sv_model = SV("cpu", False)
@@ -292,7 +284,7 @@ class AudioPreprocess(nn.Module):
 
         sv_emb = self.sv_model.compute_embedding3_onnx(ref_audio_16k)
 
-        zero_tensor = torch.zeros((1, 4800), dtype=torch.float32)
+        zero_tensor = torch.zeros((1, 9600), dtype=torch.float32)
         ref_audio_16k = ref_audio_16k.unsqueeze(0)
         # concate zero_tensor with waveform
         ref_audio_16k = torch.cat([ref_audio_16k, zero_tensor], dim=1)
@@ -452,11 +444,12 @@ if __name__ == "__main__":
     # version = "v1"
     # export(vits_path, gpt_path, exp_path, version)
 
-    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
-    # vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
-    # exp_path = "v2_export"
-    # version = "v2"
-    # export(vits_path, gpt_path, exp_path, version)
+    gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
+    exp_path = "v2_export"
+    version = "v2"
+    export(vits_path, gpt_path, exp_path, version)
+    combineInitStepAndStageStep('onnx/v2_export/v2_export_t2s_init_step.onnx', 'onnx/v2_export/v2_export_t2s_sdec.onnx', 'onnx/v2_export/v2_export_t2s_combined.onnx')
 
     # gpt_path = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
     # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth"
@@ -464,11 +457,11 @@ if __name__ == "__main__":
     # version = "v2Pro"
     # export(vits_path, gpt_path, exp_path, version)
 
-    gpt_path = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
-    vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
-    exp_path = "v2proplus_export"
-    version = "v2ProPlus"
-    export(vits_path, gpt_path, exp_path, version)
-    combineInitStepAndStageStep('onnx/v2proplus_export/v2proplus_export_t2s_init_step.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_sdec.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_combined.onnx')
+    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
+    # exp_path = "v2proplus_export"
+    # version = "v2ProPlus"
+    # export(vits_path, gpt_path, exp_path, version)
+    # combineInitStepAndStageStep('onnx/v2proplus_export/v2proplus_export_t2s_init_step.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_sdec.onnx', 'onnx/v2proplus_export/v2proplus_export_t2s_combined.onnx')
diff --git a/playground/freerun.py b/playground/freerun.py
index ffd80028..f914e6dc 100644
--- a/playground/freerun.py
+++ b/playground/freerun.py
@@ -7,7 +7,7 @@ import torch
 
 from TTS_infer_pack.TextPreprocessor_onnx import TextPreprocessorOnnx
 
-MODEL_PATH = "onnx/v2proplus_export/v2proplus"
+MODEL_PATH = "onnx/v2_export/v2"
 
 def audio_postprocess(
     audios,
@@ -63,7 +63,7 @@ def preprocess_text(text:str):
 
 # input_phones_saved = np.load("playground/ref/input_phones.npy")
 # input_bert_saved = np.load("playground/ref/input_bert.npy").T.astype(np.float32)
-[input_phones, input_bert] = preprocess_text("地上的人儿吵吵闹闹在地上追")
+[input_phones, input_bert] = preprocess_text("天上的风筝在天上飞,地上的人儿在地上追。")
 
 
 # ref_phones = np.load("playground/ref/ref_phones.npy")
@@ -73,8 +73,12 @@ def preprocess_text(text:str):
 
 [audio_prompt_hubert, spectrum, sv_emb] = audio_preprocess("playground/ref/audio.wav")
 
+np.save("playground/ref/audio_prompt_hubert.npy", audio_prompt_hubert.astype(np.float16))
+
+# audio_prompt_hubert_saved = np.load("playground/ref/audio_prompt_hubert.npy").astype(np.float32)
 
 t2s_combined = ort.InferenceSession(MODEL_PATH+"_export_t2s_combined.onnx")
+# t2s_init_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_init_step.onnx")
 
 [y, k, v, y_emb, x_example, fake_logits, fake_samples] = t2s_combined.run(None, {
     "if_init_step": np.array(True, dtype=bool),
@@ -90,7 +94,7 @@ t2s_combined = ort.InferenceSession(MODEL_PATH+"_export_t2s_combined.onnx")
     "ix_example":np.empty((1, 0), dtype=np.float32)
 })
 
-
+# t2s_stage_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_sdec.onnx")
 
 for idx in tqdm(range(1, 1500)):
     # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N]
@@ -113,13 +117,6 @@
 y[0, -1] = 0
 
 pred_semantic = np.expand_dims(y[:, -idx:], axis=0)
 
-# Read and resample reference audio
-waveform, sample_rate = torchaudio.load("playground/ref/audio.wav")
-if sample_rate != 32000:
-    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=32000)
-    waveform = resampler(waveform)
-print(f"Waveform shape: {waveform.shape}")
-ref_audio = waveform.numpy().astype(np.float32)
 
 vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
 
@@ -127,7 +124,7 @@ vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
     "input_text_phones": input_phones,
     "pred_semantic": pred_semantic,
     "spectrum": spectrum.astype(np.float32),
-    "sv_emb": sv_emb.astype(np.float32)
+    # "sv_emb": sv_emb.astype(np.float32)
 })
 
 audio_postprocess([audio])
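Note on the 4800 -> 9600 change in AudioPreprocess: the zero tail is concatenated onto the 16 kHz reference inside AudioPreprocess.forward, so 9600 samples doubles the trailing silence from 0.3 s to 0.6 s. A minimal sketch of just those tensor ops, assuming a stand-in 3-second mono clip in place of playground/ref/audio.wav:

import torch

# Stand-in for the loaded reference audio: 3 s of 16 kHz mono
# (hypothetical; the real script reads playground/ref/audio.wav).
ref_audio_16k = torch.randn(16000 * 3, dtype=torch.float32)

# The padding steps from AudioPreprocess.forward after this patch:
zero_tensor = torch.zeros((1, 9600), dtype=torch.float32)       # 9600 / 16000 Hz = 0.6 s of silence
ref_audio_16k = ref_audio_16k.unsqueeze(0)                      # [T] -> [1, T]
ref_audio_16k = torch.cat([ref_audio_16k, zero_tensor], dim=1)  # [1, T + 9600]
print(ref_audio_16k.shape)                                      # torch.Size([1, 57600])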