Update ONNX export input names

2025-12-17 01:59:08 +08:00 · 2025-08-20 20:21:42 -04:00 · 2025-08-20 20:21:42 -04:00 · bb529e7e4a
commit bb529e7e4a
parent 4e0cc57052
2 changed files with 13 additions and 14 deletions
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@@ -157,14 +157,14 @@ class T2SModel(nn.Module):
            self.init_step,
            (ref_seq, text_seq, ref_bert, text_bert, ssl_content),
            f"onnx/{project_name}/{project_name}_t2s_init_step.onnx",
-            input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"],
+            input_names=["ref_text_phones", "input_text_phones", "ref_text_bert", "input_text_bert", "hubert_ssl_content"],
            output_names=["y", "k", "v", "y_emb", "x_example"],
            dynamic_axes={
-                "ref_seq": {1: "ref_length"},
-                "text_seq": {1: "text_length"},
-                "ref_bert": {0: "ref_length"},
-                "text_bert": {0: "text_length"},
-                "ssl_content": {2: "ssl_length"},
+                "ref_text_phones": {1: "ref_length"},
+                "input_text_phones": {1: "text_length"},
+                "ref_text_bert": {0: "ref_length"},
+                "input_text_bert": {0: "text_length"},
+                "hubert_ssl_content": {2: "ssl_length"},
            },
            opset_version=16,
        )
@@ -254,9 +254,8 @@ class GptSoVits(nn.Module):
            input_names=["text_seq", "pred_semantic", "spectrum", "sv_emb"],
            output_names=["audio"],
            dynamic_axes={
-                "text_seq": {1: "text_length"},
+                "input_text_phones": {1: "text_length"},
                "pred_semantic": {2: "pred_length"},
-                "ref_audio": {1: "audio_length"},
                "spectrum": {2: "spectrum_length"},
            },
            opset_version=17,
--- a/playground/freerun.py
+++ b/playground/freerun.py
@@ -77,11 +77,11 @@ def preprocess_text(text:str):
 init_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_init_step.onnx")

 [y, k, v, y_emb, x_example] = init_step.run(None, {
-    "text_seq": input_phones,
-    "text_bert": input_bert,
-    "ref_seq": ref_phones,
-    "ref_bert": ref_bert,
-    "ssl_content": audio_prompt_hubert
+    "input_text_phones": input_phones,
+    "input_text_bert": input_bert,
+    "ref_text_phones": ref_phones,
+    "ref_text_bert": ref_bert,
+    "hubert_ssl_content": audio_prompt_hubert
 })

 # fsdec = ort.InferenceSession(MODEL_PATH+"_export_t2s_fsdec.onnx")
@@ -120,7 +120,7 @@ ref_audio = waveform.numpy().astype(np.float32)
 vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")

 [audio] = vtis.run(None, {
-    "text_seq": input_phones,
+    "input_text_phones": input_phones,
    "pred_semantic": pred_semantic,
    "spectrum": spectrum.astype(np.float32),
    "sv_emb": sv_emb.astype(np.float32)