From bb529e7e4a1bebfb4d4cdf9b96cb6f5623ac5967 Mon Sep 17 00:00:00 2001
From: zpeng11
Date: Wed, 20 Aug 2025 20:21:42 -0400
Subject: [PATCH] update namings

Rename the ONNX graph inputs so the exported models and the playground
script agree on one naming scheme:

  t2s init_step export (T2SModel):
    ref_seq     -> ref_text_phones
    text_seq    -> input_text_phones
    ref_bert    -> ref_text_bert
    text_bert   -> input_text_bert
    ssl_content -> hubert_ssl_content

  VITS export (GptSoVits):
    text_seq    -> input_text_phones (in input_names AND dynamic_axes)
    drop the "ref_audio" dynamic_axes entry; it is not one of the
    names listed in this export's input_names.

playground/freerun.py feeds the sessions using the new names.

---
NOTE(review):
 - The VITS hunk now also renames "text_seq" inside input_names. Before
   this fix only the dynamic_axes key was renamed, so the key matched no
   graph input while freerun.py fed "input_text_phones" to the session.
 - This copy of the patch had its line breaks and indentation collapsed.
   Leading whitespace and the blank context lines in the freerun.py
   hunks were reconstructed - confirm against the tree before applying
   (or apply with --ignore-whitespace).
 - The commit id in the first line and the post-image blob id on the
   onnx_export.py "index" line still describe the original commit.

 GPT_SoVITS/onnx_export.py | 17 ++++++++---------
 playground/freerun.py     | 12 ++++++------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py
index 3a940f65..f0e685e7 100644
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@@ -157,14 +157,14 @@ class T2SModel(nn.Module):
             self.init_step,
             (ref_seq, text_seq, ref_bert, text_bert, ssl_content),
             f"onnx/{project_name}/{project_name}_t2s_init_step.onnx",
-            input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"],
+            input_names=["ref_text_phones", "input_text_phones", "ref_text_bert", "input_text_bert", "hubert_ssl_content"],
             output_names=["y", "k", "v", "y_emb", "x_example"],
             dynamic_axes={
-                "ref_seq": {1: "ref_length"},
-                "text_seq": {1: "text_length"},
-                "ref_bert": {0: "ref_length"},
-                "text_bert": {0: "text_length"},
-                "ssl_content": {2: "ssl_length"},
+                "ref_text_phones": {1: "ref_length"},
+                "input_text_phones": {1: "text_length"},
+                "ref_text_bert": {0: "ref_length"},
+                "input_text_bert": {0: "text_length"},
+                "hubert_ssl_content": {2: "ssl_length"},
             },
             opset_version=16,
         )
@@ -254,9 +254,8 @@ class GptSoVits(nn.Module):
-            input_names=["text_seq", "pred_semantic", "spectrum", "sv_emb"],
+            input_names=["input_text_phones", "pred_semantic", "spectrum", "sv_emb"],
             output_names=["audio"],
             dynamic_axes={
-                "text_seq": {1: "text_length"},
+                "input_text_phones": {1: "text_length"},
                 "pred_semantic": {2: "pred_length"},
-                "ref_audio": {1: "audio_length"},
                 "spectrum": {2: "spectrum_length"},
             },
             opset_version=17,
diff --git a/playground/freerun.py b/playground/freerun.py
index f169fd99..25494ef3 100644
--- a/playground/freerun.py
+++ b/playground/freerun.py
@@ -77,11 +77,11 @@ def preprocess_text(text:str):
 
 init_step = ort.InferenceSession(MODEL_PATH+"_export_t2s_init_step.onnx")
 [y, k, v, y_emb, x_example] = init_step.run(None, {
-    "text_seq": input_phones,
-    "text_bert": input_bert,
-    "ref_seq": ref_phones,
-    "ref_bert": ref_bert,
-    "ssl_content": audio_prompt_hubert
+    "input_text_phones": input_phones,
+    "input_text_bert": input_bert,
+    "ref_text_phones": ref_phones,
+    "ref_text_bert": ref_bert,
+    "hubert_ssl_content": audio_prompt_hubert
 })
 
 # fsdec = ort.InferenceSession(MODEL_PATH+"_export_t2s_fsdec.onnx")
@@ -120,7 +120,7 @@ ref_audio = waveform.numpy().astype(np.float32)
 
 vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
 [audio] = vtis.run(None, {
-    "text_seq": input_phones,
+    "input_text_phones": input_phones,
     "pred_semantic": pred_semantic,
     "spectrum": spectrum.astype(np.float32),
     "sv_emb": sv_emb.astype(np.float32)