feat: v1 and v2 both work for export

2025-09-29 00:30:15 +08:00 · 2025-08-20 20:57:29 -04:00 · 2025-08-20 20:57:29 -04:00 · 403c5bf320
commit 403c5bf320
parent bc7fe01876
2 changed files with 9 additions and 9 deletions
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@@ -206,7 +206,7 @@ class T2SModel(nn.Module):
 class VitsModel(nn.Module):
    def __init__(self, vits_path, version:str = 'v2'):
        super().__init__()
-        dict_s2 = torch.load(vits_path, map_location="cpu")
+        dict_s2 = torch.load(vits_path, map_location="cpu", weights_only=False)
        self.hps = dict_s2["config"]
        if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
            self.hps["model"]["version"] = "v1"
@@ -400,11 +400,11 @@ if __name__ == "__main__":
    except:
        pass

-    # gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-    # vits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
-    # exp_path = "v1_export"
-    # version = "v1"
-    # export(vits_path, gpt_path, exp_path, version)
+    gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+    vits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+    exp_path = "v1_export"
+    version = "v1"
+    export(vits_path, gpt_path, exp_path, version)

    gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
    vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
--- a/playground/freerun.py
+++ b/playground/freerun.py
@@ -7,7 +7,7 @@ import torch
 from TTS_infer_pack.TextPreprocessor_onnx import TextPreprocessorOnnx


-MODEL_PATH = "onnx/v2proplus_export/v2proplus"
+MODEL_PATH = "onnx/v1_export/v1"

 def audio_postprocess(
    audios,
@@ -56,7 +56,7 @@ def audio_preprocess(audio_path):

 def preprocess_text(text:str):
    preprocessor = TextPreprocessorOnnx("playground/bert")
-    [phones, bert_features, norm_text] = preprocessor.segment_and_extract_feature_for_text(text, 'all_zh', 'v2')
+    [phones, bert_features, norm_text] = preprocessor.segment_and_extract_feature_for_text(text, 'all_zh', 'v1')
    phones = np.expand_dims(np.array(phones, dtype=np.int64), axis=0)
    return phones, bert_features.T.astype(np.float32)

@@ -123,7 +123,7 @@ vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
    "input_text_phones": input_phones,
    "pred_semantic": pred_semantic,
    "spectrum": spectrum.astype(np.float32),
-    "sv_emb": sv_emb.astype(np.float32)
+    # "sv_emb": sv_emb.astype(np.float32)
 })

 audio_postprocess([audio])