diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py
index 45d32a02..8718a815 100644
--- a/GPT_SoVITS/onnx_export.py
+++ b/GPT_SoVITS/onnx_export.py
@@ -206,7 +206,7 @@ class T2SModel(nn.Module):
 class VitsModel(nn.Module):
     def __init__(self, vits_path, version:str = 'v2'):
         super().__init__()
-        dict_s2 = torch.load(vits_path, map_location="cpu")
+        dict_s2 = torch.load(vits_path, map_location="cpu", weights_only=False)
         self.hps = dict_s2["config"]
         if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
             self.hps["model"]["version"] = "v1"
@@ -400,11 +400,11 @@ if __name__ == "__main__":
     except:
         pass

-    # gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-    # vits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
-    # exp_path = "v1_export"
-    # version = "v1"
-    # export(vits_path, gpt_path, exp_path, version)
+    gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+    vits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+    exp_path = "v1_export"
+    version = "v1"
+    export(vits_path, gpt_path, exp_path, version)

     gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
     vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
diff --git a/playground/freerun.py b/playground/freerun.py
index 25494ef3..5733f253 100644
--- a/playground/freerun.py
+++ b/playground/freerun.py
@@ -7,7 +7,7 @@ import torch

 from TTS_infer_pack.TextPreprocessor_onnx import TextPreprocessorOnnx

-MODEL_PATH = "onnx/v2proplus_export/v2proplus"
+MODEL_PATH = "onnx/v1_export/v1"

 def audio_postprocess(
     audios,
@@ -56,7 +56,7 @@ def audio_preprocess(audio_path):

 def preprocess_text(text:str):
     preprocessor = TextPreprocessorOnnx("playground/bert")
-    [phones, bert_features, norm_text] = preprocessor.segment_and_extract_feature_for_text(text, 'all_zh', 'v2')
+    [phones, bert_features, norm_text] = preprocessor.segment_and_extract_feature_for_text(text, 'all_zh', 'v1')
     phones = np.expand_dims(np.array(phones, dtype=np.int64), axis=0)
     return phones, bert_features.T.astype(np.float32)

@@ -123,7 +123,7 @@ vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
     "input_text_phones": input_phones,
     "pred_semantic": pred_semantic,
     "spectrum": spectrum.astype(np.float32),
-    "sv_emb": sv_emb.astype(np.float32)
+    # "sv_emb": sv_emb.astype(np.float32)
 })

 audio_postprocess([audio])
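Note on the weights_only change in onnx_export.py: PyTorch 2.6 flipped torch.load's default to weights_only=True, which unpickles only an allowlisted set of types, so checkpoints that pickle a plain Python object alongside the weights now fail to load without the explicit opt-out. A minimal sketch of the failure mode and the fix, assuming a hypothetical demo.pth; the Namespace config is a stand-in for whatever non-tensor object a real checkpoint carries:

    import argparse
    import torch

    # Hypothetical checkpoint that pickles a plain Python object next to
    # the weights, as older training scripts commonly did.
    torch.save({"config": argparse.Namespace(sr=32000), "weight": {}}, "demo.pth")

    # Under PyTorch >= 2.6 the default weights_only=True rejects the
    # Namespace and raises pickle.UnpicklingError.
    try:
        torch.load("demo.pth", map_location="cpu")
    except Exception as exc:
        print(f"default load failed: {exc!r}")

    # The opt-out used in the diff; acceptable only for checkpoints from a
    # trusted source, since full unpickling can execute arbitrary code.
    dict_s2 = torch.load("demo.pth", map_location="cpu", weights_only=False)
    print(dict_s2["config"].sr)

A narrower alternative, when the offending types are known, is to allowlist them with torch.serialization.add_safe_globals([argparse.Namespace]) and keep weights_only=True.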
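Note on the sv_emb line in playground/freerun.py: the diff comments the entry out when MODEL_PATH switches to the v1 export, presumably because only the v2Pro-family graphs declare a speaker-verification embedding input. Rather than toggling the line by hand per model version, the feed dict could be built against the inputs the loaded graph actually declares. A sketch, reusing the session and arrays from freerun.py (the [audio] unpacking of vtis.run is an assumption about the surrounding code):

    # Feed sv_emb only when the exported VITS graph declares that input,
    # so the same script serves v1/v2 and v2Pro/v2ProPlus exports.
    vits_inputs = {
        "input_text_phones": input_phones,
        "pred_semantic": pred_semantic,
        "spectrum": spectrum.astype(np.float32),
    }
    if "sv_emb" in {node.name for node in vtis.get_inputs()}:
        vits_inputs["sv_emb"] = sv_emb.astype(np.float32)
    [audio] = vtis.run(None, vits_inputs)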