feat: v1 and v2 both work for export

This commit is contained in:
zpeng11 2025-08-20 20:57:29 -04:00
parent bc7fe01876
commit 403c5bf320
2 changed files with 9 additions and 9 deletions

View File

@ -206,7 +206,7 @@ class T2SModel(nn.Module):
class VitsModel(nn.Module):
def __init__(self, vits_path, version:str = 'v2'):
super().__init__()
dict_s2 = torch.load(vits_path, map_location="cpu")
dict_s2 = torch.load(vits_path, map_location="cpu", weights_only=False)
self.hps = dict_s2["config"]
if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
self.hps["model"]["version"] = "v1"
@ -400,11 +400,11 @@ if __name__ == "__main__":
except:
pass
# gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
# vits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
# exp_path = "v1_export"
# version = "v1"
# export(vits_path, gpt_path, exp_path, version)
gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
vits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
exp_path = "v1_export"
version = "v1"
export(vits_path, gpt_path, exp_path, version)
gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
vits_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"

View File

@ -7,7 +7,7 @@ import torch
from TTS_infer_pack.TextPreprocessor_onnx import TextPreprocessorOnnx
MODEL_PATH = "onnx/v2proplus_export/v2proplus"
MODEL_PATH = "onnx/v1_export/v1"
def audio_postprocess(
audios,
@ -56,7 +56,7 @@ def audio_preprocess(audio_path):
def preprocess_text(text:str):
preprocessor = TextPreprocessorOnnx("playground/bert")
[phones, bert_features, norm_text] = preprocessor.segment_and_extract_feature_for_text(text, 'all_zh', 'v2')
[phones, bert_features, norm_text] = preprocessor.segment_and_extract_feature_for_text(text, 'all_zh', 'v1')
phones = np.expand_dims(np.array(phones, dtype=np.int64), axis=0)
return phones, bert_features.T.astype(np.float32)
@ -123,7 +123,7 @@ vtis = ort.InferenceSession(MODEL_PATH+"_export_vits.onnx")
"input_text_phones": input_phones,
"pred_semantic": pred_semantic,
"spectrum": spectrum.astype(np.float32),
"sv_emb": sv_emb.astype(np.float32)
# "sv_emb": sv_emb.astype(np.float32)
})
audio_postprocess([audio])