mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-09-29 00:30:15 +08:00
feat: v2pro and v2proplus onnx export works, todo: solve data dependencies
This commit is contained in:
parent 610b36561a
commit 4e42a28f9c
.gitignore (vendored): 1 change
@@ -194,3 +194,4 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 onnx/
+*.onnx
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 import torchaudio
 from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule
 from feature_extractor import cnhubert
@@ -39,6 +40,27 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
     return spec

+def resample_audio(audio: torch.Tensor, orig_sr: int, target_sr: int) -> torch.Tensor:
+    """
+    Resample audio from orig_sr to target_sr using linear interpolation.
+    audio: (batch, channels, samples) or (channels, samples) or (samples,)
+    """
+    if audio.dim() == 1:
+        audio = audio.unsqueeze(0).unsqueeze(0)
+    elif audio.dim() == 2:
+        audio = audio.unsqueeze(0)
+    # audio shape: (batch, channels, samples)
+    batch, channels, samples = audio.shape
+    new_samples = int(samples * target_sr / orig_sr)
+    audio = audio.view(batch * channels, 1, samples)
+    resampled = F.interpolate(audio, size=new_samples, mode='linear', align_corners=False)
+    resampled = resampled.view(batch, channels, new_samples)
+    if resampled.shape[0] == 1 and resampled.shape[1] == 1:
+        resampled = resampled.squeeze(0).squeeze(0)
+    elif resampled.shape[0] == 1:
+        resampled = resampled.squeeze(0)
+    return resampled
+

 class DictToAttrRecursive(dict):
     def __init__(self, input_dict):
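
For context, a quick way to sanity-check the new linear-interpolation resampler is to compare it against torchaudio's resampler. This is a minimal sketch, not part of the commit: the test tone and the assumption that resample_audio from the hunk above is in scope are mine.

import torch
import torchaudio

# Hypothetical check: linear-interpolation resampler vs. torchaudio's resampler on a 1 s test tone.
sr_in, sr_out = 32000, 16000
t = torch.arange(sr_in, dtype=torch.float32) / sr_in
tone = torch.sin(2 * torch.pi * 220.0 * t)

lin = resample_audio(tone, sr_in, sr_out)                  # (16000,)
ref = torchaudio.functional.resample(tone, sr_in, sr_out)  # (16000,)

print(lin.shape, ref.shape)
print((lin - ref).abs().max())  # small for low-frequency content; grows near Nyquist
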
@@ -225,7 +247,7 @@ class VitsModel(nn.Module):
             center=False,
         )
         if self.sv_model is not None:
-            sv_emb=self.sv_model.compute_embedding3_onnx(ref_audio)
+            sv_emb=self.sv_model.compute_embedding3_onnx(resample_audio(ref_audio, 32000, 16000))
             return self.vq_model(pred_semantic, text_seq, refer, sv_emb=sv_emb)[0, 0]
         return self.vq_model(pred_semantic, text_seq, refer)[0, 0]

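
My reading of this change (not stated in the commit message): the reference audio fed to the model is 32 kHz, while the speaker-verification embedding model consumes 16 kHz audio, so the embedding is now computed on a downsampled copy. Note that the helper squeezes a (1, T) input down to 1-D; a tiny illustrative trace, with a hypothetical 3-second reference and assuming resample_audio is in scope:

import torch

ref_audio = torch.randn(1, 32000 * 3)                # assumed (1, T) reference at 32 kHz
audio_16k = resample_audio(ref_audio, 32000, 16000)  # helper returns a squeezed 1-D tensor here
print(ref_audio.shape, audio_16k.shape)              # torch.Size([1, 96000]) torch.Size([48000])
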
@@ -292,7 +314,7 @@ def export(vits_path, gpt_path, project_name, voice_model_version="v2"):
                     "y",
                     "e4",
                 ],
-                version=voice_model_version,
+                version='v2',
             )
         ]
     )
@@ -325,7 +347,7 @@ def export(vits_path, gpt_path, project_name, voice_model_version="v2"):
                     "y",
                     "e4",
                 ],
-                version=voice_model_version,
+                version='v2',
             )
         ]
     )
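
A guess at the rationale, since the commit does not say: v2Pro and v2ProPlus reuse the v2 phoneme inventory, so both phoneme-to-id conversions above are pinned to 'v2' instead of receiving voice_model_version (which would be 'v2Pro' or 'v2ProPlus'). A hypothetical, purely illustrative guard with the same effect:

# Illustrative only: map Pro variants onto the v2 symbol set before text conversion.
text_version = "v2" if voice_model_version.startswith("v2") else voice_model_version
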
@@ -380,6 +402,11 @@ if __name__ == "__main__":
     # version = "v2"
     # export(vits_path, gpt_path, exp_path, version)

+    # gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
+    # vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth"
+    # exp_path = "v2pro_export"
+    # version = "v2Pro"
+
     gpt_path = "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
     vits_path = "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth"
     exp_path = "v2proplus_export"
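
The hunk ends before the actual call; presumably the script goes on to invoke export() with these paths. A hypothetical invocation matching the signature shown in the hunk headers (the 'v2ProPlus' version string is my assumption and is not visible in the diff):

export(vits_path, gpt_path, exp_path, voice_model_version="v2ProPlus")  # assumed call, not shown in the hunk
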
BIN playground/audio_prompt_hubert.npy (new binary file, not shown)
playground/freerun.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import onnxruntime as ort
import numpy as np
import onnx
from tqdm import tqdm
import torchaudio
import torch

MODEL_PATH = "playground/v2proplus_export/v2proplus"


def audio_postprocess(
    audios,
    fragment_interval: float = 0.3,
):
    zero_wav = np.zeros((int(32000 * fragment_interval),)).astype(np.float32)
    for i, audio in enumerate(audios):
        max_audio = np.abs(audio).max()  # simple guard against 16-bit clipping
        if max_audio > 1:
            audio /= max_audio
        audio = np.concatenate([audio, zero_wav], axis=0)
        audios[i] = audio

    audio = np.concatenate(audios, axis=0)

    # audio = (audio * 32768).astype(np.int16)

    audio_tensor = torch.from_numpy(audio).unsqueeze(0)

    torchaudio.save('playground/output.wav', audio_tensor, 32000)

    return audio

input_phones = np.load("playground/input_phones.npy")
input_bert = np.load("playground/input_bert.npy").T.astype(np.float32)
ref_phones = np.load("playground/ref_phones.npy")
ref_bert = np.load("playground/ref_bert.npy").T.astype(np.float32)
audio_prompt_hubert = np.load("playground/audio_prompt_hubert.npy").astype(np.float32)


# T2S encoder: encodes target/reference phonemes, BERT features and the HuBERT
# content of the reference audio into x and the semantic prompt tokens.
encoder = ort.InferenceSession(MODEL_PATH + "_export_t2s_encoder.onnx")

outputs = encoder.run(None, {
    "text_seq": input_phones,
    "text_bert": input_bert,
    "ref_seq": ref_phones,
    "ref_bert": ref_bert,
    "ssl_content": audio_prompt_hubert,
})

print(outputs[0].shape, outputs[1].shape)

x = outputs[0]
prompts = outputs[1]

fsdec = ort.InferenceSession(MODEL_PATH + "_export_t2s_fsdec.onnx")
sdec = ort.InferenceSession(MODEL_PATH + "_export_t2s_sdec.onnx")

# First decoding step: builds the initial token sequence and KV cache from x and the prompts.
# for i in tqdm(range(10000)):
[y, k, v, y_emb, x_example] = fsdec.run(None, {
    "x": x,
    "prompts": prompts,
})

early_stop_num = -1
prefix_len = prompts.shape[1]

# Autoregressive loop over the step decoder; token id 1024 acts as the stop token.
stop = False
for idx in tqdm(range(1, 1500)):
    # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N]
    [y, k, v, y_emb, logits, samples] = sdec.run(None, {
        "iy": y,
        "ik": k,
        "iv": v,
        "iy_emb": y_emb,
        "ix_example": x_example,
    })
    if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
        stop = True
    if np.argmax(logits, axis=-1)[0] == 1024 or samples[0, 0] == 1024:
        stop = True
    if stop:
        break
y[0, -1] = 0


pred_semantic = np.expand_dims(y[:, -idx:], axis=0)
# Read and resample reference audio
waveform, sample_rate = torchaudio.load("playground/ref/audio.wav")
if sample_rate != 32000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=32000)
    waveform = resampler(waveform)
ref_audio = waveform.numpy().astype(np.float32)

# VITS stage: synthesizes the waveform from the predicted semantic tokens,
# the target phonemes and the 32 kHz reference audio.
vtis = ort.InferenceSession(MODEL_PATH + "_export_vits.onnx")

[audio] = vtis.run(None, {
    "text_seq": input_phones,
    "pred_semantic": pred_semantic,
    "ref_audio": ref_audio,
})
print(audio.shape, audio.dtype, audio.min(), audio.max())

audio_postprocess([audio])
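
Since the commit message flags unresolved data dependencies, a practical first debugging step with freerun.py is to dump what each exported session actually expects before wiring up the .npy inputs. This is a sketch using only standard onnxruntime calls; the session chosen at the bottom is just an example and MODEL_PATH is assumed to come from the script above.

import onnxruntime as ort

def describe(onnx_path: str) -> None:
    # Print every input/output name, shape, and dtype the exported graph declares.
    sess = ort.InferenceSession(onnx_path)
    for arg in sess.get_inputs():
        print("input :", arg.name, arg.shape, arg.type)
    for arg in sess.get_outputs():
        print("output:", arg.name, arg.shape, arg.type)

describe(MODEL_PATH + "_export_t2s_encoder.onnx")  # e.g. text_seq, text_bert, ref_seq, ref_bert, ssl_content
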
BIN playground/input_bert.npy (new binary file, not shown)
BIN playground/input_phones.npy (new binary file, not shown)
BIN playground/output.wav (new binary file, not shown)
BIN playground/ref/audio.wav (new binary file, not shown)
BIN playground/ref_bert.npy (new binary file, not shown)
BIN playground/ref_phones.npy (new binary file, not shown)