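"""UnivNet vocoder generator.

Defines the UnivNet waveform generator built from location-variable convolution
(LVC) blocks, with a multi-resolution STFT loss attached for training.
"""
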
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.nn.utils.parametrizations import weight_norm

from ..hparams import HParams
from .lvcnet import LVCBlock
from .mrstft import MRSTFTLoss


class UnivNet(nn.Module):
    @property
    def d_noise(self):
        # Channel dimension of the input noise sequence.
        return 128

    @property
    def strides(self):
        # Per-block strides; their running product gives each block's
        # conditioning hop length (see __init__).
        return [7, 5, 4, 3]

    @property
    def dilations(self):
        # Dilation rates used within each LVC block.
        return [1, 3, 9, 27]

    @property
    def nc(self):
        # Number of hidden channels of the generator.
        return self.hp.univnet_nc

    @property
    def scale_factor(self) -> int:
        # Waveform samples per acoustic frame (the hop size).
        return self.hp.hop_size

    def __init__(self, hp: HParams, d_input):
        super().__init__()
        self.d_input = d_input

        self.hp = hp

        self.blocks = nn.ModuleList(
            [
                LVCBlock(
                    self.nc,
                    d_input,
                    stride=stride,
                    dilations=self.dilations,
                    cond_hop_length=hop_length,
                    kpnet_conv_size=3,
                )
                for stride, hop_length in zip(self.strides, np.cumprod(self.strides))
            ]
        )
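        # Each block's cond_hop_length is the running product of the strides
        # (7, 35, 140, 420 with the defaults above), i.e. how many of that block's
        # output samples correspond to one conditioning frame; the full product is
        # assumed to equal hp.hop_size (see the trimming by scale_factor in forward).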

        self.conv_pre = weight_norm(nn.Conv1d(self.d_noise, self.nc, 7, padding=3, padding_mode="reflect"))

        self.conv_post = nn.Sequential(
            nn.LeakyReLU(0.2),
            weight_norm(nn.Conv1d(self.nc, 1, 7, padding=3, padding_mode="reflect")),
            nn.Tanh(),
        )

        self.mrstft = MRSTFTLoss(hp)

    @property
    def eps(self):
        return 1e-5

    def forward(self, x: Tensor, y: Tensor | None = None, npad=10):
        """
        Args:
            x: (b c t), acoustic features
            y: (b t), waveform
        Returns:
            z: (b t), waveform
        """
        assert x.ndim == 3, "x must be 3D tensor"
        assert y is None or y.ndim == 2, "y must be 2D tensor"
        assert x.shape[1] == self.d_input, f"x.shape[1] must be {self.d_input}, but got {x.shape}"
        assert npad >= 0, "npad must be positive or zero"

        # Zero-pad the conditioning features at the end, then synthesize from
        # frame-rate noise that the LVC blocks upsample to the waveform rate.
        x = F.pad(x, (0, npad), "constant", 0)
        z = torch.randn(x.shape[0], self.d_noise, x.shape[2]).to(x)
        z = self.conv_pre(z)  # (b c t)

        for block in self.blocks:
            z = block(z, x)  # (b c t)

        z = self.conv_post(z)  # (b 1 t)
        if npad > 0:
            # Drop the samples that correspond to the padded frames.
            z = z[..., : -self.scale_factor * npad]
        z = z.squeeze(1)  # (b t)

        if y is not None:
            # When a reference waveform is given, compute and cache the
            # multi-resolution STFT losses for the training step.
            self.losses = self.mrstft(z, y)

        return z
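

# Illustrative usage sketch, assuming an HParams instance that exposes
# `univnet_nc` and `hop_size`, and acoustic features with `d_input` channels
# (the 80 mel bins below are an assumption, not a value from this module):
#
#   hp = HParams(...)                     # hypothetical configuration
#   vocoder = UnivNet(hp, d_input=80)
#   mel = torch.randn(2, 80, 50)          # (b, c, t) acoustic features
#   wav = vocoder(mel)                    # (b, t') generated waveform
#   _ = vocoder(mel, y=reference_wav)     # training path: losses are cached
#   losses = vocoder.losses               #   on the module by forward()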