import torch.nn.functional as F
from torch import nn


class PreactResBlock(nn.Sequential):
    """Pre-activation residual block: two (GroupNorm -> GELU -> 3x3 Conv) stages
    wrapped in an identity shortcut, so the output shape equals the input shape."""

    def __init__(self, dim):
        super().__init__(
            nn.GroupNorm(dim // 16, dim),
            nn.GELU(),
            nn.Conv2d(dim, dim, 3, padding=1),
            nn.GroupNorm(dim // 16, dim),
            nn.GELU(),
            nn.Conv2d(dim, dim, 3, padding=1),
        )

    def forward(self, x):
        # Residual connection around the pre-activation stack.
        return x + super().forward(x)
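
# A minimal shape sanity check for PreactResBlock (an illustrative sketch, not part
# of the training code; `dim` must be >= 16 so `dim // 16` yields a valid GroupNorm
# group count):
#
#   import torch
#   block = PreactResBlock(32)
#   y = block(torch.randn(1, 32, 64, 64))
#   assert y.shape == (1, 32, 64, 64)  # the residual shortcut preserves shape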


class UNetBlock(nn.Module):
    def __init__(self, input_dim, output_dim=None, scale_factor=1.0):
        super().__init__()
        if output_dim is None:
            output_dim = input_dim
        self.pre_conv = nn.Conv2d(input_dim, output_dim, 3, padding=1)
        self.res_block1 = PreactResBlock(output_dim)
        self.res_block2 = PreactResBlock(output_dim)
        self.downsample = self.upsample = nn.Identity()
        if scale_factor > 1:
            self.upsample = nn.Upsample(scale_factor=scale_factor)
        elif scale_factor < 1:
            # nn.Upsample with scale_factor < 1 performs nearest-neighbor downsampling.
            self.downsample = nn.Upsample(scale_factor=scale_factor)

    def forward(self, x, h=None):
        """
        Args:
            x: (b c h w), output of the previous block
            h: (b c h w), optional skip connection, added after upsampling
        Returns:
            o: (b c h' w'), output (downsampled when scale_factor < 1)
            s: (b c h w), skip output at the pre-downsample resolution
        """
        x = self.upsample(x)
        if h is not None:
            assert x.shape == h.shape, f"{x.shape} != {h.shape}"
            x = x + h
        x = self.pre_conv(x)
        x = self.res_block1(x)
        x = self.res_block2(x)
        return self.downsample(x), x
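
# Illustrative use of the (output, skip) return convention (a sketch with assumed
# example shapes, not part of the original file): an encoder block (scale_factor=0.5)
# downsamples only its first return value, keeping the second at full resolution for
# the decoder's skip connection.
#
#   import torch
#   enc = UNetBlock(input_dim=16, output_dim=32, scale_factor=0.5)
#   o, s = enc(torch.randn(1, 16, 64, 64))
#   # o: (1, 32, 32, 32), fed to the next (deeper) block
#   # s: (1, 32, 64, 64), saved for the matching decoder block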


class UNet(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=16, num_blocks=4, num_middle_blocks=2):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_proj = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        # Encoder: channel width doubles and spatial resolution halves at each block.
        self.encoder_blocks = nn.ModuleList(
            [
                UNetBlock(input_dim=hidden_dim * 2**i, output_dim=hidden_dim * 2 ** (i + 1), scale_factor=0.5)
                for i in range(num_blocks)
            ]
        )
        self.middle_blocks = nn.ModuleList(
            [UNetBlock(input_dim=hidden_dim * 2**num_blocks) for _ in range(num_middle_blocks)]
        )
        # Decoder: mirror of the encoder, halving channels and doubling resolution.
        self.decoder_blocks = nn.ModuleList(
            [
                UNetBlock(input_dim=hidden_dim * 2 ** (i + 1), output_dim=hidden_dim * 2**i, scale_factor=2)
                for i in reversed(range(num_blocks))
            ]
        )
        self.head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.GELU(),
            nn.Conv2d(hidden_dim, output_dim, 1),
        )

    @property
    def scale_factor(self):
        # Total downsampling factor across the encoder (2x per block).
        return 2 ** len(self.encoder_blocks)
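
    # With the defaults (hidden_dim=16, num_blocks=4), encoder channels run
    # 16 -> 32 -> 64 -> 128 -> 256 while H and W halve at each block; the middle
    # blocks stay at 256 channels, and the decoder mirrors the encoder back down
    # to 16 channels before `head` projects to `output_dim`.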

    def pad_to_fit(self, x):
        """
        Args:
            x: (b c h w), input
        Returns:
            x: (b c h' w'), input padded so h' and w' are multiples of scale_factor
        """
        hpad = (self.scale_factor - x.shape[2] % self.scale_factor) % self.scale_factor
        wpad = (self.scale_factor - x.shape[3] % self.scale_factor) % self.scale_factor
        return F.pad(x, (0, wpad, 0, hpad))
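
    # Worked example of the padding arithmetic: with the default num_blocks=4 the
    # scale_factor is 16, so an input of height 510 gets hpad = (16 - 510 % 16) % 16 = 2
    # (since 510 % 16 = 14), padding it to 512 so four 0.5x downsamples divide evenly;
    # the trailing `% scale_factor` makes hpad 0 when the size is already a multiple of 16.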

    def forward(self, x):
        """
        Args:
            x: (b c h w), input
        Returns:
            o: (b output_dim h w), output cropped back to the input's spatial size
        """
        shape = x.shape

        x = self.pad_to_fit(x)
        x = self.input_proj(x)

        # Encoder: collect the skip output at each resolution.
        s_list = []
        for block in self.encoder_blocks:
            x, s = block(x)
            s_list.append(s)

        for block in self.middle_blocks:
            x, _ = block(x)

        # Decoder: consume the skips in reverse order (deepest first).
        for block, s in zip(self.decoder_blocks, reversed(s_list)):
            x, _ = block(x, s)

        x = self.head(x)
        # Crop away the padding added by pad_to_fit.
        x = x[..., : shape[2], : shape[3]]

        return x

    def test(self, shape=(3, 512, 256)):
        # Optional complexity report; requires the third-party `ptflops` package.
        import ptflops

        macs, params = ptflops.get_model_complexity_info(
            self,
            shape,
            as_strings=True,
            print_per_layer_stat=True,
            verbose=True,
        )

        print(f"macs: {macs}")
        print(f"params: {params}")


def main():
    model = UNet(3, 3)
    model.test()


if __name__ == "__main__":
    main()
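
# A lighter smoke test that skips the optional ptflops dependency (a sketch; the
# input sizes are arbitrary and chosen to exercise pad_to_fit's crop-back path):
#
#   import torch
#   model = UNet(3, 3)
#   y = model(torch.randn(1, 3, 500, 250))  # 500 and 250 are not multiples of 16
#   assert y.shape == (1, 3, 500, 250)      # padding is cropped away on output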