# CogVideo/sat/sgm/modules/autoencoding/magvit2_pytorch.py

import copy
from pathlib import Path
from math import log2, ceil, sqrt
from functools import wraps, partial

import torch
import torch.nn.functional as F
from torch.cuda.amp import autocast
from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
from torch.autograd import grad as torch_grad

import torchvision
from torchvision.models import VGG16_Weights

from collections import namedtuple

# from vector_quantize_pytorch import LFQ, FSQ
from .regularizers.finite_scalar_quantization import FSQ
from .regularizers.lookup_free_quantization import LFQ

from einops import rearrange, repeat, reduce, pack, unpack
from einops.layers.torch import Rearrange

from beartype import beartype
from beartype.typing import Union, Tuple, Optional, List

from magvit2_pytorch.attend import Attend
from magvit2_pytorch.version import __version__

from gateloop_transformer import SimpleGateLoopLayer

from taylor_series_linear_attention import TaylorSeriesLinearAttn

from kornia.filters import filter3d

import pickle

# helper


def exists(v):
    """Report whether ``v`` is anything other than ``None``."""
    if v is None:
        return False
    return True


def default(v, d):
    """Return ``v`` unless it is ``None``, in which case return the fallback ``d``."""
    if exists(v):
        return v
    return d


def safe_get_index(it, ind, default=None):
    """Return ``it[ind]`` when ``ind`` is below ``len(it)``, else ``default``.

    Only the upper bound is checked, so negative indices are passed straight
    through to ``it[ind]``.
    """
    return it[ind] if ind < len(it) else default


def pair(t):
    """Return tuples unchanged; duplicate any other value into a 2-tuple."""
    if isinstance(t, tuple):
        return t
    return (t, t)


def identity(t, *args, **kwargs):
    """Return the first argument untouched; any extra arguments are ignored."""
    return t


def divisible_by(num, den):
    """Return whether ``num`` leaves no remainder when divided by ``den``."""
    remainder = num % den
    return remainder == 0


def pack_one(t, pattern):
    """Pack a single tensor with einops ``pack``; returns the packed tensor and its packed shapes."""
    packed, packed_shapes = pack([t], pattern)
    return packed, packed_shapes


def unpack_one(t, ps, pattern):
    """Undo ``pack_one``: run einops ``unpack`` and return its first result."""
    unpacked = unpack(t, ps, pattern)
    return unpacked[0]


def append_dims(t, ndims: int):
    """Reshape ``t`` so that ``ndims`` singleton axes follow its existing shape."""
    trailing_ones = (1,) * ndims
    return t.reshape(*t.shape, *trailing_ones)


def is_odd(n):
    """Return ``True`` when ``n`` is not a multiple of two."""
    evenly_halved = divisible_by(n, 2)
    return not evenly_halved


def maybe_del_attr_(o, attr):
    """Delete attribute ``attr`` from ``o`` in place; do nothing when it is absent."""
    if not hasattr(o, attr):
        return
    delattr(o, attr)


def cast_tuple(t, length=1):
    """Return tuples unchanged; repeat any other value ``length`` times into a tuple."""
    if isinstance(t, tuple):
        return t
    return (t,) * length


# tensor helpers


def l2norm(t):
    """Normalise ``t`` to unit L2 norm along its last axis."""
    return F.normalize(t, p=2, dim=-1)


def pad_at_dim(t, pad, dim=-1, value=0.0):
    """Apply ``F.pad`` with ``pad = (before, after)`` to a single axis of ``t``.

    ``F.pad`` takes its padding pairs starting from the last axis, so a
    ``(0, 0)`` pair is emitted for every axis to the right of ``dim`` before
    the requested pair.
    """
    if dim < 0:
        axes_to_the_right = -dim - 1
    else:
        axes_to_the_right = t.ndim - dim - 1

    untouched = (0, 0) * axes_to_the_right
    return F.pad(t, (*untouched, *pad), value=value)


def pick_video_frame(video, frame_indices):
    """Select frames from a channel-first video batch by per-sample index.

    ``video`` is rearranged from ``b c f ...`` to ``b f c ...`` and indexed with a
    ``(b, 1)`` column of batch positions together with ``frame_indices``; the
    result is then collapsed with the pattern ``b 1 c ... -> b c ...``.
    """
    num_samples = video.shape[0]
    frames_first = rearrange(video, "b c f ... -> b f c ...")

    sample_ids = torch.arange(num_samples, device=video.device)
    sample_ids = rearrange(sample_ids, "b -> b 1")

    picked = frames_first[sample_ids, frame_indices]
    return rearrange(picked, "b 1 c ... -> b c ...")


# gan related


def gradient_penalty(images, output):
    """Penalty that pushes the per-sample gradient norm of ``output`` w.r.t. ``images`` towards one.

    Differentiates ``output`` with respect to ``images``, flattens each sample's
    gradient and returns the mean of ``(||grad||_2 - 1) ** 2``.

    ``images`` must be part of the autograd graph that produced ``output``.
    ``create_graph=True`` keeps the penalty differentiable so it can be used as
    a loss term.
    """
    gradients = torch_grad(
        outputs=output,
        inputs=images,
        # ones_like follows output's shape, dtype and device, matching grad_layer_wrt_loss
        grad_outputs=torch.ones_like(output),
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )[0]

    gradients = rearrange(gradients, "b ... -> b (...)")
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()


def leaky_relu(p=0.1):
    """Build an ``nn.LeakyReLU`` with negative slope ``p``."""
    return nn.LeakyReLU(negative_slope=p)


def hinge_discr_loss(fake, real):
    """Discriminator hinge loss: mean of ``relu(1 + fake) + relu(1 - real)``."""
    fake_term = F.relu(1 + fake)
    real_term = F.relu(1 - real)
    return (fake_term + real_term).mean()


def hinge_gen_loss(fake):
    """Generator hinge loss: the negated mean of the logits in ``fake``."""
    mean_score = fake.mean()
    return -mean_score


@autocast(enabled=False)
@beartype
def grad_layer_wrt_loss(loss: Tensor, layer: nn.Parameter):
    """Return the detached gradient of ``loss`` with respect to ``layer``.

    Runs with autocast disabled; the graph is retained so ``loss`` can be
    differentiated again afterwards.
    """
    grads = torch_grad(
        outputs=loss,
        inputs=layer,
        grad_outputs=torch.ones_like(loss),
        retain_graph=True,
    )
    layer_grad = grads[0]
    return layer_grad.detach()


# helper decorators


def remove_vgg(fn):
    """Decorator that hides ``self.vgg`` while the wrapped method runs.

    If the instance has a ``vgg`` attribute, it is removed before ``fn`` is
    called and put back afterwards. Restoration happens in a ``finally``
    clause, so the attribute is re-attached even when ``fn`` raises.

    Returns the wrapped method; its return value is passed through unchanged.
    """

    @wraps(fn)
    def inner(self, *args, **kwargs):
        if not hasattr(self, "vgg"):
            return fn(self, *args, **kwargs)

        vgg = self.vgg
        delattr(self, "vgg")

        try:
            return fn(self, *args, **kwargs)
        finally:
            # always restore, otherwise a failing call would permanently drop vgg
            self.vgg = vgg

    return inner


# helper classes


def Sequential(*modules):
    """Build an ``nn.Sequential`` from the non-``None`` arguments.

    Returns ``nn.Identity()`` when nothing is left after dropping ``None`` entries.
    """
    present = [module for module in modules if exists(module)]

    if not present:
        return nn.Identity()

    return nn.Sequential(*present)


class Residual(Module):
    """Wraps a module and adds its input back onto its output."""

    @beartype
    def __init__(self, fn: Module):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        # keyword arguments are forwarded to the wrapped module unchanged
        branch = self.fn(x, **kwargs)
        return branch + x


# wraps a module so it runs on tensors reshaped to (batch, time, feature dimension), then restores the original layout


class ToTimeSequence(Module):
    """Runs the wrapped module over the frame axis of a ``b c f ...`` tensor.

    The input is moved to ``b ... f c``, all leading axes are packed into one,
    the wrapped module is applied, and the original layout is restored.
    """

    @beartype
    def __init__(self, fn: Module):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        channels_last = rearrange(x, "b c f ... -> b ... f c")
        sequences, packed_shape = pack_one(channels_last, "* n c")

        processed = self.fn(sequences, **kwargs)

        restored = unpack_one(processed, packed_shape, "* n c")
        return rearrange(restored, "b ... f c -> b c f ...")


class SqueezeExcite(Module):
    # global context network - attention-esque squeeze-excite variant (https://arxiv.org/abs/2012.13375)

    def __init__(self, dim, *, dim_out=None, dim_hidden_min=16, init_bias=-10):
        """Set up the spatial attention pooling conv and the gating network.

        The last conv of the gating network starts with zero weights and a
        constant bias of ``init_bias``, ahead of the final sigmoid.
        """
        super().__init__()
        dim_out = default(dim_out, dim)

        self.to_k = nn.Conv2d(dim, 1, 1)

        hidden = max(dim_hidden_min, dim_out // 2)
        squeeze_conv = nn.Conv2d(dim, hidden, 1)
        gate_conv = nn.Conv2d(hidden, dim_out, 1)

        self.net = nn.Sequential(squeeze_conv, nn.LeakyReLU(0.1), gate_conv, nn.Sigmoid())

        nn.init.zeros_(gate_conv.weight)
        nn.init.constant_(gate_conv.bias, init_bias)

    def forward(self, x):
        """Gate ``x`` (4D images or 5D video) by per-channel values from attention-pooled context."""
        batch = x.shape[0]
        is_video = x.ndim == 5

        # video frames are folded into the batch axis so the 2D convs can be used
        frames = rearrange(x, "b c f h w -> (b f) c h w") if is_video else x

        attn = rearrange(self.to_k(frames), "b c h w -> b c (h w)").softmax(dim=-1)
        flat = rearrange(frames, "b c h w -> b c (h w)")

        pooled = einsum("b i n, b c n -> b c i", attn, flat)
        pooled = rearrange(pooled, "... -> ... 1")
        gates = self.net(pooled)

        if is_video:
            gates = rearrange(gates, "(b f) c h w -> b c f h w", b=batch)

        return gates * x


# token shifting


class TokenShift(Module):
    """Shifts half of the channels one step along axis 2 before calling the wrapped module."""

    @beartype
    def __init__(self, fn: Module):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        kept, shifted = x.chunk(2, dim=1)
        # pad one step at the front of the time axis and trim one at the end
        shifted = pad_at_dim(shifted, (1, -1), dim=2)
        merged = torch.cat((kept, shifted), dim=1)
        return self.fn(merged, **kwargs)


# rmsnorm


class RMSNorm(Module):
    """L2-normalises the feature axis, rescales by ``sqrt(dim)`` and applies a learned gamma.

    With ``channel_first=True`` features are on axis 1 and the parameters carry
    trailing singleton axes (two when ``images`` is set, three otherwise);
    otherwise features are on the last axis. ``bias`` adds a learned shift.
    """

    def __init__(self, dim, channel_first=False, images=False, bias=False):
        super().__init__()
        if channel_first:
            trailing = (1, 1) if images else (1, 1, 1)
            shape = (dim, *trailing)
        else:
            shape = (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(shape))

        if bias:
            self.bias = nn.Parameter(torch.zeros(shape))
        else:
            self.bias = 0.0

    def forward(self, x):
        feature_axis = 1 if self.channel_first else -1
        normed = F.normalize(x, dim=feature_axis)
        return normed * self.scale * self.gamma + self.bias


class AdaptiveRMSNorm(Module):
    """RMS-style normalisation whose scale (and optional shift) come from a conditioning vector.

    ``cond`` of shape ``(batch, dim_cond)`` is projected by linear layers into a
    per-feature gamma, plus a bias when ``bias=True``. ``to_gamma`` starts with
    zero weight and unit bias and ``to_bias`` starts at zero, so right after
    construction gamma is one and bias is zero for any ``cond``.

    ``images`` is accepted so the constructor matches ``RMSNorm``; it is not
    needed here because broadcast axes are derived from the input rank in
    ``forward``.
    """

    def __init__(self, dim, *, dim_cond, channel_first=False, images=False, bias=False):
        super().__init__()
        self.dim_cond = dim_cond
        self.channel_first = channel_first
        self.scale = dim**0.5

        self.to_gamma = nn.Linear(dim_cond, dim)
        self.to_bias = nn.Linear(dim_cond, dim) if bias else None

        nn.init.zeros_(self.to_gamma.weight)
        nn.init.ones_(self.to_gamma.bias)

        if bias:
            nn.init.zeros_(self.to_bias.weight)
            nn.init.zeros_(self.to_bias.bias)

    @beartype
    def forward(self, x: Tensor, *, cond: Tensor):
        """Normalise ``x`` over its feature axis and modulate it with ``cond``.

        The feature axis is 1 when ``channel_first`` is set, otherwise the last
        axis. ``cond`` must have shape ``(x.shape[0], dim_cond)``.
        """
        batch = x.shape[0]
        assert cond.shape == (batch, self.dim_cond)

        has_bias = exists(self.to_bias)
        gamma = self.to_gamma(cond)
        bias = self.to_bias(cond) if has_bias else 0.0

        num_extra_axes = x.ndim - 2

        if self.channel_first:
            # (batch, dim) -> (batch, dim, 1, ...) to broadcast over trailing axes
            gamma = append_dims(gamma, num_extra_axes)

            if has_bias:
                bias = append_dims(bias, num_extra_axes)
        else:
            # (batch, dim) -> (batch, 1, ..., dim); without the singleton axes the
            # batch axis of gamma would broadcast against a sequence axis of x
            middle = (1,) * num_extra_axes
            gamma = gamma.reshape(batch, *middle, gamma.shape[-1])

            if has_bias:
                bias = bias.reshape(batch, *middle, bias.shape[-1])

        feature_axis = 1 if self.channel_first else -1
        return F.normalize(x, dim=feature_axis) * self.scale * gamma + bias


# attention


class Attention(Module):
    """Multi-head attention over ``(batch, seq, dim)`` input with learned memory key/values.

    The input is normalised first (adaptively when ``dim_cond`` is given), and
    ``num_memory_kv`` learned key/value pairs per head are prepended to the
    keys and values before attending.
    """

    @beartype
    def __init__(
        self,
        *,
        dim,
        dim_cond: Optional[int] = None,
        causal=False,
        dim_head=32,
        heads=8,
        flash=False,
        dropout=0.0,
        num_memory_kv=4,
    ):
        super().__init__()
        inner_width = dim_head * heads

        self.need_cond = exists(dim_cond)

        if self.need_cond:
            self.norm = AdaptiveRMSNorm(dim, dim_cond=dim_cond)
        else:
            self.norm = RMSNorm(dim)

        qkv_proj = nn.Linear(dim, inner_width * 3, bias=False)
        split_heads = Rearrange("b n (qkv h d) -> qkv b h n d", qkv=3, h=heads)
        self.to_qkv = nn.Sequential(qkv_proj, split_heads)

        assert num_memory_kv > 0
        self.mem_kv = nn.Parameter(torch.randn(2, heads, num_memory_kv, dim_head))

        self.attend = Attend(causal=causal, dropout=dropout, flash=flash)

        merge_heads = Rearrange("b h n d -> b n (h d)")
        out_proj = nn.Linear(inner_width, dim, bias=False)
        self.to_out = nn.Sequential(merge_heads, out_proj)

    @beartype
    def forward(self, x, mask: Optional[Tensor] = None, cond: Optional[Tensor] = None):
        # cond is only forwarded to the norm when this layer was built with dim_cond
        norm_kwargs = {"cond": cond} if self.need_cond else {}

        normed = self.norm(x, **norm_kwargs)

        q, k, v = self.to_qkv(normed)

        batch = q.shape[0]
        mem_k, mem_v = (repeat(mem, "h n d -> b h n d", b=batch) for mem in self.mem_kv)
        k = torch.cat((mem_k, k), dim=-2)
        v = torch.cat((mem_v, v), dim=-2)

        # NOTE(review): mask is passed on unchanged although k/v now also hold the
        # memory entries - confirm Attend accounts for this before relying on mask
        attended = self.attend(q, k, v, mask=mask)
        return self.to_out(attended)


class LinearAttention(Module):
    """
    using the specific linear attention proposed in https://arxiv.org/abs/2106.09681

    The input is normalised (adaptively when ``dim_cond`` is given) and then
    passed to ``TaylorSeriesLinearAttn``.

    NOTE: ``dropout`` is accepted but not used by this class.
    """

    @beartype
    def __init__(self, *, dim, dim_cond: Optional[int] = None, dim_head=8, heads=8, dropout=0.0):
        super().__init__()
        self.need_cond = exists(dim_cond)

        if self.need_cond:
            self.norm = AdaptiveRMSNorm(dim, dim_cond=dim_cond)
        else:
            self.norm = RMSNorm(dim)

        self.attn = TaylorSeriesLinearAttn(dim=dim, dim_head=dim_head, heads=heads)

    def forward(self, x, cond: Optional[Tensor] = None):
        """Normalise ``x`` and apply the linear attention; ``cond`` is used only when built with ``dim_cond``."""
        maybe_cond_kwargs = dict(cond=cond) if self.need_cond else dict()

        x = self.norm(x, **maybe_cond_kwargs)

        return self.attn(x)


class LinearSpaceAttention(LinearAttention):
    """Linear attention across the spatial positions of a channel-first tensor."""

    def forward(self, x, *args, **kwargs):
        channels_last = rearrange(x, "b c ... h w -> b ... h w c")
        # fold any leading axes into the batch, then flatten h and w into one sequence axis
        per_map, batch_ps = pack_one(channels_last, "* h w c")
        tokens, seq_ps = pack_one(per_map, "b * c")

        attended = super().forward(tokens, *args, **kwargs)

        attended = unpack_one(attended, seq_ps, "b * c")
        attended = unpack_one(attended, batch_ps, "* h w c")
        return rearrange(attended, "b ... h w c -> b c ... h w")


class SpaceAttention(Attention):
    """Attention across the spatial positions of each frame of a ``b c t h w`` tensor."""

    def forward(self, x, *args, **kwargs):
        channels_last = rearrange(x, "b c t h w -> b t h w c")
        # fold batch and time together, then flatten h and w into one sequence axis
        per_frame, batch_ps = pack_one(channels_last, "* h w c")
        tokens, seq_ps = pack_one(per_frame, "b * c")

        attended = super().forward(tokens, *args, **kwargs)

        attended = unpack_one(attended, seq_ps, "b * c")
        attended = unpack_one(attended, batch_ps, "* h w c")
        return rearrange(attended, "b t h w c -> b c t h w")


class TimeAttention(Attention):
    """Attention along the time axis of a ``b c t h w`` tensor, per spatial position."""

    def forward(self, x, *args, **kwargs):
        time_last = rearrange(x, "b c t h w -> b h w t c")
        # every (batch, h, w) position becomes its own sequence over time
        sequences, batch_ps = pack_one(time_last, "* t c")

        attended = super().forward(sequences, *args, **kwargs)

        attended = unpack_one(attended, batch_ps, "* t c")
        return rearrange(attended, "b h w t c -> b c t h w")


class GEGLU(Module):
    """Splits axis 1 in two and multiplies the first half by the GELU of the second."""

    def forward(self, x):
        value, gate = torch.chunk(x, 2, dim=1)
        return F.gelu(gate) * value


class FeedForward(Module):
    """Channel-first feedforward: norm, 1x1 conv, GEGLU, 1x1 conv.

    Uses 2D convs when ``images`` is set and 3D convs otherwise. The norm is
    adaptive when ``dim_cond`` is given.
    """

    @beartype
    def __init__(self, dim, *, dim_cond: Optional[int] = None, mult=4, images=False):
        super().__init__()
        conv_klass = nn.Conv2d if images else nn.Conv3d

        if exists(dim_cond):
            norm_klass = partial(AdaptiveRMSNorm, dim_cond=dim_cond)
        else:
            norm_klass = RMSNorm

        hidden = int(dim * mult * 2 / 3)

        self.norm = norm_klass(dim, channel_first=True, images=images)

        # first conv is twice as wide because GEGLU halves the channel axis
        expand = conv_klass(dim, hidden * 2, 1)
        gate = GEGLU()
        project = conv_klass(hidden, dim, 1)
        self.net = Sequential(expand, gate, project)

    @beartype
    def forward(self, x: Tensor, *, cond: Optional[Tensor] = None):
        # cond is forwarded to the norm only when the caller supplies one
        norm_kwargs = {"cond": cond} if exists(cond) else {}

        normed = self.norm(x, **norm_kwargs)
        return self.net(normed)


# discriminator with anti-aliased downsampling (blurpool Zhang et al.)


class Blur(Module):
    """Blurs with a kernel built from the 1D taps ``[1, 2, 1]`` via kornia ``filter3d``.

    The kernel spans space only, time only, or all three axes depending on the
    flags passed to ``forward``. 4D image input gets a temporary singleton
    frame axis.
    """

    def __init__(self):
        super().__init__()
        self.register_buffer("f", torch.Tensor([1, 2, 1]))

    def forward(self, x, space_only=False, time_only=False):
        assert not (space_only and time_only)

        taps = self.f

        if space_only:
            kernel = rearrange(einsum("i, j -> i j", taps, taps), "... -> 1 1 ...")
        elif time_only:
            kernel = rearrange(taps, "f -> 1 f 1 1")
        else:
            kernel = rearrange(einsum("i, j, k -> i j k", taps, taps, taps), "... -> 1 ...")

        is_images = x.ndim == 4
        video = rearrange(x, "b c h w -> b c 1 h w") if is_images else x

        blurred = filter3d(video, kernel, normalized=True)

        if not is_images:
            return blurred

        return rearrange(blurred, "b c 1 h w -> b c h w")


class DiscriminatorBlock(Module):
    """Two 3x3 conv + leaky-relu layers with a 1x1 conv shortcut.

    When ``downsample`` is set, the main path is optionally blurred, then
    halved in resolution by a space-to-depth rearrange and a 1x1 conv, while
    the shortcut conv uses stride 2. The sum is scaled by ``2 ** -0.5``.
    """

    def __init__(self, input_channels, filters, downsample=True, antialiased_downsample=True):
        super().__init__()
        shortcut_stride = 2 if downsample else 1
        self.conv_res = nn.Conv2d(input_channels, filters, 1, stride=shortcut_stride)

        self.net = nn.Sequential(
            nn.Conv2d(input_channels, filters, 3, padding=1),
            leaky_relu(),
            nn.Conv2d(filters, filters, 3, padding=1),
            leaky_relu(),
        )

        self.maybe_blur = Blur() if antialiased_downsample else None

        if downsample:
            self.downsample = nn.Sequential(
                Rearrange("b c (h p1) (w p2) -> b (c p1 p2) h w", p1=2, p2=2),
                nn.Conv2d(filters * 4, filters, 1),
            )
        else:
            self.downsample = None

    def forward(self, x):
        shortcut = self.conv_res(x)
        hidden = self.net(x)

        if exists(self.downsample):
            # blur is only applied on the downsampling path
            if exists(self.maybe_blur):
                hidden = self.maybe_blur(hidden, space_only=True)

            hidden = self.downsample(hidden)

        return (hidden + shortcut) * (2**-0.5)


class Discriminator(Module):
    """Image discriminator: a stack of downsampling conv blocks, each followed by
    linear spatial attention and a feedforward, ending in a single logit per sample.

    The number of blocks is derived from the smaller side of ``image_size``
    (``int(log2(min_side) - 2) + 1``); every block except the last halves the
    resolution. Channel widths start at ``dim * 4``, double per block and are
    capped at ``max_dim``.

    NOTE: ``attn_heads`` and ``attn_dim_head`` are accepted but not used by
    this class; only the ``linear_attn_*`` settings take effect.
    """

    @beartype
    def __init__(
        self,
        *,
        dim,
        image_size,
        channels=3,
        max_dim=512,
        attn_heads=8,
        attn_dim_head=32,
        linear_attn_dim_head=8,
        linear_attn_heads=16,
        ff_mult=4,
        antialiased_downsample=False,
    ):
        super().__init__()
        image_size = pair(image_size)
        min_image_resolution = min(image_size)

        num_layers = int(log2(min_image_resolution) - 2)

        layer_dims = [channels] + [(dim * 4) * (2**i) for i in range(num_layers + 1)]
        layer_dims = [min(layer_dim, max_dim) for layer_dim in layer_dims]
        layer_dims_in_out = tuple(zip(layer_dims[:-1], layer_dims[1:]))

        last_ind = len(layer_dims_in_out) - 1
        blocks = []

        for ind, (in_chan, out_chan) in enumerate(layer_dims_in_out):
            # the final block keeps its resolution
            block = DiscriminatorBlock(
                in_chan,
                out_chan,
                downsample=ind != last_ind,
                antialiased_downsample=antialiased_downsample,
            )

            attn_block = Sequential(
                Residual(
                    LinearSpaceAttention(
                        dim=out_chan, heads=linear_attn_heads, dim_head=linear_attn_dim_head
                    )
                ),
                Residual(FeedForward(dim=out_chan, mult=ff_mult, images=True)),
            )

            blocks.append(ModuleList([block, attn_block]))

        self.blocks = ModuleList(blocks)

        dim_last = layer_dims[-1]

        # num_layers blocks downsample by 2 each, which fixes the final feature map size
        downsample_factor = 2**num_layers
        last_fmap_size = tuple(map(lambda n: n // downsample_factor, image_size))

        latent_dim = last_fmap_size[0] * last_fmap_size[1] * dim_last

        self.to_logits = Sequential(
            nn.Conv2d(dim_last, dim_last, 3, padding=1),
            leaky_relu(),
            Rearrange("b ... -> b (...)"),
            nn.Linear(latent_dim, 1),
            Rearrange("b 1 -> b"),
        )

    def forward(self, x):
        """Return one logit per sample, shape ``(batch,)``."""
        for block, attn_block in self.blocks:
            x = block(x)
            x = attn_block(x)

        return self.to_logits(x)


# modulatable conv from Karras et al. Stylegan2
# for conditioning on latents


class Conv3DMod(Module):
    """3D convolution whose weights are modulated per sample by a conditioning vector.

    Following the StyleGAN2 scheme referenced above, the shared weights are
    scaled per input channel by ``cond + 1`` and, when ``demod`` is set,
    renormalised per output channel. The per-sample kernels are then applied
    in one grouped ``conv3d`` call with the batch folded into the channel axis.

    ``pad_mode`` is handed to ``F.pad``. The default ``"zeros"`` is not a mode
    ``F.pad`` accepts, so it is translated to ``"constant"`` (zero padding).
    """

    @beartype
    def __init__(
        self,
        dim,
        *,
        spatial_kernel,
        time_kernel,
        causal=True,
        dim_out=None,
        demod=True,
        eps=1e-8,
        pad_mode="zeros",
    ):
        super().__init__()
        dim_out = default(dim_out, dim)

        self.eps = eps

        assert is_odd(spatial_kernel) and is_odd(time_kernel)

        self.spatial_kernel = spatial_kernel
        self.time_kernel = time_kernel

        # causal: pad only the past; otherwise pad both sides of time equally
        time_padding = (time_kernel - 1, 0) if causal else ((time_kernel // 2,) * 2)

        # F.pad spells zero padding "constant"; "zeros" would be rejected there
        self.pad_mode = "constant" if pad_mode == "zeros" else pad_mode
        self.padding = (*((spatial_kernel // 2,) * 4), *time_padding)
        self.weights = nn.Parameter(
            torch.randn((dim_out, dim, time_kernel, spatial_kernel, spatial_kernel))
        )

        self.demod = demod

        nn.init.kaiming_normal_(self.weights, a=0, mode="fan_in", nonlinearity="selu")

    @beartype
    def forward(self, fmap, cond: Tensor):
        """
        Apply the modulated convolution to ``fmap`` (``b c t h w``) using ``cond`` (``b i``).

        notation

        b - batch
        n - convs
        o - output
        i - input
        k - kernel
        """

        b = fmap.shape[0]

        # prepare weights for modulation

        weights = self.weights

        # do the modulation, demodulation, as done in stylegan2

        cond = rearrange(cond, "b i -> b 1 i 1 1 1")

        weights = weights * (cond + 1)

        if self.demod:
            inv_norm = (
                reduce(weights**2, "b o i k0 k1 k2 -> b o 1 1 1 1", "sum")
                .clamp(min=self.eps)
                .rsqrt()
            )
            weights = weights * inv_norm

        # fold batch into channels so one grouped conv applies each sample's own kernel
        fmap = rearrange(fmap, "b c t h w -> 1 (b c) t h w")

        weights = rearrange(weights, "b o ... -> (b o) ...")

        fmap = F.pad(fmap, self.padding, mode=self.pad_mode)
        fmap = F.conv3d(fmap, weights, groups=b)

        return rearrange(fmap, "1 (b o) ... -> b o ...", b=b)


# strided conv downsamples


class SpatialDownsample2x(Module):
    """Halves height and width of a ``b c t h w`` tensor with a stride-2 2D conv per frame.

    An optional spatial blur is applied first when ``antialias`` is set.
    """

    def __init__(self, dim, dim_out=None, kernel_size=3, antialias=False):
        super().__init__()
        dim_out = default(dim_out, dim)
        self.maybe_blur = Blur() if antialias else identity
        self.conv = nn.Conv2d(dim, dim_out, kernel_size, stride=2, padding=kernel_size // 2)

    def forward(self, x):
        blurred = self.maybe_blur(x, space_only=True)

        # treat every frame as an independent image
        frames = rearrange(blurred, "b c t h w -> b t c h w")
        frames, packed_shape = pack_one(frames, "* c h w")

        reduced = self.conv(frames)

        reduced = unpack_one(reduced, packed_shape, "* c h w")
        return rearrange(reduced, "b t c h w -> b c t h w")


class TimeDownsample2x(Module):
    """Downsamples the time axis of a ``b c t h w`` tensor with a stride-2 1D conv.

    The sequence is padded by ``kernel_size - 1`` at the front only. An
    optional temporal blur is applied first when ``antialias`` is set.
    """

    def __init__(self, dim, dim_out=None, kernel_size=3, antialias=False):
        super().__init__()
        dim_out = default(dim_out, dim)
        self.maybe_blur = Blur() if antialias else identity
        self.time_causal_padding = (kernel_size - 1, 0)
        self.conv = nn.Conv1d(dim, dim_out, kernel_size, stride=2)

    def forward(self, x):
        blurred = self.maybe_blur(x, time_only=True)

        # every spatial position becomes its own (channels, time) sequence
        sequences = rearrange(blurred, "b c t h w -> b h w c t")
        sequences, packed_shape = pack_one(sequences, "* c t")

        padded = F.pad(sequences, self.time_causal_padding)
        reduced = self.conv(padded)

        reduced = unpack_one(reduced, packed_shape, "* c t")
        return rearrange(reduced, "b h w c t -> b c t h w")


# depth to space upsamples


class SpatialUpsample2x(Module):
    """Doubles height and width of a ``b c t h w`` tensor via a 1x1 conv and depth-to-space.

    The conv is initialised so that each group of four output channels feeding
    one 2x2 output patch starts with identical weights, and with zero bias.
    """

    def __init__(self, dim, dim_out=None):
        super().__init__()
        dim_out = default(dim_out, dim)
        conv = nn.Conv2d(dim, dim_out * 4, 1)

        to_space = Rearrange("b (c p1 p2) h w -> b c (h p1) (w p2)", p1=2, p2=2)
        self.net = nn.Sequential(conv, nn.SiLU(), to_space)

        self.init_conv_(conv)

    def init_conv_(self, conv):
        out_chan, in_chan, kernel_h, kernel_w = conv.weight.shape
        base = torch.empty(out_chan // 4, in_chan, kernel_h, kernel_w)
        nn.init.kaiming_uniform_(base)

        # each base filter is repeated four times in a row along the output axis
        conv.weight.data.copy_(repeat(base, "o ... -> (o 4) ..."))
        nn.init.zeros_(conv.bias.data)

    def forward(self, x):
        frames = rearrange(x, "b c t h w -> b t c h w")
        frames, packed_shape = pack_one(frames, "* c h w")

        enlarged = self.net(frames)

        enlarged = unpack_one(enlarged, packed_shape, "* c h w")
        return rearrange(enlarged, "b t c h w -> b c t h w")


class TimeUpsample2x(Module):
    """Doubles the time axis of a ``b c t h w`` tensor via a 1x1 conv and a channel-to-time rearrange.

    The conv is initialised so that each pair of output channels feeding two
    consecutive output frames starts with identical weights, and with zero bias.
    """

    def __init__(self, dim, dim_out=None):
        super().__init__()
        dim_out = default(dim_out, dim)
        conv = nn.Conv1d(dim, dim_out * 2, 1)

        to_time = Rearrange("b (c p) t -> b c (t p)", p=2)
        self.net = nn.Sequential(conv, nn.SiLU(), to_time)

        self.init_conv_(conv)

    def init_conv_(self, conv):
        out_chan, in_chan, kernel_t = conv.weight.shape
        base = torch.empty(out_chan // 2, in_chan, kernel_t)
        nn.init.kaiming_uniform_(base)

        # each base filter is repeated twice in a row along the output axis
        conv.weight.data.copy_(repeat(base, "o ... -> (o 2) ..."))
        nn.init.zeros_(conv.bias.data)

    def forward(self, x):
        sequences = rearrange(x, "b c t h w -> b h w c t")
        sequences, packed_shape = pack_one(sequences, "* c t")

        enlarged = self.net(sequences)

        enlarged = unpack_one(enlarged, packed_shape, "* c t")
        return rearrange(enlarged, "b h w c t -> b c t h w")


# autoencoder - only the best variant is offered here, with causal conv 3d


def SameConv2d(dim_in, dim_out, kernel_size):
    """Build an ``nn.Conv2d`` padded by half the kernel size on each spatial axis.

    ``kernel_size`` may be an int or a tuple.
    """
    kernel_hw = cast_tuple(kernel_size, 2)
    half_kernel = [size // 2 for size in kernel_hw]
    return nn.Conv2d(dim_in, dim_out, kernel_size=kernel_hw, padding=half_kernel)


class CausalConv3d(Module):
    """3D convolution that pads the time axis at the front only.

    Height and width are padded symmetrically by half the (odd) kernel size.
    ``stride`` and ``dilation`` keyword arguments apply to the time axis only;
    remaining keyword arguments are forwarded to ``nn.Conv3d``.
    """

    @beartype
    def __init__(
        self,
        chan_in,
        chan_out,
        kernel_size: Union[int, Tuple[int, int, int]],
        pad_mode="constant",
        **kwargs,
    ):
        super().__init__()
        kernel_size = cast_tuple(kernel_size, 3)
        kernel_t, kernel_h, kernel_w = kernel_size

        assert is_odd(kernel_h) and is_odd(kernel_w)

        time_dilation = kwargs.pop("dilation", 1)
        time_stride = kwargs.pop("stride", 1)

        self.pad_mode = pad_mode
        self.time_pad = time_dilation * (kernel_t - 1) + (1 - time_stride)

        pad_h = kernel_h // 2
        pad_w = kernel_w // 2

        # F.pad order: last axis first -> (w, w, h, h, time front, time back)
        self.time_causal_padding = (pad_w, pad_w, pad_h, pad_h, self.time_pad, 0)

        self.conv = nn.Conv3d(
            chan_in,
            chan_out,
            kernel_size,
            stride=(time_stride, 1, 1),
            dilation=(time_dilation, 1, 1),
            **kwargs,
        )

    def forward(self, x):
        # fall back to constant padding when the clip is not longer than the time padding
        if self.time_pad < x.shape[2]:
            pad_mode = self.pad_mode
        else:
            pad_mode = "constant"

        padded = F.pad(x, self.time_causal_padding, mode=pad_mode)
        return self.conv(padded)


@beartype
def ResidualUnit(dim, kernel_size: Union[int, Tuple[int, int, int]], pad_mode: str = "constant"):
    """Build a residual block: causal conv, ELU, 1x1x1 conv, ELU, squeeze-excite."""
    layers = (
        CausalConv3d(dim, dim, kernel_size, pad_mode=pad_mode),
        nn.ELU(),
        nn.Conv3d(dim, dim, 1),
        nn.ELU(),
        SqueezeExcite(dim),
    )

    return Residual(Sequential(*layers))


@beartype
class ResidualUnitMod(Module):
    """Residual block built around a causal ``Conv3DMod`` driven by a conditioning vector.

    ``cond`` is projected to ``dim`` with a linear layer and used to modulate
    the first conv; a 1x1x1 conv follows, with ELU after each conv.
    """

    def __init__(
        self,
        dim,
        kernel_size: Union[int, Tuple[int, int, int]],
        *,
        dim_cond,
        pad_mode: str = "constant",
        demod=True,
    ):
        super().__init__()
        kernel_t, kernel_h, kernel_w = cast_tuple(kernel_size, 3)
        # Conv3DMod takes a single spatial kernel size
        assert kernel_h == kernel_w

        self.to_cond = nn.Linear(dim_cond, dim)

        self.conv = Conv3DMod(
            dim=dim,
            spatial_kernel=kernel_h,
            time_kernel=kernel_t,
            causal=True,
            demod=demod,
            pad_mode=pad_mode,
        )

        self.conv_out = nn.Conv3d(dim, dim, 1)

    @beartype
    def forward(
        self,
        x,
        cond: Tensor,
    ):
        modulation = self.to_cond(cond)

        hidden = F.elu(self.conv(x, cond=modulation))
        hidden = F.elu(self.conv_out(hidden))
        return hidden + x


class CausalConvTranspose3d(Module):
    """Transposed 3D conv that upsamples time by ``time_stride`` and trims the surplus frames.

    Height and width use stride 1 with padding of half the (odd) kernel size.
    The output is cut to exactly ``t * time_stride`` frames.
    """

    def __init__(
        self,
        chan_in,
        chan_out,
        kernel_size: Union[int, Tuple[int, int, int]],
        *,
        time_stride,
        **kwargs,
    ):
        super().__init__()
        kernel_size = cast_tuple(kernel_size, 3)
        kernel_t, kernel_h, kernel_w = kernel_size

        assert is_odd(kernel_h) and is_odd(kernel_w)

        self.upsample_factor = time_stride

        self.conv = nn.ConvTranspose3d(
            chan_in,
            chan_out,
            kernel_size,
            (time_stride, 1, 1),
            padding=(0, kernel_h // 2, kernel_w // 2),
            **kwargs,
        )

    def forward(self, x):
        assert x.ndim == 5
        num_frames = x.shape[2]

        upsampled = self.conv(x)

        # keep only the first t * factor frames produced by the transposed conv
        keep = num_frames * self.upsample_factor
        return upsampled[..., :keep, :, :]


# video tokenizer class

# named record of the individual loss terms; fields are kept in this exact order
LossBreakdown = namedtuple(
    "LossBreakdown",
    (
        "recon_loss",
        "lfq_aux_loss",
        "quantizer_loss_breakdown",
        "perceptual_loss",
        "adversarial_gen_loss",
        "adaptive_adversarial_weight",
        "multiscale_gen_losses",
        "multiscale_gen_adaptive_weights",
    ),
)

# named record of the discriminator loss terms
DiscrLossBreakdown = namedtuple(
    "DiscrLossBreakdown",
    ("discr_loss", "multiscale_discr_losses", "gradient_penalty"),
)


class VideoTokenizer(Module):
    @beartype
    def __init__(
        self,
        *,
        image_size,
        layers: Tuple[Union[str, Tuple[str, int]], ...] = ("residual", "residual", "residual"),
        residual_conv_kernel_size=3,
        num_codebooks=1,
        codebook_size: Optional[int] = None,
        channels=3,
        init_dim=64,
        max_dim=float("inf"),
        dim_cond=None,
        dim_cond_expansion_factor=4.0,
        input_conv_kernel_size: Tuple[int, int, int] = (7, 7, 7),
        output_conv_kernel_size: Tuple[int, int, int] = (3, 3, 3),
        pad_mode: str = "constant",
        lfq_entropy_loss_weight=0.1,
        lfq_commitment_loss_weight=1.0,
        lfq_diversity_gamma=2.5,
        quantizer_aux_loss_weight=1.0,
        lfq_activation=nn.Identity(),
        use_fsq=False,
        fsq_levels: Optional[List[int]] = None,
        attn_dim_head=32,
        attn_heads=8,
        attn_dropout=0.0,
        linear_attn_dim_head=8,
        linear_attn_heads=16,
        vgg: Optional[Module] = None,
        vgg_weights: VGG16_Weights = VGG16_Weights.DEFAULT,
        perceptual_loss_weight=1e-1,
        discr_kwargs: Optional[dict] = None,
        multiscale_discrs: Tuple[Module, ...] = tuple(),
        use_gan=True,
        adversarial_loss_weight=1.0,
        grad_penalty_loss_weight=10.0,
        multiscale_adversarial_loss_weight=1.0,
        flash_attn=True,
        separate_first_frame_encoding=False,
    ):
        """Build the encoder / decoder stacks, the quantizer and the loss modules.

        All arguments are keyword-only and are pickled into `self._configs`
        so that `save` / `init_and_load_from` can rebuild the model.

        Args:
            image_size: side length of the (square) input frames.
            layers: layer definitions, each either a layer-type string or a
                `(layer_type, param)` tuple. Recognised types are
                "residual", "consecutive_residual" (param: count),
                "cond_residual", "compress_space" / "compress_time"
                (optional param: output dim, defaults to `dim * 2`, capped by
                `max_dim`), "attend_space", "linear_attend_space",
                "gateloop_time", "attend_time", "cond_attend_space",
                "cond_linear_attend_space" and "cond_attend_time".
                For every definition one encoder layer is appended and one
                decoder layer is prepended (so the decoder mirrors the encoder).
            residual_conv_kernel_size: kernel size handed to the residual units.
            num_codebooks, codebook_size: forwarded to the quantizer;
                `codebook_size` is required when `use_fsq` is False and must be
                unset when it is True.
            channels: number of input / output channels.
            init_dim: channel width after the input convolution.
            max_dim: upper bound for the width after compress layers.
            dim_cond: dimension of the conditioning vector; required as soon
                as any conditional layer type is used.
            dim_cond_expansion_factor: width multiplier of the conditioning stem.
            input_conv_kernel_size, output_conv_kernel_size: kernel sizes of
                the input / output causal convolutions (the last two entries
                are reused for the separate first-frame 2D convolutions).
            pad_mode: padding mode handed to the causal convolutions.
            lfq_entropy_loss_weight, lfq_commitment_loss_weight,
            lfq_diversity_gamma: forwarded to `LFQ`.
            quantizer_aux_loss_weight: weight of the quantizer auxiliary loss.
            lfq_activation: not referenced in this constructor beyond being
                captured in the saved config.
            use_fsq, fsq_levels: select `FSQ` instead of `LFQ`.
            attn_dim_head, attn_heads, attn_dropout, flash_attn: settings for
                the (non-linear) attention layers.
            linear_attn_dim_head, linear_attn_heads: settings for
                "linear_attend_space".
            vgg, vgg_weights: perceptual-loss network, or the weights used to
                create a torchvision VGG16 when `vgg` is not given.
            perceptual_loss_weight: weight of the perceptual loss; VGG is only
                set up when this is positive and `channels` is 1, 3 or 4.
            discr_kwargs: keyword arguments for `Discriminator`; a default set
                is derived from the model when omitted.
            multiscale_discrs: additional discriminator modules.
            use_gan, adversarial_loss_weight, grad_penalty_loss_weight,
            multiscale_adversarial_loss_weight: adversarial training settings.
            separate_first_frame_encoding: use dedicated 2D convolutions for
                the first frame at the input and output.
        """
        super().__init__()

        # for autosaving the config
        # NOTE: `locals()` has to be captured before any other local variable
        # is bound, so that it contains exactly the constructor arguments.

        _locals = locals()
        _locals.pop("self", None)
        _locals.pop("__class__", None)
        self._configs = pickle.dumps(_locals)

        # image size

        self.channels = channels
        self.image_size = image_size

        # initial encoder

        self.conv_in = CausalConv3d(channels, init_dim, input_conv_kernel_size, pad_mode=pad_mode)

        # whether to encode the first frame separately or not

        self.conv_in_first_frame = nn.Identity()
        self.conv_out_first_frame = nn.Identity()

        if separate_first_frame_encoding:
            self.conv_in_first_frame = SameConv2d(channels, init_dim, input_conv_kernel_size[-2:])
            self.conv_out_first_frame = SameConv2d(init_dim, channels, output_conv_kernel_size[-2:])

        self.separate_first_frame_encoding = separate_first_frame_encoding

        # encoder and decoder layers

        self.encoder_layers = ModuleList([])
        self.decoder_layers = ModuleList([])

        self.conv_out = CausalConv3d(init_dim, channels, output_conv_kernel_size, pad_mode=pad_mode)

        dim = init_dim
        dim_out = dim

        layer_fmap_size = image_size
        time_downsample_factor = 1
        has_cond_across_layers = []

        for layer_def in layers:
            layer_type, *layer_params = cast_tuple(layer_def)

            has_cond = False

            if layer_type == "residual":
                encoder_layer = ResidualUnit(dim, residual_conv_kernel_size)
                decoder_layer = ResidualUnit(dim, residual_conv_kernel_size)

            elif layer_type == "consecutive_residual":
                (num_consecutive,) = layer_params
                encoder_layer = Sequential(
                    *[ResidualUnit(dim, residual_conv_kernel_size) for _ in range(num_consecutive)]
                )
                decoder_layer = Sequential(
                    *[ResidualUnit(dim, residual_conv_kernel_size) for _ in range(num_consecutive)]
                )

            elif layer_type == "cond_residual":
                assert exists(
                    dim_cond
                ), "dim_cond must be passed into VideoTokenizer, if tokenizer is to be conditioned"

                has_cond = True

                encoder_layer = ResidualUnitMod(
                    dim,
                    residual_conv_kernel_size,
                    dim_cond=int(dim_cond * dim_cond_expansion_factor),
                )
                decoder_layer = ResidualUnitMod(
                    dim,
                    residual_conv_kernel_size,
                    dim_cond=int(dim_cond * dim_cond_expansion_factor),
                )
                dim_out = dim

            elif layer_type == "compress_space":
                dim_out = safe_get_index(layer_params, 0)
                dim_out = default(dim_out, dim * 2)
                dim_out = min(dim_out, max_dim)

                encoder_layer = SpatialDownsample2x(dim, dim_out)
                decoder_layer = SpatialUpsample2x(dim_out, dim)

                assert layer_fmap_size > 1
                layer_fmap_size //= 2

            elif layer_type == "compress_time":
                dim_out = safe_get_index(layer_params, 0)
                dim_out = default(dim_out, dim * 2)
                dim_out = min(dim_out, max_dim)

                encoder_layer = TimeDownsample2x(dim, dim_out)
                decoder_layer = TimeUpsample2x(dim_out, dim)

                time_downsample_factor *= 2

            elif layer_type == "attend_space":
                attn_kwargs = dict(
                    dim=dim,
                    dim_head=attn_dim_head,
                    heads=attn_heads,
                    dropout=attn_dropout,
                    flash=flash_attn,
                )

                encoder_layer = Sequential(
                    Residual(SpaceAttention(**attn_kwargs)), Residual(FeedForward(dim))
                )

                decoder_layer = Sequential(
                    Residual(SpaceAttention(**attn_kwargs)), Residual(FeedForward(dim))
                )

            elif layer_type == "linear_attend_space":
                linear_attn_kwargs = dict(
                    dim=dim, dim_head=linear_attn_dim_head, heads=linear_attn_heads
                )

                encoder_layer = Sequential(
                    Residual(LinearSpaceAttention(**linear_attn_kwargs)), Residual(FeedForward(dim))
                )

                decoder_layer = Sequential(
                    Residual(LinearSpaceAttention(**linear_attn_kwargs)), Residual(FeedForward(dim))
                )

            elif layer_type == "gateloop_time":
                encoder_layer = ToTimeSequence(Residual(SimpleGateLoopLayer(dim=dim)))
                decoder_layer = ToTimeSequence(Residual(SimpleGateLoopLayer(dim=dim)))

            elif layer_type == "attend_time":
                attn_kwargs = dict(
                    dim=dim,
                    dim_head=attn_dim_head,
                    heads=attn_heads,
                    dropout=attn_dropout,
                    causal=True,
                    flash=flash_attn,
                )

                # NOTE(review): FeedForward receives `dim_cond` here although
                # this layer type leaves `has_cond` False, so no `cond` is ever
                # passed to it - confirm this is intended.
                encoder_layer = Sequential(
                    Residual(TokenShift(TimeAttention(**attn_kwargs))),
                    Residual(TokenShift(FeedForward(dim, dim_cond=dim_cond))),
                )

                decoder_layer = Sequential(
                    Residual(TokenShift(TimeAttention(**attn_kwargs))),
                    Residual(TokenShift(FeedForward(dim, dim_cond=dim_cond))),
                )

            elif layer_type == "cond_attend_space":
                has_cond = True

                attn_kwargs = dict(
                    dim=dim,
                    dim_cond=dim_cond,
                    dim_head=attn_dim_head,
                    heads=attn_heads,
                    dropout=attn_dropout,
                    flash=flash_attn,
                )

                encoder_layer = Sequential(
                    Residual(SpaceAttention(**attn_kwargs)), Residual(FeedForward(dim))
                )

                decoder_layer = Sequential(
                    Residual(SpaceAttention(**attn_kwargs)), Residual(FeedForward(dim))
                )

            elif layer_type == "cond_linear_attend_space":
                has_cond = True

                # NOTE(review): this uses the regular attention settings
                # (attn_dim_head / attn_heads / dropout / flash) rather than the
                # linear_attn_* ones used by "linear_attend_space" - confirm
                # LinearSpaceAttention accepts these keyword arguments.
                attn_kwargs = dict(
                    dim=dim,
                    dim_cond=dim_cond,
                    dim_head=attn_dim_head,
                    heads=attn_heads,
                    dropout=attn_dropout,
                    flash=flash_attn,
                )

                encoder_layer = Sequential(
                    Residual(LinearSpaceAttention(**attn_kwargs)),
                    Residual(FeedForward(dim, dim_cond=dim_cond)),
                )

                decoder_layer = Sequential(
                    Residual(LinearSpaceAttention(**attn_kwargs)),
                    Residual(FeedForward(dim, dim_cond=dim_cond)),
                )

            elif layer_type == "cond_attend_time":
                has_cond = True

                attn_kwargs = dict(
                    dim=dim,
                    dim_cond=dim_cond,
                    dim_head=attn_dim_head,
                    heads=attn_heads,
                    dropout=attn_dropout,
                    causal=True,
                    flash=flash_attn,
                )

                encoder_layer = Sequential(
                    Residual(TokenShift(TimeAttention(**attn_kwargs))),
                    Residual(TokenShift(FeedForward(dim, dim_cond=dim_cond))),
                )

                decoder_layer = Sequential(
                    Residual(TokenShift(TimeAttention(**attn_kwargs))),
                    Residual(TokenShift(FeedForward(dim, dim_cond=dim_cond))),
                )

            else:
                raise ValueError(f"unknown layer type {layer_type}")

            self.encoder_layers.append(encoder_layer)
            # decoder mirrors the encoder, so its layers are stored in reverse
            self.decoder_layers.insert(0, decoder_layer)

            dim = dim_out
            has_cond_across_layers.append(has_cond)

        # add a final norm just before quantization layer

        self.encoder_layers.append(
            Sequential(
                Rearrange("b c ... -> b ... c"),
                nn.LayerNorm(dim),
                Rearrange("b ... c -> b c ..."),
            )
        )

        self.time_downsample_factor = time_downsample_factor
        self.time_padding = time_downsample_factor - 1

        self.fmap_size = layer_fmap_size

        # use a MLP stem for conditioning, if needed

        self.has_cond_across_layers = has_cond_across_layers
        self.has_cond = any(has_cond_across_layers)

        self.encoder_cond_in = nn.Identity()
        self.decoder_cond_in = nn.Identity()

        # The stem must exist whenever ANY layer is conditional. Testing the
        # loop variable `has_cond` would only look at the last layer (and would
        # be undefined for an empty `layers` tuple).
        if self.has_cond:
            assert exists(
                dim_cond
            ), "dim_cond must be passed into VideoTokenizer, if tokenizer is to be conditioned"

            self.dim_cond = dim_cond

            self.encoder_cond_in = Sequential(
                nn.Linear(dim_cond, int(dim_cond * dim_cond_expansion_factor)), nn.SiLU()
            )

            self.decoder_cond_in = Sequential(
                nn.Linear(dim_cond, int(dim_cond * dim_cond_expansion_factor)), nn.SiLU()
            )

        # quantizer related

        self.use_fsq = use_fsq

        if not use_fsq:
            assert exists(codebook_size) and not exists(
                fsq_levels
            ), "if use_fsq is set to False, `codebook_size` must be set (and not `fsq_levels`)"

            # lookup free quantizer(s) - multiple codebooks is possible
            # each codebook will get its own entropy regularization

            self.quantizers = LFQ(
                dim=dim,
                codebook_size=codebook_size,
                num_codebooks=num_codebooks,
                entropy_loss_weight=lfq_entropy_loss_weight,
                commitment_loss_weight=lfq_commitment_loss_weight,
                diversity_gamma=lfq_diversity_gamma,
            )

        else:
            assert (
                not exists(codebook_size) and exists(fsq_levels)
            ), "if use_fsq is set to True, `fsq_levels` must be set (and not `codebook_size`). the effective codebook size is the cumulative product of all the FSQ levels"

            self.quantizers = FSQ(fsq_levels, dim=dim, num_codebooks=num_codebooks)

        self.quantizer_aux_loss_weight = quantizer_aux_loss_weight

        # dummy loss (also used by the `device` property)

        self.register_buffer("zero", torch.tensor(0.0), persistent=False)

        # perceptual loss related

        use_vgg = channels in {1, 3, 4} and perceptual_loss_weight > 0.0

        self.vgg = None
        self.perceptual_loss_weight = perceptual_loss_weight

        if use_vgg:
            if not exists(vgg):
                vgg = torchvision.models.vgg16(weights=vgg_weights)

                # drop the last two classifier modules, features are compared instead
                vgg.classifier = Sequential(*vgg.classifier[:-2])

            self.vgg = vgg

        self.use_vgg = use_vgg

        # main flag for whether to use GAN at all

        self.use_gan = use_gan

        # discriminator

        discr_kwargs = default(
            discr_kwargs, dict(dim=dim, image_size=image_size, channels=channels, max_dim=512)
        )

        self.discr = Discriminator(**discr_kwargs)

        self.adversarial_loss_weight = adversarial_loss_weight
        self.grad_penalty_loss_weight = grad_penalty_loss_weight

        self.has_gan = use_gan and adversarial_loss_weight > 0.0

        # multi-scale discriminators

        self.has_multiscale_gan = use_gan and multiscale_adversarial_loss_weight > 0.0

        self.multiscale_discrs = ModuleList([*multiscale_discrs])

        self.multiscale_adversarial_loss_weight = multiscale_adversarial_loss_weight

        self.has_multiscale_discrs = (
            use_gan and multiscale_adversarial_loss_weight > 0.0 and len(multiscale_discrs) > 0
        )

    @property
    def device(self):
        """Device of the registered `zero` buffer, used as the model's device."""
        zero_buffer = self.zero
        return zero_buffer.device

    @classmethod
    def init_and_load_from(cls, path, strict=True):
        """Rebuild a tokenizer from a checkpoint written by `save`.

        The pickled constructor arguments stored under the "config" key are
        used to instantiate `cls`, then the weights are loaded via `load`
        (which reads the checkpoint file a second time).

        Args:
            path: checkpoint location.
            strict: forwarded to `load`.
        """
        checkpoint_path = Path(path)
        assert checkpoint_path.exists()

        checkpoint = torch.load(str(checkpoint_path), map_location="cpu")
        assert "config" in checkpoint, "model configs were not found in this saved checkpoint"

        # SECURITY: unpickling executes arbitrary code - only load checkpoints
        # that come from a trusted source.
        init_kwargs = pickle.loads(checkpoint["config"])

        tokenizer = cls(**init_kwargs)
        tokenizer.load(checkpoint_path, strict=strict)
        return tokenizer

    def parameters(self):
        """Return the autoencoder parameters as a list.

        Only the convolutions, encoder / decoder layers, conditioning stems
        and the quantizer are included; the discriminators and the VGG
        network are left out (see `discr_parameters` for the discriminator).
        """
        autoencoder_modules = (
            self.conv_in,
            self.conv_in_first_frame,
            self.conv_out_first_frame,
            self.conv_out,
            self.encoder_layers,
            self.decoder_layers,
            self.encoder_cond_in,
            self.decoder_cond_in,
            self.quantizers,
        )

        return [param for module in autoencoder_modules for param in module.parameters()]

    def discr_parameters(self):
        """Return the parameters of the per-frame discriminator `self.discr`."""
        discriminator = self.discr
        return discriminator.parameters()

    def copy_for_eval(self):
        """Return a deep copy of this model prepared for evaluation.

        The copy has its `discr`, `vgg` and `multiscale_discrs` attributes
        removed (when present), is switched to eval mode and is placed on the
        device this model was on when the method was called.

        The deep copy itself is made on CPU. Because `Module.cpu()` moves the
        module in place, this model is moved back to its original device
        afterwards so that the call has no lasting effect on the source model.
        """
        device = self.device

        try:
            vae_copy = copy.deepcopy(self.cpu())
        finally:
            # undo the in-place move performed by `self.cpu()`
            self.to(device)

        # training-only components are not needed for evaluation
        for attr in ("discr", "vgg", "multiscale_discrs"):
            maybe_del_attr_(vae_copy, attr)

        vae_copy.eval()
        return vae_copy.to(device)

    @remove_vgg
    def state_dict(self, *args, **kwargs):
        """Delegate to `Module.state_dict`.

        Wrapped by `remove_vgg` (defined elsewhere in this file) - presumably
        so that the VGG network is kept out of the state dict; confirm against
        the decorator.
        """
        parent_state_dict = super().state_dict
        return parent_state_dict(*args, **kwargs)

    @remove_vgg
    def load_state_dict(self, *args, **kwargs):
        """Delegate to `Module.load_state_dict`.

        Wrapped by `remove_vgg` (defined elsewhere in this file) - presumably
        so that the VGG network does not take part in loading; confirm against
        the decorator.
        """
        parent_load_state_dict = super().load_state_dict
        return parent_load_state_dict(*args, **kwargs)

    def save(self, path, overwrite=True):
        """Write a checkpoint with the weights, package version and pickled config.

        Args:
            path: destination file.
            overwrite: when False, an existing file at `path` trips the assertion.
        """
        path = Path(path)
        assert overwrite or not path.exists(), f"{str(path)} already exists"

        checkpoint = {
            "model_state_dict": self.state_dict(),
            "version": __version__,
            "config": self._configs,
        }

        torch.save(checkpoint, str(path))

    def load(self, path, strict=True):
        """Load weights from a checkpoint written by `save` into this model.

        The checkpoint is deserialised onto the CPU (matching
        `init_and_load_from`), so a file saved from a GPU can be read on a
        machine without one; `load_state_dict` then copies the tensors into
        the existing parameters on whatever device they live on.

        Args:
            path: checkpoint location; must exist.
            strict: forwarded to `load_state_dict`.
        """
        path = Path(path)
        assert path.exists()

        pkg = torch.load(str(path), map_location="cpu")
        state_dict = pkg.get("model_state_dict")
        version = pkg.get("version")

        assert exists(state_dict)

        if exists(version):
            print(f"loading checkpointed tokenizer from version {version}")

        self.load_state_dict(state_dict, strict=strict)

    @beartype
    def encode(
        self,
        video: Tensor,
        quantize=False,
        cond: Optional[Tensor] = None,
        video_contains_first_frame=True,
    ):
        """Run the input convolution and encoder layers on `video`.

        Args:
            video: input tensor; its dimension 2 is treated as time.
            quantize: when True the result is passed through
                `self.quantizers` and whatever that returns is returned.
            cond: conditioning tensor of shape `(batch, dim_cond)`; required
                when the model has conditional layers.
            video_contains_first_frame: when True the time axis is left-padded
                with `self.time_padding` zero frames before encoding.

        Returns:
            The encoded features, or the quantizer output if `quantize` is set.
        """
        first_frame_is_separate = (
            self.separate_first_frame_encoding and video_contains_first_frame
        )

        # whether to pad video or not

        if video_contains_first_frame:
            num_frames = video.shape[2]

            video = pad_at_dim(video, (self.time_padding, 0), value=0.0, dim=2)

            # sizes used to split the padded video into (padding, first frame, rest)
            video_packed_shape = [
                torch.Size([self.time_padding]),
                torch.Size([]),
                torch.Size([num_frames - 1]),
            ]

        # conditioning, if needed

        assert (
            (not self.has_cond) or exists(cond)
        ), "`cond` must be passed into tokenizer forward method since conditionable layers were specified"

        if exists(cond):
            assert cond.shape == (video.shape[0], self.dim_cond)

            cond = self.encoder_cond_in(cond)
            cond_kwargs = dict(cond=cond)

        # initial conv
        # taking into account whether to encode first frame separately

        if first_frame_is_separate:
            _, first_frame, video = unpack(video, video_packed_shape, "b c * h w")
            first_frame = self.conv_in_first_frame(first_frame)

            video = self.conv_in(video)

            # stitch the first frame back in front and restore the time padding
            video, _ = pack([first_frame, video], "b c * h w")
            video = pad_at_dim(video, (self.time_padding, 0), dim=2)
        else:
            video = self.conv_in(video)

        # encoder layers
        # NOTE(review): `self.encoder_layers` holds one more entry (the final
        # norm appended in __init__) than `self.has_cond_across_layers`, and
        # `zip` stops at the shorter sequence, so that final norm is never
        # applied here. Left untouched because applying it would change the
        # output of already-trained weights - confirm the intended behaviour.

        for layer, layer_has_cond in zip(self.encoder_layers, self.has_cond_across_layers):
            video = layer(video, **(cond_kwargs if layer_has_cond else dict()))

        if quantize:
            return self.quantizers(video)

        return video

    @beartype
    def decode_from_code_indices(
        self, codes: Tensor, cond: Optional[Tensor] = None, video_contains_first_frame=True
    ):
        """Decode integer code indices back to a video.

        Args:
            codes: long / int32 tensor of code indices. A 2-dimensional tensor
                is interpreted as flattened `(frames * fmap * fmap)` ids per
                batch element and reshaped using `self.fmap_size`.
            cond: forwarded to `decode`.
            video_contains_first_frame: forwarded to `decode`.
        """
        assert codes.dtype in (torch.long, torch.int32)

        is_flattened = codes.ndim == 2

        if is_flattened:
            video_code_len = codes.shape[-1]
            assert divisible_by(
                video_code_len, self.fmap_size**2
            ), f"flattened video ids must have a length ({video_code_len}) that is divisible by the fmap size ({self.fmap_size}) squared ({self.fmap_size ** 2})"

            fmap = self.fmap_size
            codes = rearrange(codes, "b (f h w) -> b f h w", h=fmap, w=fmap)

        # look up the quantized vectors for the indices, then decode as usual
        return self.decode(
            self.quantizers.indices_to_codes(codes),
            cond=cond,
            video_contains_first_frame=video_contains_first_frame,
        )

    @beartype
    def decode(
        self, quantized: Tensor, cond: Optional[Tensor] = None, video_contains_first_frame=True
    ):
        """Run the decoder layers and output convolution on `quantized`.

        Args:
            quantized: quantized latent tensor; dimension 0 is the batch.
            cond: conditioning tensor of shape `(batch, dim_cond)`; required
                when the model has conditional layers.
            video_contains_first_frame: when True, the `self.time_padding`
                leading frames added during encoding are removed again.

        Returns:
            The reconstructed video tensor.
        """
        first_frame_is_separate = (
            self.separate_first_frame_encoding and video_contains_first_frame
        )

        batch = quantized.shape[0]

        # conditioning, if needed

        assert (
            (not self.has_cond) or exists(cond)
        ), "`cond` must be passed into tokenizer forward method since conditionable layers were specified"

        if exists(cond):
            assert cond.shape == (batch, self.dim_cond)

            cond = self.decoder_cond_in(cond)
            cond_kwargs = dict(cond=cond)

        # decoder layers (stored mirrored, hence the reversed cond flags)

        x = quantized

        for layer, layer_has_cond in zip(
            self.decoder_layers, reversed(self.has_cond_across_layers)
        ):
            x = layer(x, **(cond_kwargs if layer_has_cond else dict()))

        # to pixels

        if not first_frame_is_separate:
            video = self.conv_out(x)

            # if video were padded, remove padding

            if video_contains_first_frame:
                video = video[:, :, self.time_padding :]

            return video

        # the first real frame sits right after the time padding; it gets its
        # own output convolution, the padding itself is discarded
        first_frame_feats = x[:, :, self.time_padding]
        remaining_feats = x[:, :, (self.time_padding + 1) :]

        out = self.conv_out(remaining_feats)
        outff = self.conv_out_first_frame(first_frame_feats)

        video, _ = pack([outff, out], "b c * h w")
        return video

    @torch.no_grad()
    def tokenize(self, video):
        """Return the code indices for `video` without tracking gradients.

        Note that this switches the model to eval mode and leaves it there.
        """
        self.eval()
        codes = self.forward(video, return_codes=True)
        return codes

    @beartype
    def forward(
        self,
        video_or_images: Tensor,
        cond: Optional[Tensor] = None,
        return_loss=False,
        return_codes=False,
        return_recon=False,
        return_discr_loss=False,
        return_recon_loss_only=False,
        apply_gradient_penalty=True,
        video_contains_first_frame=True,
        adversarial_loss_weight=None,
        multiscale_adversarial_loss_weight=None,
    ):
        """Encode, quantize and decode the input, optionally computing losses.

        Args:
            video_or_images: 5-dimensional video or 4-dimensional image batch
                whose last two dimensions equal `(image_size, image_size)`.
                Images get a singleton time axis inserted at dimension 2.
            cond: conditioning tensor, forwarded to `encode` / `decode`.
            return_loss: return the generator loss and its breakdown.
            return_codes: return the quantizer codes (with the reconstruction
                too if `return_recon` is set).
            return_recon: see `return_codes`.
            return_discr_loss: return the discriminator loss and its breakdown.
            return_recon_loss_only: return only the MSE reconstruction loss
                and the reconstruction (checked before `return_discr_loss`).
            apply_gradient_penalty: add a gradient penalty to the
                discriminator loss.
            video_contains_first_frame: forwarded to `encode` / `decode`;
                forced to True for image input.
            adversarial_loss_weight: per-call override of the generator
                adversarial weight.
            multiscale_adversarial_loss_weight: per-call override of the
                multiscale generator adversarial weight.

        Returns:
            Depending on the flags: `codes`; `(codes, recon_video)`;
            `recon_video`; `(recon_loss, recon_video)`;
            `(total_loss, DiscrLossBreakdown)`; or
            `(total_loss, LossBreakdown)`.
        """
        adversarial_loss_weight = default(adversarial_loss_weight, self.adversarial_loss_weight)
        multiscale_adversarial_loss_weight = default(
            multiscale_adversarial_loss_weight, self.multiscale_adversarial_loss_weight
        )

        # at most one of the mutually exclusive return modes may be requested
        assert (return_loss + return_codes + return_discr_loss) <= 1
        assert video_or_images.ndim in {4, 5}

        assert video_or_images.shape[-2:] == (self.image_size, self.image_size)

        # accept images for image pretraining (curriculum learning from images to video)

        is_image = video_or_images.ndim == 4

        if is_image:
            video = rearrange(video_or_images, "b c ... -> b c 1 ...")
            video_contains_first_frame = True
        else:
            video = video_or_images

        batch, channels, frames = video.shape[:3]

        assert divisible_by(
            frames - int(video_contains_first_frame), self.time_downsample_factor
        ), f"number of frames {frames} minus the first frame ({frames - int(video_contains_first_frame)}) must be divisible by the total downsample factor across time {self.time_downsample_factor}"

        # encoder

        x = self.encode(video, cond=cond, video_contains_first_frame=video_contains_first_frame)

        # lookup free quantization

        if self.use_fsq:
            quantized, codes = self.quantizers(x)

            aux_losses = self.zero
            quantizer_loss_breakdown = None
        else:
            (quantized, codes, aux_losses), quantizer_loss_breakdown = self.quantizers(
                x, return_loss_breakdown=True
            )

        if return_codes and not return_recon:
            return codes

        # decoder

        recon_video = self.decode(
            quantized, cond=cond, video_contains_first_frame=video_contains_first_frame
        )

        if return_codes:
            return codes, recon_video

        # reconstruction loss

        if not (return_loss or return_discr_loss or return_recon_loss_only):
            return recon_video

        recon_loss = F.mse_loss(video, recon_video)

        # for validation, only return recon loss

        if return_recon_loss_only:
            return recon_loss, recon_video

        # gan discriminator loss

        if return_discr_loss:
            assert self.has_gan
            assert exists(self.discr)

            # pick a random frame for image discriminator
            # (top-1 of random noise yields one random frame index per sample)

            frame_indices = torch.randn((batch, frames)).topk(1, dim=-1).indices

            real = pick_video_frame(video, frame_indices)

            if apply_gradient_penalty:
                real = real.requires_grad_()

            fake = pick_video_frame(recon_video, frame_indices)

            real_logits = self.discr(real)
            fake_logits = self.discr(fake.detach())

            discr_loss = hinge_discr_loss(fake_logits, real_logits)

            # multiscale discriminators (these score whole videos)

            multiscale_discr_losses = []

            if self.has_multiscale_discrs:
                for discr in self.multiscale_discrs:
                    multiscale_real_logits = discr(video)
                    multiscale_fake_logits = discr(recon_video.detach())

                    multiscale_discr_loss = hinge_discr_loss(
                        multiscale_fake_logits, multiscale_real_logits
                    )

                    multiscale_discr_losses.append(multiscale_discr_loss)
            else:
                multiscale_discr_losses.append(self.zero)

            # gradient penalty

            if apply_gradient_penalty:
                gradient_penalty_loss = gradient_penalty(real, real_logits)
            else:
                gradient_penalty_loss = self.zero

            # total loss
            # NOTE(review): this uses the stored multiscale weight, not the
            # per-call `multiscale_adversarial_loss_weight` override - confirm
            # that is intended.

            total_loss = (
                discr_loss
                + gradient_penalty_loss * self.grad_penalty_loss_weight
                + sum(multiscale_discr_losses) * self.multiscale_adversarial_loss_weight
            )

            discr_loss_breakdown = DiscrLossBreakdown(
                discr_loss, multiscale_discr_losses, gradient_penalty_loss
            )

            return total_loss, discr_loss_breakdown

        # perceptual loss

        if self.use_vgg:
            frame_indices = torch.randn((batch, frames)).topk(1, dim=-1).indices

            input_vgg_input = pick_video_frame(video, frame_indices)
            recon_vgg_input = pick_video_frame(recon_video, frame_indices)

            # bring the frames to the three channels VGG is fed with

            if channels == 1:
                input_vgg_input = repeat(input_vgg_input, "b 1 h w -> b c h w", c=3)
                recon_vgg_input = repeat(recon_vgg_input, "b 1 h w -> b c h w", c=3)

            elif channels == 4:
                input_vgg_input = input_vgg_input[:, :3]
                recon_vgg_input = recon_vgg_input[:, :3]

            input_vgg_feats = self.vgg(input_vgg_input)
            recon_vgg_feats = self.vgg(recon_vgg_input)

            perceptual_loss = F.mse_loss(input_vgg_feats, recon_vgg_feats)
        else:
            perceptual_loss = self.zero

        # get gradient with respect to perceptual loss for last decoder layer
        # needed for adaptive weighting

        last_dec_layer = self.conv_out.conv.weight

        norm_grad_wrt_perceptual_loss = None

        if self.training and self.use_vgg and (self.has_gan or self.has_multiscale_discrs):
            norm_grad_wrt_perceptual_loss = grad_layer_wrt_loss(
                perceptual_loss, last_dec_layer
            ).norm(p=2)

        # per-frame image discriminator

        if self.has_gan:
            frame_indices = torch.randn((batch, frames)).topk(1, dim=-1).indices
            recon_video_frames = pick_video_frame(recon_video, frame_indices)

            fake_logits = self.discr(recon_video_frames)
            gen_loss = hinge_gen_loss(fake_logits)

            adaptive_weight = 1.0

            if exists(norm_grad_wrt_perceptual_loss):
                norm_grad_wrt_gen_loss = grad_layer_wrt_loss(gen_loss, last_dec_layer).norm(p=2)
                adaptive_weight = norm_grad_wrt_perceptual_loss / norm_grad_wrt_gen_loss.clamp(
                    min=1e-3
                )
                adaptive_weight.clamp_(max=1e3)

                if torch.isnan(adaptive_weight).any():
                    adaptive_weight = 1.0
        else:
            gen_loss = self.zero
            adaptive_weight = 0.0

        # multiscale discriminator losses

        multiscale_gen_losses = []
        multiscale_gen_adaptive_weights = []

        if self.has_multiscale_gan and self.has_multiscale_discrs:
            for discr in self.multiscale_discrs:
                # Score the reconstruction with the discriminator itself, on the
                # whole video - the same kind of input these discriminators get
                # in the discriminator-loss branch. (Feeding the raw frame pixels
                # into the hinge loss would give the generator no adversarial
                # signal from the multiscale discriminators.)
                fake_logits = discr(recon_video)
                multiscale_gen_loss = hinge_gen_loss(fake_logits)

                multiscale_gen_losses.append(multiscale_gen_loss)

                multiscale_adaptive_weight = 1.0

                if exists(norm_grad_wrt_perceptual_loss):
                    norm_grad_wrt_gen_loss = grad_layer_wrt_loss(
                        multiscale_gen_loss, last_dec_layer
                    ).norm(p=2)
                    multiscale_adaptive_weight = (
                        norm_grad_wrt_perceptual_loss / norm_grad_wrt_gen_loss.clamp(min=1e-5)
                    )
                    multiscale_adaptive_weight.clamp_(max=1e3)

                multiscale_gen_adaptive_weights.append(multiscale_adaptive_weight)

        # calculate total loss

        total_loss = (
            recon_loss
            + aux_losses * self.quantizer_aux_loss_weight
            + perceptual_loss * self.perceptual_loss_weight
            + gen_loss * adaptive_weight * adversarial_loss_weight
        )

        if self.has_multiscale_discrs:
            weighted_multiscale_gen_losses = sum(
                loss * weight
                for loss, weight in zip(multiscale_gen_losses, multiscale_gen_adaptive_weights)
            )

            total_loss = (
                total_loss + weighted_multiscale_gen_losses * multiscale_adversarial_loss_weight
            )

        # loss breakdown

        loss_breakdown = LossBreakdown(
            recon_loss,
            aux_losses,
            quantizer_loss_breakdown,
            perceptual_loss,
            gen_loss,
            adaptive_weight,
            multiscale_gen_losses,
            multiscale_gen_adaptive_weights,
        )

        return total_loss, loss_breakdown


# main class


class MagViT2(Module):
    """Placeholder module: it owns no layers and its forward pass is the identity."""

    def __init__(self):
        """Initialise the underlying `Module`; nothing else is set up."""
        super().__init__()

    def forward(self, x):
        """Return `x` unchanged."""
        return x