GPT-SoVITS/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py

"""
Modified From https://github.com/XXXXRT666/GPT-SoVITS
"""

from __future__ import annotations

import math
import os
import pickle
import platform
import time
from abc import ABC, abstractmethod
from contextlib import nullcontext
from pathlib import Path
from queue import Queue
from typing import Literal, MutableSequence

import torch
import torch._inductor.config
import torch.nn.functional as F
from torch.cuda.graphs import CUDAGraph
from torch.profiler import ExecutionTraceObserver, ProfilerAction, tensorboard_trace_handler

from tools.my_utils import get_machine_id

from . import nn
from .quantization import replace_all_linear_with_fp8
from .structs import KVCacheProtocol, T2SDecoderProtocol, T2SSession

# Shorthand alias for ``torch.Tensor`` used in the annotations of this module.
Tensor = torch.Tensor


class TokenEmbedding(nn.Module):
    """Token-id to vector lookup backed by a single ``nn.Embedding`` table."""

    def __init__(
        self,
        embedding_dim: int,
        vocab_size: int,
    ):
        """
        Args:
            embedding_dim (int): width of every embedding vector.
            vocab_size (int): number of rows in the lookup table.
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size

        # Lookup table with ``vocab_size`` rows of ``embedding_dim`` values each.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

    @property
    def weight(self) -> Tensor:
        """The weight matrix of the wrapped embedding layer."""
        table = self.word_embeddings
        return table.weight

    def embedding(self, index: int) -> Tensor:
        """Return the slice ``weight[index : index + 1]`` (the row keeps a leading axis)."""
        row = slice(index, index + 1)
        return self.weight[row]

    def __call__(self, x: Tensor):
        """Embed every token id contained in ``x``."""
        return self.word_embeddings(x)


class SinePositionalEmbedding(nn.Module):
    """Sinusoidal positional encoding with a pre-computed, per-batch-slot table.

    The table ``pe`` has shape ``(max_batch_size, max_seq_length, embedding_dim)``
    and is stored as a non-persistent buffer.
    """

    def __init__(
        self,
        embedding_dim: int,
        scale: bool = False,
        alpha: bool = False,
        max_batch_size: int = 10,
        max_seq_length: int = 1500,
    ):
        """
        Args:
            embedding_dim (int): width of the encoding vectors.
            scale (bool): multiply the input by ``sqrt(embedding_dim)`` when True.
            alpha (bool): whether the ``alpha`` mixing weight requires gradients.
            max_batch_size (int): number of batch slots in the table.
            max_seq_length (int): number of positions in the table.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.max_batch_size = max_batch_size
        self.max_seq_length = max_seq_length

        # Input scaling factor; 1.0 leaves the input untouched.
        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
        # Weight applied to the positional term (initialised to one).
        self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)

        # When True the positions are laid out from last to first.
        self.reverse = False

        table = torch.zeros(max_batch_size, max_seq_length, embedding_dim)
        self.register_buffer("pe", table, persistent=False)
        self.pe: torch.Tensor
        self.compute_pe()

    def compute_pe(self):
        """Fill the ``pe`` buffer in place with sine (even) and cosine (odd) channels."""
        seq_len = self.max_seq_length
        if self.reverse:
            position = torch.arange(seq_len - 1, -1, -1.0, dtype=torch.float32)
        else:
            position = torch.arange(seq_len, dtype=torch.float32)
        position = position.unsqueeze(1)

        channel_ids = torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
        div_term = torch.exp(channel_ids * -(math.log(10000.0) / self.embedding_dim))

        # (seq_len, embedding_dim / 2); broadcast over every batch slot on assignment.
        angles = position * div_term
        self.pe[:, :, 0::2] = torch.sin(angles)
        self.pe[:, :, 1::2] = torch.cos(angles)

    def __call__(self, input_pos: Tensor, x: Tensor) -> Tensor:
        """
        Add the encoding of one position per batch row.

        Args:
            input_pos (Tensor): [batch_size, ]
            x (Tensor): [batch_size, 1, embed_dim]

        Returns:
            embedded_x (Tensor): [batch_size, 1, embed_dim]
        """

        rows = torch.arange(x.shape[0])
        # ``input_pos`` is 1-based here: position p reads table row p - 1.
        pe_values = self.pe[rows, input_pos - 1]  # (batch_size, embed_dim)
        positional = self.alpha * pe_values.unsqueeze(1)

        return x * self.x_scale + positional  # (batch_size, 1, embed_dim)

    def prefill(self, x: Tensor) -> Tensor:
        """
        Add the encodings of positions ``0 .. seq_len - 1`` to a whole sequence.

        Args:
            x (Tensor): [batch_size, seq_len, embed_dim]

        Returns:
            embedded_x (Tensor): [batch_size, seq_len, embed_dim]
        """

        n_rows, n_steps = x.shape[0], x.shape[-2]
        pe_values = self.pe[:n_rows, :n_steps]
        return x * self.x_scale + self.alpha * pe_values


class KVCacheABC(nn.Module, ABC, KVCacheProtocol):
    """Abstract key/value cache; subclasses allocate ``k_cache``/``v_cache`` and define their layout."""

    def __init__(self, batch_size: int, max_seq_length: int, n_heads: int, head_dim: int) -> None:
        """Record the cache dimensions; the buffers themselves are created by subclasses."""
        super().__init__()

        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        self.n_head = n_heads
        self.head_dim = head_dim

        # Declared for type checkers only; no tensor is allocated here.
        self.k_cache: Tensor
        self.v_cache: Tensor

    def empty(self):
        """Zero both cache tensors in place."""
        for buffer in (self.k_cache, self.v_cache):
            buffer.zero_()

    @abstractmethod
    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor, *args, **kwds) -> tuple[Tensor, Tensor]:
        """Store new key/value entries for ``input_pos`` and return the cache tensors."""
        ...

    @abstractmethod
    def prefill_kv(self, k_val: Tensor, v_val: Tensor) -> None:
        """Store the keys/values of a whole prompt."""
        ...

    def sync_cache(self, kv_cache: KVCacheProtocol):
        """Copy the contents of ``kv_cache`` into this cache in place."""
        self.k_cache.copy_(kv_cache.k_cache)
        self.v_cache.copy_(kv_cache.v_cache)


class KVCacheNHD(KVCacheABC):
    """KV cache stored as ``(batch, seq, head, head_dim)``."""

    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
        super().__init__(batch_size, max_seq_length, n_heads, head_dim)

        assert batch_size > 0
        cache_shape = (batch_size, max_seq_length, n_heads, head_dim)

        # Non-persistent buffers: they follow ``.to()`` but stay out of the state dict.
        for buffer_name in ("k_cache", "v_cache"):
            self.register_buffer(buffer_name, torch.zeros(size=cache_shape), persistent=False)

    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
        """Scatter one new key/value step per batch row into the sequence axis (dim 1).

        ``input_pos`` is treated as 1-based: the entry is written at ``input_pos - 1``.
        """
        # input_pos: [B, ], k_val: [B, 1, H, D]

        slot = (input_pos - 1)[..., None, None, None]
        index = slot.expand(-1, -1, self.n_head, self.head_dim).to(torch.int64)  # (bs, 1, num_head, head_dim)

        keys, values = self.k_cache, self.v_cache
        keys.scatter_(1, index, k_val)
        values.scatter_(1, index, v_val)

        return keys, values

    def empty(self):
        """Zero both cache tensors in place."""
        for buffer in (self.k_cache, self.v_cache):
            buffer.zero_()

    def prefill_kv(self, k_val: Tensor, v_val: Tensor):
        """Write a whole prompt into the first ``S`` sequence slots."""
        # k_val: [B, S, H, D]

        k_len = k_val.shape[1]
        v_len = v_val.shape[1]
        self.k_cache[:, :k_len] = k_val
        self.v_cache[:, :v_len] = v_val


class KVCacheHND(KVCacheABC):
    """KV cache stored as ``(batch, head, seq, head_dim)``."""

    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
        super().__init__(batch_size, max_seq_length, n_heads, head_dim)

        cache_shape = (batch_size, n_heads, max_seq_length, head_dim)

        # Non-persistent buffers: they follow ``.to()`` but stay out of the state dict.
        for buffer_name in ("k_cache", "v_cache"):
            self.register_buffer(buffer_name, torch.zeros(size=cache_shape), persistent=False)

    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
        """Scatter one new key/value step per batch row into the sequence axis (dim 2).

        ``input_pos`` is treated as 1-based: the entry is written at ``input_pos - 1``.
        """
        # input_pos: [B, ], k_val: [B, H, 1, D]

        slot = (input_pos - 1)[..., None, None, None]
        index = slot.expand(-1, self.n_head, -1, self.head_dim).to(torch.int64)  # (bs, num_head, 1, head_dim)

        keys, values = self.k_cache, self.v_cache
        keys.scatter_(2, index, k_val)
        values.scatter_(2, index, v_val)

        return keys, values

    def empty(self):
        """Zero both cache tensors in place."""
        for buffer in (self.k_cache, self.v_cache):
            buffer.zero_()

    def prefill_kv(self, k_val: Tensor, v_val: Tensor):
        """Write a whole prompt into the first ``S`` sequence slots, swapping to head-major layout."""
        # k_val: [B, S, H, D]

        k_len = k_val.shape[1]
        v_len = v_val.shape[1]
        self.k_cache[..., :k_len, :] = k_val.transpose(1, 2)
        self.v_cache[..., :v_len, :] = v_val.transpose(1, 2)


class KVCacheHNDVarlen(KVCacheABC):
    """``(batch, head, seq, head_dim)`` cache that writes through index assignment.

    A ``cache_idx`` buffer holding ``arange(batch_size)`` selects the batch rows.
    """

    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
        super().__init__(batch_size, max_seq_length, n_heads, head_dim)

        cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
        self.cache_idx: Tensor

        self.register_buffer("cache_idx", torch.arange(batch_size), persistent=False)
        for buffer_name in ("k_cache", "v_cache"):
            self.register_buffer(buffer_name, torch.zeros(size=cache_shape), persistent=False)

    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
        """Write one new key/value step per batch row at slot ``input_pos - 1``."""
        # input_pos: [B, ], k_val: [B, H, 1, D]

        keys, values = self.k_cache, self.v_cache

        write_pos = input_pos - 1

        keys[self.cache_idx, :, write_pos, None] = k_val
        values[self.cache_idx, :, write_pos, None] = v_val

        return keys, values

    def empty(self):
        """Zero both cache tensors in place."""
        for buffer in (self.k_cache, self.v_cache):
            buffer.zero_()

    def prefill_kv(self, k_val: Tensor, v_val: Tensor):
        """Write a whole prompt into the first ``S`` sequence slots, swapping to head-major layout."""
        # k_val: [B, S, H, D]

        k_len = k_val.shape[1]
        v_len = v_val.shape[1]
        self.k_cache[..., :k_len, :] = k_val.transpose(1, 2)
        self.v_cache[..., :v_len, :] = v_val.transpose(1, 2)


class AttentionABC(nn.Module, ABC):
    """Abstract multi-head self-attention; subclasses create the projections and the decode step."""

    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
        """
        Args:
            n_head (int): number of attention heads; must divide ``hidden_dim``.
            hidden_dim (int): model width.
            max_seq_length (int): stored on the instance for subclasses.
        """
        super().__init__()

        assert hidden_dim % n_head == 0
        self.n_head = n_head
        self.hidden_dim = hidden_dim
        self.head_dim = hidden_dim // n_head

        self.max_seq_length = max_seq_length

        # key, query, value projections for all heads, but in a batch
        self.in_proj: nn.Linear
        self.out_proj: nn.Linear

        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
        """Rename ``in_proj_*`` checkpoint entries to ``in_proj.*`` in place."""
        legacy_keys = [name for name in state_dict if "in_proj_" in name]
        for old_key in legacy_keys:
            renamed = old_key.replace("in_proj_", "in_proj.")  # in_proj_ -> in_proj.
            state_dict[renamed] = state_dict.pop(old_key)

    @abstractmethod
    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds) -> Tensor:
        """Attention for a decode step; implemented by subclasses."""
        ...

    def prefill(self, x: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor) -> Tensor:
        """Attend over a full sequence and hand its keys/values to ``kv_cache``.

        Args:
            x (Tensor): [batch_size, seq_len, hidden_dim]
            kv_cache (KVCacheProtocol): receives k/v shaped [batch_size, seq_len, n_head, head_dim].
            attn_mask (Tensor): forwarded to ``F.scaled_dot_product_attention``.

        Returns:
            Tensor: [batch_size, seq_len, hidden_dim]
        """
        bsz, seqlen, _ = x.shape
        per_head_shape = (bsz, seqlen, self.n_head, self.head_dim)

        projected = self.in_proj(x).chunk(3, dim=-1)
        q, k, v = [part.contiguous().view(per_head_shape) for part in projected]

        # The cache gets the (batch, seq, head, dim) layout, before the head transpose.
        kv_cache.prefill_kv(k, v)

        q, k, v = [part.transpose(1, 2) for part in (q, k, v)]

        attn = F.scaled_dot_product_attention(q, k, v, attn_mask)
        merged = attn.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        return self.out_proj(merged)


class FeedForward(nn.Module):
    """Two-layer position-wise MLP: ``linear2(relu(linear1(x)))``."""

    def __init__(self, dim: int, hidden_dim: int) -> None:
        """
        Args:
            dim (int): input and output width.
            hidden_dim (int): width of the intermediate activation.
        """
        super().__init__()

        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)

    def __call__(self, x: Tensor):
        """Apply the MLP to ``x``; the ReLU runs in place on the intermediate tensor."""
        hidden = self.linear1(x)
        hidden = F.relu(hidden, inplace=True)
        return self.linear2(hidden)


class TransformerBlockABC(nn.Module, ABC):
    """Abstract post-norm transformer block: attention and feed-forward, each followed by LayerNorm.

    Subclasses are expected to create ``attention``, ``feed_forward``,
    ``attention_norm`` and ``ffn_norm``; this class only declares them.
    """

    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
        """
        Args:
            n_head (int): not used by this base class itself.
            ffn_dim (int): not used by this base class itself.
            hidden_dim (int): model width, stored on the instance.
            max_seq_length (int): stored on the instance.
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        self.max_seq_length = max_seq_length

        # Declared for type checkers only; subclasses assign the real modules.
        self.attention: AttentionABC
        self.feed_forward: FeedForward
        self.attention_norm: nn.LayerNorm
        self.ffn_norm: nn.LayerNorm

        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
        """Rename legacy checkpoint keys in ``state_dict`` to this module's attribute names.

        Mapping: ``self_attn`` -> ``attention``, ``linear*`` -> ``feed_forward.linear*``,
        ``norm1`` -> ``attention_norm``, ``norm2`` -> ``ffn_norm``.

        The renaming is idempotent: keys that already use the new names are left
        unchanged, so a state dict that was converted before (or that is seen by
        this hook more than once) is not turned into ``feed_forward.feed_forward.*``.
        """
        for key in list(state_dict.keys()):
            new_key = (
                key.replace("self_attn", "attention")
                # Collapse an already-converted name first so that the next
                # replace cannot stack a second "feed_forward." prefix.
                .replace("feed_forward.linear", "linear")
                .replace("linear", "feed_forward.linear")
                .replace("norm1", "attention_norm")
                .replace("norm2", "ffn_norm")
            )
            state_dict[new_key] = state_dict.pop(key)

    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds):
        """Run one decode step: ``ffn_norm(h + ffn(h))`` with ``h = attention_norm(x + attention(x))``.

        Extra positional and keyword arguments are forwarded to ``self.attention``.
        """
        h = self.attention_norm(
            x
            + self.attention(
                x,
                input_pos,
                kv_cache,
                *args,
                **kwds,
            )
        )
        out = self.ffn_norm(h + self.feed_forward(h))
        return out

    def prefill(
        self,
        x: Tensor,
        kv_cache: KVCacheProtocol,
        attn_mask: Tensor,
    ) -> Tensor:
        """Same computation as ``__call__`` but using ``self.attention.prefill`` on a full sequence."""
        h = self.attention_norm(
            x
            + self.attention.prefill(
                x,
                kv_cache,
                attn_mask,
            )
        )
        out = self.ffn_norm(h + self.feed_forward(h))
        return out


class TransformerDecoderABC(nn.Module, ABC):
    """Abstract stack of transformer blocks; subclasses populate ``self.layers``."""

    def __init__(
        self,
        hidden_dim: int,
        n_layer: int,
        n_head: int,
        ffn_dim: int,
        vocab_size: int,
        max_seq_length: int,
        max_batch_size: int,
    ) -> None:
        """Store the model dimensions; ``hidden_dim`` must be divisible by ``n_head``.

        ``ffn_dim`` is accepted but not stored by this base class.
        """
        super().__init__()

        assert hidden_dim % n_head == 0
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        self.head_dim = hidden_dim // n_head

        self.vocab_size = vocab_size
        self.n_layer = n_layer

        # Declared for type checkers only; subclasses assign the real blocks.
        self.layers: MutableSequence[TransformerBlockABC]

        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size

    def __call__(self, input_pos: Tensor, x: Tensor, kv_caches: MutableSequence[KVCacheProtocol], *args, **kwds):
        """Pass ``x`` through every block in order, pairing each block with its own cache."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block(hidden, input_pos, cache, *args, **kwds)
        return hidden

    def prefill(self, x: Tensor, kv_caches: MutableSequence[KVCacheProtocol], attn_mask: Tensor):
        """Pass a full sequence through every block's ``prefill``, pairing blocks with caches."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.prefill(hidden, cache, attn_mask)
        return hidden


class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
    def __init__(
        self,
        config: dict,
        max_seq_length: int = 1500,
        max_batch_size: int = 10,
    ) -> None:
        super().__init__()

        hidden_dim: int = config["model"]["hidden_dim"]
        embedding_dim: int = config["model"]["embedding_dim"]
        n_head: int = config["model"]["head"]
        n_layer: int = config["model"]["n_layer"]
        vocab_size: int = config["model"]["vocab_size"]
        phoneme_vocab_size: int = config["model"]["phoneme_vocab_size"]
        EOS: int = config["model"]["EOS"]
        ffn_dim: int = hidden_dim * 4

        self.n_layer = int(n_layer)
        self.hidden_dim = int(hidden_dim)
        self.n_head = int(n_head)
        assert hidden_dim % n_head == 0

        self.head_dim = int(hidden_dim // n_head)
        self.embedding_dim = int(embedding_dim)
        self.ffn_dim = int(ffn_dim)
        self.vocab_size = int(vocab_size)
        self.phoneme_vocab_size = int(phoneme_vocab_size)
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        self.EOS = EOS
        assert self.EOS == self.vocab_size - 1

        self.bert_proj: nn.Linear
        self.ar_predict_layer: nn.Linear
        self.h: TransformerDecoderABC

        self.kv_class: type[KVCacheABC]

        self.GraphCache: CUDAGraphCacheABC | None
        self.compiled: bool = False

        self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size)
        self.ar_text_position = SinePositionalEmbedding(
            self.embedding_dim,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_length=max_seq_length,
        )
        self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size)
        self.ar_audio_position = SinePositionalEmbedding(
            self.embedding_dim,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_length=max_seq_length,
        )

        self.graph_cache_class: type[CUDAGraphCacheABC]

        self.bits: int
        self.group_size: int

        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
        model_keys = [key for key in state_dict if key.startswith("model.")]
        for key in model_keys:
            new_key = key[len("model.") :]
            state_dict[new_key] = state_dict.pop(key)

    def init_cache(self, bsz: int = 0) -> MutableSequence[KVCacheProtocol]:
        bsz = bsz or self.h.max_batch_size
        assert bsz <= self.h.max_batch_size
        seq_lens = self.h.max_seq_length
        dtype = self.bert_proj.bias.dtype
        kvclass = self.kv_class

        return nn.ModuleList(
            [kvclass(bsz, seq_lens, self.n_head, self.head_dim) for _ in range(self.n_layer)],
        ).to(self.device, dtype)  # type: ignore

    def embed(
        self,
        x: list[torch.Tensor],
        y: torch.Tensor,
        bert_features: list[torch.Tensor],
    ):
        x_len: list[int] = [i.shape[0] for i in x]
        x_len_max = max(x_len)
        xy_pos = torch.zeros((len(x), x_len_max + y.shape[1], self.embedding_dim)).to(bert_features[0].dtype)

        bert_features = list(map(lambda x: x.transpose(0, 1), bert_features))

        y_len = y.shape[1]
        y_emb = self.ar_audio_embedding(y)
        y_pos = self.ar_audio_position.prefill(y_emb)

        for bs, (x_, len_, bert_feature) in enumerate(zip(x, x_len, bert_features)):
            x_emb = self.ar_text_embedding(x_)
            bert = self.bert_proj(bert_feature)
            x_emb = x_emb + bert
            x_pos = self.ar_text_position.prefill(x_emb.unsqueeze(0))
            xy_pos[[bs], :len_] = x_pos
            xy_pos[[bs], len_ : len_ + y_len] = y_pos

        return xy_pos

    def compile(self, *args, **kwds):
        if (
            torch.cuda.is_available()
            and platform.system() != "Windows"
            or platform.system() == "macOS"
            and self.compiled is False
        ):
            cache_path = Path.cwd() / "compile_cache"
            if cache_path.exists() is False:
                cache_path.mkdir(parents=True, exist_ok=True)
            else:
                assert cache_path.is_dir()
                cache_file = (
                    cache_path
                    / f"t2s_decoder_{self.n_layer}_{self.hidden_dim}_{self.n_head}_{self.ffn_dim}_{self.phoneme_vocab_size}_{get_machine_id()}_{torch.__version__}.GSV"
                )
                if cache_file.exists():
                    try:
                        with open(cache_file, "rb") as f:
                            cache_data = pickle.load(f)
                        torch.compiler.load_cache_artifacts(cache_data)
                    except Exception as e:
                        print(f"Failed to resotore compile cache from {cache_file}: {e}")

            # Experimental features to reduce compilation times, will be on by default in future
            torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = True
            torch._inductor.config.coordinate_descent_tuning = True
            torch._inductor.config.triton.unique_kernel_names = True
            torch._inductor.config.fx_graph_cache = True
            torch._inductor.config.triton.cudagraph_trees = True
            torch._inductor.config.triton.cudagraph_support_input_mutation = True
            self.h.compile(fullgraph=True, mode="reduce-overhead")
            self.compiled = True

    def save_compile_cache(self):
        if torch.cuda.is_available() and platform.system() != "Windows" or platform.system() == "macOS":
            cache_path = Path.cwd() / "compile_cache"
            if cache_path.exists() is False:
                cache_path.mkdir(parents=True, exist_ok=True)
            else:
                assert cache_path.is_dir()
                cache_file = (
                    cache_path
                    / f"t2s_decoder_{self.n_layer}_{self.hidden_dim}_{self.n_head}_{self.ffn_dim}_{self.phoneme_vocab_size}_{get_machine_id()}_{torch.__version__}.GSV"
                )
                if cache_file.exists():
                    return

                try:
                    cache = torch.compiler.save_cache_artifacts()
                    assert cache
                    cache_data = cache[0]
                    with open(cache_file, "wb") as f:
                        pickle.dump(cache_data, f)
                except Exception as e:
                    print(f"Failed to save compile cache to {cache_file}: {e}")

    def capture(
        self, input_pos: Tensor, x: Tensor, x_dec: Tensor, kv_caches: MutableSequence[KVCacheProtocol], *args, **kwds
    ) -> CUDAGraph:
        assert torch.cuda.is_available()
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())

        graph = torch.cuda.CUDAGraph()

        with torch.cuda.stream(s):
            for _ in range(5):
                self.h(input_pos, x, kv_caches, *args, **kwds)
        torch.cuda.current_stream().wait_stream(s)

        with torch.cuda.graph(graph):
            x_dec.copy_(self.h(input_pos, x, kv_caches, *args, **kwds))
        torch.cuda.synchronize()

        return graph

    @abstractmethod
    def pre_forward(self, session: T2SSession) -> tuple[list[Tensor], dict[str, Tensor]]:
        return list(), dict()

    @abstractmethod
    def post_forward(self, idx: int, session: T2SSession) -> None:
        return

    def quantize(self, mode: Literal["Int8", "FP8", "FP8_E4M3FN"] | None = None) -> None:
        if mode is None:
            return
        if mode not in {"Int8", "FP8", "FP8_E4M3FN"}:
            raise ValueError(f"Unsupported quantization mode: {mode}")
        match mode:
            case "Int8":
                self.bits = 8
                self.group_size = 32
                import torchao

                torchao.quantization.quantize_(self.h, torchao.quantization.Int8WeightOnlyConfig(self.group_size))

            case "FP8":
                self.bits = 8
                import torchao

                torchao.quantization.quantize_(self.h, torchao.quantization.Float8WeightOnlyConfig())

            case "FP8_E4M3FN":
                self.bits = 8
                replace_all_linear_with_fp8(self.h)

            case _:
                raise ValueError(f"Unsupported Quantization Mode for PyTorch: {mode}")


class CUDAGraphStateABC(ABC):
    """Static buffers plus a captured graph for one batch size.

    The constructor allocates the input/output buffers and KV caches and then
    calls ``capture``, which subclasses implement.
    """

    def __init__(
        self,
        bsz: int,
        decoder: T2SDecoderABC,
    ) -> None:
        """
        Args:
            bsz (int): batch size the buffers are allocated for.
            decoder (T2SDecoderABC): supplies the embedding width, dtype, device and caches.
        """
        self.bsz = bsz
        self.decoder: T2SDecoderABC = decoder

        self.embedding_dim = decoder.embedding_dim
        self.dtype = decoder.bert_proj.bias.dtype
        self.device = decoder.device

        self.graph: torch.cuda.CUDAGraph | None = None
        self.stream: torch.cuda.Stream | None = None

        buffer_shape = (bsz, 1, self.embedding_dim)
        # Random placeholder input, and an output buffer that starts as its copy.
        self.xy_pos = torch.rand(size=buffer_shape, device=self.device).to(self.dtype)
        self.kv_cache: MutableSequence[KVCacheProtocol] = decoder.init_cache(bsz)
        self.xy_dec = self.xy_pos.clone()
        # Placeholder position (10) for every row.
        self.input_pos = torch.full((bsz,), 10, device=self.device, dtype=torch.int32)

        self.capture()

    @abstractmethod
    def capture(self):
        """Capture the graph; implemented by subclasses."""
        ...

    def assign_graph(self, session: T2SSession) -> CUDAGraphStateABC:
        """Attach this state's graph and buffers to ``session`` and pull the session's data in.

        The session's ``input_pos`` values and KV caches are copied into this
        state's tensors; ``session.input_pos`` is then rebound to the state's tensor.

        Returns:
            This state object.
        """
        assert self.graph
        session.graph, session.stream = self.graph, self.stream

        session.xy_pos_, session.xy_dec_ = self.xy_pos, self.xy_dec

        self.input_pos.copy_(session.input_pos)
        session.input_pos = self.input_pos

        for own_cache, session_cache in zip(self.kv_cache, session.kv_cache):
            own_cache.sync_cache(session_cache)

        return self


class CUDAGraphCacheABC(ABC):
    """Pool of captured graph states, kept in one queue per batch size."""

    # Whether graph states may be handed out; not assigned by this base class.
    is_applicable: bool

    def __init__(self, decoder: T2SDecoderABC, cache_size: int = 5) -> None:
        """
        Args:
            decoder (T2SDecoderABC): decoder whose ``max_batch_size`` bounds the pool.
            cache_size (int): stored on the instance for ``create_graph_cache`` implementations.
        """
        self.decoder = decoder
        self.max_batch_size = decoder.max_batch_size
        self.cache_size = cache_size

        self.graph_cache: dict[int, Queue[CUDAGraphStateABC]] = {}

        # Pre-build the batch-size-1 entry when CUDA is usable and the
        # ``CUDAGraph`` environment variable is not "0".
        cuda_usable = torch.cuda.is_available() and torch.version.cuda is not None
        if cuda_usable and os.environ.get("CUDAGraph", "1") != "0":
            self.create_graph_cache(1)

    def __getitem__(self, bsz: int) -> CUDAGraphStateABC:
        """Take a graph state for ``bsz`` from its queue, creating the queue on first use.

        Raises:
            RuntimeError: if ``is_applicable`` is false.
        """
        if not self.is_applicable:
            raise RuntimeError("CUDAGraph Is Not Applicable")

        assert bsz <= self.max_batch_size
        if self.graph_cache.get(bsz) is None:
            self.create_graph_cache(bsz)
        pool = self.graph_cache[bsz]
        return pool.get()

    @abstractmethod
    def create_graph_cache(self, bsz: int):
        """Populate ``self.graph_cache[bsz]``; implemented by subclasses."""
        ...

    def release_graph(self, graph_state: CUDAGraphStateABC | None):
        """Return ``graph_state`` to the queue of its batch size; ``None`` is ignored."""
        if graph_state is None:
            return
        bsz = graph_state.bsz
        assert bsz <= self.max_batch_size
        pool = self.graph_cache.get(bsz)
        assert pool is not None
        pool.put(graph_state)


class TorchProfiler:
    def __init__(self, debug: bool, log_dir: str = "./profiler/torch") -> None:
        self.debug = debug and os.environ.get("TORCH_PROFILER") == "1"
        self.log_dir = log_dir + "/" + str(time.time())
        self.__profiler: torch.profiler.profile

        if self.debug and not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir, exist_ok=True)

        self.tensorboard_handler = tensorboard_trace_handler(self.log_dir)

    def profiler_callback(self, prof: torch.profiler.profile):
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=30))
        self.tensorboard_handler(prof)

    @staticmethod
    def three_step_schedule(step: int) -> ProfilerAction:
        if step == 0:
            return ProfilerAction.NONE
        elif step == 1:
            return ProfilerAction.RECORD
        elif step == 2:
            return ProfilerAction.RECORD_AND_SAVE
        else:
            return ProfilerAction.RECORD_AND_SAVE

    def start(self):
        if not self.debug:
            return
        assert self.__profiler is not None
        self.__profiler.step()

    def end(self):
        if not self.debug:
            return
        assert self.__profiler is not None
        self.__profiler.step()

    def profiler(self):
        if self.debug:
            activities_list = [torch.profiler.ProfilerActivity.CPU]
            if torch.cuda.is_available():
                activities_list.append(torch.profiler.ProfilerActivity.CUDA)

            self.__profiler = torch.profiler.profile(
                activities=activities_list,
                record_shapes=True,
                with_stack=True,
                with_modules=True,
                with_flops=True,
                profile_memory=True,
                schedule=self.three_step_schedule,
                on_trace_ready=self.profiler_callback,
                execution_trace_observer=(
                    ExecutionTraceObserver().register_callback(f"{self.log_dir}/execution_trace.json")
                ),
            )
            return self.__profiler
        else:
            return nullcontext()

    def record(self, func_name: str):
        if self.debug:
            return torch.profiler.record_function(func_name)
        else:
            return nullcontext()