From 7bc152ff350eed9967206a681032abe38b7b8ca7 Mon Sep 17 00:00:00 2001
From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com>
Date: Wed, 11 Feb 2026 19:35:55 +0530
Subject: [PATCH] Fix incorrect frame padding formula in lora trainers

The ncopy calculation used `latent.shape[2] % patch_size_t`, which computes
the remainder rather than the number of frames needed to reach alignment. For
example, with shape[2]=13 and patch_size_t=4, this gives ncopy=1, resulting in
14 frames, which is still not divisible by 4, causing the assertion to fail.

The correct formula is
`(patch_size_t - latent.shape[2] % patch_size_t) % patch_size_t`, which
computes how many frames must be prepended to reach the next multiple of
patch_size_t. The outer modulo handles the already-aligned case (it returns 0
instead of patch_size_t).

Fixes #782
---
 finetune/models/cogvideox_i2v/lora_trainer.py | 2 +-
 finetune/models/cogvideox_t2v/lora_trainer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/finetune/models/cogvideox_i2v/lora_trainer.py b/finetune/models/cogvideox_i2v/lora_trainer.py
index 793cf76..f830c08 100644
--- a/finetune/models/cogvideox_i2v/lora_trainer.py
+++ b/finetune/models/cogvideox_i2v/lora_trainer.py
@@ -115,7 +115,7 @@ class CogVideoXI2VLoraTrainer(Trainer):
 
         patch_size_t = self.state.transformer_config.patch_size_t
         if patch_size_t is not None:
-            ncopy = latent.shape[2] % patch_size_t
+            ncopy = (patch_size_t - latent.shape[2] % patch_size_t) % patch_size_t
             # Copy the first frame ncopy times to match patch_size_t
             first_frame = latent[:, :, :1, :, :]  # Get first frame [B, C, 1, H, W]
             latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2)

diff --git a/finetune/models/cogvideox_t2v/lora_trainer.py b/finetune/models/cogvideox_t2v/lora_trainer.py
index 5f0ec1c..410d4de 100644
--- a/finetune/models/cogvideox_t2v/lora_trainer.py
+++ b/finetune/models/cogvideox_t2v/lora_trainer.py
@@ -109,7 +109,7 @@ class CogVideoXT2VLoraTrainer(Trainer):
 
         patch_size_t = self.state.transformer_config.patch_size_t
         if patch_size_t is not None:
-            ncopy = latent.shape[2] % patch_size_t
+            ncopy = (patch_size_t - latent.shape[2] % patch_size_t) % patch_size_t
             # Copy the first frame ncopy times to match patch_size_t
             first_frame = latent[:, :, :1, :, :]  # Get first frame [B, C, 1, H, W]
             latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2)
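
For reference, a minimal standalone sketch of the padding arithmetic the fix relies on (not part of the patch itself; `ncopy_old` and `ncopy_new` are hypothetical helper names used only for this illustration):

```python
# With 13 frames and patch_size_t=4, the old expression pads by the remainder (1),
# giving 14 frames, which still fails the divisibility check; the new expression
# pads up to the next multiple of patch_size_t (3, giving 16 frames).

def ncopy_old(num_frames: int, patch_size_t: int) -> int:
    # Old (buggy) formula: remainder of the frame count.
    return num_frames % patch_size_t

def ncopy_new(num_frames: int, patch_size_t: int) -> int:
    # Fixed formula: frames needed to reach the next multiple of patch_size_t,
    # with the outer modulo returning 0 when already aligned.
    return (patch_size_t - num_frames % patch_size_t) % patch_size_t

for frames in (12, 13, 14, 16):
    old, new = ncopy_old(frames, 4), ncopy_new(frames, 4)
    print(frames, old, (frames + old) % 4 == 0, new, (frames + new) % 4 == 0)
# 12 -> old pads 0 (aligned), new pads 0 (aligned)
# 13 -> old pads 1 (14, still misaligned), new pads 3 (16, aligned)
```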