diff --git a/finetune/datasets/i2v_dataset.py b/finetune/datasets/i2v_dataset.py index b26bb7f..bde0caa 100644 --- a/finetune/datasets/i2v_dataset.py +++ b/finetune/datasets/i2v_dataset.py @@ -139,7 +139,6 @@ class BaseI2VDataset(Dataset): logger.info(f"Saved prompt embedding to {prompt_embedding_path}", main_process_only=False) if encoded_video_path.exists(): - # encoded_video = torch.load(encoded_video_path, weights_only=True) encoded_video = load_file(encoded_video_path)["encoded_video"] logger.debug(f"Loaded encoded video from {encoded_video_path}", main_process_only=False) # shape of image: [C, H, W] @@ -151,10 +150,6 @@ class BaseI2VDataset(Dataset): # Current shape of frames: [F, C, H, W] frames = self.video_transform(frames) - # Add image into the first frame. - # Note, **this operation maybe model-specific**, and maybe change in the future. - frames = torch.cat([image.unsqueeze(0), frames], dim=0) - # Convert to [B, C, F, H, W] frames = frames.unsqueeze(0) frames = frames.permute(0, 2, 1, 3, 4).contiguous()