From f6d722cec7a94fcae036549130918beb37fe4330 Mon Sep 17 00:00:00 2001
From: OleehyO <leehy0357@gmail.com>
Date: Thu, 9 Jan 2025 15:52:51 +0000
Subject: [PATCH] fix: stop prepending the conditioning image to the video frames

---
 finetune/datasets/i2v_dataset.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/finetune/datasets/i2v_dataset.py b/finetune/datasets/i2v_dataset.py
index b26bb7f..bde0caa 100644
--- a/finetune/datasets/i2v_dataset.py
+++ b/finetune/datasets/i2v_dataset.py
@@ -139,7 +139,6 @@ class BaseI2VDataset(Dataset):
             logger.info(f"Saved prompt embedding to {prompt_embedding_path}", main_process_only=False)
 
         if encoded_video_path.exists():
-            # encoded_video = torch.load(encoded_video_path, weights_only=True)
             encoded_video = load_file(encoded_video_path)["encoded_video"]
             logger.debug(f"Loaded encoded video from {encoded_video_path}", main_process_only=False)
             # shape of image: [C, H, W]
@@ -151,10 +150,6 @@ class BaseI2VDataset(Dataset):
             # Current shape of frames: [F, C, H, W]
             frames = self.video_transform(frames)
 
-            # Add image into the first frame.
-            # Note, **this operation maybe model-specific**, and maybe change in the future.
-            frames = torch.cat([image.unsqueeze(0), frames], dim=0)
-
             # Convert to [B, C, F, H, W]
             frames = frames.unsqueeze(0)
             frames = frames.permute(0, 2, 1, 3, 4).contiguous()