From 35383e2db32f9786e6953ac92aac61684b762cfa Mon Sep 17 00:00:00 2001 From: Zheng Guang Cong Date: Sat, 11 Jan 2025 17:08:25 +0800 Subject: [PATCH 1/3] fix potential bug of i2v Image value is in [0, 255] and should be transformed into [-1, 1], similar to video. --- finetune/datasets/i2v_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/datasets/i2v_dataset.py b/finetune/datasets/i2v_dataset.py index b26bb7f..76663a3 100644 --- a/finetune/datasets/i2v_dataset.py +++ b/finetune/datasets/i2v_dataset.py @@ -153,7 +153,7 @@ class BaseI2VDataset(Dataset): # Add image into the first frame. # Note, **this operation maybe model-specific**, and maybe change in the future. - frames = torch.cat([image.unsqueeze(0), frames], dim=0) + frames = torch.cat([self.image_transform(image).unsqueeze(0), frames], dim=0) # Convert to [B, C, F, H, W] frames = frames.unsqueeze(0) From cd861bbe1e4a78efa487db509789d80d7846aecd Mon Sep 17 00:00:00 2001 From: Zheng Guang Cong Date: Sat, 11 Jan 2025 17:24:35 +0800 Subject: [PATCH 2/3] Update i2v_dataset.py image should also be transformed to [-1, 1] --- finetune/datasets/i2v_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/finetune/datasets/i2v_dataset.py b/finetune/datasets/i2v_dataset.py index 76663a3..451611e 100644 --- a/finetune/datasets/i2v_dataset.py +++ b/finetune/datasets/i2v_dataset.py @@ -148,12 +148,13 @@ class BaseI2VDataset(Dataset): frames, image = self.preprocess(video, image) frames = frames.to(self.device) image = image.to(self.device) + image = self.image_transform(image) # Current shape of frames: [F, C, H, W] frames = self.video_transform(frames) # Add image into the first frame. # Note, **this operation maybe model-specific**, and maybe change in the future. 
- frames = torch.cat([self.image_transform(image).unsqueeze(0), frames], dim=0) + frames = torch.cat([image.unsqueeze(0), frames], dim=0) # Convert to [B, C, F, H, W] frames = frames.unsqueeze(0) From 09a49d35466f479949ba2525234758e3342e4e07 Mon Sep 17 00:00:00 2001 From: Zheng Guang Cong Date: Sat, 11 Jan 2025 17:29:27 +0800 Subject: [PATCH 3/3] fix bug of i2v; video is already 0-255 The video tensor is already in the 0-255 range and should not be multiplied by 255 again. --- finetune/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/trainer.py b/finetune/trainer.py index 53fc193..d1eaab5 100644 --- a/finetune/trainer.py +++ b/finetune/trainer.py @@ -526,7 +526,7 @@ class Trainer: video, self.state.train_frames, self.state.train_height, self.state.train_width ) # Convert video tensor (F, C, H, W) to list of PIL images - video = (video * 255).round().clamp(0, 255).to(torch.uint8) + video = video.round().clamp(0, 255).to(torch.uint8) video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video] logger.debug(