add pipeline

2025-04-05 03:04:56 +08:00 · 2025-01-12 19:27:21 +08:00 · 2025-01-12 19:27:21 +08:00 · 1534bf33eb
commit 1534bf33eb
parent 86a0226f80
14 changed files with 117 additions and 78 deletions
--- a/finetune/README.md
+++ b/finetune/README.md
@ -4,33 +4,34 @@

 [日本語で読む](./README_ja.md)

-If you're looking for the fine-tuning instructions for the SAT version, please check [here](../sat/README_zh.md). The dataset format for this version differs from the one used here.
+If you're looking for the fine-tuning instructions for the SAT version, please check [here](../sat/README_zh.md). The
+dataset format for this version differs from the one used here.

 ## Hardware Requirements

-| Model | Training Type | Distribution Strategy | Mixed Precision | Training Resolution (FxHxW) | Hardware Requirements |
-|----------------------------|-----------------|------------------------------------|-----------------|--------------------------------------------|---------------------|
-| cogvideox-t2v-2b | lora (rank128) | DDP | fp16 | 49x480x720 | 16GB VRAM (NVIDIA 4080) |
-| cogvideox-{t2v, i2v}-5b | lora (rank128) | DDP | bf16 | 49x480x720 | 24GB VRAM (NVIDIA 4090) |
-| cogvideox1.5-{t2v, i2v}-5b | lora (rank128) | DDP | bf16 | 81x768x1360 | 35GB VRAM (NVIDIA A100) |
-| cogvideox-t2v-2b | sft | DDP | fp16 | 49x480x720 | 36GB VRAM (NVIDIA A100) |
-| cogvideox-t2v-2b | sft | 1-GPU zero-2 + opt offload | fp16 | 49x480x720 | 17GB VRAM (NVIDIA 4090) |
-| cogvideox-t2v-2b | sft | 8-GPU zero-2 | fp16 | 49x480x720 | 17GB VRAM (NVIDIA 4090) |
-| cogvideox-t2v-2b | sft | 8-GPU zero-3 | fp16 | 49x480x720 | 19GB VRAM (NVIDIA 4090) |
-| cogvideox-t2v-2b | sft | 8-GPU zero-3 + opt and param offload | bf16 | 49x480x720 | 14GB VRAM (NVIDIA 4080) |
-| cogvideox-{t2v, i2v}-5b | sft | 1-GPU zero-2 + opt offload | bf16 | 49x480x720 | 42GB VRAM (NVIDIA A100) |
-| cogvideox-{t2v, i2v}-5b | sft | 8-GPU zero-2 | bf16 | 49x480x720 | 42GB VRAM (NVIDIA 4090) |
-| cogvideox-{t2v, i2v}-5b | sft | 8-GPU zero-3 | bf16 | 49x480x720 | 43GB VRAM (NVIDIA 4090) |
-| cogvideox-{t2v, i2v}-5b | sft | 8-GPU zero-3 + opt and param offload | bf16 | 49x480x720 | 28GB VRAM (NVIDIA 5090) |
-| cogvideox1.5-{t2v, i2v}-5b | sft | 1-GPU zero-2 + opt offload | bf16 | 81x768x1360 | 56GB VRAM (NVIDIA A100) |
-| cogvideox1.5-{t2v, i2v}-5b | sft | 8-GPU zero-2 | bf16 | 81x768x1360 | 55GB VRAM (NVIDIA A100) |
-| cogvideox1.5-{t2v, i2v}-5b | sft | 8-GPU zero-3 | bf16 | 81x768x1360 | 55GB VRAM (NVIDIA A100) |
-| cogvideox1.5-{t2v, i2v}-5b | sft | 8-GPU zero-3 + opt and param offload | bf16 | 81x768x1360 | 40GB VRAM (NVIDIA A100) |
-
+| Model                      | Training Type  | Distribution Strategy                | Mixed Precision | Training Resolution (FxHxW) | Hardware Requirements   |
+|----------------------------|----------------|--------------------------------------|-----------------|-----------------------------|-------------------------|
+| cogvideox-t2v-2b           | lora (rank128) | DDP                                  | fp16            | 49x480x720                  | 16GB VRAM (NVIDIA 4080) |
+| cogvideox-{t2v, i2v}-5b    | lora (rank128) | DDP                                  | bf16            | 49x480x720                  | 24GB VRAM (NVIDIA 4090) |
+| cogvideox1.5-{t2v, i2v}-5b | lora (rank128) | DDP                                  | bf16            | 81x768x1360                 | 35GB VRAM (NVIDIA A100) |
+| cogvideox-t2v-2b           | sft            | DDP                                  | fp16            | 49x480x720                  | 36GB VRAM (NVIDIA A100) |
+| cogvideox-t2v-2b           | sft            | 1-GPU zero-2 + opt offload           | fp16            | 49x480x720                  | 17GB VRAM (NVIDIA 4090) |
+| cogvideox-t2v-2b           | sft            | 8-GPU zero-2                         | fp16            | 49x480x720                  | 17GB VRAM (NVIDIA 4090) |
+| cogvideox-t2v-2b           | sft            | 8-GPU zero-3                         | fp16            | 49x480x720                  | 19GB VRAM (NVIDIA 4090) |
+| cogvideox-t2v-2b           | sft            | 8-GPU zero-3 + opt and param offload | bf16            | 49x480x720                  | 14GB VRAM (NVIDIA 4080) |
+| cogvideox-{t2v, i2v}-5b    | sft            | 1-GPU zero-2 + opt offload           | bf16            | 49x480x720                  | 42GB VRAM (NVIDIA A100) |
+| cogvideox-{t2v, i2v}-5b    | sft            | 8-GPU zero-2                         | bf16            | 49x480x720                  | 42GB VRAM (NVIDIA 4090) |
+| cogvideox-{t2v, i2v}-5b    | sft            | 8-GPU zero-3                         | bf16            | 49x480x720                  | 43GB VRAM (NVIDIA 4090) |
+| cogvideox-{t2v, i2v}-5b    | sft            | 8-GPU zero-3 + opt and param offload | bf16            | 49x480x720                  | 28GB VRAM (NVIDIA 5090) |
+| cogvideox1.5-{t2v, i2v}-5b | sft            | 1-GPU zero-2 + opt offload           | bf16            | 81x768x1360                 | 56GB VRAM (NVIDIA A100) |
+| cogvideox1.5-{t2v, i2v}-5b | sft            | 8-GPU zero-2                         | bf16            | 81x768x1360                 | 55GB VRAM (NVIDIA A100) |
+| cogvideox1.5-{t2v, i2v}-5b | sft            | 8-GPU zero-3                         | bf16            | 81x768x1360                 | 55GB VRAM (NVIDIA A100) |
+| cogvideox1.5-{t2v, i2v}-5b | sft            | 8-GPU zero-3 + opt and param offload | bf16            | 81x768x1360                 | 40GB VRAM (NVIDIA A100) |

 ## Install Dependencies

-Since the relevant code has not yet been merged into the official `diffusers` release, you need to fine-tune based on the diffusers branch. Follow the steps below to install the dependencies:
+Since the relevant code has not yet been merged into the official `diffusers` release, you need to fine-tune based on
+the diffusers branch. Follow the steps below to install the dependencies:

 ```shell
 git clone https://github.com/huggingface/diffusers.git
@ -40,7 +41,8 @@ pip install -e .

 ## Prepare the Dataset

-First, you need to prepare your dataset. Depending on your task type (T2V or I2V), the dataset format will vary slightly:
+First, you need to prepare your dataset. Depending on your task type (T2V or I2V), the dataset format will vary
+slightly:

 ```
 .
@ -52,15 +54,18 @@ First, you need to prepare your dataset. Depending on your task type (T2V or I2V
 ```

 Where:
+
 - `prompts.txt`: Contains the prompts
 - `videos/`: Contains the .mp4 video files
 - `videos.txt`: Contains the list of video files in the `videos/` directory
 - `images/`: (Optional) Contains the .png reference image files
 - `images.txt`: (Optional) Contains the list of reference image files

-You can download a sample dataset (T2V) [Disney Steamboat Willie](https://huggingface.co/datasets/Wild-Heart/Disney-VideoGeneration-Dataset).
+You can download a sample dataset (T2V):
+[Disney Steamboat Willie](https://huggingface.co/datasets/Wild-Heart/Disney-VideoGeneration-Dataset).

-If you need to use a validation dataset during training, make sure to provide a validation dataset with the same format as the training dataset.
+If you need to use a validation dataset during training, make sure to provide a validation dataset with the same format
+as the training dataset.

 ## Running Scripts to Start Fine-tuning

@ -68,11 +73,15 @@ Before starting training, please note the following resolution requirements:

 1. The number of frames must be a multiple of 8 **plus 1** (i.e., 8N+1), such as 49, 81 ...
 2. Recommended video resolutions for each model:
-   - CogVideoX: 480x720 (height x width)
-   - CogVideoX1.5: 768x1360 (height x width)
-3. For samples (videos or images) that don't match the training resolution, the code will directly resize them. This may cause aspect ratio distortion and affect training results. It's recommended to preprocess your samples (e.g., using crop + resize to maintain aspect ratio) before training.
+    - CogVideoX: 480x720 (height x width)
+    - CogVideoX1.5: 768x1360 (height x width)
+3. For samples (videos or images) that don't match the training resolution, the code will directly resize them. This may
+   cause aspect ratio distortion and affect training results. It's recommended to preprocess your samples (e.g., using
+   crop + resize to maintain aspect ratio) before training.

-> **Important Note**: To improve training efficiency, we automatically encode videos and cache the results on disk before training. If you modify the data after training, please delete the latent directory under the video directory to ensure the latest data is used.
+> **Important Note**: To improve training efficiency, we automatically encode videos and cache the results on disk
+> before training. If you modify the data after training, please delete the latent directory under the video directory to
+> ensure the latest data is used.

 ### LoRA

@ -93,7 +102,8 @@ bash train_ddp_i2v.sh  # Image-to-Video (I2V) fine-tuning

 ### SFT

-We provide several zero configuration templates in the `configs/` directory. Please choose the appropriate training configuration based on your needs (configure the `deepspeed_config_file` option in `accelerate_config.yaml`).
+We provide several zero configuration templates in the `configs/` directory. Please choose the appropriate training
+configuration based on your needs (configure the `deepspeed_config_file` option in `accelerate_config.yaml`).

 ```bash
 # Parameters to configure are the same as LoRA training
@ -102,11 +112,15 @@ bash train_zero_t2v.sh  # Text-to-Video (T2V) fine-tuning
 bash train_zero_i2v.sh  # Image-to-Video (I2V) fine-tuning
 ```

-In addition to setting the bash script parameters, you need to set the relevant training options in the zero configuration file and ensure the zero training configuration matches the parameters in the bash script, such as batch_size, gradient_accumulation_steps, mixed_precision. For details, please refer to the [DeepSpeed official documentation](https://www.deepspeed.ai/docs/config-json/)
+In addition to setting the bash script parameters, you need to set the relevant training options in the zero
+configuration file and ensure the zero training configuration matches the parameters in the bash script, such as
+batch_size, gradient_accumulation_steps, mixed_precision. For details, please refer to
+the [DeepSpeed official documentation](https://www.deepspeed.ai/docs/config-json/).

 When using SFT training, please note:

-1. For SFT training, model offload is not used during validation, so the peak VRAM usage may exceed 24GB. For GPUs with less than 24GB VRAM, it's recommended to disable validation.
+1. For SFT training, model offload is not used during validation, so the peak VRAM usage may exceed 24GB. For GPUs with
+   less than 24GB VRAM, it's recommended to disable validation.

 2. Validation is slow when zero-3 is enabled, so it's recommended to disable validation when using zero-3.

@ -114,12 +128,19 @@ When using SFT training, please note:

 + Please refer to [cli_demo.py](../inference/cli_demo.py) for instructions on how to load the fine-tuned model.

-+ For SFT trained models, please first use the `zero_to_fp32.py` script in the `checkpoint-*/` directory to merge the model weights
++ For SFT trained models, please first use the `zero_to_fp32.py` script in the `checkpoint-*/` directory to merge the
+  model weights.

 ## Best Practices

-+ We included 70 training videos with a resolution of `200 x 480 x 720` (frames x height x width). Through frame skipping in the data preprocessing, we created two smaller datasets with 49 and 16 frames to speed up experiments. The maximum frame count recommended by the CogVideoX team is 49 frames. These 70 videos were divided into three groups: 10, 25, and 50 videos, with similar conceptual nature.
++ We included 70 training videos with a resolution of `200 x 480 x 720` (frames x height x width). Through frame
+  skipping in the data preprocessing, we created two smaller datasets with 49 and 16 frames to speed up experiments. The
+  maximum frame count recommended by the CogVideoX team is 49 frames. These 70 videos were divided into three groups:
+  10, 25, and 50 videos, with similar conceptual nature.
 + Videos with 25 or more frames work best for training new concepts and styles.
-+ It's recommended to use an identifier token, which can be specified using `--id_token`, for better training results. This is similar to Dreambooth training, though regular fine-tuning without using this token will still work.
-+ The original repository uses `lora_alpha` set to 1. We found that this value performed poorly in several runs, possibly due to differences in the model backend and training settings. Our recommendation is to set `lora_alpha` to be equal to the rank or `rank // 2`.
++ It's recommended to use an identifier token, which can be specified using `--id_token`, for better training results.
+  This is similar to Dreambooth training, though regular fine-tuning without using this token will still work.
++ The original repository uses `lora_alpha` set to 1. We found that this value performed poorly in several runs,
+  possibly due to differences in the model backend and training settings. Our recommendation is to set `lora_alpha` to
+  be equal to the rank or `rank // 2`.
 + It's advised to use a rank of 64 or higher.
--- a/finetune/accelerate_config.yaml
+++ b/finetune/accelerate_config.yaml
@ -5,7 +5,7 @@ num_processes: 8  # should be the same as the number of GPUs

 debug: false
 deepspeed_config:
-  deepspeed_config_file: /home/lhy/code/CogVideo/finetune/configs/zero2.yaml  # e.g. /home/user/cogvideo/finetune/configs/zero2.yaml
+  deepspeed_config_file: configs/zero2.yaml  # replace with an absolute path, e.g. /home/user/cogvideo/finetune/configs/zero2.yaml
  zero3_init_flag: false
 distributed_type: DEEPSPEED
 downcast_bf16: 'no'
--- a/finetune/models/cogvideox1_5_i2v/lora_trainer.py
+++ b/finetune/models/cogvideox1_5_i2v/lora_trainer.py
@ -0,0 +1,9 @@
+from ..cogvideox_i2v.lora_trainer import CogVideoXI2VLoraTrainer
+from ..utils import register
+
+
+class CogVideoX1_5I2VLoraTrainer(CogVideoXI2VLoraTrainer):
+    pass
+
+
+register("cogvideox1.5-i2v", "lora", CogVideoX1_5I2VLoraTrainer)
--- a/finetune/models/cogvideox1_5_i2v/sft_trainer.py
+++ b/finetune/models/cogvideox1_5_i2v/sft_trainer.py
@ -0,0 +1,9 @@
+from ..cogvideox_i2v.sft_trainer import CogVideoXI2VSftTrainer
+from ..utils import register
+
+
+class CogVideoX1_5I2VSftTrainer(CogVideoXI2VSftTrainer):
+    pass
+
+
+register("cogvideox1.5-i2v", "sft", CogVideoX1_5I2VSftTrainer)
--- a/finetune/models/cogvideox1_5_t2v/lora_trainer.py
+++ b/finetune/models/cogvideox1_5_t2v/lora_trainer.py
@ -0,0 +1,9 @@
+from ..cogvideox_t2v.lora_trainer import CogVideoXT2VLoraTrainer
+from ..utils import register
+
+
+class CogVideoX1_5T2VLoraTrainer(CogVideoXT2VLoraTrainer):
+    pass
+
+
+register("cogvideox1.5-t2v", "lora", CogVideoX1_5T2VLoraTrainer)
--- a/finetune/models/cogvideox1_5_t2v/sft_trainer.py
+++ b/finetune/models/cogvideox1_5_t2v/sft_trainer.py
@ -0,0 +1,9 @@
+from ..cogvideox_t2v.sft_trainer import CogVideoXT2VSftTrainer
+from ..utils import register
+
+
+class CogVideoX1_5T2VSftTrainer(CogVideoXT2VSftTrainer):
+    pass
+
+
+register("cogvideox1.5-t2v", "sft", CogVideoX1_5T2VSftTrainer)
--- a/finetune/models/cogvideox1dot5_i2v/lora_trainer.py
+++ b/finetune/models/cogvideox1dot5_i2v/lora_trainer.py
@ -1,9 +0,0 @@
-from ..cogvideox_i2v.lora_trainer import CogVideoXI2VLoraTrainer
-from ..utils import register
-
-
-class CogVideoX1dot5I2VLoraTrainer(CogVideoXI2VLoraTrainer):
-    pass
-
-
-register("cogvideox1.5-i2v", "lora", CogVideoX1dot5I2VLoraTrainer)
--- a/finetune/models/cogvideox1dot5_i2v/sft_trainer.py
+++ b/finetune/models/cogvideox1dot5_i2v/sft_trainer.py
@ -1,9 +0,0 @@
-from ..cogvideox_i2v.sft_trainer import CogVideoXI2VSftTrainer
-from ..utils import register
-
-
-class CogVideoX1dot5I2VSftTrainer(CogVideoXI2VSftTrainer):
-    pass
-
-
-register("cogvideox1.5-i2v", "sft", CogVideoX1dot5I2VSftTrainer)
--- a/finetune/models/cogvideox1dot5_t2v/lora_trainer.py
+++ b/finetune/models/cogvideox1dot5_t2v/lora_trainer.py
@ -1,9 +0,0 @@
-from ..cogvideox_t2v.lora_trainer import CogVideoXT2VLoraTrainer
-from ..utils import register
-
-
-class CogVideoX1dot5T2VLoraTrainer(CogVideoXT2VLoraTrainer):
-    pass
-
-
-register("cogvideox1.5-t2v", "lora", CogVideoX1dot5T2VLoraTrainer)
--- a/finetune/models/cogvideox1dot5_t2v/sft_trainer.py
+++ b/finetune/models/cogvideox1dot5_t2v/sft_trainer.py
@ -1,9 +0,0 @@
-from ..cogvideox_t2v.sft_trainer import CogVideoXT2VSftTrainer
-from ..utils import register
-
-
-class CogVideoX1dot5T2VSftTrainer(CogVideoXT2VSftTrainer):
-    pass
-
-
-register("cogvideox1.5-t2v", "sft", CogVideoX1dot5T2VSftTrainer)
--- a/finetune/models/cogvideox_i2v/lora_trainer.py
+++ b/finetune/models/cogvideox_i2v/lora_trainer.py
@ -9,6 +9,7 @@ from diffusers import (
 )
 from diffusers.models.embeddings import get_3d_rotary_pos_embed
 from PIL import Image
+from numpy import dtype
 from transformers import AutoTokenizer, T5EncoderModel
 from typing_extensions import override

@ -116,7 +117,7 @@ class CogVideoXI2VLoraTrainer(Trainer):

        # Get prompt embeddings
        _, seq_len, _ = prompt_embedding.shape
-        prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1)
+        prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent.dtype)

        # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W]
        images = images.unsqueeze(2)
@ -166,7 +167,7 @@ class CogVideoXI2VLoraTrainer(Trainer):
            else None
        )

-        # Predict noise
+        # Predict noise (the ofs embedding below is for CogVideoX1.5 only)
        ofs_emb = (
            None if self.state.transformer_config.ofs_embed_dim is None else latent.new_full((1,), fill_value=2.0)
        )
--- a/finetune/models/cogvideox_t2v/lora_trainer.py
+++ b/finetune/models/cogvideox_t2v/lora_trainer.py
@ -111,7 +111,7 @@ class CogVideoXT2VLoraTrainer(Trainer):

        # Get prompt embeddings
        _, seq_len, _ = prompt_embedding.shape
-        prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1)
+        prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent.dtype)

        # Sample a random timestep for each sample
        timesteps = torch.randint(
@ -197,7 +197,7 @@ class CogVideoXT2VLoraTrainer(Trainer):
            base_num_frames = num_frames
        else:
            base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t
-
+        breakpoint()
        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
            embed_dim=transformer_config.attention_head_dim,
            crops_coords=None,
--- a/finetune/schemas/args.py
+++ b/finetune/schemas/args.py
@ -78,7 +78,7 @@ class Args(BaseModel):

    ########## Validation ##########
    do_validation: bool = False
-    validation_steps: int | None = None  # if set, should be a multiple of checkpointing_steps
+    validation_steps: int | None  # if set, should be a multiple of checkpointing_steps
    validation_dir: Path | None  # if set do_validation, should not be None
    validation_prompts: str | None  # if set do_validation, should not be None
    validation_images: str | None  # if set do_validation and model_type == i2v, should not be None
--- a/finetune/trainer.py
+++ b/finetune/trainer.py
@ -758,12 +758,29 @@ class Trainer:
        self.accelerator.register_save_state_pre_hook(save_model_hook)
        self.accelerator.register_load_state_pre_hook(load_model_hook)

+    # def __maybe_save_checkpoint(self, global_step: int, must_save: bool = False):
+    #     if self.accelerator.distributed_type == DistributedType.DEEPSPEED or self.accelerator.is_main_process:
+    #         if must_save or global_step % self.args.checkpointing_steps == 0:
+    #             save_path = get_intermediate_ckpt_path(
+    #                 checkpointing_limit=self.args.checkpointing_limit,
+    #                 step=global_step,
+    #                 output_dir=self.args.output_dir,
+    #             )
+    #         self.accelerator.save_state(save_path, safe_serialization=True)
+
    def __maybe_save_checkpoint(self, global_step: int, must_save: bool = False):
        if self.accelerator.distributed_type == DistributedType.DEEPSPEED or self.accelerator.is_main_process:
            if must_save or global_step % self.args.checkpointing_steps == 0:
+                # for training
                save_path = get_intermediate_ckpt_path(
                    checkpointing_limit=self.args.checkpointing_limit,
                    step=global_step,
                    output_dir=self.args.output_dir,
                )
-                self.accelerator.save_state(save_path)
+                self.accelerator.save_state(save_path, safe_serialization=True)
+                pipe = self.initialize_pipeline()
+                pipe_save_path = Path(self.args.output_dir) / f"checkpoint-pipeline-{global_step}"
+                pipe_save_path.mkdir(parents=True, exist_ok=True)
+                pipe.save_pretrained(pipe_save_path)
+                del pipe
+                torch.cuda.empty_cache()