diff --git a/sat/configs/cogvideox_5b.yaml b/sat/configs/cogvideox_5b.yaml
index 6805dec..22ba694 100644
--- a/sat/configs/cogvideox_5b.yaml
+++ b/sat/configs/cogvideox_5b.yaml
@@ -1,5 +1,5 @@
 model:
-  scale_factor: 0.7 # different from cogvideox_2b_infer.yaml
+  scale_factor: 0.7
   disable_first_stage_autocast: true
   log_keys:
     - txt
diff --git a/sat/configs/cogvideox_5b_i2v.yaml b/sat/configs/cogvideox_5b_i2v.yaml
new file mode 100644
index 0000000..4baf963
--- /dev/null
+++ b/sat/configs/cogvideox_5b_i2v.yaml
@@ -0,0 +1,159 @@
+model:
+  scale_factor: 0.7
+  disable_first_stage_autocast: true
+  latent_input: false
+  noised_image_input: true
+  noised_image_dropout: 0.05
+  log_keys:
+    - txt
+
+  denoiser_config:
+    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+    params:
+      num_idx: 1000
+      quantize_c_noise: False
+
+      weighting_config:
+        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+      scaling_config:
+        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
+      discretization_config:
+        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+        params:
+          shift_scale: 1.0 # different from cogvideox_2b_infer.yaml
+
+  network_config:
+    target: dit_video_concat.DiffusionTransformer
+    params:
+      time_embed_dim: 512
+      elementwise_affine: True
+      num_frames: 49
+      time_compressed_rate: 4
+      latent_width: 90
+      latent_height: 60
+      num_layers: 42
+      patch_size: 2
+      in_channels: 32 # different from cogvideox_5b_infer.yaml
+      out_channels: 16
+      hidden_size: 3072
+      adm_in_channels: 256
+      num_attention_heads: 48
+
+      transformer_args:
+        checkpoint_activations: True
+        vocab_size: 1
+        max_sequence_length: 64
+        layernorm_order: pre
+        skip_init: false
+        model_parallel_size: 1
+        is_decoder: false
+
+      modules:
+        pos_embed_config:
+          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
+          params:
+            learnable_pos_embed: True
+            hidden_size_head: 64
+            text_length: 226
+
+        patch_embed_config:
+          target: dit_video_concat.ImagePatchEmbeddingMixin
+          params:
+            text_hidden_size: 4096
+
+        adaln_layer_config:
+          target: dit_video_concat.AdaLNMixin
+          params:
+            qk_ln: True
+
+        final_layer_config:
+          target: dit_video_concat.FinalLayerMixin
+
+  conditioner_config:
+    target: sgm.modules.GeneralConditioner
+    params:
+      emb_models:
+        - is_trainable: false
+          input_key: txt
+          ucg_rate: 0.1
+          target: sgm.modules.encoders.modules.FrozenT5Embedder
+          params:
+            model_dir: "t5-v1_1-xxl"
+            max_length: 226
+
+  first_stage_config:
+    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
+    params:
+      cp_size: 1
+      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
+      ignore_keys: ['loss']
+
+      loss_config:
+        target: torch.nn.Identity
+
+      regularizer_config:
+        target: vae_modules.regularizers.DiagonalGaussianRegularizer
+
+      encoder_config:
+        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
+        params:
+          double_z: true
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 2, 4]
+          attn_resolutions: []
+          num_res_blocks: 3
+          dropout: 0.0
+          gather_norm: True
+
+      decoder_config:
+        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
+        params:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 2, 4]
+          attn_resolutions: []
+          num_res_blocks: 3
+          dropout: 0.0
+          gather_norm: True
+
+  loss_fn_config:
+    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
+    params:
+      fixed_frames: 0
+      offset_noise_level: 0
+      sigma_sampler_config:
+        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
+        params:
+          uniform_sampling: True
+          num_idx: 1000
+          discretization_config:
+            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+            params:
+              shift_scale: 1.0
+
+  sampler_config:
+    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
+    params:
+      fixed_frames: 0
+      num_steps: 50
+      verbose: True
+
+      discretization_config:
+        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+        params:
+          shift_scale: 1.0
+
+      guider_config:
+        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
+        params:
+          scale: 6
+          exp: 5
+          num_steps: 50
\ No newline at end of file
diff --git a/sat/configs/cogvideox_5b_i2v_lora.yaml b/sat/configs/cogvideox_5b_i2v_lora.yaml
new file mode 100644
index 0000000..e36aee7
--- /dev/null
+++ b/sat/configs/cogvideox_5b_i2v_lora.yaml
@@ -0,0 +1,165 @@
+model:
+  scale_factor: 0.7
+  disable_first_stage_autocast: true
+  latent_input: false
+  noised_image_input: true
+  noised_image_dropout: 0.05
+  not_trainable_prefixes: ['all'] # using LoRA
+  log_keys:
+    - txt
+
+  denoiser_config:
+    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+    params:
+      num_idx: 1000
+      quantize_c_noise: False
+
+      weighting_config:
+        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+      scaling_config:
+        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
+      discretization_config:
+        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+        params:
+          shift_scale: 1.0 # different from cogvideox_2b_infer.yaml
+
+  network_config:
+    target: dit_video_concat.DiffusionTransformer
+    params:
+      time_embed_dim: 512
+      elementwise_affine: True
+      num_frames: 49
+      time_compressed_rate: 4
+      latent_width: 90
+      latent_height: 60
+      num_layers: 42
+      patch_size: 2
+      in_channels: 32
+      out_channels: 16
+      hidden_size: 3072
+      adm_in_channels: 256
+      num_attention_heads: 48
+
+      transformer_args:
+        checkpoint_activations: True
+        vocab_size: 1
+        max_sequence_length: 64
+        layernorm_order: pre
+        skip_init: false
+        model_parallel_size: 1
+        is_decoder: false
+
+      modules:
+        pos_embed_config:
+          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
+          params:
+            learnable_pos_embed: True
+            hidden_size_head: 64
+            text_length: 226
+
+        lora_config:
+          target: sat.model.finetune.lora2.LoraMixin
+          params:
+            r: 256
+
+        patch_embed_config:
+          target: dit_video_concat.ImagePatchEmbeddingMixin
+          params:
+            text_hidden_size: 4096
+
+        adaln_layer_config:
+          target: dit_video_concat.AdaLNMixin
+          params:
+            qk_ln: True
+
+        final_layer_config:
+          target: dit_video_concat.FinalLayerMixin
+
+  conditioner_config:
+    target: sgm.modules.GeneralConditioner
+    params:
+      emb_models:
+        - is_trainable: false
+          input_key: txt
+          ucg_rate: 0.1
+          target: sgm.modules.encoders.modules.FrozenT5Embedder
+          params:
+            model_dir: "t5-v1_1-xxl"
+            max_length: 226
+
+  first_stage_config:
+    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
+    params:
+      cp_size: 1
+      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
+      ignore_keys: [ 'loss' ]
+
+      loss_config:
+        target: torch.nn.Identity
+
+      regularizer_config:
+        target: vae_modules.regularizers.DiagonalGaussianRegularizer
+
+      encoder_config:
+        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
+        params:
+          double_z: true
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1, 2, 2, 4 ]
+          attn_resolutions: [ ]
+          num_res_blocks: 3
+          dropout: 0.0
+          gather_norm: True
+
+      decoder_config:
+        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
+        params:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1, 2, 2, 4 ]
+          attn_resolutions: [ ]
+          num_res_blocks: 3
+          dropout: 0.0
+          gather_norm: True
+
+  loss_fn_config:
+    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
+    params:
+      fixed_frames: 0
+      offset_noise_level: 0
+      sigma_sampler_config:
+        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
+        params:
+          uniform_sampling: True
+          num_idx: 1000
+          discretization_config:
+            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+            params:
+              shift_scale: 1.0
+
+  sampler_config:
+    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
+    params:
+      fixed_frames: 0
+      num_steps: 50
+      verbose: True
+
+      discretization_config:
+        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+        params:
+          shift_scale: 1.0
+
+      guider_config:
+        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
+        params:
+          scale: 6
+          exp: 5
+          num_steps: 50
\ No newline at end of file
diff --git a/sat/configs/inference.yaml b/sat/configs/inference.yaml
index a745639..a93bb99 100644
--- a/sat/configs/inference.yaml
+++ b/sat/configs/inference.yaml
@@ -1,15 +1,16 @@
 args:
+  image2video: False # True for image2video, False for text2video
   latent_channels: 16
   mode: inference
   load: "{your CogVideoX SAT folder}/transformer" # This is for Full model without lora adapter
   # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # This is for the model with lora adapter

-  batch_size: 1
   input_type: txt
   input_file: configs/test.txt
+  sampling_image_size: [480, 720]
   sampling_num_frames: 13 # Must be 13, 11 or 9
   sampling_fps: 8
-  fp16: True # For CogVideoX-2B
-#  bf16: True # For CogVideoX-5B
+#  fp16: True # For CogVideoX-2B
+  bf16: True # For CogVideoX-5B and CogVideoX-5B-I2V
   output_dir: outputs/
   force_inference: True
\ No newline at end of file
diff --git a/sat/configs/sft.yaml b/sat/configs/sft.yaml
index bbdf1a7..971c521 100644
--- a/sat/configs/sft.yaml
+++ b/sat/configs/sft.yaml
@@ -1,15 +1,15 @@
 args:
-  checkpoint_activations: True ## using gradient checkpointing
+  checkpoint_activations: True # using gradient checkpointing
   model_parallel_size: 1
   experiment_name: lora-disney
   mode: finetune
-  load: "cogvideox-2b-sat/transformer"
+  load: "{your CogVideoX SAT folder}/transformer"
   no_load_rng: True
   train_iters: 1000 # Suggest more than 1000 for LoRA; for SFT, 500 is enough
   eval_iters: 1
   eval_interval: 100
   eval_batch_size: 1
-  save: ckpts_2b_lora
+  save: ckpts_5b_lora
   save_interval: 500
   log_interval: 20
   train_data: [ "disney" ] # Train data path
@@ -28,7 +28,7 @@ data:
     skip_frms_num: 3.

 deepspeed:
-  # Minimun for 16 videos per batch for ALL GPUs, This setting is for 8 x A100 GPUs
+  # Minimum of 16 videos per batch across ALL GPUs; this setting is for 8 x A100 GPUs
   train_micro_batch_size_per_gpu: 2
   gradient_accumulation_steps: 1
   steps_per_print: 50
@@ -44,9 +44,9 @@ deepspeed:
   load_from_fp32_weights: false
   zero_allow_untested_optimizer: true
   bf16:
-    enabled: False # For CogVideoX-2B Turn to False and For CogVideoX-5B Turn to True
+    enabled: True # For CogVideoX-2B turn to False; for CogVideoX-5B turn to True
   fp16:
-    enabled: True # For CogVideoX-2B Turn to True and For CogVideoX-5B Turn to False
+    enabled: False # For CogVideoX-2B turn to True; for CogVideoX-5B turn to False
     loss_scale: 0
     loss_scale_window: 400
     hysteresis: 2
@@ -55,7 +55,7 @@ deepspeed:
   optimizer:
     type: sat.ops.FusedEmaAdam
     params:
-      lr: 0.001 # Between 1E-3 and 5E-4 For Lora and 1E-5 For SFT
+      lr: 0.00001 # Between 1E-3 and 5E-4 for LoRA; 1E-5 for SFT
      betas: [ 0.9, 0.95 ]
      eps: 1e-8
      weight_decay: 1e-4
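
Note on the new I2V configs above: in_channels grows from 16 to 32 while out_channels stays 16 because, with noised_image_input: true, the VAE latent of the conditioning frame is channel-concatenated with the noised video latent before patch embedding, and noised_image_dropout: 0.05 occasionally zeroes that latent so classifier-free guidance keeps an unconditional branch. The PyTorch sketch below shows only the shape bookkeeping under these assumptions (the zero-padding over time mirrors common I2V pipelines); it is illustrative, not the repository's actual forward pass.

import torch

# Shape sketch (assumed layout, for illustration): how in_channels: 32 arises
# in the I2V transformer while the VAE latent itself stays at 16 channels.
B, C, T, H, W = 1, 16, 13, 60, 90            # 13 latent frames <-> 49 pixel frames
noised_video = torch.randn(B, C, T, H, W)    # x_t from the diffusion process
image_latent = torch.randn(B, C, 1, H, W)    # VAE-encoded conditioning frame

if torch.rand(()) < 0.05:                    # noised_image_dropout: 0.05 (training-time)
    image_latent = torch.zeros_like(image_latent)

pad = torch.zeros(B, C, T - 1, H, W)         # later frames carry no image signal
image_latent = torch.cat([image_latent, pad], dim=2)        # pad over time
dit_input = torch.cat([noised_video, image_latent], dim=1)  # channel concat
print(dit_input.shape)  # torch.Size([1, 32, 13, 60, 90]) -> in_channels: 32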
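The sampling_image_size and sampling_num_frames fields added to inference.yaml line up with the transformer's latent grid. A quick consistency check; spatial_compression = 8 is an assumption read off the depth of the VAE's ch_mult, not a value in the diff:

# Consistency check for the new inference.yaml fields.
sampling_image_size = (480, 720)   # pixels (height, width)
sampling_num_frames = 13           # latent frames; "Must be 13, 11 or 9"
time_compressed_rate = 4           # temporal compression of the 3D VAE
spatial_compression = 8            # assumed: three downsample stages from ch_mult: [1, 2, 2, 4]

pixel_frames = (sampling_num_frames - 1) * time_compressed_rate + 1
latent_h = sampling_image_size[0] // spatial_compression
latent_w = sampling_image_size[1] // spatial_compression
print(pixel_frames, latent_h, latent_w)  # 49 60 90 -> num_frames: 49, latent_height: 60, latent_width: 90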
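On cogvideox_5b_i2v_lora.yaml: not_trainable_prefixes: ['all'] freezes every base parameter, and LoraMixin injects rank-256 adapters that remain trainable. For orientation, a textbook LoRA sketch; this is not sat.model.finetune.lora2.LoraMixin, and the alpha scaling is an assumption:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Textbook LoRA wrapper: frozen base weight plus trainable low-rank delta."""
    def __init__(self, base: nn.Linear, r: int = 256, alpha: float = 1.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():   # mirrors not_trainable_prefixes: ['all']
            p.requires_grad_(False)
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: delta starts at 0
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scaling

layer = LoRALinear(nn.Linear(3072, 3072), r=256)  # hidden_size: 3072 in the configs
print(layer(torch.randn(2, 3072)).shape)          # torch.Size([2, 3072])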