model: scale_factor: 0.7 # different from cogvideox_2b_infer.yaml disable_first_stage_autocast: true not_trainable_prefixes: ['all'] # Using Lora log_keys: - txt denoiser_config: target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser params: num_idx: 1000 quantize_c_noise: False weighting_config: target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting scaling_config: target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling discretization_config: target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization params: shift_scale: 1.0 # different from cogvideox_2b_infer.yaml network_config: target: dit_video_concat.DiffusionTransformer params: time_embed_dim: 512 elementwise_affine: True num_frames: 49 time_compressed_rate: 4 latent_width: 90 latent_height: 60 num_layers: 42 # different from cogvideox_2b_infer.yaml patch_size: 2 in_channels: 16 out_channels: 16 hidden_size: 3072 # different from cogvideox_2b_infer.yaml adm_in_channels: 256 num_attention_heads: 48 # different from cogvideox_2b_infer.yaml transformer_args: checkpoint_activations: True vocab_size: 1 max_sequence_length: 64 layernorm_order: pre skip_init: false model_parallel_size: 1 is_decoder: false modules: pos_embed_config: target: dit_video_concat.Rotary3DPositionEmbeddingMixin # different from cogvideox_2b_infer.yaml params: hidden_size_head: 64 text_length: 226 lora_config: # Using Lora target: sat.model.finetune.lora2.LoraMixin params: r: 128 patch_embed_config: target: dit_video_concat.ImagePatchEmbeddingMixin params: text_hidden_size: 4096 adaln_layer_config: target: dit_video_concat.AdaLNMixin params: qk_ln: True final_layer_config: target: dit_video_concat.FinalLayerMixin conditioner_config: target: sgm.modules.GeneralConditioner params: emb_models: - is_trainable: false input_key: txt ucg_rate: 0.1 target: sgm.modules.encoders.modules.FrozenT5Embedder params: model_dir: "t5-v1_1-xxl" max_length: 226 first_stage_config: target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper params: cp_size: 1 ckpt_path: "cogvideox-5b-sat/vae/3d-vae.pt" ignore_keys: [ 'loss' ] loss_config: target: torch.nn.Identity regularizer_config: target: vae_modules.regularizers.DiagonalGaussianRegularizer encoder_config: target: vae_modules.cp_enc_dec.ContextParallelEncoder3D params: double_z: true z_channels: 16 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: [ 1, 2, 2, 4 ] attn_resolutions: [ ] num_res_blocks: 3 dropout: 0.0 gather_norm: True decoder_config: target: vae_modules.cp_enc_dec.ContextParallelDecoder3D params: double_z: True z_channels: 16 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: [ 1, 2, 2, 4 ] attn_resolutions: [ ] num_res_blocks: 3 dropout: 0.0 gather_norm: False loss_fn_config: target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss params: offset_noise_level: 0 sigma_sampler_config: target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling params: uniform_sampling: True num_idx: 1000 discretization_config: target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization params: shift_scale: 1.0 # different from cogvideox_2b_infer.yaml sampler_config: target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler params: num_steps: 50 verbose: True discretization_config: target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization params: shift_scale: 1.0 guider_config: target: sgm.modules.diffusionmodules.guiders.DynamicCFG params: scale: 6 exp: 5 num_steps: 50