# CogVideo/sat/configs/cogvideox1.5_5b_i2v.yaml
# Yuxuan Zhang 39c6562dc8 format
# 2025-03-22 15:14:06 +08:00
#
# 160 lines
# 4.3 KiB
# YAML

# Model configuration. Every `*_config` entry names a class through `target`
# (a dotted import path) and may carry a `params` mapping for that class.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been lost, following the target/params pattern -- verify the
# hierarchy against the consumer of this file before relying on it.
model:
  scale_factor: 0.7
  disable_first_stage_autocast: true
  latent_input: false
  noised_image_input: true
  noised_image_all_concat: false
  noised_image_dropout: 0.05
  augmentation_dropout: 0.15
  log_keys:
    - txt

  # Denoiser and its weighting / scaling / discretization helpers.
  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: false

      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization

  # Diffusion transformer backbone.
  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      ofs_embed_dim: 512
      time_embed_dim: 512
      elementwise_affine: true
      num_frames: 81  # 81 for 5-second clips; use 161 for 10-second clips
      time_compressed_rate: 4
      latent_width: 300
      latent_height: 300
      num_layers: 42
      patch_size: [2, 2, 2]
      # NOTE(review): in_channels is twice out_channels; presumably related to
      # noised_image_input above -- confirm against the network code.
      in_channels: 32
      out_channels: 16
      hidden_size: 3072
      adm_in_channels: 256
      num_attention_heads: 48

      transformer_args:
        checkpoint_activations: true
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      # Mixin classes attached to the transformer.
      modules:
        pos_embed_config:
          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
          params:
            hidden_size_head: 64
            # Same value as the text encoder's max_length below.
            text_length: 224

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: true

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  # Conditioning: a single, non-trainable text embedder reading the `txt` key.
  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "google/t5-v1_1-xxl"
            # Same value as pos_embed_config.params.text_length above.
            max_length: 224

  # First stage (video autoencoder) with its encoder/decoder definitions.
  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
      ignore_keys: ['loss']

      loss_config:
        target: torch.nn.Identity

      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer

      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: true

      # Decoder params mirror the encoder params key for key.
      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: true

  # Training loss and the sigma sampler it uses.
  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      fixed_frames: 0
      offset_noise_level: 0.0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: true
          group_num: 40
          # Same value as denoiser_config.params.num_idx.
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization

  # Sampler with its discretization and guidance settings.
  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      fixed_frames: 0
      num_steps: 50
      verbose: true

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          # Same value as the sampler's num_steps above.
          num_steps: 50