# Mirror of https://github.com/THUDM/CogVideo.git
# Synced 2025-10-09 20:00:13 +08:00
# 225 lines, 5.7 KiB, YAML
# Run-level arguments for the "lora-disney" finetuning experiment.
args:
  checkpoint_activations: true  # use gradient checkpointing
  model_parallel_size: 1
  experiment_name: lora-disney
  mode: finetune
  load: "CogVideoX-2b-sat/transformer"
  no_load_rng: true
  train_iters: 1000
  eval_iters: 1
  eval_interval: 100
  eval_batch_size: 1
  save: ckpts
  save_interval: 100
  log_interval: 20
  train_data: ["disney"]
  valid_data: ["disney"]
  # Quoted so that every YAML loader keeps this value a string; some
  # resolvers read comma-grouped digits as a number.
  split: "1,0,0"
  num_workers: 8
  force_train: true
  only_log_video_latents: true

# Dataset definition: `target` is the dotted path of the dataset class and
# `params` holds the values listed for it.
data:
  target: data_video.SFTDataset
  params:
    video_size: [480, 720]
    fps: 8
    max_num_frames: 49
    # Kept as a float, as in the original (`3.`); written with an explicit
    # fractional digit so the type is obvious to readers and loaders alike.
    skip_frms_num: 3.0

# DeepSpeed engine settings (batching, ZeRO, mixed precision, optimizer).
# NOTE(review): the source had lost its indentation; nesting below was rebuilt
# from the DeepSpeed configuration schema. Confirm against the upstream file.
deepspeed:
  train_micro_batch_size_per_gpu: 1
  gradient_accumulation_steps: 1
  steps_per_print: 50
  gradient_clipping: 0.1
  zero_optimization:
    stage: 2
    cpu_offload: false
    contiguous_gradients: false
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 1000000000
    allgather_bucket_size: 1000000000
    load_from_fp32_weights: false
  zero_allow_untested_optimizer: true
  bf16:
    enabled: false
  fp16:
    enabled: true
    # NOTE(review): the original nesting of the four loss-scaling keys below
    # could not be recovered from the flattened source. They are placed under
    # fp16 because DeepSpeed documents them as fp16 options; confirm.
    loss_scale: 0
    loss_scale_window: 400
    hysteresis: 2
    min_loss_scale: 1
  optimizer:
    type: sat.ops.FusedEmaAdam
    params:
      lr: 0.0002
      betas: [0.9, 0.95]
      # Written with an explicit fraction: YAML 1.1 loaders such as PyYAML
      # resolve dot-less forms like `1e-8` as strings, not floats.
      eps: 1.0e-8
      weight_decay: 1.0e-4
  activation_checkpointing:
    partition_activations: false
    contiguous_memory_optimization: false
  wall_clock_breakdown: false


# Model definition. Each *_config entry pairs a `target` dotted class path
# with an optional `params` mapping.
# NOTE(review): the source had lost its indentation; nesting below was rebuilt
# from the target/params convention. Confirm against the upstream file.
model:
  scale_factor: 1.15258426
  disable_first_stage_autocast: true
  not_trainable_prefixes: ['all']  # using LoRA
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: false

      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0

  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: true
      num_frames: 49  # same value as data.params.max_num_frames
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 30
      patch_size: 2
      in_channels: 16
      out_channels: 16
      hidden_size: 1920
      adm_in_channels: 256
      num_attention_heads: 30

      transformer_args:
        checkpoint_activations: true  # use gradient checkpointing
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      modules:
        pos_embed_config:
          target: dit_video_concat.Basic3DPositionEmbeddingMixin
          params:
            text_length: 226
            height_interpolation: 1.875
            width_interpolation: 1.875

        lora_config:  # using LoRA
          target: sat.model.finetune.lora2.LoraMixin
          params:
            r: 128

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: true

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "google/t5-v1_1-xxl"
            max_length: 226  # same value as pos_embed_config text_length

  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt"
      ignore_keys: ['loss']

      loss_config:
        target: torch.nn.Identity

      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer

      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: true

      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: false  # differs from the encoder, which sets true

  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: true
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 3.0

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      num_steps: 50
      verbose: true

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          num_steps: 50  # same value as the sampler's num_steps above