Mirror of https://github.com/THUDM/CogVideo.git

Commit 19e6d2448e (parent e7835cd79c): finetune complete
@@ -1,5 +1,5 @@
 model:
-  scale_factor: 0.7 # different from cogvideox_2b_infer.yaml
+  scale_factor: 0.7
   disable_first_stage_autocast: true
   log_keys:
     - txt
sat/configs/cogvideox_5b_i2v.yaml (new file, 159 lines)
@@ -0,0 +1,159 @@
model:
  scale_factor: 0.7
  disable_first_stage_autocast: true
  latent_input: false
  noised_image_input: true
  noised_image_dropout: 0.05
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: False

      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0 # different from cogvideox_2b_infer.yaml

  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: True
      num_frames: 49
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 42
      patch_size: 2
      in_channels: 32 # different from cogvideox_5b_infer.yaml
      out_channels: 16
      hidden_size: 3072
      adm_in_channels: 256
      num_attention_heads: 48

      transformer_args:
        checkpoint_activations: True
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      modules:
        pos_embed_config:
          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
          params:
            learnable_pos_embed: True
            hidden_size_head: 64
            text_length: 226

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: True

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "t5-v1_1-xxl"
            max_length: 226

  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
      ignore_keys: ['loss']

      loss_config:
        target: torch.nn.Identity

      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer

      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      fixed_frames: 0
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: True
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 1.0

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      fixed_frames: 0
      num_steps: 50
      verbose: True

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          num_steps: 50
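Every block in this config names a Python class via target: and its constructor arguments via params:. A minimal sketch of how such entries are conventionally resolved into objects in sgm-style codebases (an assumption about the convention, not this repo's actual helper):

import importlib

def instantiate_from_config(config: dict):
    # Resolve "pkg.module.ClassName" from `target` and call it with `params`.
    module_path, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(**config.get("params", {}))

# e.g. the denoiser_config block above would be built roughly like this
# (running it requires the sgm package shipped with this repo):
denoiser_cfg = {
    "target": "sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser",
    "params": {"num_idx": 1000, "quantize_c_noise": False},
}
# denoiser = instantiate_from_config(denoiser_cfg)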
sat/configs/cogvideox_5b_i2v_lora.yaml (new file, 165 lines)
@@ -0,0 +1,165 @@
model:
  scale_factor: 0.7
  disable_first_stage_autocast: true
  latent_input: false
  noised_image_input: true
  noised_image_dropout: 0.05
  not_trainable_prefixes: ['all'] ## Using Lora
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: False

      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0 # different from cogvideox_2b_infer.yaml

  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: True
      num_frames: 49
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 42
      patch_size: 2
      in_channels: 32
      out_channels: 16
      hidden_size: 3072
      adm_in_channels: 256
      num_attention_heads: 48

      transformer_args:
        checkpoint_activations: True
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      modules:
        pos_embed_config:
          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
          params:
            learnable_pos_embed: True
            hidden_size_head: 64
            text_length: 226

        lora_config:
          target: sat.model.finetune.lora2.LoraMixin
          params:
            r: 256

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: True

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "t5-v1_1-xxl"
            max_length: 226

  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
      ignore_keys: [ 'loss' ]

      loss_config:
        target: torch.nn.Identity

      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer

      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ]
          attn_resolutions: [ ]
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ]
          attn_resolutions: [ ]
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      fixed_frames: 0
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: True
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 1.0

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      fixed_frames: 0
      num_steps: 50
      verbose: True

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          num_steps: 50
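The LoRA variant freezes the full model (not_trainable_prefixes: ['all']) and injects rank-256 adapters via sat.model.finetune.lora2.LoraMixin. The essence of such an adapter is a trainable low-rank correction added to each frozen weight; a minimal, repo-independent sketch (shapes are illustrative, and the real mixin also handles scaling and layer selection):

import torch

d_out, d_in, r = 3072, 3072, 256          # r matches lora_config above; dims are hypothetical
W = torch.randn(d_out, d_in)              # frozen pretrained weight, no gradient
A = (0.01 * torch.randn(r, d_in)).requires_grad_(True)  # trainable, small random init
B = torch.zeros(d_out, r, requires_grad=True)           # trainable, zero init so B @ A starts at 0

x = torch.randn(1, d_in)
y = x @ (W + B @ A).T                     # forward pass with the rank-r correction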
@@ -1,15 +1,16 @@
 args:
+  image2video: False # True for image2video, False for text2video
   latent_channels: 16
   mode: inference
   load: "{your CogVideoX SAT folder}/transformer" # This is for the full model without the lora adapter
   # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # This is for the model with the lora adapter

   batch_size: 1
   input_type: txt
   input_file: configs/test.txt
+  sampling_image_size: [480, 720]
   sampling_num_frames: 13 # Must be 13, 11 or 9
   sampling_fps: 8
-  fp16: True # For CogVideoX-2B
-  # bf16: True # For CogVideoX-5B
+  # fp16: True # For CogVideoX-2B
+  bf16: True # For CogVideoX-5B and CogVideoX-5B-I2V
   output_dir: outputs/
   force_inference: True
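The precision switch in this hunk (fp16 commented out, bf16 enabled for CogVideoX-5B / 5B-I2V) follows from the dtypes' numeric ranges: bfloat16 keeps float32's exponent range at reduced mantissa precision, so the larger model is far less likely to overflow than in float16. A quick check in plain PyTorch (not repo-specific):

import torch

print(torch.finfo(torch.float16).max)    # 65504.0  -> narrow range, usually needs loss scaling
print(torch.finfo(torch.bfloat16).max)   # ~3.39e38 -> same exponent range as float32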
@@ -1,15 +1,15 @@
 args:
-  checkpoint_activations: True ## using gradient checkpointing
+  checkpoint_activations: True # using gradient checkpointing
   model_parallel_size: 1
   experiment_name: lora-disney
   mode: finetune
-  load: "cogvideox-2b-sat/transformer"
+  load: "{your CogVideoX SAT folder}/transformer"
   no_load_rng: True
   train_iters: 1000 # Suggest more than 1000 For Lora and SFT For 500 is enough
   eval_iters: 1
   eval_interval: 100
   eval_batch_size: 1
-  save: ckpts_2b_lora
+  save: ckpts_5b_lora
   save_interval: 500
   log_interval: 20
   train_data: [ "disney" ] # Train data path
@@ -28,7 +28,7 @@ data:
     skip_frms_num: 3.

 deepspeed:
-  # Minimun for 16 videos per batch for ALL GPUs, This setting is for 8 x A100 GPUs
+  # Minimum for 16 videos per batch for ALL GPUs, This setting is for 8 x A100 GPUs
   train_micro_batch_size_per_gpu: 2
   gradient_accumulation_steps: 1
   steps_per_print: 50
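The "16 videos per batch for ALL GPUs" comment follows the usual DeepSpeed relation between micro-batch size, gradient accumulation, and data-parallel size; with the values above and the 8 A100s named in the comment (the GPU count comes from that comment, not from the config itself):

train_micro_batch_size_per_gpu = 2
gradient_accumulation_steps = 1
num_gpus = 8  # per the "8 x A100 GPUs" comment
effective_batch = train_micro_batch_size_per_gpu * gradient_accumulation_steps * num_gpus
print(effective_batch)  # 16 videos per optimizer step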
@@ -44,9 +44,9 @@ deepspeed:
     load_from_fp32_weights: false
   zero_allow_untested_optimizer: true
   bf16:
-    enabled: False # For CogVideoX-2B Turn to False and For CogVideoX-5B Turn to True
+    enabled: True # For CogVideoX-2B Turn to False and For CogVideoX-5B Turn to True
   fp16:
-    enabled: True # For CogVideoX-2B Turn to True and For CogVideoX-5B Turn to False
+    enabled: False # For CogVideoX-2B Turn to True and For CogVideoX-5B Turn to False
     loss_scale: 0
     loss_scale_window: 400
     hysteresis: 2
@@ -55,7 +55,7 @@ deepspeed:
   optimizer:
     type: sat.ops.FusedEmaAdam
     params:
-      lr: 0.001 # Between 1E-3 and 5E-4 For Lora and 1E-5 For SFT
+      lr: 0.00001 # Between 1E-3 and 5E-4 For Lora and 1E-5 For SFT
       betas: [ 0.9, 0.95 ]
       eps: 1e-8
       weight_decay: 1e-4