finetune complete

zR 2024-09-14 18:40:34 +08:00
parent e7835cd79c
commit 19e6d2448e
5 changed files with 336 additions and 11 deletions

View File

@@ -1,5 +1,5 @@
 model:
-  scale_factor: 0.7 # different from cogvideox_2b_infer.yaml
+  scale_factor: 0.7
   disable_first_stage_autocast: true
   log_keys:
     - txt

View File

@@ -0,0 +1,159 @@
model:
  scale_factor: 0.7
  disable_first_stage_autocast: true
  latent_input: false
  noised_image_input: true
  noised_image_dropout: 0.05
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: False
      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0 # different from cogvideox_2b_infer.yaml

  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: True
      num_frames: 49
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 42
      patch_size: 2
      in_channels: 32 # different from cogvideox_5b_infer.yaml
      out_channels: 16
      hidden_size: 3072
      adm_in_channels: 256
      num_attention_heads: 48

      transformer_args:
        checkpoint_activations: True
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      modules:
        pos_embed_config:
          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
          params:
            learnable_pos_embed: True
            hidden_size_head: 64
            text_length: 226

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: True

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "t5-v1_1-xxl"
            max_length: 226

  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
      ignore_keys: ['loss']
      loss_config:
        target: torch.nn.Identity
      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer
      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True
      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 2, 4]
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      fixed_frames: 0
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: True
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 1.0

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      fixed_frames: 0
      num_steps: 50
      verbose: True
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0
      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          num_steps: 50
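
Every `*_config` block in this file is a `target`/`params` pair that the sgm codebase resolves by dotted import path. A minimal sketch of that convention (assuming the usual `instantiate_from_config` helper from `sgm.util`):

```python
import importlib

def get_obj_from_str(path: str):
    # Resolve a dotted path such as "sgm.modules.GeneralConditioner" to a class.
    module, name = path.rsplit(".", 1)
    return getattr(importlib.import_module(module), name)

def instantiate_from_config(config: dict):
    # Build the object a config block describes: class from "target",
    # constructor kwargs from "params" (a missing params means no kwargs).
    return get_obj_from_str(config["target"])(**config.get("params", {}))
```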

View File

@@ -0,0 +1,165 @@
model:
  scale_factor: 0.7
  disable_first_stage_autocast: true
  latent_input: false
  noised_image_input: true
  noised_image_dropout: 0.05
  not_trainable_prefixes: ['all'] # using LoRA
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: False
      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0 # different from cogvideox_2b_infer.yaml

  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: True
      num_frames: 49
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 42
      patch_size: 2
      in_channels: 32
      out_channels: 16
      hidden_size: 3072
      adm_in_channels: 256
      num_attention_heads: 48

      transformer_args:
        checkpoint_activations: True
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      modules:
        pos_embed_config:
          target: dit_video_concat.Rotary3DPositionEmbeddingMixin
          params:
            learnable_pos_embed: True
            hidden_size_head: 64
            text_length: 226

        lora_config:
          target: sat.model.finetune.lora2.LoraMixin
          params:
            r: 256

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: True

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "t5-v1_1-xxl"
            max_length: 226

  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "cogvideox-5b-i2v-sat/vae/3d-vae.pt"
      ignore_keys: [ 'loss' ]
      loss_config:
        target: torch.nn.Identity
      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer
      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ]
          attn_resolutions: [ ]
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True
      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ]
          attn_resolutions: [ ]
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      fixed_frames: 0
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: True
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 1.0

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      fixed_frames: 0
      num_steps: 50
      verbose: True
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 1.0
      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          num_steps: 50
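
The `lora_config` block wires `sat.model.finetune.lora2.LoraMixin` with rank `r: 256` into the transformer, while `not_trainable_prefixes: ['all']` freezes the base weights. The underlying idea, as a minimal hypothetical sketch (not the actual LoraMixin implementation):

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    # Frozen base projection plus a trainable low-rank update:
    # y = W x + (alpha / r) * B (A x), with A of shape (r, in) and B of shape (out, r).
    def __init__(self, base: nn.Linear, r: int = 256, alpha: int = 256):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)  # mirrors not_trainable_prefixes: ['all']
        self.lora_a = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at step 0
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * (x @ self.lora_a.T @ self.lora_b.T)
```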

View File

@@ -1,15 +1,16 @@
 args:
   image2video: False # True for image2video, False for text2video
   latent_channels: 16
   mode: inference
   load: "{your CogVideoX SAT folder}/transformer" # this is for the full model without the LoRA adapter
+  # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # this is for the model with the LoRA adapter
   batch_size: 1
   input_type: txt
   input_file: configs/test.txt
   sampling_image_size: [480, 720]
   sampling_num_frames: 13 # Must be 13, 11 or 9
   sampling_fps: 8
-  fp16: True # For CogVideoX-2B
-  # bf16: True # For CogVideoX-5B
+  # fp16: True # For CogVideoX-2B
+  bf16: True # For CogVideoX-5B and CogVideoX-5B-I2V
   output_dir: outputs/
   force_inference: True
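
Note how these sampling settings line up with the transformer config above: the 3D VAE compresses time by `time_compressed_rate: 4` while keeping the first frame, so `sampling_num_frames: 13` latent frames decode to (13 - 1) * 4 + 1 = 49 pixel frames, matching `num_frames: 49`. A quick check (hypothetical helper name):

```python
def decoded_frames(latent_frames: int, time_compressed_rate: int = 4) -> int:
    # The first latent frame maps to one pixel frame; each remaining latent
    # frame expands into time_compressed_rate pixel frames.
    return (latent_frames - 1) * time_compressed_rate + 1

for n in (9, 11, 13):  # the allowed sampling_num_frames values
    print(n, "->", decoded_frames(n))  # 9 -> 33, 11 -> 41, 13 -> 49
```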

View File

@@ -1,15 +1,15 @@
 args:
-  checkpoint_activations: True ## using gradient checkpointing
+  checkpoint_activations: True # using gradient checkpointing
   model_parallel_size: 1
   experiment_name: lora-disney
   mode: finetune
-  load: "cogvideox-2b-sat/transformer"
+  load: "{your CogVideoX SAT folder}/transformer"
   no_load_rng: True
   train_iters: 1000 # suggest more than 1000 for LoRA; for SFT, 500 is enough
   eval_iters: 1
   eval_interval: 100
   eval_batch_size: 1
-  save: ckpts_2b_lora
+  save: ckpts_5b_lora
   save_interval: 500
   log_interval: 20
   train_data: [ "disney" ] # train data path
@@ -28,7 +28,7 @@ data:
     skip_frms_num: 3.

 deepspeed:
-  # Minimun for 16 videos per batch for ALL GPUs, This setting is for 8 x A100 GPUs
+  # Minimum of 16 videos per batch across ALL GPUs; this setting is for 8 x A100 GPUs
   train_micro_batch_size_per_gpu: 2
   gradient_accumulation_steps: 1
   steps_per_print: 50
@@ -44,9 +44,9 @@ deepspeed:
     load_from_fp32_weights: false
   zero_allow_untested_optimizer: true
   bf16:
-    enabled: False # For CogVideoX-2B Turn to False and For CogVideoX-5B Turn to True
+    enabled: True # set False for CogVideoX-2B, True for CogVideoX-5B
   fp16:
-    enabled: True # For CogVideoX-2B Turn to True and For CogVideoX-5B Turn to False
+    enabled: False # set True for CogVideoX-2B, False for CogVideoX-5B
   loss_scale: 0
   loss_scale_window: 400
   hysteresis: 2
@@ -55,7 +55,7 @@ deepspeed:
   optimizer:
     type: sat.ops.FusedEmaAdam
     params:
-      lr: 0.001 # Between 1E-3 and 5E-4 For Lora and 1E-5 For SFT
+      lr: 0.00001 # between 1E-3 and 5E-4 for LoRA; 1E-5 for SFT
       betas: [ 0.9, 0.95 ]
       eps: 1e-8
       weight_decay: 1e-4
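
The batch-size comment above follows from DeepSpeed's rule that the global batch is micro-batch × gradient-accumulation × data-parallel GPUs; with the values in this file on 8 x A100s that is 2 × 1 × 8 = 16 videos per optimizer step:

```python
def global_batch_size(micro_batch: int, grad_accum: int, num_gpus: int) -> int:
    # DeepSpeed: train_batch_size = micro_batch * grad_accum * data-parallel world size.
    return micro_batch * grad_accum * num_gpus

assert global_batch_size(micro_batch=2, grad_accum=1, num_gpus=8) == 16  # videos per step
```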