diff --git a/sat/configs/cogvideox1.5_5b.yaml b/sat/configs/cogvideox1.5_5b.yaml index 0000ec2..62d46be 100644 --- a/sat/configs/cogvideox1.5_5b.yaml +++ b/sat/configs/cogvideox1.5_5b.yaml @@ -23,7 +23,7 @@ model: params: time_embed_dim: 512 elementwise_affine: True - num_frames: 81 + num_frames: 81 # 81 for 5 seconds, 161 for 10 seconds time_compressed_rate: 4 latent_width: 300 latent_height: 300 diff --git a/sat/configs/cogvideox1.5_5b_i2v.yaml b/sat/configs/cogvideox1.5_5b_i2v.yaml index c65f0b7..65d90f9 100644 --- a/sat/configs/cogvideox1.5_5b_i2v.yaml +++ b/sat/configs/cogvideox1.5_5b_i2v.yaml @@ -25,11 +25,10 @@ model: network_config: target: dit_video_concat.DiffusionTransformer params: -# space_interpolation: 1.875 ofs_embed_dim: 512 time_embed_dim: 512 elementwise_affine: True - num_frames: 81 + num_frames: 81 # 81 for 5 seconds, 161 for 10 seconds time_compressed_rate: 4 latent_width: 300 latent_height: 300 diff --git a/sat/configs/inference.yaml b/sat/configs/inference.yaml index a93bb99..644e71a 100644 --- a/sat/configs/inference.yaml +++ b/sat/configs/inference.yaml @@ -1,16 +1,14 @@ args: - image2video: False # True for image2video, False for text2video +# image2video: True # True for image2video, False for text2video latent_channels: 16 mode: inference load: "{your CogVideoX SAT folder}/transformer" # This is for Full model without lora adapter - # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # This is for Full model without lora adapter batch_size: 1 input_type: txt input_file: configs/test.txt - sampling_image_size: [480, 720] - sampling_num_frames: 13 # Must be 13, 11 or 9 - sampling_fps: 8 -# fp16: True # For CogVideoX-2B - bf16: True # For CogVideoX-5B and CoGVideoX-5B-I2V - output_dir: outputs/ + sampling_image_size: [768, 1360] # remove this line for I2V + sampling_num_frames: 22 # 22 for 5 seconds, 42 for 10 seconds + sampling_fps: 16 + bf16: True + output_dir: outputs force_inference: True \ No newline at end of 
file