add 10 second comment

2025-11-18 08:12:09 +08:00 · 2024-11-08 22:31:39 +08:00 · 2024-11-08 22:31:39 +08:00 · d8ee013842
commit d8ee013842
parent e43a7645fd
3 changed files with 8 additions and 11 deletions
--- a/sat/configs/cogvideox1.5_5b.yaml
+++ b/sat/configs/cogvideox1.5_5b.yaml
@ -23,7 +23,7 @@ model:
    params:
      time_embed_dim: 512
      elementwise_affine: True
-      num_frames: 81
+      num_frames: 81  # 81 for 5-second videos, 161 for 10-second videos
      time_compressed_rate: 4
      latent_width: 300
      latent_height: 300
--- a/sat/configs/cogvideox1.5_5b_i2v.yaml
+++ b/sat/configs/cogvideox1.5_5b_i2v.yaml
@ -25,11 +25,10 @@ model:
  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
 #      space_interpolation: 1.875
      ofs_embed_dim: 512
      time_embed_dim: 512
      elementwise_affine: True
-      num_frames: 81
+      num_frames: 81  # 81 for 5-second videos, 161 for 10-second videos
      time_compressed_rate: 4
      latent_width: 300
      latent_height: 300
--- a/sat/configs/inference.yaml
+++ b/sat/configs/inference.yaml
@ -1,16 +1,14 @@
 args:
-  image2video: False # True for image2video, False for text2video
+#  image2video: True  # True for image2video, False for text2video
  latent_channels: 16
  mode: inference
  load: "{your CogVideoX SAT folder}/transformer" # Use this for the full model without a LoRA adapter
  # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # Use this instead to load a model with a LoRA adapter
  batch_size: 1
  input_type: txt
  input_file: configs/test.txt
-  sampling_image_size: [480, 720]
+  sampling_image_size: [768, 1360] # remove this for I2V
-  sampling_num_frames: 13  # Must be 13, 11 or 9
+  sampling_num_frames: 22  # 42 for 10 seconds and 22 for 5 seconds
-  sampling_fps: 8
+  sampling_fps: 16
-#  fp16: True # For CogVideoX-2B
+  bf16: True
-  bf16: True # For CogVideoX-5B and CoGVideoX-5B-I2V
+  output_dir: outputs
  output_dir: outputs/
  force_inference: True