From 795dd144a4534eeb6f3561ddc18eff1ea067ffa1 Mon Sep 17 00:00:00 2001
From: OleehyO <leehy0357@gmail.com>
Date: Sun, 12 Jan 2025 05:36:32 +0000
Subject: [PATCH] Rename lora training scripts as ddp

---
 finetune/train_ddp_i2v.sh | 70 +++++++++++++++++++++++++++++++++++++++
 finetune/train_ddp_t2v.sh | 68 +++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 finetune/train_ddp_i2v.sh
 create mode 100644 finetune/train_ddp_t2v.sh

diff --git a/finetune/train_ddp_i2v.sh b/finetune/train_ddp_i2v.sh
new file mode 100644
index 0000000..ecd322b
--- /dev/null
+++ b/finetune/train_ddp_i2v.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+# Prevent tokenizer parallelism issues
+export TOKENIZERS_PARALLELISM=false
+
+# Model Configuration
+MODEL_ARGS=(
+    --model_path "THUDM/CogVideoX1.5-5B-I2V"
+    --model_name "cogvideox1.5-i2v"  # ["cogvideox-i2v"]
+    --model_type "i2v"
+    --training_type "lora"
+)
+
+# Output Configuration
+OUTPUT_ARGS=(
+    --output_dir "/path/to/your/output_dir"
+    --report_to "tensorboard"
+)
+
+# Data Configuration
+DATA_ARGS=(
+    --data_root "/absolute/path/to/your/data_root"
+    --caption_column "prompt.txt"
+    --video_column "videos.txt"
+    # --image_column "images.txt"  # comment this line will use first frame of video as image conditioning
+    --train_resolution "81x768x1360"  # (frames x height x width), frames should be 8N+1
+)
+
+# Training Configuration
+TRAIN_ARGS=(
+    --train_epochs 10
+    --batch_size 1
+    --gradient_accumulation_steps 1
+    --mixed_precision "bf16"  # ["no", "fp16"]
+    --seed 42
+)
+
+# System Configuration
+SYSTEM_ARGS=(
+    --num_workers 8
+    --pin_memory True
+    --nccl_timeout 1800
+)
+
+# Checkpointing Configuration
+CHECKPOINT_ARGS=(
+    --checkpointing_steps 5
+    --checkpointing_limit 10
+    --resume_from_checkpoint "/absolute/path/to/checkpoint_dir"  # if you want to resume from a checkpoint, otherwise, comment this line
+)
+
+# Validation Configuration
+VALIDATION_ARGS=(
+    --do_validation false  # ["true", "false"]
+    --validation_dir "/absolute/path/to/your/validation_set"
+    --validation_steps 20  # should be multiple of checkpointing_steps
+    --validation_prompts "prompts.txt"
+    --validation_images "images.txt"
+    --gen_fps 16
+)
+
+# Combine all arguments and launch training
+accelerate launch train.py \
+    "${MODEL_ARGS[@]}" \
+    "${OUTPUT_ARGS[@]}" \
+    "${DATA_ARGS[@]}" \
+    "${TRAIN_ARGS[@]}" \
+    "${SYSTEM_ARGS[@]}" \
+    "${CHECKPOINT_ARGS[@]}" \
+    "${VALIDATION_ARGS[@]}"
diff --git a/finetune/train_ddp_t2v.sh b/finetune/train_ddp_t2v.sh
new file mode 100644
index 0000000..4340d56
--- /dev/null
+++ b/finetune/train_ddp_t2v.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+# Prevent tokenizer parallelism issues
+export TOKENIZERS_PARALLELISM=false
+
+# Model Configuration
+MODEL_ARGS=(
+    --model_path "THUDM/CogVideoX1.5-5B"
+    --model_name "cogvideox1.5-t2v"  # ["cogvideox-t2v"]
+    --model_type "t2v"
+    --training_type "lora"
+)
+
+# Output Configuration
+OUTPUT_ARGS=(
+    --output_dir "/absolute/path/to/your/output_dir"
+    --report_to "tensorboard"
+)
+
+# Data Configuration
+DATA_ARGS=(
+    --data_root "/absolute/path/to/your/data_root"
+    --caption_column "prompt.txt"
+    --video_column "videos.txt"
+    --train_resolution "81x768x1360"  # (frames x height x width), frames should be 8N+1
+)
+
+# Training Configuration
+TRAIN_ARGS=(
+    --train_epochs 10
+    --batch_size 1
+    --gradient_accumulation_steps 1
+    --mixed_precision "bf16"  # ["no", "fp16"]
+    --seed 42
+)
+
+# System Configuration
+SYSTEM_ARGS=(
+    --num_workers 8
+    --pin_memory True
+    --nccl_timeout 1800
+)
+
+# Checkpointing Configuration
+CHECKPOINT_ARGS=(
+    --checkpointing_steps 5
+    --checkpointing_limit 10
+    --resume_from_checkpoint "/absolute/path/to/checkpoint_dir"  # if you want to resume from a checkpoint, otherwise, comment this line
+)
+
+# Validation Configuration
+VALIDATION_ARGS=(
+    --do_validation false  # ["true", "false"]
+    --validation_dir "/absolute/path/to/your/validation_set"
+    --validation_steps 20  # should be multiple of checkpointing_steps
+    --validation_prompts "prompts.txt"
+    --gen_fps 16
+)
+
+# Combine all arguments and launch training
+accelerate launch train.py \
+    "${MODEL_ARGS[@]}" \
+    "${OUTPUT_ARGS[@]}" \
+    "${DATA_ARGS[@]}" \
+    "${TRAIN_ARGS[@]}" \
+    "${SYSTEM_ARGS[@]}" \
+    "${CHECKPOINT_ARGS[@]}" \
+    "${VALIDATION_ARGS[@]}"