chore: update default training configurations

This commit is contained in:
OleehyO 2025-01-12 08:50:15 +00:00
parent b362663679
commit 70c899f444
3 changed files with 7 additions and 7 deletions

View File

@@ -1,11 +1,11 @@
compute_environment: LOCAL_MACHINE compute_environment: LOCAL_MACHINE
gpu_ids: "0,1,2,4" gpu_ids: "0,1,2,3,4,5,6,7"
num_processes: 4 # should be the same as the number of GPUs num_processes: 8 # should be the same as the number of GPUs
debug: false debug: false
deepspeed_config: deepspeed_config:
deepspeed_config_file: /absolute/path/to/your/deepspeed_config.yaml # e.g. /home/user/cogvideo/finetune/configs/zero2.yaml deepspeed_config_file: /home/lhy/code/CogVideo/finetune/configs/zero2.yaml # e.g. /home/user/cogvideo/finetune/configs/zero2.yaml
zero3_init_flag: false zero3_init_flag: false
distributed_type: DEEPSPEED distributed_type: DEEPSPEED
downcast_bf16: 'no' downcast_bf16: 'no'

View File

@@ -47,8 +47,8 @@ SYSTEM_ARGS=(
# Checkpointing Configuration # Checkpointing Configuration
CHECKPOINT_ARGS=( CHECKPOINT_ARGS=(
--checkpointing_steps 5 --checkpointing_steps 10
--checkpointing_limit 10 --checkpointing_limit 2
--resume_from_checkpoint "/absolute/path/to/checkpoint_dir" # if you want to resume from a checkpoint, otherwise, comment this line --resume_from_checkpoint "/absolute/path/to/checkpoint_dir" # if you want to resume from a checkpoint, otherwise, comment this line
) )

View File

@@ -46,8 +46,8 @@ SYSTEM_ARGS=(
# Checkpointing Configuration # Checkpointing Configuration
CHECKPOINT_ARGS=( CHECKPOINT_ARGS=(
--checkpointing_steps 5 --checkpointing_steps 10
--checkpointing_limit 10 --checkpointing_limit 2
--resume_from_checkpoint "/absolute/path/to/checkpoint_dir" # if you want to resume from a checkpoint, otherwise, comment this line --resume_from_checkpoint "/absolute/path/to/checkpoint_dir" # if you want to resume from a checkpoint, otherwise, comment this line
) )