chore: update default training configurations

This commit is contained in:
OleehyO 2025-01-12 08:50:15 +00:00
parent b362663679
commit 70c899f444
3 changed files with 7 additions and 7 deletions

View File

@ -1,11 +1,11 @@
compute_environment: LOCAL_MACHINE
gpu_ids: "0,1,2,4"
num_processes: 4 # should be the same as the number of GPUs
gpu_ids: "0,1,2,3,4,5,6,7"
num_processes: 8 # should be the same as the number of GPUs
debug: false
deepspeed_config:
deepspeed_config_file: /absolute/path/to/your/deepspeed_config.yaml # e.g. /home/user/cogvideo/finetune/configs/zero2.yaml
deepspeed_config_file: /home/lhy/code/CogVideo/finetune/configs/zero2.yaml # e.g. /home/user/cogvideo/finetune/configs/zero2.yaml
zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'

View File

@ -47,8 +47,8 @@ SYSTEM_ARGS=(
# Checkpointing Configuration
CHECKPOINT_ARGS=(
--checkpointing_steps 5
--checkpointing_limit 10
--checkpointing_steps 10
--checkpointing_limit 2
--resume_from_checkpoint "/absolute/path/to/checkpoint_dir" # set this path to resume from a checkpoint; otherwise, comment out this line
)

View File

@ -46,8 +46,8 @@ SYSTEM_ARGS=(
# Checkpointing Configuration
CHECKPOINT_ARGS=(
--checkpointing_steps 5
--checkpointing_limit 10
--checkpointing_steps 10
--checkpointing_limit 2
--resume_from_checkpoint "/absolute/path/to/checkpoint_dir" # set this path to resume from a checkpoint; otherwise, comment out this line
)