Mirror of https://github.com/THUDM/CogVideo.git (synced 2025-04-06 03:57:56 +08:00)

Commit a490c3c895 ("update finetune"), parent 0139615252.
@@ -14,7 +14,7 @@
 📚 Check here to view <a href="https://arxiv.org/abs/2408.06072" target="_blank">Paper</a>
 </p>
 <p align="center">
-👋 Join our <a href="resources/WECHAT.md" target="_blank">WeChat</a> and <a href="https://discord.gg/Ewaabk6s" target="_blank">Discord</a>
+👋 Join our <a href="resources/WECHAT.md" target="_blank">WeChat</a> and <a href="https://discord.gg/B94UfuhN" target="_blank">Discord</a>
 </p>
 <p align="center">
 📍 Visit <a href="https://chatglm.cn/video?fr=osm_cogvideox">清影</a> and <a href="https://open.bigmodel.cn/?utm_campaign=open&_channel_track_key=OWTVNma9">API Platform</a> to experience larger-scale commercial video generation models.
@@ -14,7 +14,7 @@
 📚 <a href="https://arxiv.org/abs/2408.06072" target="_blank">論文</a> をチェック
 </p>
 <p align="center">
-👋 <a href="resources/WECHAT.md" target="_blank">WeChat</a> と <a href="https://discord.gg/Ewaabk6s" target="_blank">Discord</a> に参加
+👋 <a href="resources/WECHAT.md" target="_blank">WeChat</a> と <a href="https://discord.gg/B94UfuhN" target="_blank">Discord</a> に参加
 </p>
 <p align="center">
 📍 <a href="https://chatglm.cn/video?fr=osm_cogvideox">清影</a> と <a href="https://open.bigmodel.cn/?utm_campaign=open&_channel_track_key=OWTVNma9">APIプラットフォーム</a> を訪問して、より大規模な商用ビデオ生成モデルを体験
@@ -15,7 +15,7 @@
 📚 查看 <a href="https://arxiv.org/abs/2408.06072" target="_blank">论文</a>
 </p>
 <p align="center">
-👋 加入我们的 <a href="resources/WECHAT.md" target="_blank">微信</a> 和 <a href="https://discord.gg/Ewaabk6s" target="_blank">Discord</a>
+👋 加入我们的 <a href="resources/WECHAT.md" target="_blank">微信</a> 和 <a href="https://discord.gg/B94UfuhN" target="_blank">Discord</a>
 </p>
 <p align="center">
 📍 前往<a href="https://chatglm.cn/video?fr=osm_cogvideox"> 清影</a> 和 <a href="https://open.bigmodel.cn/?utm_campaign=open&_channel_track_key=OWTVNma9"> API平台</a> 体验更大规模的商业版视频生成模型。
@@ -1,6 +1,6 @@
-diffusers==0.30.0
+git+https://github.com/huggingface/diffusers.git@main#egg=diffusers
 transformers==4.44.0
-accelerate==0.33.0
+git+https://github.com/huggingface/accelerate.git@main#egg=accelerate
 sentencepiece==0.2.0 # T5
 SwissArmyTransformer==0.4.12 # Inference
 torch==2.4.0 # Tested in 2.2 2.3 2.4 and 2.5
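Both `diffusers` and `accelerate` now install from their main branches instead of pinned releases, so the resolved commits can drift between installs. A hedged sketch of the equivalent manual installs:

```shell
# Unpinned source installs, as called for by the updated requirements.
pip install "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers"
pip install "git+https://github.com/huggingface/accelerate.git@main#egg=accelerate"
# The remaining entries stay pinned:
pip install transformers==4.44.0 sentencepiece==0.2.0 SwissArmyTransformer==0.4.12 torch==2.4.0
```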
sat/README.md (160 changed lines)
@@ -4,7 +4,6 @@
 
 [日本語で読む](./README_ja.md)
 
-
 This folder contains the inference code using [SAT](https://github.com/THUDM/SwissArmyTransformer) weights and the
 fine-tuning code for SAT weights.
 
@@ -69,110 +68,49 @@ loading it into Deepspeed in Finetune.
 0 directories, 8 files
 ```

-3. Modify the file `configs/cogvideox_2b_infer.yaml`.
-
-```yaml
-load: "{your_CogVideoX-2b-sat_path}/transformer" ## Transformer model path
-
-conditioner_config:
-  target: sgm.modules.GeneralConditioner
-  params:
-    emb_models:
-      - is_trainable: false
-        input_key: txt
-        ucg_rate: 0.1
-        target: sgm.modules.encoders.modules.FrozenT5Embedder
-        params:
-          model_dir: "google/t5-v1_1-xxl" ## T5 model path
-          max_length: 226
-
-first_stage_config:
-  target: sgm.models.autoencoder.VideoAutoencoderInferenceWrapper
-  params:
-    cp_size: 1
-    ckpt_path: "{your_CogVideoX-2b-sat_path}/vae/3d-vae.pt" ## VAE model path
-```
-
-+ If using txt to save multiple prompts, please refer to `configs/test.txt` for modification. One prompt per line. If
-  you don't know how to write prompts, you can first use [this code](../inference/convert_demo.py) to call LLM for
-  refinement.
-+ If using the command line as input, modify
-
-```yaml
-input_type: cli
-```
-
-so that prompts can be entered from the command line.
-
-If you want to change the output video directory, you can modify:
-
-```yaml
-output_dir: outputs/
-```
-
-The default is saved in the `.outputs/` folder.
-
-4. Run the inference code to start inference
-
-```shell
-bash inference.sh
-```
-
-## Fine-Tuning the Model
-
-### Preparing the Dataset
-
-The dataset format should be as follows:
-
-```
-.
-├── labels
-│   ├── 1.txt
-│   ├── 2.txt
-│   ├── ...
-└── videos
-    ├── 1.mp4
-    ├── 2.mp4
-    ├── ...
-```
-
-Each txt file should have the same name as its corresponding video file and contain the labels for that video. Each
-video should have a one-to-one correspondence with a label. Typically, a video should not have multiple labels.
-
-For style fine-tuning, please prepare at least 50 videos and labels with similar styles to facilitate fitting.
-
-### Modifying the Configuration File
-
-We support both `Lora` and `full-parameter fine-tuning` methods. Please note that both fine-tuning methods only apply to
-the `transformer` part. The `VAE part` is not modified. `T5` is only used as an Encoder.
-
-the `configs/cogvideox_2b_sft.yaml` (for full fine-tuning) as follows.
-
-```yaml
-# checkpoint_activations: True ## using gradient checkpointing (both checkpoint_activations in the configuration file need to be set to True)
+Each text file shares the same name as its corresponding video, serving as the label for that video. Videos and labels
+should be matched one-to-one. Generally, a single video should not be associated with multiple labels.
+
+For style fine-tuning, please prepare at least 50 videos and labels with similar styles to ensure proper fitting.
+
+### Modifying Configuration Files
+
+We support two fine-tuning methods: `Lora` and full-parameter fine-tuning. Please note that both methods only fine-tune
+the `transformer` part and do not modify the `VAE` section. `T5` is used solely as an Encoder. Please modify
+the `configs/sft.yaml` (for full-parameter fine-tuning) file as follows:
+
+```
+# checkpoint_activations: True ## Using gradient checkpointing (Both checkpoint_activations in the config file need to be set to True)
 model_parallel_size: 1 # Model parallel size
-experiment_name: lora-disney # Experiment name (do not change)
+experiment_name: lora-disney # Experiment name (do not modify)
-mode: finetune # Mode (do not change)
+mode: finetune # Mode (do not modify)
-load: "{your_CogVideoX-2b-sat_path}/transformer" # Transformer model path
+load: "{your_CogVideoX-2b-sat_path}/transformer" ## Transformer model path
-no_load_rng: True # Whether to load the random seed
+no_load_rng: True # Whether to load random seed
-train_iters: 1000 # Number of training iterations
+train_iters: 1000 # Training iterations
-eval_iters: 1 # Number of evaluation iterations
+eval_iters: 1 # Evaluation iterations
 eval_interval: 100 # Evaluation interval
-eval_batch_size: 1 # Batch size for evaluation
+eval_batch_size: 1 # Evaluation batch size
 save: ckpts # Model save path
 save_interval: 100 # Model save interval
 log_interval: 20 # Log output interval
 train_data: [ "your train data path" ]
-valid_data: [ "your val data path" ] # Training and validation sets can be the same
+valid_data: [ "your val data path" ] # Training and validation datasets can be the same
-split: 1,0,0 # Ratio of training, validation, and test sets
+split: 1,0,0 # Training, validation, and test set ratio
-num_workers: 8 # Number of worker threads for data loading
+num_workers: 8 # Number of worker threads for data loader
-force_train: True # Allow missing keys when loading ckpt (refer to T5 and VAE which are loaded independently)
+force_train: True # Allow missing keys when loading checkpoint (T5 and VAE are loaded separately)
-only_log_video_latents: True # Avoid using VAE decoder when eval to save memory
+only_log_video_latents: True # Avoid memory overhead caused by VAE decode
+deepspeed:
+  bf16:
+    enabled: False # For CogVideoX-2B set to False and for CogVideoX-5B set to True
+  fp16:
+    enabled: True # For CogVideoX-2B set to True and for CogVideoX-5B set to False
 ```

-If you wish to use Lora fine-tuning, you also need to modify:
+If you wish to use Lora fine-tuning, you also need to modify the `cogvideox_<model_parameters>_lora` file:

-```yaml
+Here, take `CogVideoX-2B` as a reference:
+
+```
 model:
   scale_factor: 1.15258426
   disable_first_stage_autocast: true

@@ -186,15 +124,47 @@ model:
       r: 256
 ```

-### Fine-Tuning and Validation
+### Modifying Run Scripts

-1. Run the inference code to start fine-tuning.
+Edit `finetune_single_gpu.sh` or `finetune_multi_gpus.sh` to select the configuration file. Below are two examples:

-```shell
+1. If you want to use the `CogVideoX-2B` model and the `Lora` method, you need to modify `finetune_single_gpu.sh`
+   or `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+2. If you want to use the `CogVideoX-2B` model and the `full-parameter fine-tuning` method, you need to
+   modify `finetune_single_gpu.sh` or `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+### Fine-Tuning and Evaluation
+
+Run the inference code to start fine-tuning.
+
+```
 bash finetune_single_gpu.sh # Single GPU
 bash finetune_multi_gpus.sh # Multi GPUs
 ```

+### Using the Fine-Tuned Model
+
+The fine-tuned model cannot be merged; here is how to modify the inference configuration file `inference.sh`:
+
+```
+run_cmd="$environs python sample_video.py --base configs/cogvideox_<model_parameters>_lora.yaml configs/inference.yaml --seed 42"
+```
+
+Then, execute the code:
+
+```
+bash inference.sh
+```
+
 ### Converting to Huggingface Diffusers Supported Weights

 The SAT weight format is different from Huggingface's weight format and needs to be converted. Please run:
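The rewritten dataset section requires a strict one-to-one pairing between `videos/*.mp4` and `labels/*.txt`. A hedged sanity check, assuming the exact layout shown in the README:

```shell
# Report any video without a label, and any label without a video.
for v in videos/*.mp4; do
  b=$(basename "$v" .mp4)
  [ -f "labels/$b.txt" ] || echo "missing label: $b"
done
for l in labels/*.txt; do
  b=$(basename "$l" .txt)
  [ -f "videos/$b.mp4" ] || echo "missing video: $b"
done
```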
@@ -140,57 +140,94 @@ bash inference.sh
 
 ### 設定ファイルの変更
 
-`Lora` と
-全パラメータファインチューニングの2つの方法をサポートしています。これらのファインチューニング方法は `transformer`
-部分にのみ適用されます。`VAE` 部分は変更されません。`T5` はエンコーダーとしてのみ使用されます。
+`Lora` とフルパラメータ微調整の2つの方法をサポートしています。両方の微調整方法は、`transformer` 部分のみを微調整し、`VAE`
+部分には変更を加えないことに注意してください。`T5` はエンコーダーとしてのみ使用されます。以下のように `configs/sft.yaml` (
+フルパラメータ微調整用) ファイルを変更してください。
 
-`configs/cogvideox_2b_sft.yaml` (全量ファインチューニング用) を次のように変更します。
-
-```yaml
-# checkpoint_activations: True ## using gradient checkpointing (設定ファイル内の2つのcheckpoint_activationsを両方Trueに設定する必要があります)
+```
+# checkpoint_activations: True ## 勾配チェックポイントを使用する場合 (設定ファイル内の2つの checkpoint_activations を True に設定する必要があります)
 model_parallel_size: 1 # モデル並列サイズ
 experiment_name: lora-disney # 実験名 (変更しないでください)
 mode: finetune # モード (変更しないでください)
-load: "{your_CogVideoX-2b-sat_path}/transformer" # Transformer モデルパス
+load: "{your_CogVideoX-2b-sat_path}/transformer" ## Transformer モデルのパス
-no_load_rng: True # ランダムシードをロードするかどうか
+no_load_rng: True # 乱数シードを読み込むかどうか
 train_iters: 1000 # トレーニングイテレーション数
 eval_iters: 1 # 評価イテレーション数
 eval_interval: 100 # 評価間隔
-eval_batch_size: 1 # 評価のバッチサイズ
+eval_batch_size: 1 # 評価バッチサイズ
 save: ckpts # モデル保存パス
 save_interval: 100 # モデル保存間隔
 log_interval: 20 # ログ出力間隔
 train_data: [ "your train data path" ]
-valid_data: [ "your val data path" ] # トレーニングセットと検証セットは同じでもかまいません
+valid_data: [ "your val data path" ] # トレーニングデータと評価データは同じでも構いません
-split: 1,0,0 # トレーニングセット、検証セット、テストセットの比率
+split: 1,0,0 # トレーニングセット、評価セット、テストセットの割合
 num_workers: 8 # データローダーのワーカースレッド数
-force_train: True # ckpt をロードする際に missing keys を許可するかどうか (T5 と VAE は独立してロードされます)
+force_train: True # チェックポイントをロードするときに欠落したキーを許可 (T5 と VAE は別々にロードされます)
-only_log_video_latents: True # VAE デコーダーを使用しないようにしてメモリを節約します
+only_log_video_latents: True # VAE のデコードによるメモリオーバーヘッドを回避
+deepspeed:
+  bf16:
+    enabled: False # CogVideoX-2B の場合は False に設定し、CogVideoX-5B の場合は True に設定
+  fp16:
+    enabled: True # CogVideoX-2B の場合は True に設定し、CogVideoX-5B の場合は False に設定
 ```
 
-Lora ファインチューニングを使用する場合は、次のように変更する必要があります:
+Lora 微調整を使用したい場合は、`cogvideox_<model_parameters>_lora` ファイルも変更する必要があります。
 
-```yaml
+ここでは、`CogVideoX-2B` を参考にします。
+
+```
 model:
   scale_factor: 1.15258426
   disable_first_stage_autocast: true
-  not_trainable_prefixes: [ 'all' ] ## コメント解除
+  not_trainable_prefixes: [ 'all' ] ## コメントを解除
   log_keys:
     - txt'
 
-  lora_config: ## コメント解除
+  lora_config: ## コメントを解除
     target: sat.model.finetune.lora2.LoraMixin
     params:
       r: 256
 ```
 
-### ファインチューニングと検証
+### 実行スクリプトの変更
 
-1. 推論コードを実行してファインチューニングを開始します。
+設定ファイルを選択するために `finetune_single_gpu.sh` または `finetune_multi_gpus.sh` を編集します。以下に2つの例を示します。
 
-```shell
-bash finetune_single_gpu.sh # Single GPU
-bash finetune_multi_gpus.sh # Multi GPUs
+1. `CogVideoX-2B` モデルを使用し、`Lora` 手法を利用する場合は、`finetune_single_gpu.sh` または `finetune_multi_gpus.sh`
+   を変更する必要があります。
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+2. `CogVideoX-2B` モデルを使用し、`フルパラメータ微調整` 手法を利用する場合は、`finetune_single_gpu.sh`
+   または `finetune_multi_gpus.sh` を変更する必要があります。
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+### 微調整と評価
+
+推論コードを実行して微調整を開始します。
+
+```
+bash finetune_single_gpu.sh # シングルGPU
+bash finetune_multi_gpus.sh # マルチGPU
+```
+
+### 微調整後のモデルの使用
+
+微調整されたモデルは統合できません。ここでは、推論設定ファイル `inference.sh` を変更する方法を示します。
+
+```
+run_cmd="$environs python sample_video.py --base configs/cogvideox_<model_parameters>_lora.yaml configs/inference.yaml --seed 42"
+```
+
+その後、次のコードを実行します。
+
+```
+bash inference.sh
 ```
 
 ### Huggingface Diffusers サポートのウェイトに変換
@@ -50,7 +50,9 @@ git clone https://huggingface.co/THUDM/CogVideoX-2b.git
 mkdir t5-v1_1-xxl
 mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
 ```
+
+通过上述方案,你将会得到一个 safetensor 格式的T5文件,确保在 Deepspeed微调过程中读入的时候不会报错。
 
 ```
 ├── added_tokens.json
 ├── config.json

@@ -63,6 +65,7 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
 
 0 directories, 8 files
 ```
+
 
 3. 修改`configs/cogvideox_2b_infer.yaml`中的文件。
 
 ```yaml

@@ -138,7 +141,7 @@ bash inference.sh
 
 我们支持 `Lora` 和 全参数微调两种方式。请注意,两种微调方式都仅仅对 `transformer` 部分进行微调。不改动 `VAE` 部分。`T5`仅作为
 Encoder 使用。
-部分。 请按照以下方式修改`configs/cogvideox_2b_sft.yaml`(全量微调) 中的文件。
+部分。 请按照以下方式修改`configs/sft.yaml`(全量微调) 中的文件。
 
 ```yaml
 # checkpoint_activations: True ## using gradient checkpointing (配置文件中的两个checkpoint_activations都需要设置为True)

@@ -160,9 +163,16 @@ Encoder 使用。
 num_workers: 8 # 数据加载器的工作线程数
 force_train: True # 在加载checkpoint时允许missing keys (T5 和 VAE 单独加载)
 only_log_video_latents: True # 避免VAE decode带来的显存开销
+deepspeed:
+  bf16:
+    enabled: False # For CogVideoX-2B Turn to False and For CogVideoX-5B Turn to True
+  fp16:
+    enabled: True # For CogVideoX-2B Turn to True and For CogVideoX-5B Turn to False
 ```
 
-如果你希望使用 Lora 微调,你还需要修改:
+如果你希望使用 Lora 微调,你还需要修改`cogvideox_<模型参数>_lora` 文件:
 
+这里以 `CogVideoX-2B` 为参考:
+
 ```yaml
 model:

@@ -178,15 +188,46 @@ model:
       r: 256
 ```
 
+### 修改运行脚本
+
+编辑`finetune_single_gpu.sh` 或者 `finetune_multi_gpus.sh`,选择配置文件。下面是两个例子:
+
+1. 如果您想使用 `CogVideoX-2B` 模型并使用`Lora`方案,您需要修改`finetune_single_gpu.sh` 或者 `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+2. 如果您想使用 `CogVideoX-2B` 模型并使用`全量微调`方案,您需要修改`finetune_single_gpu.sh`
+   或者 `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b.yaml configs/sft.yaml --seed $RANDOM"
+```
+
 ### 微调和验证
 
-1. 运行推理代码,即可开始微调。
+运行推理代码,即可开始微调。
 
 ```shell
 bash finetune_single_gpu.sh # Single GPU
 bash finetune_multi_gpus.sh # Multi GPUs
 ```
 
+### 使用微调后的模型
+
+微调后的模型无法合并,这里展现了如何修改推理配置文件 `inference.sh`
+
+```
+run_cmd="$environs python sample_video.py --base configs/cogvideox_<模型参数>_lora.yaml configs/inference.yaml --seed 42"
+```
+
+然后,执行代码:
+
+```
+bash inference.sh
+```
+
 ### 转换到 Huggingface Diffusers 库支持的权重
 
 SAT 权重格式与 Huggingface 的权重格式不同,需要转换。请运行
@@ -1,75 +1,9 @@
-args:
-  checkpoint_activations: True ## using gradient checkpointing
-  model_parallel_size: 1
-  experiment_name: lora-disney
-  mode: finetune
-  load: "CogVideoX-2b-sat/transformer"
-  no_load_rng: True
-  train_iters: 1000
-  eval_iters: 1
-  eval_interval: 100
-  eval_batch_size: 1
-  save: ckpts
-  save_interval: 100
-  log_interval: 20
-  train_data: ["disney"]
-  valid_data: ["disney"]
-  split: 1,0,0
-  num_workers: 8
-  force_train: True
-  only_log_video_latents: True
-
-data:
-  target: data_video.SFTDataset
-  params:
-    video_size: [480, 720]
-    fps: 8
-    max_num_frames: 49
-    skip_frms_num: 3.
-
-deepspeed:
-  train_micro_batch_size_per_gpu: 1
-  gradient_accumulation_steps: 1
-  steps_per_print: 50
-  gradient_clipping: 0.1
-  zero_optimization:
-    stage: 2
-    cpu_offload: false
-    contiguous_gradients: false
-    overlap_comm: true
-    reduce_scatter: true
-    reduce_bucket_size: 1000000000
-    allgather_bucket_size: 1000000000
-    load_from_fp32_weights: false
-  zero_allow_untested_optimizer: true
-  bf16:
-    enabled: False
-  fp16:
-    enabled: True
-    loss_scale: 0
-    loss_scale_window: 400
-    hysteresis: 2
-    min_loss_scale: 1
-  optimizer:
-    type: sat.ops.FusedEmaAdam
-    params:
-      lr: 0.0002
-      betas: [0.9, 0.95]
-      eps: 1e-8
-      weight_decay: 1e-4
-  activation_checkpointing:
-    partition_activations: false
-    contiguous_memory_optimization: false
-  wall_clock_breakdown: false
-
 model:
   scale_factor: 1.15258426
   disable_first_stage_autocast: true
-  not_trainable_prefixes: ['all'] ## Using Lora
   log_keys:
     - txt

@@ -119,11 +53,6 @@ model:
     height_interpolation: 1.875
     width_interpolation: 1.875
 
-  lora_config: ## Using Lora
-    target: sat.model.finetune.lora2.LoraMixin
-    params:
-      r: 128
-
   patch_embed_config:
     target: dit_video_concat.ImagePatchEmbeddingMixin
     params:

@@ -146,14 +75,14 @@ model:
         ucg_rate: 0.1
         target: sgm.modules.encoders.modules.FrozenT5Embedder
         params:
-          model_dir: "google/t5-v1_1-xxl"
+          model_dir: "t5-v1_1-xxl"
           max_length: 226
 
   first_stage_config:
     target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
     params:
       cp_size: 1
-      ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt"
+      ckpt_path: "cogvideox-2b-sat/vae/3d-vae.pt"
       ignore_keys: [ 'loss' ]

@@ -190,7 +119,7 @@ model:
       attn_resolutions: [ ]
       num_res_blocks: 3
       dropout: 0.0
-      gather_norm: false
+      gather_norm: False
 
   loss_fn_config:
     target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
@@ -1,19 +1,7 @@
-args:
-  latent_channels: 16
-  mode: inference
-  load: "CogVideoX-2b-sat/transformer"
-  batch_size: 1
-  input_type: txt
-  input_file: test.txt
-  sampling_num_frames: 13 # Must be 13, 11 or 9
-  sampling_fps: 8
-  fp16: True
-  output_dir: outputs/
-  force_inference: True
-
 model:
   scale_factor: 1.15258426
   disable_first_stage_autocast: true
+  not_trainable_prefixes: ['all'] ## Using Lora
   log_keys:
     - txt
 

@@ -50,6 +38,7 @@ model:
     num_attention_heads: 30
 
     transformer_args:
+      checkpoint_activations: True ## using gradient checkpointing
       vocab_size: 1
       max_sequence_length: 64
       layernorm_order: pre

@@ -65,6 +54,11 @@ model:
     height_interpolation: 1.875
     width_interpolation: 1.875
 
+  lora_config:
+    target: sat.model.finetune.lora2.LoraMixin
+    params:
+      r: 128
+
   patch_embed_config:
     target: dit_video_concat.ImagePatchEmbeddingMixin
     params:

@@ -87,14 +81,14 @@ model:
         ucg_rate: 0.1
         target: sgm.modules.encoders.modules.FrozenT5Embedder
         params:
-          model_dir: "google/t5-v1_1-xxl"
+          model_dir: "t5-v1_1-xxl"
           max_length: 226
 
   first_stage_config:
     target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
     params:
       cp_size: 1
-      ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt"
+      ckpt_path: "cogvideox-2b-sat/vae/3d-vae.pt"
       ignore_keys: [ 'loss' ]

@@ -131,7 +125,7 @@ model:
      attn_resolutions: [ ]
       num_res_blocks: 3
       dropout: 0.0
-      gather_norm: false
+      gather_norm: False
 
   loss_fn_config:
     target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
sat/configs/inference.yaml (new file, 15 lines)

@@ -0,0 +1,15 @@
+args:
+  latent_channels: 16
+  mode: inference
+  # load: "{your_CogVideoX-2b-sat_path}/transformer" # This is for the full model without the lora adapter
+  # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # This is for the model with the lora adapter
+
+  batch_size: 1
+  input_type: txt
+  input_file: configs/test.txt
+  sampling_num_frames: 13 # Must be 13, 11 or 9
+  sampling_fps: 8
+  fp16: True # For CogVideoX-2B
+  # bf16: True # For CogVideoX-5B
+  output_dir: outputs/
+  force_inference: True
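Both `load:` lines in the new `sat/configs/inference.yaml` ship commented out; exactly one should be uncommented, pointing either at the base transformer or at a lora save directory (the `lora-disney-08-20-13-28` name above is only an example). A hedged pre-flight check:

```shell
# Fail fast if no load: line has been uncommented yet.
grep -qE '^[[:space:]]*load:' configs/inference.yaml \
  || echo "edit configs/inference.yaml: uncomment exactly one load: line"
```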
sat/configs/sft.yaml (new file, 65 lines)

@@ -0,0 +1,65 @@
+args:
+  checkpoint_activations: True ## using gradient checkpointing
+  model_parallel_size: 1
+  experiment_name: lora-disney
+  mode: finetune
+  load: "cogvideox-2b-sat/transformer"
+  no_load_rng: True
+  train_iters: 1000 # Suggest more than 1000 for Lora; for SFT, 500 is enough
+  eval_iters: 1
+  eval_interval: 100
+  eval_batch_size: 1
+  save: ckpts_2b_lora
+  save_interval: 500
+  log_interval: 20
+  train_data: [ "disney" ] # Train data path
+  valid_data: [ "disney" ] # Validation data path, can be the same as train_data (not recommended)
+  split: 1,0,0
+  num_workers: 8
+  force_train: True
+  only_log_video_latents: True
+
+data:
+  target: data_video.SFTDataset
+  params:
+    video_size: [ 480, 720 ]
+    fps: 8
+    max_num_frames: 49
+    skip_frms_num: 3.
+
+deepspeed:
+  # Minimum of 16 videos per batch across ALL GPUs; this setting is for 8 x A100 GPUs
+  train_micro_batch_size_per_gpu: 2
+  gradient_accumulation_steps: 1
+  steps_per_print: 50
+  gradient_clipping: 0.1
+  zero_optimization:
+    stage: 2
+    cpu_offload: false
+    contiguous_gradients: false
+    overlap_comm: true
+    reduce_scatter: true
+    reduce_bucket_size: 1000000000
+    allgather_bucket_size: 1000000000
+    load_from_fp32_weights: false
+  zero_allow_untested_optimizer: true
+  bf16:
+    enabled: False # For CogVideoX-2B turn to False and for CogVideoX-5B turn to True
+  fp16:
+    enabled: True # For CogVideoX-2B turn to True and for CogVideoX-5B turn to False
+    loss_scale: 0
+    loss_scale_window: 400
+    hysteresis: 2
+    min_loss_scale: 1
+
+  optimizer:
+    type: sat.ops.FusedEmaAdam
+    params:
+      lr: 0.001 # Between 1E-3 and 5E-4 for Lora; 1E-5 for SFT
+      betas: [ 0.9, 0.95 ]
+      eps: 1e-8
+      weight_decay: 1e-4
+  activation_checkpointing:
+    partition_activations: false
+    contiguous_memory_optimization: false
+  wall_clock_breakdown: false
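The deepspeed comment above implies the effective batch size: with the 8-GPU launch in `finetune_multi_gpus.sh`, `train_micro_batch_size_per_gpu: 2` and `gradient_accumulation_steps: 1` give 8 × 2 × 1 = 16 videos per optimizer step, matching the stated minimum. A quick check of that arithmetic:

```shell
# videos per optimizer step = GPUs * micro batch per GPU * grad accumulation steps
gpus=8; micro=2; accum=1
echo $((gpus * micro * accum))   # prints 16
```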
@@ -1,8 +1,8 @@
 #! /bin/bash
 
-echo "RUN on `hostname`, CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+echo "RUN on $(hostname), CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 
-run_cmd="torchrun --standalone --nproc_per_node=4 train_video.py --base configs/cogvideox_2b_sft.yaml --seed $RANDOM"
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
 
 echo ${run_cmd}
 eval ${run_cmd}
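The updated launch hard-codes `--nproc_per_node=8`; on a machine with fewer GPUs, a hedged variant simply scales that flag down (note the 16-videos-per-step comment in `configs/sft.yaml` assumes 8 GPUs, so `gradient_accumulation_steps` may need raising to compensate):

```shell
# Illustrative 2-GPU variant of the updated command.
torchrun --standalone --nproc_per_node=2 train_video.py \
  --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM
```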
@ -4,7 +4,7 @@ echo "RUN on `hostname`, CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
|
|||||||
|
|
||||||
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
|
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
|
||||||
|
|
||||||
run_cmd="$environs python train_video.py --base configs/cogvideox_2b_sft.yaml --seed $RANDOM"
|
run_cmd="$environs python train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
|
||||||
|
|
||||||
echo ${run_cmd}
|
echo ${run_cmd}
|
||||||
eval ${run_cmd}
|
eval ${run_cmd}
|
||||||
|
@ -4,7 +4,7 @@ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
|
|||||||
|
|
||||||
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
|
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
|
||||||
|
|
||||||
run_cmd="$environs python sample_video.py --base configs/cogvideox_2b_infer.yaml"
|
run_cmd="$environs python sample_video.py --base configs/cogvideox_2b.yaml configs/inference.yaml --seed $RANDOM"
|
||||||
|
|
||||||
echo ${run_cmd}
|
echo ${run_cmd}
|
||||||
eval ${run_cmd}
|
eval ${run_cmd}
|
||||||
|
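For sampling from lora fine-tuned weights, the README hunks above swap in the lora model config; a hedged 2B instance of that placeholder command:

```shell
# Lora variant of the updated inference launch (configs/cogvideox_2b_lora.yaml
# stands in for the README's configs/cogvideox_<model_parameters>_lora.yaml).
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
eval "$environs python sample_video.py --base configs/cogvideox_2b_lora.yaml configs/inference.yaml --seed 42"
```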