From c8c7b62aa16135a184b66dc7d1dd4089cfadd4d9 Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Sat, 9 Nov 2024 22:07:32 +0800
Subject: [PATCH 1/9] update diffusers code
---
inference/cli_demo.py | 48 +++++++++++++--------------
inference/cli_demo_quantization.py | 53 ++++++++++++------------------
requirements.txt | 10 +++---
3 files changed, 49 insertions(+), 62 deletions(-)
diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index 6e1dbf3..6e5ebf5 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -3,15 +3,15 @@ This script demonstrates how to generate a video using the CogVideoX model with
The script supports different types of video generation, including text-to-video (t2v), image-to-video (i2v),
and video-to-video (v2v), depending on the input data and the model weights used.
-- text-to-video: THUDM/CogVideoX-5b or THUDM/CogVideoX-2b
-- video-to-video: THUDM/CogVideoX-5b or THUDM/CogVideoX-2b
-- image-to-video: THUDM/CogVideoX-5b-I2V
+- text-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b
+- video-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b
+- image-to-video: THUDM/CogVideoX-5b-I2V or THUDM/CogVideoX1.5-5b-I2V
Running the Script:
To run the script, use the following command with appropriate arguments:
```bash
-$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX-5b --generate_type "t2v"
+$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v"
```
Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths.
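For reference, a minimal sketch of the text-to-video path this CLI wraps, assuming the diffusers API used elsewhere in this patch (model id, prompt, and settings are illustrative; CogVideoX1.5 uses 81 frames at 16 fps, the 1.0 models 49 frames at 8 fps):
```python
import torch
from diffusers import CogVideoXPipeline, CogVideoXDPMScheduler
from diffusers.utils import export_to_video

# Illustrative t2v sketch; lower num_frames to 49 and fps to 8 for the CogVideoX 1.0 models.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX1.5-5b", torch_dtype=torch.bfloat16)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
pipe.to("cuda")  # or pipe.enable_sequential_cpu_offload() on memory-constrained GPUs

frames = pipe(
    prompt="A girl riding a bike.",
    num_frames=81,
    num_inference_steps=50,
    guidance_scale=6.0,
    use_dynamic_cfg=True,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(frames, "./output.mp4", fps=16)
```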
@@ -23,7 +23,6 @@ from typing import Literal
import torch
from diffusers import (
CogVideoXPipeline,
- CogVideoXDDIMScheduler,
CogVideoXDPMScheduler,
CogVideoXImageToVideoPipeline,
CogVideoXVideoToVideoPipeline,
@@ -37,6 +36,7 @@ def generate_video(
model_path: str,
lora_path: str = None,
lora_rank: int = 128,
+ num_frames: int = 81,
output_path: str = "./output.mp4",
image_or_video_path: str = "",
num_inference_steps: int = 50,
@@ -45,6 +45,7 @@ def generate_video(
dtype: torch.dtype = torch.bfloat16,
generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video
seed: int = 42,
+ fps: int = 8,
):
"""
Generates a video based on the given prompt and saves it to the specified path.
@@ -56,11 +57,13 @@ def generate_video(
- lora_rank (int): The rank of the LoRA weights.
- output_path (str): The path where the generated video will be saved.
- num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality.
+ - num_frames (int): Number of frames to generate.
- guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt.
- num_videos_per_prompt (int): Number of videos to generate per prompt.
- dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
- generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
- seed (int): The seed for reproducibility.
+ - fps (int): The frames per second for the generated video.
"""
# 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16).
@@ -109,11 +112,11 @@ def generate_video(
if generate_type == "i2v":
video_generate = pipe(
prompt=prompt,
- image=image, # The path of the image to be used as the background of the video
+ image=image, # The input image; for CogVideoX1.5-5B-I2V the video resolution follows the image, otherwise it is 720 * 480
num_videos_per_prompt=num_videos_per_prompt, # Number of videos to generate per prompt
num_inference_steps=num_inference_steps, # Number of inference steps
- num_frames=49, # Number of frames to generate,changed to 49 for diffusers version `0.30.3` and after.
- use_dynamic_cfg=True, # This id used for DPM Sechduler, for DDIM scheduler, it should be False
+ num_frames=num_frames, # Number of frames to generate
+ use_dynamic_cfg=True, # This is used for the DPM scheduler; for the DDIM scheduler, it should be False
guidance_scale=guidance_scale,
generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility
).frames[0]
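Regarding the dynamic-CFG comment above, a short sketch of the two scheduler set-ups, assuming the schedulers exported by diffusers (model id and prompt are illustrative): use_dynamic_cfg is meant for the DPM scheduler and should stay False with DDIM.
```python
import torch
from diffusers import CogVideoXPipeline, CogVideoXDPMScheduler, CogVideoXDDIMScheduler

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Option 1: DPM scheduler with dynamic classifier-free guidance.
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
frames = pipe(prompt="A girl riding a bike.", num_frames=49, use_dynamic_cfg=True, guidance_scale=6.0).frames[0]

# Option 2: DDIM scheduler with a fixed guidance scale (use_dynamic_cfg=False).
# pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
# frames = pipe(prompt="A girl riding a bike.", num_frames=49, use_dynamic_cfg=False, guidance_scale=6.0).frames[0]
```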
@@ -122,7 +125,7 @@ def generate_video(
prompt=prompt,
num_videos_per_prompt=num_videos_per_prompt,
num_inference_steps=num_inference_steps,
- num_frames=49,
+ num_frames=num_frames,
use_dynamic_cfg=True,
guidance_scale=guidance_scale,
generator=torch.Generator().manual_seed(seed),
@@ -133,13 +136,12 @@ def generate_video(
video=video, # The path of the video to be used as the background of the video
num_videos_per_prompt=num_videos_per_prompt,
num_inference_steps=num_inference_steps,
- # num_frames=49,
+ num_frames=num_frames,
use_dynamic_cfg=True,
guidance_scale=guidance_scale,
generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility
).frames[0]
- # 5. Export the generated frames to a video file. fps must be 8 for original video.
- export_to_video(video_generate, output_path, fps=8)
+ export_to_video(video_generate, output_path, fps=fps)
if __name__ == "__main__":
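For the video-to-video branch above, a hedged sketch of how an input clip can be loaded and re-rendered with the frame count and fps introduced here (paths and model id are illustrative; load_video ships with diffusers alongside load_image and export_to_video):
```python
import torch
from diffusers import CogVideoXVideoToVideoPipeline, CogVideoXDPMScheduler
from diffusers.utils import export_to_video, load_video

pipe = CogVideoXVideoToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
pipe.enable_sequential_cpu_offload()

input_frames = load_video("./input.mp4")  # decoded to a list of PIL images
frames = pipe(
    prompt="A girl riding a bike.",
    video=input_frames,
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
    use_dynamic_cfg=True,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(frames, "./output.mp4", fps=8)
```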
@@ -152,24 +154,18 @@ if __name__ == "__main__":
help="The path of the image to be used as the background of the video",
)
parser.add_argument(
- "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
+ "--model_path", type=str, default="THUDM/CogVideoX-5b", help="Path of the pre-trained model use"
)
parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used")
parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights")
- parser.add_argument(
- "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
- )
+ parser.add_argument("--output_path", type=str, default="./output.mp4", help="The path save generated video")
parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
- parser.add_argument(
- "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
- )
+ parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps")
+ parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
+ parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
- parser.add_argument(
- "--generate_type", type=str, default="t2v", help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')"
- )
- parser.add_argument(
- "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16' or 'bfloat16')"
- )
+ parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation")
+ parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation")
parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
args = parser.parse_args()
@@ -180,6 +176,7 @@ if __name__ == "__main__":
lora_path=args.lora_path,
lora_rank=args.lora_rank,
output_path=args.output_path,
+ num_frames=args.num_frames,
image_or_video_path=args.image_or_video_path,
num_inference_steps=args.num_inference_steps,
guidance_scale=args.guidance_scale,
@@ -187,4 +184,5 @@ if __name__ == "__main__":
dtype=dtype,
generate_type=args.generate_type,
seed=args.seed,
+ fps=args.fps,
)
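The updated generate_video function can also be driven directly from Python rather than through argparse; a usage sketch with the new num_frames and fps arguments (this assumes the script is importable from the inference/ directory and reuses the defaults added above):
```python
import torch
from cli_demo import generate_video  # run from the inference/ directory

generate_video(
    prompt="A girl riding a bike.",
    model_path="THUDM/CogVideoX1.5-5b",
    generate_type="t2v",
    num_frames=81,
    fps=16,
    num_inference_steps=50,
    guidance_scale=6.0,
    output_path="./output.mp4",
    dtype=torch.bfloat16,
    seed=42,
)
```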
diff --git a/inference/cli_demo_quantization.py b/inference/cli_demo_quantization.py
index b56c9d2..1ea3358 100644
--- a/inference/cli_demo_quantization.py
+++ b/inference/cli_demo_quantization.py
@@ -3,7 +3,7 @@ This script demonstrates how to generate a video from a text prompt using CogVid
Note:
-Must install the `torchao`,`torch`,`diffusers`,`accelerate` library FROM SOURCE to use the quantization feature.
+Must install the `torchao` and `torch` libraries FROM SOURCE to use the quantization feature.
Only NVIDIA GPUs such as the H100 or newer support FP8 quantization.
All quantization schemes must be used with NVIDIA GPUs.
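For context on the note above, a minimal sketch of weight-only quantization with torchao, assuming a recent torchao built from source where quantize_ and int8_weight_only are available; this is only indicative of what the script's quantize_model helper does for the text encoder and transformer, not its exact implementation:
```python
import torch
from transformers import T5EncoderModel
from torchao.quantization import quantize_, int8_weight_only

# Load a component in bf16, then quantize its weights in place to int8.
# FP8 follows the same pattern but additionally requires an H100-class GPU.
text_encoder = T5EncoderModel.from_pretrained(
    "THUDM/CogVideoX-5b", subfolder="text_encoder", torch_dtype=torch.bfloat16
)
quantize_(text_encoder, int8_weight_only())
```
The quantized modules are then passed to the pipeline constructor via its text_encoder and transformer arguments, as the hunks below show.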
@@ -51,6 +51,9 @@ def generate_video(
num_videos_per_prompt: int = 1,
quantization_scheme: str = "fp8",
dtype: torch.dtype = torch.bfloat16,
+ num_frames: int = 81,
+ fps: int = 8,
+ seed: int = 42,
):
"""
Generates a video based on the given prompt and saves it to the specified path.
@@ -65,7 +68,6 @@ def generate_video(
- quantization_scheme (str): The quantization scheme to use ('int8', 'fp8').
- dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
"""
-
text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=dtype)
text_encoder = quantize_model(part=text_encoder, quantization_scheme=quantization_scheme)
transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype)
@@ -80,54 +82,38 @@ def generate_video(
torch_dtype=dtype,
)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
-
- # Using with compile will run faster. First time infer will cost ~30min to compile.
- # pipe.transformer.to(memory_format=torch.channels_last)
-
- # for FP8 should remove pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload()
-
- # This is not for FP8 and INT8 and should remove this line
- # pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
+
video = pipe(
prompt=prompt,
num_videos_per_prompt=num_videos_per_prompt,
num_inference_steps=num_inference_steps,
- num_frames=49,
+ num_frames=num_frames,
use_dynamic_cfg=True,
guidance_scale=guidance_scale,
- generator=torch.Generator(device="cuda").manual_seed(42),
+ generator=torch.Generator(device="cuda").manual_seed(seed),
).frames[0]
- export_to_video(video, output_path, fps=8)
+ export_to_video(video, output_path, fps=fps)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX")
parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated")
+ parser.add_argument("--model_path", type=str, default="THUDM/CogVideoX-5b", help="Path of the pre-trained model")
+ parser.add_argument("--output_path", type=str, default="./output.mp4", help="Path to save generated video")
+ parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps")
+ parser.add_argument("--guidance_scale", type=float, default=6.0, help="Classifier-free guidance scale")
+ parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Videos to generate per prompt")
+ parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type (e.g., 'float16', 'bfloat16')")
parser.add_argument(
- "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
- )
- parser.add_argument(
- "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
- )
- parser.add_argument(
- "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
- )
- parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
- parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
- parser.add_argument(
- "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16', 'bfloat16')"
- )
- parser.add_argument(
- "--quantization_scheme",
- type=str,
- default="bf16",
- choices=["int8", "fp8"],
- help="The quantization scheme to use (int8, fp8)",
+ "--quantization_scheme", type=str, default="fp8", choices=["int8", "fp8"], help="Quantization scheme"
)
+ parser.add_argument("--num_frames", type=int, default=81, help="Number of frames in the video")
+ parser.add_argument("--fps", type=int, default=16, help="Frames per second for output video")
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
args = parser.parse_args()
dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
@@ -140,4 +126,7 @@ if __name__ == "__main__":
num_videos_per_prompt=args.num_videos_per_prompt,
quantization_scheme=args.quantization_scheme,
dtype=dtype,
+ num_frames=args.num_frames,
+ fps=args.fps,
+ seed=args.seed,
)
diff --git a/requirements.txt b/requirements.txt
index bb36b92..3008655 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
diffusers>=0.31.0
-accelerate>=1.0.1
-transformers>=4.46.1
+accelerate>=1.1.1
+transformers>=4.46.2
numpy==1.26.0
torch>=2.5.0
torchvision>=0.20.0
sentencepiece>=0.2.0
SwissArmyTransformer>=0.4.12
-gradio>=5.4.0
+gradio>=5.5.0
imageio>=2.35.1
imageio-ffmpeg>=0.5.1
-openai>=1.53.0
+openai>=1.54.0
moviepy>=1.0.3
-scikit-video>=1.1.11
+scikit-video>=1.1.11
\ No newline at end of file
From fb806eecce4dec0a0fb9c4c59c1833f2f6ffe65e Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Sat, 9 Nov 2024 22:29:36 +0800
Subject: [PATCH 2/9] update table
---
README.md | 7 ++++---
README_ja.md | 7 ++++---
README_zh.md | 7 ++++---
3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 9f8122e..8841a4b 100644
--- a/README.md
+++ b/README.md
@@ -199,13 +199,13 @@ models we currently offer, along with their foundational information.
Single GPU Memory Usage |
- SAT BF16: 66GB |
+ SAT BF16: 76GB diffusers BF16: from 10GB* diffusers INT8 (torchao): from 7GB* |
SAT FP16: 18GB diffusers FP16: 4GB minimum* diffusers INT8 (torchao): 3.6GB minimum* |
SAT BF16: 26GB diffusers BF16: 5GB minimum* diffusers INT8 (torchao): 4.4GB minimum* |
Multi-GPU Memory Usage |
- Not Supported |
+ BF16: 24GB* using diffusers |
FP16: 10GB* using diffusers |
BF16: 15GB* using diffusers |
@@ -243,7 +243,8 @@ models we currently offer, along with their foundational information.
Download Link (Diffusers) |
- Coming Soon |
+ 🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
+ 🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
diff --git a/README_ja.md b/README_ja.md
index 9962d1b..b6ad2c6 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -191,13 +191,13 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の
単一GPUメモリ消費量 |
- SAT BF16: 66GB |
+ SAT BF16: 76GB diffusers BF16: 10GBから* diffusers INT8(torchao): 7GBから* |
SAT FP16: 18GB diffusers FP16: 4GB以上* diffusers INT8(torchao): 3.6GB以上* |
SAT BF16: 26GB diffusers BF16: 5GB以上* diffusers INT8(torchao): 4.4GB以上* |
複数GPU推論メモリ消費量 |
- 非対応 |
+ BF16: 24GB* using diffusers |
FP16: 10GB* diffusers使用 |
BF16: 15GB* diffusers使用 |
@@ -235,7 +235,8 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の
ダウンロードリンク (Diffusers) |
- 近日公開 |
+ 🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
+ 🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
diff --git a/README_zh.md b/README_zh.md
index c66fc85..8188db0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -182,13 +182,13 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
单GPU显存消耗 |
- SAT BF16: 66GB |
+ SAT BF16: 76GB diffusers BF16: 10GB起* diffusers INT8(torchao): 7G起* |
SAT FP16: 18GB diffusers FP16: 4GB起* diffusers INT8(torchao): 3.6G起* |
SAT BF16: 26GB diffusers BF16: 5GB起* diffusers INT8(torchao): 4.4G起* |
多GPU推理显存消耗 |
- 不支持 |
+ BF16: 24GB* using diffusers |
FP16: 10GB* using diffusers |
BF16: 15GB* using diffusers |
@@ -226,7 +226,8 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
下载链接 (Diffusers) |
- 即将推出 |
+ 🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
+ 🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
🤗 HuggingFace 🤖 ModelScope 🟣 WiseModel |
From 2475902027a4ed4427e9bcbf9894d5ef4a2c55cb Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Sat, 9 Nov 2024 22:43:02 +0800
Subject: [PATCH 3/9] friendly link
---
README.md | 1 +
README_ja.md | 1 +
README_zh.md | 1 +
3 files changed, 3 insertions(+)
diff --git a/README.md b/README.md
index 8841a4b..b735003 100644
--- a/README.md
+++ b/README.md
@@ -320,6 +320,7 @@ works have already been adapted for CogVideoX, and we invite everyone to use the
+ [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth Studio is a diffusion engine. It has
restructured the architecture, including text encoders, UNet, VAE, etc., enhancing computational performance while
maintaining compatibility with open-source community models. The framework has been adapted for CogVideoX.
++ [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): A simple ControlNet module implementation built around the CogVideoX model.
## Project Structure
diff --git a/README_ja.md b/README_ja.md
index b6ad2c6..e6af4e9 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -300,6 +300,7 @@ pipe.vae.enable_tiling()
キーフレーム補間生成において、より大きな柔軟性を提供することを目的とした、CogVideoX構造を基にした修正版のパイプライン。
+ [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth
Studioは、拡散エンジンです。テキストエンコーダー、UNet、VAEなどを含むアーキテクチャを再構築し、オープンソースコミュニティモデルとの互換性を維持しつつ、計算性能を向上させました。このフレームワークはCogVideoXに適応しています。
++ [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): CogVideoXモデルを含むシンプルなControlNetモジュールのコード。
## プロジェクト構造
diff --git a/README_zh.md b/README_zh.md
index 8188db0..c4a370e 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -281,6 +281,7 @@ pipe.vae.enable_tiling()
+ [CogVideoX-Interpolation](https://github.com/feizc/CogvideX-Interpolation): 基于 CogVideoX 结构修改的管道,旨在为关键帧插值生成提供更大的灵活性。
+ [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth 工作室是一款扩散引擎。重构了架构,包括文本编码器、UNet、VAE
等,在保持与开源社区模型兼容性的同时,提升了计算性能。该框架已经适配 CogVideoX。
++ [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): 一个包含 CogvideoX 模型的简单 Controlnet 模块的代码。
## 完整项目代码结构
From 155456befa8a3fd077ca03eaaf7ca19ea64fe133 Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Sat, 9 Nov 2024 22:49:03 +0800
Subject: [PATCH 4/9] update
---
README.md | 10 ++++------
README_ja.md | 9 ++++-----
README_zh.md | 7 +++----
3 files changed, 11 insertions(+), 15 deletions(-)
diff --git a/README.md b/README.md
index b735003..d20ffab 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,8 @@ Experience the CogVideoX-5B model online at
Date: Tue, 12 Nov 2024 00:17:19 +0800
Subject: [PATCH 7/9] add width and height
---
inference/cli_demo.py | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index db281bb..bc97dd8 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -37,6 +37,8 @@ def generate_video(
lora_path: str = None,
lora_rank: int = 128,
num_frames: int = 81,
+ width: int = 1360,
+ height: int = 768,
output_path: str = "./output.mp4",
image_or_video_path: str = "",
num_inference_steps: int = 50,
@@ -58,6 +60,8 @@ def generate_video(
- output_path (str): The path where the generated video will be saved.
- num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality.
- num_frames (int): Number of frames to generate. CogVideoX1.0 generates 49 frames for 6 seconds at 8 fps, while CogVideoX1.5 produces either 81 or 161 frames, corresponding to 5 seconds or 10 seconds at 16 fps.
+ - width (int): The width of the generated video, applicable only for CogVideoX1.5-5B-I2V
+ - height (int): The height of the generated video, applicable only for CogVideoX1.5-5B-I2V
- guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt.
- num_videos_per_prompt (int): Number of videos to generate per prompt.
- dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
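To make the version-specific numbers above concrete, a hedged sketch of picking resolution, frame count, and fps per model family (the values mirror the ones used in this patch series; consult the model cards for the authoritative settings):
```python
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Indicative defaults: CogVideoX 1.0 I2V vs CogVideoX1.5 I2V.
DEFAULTS = {
    "THUDM/CogVideoX-5b-I2V": {"width": 720, "height": 480, "num_frames": 49, "fps": 8},
    "THUDM/CogVideoX1.5-5b-I2V": {"width": 1360, "height": 768, "num_frames": 81, "fps": 16},
}

model_id = "THUDM/CogVideoX1.5-5b-I2V"
cfg = DEFAULTS[model_id]
pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe.enable_sequential_cpu_offload()

frames = pipe(
    prompt="A girl riding a bike.",
    image=load_image("./input.jpg"),
    width=cfg["width"],
    height=cfg["height"],
    num_frames=cfg["num_frames"],
).frames[0]
export_to_video(frames, "./output.mp4", fps=cfg["fps"])
```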
@@ -111,8 +115,11 @@ def generate_video(
# This is the default for a 6-second video at 8 fps, plus 1 extra first frame, for 49 frames in total.
if generate_type == "i2v":
video_generate = pipe(
+ height=height,
+ width=width,
prompt=prompt,
- image=image, # The input image; for CogVideoX1.5-5B-I2V the video resolution follows the image, otherwise it is 720 * 480
+ image=image,
+ # The input image; for CogVideoX1.5-5B-I2V the video resolution follows the image, otherwise it is 720 * 480
num_videos_per_prompt=num_videos_per_prompt, # Number of videos to generate per prompt
num_inference_steps=num_inference_steps, # Number of inference steps
num_frames=num_frames, # Number of frames to generate
@@ -162,6 +169,8 @@ if __name__ == "__main__":
parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps")
parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
+ parser.add_argument("--width", type=int, default=1360, help="Number of steps for the inference process")
+ parser.add_argument("--height", type=int, default=768, help="Number of steps for the inference process")
parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation")
@@ -177,6 +186,8 @@ if __name__ == "__main__":
lora_rank=args.lora_rank,
output_path=args.output_path,
num_frames=args.num_frames,
+ width=args.width,
+ height=args.height,
image_or_video_path=args.image_or_video_path,
num_inference_steps=args.num_inference_steps,
guidance_scale=args.guidance_scale,
From e7a35ea33bf4e03fbbe04b2b63fda7b6fae85eee Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Wed, 13 Nov 2024 17:06:16 +0800
Subject: [PATCH 8/9] update friendly link
---
.gitignore | 3 ++-
README.md | 3 ++-
README_ja.md | 3 ++-
README_zh.md | 4 +++-
4 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/.gitignore b/.gitignore
index 9669d70..6be6f4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ logs/
*.DS_Store
.idea
output*
-test*
\ No newline at end of file
+test*
+venv
\ No newline at end of file
diff --git a/README.md b/README.md
index b36c9e3..67c6a21 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ Experience the CogVideoX-5B model online at
Inference Precision |
- BF16 |
+ BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported: INT4 |
FP16*(Recommended), BF16, FP32, FP8*, INT8, Not supported: INT4 |
BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported: INT4 |
diff --git a/README_ja.md b/README_ja.md
index 26b02c1..a7aa11b 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -186,7 +186,7 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の
推論精度 |
- BF16 |
+ BF16(推奨), FP16, FP32,FP8*,INT8,INT4非対応 |
FP16*(推奨), BF16, FP32,FP8*,INT8,INT4非対応 |
BF16(推奨), FP16, FP32,FP8*,INT8,INT4非対応 |
diff --git a/README_zh.md b/README_zh.md
index f456376..704c467 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -176,7 +176,7 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源
推理精度 |
- BF16 |
+ BF16(推荐), FP16, FP32,FP8*,INT8,不支持INT4 |
FP16*(推荐), BF16, FP32,FP8*,INT8,不支持INT4 |
BF16(推荐), FP16, FP32,FP8*,INT8,不支持INT4 |
diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index bc97dd8..a211b4b 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -103,16 +103,13 @@ def generate_video(
# Turn off CPU offload if you have multiple GPUs or enough GPU memory (such as an H100); inference will take less time,
# and enable pipe.to("cuda") instead
- pipe.to("cuda")
-
- # pipe.enable_sequential_cpu_offload()
-
+ # pipe.to("cuda")
+ pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
# 4. Generate the video frames based on the prompt.
# `num_frames` is the Number of frames to generate.
- # This is the default for a 6-second video at 8 fps, plus 1 extra first frame, for 49 frames in total.
if generate_type == "i2v":
video_generate = pipe(
height=height,
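A small sketch of the memory/speed trade-off behind the change above, assuming a CUDA device (sequential CPU offload minimizes VRAM at the cost of speed; keeping the whole pipeline on the GPU is faster when memory allows):
```python
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX1.5-5b", torch_dtype=torch.bfloat16)

low_vram = True
if low_vram:
    # Streams weights to the GPU module by module; slowest option, smallest GPU footprint.
    pipe.enable_sequential_cpu_offload()
else:
    # Keep the whole pipeline resident on the GPU; fastest when VRAM is plentiful (e.g. an H100).
    pipe.to("cuda")

# VAE slicing and tiling lower the peak memory of decoding in either configuration.
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
```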
diff --git a/tools/convert_weight_sat2hf.py b/tools/convert_weight_sat2hf.py
index f325018..b70af1a 100644
--- a/tools/convert_weight_sat2hf.py
+++ b/tools/convert_weight_sat2hf.py
@@ -92,6 +92,8 @@ TRANSFORMER_KEYS_RENAME_DICT = {
"post_attn1_layernorm": "norm2.norm",
"time_embed.0": "time_embedding.linear_1",
"time_embed.2": "time_embedding.linear_2",
+ "ofs_embed.0": "ofs_embedding.linear_1",
+ "ofs_embed.2": "ofs_embedding.linear_2",
"mixins.patch_embed": "patch_embed",
"mixins.final_layer.norm_final": "norm_out.norm",
"mixins.final_layer.linear": "proj_out",
@@ -146,12 +148,13 @@ def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key:
def convert_transformer(
- ckpt_path: str,
- num_layers: int,
- num_attention_heads: int,
- use_rotary_positional_embeddings: bool,
- i2v: bool,
- dtype: torch.dtype,
+ ckpt_path: str,
+ num_layers: int,
+ num_attention_heads: int,
+ use_rotary_positional_embeddings: bool,
+ i2v: bool,
+ dtype: torch.dtype,
+ init_kwargs: Dict[str, Any],
):
PREFIX_KEY = "model.diffusion_model."
@@ -161,11 +164,13 @@ def convert_transformer(
num_layers=num_layers,
num_attention_heads=num_attention_heads,
use_rotary_positional_embeddings=use_rotary_positional_embeddings,
- use_learned_positional_embeddings=i2v,
+ ofs_embed_dim=512 if (i2v and init_kwargs["patch_size_t"] is not None) else None, # CogVideoX1.5-5B-I2V
+ use_learned_positional_embeddings=i2v and init_kwargs["patch_size_t"] is None, # CogVideoX-5B-I2V
+ **init_kwargs,
).to(dtype=dtype)
for key in list(original_state_dict.keys()):
- new_key = key[len(PREFIX_KEY):]
+ new_key = key[len(PREFIX_KEY) :]
for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
new_key = new_key.replace(replace_key, rename_key)
update_state_dict_inplace(original_state_dict, key, new_key)
@@ -175,13 +180,18 @@ def convert_transformer(
if special_key not in key:
continue
handler_fn_inplace(key, original_state_dict)
+
transformer.load_state_dict(original_state_dict, strict=True)
return transformer
-def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype):
+def convert_vae(ckpt_path: str, scaling_factor: float, version: str, dtype: torch.dtype):
+ init_kwargs = {"scaling_factor": scaling_factor}
+ if version == "1.5":
+ init_kwargs.update({"invert_scale_latents": True})
+
original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
- vae = AutoencoderKLCogVideoX(scaling_factor=scaling_factor).to(dtype=dtype)
+ vae = AutoencoderKLCogVideoX(**init_kwargs).to(dtype=dtype)
for key in list(original_state_dict.keys()):
new_key = key[:]
@@ -199,6 +209,34 @@ def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype):
return vae
+def get_transformer_init_kwargs(version: str):
+ if version == "1.0":
+ vae_scale_factor_spatial = 8
+ init_kwargs = {
+ "patch_size": 2,
+ "patch_size_t": None,
+ "patch_bias": True,
+ "sample_height": 480 // vae_scale_factor_spatial,
+ "sample_width": 720 // vae_scale_factor_spatial,
+ "sample_frames": 49,
+ }
+
+ elif version == "1.5":
+ vae_scale_factor_spatial = 8
+ init_kwargs = {
+ "patch_size": 2,
+ "patch_size_t": 2,
+ "patch_bias": False,
+ "sample_height": 768 // vae_scale_factor_spatial,
+ "sample_width": 1360 // vae_scale_factor_spatial,
+ "sample_frames": 81,
+ }
+ else:
+ raise ValueError("Unsupported version of CogVideoX.")
+
+ return init_kwargs
+
+
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -214,6 +252,12 @@ def get_args():
parser.add_argument(
"--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
)
+ parser.add_argument(
+ "--typecast_text_encoder",
+ action="store_true",
+ default=False,
+ help="Whether or not to apply fp16/bf16 precision to text_encoder",
+ )
# For CogVideoX-2B, num_layers is 30. For 5B, it is 42
parser.add_argument("--num_layers", type=int, default=30, help="Number of transformer blocks")
# For CogVideoX-2B, num_attention_heads is 30. For 5B, it is 48
@@ -226,7 +270,18 @@ def get_args():
parser.add_argument("--scaling_factor", type=float, default=1.15258426, help="Scaling factor in the VAE")
# For CogVideoX-2B, snr_shift_scale is 3.0. For 5B, it is 1.0
parser.add_argument("--snr_shift_scale", type=float, default=3.0, help="Scaling factor in the VAE")
- parser.add_argument("--i2v", action="store_true", default=False, help="Whether to save the model weights in fp16")
+ parser.add_argument(
+ "--i2v",
+ action="store_true",
+ default=False,
+ help="Whether the model to be converted is the Image-to-Video version of CogVideoX.",
+ )
+ parser.add_argument(
+ "--version",
+ choices=["1.0", "1.5"],
+ default="1.0",
+ help="Which version of CogVideoX to use for initializing default modeling parameters.",
+ )
return parser.parse_args()
@@ -242,6 +297,7 @@ if __name__ == "__main__":
dtype = torch.float16 if args.fp16 else torch.bfloat16 if args.bf16 else torch.float32
if args.transformer_ckpt_path is not None:
+ init_kwargs = get_transformer_init_kwargs(args.version)
transformer = convert_transformer(
args.transformer_ckpt_path,
args.num_layers,
@@ -249,14 +305,19 @@ if __name__ == "__main__":
args.use_rotary_positional_embeddings,
args.i2v,
dtype,
+ init_kwargs,
)
if args.vae_ckpt_path is not None:
- vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, dtype)
+ # Keep VAE in float32 for better quality
+ vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, args.version, torch.float32)
- text_encoder_id = "/share/official_pretrains/hf_home/t5-v1_1-xxl"
+ text_encoder_id = "google/t5-v1_1-xxl"
tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
+ if args.typecast_text_encoder:
+ text_encoder = text_encoder.to(dtype=dtype)
+
# Apparently, the conversion does not work anymore without this :shrug:
for param in text_encoder.parameters():
param.data = param.data.contiguous()
@@ -288,11 +349,6 @@ if __name__ == "__main__":
scheduler=scheduler,
)
- if args.fp16:
- pipe = pipe.to(dtype=torch.float16)
- if args.bf16:
- pipe = pipe.to(dtype=torch.bfloat16)
-
# We don't use variant here because the model must be run in fp16 (2B) or bf16 (5B). It would be weird
# for users to specify variant when the default is not fp32 and they want to run with the correct default (which
# is either fp16/bf16 here).