From c8c7b62aa16135a184b66dc7d1dd4089cfadd4d9 Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Sat, 9 Nov 2024 22:07:32 +0800 Subject: [PATCH 1/9] update diffusers code --- inference/cli_demo.py | 48 +++++++++++++-------------- inference/cli_demo_quantization.py | 53 ++++++++++++------------------ requirements.txt | 10 +++--- 3 files changed, 49 insertions(+), 62 deletions(-) diff --git a/inference/cli_demo.py b/inference/cli_demo.py index 6e1dbf3..6e5ebf5 100644 --- a/inference/cli_demo.py +++ b/inference/cli_demo.py @@ -3,15 +3,15 @@ This script demonstrates how to generate a video using the CogVideoX model with The script supports different types of video generation, including text-to-video (t2v), image-to-video (i2v), and video-to-video (v2v), depending on the input data and different weight. -- text-to-video: THUDM/CogVideoX-5b or THUDM/CogVideoX-2b -- video-to-video: THUDM/CogVideoX-5b or THUDM/CogVideoX-2b -- image-to-video: THUDM/CogVideoX-5b-I2V +- text-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- video-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- image-to-video: THUDM/CogVideoX-5b-I2V or THUDM/CogVideoX1.5-5b-I2V Running the Script: To run the script, use the following command with appropriate arguments: ```bash -$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX-5b --generate_type "t2v" +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" ``` Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. @@ -23,7 +23,6 @@ from typing import Literal import torch from diffusers import ( CogVideoXPipeline, - CogVideoXDDIMScheduler, CogVideoXDPMScheduler, CogVideoXImageToVideoPipeline, CogVideoXVideoToVideoPipeline, @@ -37,6 +36,7 @@ def generate_video( model_path: str, lora_path: str = None, lora_rank: int = 128, + num_frames=81, output_path: str = "./output.mp4", image_or_video_path: str = "", num_inference_steps: int = 50, @@ -45,6 +45,7 @@ def generate_video( dtype: torch.dtype = torch.bfloat16, generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video seed: int = 42, + fps: int = 8, ): """ Generates a video based on the given prompt and saves it to the specified path. @@ -56,11 +57,13 @@ def generate_video( - lora_rank (int): The rank of the LoRA weights. - output_path (str): The path where the generated video will be saved. - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. + - num_frames (int): Number of frames to generate. - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. - num_videos_per_prompt (int): Number of videos to generate per prompt. - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').· - seed (int): The seed for reproducibility. + - fps (int): The frames per second for the generated video. """ # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). 
@@ -109,11 +112,11 @@ def generate_video( if generate_type == "i2v": video_generate = pipe( prompt=prompt, - image=image, # The path of the image to be used as the background of the video + image=image, # The path of the image, the resolution of video will be the same as the image for CogVideoX1.5-5B-I2V, otherwise it will be 720 * 480 num_videos_per_prompt=num_videos_per_prompt, # Number of videos to generate per prompt num_inference_steps=num_inference_steps, # Number of inference steps - num_frames=49, # Number of frames to generate,changed to 49 for diffusers version `0.30.3` and after. - use_dynamic_cfg=True, # This id used for DPM Sechduler, for DDIM scheduler, it should be False + num_frames=num_frames, # Number of frames to generate + use_dynamic_cfg=True, # This is used for the DPM scheduler; for the DDIM scheduler, it should be False guidance_scale=guidance_scale, generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility ).frames[0] @@ -122,7 +125,7 @@ prompt=prompt, num_videos_per_prompt=num_videos_per_prompt, num_inference_steps=num_inference_steps, - num_frames=49, + num_frames=num_frames, use_dynamic_cfg=True, guidance_scale=guidance_scale, generator=torch.Generator().manual_seed(seed), @@ -133,13 +136,12 @@ video=video, # The path of the video to be used as the background of the video num_videos_per_prompt=num_videos_per_prompt, num_inference_steps=num_inference_steps, - # num_frames=49, + num_frames=num_frames, use_dynamic_cfg=True, guidance_scale=guidance_scale, generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility ).frames[0] - # 5. Export the generated frames to a video file. fps must be 8 for original video. - export_to_video(video_generate, output_path, fps=8) + export_to_video(video_generate, output_path, fps=fps) if __name__ == "__main__": @@ -152,24 +154,18 @@ if __name__ == "__main__": help="The path of the image to be used as the background of the video", ) parser.add_argument( - "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used" + "--model_path", type=str, default="THUDM/CogVideoX-5b", help="Path of the pre-trained model to be used" ) parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used") parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights") - parser.add_argument( - "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved" - ) + parser.add_argument("--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved") parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") - parser.add_argument( - "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process" - ) + parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps") + parser.add_argument("--num_frames", type=int, default=81, help="Number of frames to generate") + parser.add_argument("--fps", type=int, default=16, help="Frames per second for the output video") parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") - parser.add_argument( - "--generate_type", type=str, default="t2v", help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')" - ) - parser.add_argument( - "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16' or 'bfloat16')" - ) + parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation") + parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation") parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") args = parser.parse_args() @@ -180,6 +176,7 @@ if __name__ == "__main__": lora_path=args.lora_path, lora_rank=args.lora_rank, output_path=args.output_path, + num_frames=args.num_frames, image_or_video_path=args.image_or_video_path, num_inference_steps=args.num_inference_steps, guidance_scale=args.guidance_scale, @@ -187,4 +184,5 @@ if __name__ == "__main__": dtype=dtype, generate_type=args.generate_type, seed=args.seed, + fps=args.fps, ) diff --git a/inference/cli_demo_quantization.py b/inference/cli_demo_quantization.py index b56c9d2..1ea3358 100644 --- a/inference/cli_demo_quantization.py +++ b/inference/cli_demo_quantization.py @@ -3,7 +3,7 @@ This script demonstrates how to generate a video from a text prompt using CogVid Note: -Must install the `torchao`,`torch`,`diffusers`,`accelerate` library FROM SOURCE to use the quantization feature. +Must install the `torchao` and `torch` libraries FROM SOURCE to use the quantization feature. Only NVIDIA GPUs like H100 or higher are supported for FP-8 quantization. ALL quantization schemes must be used with NVIDIA GPUs. @@ -51,6 +51,9 @@ def generate_video( num_videos_per_prompt: int = 1, quantization_scheme: str = "fp8", dtype: torch.dtype = torch.bfloat16, + num_frames: int = 81, + fps: int = 8, + seed: int = 42, ): """ Generates a video based on the given prompt and saves it to the specified path. @@ -65,7 +68,6 @@ def generate_video( - quantization_scheme (str): The quantization scheme to use ('int8', 'fp8'). - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). """ - text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=dtype) text_encoder = quantize_model(part=text_encoder, quantization_scheme=quantization_scheme) transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=dtype) @@ -80,54 +82,38 @@ def generate_video( torch_dtype=dtype, ) pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") - - # Using with compile will run faster. First time infer will cost ~30min to compile. 
- # pipe.transformer.to(memory_format=torch.channels_last) - - # for FP8 should remove pipe.enable_model_cpu_offload() pipe.enable_model_cpu_offload() - - # This is not for FP8 and INT8 and should remove this line - # pipe.enable_sequential_cpu_offload() pipe.vae.enable_slicing() pipe.vae.enable_tiling() + video = pipe( prompt=prompt, num_videos_per_prompt=num_videos_per_prompt, num_inference_steps=num_inference_steps, - num_frames=49, + num_frames=num_frames, use_dynamic_cfg=True, guidance_scale=guidance_scale, - generator=torch.Generator(device="cuda").manual_seed(42), + generator=torch.Generator(device="cuda").manual_seed(seed), ).frames[0] - export_to_video(video, output_path, fps=8) + export_to_video(video, output_path, fps=fps) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX") parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated") + parser.add_argument("--model_path", type=str, default="THUDM/CogVideoX-5b", help="Path of the pre-trained model") + parser.add_argument("--output_path", type=str, default="./output.mp4", help="Path to save generated video") + parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps") + parser.add_argument("--guidance_scale", type=float, default=6.0, help="Classifier-free guidance scale") + parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Videos to generate per prompt") + parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type (e.g., 'float16', 'bfloat16')") parser.add_argument( - "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used" - ) - parser.add_argument( - "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved" - ) - parser.add_argument( - "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process" - ) - parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") - parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") - parser.add_argument( - "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16', 'bfloat16')" - ) - parser.add_argument( - "--quantization_scheme", - type=str, - default="bf16", - choices=["int8", "fp8"], - help="The quantization scheme to use (int8, fp8)", + "--quantization_scheme", type=str, default="fp8", choices=["int8", "fp8"], help="Quantization scheme" ) + parser.add_argument("--num_frames", type=int, default=81, help="Number of frames in the video") + parser.add_argument("--fps", type=int, default=16, help="Frames per second for output video") + parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility") args = parser.parse_args() dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 @@ -140,4 +126,7 @@ if __name__ == "__main__": num_videos_per_prompt=args.num_videos_per_prompt, quantization_scheme=args.quantization_scheme, dtype=dtype, + num_frames=args.num_frames, + fps=args.fps, + seed=args.seed, ) diff --git a/requirements.txt b/requirements.txt index bb36b92..3008655 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ diffusers>=0.31.0 -accelerate>=1.0.1 -transformers>=4.46.1 +accelerate>=1.1.1 +transformers>=4.46.2 numpy==1.26.0 torch>=2.5.0 torchvision>=0.20.0 
sentencepiece>=0.2.0 SwissArmyTransformer>=0.4.12 -gradio>=5.4.0 +gradio>=5.5.0 imageio>=2.35.1 imageio-ffmpeg>=0.5.1 -openai>=1.53.0 +openai>=1.54.0 moviepy>=1.0.3 -scikit-video>=1.1.11 +scikit-video>=1.1.11 \ No newline at end of file From fb806eecce4dec0a0fb9c4c59c1833f2f6ffe65e Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Sat, 9 Nov 2024 22:29:36 +0800 Subject: [PATCH 2/9] update table --- README.md | 7 ++++--- README_ja.md | 7 ++++--- README_zh.md | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 9f8122e..8841a4b 100644 --- a/README.md +++ b/README.md @@ -199,13 +199,13 @@ models we currently offer, along with their foundational information. Single GPU Memory Usage
- SAT BF16: 66GB
+ SAT BF16: 76GB
diffusers BF16: from 10GB*
diffusers INT8(torchao): from 7GB*
SAT FP16: 18GB
diffusers FP16: 4GB minimum*
diffusers INT8 (torchao): 3.6GB minimum*
SAT BF16: 26GB
diffusers BF16 : 5GB minimum*
diffusers INT8 (torchao): 4.4GB minimum*
Multi-GPU Memory Usage
- Not Supported
+ BF16: 24GB* using diffusers
FP16: 10GB* using diffusers
BF16: 15GB* using diffusers
@@ -243,7 +243,8 @@ models we currently offer, along with their foundational information. Download Link (Diffusers) - Coming Soon + 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel + 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel diff --git a/README_ja.md b/README_ja.md index 9962d1b..b6ad2c6 100644 --- a/README_ja.md +++ b/README_ja.md @@ -191,13 +191,13 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の 単一GPUメモリ消費量
- SAT BF16: 66GB
+ SAT BF16: 76GB
diffusers BF16:10GBから*
diffusers INT8(torchao):7GBから*
SAT FP16: 18GB
diffusers FP16: 4GB以上*
diffusers INT8(torchao): 3.6GB以上*
SAT BF16: 26GB
diffusers BF16 : 5GB以上*
diffusers INT8(torchao): 4.4GB以上*
複数GPU推論メモリ消費量
- 非対応
+ BF16: 24GB* using diffusers
FP16: 10GB* diffusers使用
BF16: 15GB* diffusers使用
@@ -235,7 +235,8 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の ダウンロードリンク (Diffusers) - 近日公開 + 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel + 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel diff --git a/README_zh.md b/README_zh.md index c66fc85..8188db0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -182,13 +182,13 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源 单GPU显存消耗
- SAT BF16: 66GB
+ SAT BF16: 76GB
diffusers BF16 : 10GB起*
diffusers INT8(torchao): 7G起*
SAT FP16: 18GB
diffusers FP16: 4GB起*
diffusers INT8(torchao): 3.6G起*
SAT BF16: 26GB
diffusers BF16 : 5GB起*
diffusers INT8(torchao): 4.4G起*
多GPU推理显存消耗
- 不支持
+ BF16: 24GB* using diffusers
FP16: 10GB* using diffusers
BF16: 15GB* using diffusers
@@ -226,7 +226,8 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源 下载链接 (Diffusers) - 即将推出 + 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel + 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel 🤗 HuggingFace
🤖 ModelScope
🟣 WiseModel From 2475902027a4ed4427e9bcbf9894d5ef4a2c55cb Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Sat, 9 Nov 2024 22:43:02 +0800 Subject: [PATCH 3/9] friendly link --- README.md | 1 + README_ja.md | 1 + README_zh.md | 1 + 3 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 8841a4b..b735003 100644 --- a/README.md +++ b/README.md @@ -320,6 +320,7 @@ works have already been adapted for CogVideoX, and we invite everyone to use the + [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth Studio is a diffusion engine. It has restructured the architecture, including text encoders, UNet, VAE, etc., enhancing computational performance while maintaining compatibility with open-source community models. The framework has been adapted for CogVideoX. ++ [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): A simple ControlNet module code that includes the CogVideoX model. ## Project Structure diff --git a/README_ja.md b/README_ja.md index b6ad2c6..e6af4e9 100644 --- a/README_ja.md +++ b/README_ja.md @@ -300,6 +300,7 @@ pipe.vae.enable_tiling() キーフレーム補間生成において、より大きな柔軟性を提供することを目的とした、CogVideoX構造を基にした修正版のパイプライン。 + [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth Studioは、拡散エンジンです。テキストエンコーダー、UNet、VAEなどを含むアーキテクチャを再構築し、オープンソースコミュニティモデルとの互換性を維持しつつ、計算性能を向上させました。このフレームワークはCogVideoXに適応しています。 ++ [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): CogVideoXモデルを含むシンプルなControlNetモジュールのコード。 ## プロジェクト構造 diff --git a/README_zh.md b/README_zh.md index 8188db0..c4a370e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -281,6 +281,7 @@ pipe.vae.enable_tiling() + [CogVideoX-Interpolation](https://github.com/feizc/CogvideX-Interpolation): 基于 CogVideoX 结构修改的管道,旨在为关键帧插值生成提供更大的灵活性。 + [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): DiffSynth 工作室是一款扩散引擎。重构了架构,包括文本编码器、UNet、VAE 等,在保持与开源社区模型兼容性的同时,提升了计算性能。该框架已经适配 CogVideoX。 ++ [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): 一个包含 CogvideoX 模型的简单 Controlnet 模块的代码。 ## 完整项目代码结构 From 155456befa8a3fd077ca03eaaf7ca19ea64fe133 Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Sat, 9 Nov 2024 22:49:03 +0800 Subject: [PATCH 4/9] update --- README.md | 10 ++++------ README_ja.md | 9 ++++----- README_zh.md | 7 +++---- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index b735003..d20ffab 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,8 @@ Experience the CogVideoX-5B model online at Date: Tue, 12 Nov 2024 00:17:19 +0800 Subject: [PATCH 7/9] add width and height --- inference/cli_demo.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/inference/cli_demo.py b/inference/cli_demo.py index db281bb..bc97dd8 100644 --- a/inference/cli_demo.py +++ b/inference/cli_demo.py @@ -37,6 +37,8 @@ def generate_video( lora_path: str = None, lora_rank: int = 128, num_frames: int = 81, + width: int = 1360, + height: int = 768, output_path: str = "./output.mp4", image_or_video_path: str = "", num_inference_steps: int = 50, @@ -58,6 +60,8 @@ def generate_video( - output_path (str): The path where the generated video will be saved. - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. - num_frames (int): Number of frames to generate. CogVideoX1.0 generates 49 frames for 6 seconds at 8 fps, while CogVideoX1.5 produces either 81 or 161 frames, corresponding to 5 seconds or 10 seconds at 16 fps. 
+ - width (int): The width of the generated video, applicable only for CogVideoX1.5-5B-I2V + - height (int): The height of the generated video, applicable only for CogVideoX1.5-5B-I2V - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. - num_videos_per_prompt (int): Number of videos to generate per prompt. - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). @@ -111,8 +115,11 @@ def generate_video( # This is the default value for 6 seconds video and 8 fps and will plus 1 frame for the first frame and 49 frames. if generate_type == "i2v": video_generate = pipe( + height=height, + width=width, prompt=prompt, - image=image, # The path of the image, the resolution of video will be the same as the image for CogVideoX1.5-5B-I2V, otherwise it will be 720 * 480 + image=image, + # The path of the image, the resolution of video will be the same as the image for CogVideoX1.5-5B-I2V, otherwise it will be 720 * 480 num_videos_per_prompt=num_videos_per_prompt, # Number of videos to generate per prompt num_inference_steps=num_inference_steps, # Number of inference steps num_frames=num_frames, # Number of frames to generate @@ -162,6 +169,8 @@ if __name__ == "__main__": parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps") parser.add_argument("--num_frames", type=int, default=81, help="Number of frames to generate") + parser.add_argument("--width", type=int, default=1360, help="The width of the generated video, applicable only for CogVideoX1.5-5B-I2V") + parser.add_argument("--height", type=int, default=768, help="The height of the generated video, applicable only for CogVideoX1.5-5B-I2V") parser.add_argument("--fps", type=int, default=16, help="Frames per second for the output video") parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation") parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation") @@ -177,6 +186,8 @@ if __name__ == "__main__": lora_rank=args.lora_rank, output_path=args.output_path, num_frames=args.num_frames, + width=args.width, + height=args.height, image_or_video_path=args.image_or_video_path, num_inference_steps=args.num_inference_steps, guidance_scale=args.guidance_scale, From e7a35ea33bf4e03fbbe04b2b63fda7b6fae85eee Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Wed, 13 Nov 2024 17:06:16 +0800 Subject: [PATCH 8/9] update friendly link --- .gitignore | 3 ++- README.md | 3 ++- README_ja.md | 3 ++- README_zh.md | 4 +++- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 9669d70..6be6f4b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ logs/ *.DS_Store .idea output* -test* \ No newline at end of file +test* +venv \ No newline at end of file diff --git a/README.md b/README.md index b36c9e3..67c6a21 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Experience the CogVideoX-5B model online at Inference Precision - BF16 + BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported: INT4 FP16*(Recommended), BF16, FP32, FP8*, INT8, Not supported: INT4 BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported: INT4 diff --git a/README_ja.md b/README_ja.md index 26b02c1..a7aa11b 100644 --- a/README_ja.md +++ b/README_ja.md @@ -186,7 +186,7 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の 推論精度 - BF16 + BF16(推奨), 
FP16, FP32,FP8*,INT8,INT4非対応 FP16*(推奨), BF16, FP32,FP8*,INT8,INT4非対応 BF16(推奨), FP16, FP32,FP8*,INT8,INT4非対応 diff --git a/README_zh.md b/README_zh.md index f456376..704c467 100644 --- a/README_zh.md +++ b/README_zh.md @@ -176,7 +176,7 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源 推理精度 - BF16 + BF16(推荐), FP16, FP32,FP8*,INT8,不支持INT4 FP16*(推荐), BF16, FP32,FP8*,INT8,不支持INT4 BF16(推荐), FP16, FP32,FP8*,INT8,不支持INT4 diff --git a/inference/cli_demo.py b/inference/cli_demo.py index bc97dd8..a211b4b 100644 --- a/inference/cli_demo.py +++ b/inference/cli_demo.py @@ -103,16 +103,13 @@ def generate_video( # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference # and enable to("cuda") - pipe.to("cuda") - - # pipe.enable_sequential_cpu_offload() - + # pipe.to("cuda") + pipe.enable_sequential_cpu_offload() pipe.vae.enable_slicing() pipe.vae.enable_tiling() # 4. Generate the video frames based on the prompt. # `num_frames` is the Number of frames to generate. - # This is the default value for 6 seconds video and 8 fps and will plus 1 frame for the first frame and 49 frames. if generate_type == "i2v": video_generate = pipe( height=height, diff --git a/tools/convert_weight_sat2hf.py b/tools/convert_weight_sat2hf.py index f325018..b70af1a 100644 --- a/tools/convert_weight_sat2hf.py +++ b/tools/convert_weight_sat2hf.py @@ -92,6 +92,8 @@ TRANSFORMER_KEYS_RENAME_DICT = { "post_attn1_layernorm": "norm2.norm", "time_embed.0": "time_embedding.linear_1", "time_embed.2": "time_embedding.linear_2", + "ofs_embed.0": "ofs_embedding.linear_1", + "ofs_embed.2": "ofs_embedding.linear_2", "mixins.patch_embed": "patch_embed", "mixins.final_layer.norm_final": "norm_out.norm", "mixins.final_layer.linear": "proj_out", @@ -146,12 +148,13 @@ def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: def convert_transformer( - ckpt_path: str, - num_layers: int, - num_attention_heads: int, - use_rotary_positional_embeddings: bool, - i2v: bool, - dtype: torch.dtype, + ckpt_path: str, + num_layers: int, + num_attention_heads: int, + use_rotary_positional_embeddings: bool, + i2v: bool, + dtype: torch.dtype, + init_kwargs: Dict[str, Any], ): PREFIX_KEY = "model.diffusion_model." 
@@ -161,11 +164,13 @@ def convert_transformer( num_layers=num_layers, num_attention_heads=num_attention_heads, use_rotary_positional_embeddings=use_rotary_positional_embeddings, - use_learned_positional_embeddings=i2v, + ofs_embed_dim=512 if (i2v and init_kwargs["patch_size_t"] is not None) else None, # CogVideoX1.5-5B-I2V + use_learned_positional_embeddings=i2v and init_kwargs["patch_size_t"] is None, # CogVideoX-5B-I2V + **init_kwargs, ).to(dtype=dtype) for key in list(original_state_dict.keys()): - new_key = key[len(PREFIX_KEY):] + new_key = key[len(PREFIX_KEY) :] for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items(): new_key = new_key.replace(replace_key, rename_key) update_state_dict_inplace(original_state_dict, key, new_key) @@ -175,13 +180,18 @@ def convert_transformer( if special_key not in key: continue handler_fn_inplace(key, original_state_dict) + transformer.load_state_dict(original_state_dict, strict=True) return transformer -def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype): +def convert_vae(ckpt_path: str, scaling_factor: float, version: str, dtype: torch.dtype): + init_kwargs = {"scaling_factor": scaling_factor} + if version == "1.5": + init_kwargs.update({"invert_scale_latents": True}) + original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True)) - vae = AutoencoderKLCogVideoX(scaling_factor=scaling_factor).to(dtype=dtype) + vae = AutoencoderKLCogVideoX(**init_kwargs).to(dtype=dtype) for key in list(original_state_dict.keys()): new_key = key[:] @@ -199,6 +209,34 @@ def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype): return vae +def get_transformer_init_kwargs(version: str): + if version == "1.0": + vae_scale_factor_spatial = 8 + init_kwargs = { + "patch_size": 2, + "patch_size_t": None, + "patch_bias": True, + "sample_height": 480 // vae_scale_factor_spatial, + "sample_width": 720 // vae_scale_factor_spatial, + "sample_frames": 49, + } + + elif version == "1.5": + vae_scale_factor_spatial = 8 + init_kwargs = { + "patch_size": 2, + "patch_size_t": 2, + "patch_bias": False, + "sample_height": 768 // vae_scale_factor_spatial, + "sample_width": 1360 // vae_scale_factor_spatial, + "sample_frames": 81, + } + else: + raise ValueError("Unsupported version of CogVideoX.") + + return init_kwargs + + def get_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -214,6 +252,12 @@ def get_args(): parser.add_argument( "--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory" ) + parser.add_argument( + "--typecast_text_encoder", + action="store_true", + default=False, + help="Whether or not to apply fp16/bf16 precision to text_encoder", + ) # For CogVideoX-2B, num_layers is 30. For 5B, it is 42 parser.add_argument("--num_layers", type=int, default=30, help="Number of transformer blocks") # For CogVideoX-2B, num_attention_heads is 30. For 5B, it is 48 @@ -226,7 +270,18 @@ def get_args(): parser.add_argument("--scaling_factor", type=float, default=1.15258426, help="Scaling factor in the VAE") # For CogVideoX-2B, snr_shift_scale is 3.0. 
For 5B, it is 1.0 parser.add_argument("--snr_shift_scale", type=float, default=3.0, help="Scaling factor in the VAE") - parser.add_argument("--i2v", action="store_true", default=False, help="Whether to save the model weights in fp16") + parser.add_argument( + "--i2v", + action="store_true", + default=False, + help="Whether the model to be converted is the Image-to-Video version of CogVideoX.", + ) + parser.add_argument( + "--version", + choices=["1.0", "1.5"], + default="1.0", + help="Which version of CogVideoX to use for initializing default modeling parameters.", + ) return parser.parse_args() @@ -242,6 +297,7 @@ if __name__ == "__main__": dtype = torch.float16 if args.fp16 else torch.bfloat16 if args.bf16 else torch.float32 if args.transformer_ckpt_path is not None: + init_kwargs = get_transformer_init_kwargs(args.version) transformer = convert_transformer( args.transformer_ckpt_path, args.num_layers, @@ -249,14 +305,19 @@ if __name__ == "__main__": args.use_rotary_positional_embeddings, args.i2v, dtype, + init_kwargs, ) if args.vae_ckpt_path is not None: - vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, dtype) + # Keep VAE in float32 for better quality + vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, args.version, torch.float32) - text_encoder_id = "/share/official_pretrains/hf_home/t5-v1_1-xxl" + text_encoder_id = "google/t5-v1_1-xxl" tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH) text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir) + if args.typecast_text_encoder: + text_encoder = text_encoder.to(dtype=dtype) + # Apparently, the conversion does not work anymore without this :shrug: for param in text_encoder.parameters(): param.data = param.data.contiguous() @@ -288,11 +349,6 @@ if __name__ == "__main__": scheduler=scheduler, ) - if args.fp16: - pipe = pipe.to(dtype=torch.float16) - if args.bf16: - pipe = pipe.to(dtype=torch.bfloat16) - # We don't use variant here because the model must be run in fp16 (2B) or bf16 (5B). It would be weird # for users to specify variant when the default is not fp32 and they want to run with the correct default (which # is either fp16/bf16 here).
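
Taken together, these patches reduce the text-to-video path of the updated `inference/cli_demo.py` to roughly the call sequence below. This is a minimal sketch for illustration only, assuming `diffusers>=0.31.0` from the updated requirements.txt and the `THUDM/CogVideoX1.5-5b` weights; the values mirror the new CLI defaults (81 frames, 16 fps, 50 steps, guidance scale 6.0, seed 42).

```python
import torch
from diffusers import CogVideoXPipeline, CogVideoXDPMScheduler
from diffusers.utils import export_to_video

# Load the text-to-video pipeline in bfloat16, the script's default dtype.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX1.5-5b", torch_dtype=torch.bfloat16)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

# Memory savers used by the patched script: sequential CPU offload plus VAE slicing/tiling.
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()

video = pipe(
    prompt="A girl riding a bike.",
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=81,           # CogVideoX1.5 default; CogVideoX1.0 uses 49
    use_dynamic_cfg=True,    # for the DPM scheduler; set False with DDIM
    guidance_scale=6.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]

export_to_video(video, "./output.mp4", fps=16)  # 16 fps for CogVideoX1.5, 8 fps for CogVideoX1.0
```

For image-to-video, the same flow applies with `CogVideoXImageToVideoPipeline` and the `THUDM/CogVideoX1.5-5b-I2V` weights, passing the input image along with the new `--width`/`--height` values (1360 x 768 by default).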