From a46d762cd9e43e32997078fd23044cf272df92ac Mon Sep 17 00:00:00 2001
From: Zheng Guang Cong
Date: Fri, 6 Dec 2024 20:14:43 +0800
Subject: [PATCH 1/5] fix bugs of image-to-video without image-condition

---
 finetune/train_cogvideox_image_to_video_lora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finetune/train_cogvideox_image_to_video_lora.py b/finetune/train_cogvideox_image_to_video_lora.py
index 188d955..79e6223 100644
--- a/finetune/train_cogvideox_image_to_video_lora.py
+++ b/finetune/train_cogvideox_image_to_video_lora.py
@@ -1283,7 +1283,7 @@ def main(args):
         image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=image.device)
         image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=image.dtype)
-        noisy_image = torch.randn_like(image) * image_noise_sigma[:, None, None, None, None]
+        noisy_image = image + torch.randn_like(image) * image_noise_sigma[:, None, None, None, None]
         image_latent_dist = vae.encode(noisy_image).latent_dist

         return latent_dist, image_latent_dist

From 5a03e6fa793e05327463b06170913d7ac7710ec4 Mon Sep 17 00:00:00 2001
From: Gforky
Date: Sat, 14 Dec 2024 16:12:57 +0800
Subject: [PATCH 2/5] [fix]fix typo in train_cogvideox_image_to_video_lora.py

---
 finetune/train_cogvideox_image_to_video_lora.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finetune/train_cogvideox_image_to_video_lora.py b/finetune/train_cogvideox_image_to_video_lora.py
index 79e6223..abf245f 100644
--- a/finetune/train_cogvideox_image_to_video_lora.py
+++ b/finetune/train_cogvideox_image_to_video_lora.py
@@ -1246,11 +1246,11 @@ def main(args):

     use_deepspeed_optimizer = (
         accelerator.state.deepspeed_plugin is not None
-        and accelerator.state.deepspeed_plugin.deepspeed_config.get("optimizer", "none").lower() == "none"
+        and accelerator.state.deepspeed_plugin.deepspeed_config.get("optimizer", "none").lower() != "none"
     )
     use_deepspeed_scheduler = (
         accelerator.state.deepspeed_plugin is not None
-        and accelerator.state.deepspeed_plugin.deepspeed_config.get("scheduler", "none").lower() == "none"
+        and accelerator.state.deepspeed_plugin.deepspeed_config.get("scheduler", "none").lower() != "none"
     )

     optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer)

From 92a589240f31207fdd87d1f4daffb561359a97ea Mon Sep 17 00:00:00 2001
From: OleehyO
Date: Wed, 18 Dec 2024 07:38:10 +0000
Subject: [PATCH 3/5] [bugfix] fix specific resolution setting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Different models use different resolutions. For example, for the CogVideoX1.5
series models the optimal generation resolution is 1360x768, but for CogVideoX
the best resolution is 720x480.
---
 inference/cli_demo.py | 49 +++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index ea8b4fc..f4dbc28 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -17,9 +17,9 @@ $ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVide
 Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths.
 """

-import warnings
+import logging
 import argparse
-from typing import Literal
+from typing import Literal, Optional

 import torch
 from diffusers import (
@@ -31,6 +31,20 @@ from diffusers import (
 from diffusers.utils import export_to_video, load_image, load_video

+logging.basicConfig(level=logging.INFO)
+
+# Recommended resolution for each model (width, height)
+RESOLUTION_MAP = {
+    # cogvideox1.5-*
+    "cogvideox1.5-5b-i2v": (1360, 768),
+    "cogvideox1.5-5b": (1360, 768),
+
+    # cogvideox-*
+    "cogvideox-5b-i2v": (720, 480),
+    "cogvideox-5b": (720, 480),
+    "cogvideox-2b": (720, 480),
+}
+

 def generate_video(
     prompt: str,
@@ -38,8 +52,8 @@ def generate_video(
     lora_path: str = None,
     lora_rank: int = 128,
     num_frames: int = 81,
-    width: int = 1360,
-    height: int = 768,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
     output_path: str = "./output.mp4",
     image_or_video_path: str = "",
     num_inference_steps: int = 50,
@@ -48,7 +62,7 @@ def generate_video(
     dtype: torch.dtype = torch.bfloat16,
     generate_type: str = Literal["t2v", "i2v", "v2v"],  # i2v: image to video, v2v: video to video
     seed: int = 42,
-    fps: int = 8,
+    fps: int = 16,
 ):
     """
     Generates a video based on the given prompt and saves it to the specified path.
@@ -78,10 +92,19 @@ def generate_video(

     image = None
     video = None

-    if (width != 1360 or height != 768) and "cogvideox1.5-5b-i2v" in model_path.lower():
-        warnings.warn(f"The width({width}) and height({height}) are not recommended for CogVideoX1.5-5B-I2V. The best resolution for CogVideoX1.5-5B-I2V is 1360x768.")
-    elif (width != 720 or height != 480) and "cogvideox-5b-i2v" in model_path.lower():
-        warnings.warn(f"The width({width}) and height({height}) are not recommended for CogVideo-5B-I2V. The best resolution for CogVideo-5B-I2V is 720x480.")
+    model_name = model_path.split("/")[-1].lower()
+    desired_resolution = RESOLUTION_MAP[model_name]
+    if width is None or height is None:
+        width, height = desired_resolution
+        logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m")
+    elif (width, height) != desired_resolution:
+        if generate_type == "i2v":
+            # For i2v models, use user-defined width and height
+            logging.warning(f"\033[1;31mThe width({width}) and height({height}) are not recommended for {model_name}. The best resolution is {desired_resolution}.\033[0m")
+        else:
+            # Otherwise, use the recommended width and height
+            logging.warning(f"\033[1;31m{model_name} is not supported for custom resolution. Setting back to default resolution {desired_resolution}.\033[0m")
+            width, height = desired_resolution

     if generate_type == "i2v":
         pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)
@@ -132,6 +155,8 @@ def generate_video(
         ).frames[0]
     elif generate_type == "t2v":
         video_generate = pipe(
+            height=height,
+            width=width,
             prompt=prompt,
             num_videos_per_prompt=num_videos_per_prompt,
             num_inference_steps=num_inference_steps,
@@ -142,6 +167,8 @@ def generate_video(
         ).frames[0]
     else:
         video_generate = pipe(
+            height=height,
+            width=width,
             prompt=prompt,
             video=video,  # The path of the video to be used as the background of the video
             num_videos_per_prompt=num_videos_per_prompt,
@@ -172,8 +199,8 @@ if __name__ == "__main__":
     parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
     parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps")
     parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
-    parser.add_argument("--width", type=int, default=1360, help="Number of steps for the inference process")
-    parser.add_argument("--height", type=int, default=768, help="Number of steps for the inference process")
+    parser.add_argument("--width", type=int, default=None, help="Number of steps for the inference process")
+    parser.add_argument("--height", type=int, default=None, help="Number of steps for the inference process")
     parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
     parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
     parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation")

From 1811c50e7343060da7779d6e2e4e66d023fbb0c5 Mon Sep 17 00:00:00 2001
From: OleehyO
Date: Wed, 18 Dec 2024 12:30:13 +0000
Subject: [PATCH 4/5] [docs] improve help messages in argument parser

Fix and clarify help documentation in parser.add_argument() to better
describe command-line arguments.
---
 inference/cli_demo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index f4dbc28..b9820c9 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -199,9 +199,9 @@ if __name__ == "__main__":
     parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
     parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps")
     parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
-    parser.add_argument("--width", type=int, default=None, help="Number of steps for the inference process")
-    parser.add_argument("--height", type=int, default=None, help="Number of steps for the inference process")
-    parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
+    parser.add_argument("--width", type=int, default=None, help="The width of the generated video")
+    parser.add_argument("--height", type=int, default=None, help="The height of the generated video")
+    parser.add_argument("--fps", type=int, default=16, help="The frames per second for the generated video")
     parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
     parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation")
     parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation")

From a6e611e3542e87695cf5f77b3209ef5164da4357 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=89=E6=B4=8B=E4=B8=89=E6=B4=8B?= <1258009915@qq.com>
Date: Fri, 27 Dec 2024 19:37:08 +0800
Subject: [PATCH 5/5] docs: update TOC and add friendly link in README files

- Update table of contents in README.md, README_ja.md and README_zh.md
- Add friendly link section to all README files
---
 README.md    | 31 +++++++++++++++++++------------
 README_ja.md | 21 ++++++++++++++-------
 README_zh.md | 23 +++++++++++++++--------
 3 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index a22b290..e58d8d8 100644
--- a/README.md
+++ b/README.md
@@ -61,18 +61,24 @@ The SAT code has already been updated, while the diffusers version is still unde
 Jump to a specific section:

-- [Quick Start](#Quick-Start)
-  - [SAT](#sat)
-  - [Diffusers](#Diffusers)
-- [CogVideoX-2B Video Works](#cogvideox-2b-gallery)
-- [Introduction to the CogVideoX Model](#Model-Introduction)
-- [Full Project Structure](#project-structure)
-  - [Inference](#inference)
-  - [SAT](#sat)
-  - [Tools](#tools)
-- [Introduction to CogVideo(ICLR'23) Model](#cogvideoiclr23)
-- [Citations](#Citation)
-- [Model License](#Model-License)
+- [Quick Start](#quick-start)
+  - [Prompt Optimization](#prompt-optimization)
+  - [SAT](#sat)
+  - [Diffusers](#diffusers)
+- [Gallery](#gallery)
+  - [CogVideoX-5B](#cogvideox-5b)
+  - [CogVideoX-2B](#cogvideox-2b)
+- [Model Introduction](#model-introduction)
+- [Friendly Links](#friendly-links)
+- [Project Structure](#project-structure)
+  - [Quick Start with Colab](#quick-start-with-colab)
+  - [Inference](#inference)
+  - [finetune](#finetune)
+  - [sat](#sat-1)
+  - [Tools](#tools)
+- [CogVideo(ICLR'23)](#cogvideoiclr23)
+- [Citation](#citation)
+- [Model-License](#model-license)

 ## Quick Start

@@ -321,6 +327,7 @@ works have already been adapted for CogVideoX, and we invite everyone to use the
 + [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): A simple ControlNet module code that
   includes the CogVideoX model.
 + [VideoTuna](https://github.com/VideoVerses/VideoTuna): VideoTuna is the first repo that integrates multiple AI video generation models for text-to-video, image-to-video, text-to-image generation.
 + [ConsisID](https://github.com/PKU-YuanGroup/ConsisID): An identity-preserving text-to-video generation model, bases on CogVideoX-5B, which keep the face consistent in the generated video by frequency decomposition.
++ [A Step by Step Tutorial](https://www.youtube.com/watch?v=5UCkMzP2VLE&ab_channel=SECourses): A step-by-step guide on installing and optimizing the CogVideoX1.5-5B-I2V model in Windows and cloud environments. Special thanks to the [FurkanGozukara](https://github.com/FurkanGozukara) for his effort and support!

 ## Project Structure

diff --git a/README_ja.md b/README_ja.md
index c8c2d24..6c2303c 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -62,15 +62,21 @@ SAT バージョンのコードは [こちら](https://huggingface.co/THUDM/CogV
 特定のセクションにジャンプ:

 - [クイックスタート](#クイックスタート)
-  - [SAT](#sat)
-  - [Diffusers](#Diffusers)
-- [CogVideoX-2B ギャラリー](#CogVideoX-2B-ギャラリー)
+  - [プロンプトの最適化](#プロンプトの最適化)
+  - [SAT](#sat)
+  - [Diffusers](#diffusers)
+- [Gallery](#gallery)
+  - [CogVideoX-5B](#cogvideox-5b)
+  - [CogVideoX-2B](#cogvideox-2b)
 - [モデル紹介](#モデル紹介)
+- [友好的リンク](#友好的リンク)
 - [プロジェクト構造](#プロジェクト構造)
-  - [推論](#推論)
-  - [sat](#sat)
-  - [ツール](#ツール)=
-- [CogVideo(ICLR'23)モデル紹介](#CogVideoICLR23)
+  - [Colabでのクイックスタート](#colabでのクイックスタート)
+  - [Inference](#inference)
+  - [finetune](#finetune)
+  - [sat](#sat-1)
+  - [ツール](#ツール)
+- [CogVideo(ICLR'23)](#cogvideoiclr23)
 - [引用](#引用)
 - [ライセンス契約](#ライセンス契約)

@@ -302,6 +308,7 @@ pipe.vae.enable_tiling()
 + [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): CogVideoXモデルを含むシンプルなControlNetモジュールのコード。
 + [VideoTuna](https://github.com/VideoVerses/VideoTuna): VideoTuna は、テキストからビデオ、画像からビデオ、テキストから画像生成のための複数のAIビデオ生成モデルを統合した最初のリポジトリです。
 + [ConsisID](https://github.com/PKU-YuanGroup/ConsisID): 一貫性のある顔を保持するために、周波数分解を使用するCogVideoX-5Bに基づいたアイデンティティ保持型テキストから動画生成モデル。
++ [ステップバイステップチュートリアル](https://www.youtube.com/watch?v=5UCkMzP2VLE&ab_channel=SECourses): WindowsおよびクラウドでのCogVideoX1.5-5B-I2Vモデルのインストールと最適化に関するステップバイステップガイド。[FurkanGozukara](https://github.com/FurkanGozukara)氏の尽力とサポートに感謝いたします!

 ## プロジェクト構造

diff --git a/README_zh.md b/README_zh.md
index b280bd2..393770a 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -51,15 +51,21 @@ CogVideoX1.5-5B 系列模型支持 **10秒** 长度的视频和更高的分辨
 跳转到指定部分:

 - [快速开始](#快速开始)
-  - [SAT](#sat)
-  - [Diffusers](#Diffusers)
-- [CogVideoX-2B 视频作品](#cogvideox-2b-视频作品)
-- [CogVideoX模型介绍](#模型介绍)
+  - [提示词优化](#提示词优化)
+  - [SAT](#sat)
+  - [Diffusers](#diffusers)
+- [视频作品](#视频作品)
+  - [CogVideoX-5B](#cogvideox-5b)
+  - [CogVideoX-2B](#cogvideox-2b)
+- [模型介绍](#模型介绍)
+- [友情链接](#友情链接)
 - [完整项目代码结构](#完整项目代码结构)
-  - [Inference](#inference)
-  - [SAT](#sat)
-  - [Tools](#tools)
-- [CogVideo(ICLR'23)模型介绍](#cogvideoiclr23)
+  - [Colab 快速使用](#colab-快速使用)
+  - [inference](#inference)
+  - [finetune](#finetune)
+  - [sat](#sat-1)
+  - [tools](#tools)
+- [CogVideo(ICLR'23)](#cogvideoiclr23)
 - [引用](#引用)
 - [模型协议](#模型协议)

@@ -282,6 +288,7 @@ pipe.vae.enable_tiling()
 + [CogVideoX-Controlnet](https://github.com/TheDenk/cogvideox-controlnet): 一个包含 CogvideoX 模型的简单 Controlnet 模块的代码。
 + [VideoTuna](https://github.com/VideoVerses/VideoTuna):VideoTuna 是首个集成多种 AI 视频生成模型的仓库,支持文本转视频、图像转视频、文本转图像生成。
 + [ConsisID](https://github.com/PKU-YuanGroup/ConsisID): 一种身份保持的文本到视频生成模型,基于 CogVideoX-5B,通过频率分解在生成的视频中保持面部一致性。
++ [教程](https://www.youtube.com/watch?v=5UCkMzP2VLE&ab_channel=SECourses): 一个关于在Windows和云环境中安装和优化CogVideoX1.5-5B-I2V模型的分步指南。特别感谢[FurkanGozukara](https://github.com/FurkanGozukara)的努力和支持!

 ## 完整项目代码结构
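
For reference, the resolution-defaulting behaviour that PATCH 3/5 adds to inference/cli_demo.py can be exercised in isolation. The sketch below is a minimal standalone version of that logic, assuming the same RESOLUTION_MAP values as the patch; the pick_resolution helper name and the example model paths are illustrative and not part of the patch itself.

```python
import logging
from typing import Optional, Tuple

logging.basicConfig(level=logging.INFO)

# Recommended (width, height) per model, as introduced in PATCH 3/5.
RESOLUTION_MAP = {
    "cogvideox1.5-5b-i2v": (1360, 768),
    "cogvideox1.5-5b": (1360, 768),
    "cogvideox-5b-i2v": (720, 480),
    "cogvideox-5b": (720, 480),
    "cogvideox-2b": (720, 480),
}


def pick_resolution(
    model_path: str,
    width: Optional[int] = None,
    height: Optional[int] = None,
    generate_type: str = "t2v",
) -> Tuple[int, int]:
    """Resolve the output resolution the same way the patched cli_demo.py does."""
    model_name = model_path.split("/")[-1].lower()
    desired = RESOLUTION_MAP[model_name]  # unknown model names raise KeyError, as in the patch
    if width is None or height is None:
        # No explicit resolution given: fall back to the recommended one.
        return desired
    if (width, height) != desired and generate_type != "i2v":
        # Non-i2v generation is forced back to the recommended resolution.
        logging.warning("%s does not support a custom resolution, using %s", model_name, desired)
        return desired
    # i2v keeps the user-supplied resolution (the patch only logs a warning).
    return width, height


if __name__ == "__main__":
    print(pick_resolution("THUDM/CogVideoX1.5-5B"))         # -> (1360, 768)
    print(pick_resolution("THUDM/CogVideoX-2b", 720, 480))  # -> (720, 480)
```

Combined with the --width/--height defaults of None introduced in the same patch, running cli_demo.py without an explicit resolution now selects the recommended size for the chosen model; PATCH 4/5 only updates the corresponding --help text.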