diff --git a/inference/gradio_composite_demo/app.py b/inference/gradio_composite_demo/app.py
index b180e01..9b164c1 100644
--- a/inference/gradio_composite_demo/app.py
+++ b/inference/gradio_composite_demo/app.py
@@ -53,7 +53,7 @@ pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained(
 ).to(device)
 
 pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b",
+    "THUDM/CogVideoX-5b-I2V",
     transformer=CogVideoXTransformer3DModel.from_pretrained(
         "THUDM/CogVideoX-5b-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
     ),
@@ -65,10 +65,10 @@ pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
 ).to(device)
 
-pipe.transformer.to(memory_format=torch.channels_last)
-pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
-pipe_image.transformer.to(memory_format=torch.channels_last)
-pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)
+# pipe.transformer.to(memory_format=torch.channels_last)
+# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
+# pipe_image.transformer.to(memory_format=torch.channels_last)
+# pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)
 
 os.makedirs("./output", exist_ok=True)
 os.makedirs("./gradio_tmp", exist_ok=True)
@@ -294,7 +294,8 @@ def delete_old_files():
 
 threading.Thread(target=delete_old_files, daemon=True).start()
 
-examples = [["horse.mp4"], ["kitten.mp4"], ["train_running.mp4"]]
+examples_videos = [["example_videos/horse.mp4"], ["example_videos/kitten.mp4"], ["example_videos/train_running.mp4"]]
+examples_images = [["example_images/beach.png"], ["example_images/street.png"], ["example_images/camping.png"]]
 
 with gr.Blocks() as demo:
     gr.Markdown("""
@@ -302,7 +303,8 @@ with gr.Blocks() as demo:
               CogVideoX-5B Huggingface Space🤗
            <br>
-           🤗 5B Model Hub |
+           🤗 5B(T2V) Model Hub |
+           🤗 5B(I2V) Model Hub |
            🌐 Github |
            📜 arxiv
        </div>
@@ -320,10 +322,11 @@ with gr.Blocks() as demo:
         with gr.Column():
             with gr.Accordion("I2V: Image Input (cannot be used simultaneously with video input)", open=False):
                 image_input = gr.Image(label="Input Image (will be cropped to 720 * 480)")
+                examples_component_images = gr.Examples(examples_images, inputs=[image_input], cache_examples=False)
             with gr.Accordion("V2V: Video Input (cannot be used simultaneously with image input)", open=False):
                 video_input = gr.Video(label="Input Video (will be cropped to 49 frames, 6 seconds at 8fps)")
                 strength = gr.Slider(0.1, 1.0, value=0.8, step=0.01, label="Strength")
-                examples_component = gr.Examples(examples, inputs=[video_input], cache_examples=False)
+                examples_component_videos = gr.Examples(examples_videos, inputs=[video_input], cache_examples=False)
             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
 
             with gr.Row():
@@ -338,7 +341,7 @@ with gr.Blocks() as demo:
                     label="Inference Seed (Enter a positive number, -1 for random)", value=-1
                 )
             with gr.Row():
-                enable_scale = gr.Checkbox(label="Super-Resolution (720 × 480 -> 1440 × 960)", value=False)
+                enable_scale = gr.Checkbox(label="Super-Resolution (720 × 480 -> 2880 × 1920)", value=False)
                 enable_rife = gr.Checkbox(label="Frame Interpolation (8fps -> 16fps)", value=False)
             gr.Markdown(
                 "✨In this demo, we use [RIFE](https://github.com/hzwer/ECCV2022-RIFE) for frame interpolation and [Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN) for upscaling (Super-Resolution).<br>
     The entire process is based on open-source solutions."
@@ -356,7 +359,7 @@ with gr.Blocks() as demo:
     gr.Markdown("""
-           🎥 Video Gallery
+           🎥 Video Gallery (For 5B)
        <br>
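Note for local runs: the hunk above disables the `torch.compile` fast path (likely because compilation adds a long warm-up, which is a poor fit for a shared Space). If you run app.py on your own GPU and can afford the warm-up, here is a minimal sketch to restore it, assuming PyTorch >= 2.0 with CUDA (`pipe` and `pipe_image` are the pipeline names used in app.py):

```python
# Minimal sketch: re-enable the compiled transformers locally (assumes PyTorch >= 2.0 on CUDA).
# The first generation is slow while "max-autotune" benchmarks kernels; later calls reuse the cache.
import torch

for p in (pipe, pipe_image):  # pipelines defined earlier in app.py
    p.transformer.to(memory_format=torch.channels_last)
    p.transformer = torch.compile(p.transformer, mode="max-autotune", fullgraph=True)
```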
diff --git a/inference/gradio_composite_demo/example_images/beach.png b/inference/gradio_composite_demo/example_images/beach.png
new file mode 100644
index 0000000..8ea6b60
Binary files /dev/null and b/inference/gradio_composite_demo/example_images/beach.png differ
diff --git a/inference/gradio_composite_demo/example_images/camping.png b/inference/gradio_composite_demo/example_images/camping.png
new file mode 100644
index 0000000..a0b809c
Binary files /dev/null and b/inference/gradio_composite_demo/example_images/camping.png differ
diff --git a/inference/gradio_composite_demo/example_images/street.png b/inference/gradio_composite_demo/example_images/street.png
new file mode 100644
index 0000000..0b95e34
Binary files /dev/null and b/inference/gradio_composite_demo/example_images/street.png differ
diff --git a/inference/gradio_composite_demo/example_videos/horse.mp4 b/inference/gradio_composite_demo/example_videos/horse.mp4
new file mode 100644
index 0000000..6b16dc4
--- /dev/null
+++ b/inference/gradio_composite_demo/example_videos/horse.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c857bbc0d197c0751db9d6da9b5c85eafd163511ff9b0e10be65adf8ef9e352
+size 453387
diff --git a/inference/gradio_composite_demo/example_videos/kitten.mp4 b/inference/gradio_composite_demo/example_videos/kitten.mp4
new file mode 100644
index 0000000..3c1ce90
Binary files /dev/null and b/inference/gradio_composite_demo/example_videos/kitten.mp4 differ
diff --git a/inference/gradio_composite_demo/example_videos/train_running.mp4 b/inference/gradio_composite_demo/example_videos/train_running.mp4
new file mode 100644
index 0000000..b67f2b1
Binary files /dev/null and b/inference/gradio_composite_demo/example_videos/train_running.mp4 differ
diff --git a/inference/gradio_composite_demo/requirements.txt b/inference/gradio_composite_demo/requirements.txt
index 8c64659..927b3d4 100644
--- a/inference/gradio_composite_demo/requirements.txt
+++ b/inference/gradio_composite_demo/requirements.txt
@@ -3,7 +3,7 @@ safetensors>=0.4.5
 spandrel>=0.4.0
 tqdm>=4.66.5
 scikit-video>=1.1.11
-diffusers>=0.30.1
+git+https://github.com/huggingface/diffusers.git@main
 transformers>=4.44.0
 accelerate>=0.34.2
 opencv-python>=4.10.0.84
diff --git a/sat/README_zh.md b/sat/README_zh.md
index c6010e3..a67c70e 100644
--- a/sat/README_zh.md
+++ b/sat/README_zh.md
@@ -18,7 +18,10 @@ pip install -r requirements.txt
 
 ### 2. Download the model weights
 
-First, go to the SAT mirror to download the model weights. For the CogVideoX-2B model, download them as follows:
+First, go to the SAT mirror to download the model weights.
+
+For the CogVideoX-2B model, download them as follows:
+
 ```shell
 mkdir CogVideoX-2b-sat
 cd CogVideoX-2b-sat
@@ -29,28 +32,27 @@
 wget https://cloud.tsinghua.edu.cn/f/556a3e1329e74f1bac45/?dl=1
 mv 'index.html?dl=1' transformer.zip
 unzip transformer.zip
 ```
-For the CogVideoX-5B model, download it as follows (the VAE file is the same):
-```shell
-mkdir CogVideoX-5b-sat
-cd CogVideoX-5b-sat
-wget https://cloud.tsinghua.edu.cn/f/fdba7608a49c463ba754/?dl=1
-mv 'index.html?dl=1' vae.zip
-unzip vae.zip
-```
-Then, go to [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/fcef5b3904294a6885e5/?p=%2F&mode=list) to download our model and unzip it.
-After arranging them, the complete structure of the two models should look like this:
+
+Download the `transformer` files for the CogVideoX-5B models from the links below (the VAE file is the same as for 2B):
+
++ [CogVideoX-5B](https://cloud.tsinghua.edu.cn/d/fcef5b3904294a6885e5/?p=%2F&mode=list)
++ [CogVideoX-5B-I2V](https://cloud.tsinghua.edu.cn/d/5cc62a2d6e7d45c0a2f6/?p=%2F1&mode=list)
+
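+A possible way to assemble the layout shown below by hand (a sketch only: the step of writing `1000` into the `latest` file mirrors the 2B checkpoint's convention, and your download paths will differ):
+
+```shell
+mkdir -p CogVideoX-5b-sat/transformer/1000 CogVideoX-5b-sat/vae
+mv mp_rank_00_model_states.pt CogVideoX-5b-sat/transformer/1000/   # downloaded from the links above
+echo 1000 > CogVideoX-5b-sat/transformer/latest                    # SAT reads the checkpoint dir name from `latest`
+cp CogVideoX-2b-sat/vae/3d-vae.pt CogVideoX-5b-sat/vae/            # the VAE is shared with 2B
+```
+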
+Next, you need to arrange the model files into the following layout:
 ```
 .
 ├── transformer
-│   ├── 1000 (or 1)
-│   │   └── mp_rank_00_model_states.pt
-│   └── latest
+│ ├── 1000 (or 1)
+│ │ └── mp_rank_00_model_states.pt
+│ └── latest
 └── vae
     └── 3d-vae.pt
 ```
-Since the model weight archives are large, we recommend using `git lfs`. For `git lfs` installation, see [here](https://github.com/git-lfs/git-lfs?tab=readme-ov-file#installing)
+Since the model weight archives are large, we recommend using `git lfs`. For `git lfs`
+installation, see [here](https://github.com/git-lfs/git-lfs?tab=readme-ov-file#installing)
+
 ```shell
 git lfs install
 ```
@@ -254,7 +256,7 @@ args:
   sampling_num_frames: 13  # Must be 13, 11 or 9
   sampling_fps: 8
   fp16: True # For CogVideoX-2B
-# bf16: True # For CogVideoX-5B
+  # bf16: True # For CogVideoX-5B
   output_dir: outputs/
   force_inference: True
 ```
@@ -406,28 +408,29 @@ python ../tools/convert_weight_sat2hf.py
 
 ### Export Hugging Face Diffusers LoRA weights from a SAT checkpoint
 
-Exporting from SAT checkpoint files is supported.
+Exporting from SAT checkpoint files is supported.
 After training with the steps above, you get a SAT checkpoint that includes LoRA weights; you can find the file at {args.save}/1000/1000/mp_rank_00_model_states.pt.
 The export script lives at tools/export_sat_lora_weight.py in the CogVideoX repository; after exporting, run inference with load_cogvideox_lora.py.
-- Export command
+
+Export command:
+
 ```
 python tools/export_sat_lora_weight.py --sat_pt_path {args.save}/{experiment_name}-09-09-21-10/1000/mp_rank_00_model_states.pt --lora_save_directory {args.save}/export_hf_lora_weights_1/
-···
+```
 
 This training run mainly modified the model structures listed below; the mapping to the HF-format LoRA layout is shown here. As you can see, LoRA adds low-rank weights on top of the model's attention structure:
 
 ```
-
-    'attention.query_key_value.matrix_A.0': 'attn1.to_q.lora_A.weight',
-    'attention.query_key_value.matrix_A.1': 'attn1.to_k.lora_A.weight',
-    'attention.query_key_value.matrix_A.2': 'attn1.to_v.lora_A.weight',
-    'attention.query_key_value.matrix_B.0': 'attn1.to_q.lora_B.weight',
-    'attention.query_key_value.matrix_B.1': 'attn1.to_k.lora_B.weight',
-    'attention.query_key_value.matrix_B.2': 'attn1.to_v.lora_B.weight',
-    'attention.dense.matrix_A.0': 'attn1.to_out.0.lora_A.weight',
-    'attention.dense.matrix_B.0': 'attn1.to_out.0.lora_B.weight'
+'attention.query_key_value.matrix_A.0': 'attn1.to_q.lora_A.weight',
+'attention.query_key_value.matrix_A.1': 'attn1.to_k.lora_A.weight',
+'attention.query_key_value.matrix_A.2': 'attn1.to_v.lora_A.weight',
+'attention.query_key_value.matrix_B.0': 'attn1.to_q.lora_B.weight',
+'attention.query_key_value.matrix_B.1': 'attn1.to_k.lora_B.weight',
+'attention.query_key_value.matrix_B.2': 'attn1.to_v.lora_B.weight',
+'attention.dense.matrix_A.0': 'attn1.to_out.0.lora_A.weight',
+'attention.dense.matrix_B.0': 'attn1.to_out.0.lora_B.weight'
 ```
-
+
 Convert it to the HF-format LoRA layout with export_sat_lora_weight.py:
 ![alt text](../resources/hf_lora_weights.png)
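To make the key mapping concrete, here is an illustrative sketch of the renaming step (this is not the repository's export_sat_lora_weight.py; the prefix handling is an assumption for illustration only):

```python
# Illustrative only: rename SAT LoRA keys to the HF/diffusers names listed above.
SAT_TO_HF = {
    "attention.query_key_value.matrix_A.0": "attn1.to_q.lora_A.weight",
    "attention.query_key_value.matrix_A.1": "attn1.to_k.lora_A.weight",
    "attention.query_key_value.matrix_A.2": "attn1.to_v.lora_A.weight",
    "attention.query_key_value.matrix_B.0": "attn1.to_q.lora_B.weight",
    "attention.query_key_value.matrix_B.1": "attn1.to_k.lora_B.weight",
    "attention.query_key_value.matrix_B.2": "attn1.to_v.lora_B.weight",
    "attention.dense.matrix_A.0": "attn1.to_out.0.lora_A.weight",
    "attention.dense.matrix_B.0": "attn1.to_out.0.lora_B.weight",
}


def rename_lora_keys(sat_state_dict: dict) -> dict:
    """Map SAT LoRA parameter names onto the HF naming scheme shown above."""
    hf_state_dict = {}
    for key, value in sat_state_dict.items():
        for sat_suffix, hf_suffix in SAT_TO_HF.items():
            if key.endswith(sat_suffix):
                # Keep any layer prefix (e.g. "transformer.layers.0.") and swap only the suffix.
                hf_state_dict[key[: -len(sat_suffix)] + hf_suffix] = value
                break
    return hf_state_dict
```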
diff --git a/tools/llm_flux_cogvideox/generate.sh b/tools/llm_flux_cogvideox/generate.sh
new file mode 100644
index 0000000..ca8fc1b
--- /dev/null
+++ b/tools/llm_flux_cogvideox/generate.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+NUM_VIDEOS=100
+INFERENCE_STEPS=50
+GUIDANCE_SCALE=7.0
+OUTPUT_DIR_PREFIX="outputs/gpu_"
+LOG_DIR_PREFIX="logs/gpu_"
+
+# The log directory must exist before nohup redirects into it.
+mkdir -p "$(dirname "${LOG_DIR_PREFIX}")"
+
+CUDA_DEVICES=${CUDA_VISIBLE_DEVICES:-"0,1,2,3,4,5,6,7"}
+
+IFS=',' read -r -a GPU_ARRAY <<< "$CUDA_DEVICES"
+
+for i in "${!GPU_ARRAY[@]}"
+do
+  GPU=${GPU_ARRAY[$i]}
+  echo "Starting task on GPU $GPU..."
+  CUDA_VISIBLE_DEVICES=$GPU nohup python3 llm_flux_cogvideox.py \
+    --num_videos $NUM_VIDEOS \
+    --image_generator_num_inference_steps $INFERENCE_STEPS \
+    --guidance_scale $GUIDANCE_SCALE \
+    --use_dynamic_cfg \
+    --output_dir ${OUTPUT_DIR_PREFIX}${GPU} \
+    > ${LOG_DIR_PREFIX}${GPU}.log 2>&1 &
+done
\ No newline at end of file
diff --git a/tools/llm_flux_cogvideox/llm_flux_cogvideox.py b/tools/llm_flux_cogvideox/llm_flux_cogvideox.py
new file mode 100644
index 0000000..92c1a38
--- /dev/null
+++ b/tools/llm_flux_cogvideox/llm_flux_cogvideox.py
@@ -0,0 +1,256 @@
+"""
+The original experimental code for this project can be found at:
+
+https://gist.github.com/a-r-r-o-w/d070cce059ab4ceab3a9f289ff83c69c
+
+With this code, description prompts are generated by a local large language model, images are generated
+using the black-forest-labs/FLUX.1-dev model, and videos are then generated via CogVideoX.
+The entire process relies on open-source solutions, without the need for any API keys.
+
+You can use the generate.sh file in the same folder to automate running this code
+for batch generation of videos and images:
+
+bash generate.sh
+
+"""
+
+import argparse
+import gc
+import json
+import os
+import pathlib
+import random
+
+from transformers import AutoTokenizer
+
+# Set torch logging/debug env vars before torch is imported below.
+os.environ["TORCH_LOGS"] = "+dynamo,recompiles,graph_breaks"
+os.environ["TORCHDYNAMO_VERBOSE"] = "1"
+
+import numpy as np
+import torch
+import transformers
+from diffusers import CogVideoXImageToVideoPipeline, CogVideoXDPMScheduler, DiffusionPipeline
+from diffusers.utils.logging import get_logger
+from diffusers.utils import export_to_video
+
+torch.set_float32_matmul_precision("high")
+
+logger = get_logger(__name__)
+
+SYSTEM_PROMPT = """
+You are part of a team of people that create videos using generative models. You use a video-generation model that can generate a video about anything you describe.
+
+For example, if you respond with "A beautiful morning in the woods with the sun peeking through the trees", the video generation model will create a video exactly as described. Your task is to summarize the descriptions of videos provided by users, and create detailed prompts to feed into the generative model.
+
+There are a few rules to follow:
+- You will only ever output a single video description per request.
+- If the user asks to summarize the prompt in [X] words, make sure to not exceed the limit.
+
+Your responses should just be the video generation prompt. Here are examples:
+- "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting."
+- "A street artist, clad in a worn-out denim jacket and a colorful bandana, stands before a vast concrete wall in the heart of the city, holding a can of spray paint, spray-painting a colorful bird on a mottled wall"
+""".strip()
+
+USER_PROMPT = """
+Could you generate a prompt for a video generation model?
+Please limit the prompt to [{0}] words.
+""".strip()
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--num_videos",
+        type=int,
+        default=5,
+        help="Number of unique videos you would like to generate.",
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="THUDM/CogVideoX-5b-I2V",
+        help="Path of the image-to-video CogVideoX-5B model.",
+    )
+    parser.add_argument(
+        "--caption_generator_model_id",
+        type=str,
+        default="THUDM/glm-4-9b-chat",
+        help="Caption generation model (default: GLM-4-9B).",
+    )
+    parser.add_argument(
+        "--caption_generator_cache_dir",
+        type=str,
+        default=None,
+        help="Cache directory for the caption generation model.",
+    )
+    parser.add_argument(
+        "--image_generator_model_id",
+        type=str,
+        default="black-forest-labs/FLUX.1-dev",
+        help="Image generation model.",
+    )
+    parser.add_argument(
+        "--image_generator_cache_dir",
+        type=str,
+        default=None,
+        help="Cache directory for the image generation model.",
+    )
+    parser.add_argument(
+        "--image_generator_num_inference_steps",
+        type=int,
+        default=50,
+        help="Number of inference steps for the image generation model.",
+    )
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=7,
+        help="Guidance scale to be used for generation.",
+    )
+    parser.add_argument(
+        "--use_dynamic_cfg",
+        action="store_true",
+        help="Whether or not to use cosine dynamic guidance for generation [Recommended].",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="outputs/",
+        help="Location where generated images and videos should be stored.",
+    )
+    parser.add_argument(
+        "--compile",
+        action="store_true",
+        help="Whether or not to compile the transformer of image and video generators.",
+    )
+    parser.add_argument(
+        "--enable_vae_tiling",
+        action="store_true",
+        help="Whether or not to use VAE tiling when encoding/decoding.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Seed for reproducibility.",
+    )
+    return parser.parse_args()
+
+
+def reset_memory():
+    # Free Python and CUDA caches between pipeline stages so each model fits on the GPU.
+    gc.collect()
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+    torch.cuda.reset_accumulated_memory_stats()
+
+
+@torch.no_grad()
+def main(args: argparse.Namespace) -> None:
+    output_dir = pathlib.Path(args.output_dir)
+    os.makedirs(output_dir.as_posix(), exist_ok=True)
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    # Stage 1: generate video captions with a local LLM.
+    reset_memory()
+    tokenizer = AutoTokenizer.from_pretrained(args.caption_generator_model_id, trust_remote_code=True)
+    caption_generator = transformers.pipeline(
+        "text-generation",
+        model=args.caption_generator_model_id,
+        device_map="auto",
+        model_kwargs={
+            "local_files_only": True,
+            "cache_dir": args.caption_generator_cache_dir,
+            "torch_dtype": torch.bfloat16,
+        },
+        trust_remote_code=True,
+        tokenizer=tokenizer,
+    )
+
+    captions = []
+    for i in range(args.num_videos):
+        num_words = random.choice([100, 150, 200])
+        user_prompt = USER_PROMPT.format(num_words)
+
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": user_prompt},
+        ]
+
+        # 226 tokens matches CogVideoX's maximum prompt length.
+        outputs = caption_generator(messages, max_new_tokens=226)
+        caption = outputs[0]["generated_text"][-1]["content"]
+        if caption.startswith("\"") and caption.endswith("\""):
+            caption = caption[1:-1]
+        captions.append(caption)
+        logger.info(f"Generated caption: {caption}")
+
+    with open(output_dir / "captions.json", "w") as file:
+        json.dump(captions, file)
+
+    del caption_generator
+    reset_memory()
+
+    # Stage 2: turn each caption into a first frame with FLUX.
+    image_generator = DiffusionPipeline.from_pretrained(
+        args.image_generator_model_id,
+        cache_dir=args.image_generator_cache_dir,
+        torch_dtype=torch.bfloat16,
+    )
+    image_generator.to("cuda")
+
+    if args.compile:
+        image_generator.transformer = torch.compile(image_generator.transformer, mode="max-autotune", fullgraph=True)
+
+    if args.enable_vae_tiling:
+        image_generator.vae.enable_tiling()
+
+    images = []
+    for index, caption in enumerate(captions):
+        image = image_generator(
+            prompt=caption,
+            height=480,
+            width=720,
+            num_inference_steps=args.image_generator_num_inference_steps,
+            guidance_scale=3.5,
+        ).images[0]
+        filename = caption[:25].replace(".", "_").replace("'", "_").replace('"', "_").replace(",", "_")
+        image.save(output_dir / f"{index}_{filename}.png")
+        images.append(image)
+
+    del image_generator
+    reset_memory()
+
+    # Stage 3: animate each image into a 49-frame video with CogVideoX.
+    video_generator = CogVideoXImageToVideoPipeline.from_pretrained(
+        args.model_path, torch_dtype=torch.bfloat16).to("cuda")
+    video_generator.scheduler = CogVideoXDPMScheduler.from_config(
+        video_generator.scheduler.config,
+        timestep_spacing="trailing")
+
+    if args.compile:
+        video_generator.transformer = torch.compile(video_generator.transformer, mode="max-autotune", fullgraph=True)
+
+    if args.enable_vae_tiling:
+        video_generator.vae.enable_tiling()
+
+    generator = torch.Generator().manual_seed(args.seed)
+    for index, (caption, image) in enumerate(zip(captions, images)):
+        video = video_generator(
+            image=image,
+            prompt=caption,
+            height=480,
+            width=720,
+            num_frames=49,
+            num_inference_steps=50,
+            guidance_scale=args.guidance_scale,
+            use_dynamic_cfg=args.use_dynamic_cfg,
+            generator=generator,
+        ).frames[0]
+        filename = caption[:25].replace(".", "_").replace("'", "_").replace('"', "_").replace(",", "_")
+        export_to_video(video, output_dir / f"{index}_{filename}.mp4", fps=8)
+
+
+if __name__ == "__main__":
+    args = get_args()
+    main(args)
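For a quick single-GPU smoke test (instead of the multi-GPU generate.sh launcher), the script can also be invoked directly. The flags below simply mirror the argparse definitions in get_args(); the values are illustrative. Note that the caption pipeline is created with local_files_only=True, so THUDM/glm-4-9b-chat must already be in your local Hugging Face cache:

```shell
# Illustrative single-GPU run; all flags come from get_args() above.
CUDA_VISIBLE_DEVICES=0 python3 tools/llm_flux_cogvideox/llm_flux_cogvideox.py \
    --num_videos 2 \
    --image_generator_num_inference_steps 30 \
    --guidance_scale 7.0 \
    --use_dynamic_cfg \
    --enable_vae_tiling \
    --output_dir outputs/smoke_test
```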