diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index 9c59b00..41f4267 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -14,7 +14,10 @@ To run the script, use the following command with appropriate arguments:
 $ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v"
 ```
 
+You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory
+
 Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths.
+
 """
 
 import argparse
@@ -22,6 +25,7 @@ import logging
 from typing import Literal, Optional
 
 import torch
+
 from diffusers import (
     CogVideoXDPMScheduler,
     CogVideoXImageToVideoPipeline,
@@ -121,7 +125,7 @@ def generate_video(
     # If you're using with lora, add this code
     if lora_path:
         pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
-        pipe.fuse_lora(components=["transformer"], lora_scale=1/lora_rank)
+        pipe.fuse_lora(components=["transformer"], lora_scale=1 / lora_rank)
 
     # 2. Set Scheduler.
     # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`.
@@ -134,8 +138,9 @@ def generate_video(
 
     # 3. Enable CPU offload for the model.
     # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference
     # and enable to("cuda")
-    # pipe.to("cuda")
+
+    # pipe.enable_model_cpu_offload()
     pipe.enable_sequential_cpu_offload()
     pipe.vae.enable_slicing()
     pipe.vae.enable_tiling()
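
The docstring note added in this diff describes a speed/memory trade-off between the two diffusers offload modes. Below is a minimal sketch (not part of the diff) of how one might toggle between them; the `low_vram` flag and the bfloat16 dtype are illustrative assumptions, while the pipeline class, offload, slicing, and tiling calls are the same diffusers APIs used in `cli_demo.py`.

```python
# Minimal sketch, assuming a text-to-video CogVideoXPipeline as in cli_demo.py.
# The `low_vram` flag and bfloat16 dtype are illustrative, not part of this diff.
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX1.5-5b", torch_dtype=torch.bfloat16
)

low_vram = True  # hypothetical switch for this sketch
if low_vram:
    # Streams weights to the GPU layer by layer: lowest GPU memory, slowest inference.
    pipe.enable_sequential_cpu_offload()
else:
    # Moves whole sub-models (text encoder, transformer, VAE) on and off the GPU:
    # faster than sequential offload, but uses more GPU memory.
    pipe.enable_model_cpu_offload()

# VAE slicing/tiling further reduce peak memory during decoding, as in the script.
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()
```

The diff keeps sequential offload enabled by default and leaves model offload as the commented-out alternative for machines with more VRAM.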