feat(cli_demo): add native multi-GPU support via device_map

Add a --device_map CLI argument to enable multi-GPU inference without
code modifications. This implements the functionality described in the
existing comments at lines 92-93 and 146-150 of cli_demo.py.

Changes:
- Add device_map parameter to generate_video() with options:
  - None (default): Uses sequential CPU offload (backward compatible)
  - 'balanced': Distributes model evenly across GPUs (recommended)
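  - 'auto': Automatic device placement by accelerate
  - 'sequential': Fills GPUs one by one in order
  (Note: diffusers pipelines have historically accepted only 'balanced' as a
  pipeline-level device_map strategy; confirm that 'auto' and 'sequential'
  are supported by the installed diffusers version before relying on them.)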
- Conditionally enable CPU offload only when device_map is None
- Pass device_map to from_pretrained() for all three pipeline types:
  CogVideoXPipeline, CogVideoXImageToVideoPipeline, CogVideoXVideoToVideoPipeline
- Update module docstring with multi-GPU usage examples
- Add validation for device_map values

Backward compatibility:
- Default behavior unchanged (device_map=None uses CPU offload)
- All existing scripts work without modification

Usage:
  # Multi-GPU with balanced distribution (recommended):
  python cli_demo.py --prompt '...' --model_path THUDM/CogVideoX1.5-5b \
    --device_map balanced

  # Multi-GPU with auto placement:
  python cli_demo.py --prompt '...' --model_path THUDM/CogVideoX1.5-5b \
    --device_map auto
This commit is contained in:
Test User 2026-02-19 03:06:36 +00:00
parent 7a1af71545
commit 207020afea

View File

@ -11,11 +11,24 @@ Running the Script:
To run the script, use the following command with appropriate arguments: To run the script, use the following command with appropriate arguments:
```bash ```bash
# Single GPU (default behavior, uses CPU offload):
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" $ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v"
# Multi-GPU with balanced device mapping:
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map balanced
# Multi-GPU with auto device mapping:
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map auto
``` ```
You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory
Multi-GPU Support:
- Use `--device_map balanced` to distribute the model evenly across available GPUs (recommended for inference)
- Use `--device_map auto` for automatic device placement by accelerate
- Use `--device_map sequential` to fill GPUs one at a time, in order (useful when the GPUs have unequal amounts of memory)
- Default behavior (no --device_map) uses CPU offload for single-GPU setups
Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths.
""" """
@ -48,6 +61,9 @@ RESOLUTION_MAP = {
"cogvideox-2b": (480, 720), "cogvideox-2b": (480, 720),
} }
# Valid device_map options for multi-GPU support
VALID_DEVICE_MAPS = {"auto", "balanced", "sequential"}
def generate_video( def generate_video(
prompt: str, prompt: str,
@ -66,6 +82,7 @@ def generate_video(
generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video
seed: int = 42, seed: int = 42,
fps: int = 16, fps: int = 16,
device_map: Optional[str] = None,
): ):
""" """
Generates a video based on the given prompt and saves it to the specified path. Generates a video based on the given prompt and saves it to the specified path.
@ -86,11 +103,23 @@ def generate_video(
- generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v'). - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
- seed (int): The seed for reproducibility. - seed (int): The seed for reproducibility.
- fps (int): The frames per second for the generated video. - fps (int): The frames per second for the generated video.
- device_map (Optional[str]): Device placement strategy for multi-GPU support. Options:
- None (default): Uses sequential CPU offload for single-GPU setups
- "balanced": Distributes model layers evenly across available GPUs (recommended)
- "auto": Automatic device placement by accelerate library
- "sequential": Fills GPUs one by one in order
Multi-GPU Usage Examples:
# Balanced distribution across GPUs (recommended):
generate_video(prompt="...", model_path="...", device_map="balanced")
# Automatic device placement:
generate_video(prompt="...", model_path="...", device_map="auto")
""" """
# 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16).
# add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() # When device_map is specified, the model is distributed across multiple GPUs.
# function to use Multi GPUs. # When device_map is None (default), CPU offload is enabled for single-GPU setups.
image = None image = None
video = None video = None
@ -115,13 +144,25 @@ def generate_video(
) )
height, width = desired_resolution height, width = desired_resolution
# Validate device_map if provided
if device_map is not None and device_map not in VALID_DEVICE_MAPS:
raise ValueError(
f"Invalid device_map '{device_map}'. Must be one of: {', '.join(sorted(VALID_DEVICE_MAPS))} or None"
)
# Build kwargs for from_pretrained - add device_map only when specified
load_kwargs = {"torch_dtype": dtype}
if device_map is not None:
load_kwargs["device_map"] = device_map
logging.info(f"\033[1mUsing device_map='{device_map}' for multi-GPU inference\033[0m")
if generate_type == "i2v": if generate_type == "i2v":
pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, **load_kwargs)
image = load_image(image=image_or_video_path) image = load_image(image=image_or_video_path)
elif generate_type == "t2v": elif generate_type == "t2v":
pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype) pipe = CogVideoXPipeline.from_pretrained(model_path, **load_kwargs)
else: else:
pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, **load_kwargs)
video = load_video(image_or_video_path) video = load_video(image_or_video_path)
# If you're using with lora, add this code # If you're using with lora, add this code
@ -141,13 +182,16 @@ def generate_video(
pipe.scheduler.config, timestep_spacing="trailing" pipe.scheduler.config, timestep_spacing="trailing"
) )
# 3. Enable CPU offload for the model. # 3. Enable CPU offload for the model (only when not using multi-GPU device_map).
# turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference # When device_map is specified, the model is already distributed across GPUs,
# and enable to("cuda") # so CPU offload is not needed and would conflict with device placement.
# pipe.to("cuda") if device_map is None:
# Single-GPU mode: use CPU offload to manage memory
# Turn off if you have multiple GPUs or enough GPU memory (such as H100)
# pipe.enable_model_cpu_offload()
pipe.enable_sequential_cpu_offload()
# pipe.enable_model_cpu_offload() # VAE optimizations work in both single and multi-GPU modes
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing() pipe.vae.enable_slicing()
pipe.vae.enable_tiling() pipe.vae.enable_tiling()
@ -248,6 +292,13 @@ if __name__ == "__main__":
"--dtype", type=str, default="bfloat16", help="The data type for computation" "--dtype", type=str, default="bfloat16", help="The data type for computation"
) )
parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
parser.add_argument(
"--device_map",
type=str,
default=None,
choices=["auto", "balanced", "sequential"],
help="Device placement strategy for multi-GPU inference. Options: 'balanced' (recommended, distributes evenly), 'auto' (automatic placement), 'sequential' (fills GPUs in order). Default: None (uses CPU offload for single-GPU)",
)
args = parser.parse_args() args = parser.parse_args()
dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
@ -268,4 +319,5 @@ if __name__ == "__main__":
generate_type=args.generate_type, generate_type=args.generate_type,
seed=args.seed, seed=args.seed,
fps=args.fps, fps=args.fps,
device_map=args.device_map,
) )