From 207020afea1a8c3fe05caa4d196f68bf5037d3c5 Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 19 Feb 2026 03:06:36 +0000 Subject: [PATCH] feat(cli_demo): add native multi-GPU support via device_map Add --device_map CLI argument to enable multi-GPU inference without code modifications. This implements the functionality described in existing comments at lines 92-93 and 146-150. Changes: - Add device_map parameter to generate_video() with options: - None (default): Uses sequential CPU offload (backward compatible) - 'balanced': Distributes model evenly across GPUs (recommended) - 'auto': Automatic device placement by accelerate - 'sequential': Fills GPUs one by one in order - Conditionally enable CPU offload only when device_map is None - Pass device_map to from_pretrained() for all three pipeline types: CogVideoXPipeline, CogVideoXImageToVideoPipeline, CogVideoXVideoToVideoPipeline - Update module docstring with multi-GPU usage examples - Add validation for device_map values Backward compatibility: - Default behavior unchanged (device_map=None uses CPU offload) - All existing scripts work without modification Usage: # Multi-GPU with balanced distribution (recommended): python cli_demo.py --prompt '...' --model_path THUDM/CogVideoX1.5-5b \ --device_map balanced # Multi-GPU with auto placement: python cli_demo.py --prompt '...' --model_path THUDM/CogVideoX1.5-5b \ --device_map auto --- inference/cli_demo.py | 74 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/inference/cli_demo.py b/inference/cli_demo.py index 2e28165..daa1005 100644 --- a/inference/cli_demo.py +++ b/inference/cli_demo.py @@ -11,11 +11,24 @@ Running the Script: To run the script, use the following command with appropriate arguments: ```bash +# Single GPU (default behavior, uses CPU offload): $ python cli_demo.py --prompt "A girl riding a bike." 
--model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" + +# Multi-GPU with balanced device mapping: +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map balanced + +# Multi-GPU with auto device mapping: +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map auto ``` You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory +Multi-GPU Support: +- Use `--device_map balanced` to distribute the model evenly across available GPUs (recommended for inference) +- Use `--device_map auto` for automatic device placement by accelerate +- Use `--device_map sequential` to fill GPUs sequentially (useful for uneven memory) +- Default behavior (no --device_map) uses CPU offload for single-GPU setups + Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. """ @@ -48,6 +61,9 @@ RESOLUTION_MAP = { "cogvideox-2b": (480, 720), } +# Valid device_map options for multi-GPU support +VALID_DEVICE_MAPS = {"auto", "balanced", "sequential"} + def generate_video( prompt: str, @@ -66,6 +82,7 @@ def generate_video( generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video seed: int = 42, fps: int = 16, + device_map: Optional[str] = None, ): """ Generates a video based on the given prompt and saves it to the specified path. @@ -86,11 +103,23 @@ def generate_video( - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').· - seed (int): The seed for reproducibility. - fps (int): The frames per second for the generated video. + - device_map (str): Device placement strategy for multi-GPU support. 
Options: + - None (default): Uses sequential CPU offload for single-GPU setups + - "balanced": Distributes model layers evenly across available GPUs (recommended) + - "auto": Automatic device placement by accelerate library + - "sequential": Fills GPUs one by one in order + + Multi-GPU Usage Examples: + # Balanced distribution across GPUs (recommended): + generate_video(prompt="...", model_path="...", device_map="balanced") + + # Automatic device placement: + generate_video(prompt="...", model_path="...", device_map="auto") """ # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). - # add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() - # function to use Multi GPUs. + # When device_map is specified, the model is distributed across multiple GPUs. + # When device_map is None (default), CPU offload is enabled for single-GPU setups. image = None video = None @@ -115,13 +144,25 @@ def generate_video( ) height, width = desired_resolution + # Validate device_map if provided + if device_map is not None and device_map not in VALID_DEVICE_MAPS: + raise ValueError( + f"Invalid device_map '{device_map}'. 
Must be one of: {', '.join(sorted(VALID_DEVICE_MAPS))} or None" + ) + + # Build kwargs for from_pretrained - add device_map only when specified + load_kwargs = {"torch_dtype": dtype} + if device_map is not None: + load_kwargs["device_map"] = device_map + logging.info(f"\033[1mUsing device_map='{device_map}' for multi-GPU inference\033[0m") + if generate_type == "i2v": - pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) + pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, **load_kwargs) image = load_image(image=image_or_video_path) elif generate_type == "t2v": - pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype) + pipe = CogVideoXPipeline.from_pretrained(model_path, **load_kwargs) else: - pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) + pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, **load_kwargs) video = load_video(image_or_video_path) # If you're using with lora, add this code @@ -141,13 +182,16 @@ def generate_video( pipe.scheduler.config, timestep_spacing="trailing" ) - # 3. Enable CPU offload for the model. - # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference - # and enable to("cuda") - # pipe.to("cuda") + # 3. Enable CPU offload for the model (only when not using multi-GPU device_map). + # When device_map is specified, the model is already distributed across GPUs, + # so CPU offload is not needed and would conflict with device placement. 
+ if device_map is None: + # Single-GPU mode: use CPU offload to manage memory + # Turn off if you have multiple GPUs or enough GPU memory (such as H100) + # pipe.enable_model_cpu_offload() + pipe.enable_sequential_cpu_offload() - # pipe.enable_model_cpu_offload() - pipe.enable_sequential_cpu_offload() + # VAE optimizations work in both single and multi-GPU modes pipe.vae.enable_slicing() pipe.vae.enable_tiling() @@ -248,6 +292,13 @@ if __name__ == "__main__": "--dtype", type=str, default="bfloat16", help="The data type for computation" ) parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + parser.add_argument( + "--device_map", + type=str, + default=None, + choices=["auto", "balanced", "sequential"], + help="Device placement strategy for multi-GPU inference. Options: 'balanced' (recommended, distributes evenly), 'auto' (automatic placement), 'sequential' (fills GPUs in order). Default: None (uses CPU offload for single-GPU)", + ) args = parser.parse_args() dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 @@ -268,4 +319,5 @@ if __name__ == "__main__": generate_type=args.generate_type, seed=args.seed, fps=args.fps, + device_map=args.device_map, )