From 207020afea1a8c3fe05caa4d196f68bf5037d3c5 Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 19 Feb 2026 03:06:36 +0000 Subject: [PATCH] feat(cli_demo): add native multi-GPU support via device_map Add --device_map CLI argument to enable multi-GPU inference without code modifications. This implements the functionality described in existing comments at lines 92-93 and 146-150. Changes: - Add device_map parameter to generate_video() with options: - None (default): Uses sequential CPU offload (backward compatible) - 'balanced': Distributes model evenly across GPUs (recommended) - 'auto': Automatic device placement by accelerate - 'sequential': Fills GPUs one by one in order - Conditionally enable CPU offload only when device_map is None - Pass device_map to from_pretrained() for all three pipeline types: CogVideoXPipeline, CogVideoXImageToVideoPipeline, CogVideoXVideoToVideoPipeline - Update module docstring with multi-GPU usage examples - Add validation for device_map values Backward compatibility: - Default behavior unchanged (device_map=None uses CPU offload) - All existing scripts work without modification Usage: # Multi-GPU with balanced distribution (recommended): python cli_demo.py --prompt '...' --model_path THUDM/CogVideoX1.5-5b \ --device_map balanced # Multi-GPU with auto placement: python cli_demo.py --prompt '...' --model_path THUDM/CogVideoX1.5-5b \ --device_map auto --- inference/cli_demo.py | 74 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/inference/cli_demo.py b/inference/cli_demo.py index 2e28165..daa1005 100644 --- a/inference/cli_demo.py +++ b/inference/cli_demo.py @@ -11,11 +11,24 @@ Running the Script: To run the script, use the following command with appropriate arguments: ```bash +# Single GPU (default behavior, uses CPU offload): $ python cli_demo.py --prompt "A girl riding a bike." 
--model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" + +# Multi-GPU with balanced device mapping: +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map balanced + +# Multi-GPU with auto device mapping: +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --device_map auto ``` You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory +Multi-GPU Support: +- Use `--device_map balanced` to distribute the model evenly across available GPUs (recommended for inference) +- Use `--device_map auto` for automatic device placement by accelerate +- Use `--device_map sequential` to fill GPUs sequentially (useful for uneven memory) +- Default behavior (no --device_map) uses CPU offload for single-GPU setups + Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. """ @@ -48,6 +61,9 @@ RESOLUTION_MAP = { "cogvideox-2b": (480, 720), } +# Valid device_map options for multi-GPU support +VALID_DEVICE_MAPS = {"auto", "balanced", "sequential"} + def generate_video( prompt: str, @@ -66,6 +82,7 @@ def generate_video( generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video seed: int = 42, fps: int = 16, + device_map: Optional[str] = None, ): """ Generates a video based on the given prompt and saves it to the specified path. @@ -86,11 +103,23 @@ def generate_video( - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').· - seed (int): The seed for reproducibility. - fps (int): The frames per second for the generated video. + - device_map (str): Device placement strategy for multi-GPU support. 
Options: + - None (default): Uses sequential CPU offload for single-GPU setups + - "balanced": Distributes model layers evenly across available GPUs (recommended) + - "auto": Automatic device placement by accelerate library + - "sequential": Fills GPUs one by one in order + + Multi-GPU Usage Examples: + # Balanced distribution across GPUs (recommended): + generate_video(prompt="...", model_path="...", device_map="balanced") + + # Automatic device placement: + generate_video(prompt="...", model_path="...", device_map="auto") """ # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). - # add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() - # function to use Multi GPUs. + # When device_map is specified, the model is distributed across multiple GPUs. + # When device_map is None (default), CPU offload is enabled for single-GPU setups. image = None video = None @@ -115,13 +144,25 @@ def generate_video( ) height, width = desired_resolution + # Validate device_map if provided + if device_map is not None and device_map not in VALID_DEVICE_MAPS: + raise ValueError( + f"Invalid device_map '{device_map}'. 
Must be one of: {', '.join(sorted(VALID_DEVICE_MAPS))} or None" + ) + + # Build kwargs for from_pretrained - add device_map only when specified + load_kwargs = {"torch_dtype": dtype} + if device_map is not None: + load_kwargs["device_map"] = device_map + logging.info(f"\033[1mUsing device_map='{device_map}' for multi-GPU inference\033[0m") + if generate_type == "i2v": - pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) + pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, **load_kwargs) image = load_image(image=image_or_video_path) elif generate_type == "t2v": - pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype) + pipe = CogVideoXPipeline.from_pretrained(model_path, **load_kwargs) else: - pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) + pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, **load_kwargs) video = load_video(image_or_video_path) # If you're using with lora, add this code @@ -141,13 +182,16 @@ def generate_video( pipe.scheduler.config, timestep_spacing="trailing" ) - # 3. Enable CPU offload for the model. - # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference - # and enable to("cuda") - # pipe.to("cuda") + # 3. Enable CPU offload for the model (only when not using multi-GPU device_map). + # When device_map is specified, the model is already distributed across GPUs, + # so CPU offload is not needed and would conflict with device placement. 
+ if device_map is None: + # Single-GPU mode: use CPU offload to manage memory + # Turn off if you have multiple GPUs or enough GPU memory (such as H100) + # pipe.enable_model_cpu_offload() + pipe.enable_sequential_cpu_offload() - # pipe.enable_model_cpu_offload() - pipe.enable_sequential_cpu_offload() + # VAE optimizations work in both single and multi-GPU modes pipe.vae.enable_slicing() pipe.vae.enable_tiling() @@ -248,6 +292,13 @@ if __name__ == "__main__": "--dtype", type=str, default="bfloat16", help="The data type for computation" ) parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + parser.add_argument( + "--device_map", + type=str, + default=None, + choices=["auto", "balanced", "sequential"], + help="Device placement strategy for multi-GPU inference. Options: 'balanced' (recommended, distributes evenly), 'auto' (automatic placement), 'sequential' (fills GPUs in order). Default: None (uses CPU offload for single-GPU)", + ) args = parser.parse_args() dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 @@ -268,4 +319,5 @@ if __name__ == "__main__": generate_type=args.generate_type, seed=args.seed, fps=args.fps, + device_map=args.device_map, )