Mirror of https://github.com/THUDM/CogVideo.git, synced 2025-12-03 03:02:09 +08:00
Add an option to run gradio web demo with very low vram
commit e70d90b0e6
parent 3b4fa1d907
README.md

@@ -300,6 +300,8 @@ of the **CogVideoX** open-source model.
 cd inference
 # For Linux and Windows users
 python gradio_web_demo.py
+# or run the 5B model with pipe.enable_sequential_cpu_offload() and pipe.vae.enable_tiling()
+python3 inference/gradio_web_demo.py --very-low-vram-5b
 
 # For macOS with Apple Silicon; Intel is not supported, and this may be ~20x slower than an RTX 4090
 PYTORCH_ENABLE_MPS_FALLBACK=1 python gradio_web_demo.py
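For readers unfamiliar with the two diffusers calls named in the new README line: enable_sequential_cpu_offload() keeps the whole pipeline in CPU memory and streams one submodule at a time to the GPU during inference, and vae.enable_tiling() decodes the video latents in tiles rather than in one pass; both trade speed for a much lower peak VRAM. A minimal standalone sketch of the low-VRAM path (the prompt and output filename are illustrative; the float16 dtype follows the demo script):

import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Load the 5B weights without .to("cuda"): sequential offload manages
# device placement itself, so the pipeline must stay on the CPU here.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.float16)
pipe.enable_sequential_cpu_offload()  # stream submodules to the GPU one at a time
pipe.vae.enable_tiling()              # decode latents tile by tile to cap peak VRAM

# Illustrative generation call; expect it to be much slower than a fully on-GPU run.
video = pipe(prompt="a panda playing a guitar in a bamboo forest").frames[0]
export_to_video(video, "output.mp4", fps=8)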
inference/gradio_web_demo.py

@@ -17,10 +17,18 @@ from diffusers.utils import export_to_video
 from datetime import datetime, timedelta
 from openai import OpenAI
 import moviepy.editor as mp
+import sys
+
 
 dtype = torch.float16
-device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=dtype).to(device)
+
+if '--very-low-vram-5b' in sys.argv:
+    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype)
+    pipe.enable_sequential_cpu_offload()
+    pipe.vae.enable_tiling()
+else:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=dtype).to(device)
 
 os.makedirs("./output", exist_ok=True)
 os.makedirs("./gradio_tmp", exist_ok=True)
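A reviewer might note two things about the new branch: device is never assigned when --very-low-vram-5b is passed, so if any later code in gradio_web_demo.py still references device it would raise a NameError, and scanning sys.argv directly bypasses normal argument handling. A hedged sketch of an alternative that defines device on both paths and parses the flag with argparse (the flag name mirrors the commit; the argparse wiring is my assumption, not the commit's code):

import argparse
import torch
from diffusers import CogVideoXPipeline

parser = argparse.ArgumentParser()
parser.add_argument("--very-low-vram-5b", action="store_true",
                    help="run CogVideoX-5b with sequential CPU offload and VAE tiling")
args, _ = parser.parse_known_args()  # tolerate any other CLI args the demo takes

dtype = torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"  # defined on both paths

if args.very_low_vram_5b:
    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype)
    pipe.enable_sequential_cpu_offload()  # no .to(device): offload handles placement
    pipe.vae.enable_tiling()
else:
    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=dtype).to(device)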