diff --git a/README.md b/README.md
index 5f562cc..a37458e 100644
--- a/README.md
+++ b/README.md
@@ -300,6 +300,8 @@ of the **CogVideoX** open-source model.
 cd inference
 # For Linux and Windows users
 python gradio_web_demo.py
+# or run 5b with pipe.enable_sequential_cpu_offload() and pipe.vae.enable_tiling()
+python3 gradio_web_demo.py --very-low-vram-5b
 
 # For macOS with Apple Silicon users, Intel not supported, this maybe 20x slower than RTX 4090
 PYTORCH_ENABLE_MPS_FALLBACK=1 python gradio_web_demo.py
diff --git a/inference/gradio_web_demo.py b/inference/gradio_web_demo.py
index 8204a8f..a9953b2 100644
--- a/inference/gradio_web_demo.py
+++ b/inference/gradio_web_demo.py
@@ -17,10 +17,18 @@ from diffusers.utils import export_to_video
 from datetime import datetime, timedelta
 from openai import OpenAI
 import moviepy.editor as mp
+import sys
+
 
 dtype = torch.float16
-device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=dtype).to(device)
+
+if '--very-low-vram-5b' in sys.argv:
+    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype)
+    pipe.enable_sequential_cpu_offload()
+    pipe.vae.enable_tiling()
+else:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=dtype).to(device)
 
 os.makedirs("./output", exist_ok=True)
 os.makedirs("./gradio_tmp", exist_ok=True)
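
For context (not part of the patch), the low-VRAM path introduced above can be exercised outside Gradio with a minimal standalone sketch. The prompt and sampling parameters below are illustrative, not values required by this change; the two diffusers memory helpers, `enable_sequential_cpu_offload()` and `vae.enable_tiling()`, trade generation speed for a much lower peak VRAM footprint.

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Load the 5B model without moving it to the GPU up front.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.float16)

# Stream weights to the GPU one submodule at a time (slow, but peak VRAM stays low)
# and decode the video latents in tiles instead of all at once.
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_tiling()

# Illustrative prompt and sampling settings.
video = pipe(
    prompt="A panda playing a guitar in a bamboo forest",
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
).frames[0]

export_to_video(video, "output.mp4", fps=8)
```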