mirror of
https://github.com/THUDM/CogVideo.git
synced 2025-04-05 03:04:56 +08:00
remove streamlit demo since it is outdated
This commit is contained in:
parent
0a558e0964
commit
1b16921b2e
@ -84,8 +84,8 @@ def generate_video(
|
||||
# Using with compile will run faster. First time infer will cost ~30min to compile.
|
||||
# pipe.transformer.to(memory_format=torch.channels_last)
|
||||
|
||||
# for FP8 should remove pipe.enable_sequential_cpu_offload()
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
# for FP8 should remove pipe.enable_model_cpu_offload()
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
# This is not for FP8 and INT8 and should remove this line
|
||||
# pipe.enable_sequential_cpu_offload()
|
||||
|
@ -55,7 +55,7 @@ pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained(
|
||||
pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
|
||||
"THUDM/CogVideoX-5b",
|
||||
transformer=CogVideoXTransformer3DModel.from_pretrained(
|
||||
"THUDM/CogVideoX-5b-I2V", subfolder="transformers", torch_dtype=torch.bfloat16
|
||||
"THUDM/CogVideoX-5b-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
|
||||
),
|
||||
vae=pipe.vae,
|
||||
scheduler=pipe.scheduler,
|
||||
@ -65,10 +65,10 @@ pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
|
||||
).to(device)
|
||||
|
||||
|
||||
pipe.transformer.to(memory_format=torch.channels_last)
|
||||
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
|
||||
pipe_image.transformer.to(memory_format=torch.channels_last)
|
||||
pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)
|
||||
# pipe.transformer.to(memory_format=torch.channels_last)
|
||||
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
|
||||
# pipe_image.transformer.to(memory_format=torch.channels_last)
|
||||
# pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)
|
||||
|
||||
os.makedirs("./output", exist_ok=True)
|
||||
os.makedirs("./gradio_tmp", exist_ok=True)
|
||||
@ -241,7 +241,7 @@ def infer(
|
||||
generator=torch.Generator(device="cpu").manual_seed(seed),
|
||||
).frames
|
||||
elif image_input is not None:
|
||||
image_input = Image.fromarray(image_input) # Change to PIL
|
||||
image_input = Image.fromarray(image_input).resize(size=(720, 480)) # Convert to PIL
|
||||
image = load_image(image_input)
|
||||
video_pt = pipe_image(
|
||||
image=image,
|
||||
|
@ -2,6 +2,10 @@
|
||||
This is the main file for the Gradio web demo. It uses the CogVideoX-5B model to generate videos.
|
||||
set environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.
|
||||
|
||||
This demo only supports the text-to-video generation model.
|
||||
If you wish to use the image-to-video or video-to-video generation models,
|
||||
please use the gradio_composite_demo to implement the full GUI functionality.
|
||||
|
||||
Usage:
|
||||
OpenAI_API_KEY=your_openai_api_key OpenAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py
|
||||
"""
|
||||
@ -18,12 +22,8 @@ from datetime import datetime, timedelta
|
||||
from openai import OpenAI
|
||||
import moviepy.editor as mp
|
||||
|
||||
dtype = torch.bfloat16
|
||||
device = "cuda" # Need to use cuda
|
||||
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16).to("cuda")
|
||||
|
||||
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype).to(device)
|
||||
pipe.enable_model_cpu_offload()
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
pipe.vae.enable_slicing()
|
||||
pipe.vae.enable_tiling()
|
||||
|
||||
@ -47,6 +47,7 @@ Video descriptions must have the same num of words as examples below. Extra word
|
||||
def convert_prompt(prompt: str, retry_times: int = 3) -> str:
|
||||
if not os.environ.get("OPENAI_API_KEY"):
|
||||
return prompt
|
||||
|
||||
client = OpenAI()
|
||||
text = prompt.strip()
|
||||
|
||||
@ -83,7 +84,7 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
|
||||
"content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
|
||||
},
|
||||
],
|
||||
model="glm-4-0520",
|
||||
model="glm-4-plus",
|
||||
temperature=0.01,
|
||||
top_p=0.7,
|
||||
stream=False,
|
||||
@ -145,19 +146,9 @@ threading.Thread(target=delete_old_files, daemon=True).start()
|
||||
with gr.Blocks() as demo:
|
||||
gr.Markdown("""
|
||||
<div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
|
||||
CogVideoX-2B Huggingface Space🤗
|
||||
</div>
|
||||
<div style="text-align: center;">
|
||||
<a href="https://huggingface.co/THUDM/CogVideoX-2B">🤗 2B Model Hub</a> |
|
||||
<a href="https://github.com/THUDM/CogVideo">🌐 Github</a> |
|
||||
<a href="https://arxiv.org/pdf/2408.06072">📜 arxiv </a>
|
||||
</div>
|
||||
CogVideoX Gradio Simple Space🤗
|
||||
""")
|
||||
|
||||
<div style="text-align: center; font-size: 15px; font-weight: bold; color: red; margin-bottom: 20px;">
|
||||
⚠️ This demo is for academic research and experiential use only.
|
||||
Users should strictly adhere to local laws and ethics.
|
||||
</div>
|
||||
""")
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
|
||||
@ -173,7 +164,6 @@ with gr.Blocks() as demo:
|
||||
"**Optional Parameters** (default values are recommended)<br>"
|
||||
"Increasing the number of inference steps will produce more detailed videos, but it will slow down the process.<br>"
|
||||
"50 steps are recommended for most cases.<br>"
|
||||
"For the 5B model, 50 steps will take approximately 350 seconds."
|
||||
)
|
||||
with gr.Row():
|
||||
num_inference_steps = gr.Number(label="Inference Steps", value=50)
|
||||
@ -186,42 +176,6 @@ with gr.Blocks() as demo:
|
||||
download_video_button = gr.File(label="📥 Download Video", visible=False)
|
||||
download_gif_button = gr.File(label="📥 Download GIF", visible=False)
|
||||
|
||||
gr.Markdown("""
|
||||
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
|
||||
<div style="text-align: center; font-size: 24px; font-weight: bold; margin-bottom: 20px;">
|
||||
Demo Videos with 50 Inference Steps and 6.0 Guidance Scale.
|
||||
</div>
|
||||
<tr>
|
||||
<td style="width: 25%; vertical-align: top; font-size: 0.8em;">
|
||||
<p>A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.</p>
|
||||
</td>
|
||||
<td style="width: 25%; vertical-align: top;">
|
||||
<video src="https://github.com/user-attachments/assets/ea3af39a-3160-4999-90ec-2f7863c5b0e9" width="100%" controls autoplay></video>
|
||||
</td>
|
||||
<td style="width: 25%; vertical-align: top; font-size: 0.8em;">
|
||||
<p>The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.</p>
|
||||
</td>
|
||||
<td style="width: 25%; vertical-align: top;">
|
||||
<video src="https://github.com/user-attachments/assets/9de41efd-d4d1-4095-aeda-246dd834e91d" width="100%" controls autoplay></video>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="width: 25%; vertical-align: top; font-size: 0.8em;">
|
||||
<p>A street artist, clad in a worn-out denim jacket and a colorful bandana, stands before a vast concrete wall in the heart, holding a can of spray paint, spray-painting a colorful bird on a mottled wall.</p>
|
||||
</td>
|
||||
<td style="width: 25%; vertical-align: top;">
|
||||
<video src="https://github.com/user-attachments/assets/941d6661-6a8d-4a1b-b912-59606f0b2841" width="100%" controls autoplay></video>
|
||||
</td>
|
||||
<td style="width: 25%; vertical-align: top; font-size: 0.8em;">
|
||||
<p>In the haunting backdrop of a war-torn city, where ruins and crumbled walls tell a story of devastation, a poignant close-up frames a young girl. Her face is smudged with ash, a silent testament to the chaos around her. Her eyes glistening with a mix of sorrow and resilience, capturing the raw emotion of a world that has lost its innocence to the ravages of conflict.</p>
|
||||
</td>
|
||||
<td style="width: 25%; vertical-align: top;">
|
||||
<video src="https://github.com/user-attachments/assets/938529c4-91ae-4f60-b96b-3c3947fa63cb" width="100%" controls autoplay></video>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
""")
|
||||
|
||||
def generate(prompt, num_inference_steps, guidance_scale, model_choice, progress=gr.Progress(track_tqdm=True)):
|
||||
tensor = infer(prompt, num_inference_steps, guidance_scale, progress=progress)
|
||||
video_path = save_video(tensor)
|
||||
|
@ -1,219 +0,0 @@
|
||||
"""
|
||||
This script is used to create a Streamlit web application for generating videos using the CogVideoX model.
|
||||
|
||||
Run the script using Streamlit:
|
||||
$ export OPENAI_API_KEY=your OpenAI Key or ZhipuAI Key
|
||||
$ export OPENAI_BASE_URL=https://open.bigmodel.cn/api/paas/v4/ # using with ZhipuAI, Not using this when using OpenAI
|
||||
$ streamlit run web_demo.py
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
|
||||
import imageio
|
||||
import numpy as np
|
||||
import streamlit as st
|
||||
import torch
|
||||
from convert_demo import convert_prompt
|
||||
from diffusers import CogVideoXPipeline
|
||||
|
||||
|
||||
model_path: str = "THUDM/CogVideoX-2b"
|
||||
|
||||
|
||||
# Load the model at the start
|
||||
@st.cache_resource
def load_model(model_path: str, dtype: torch.dtype, device: str) -> CogVideoXPipeline:
    """
    Create the CogVideoX pipeline; the `st.cache_resource` decorator wraps this loader.

    Args:
    - model_path (str): Identifier or path forwarded to `from_pretrained`.
    - dtype (torch.dtype): Forwarded as `torch_dtype` when loading.
    - device (str): Part of the signature but not read by this function.

    Returns:
    - CogVideoXPipeline: The loaded pipeline with model CPU offload enabled.
    """
    pipeline = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype)
    pipeline.enable_model_cpu_offload()
    return pipeline
|
||||
|
||||
|
||||
# Define a function to generate video based on the provided prompt and model path
|
||||
def generate_video(
    pipe: CogVideoXPipeline,
    prompt: str,
    num_inference_steps: int = 50,
    guidance_scale: float = 6.0,
    num_videos_per_prompt: int = 1,
    device: str = "cuda",
    dtype: torch.dtype = torch.float16,
) -> List[np.ndarray]:
    """
    Encode the prompt, run the pipeline on the embeddings, and return the first video.

    Args:
    - pipe (CogVideoXPipeline): Pipeline used for both prompt encoding and generation.
    - prompt (str): Text prompt to encode.
    - num_inference_steps (int): Forwarded to the pipeline call.
    - guidance_scale (float): Forwarded to the pipeline call.
    - num_videos_per_prompt (int): Forwarded to `encode_prompt`.
    - device (str): Forwarded to `encode_prompt`.
    - dtype (torch.dtype): Forwarded to `encode_prompt`.

    Returns:
    - List[np.ndarray]: Element 0 of the pipeline output's `frames`.
    """
    encode_kwargs = dict(
        prompt=prompt,
        negative_prompt=None,
        do_classifier_free_guidance=True,
        num_videos_per_prompt=num_videos_per_prompt,
        max_sequence_length=226,
        device=device,
        dtype=dtype,
    )
    # Only the first returned embedding is used; the second is discarded and an
    # all-zero tensor of the same shape is passed as the negative embedding below.
    prompt_embeds, _discarded_negative_embeds = pipe.encode_prompt(**encode_kwargs)

    # NOTE(review): both offload modes are requested on every call, in this order.
    # Presumably only one is needed and the later call takes precedence - confirm
    # against the diffusers documentation before simplifying.
    pipe.enable_model_cpu_offload()
    pipe.enable_sequential_cpu_offload()
    pipe.vae.enable_slicing()
    pipe.vae.enable_tiling()

    # Generate video
    result = pipe(
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=torch.zeros_like(prompt_embeds),
    )
    return result.frames[0]
|
||||
|
||||
|
||||
def save_video(video: List[np.ndarray], path: str, fps: int = 8) -> None:
    """
    Save the generated video to a file.

    The first frame is dropped before writing. The writer is always closed,
    even when appending a frame raises.

    Args:
    - video (List[np.ndarray]): Video frames.
    - path (str): Path to save the video.
    - fps (int): Frames per second for the video.
    """
    # Remove the first frame
    frames = video[1:]

    writer = imageio.get_writer(path, fps=fps, codec="libx264")
    try:
        for frame in frames:
            # Each frame is converted with np.array before being appended.
            writer.append_data(np.array(frame))
    finally:
        # Close on every exit path so a failing frame does not leave the
        # writer (and the output file it holds) open.
        writer.close()
|
||||
|
||||
|
||||
def save_metadata(
    prompt: str,
    converted_prompt: str,
    num_inference_steps: int,
    guidance_scale: float,
    num_videos_per_prompt: int,
    path: str,
) -> None:
    """
    Write the generation settings to `path` as JSON indented by 4 spaces.

    Args:
    - prompt (str): Original prompt.
    - converted_prompt (str): Converted prompt.
    - num_inference_steps (int): Number of inference steps.
    - guidance_scale (float): Guidance scale.
    - num_videos_per_prompt (int): Number of videos per prompt.
    - path (str): Destination file; opened in text write mode.
    """
    # Keyword order fixes the key order in the written file.
    record = dict(
        prompt=prompt,
        converted_prompt=converted_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_videos_per_prompt=num_videos_per_prompt,
    )
    with open(path, "w") as handle:
        json.dump(record, handle, indent=4)
|
||||
|
||||
|
||||
def main() -> None:
    """
    Render the Streamlit page: sidebar settings, prompt input, generation, downloads.

    NOTE(review): the nesting of the `with`/`if`/`for` bodies below was
    reconstructed from a view that had lost its indentation - confirm against
    the original file.
    """
    st.set_page_config(page_title="CogVideoX-Demo", page_icon="🎥", layout="wide")
    st.write("# CogVideoX 🎥")
    dtype = torch.float16
    device = "cuda"

    # The pipeline is stored in a module-level name.
    global pipe
    pipe = load_model(model_path, dtype, device)

    with st.sidebar:
        st.info("It will take some time to generate a video (~90 seconds per videos in 50 steps).", icon="ℹ️")
        num_inference_steps = st.number_input("Inference Steps", min_value=1, max_value=100, value=50)
        guidance_scale = st.number_input("Guidance Scale", min_value=0.0, max_value=20.0, value=6.0)
        num_videos_per_prompt = st.number_input("Videos per Prompt", min_value=1, max_value=10, value=1)

        share_links_container = st.empty()

    prompt = st.chat_input("Prompt")

    # Nothing more to do until a prompt is submitted.
    if not prompt:
        return

    # Not Necessary, Suggestions
    with st.spinner("Refining prompts..."):
        converted_prompt = convert_prompt(prompt=prompt, retry_times=1)
        if converted_prompt is None:
            st.error("Failed to Refining the prompt, Using origin one.")

    st.info(f"**Origin prompt:** \n{prompt} \n \n**Convert prompt:** \n{converted_prompt}")
    torch.cuda.empty_cache()

    with st.spinner("Generating Video..."):
        started_at = time.time()
        saved_paths = []

        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"./output/{run_stamp}"
        os.makedirs(output_dir, exist_ok=True)

        save_metadata(
            prompt,
            converted_prompt,
            num_inference_steps,
            guidance_scale,
            num_videos_per_prompt,
            os.path.join(output_dir, "config.json"),
        )

        # Falls back to the raw prompt when refinement returned nothing.
        effective_prompt = converted_prompt or prompt
        for index in range(1, num_videos_per_prompt + 1):
            target_path = os.path.join(output_dir, f"output_{index}.mp4")

            # One video per call; the loop provides the requested count.
            frames = generate_video(pipe, effective_prompt, num_inference_steps, guidance_scale, 1, device, dtype)
            save_video(frames, target_path, fps=8)
            saved_paths.append(target_path)

            with open(target_path, "rb") as video_file:
                preview_bytes = video_file.read()
            st.video(preview_bytes, autoplay=True, loop=True, format="video/mp4")
            torch.cuda.empty_cache()

        used_time = time.time() - started_at
        st.success(f"Videos generated in {used_time:.2f} seconds.")

        # Create download links in the sidebar
        with share_links_container:
            st.sidebar.write("### Download Links:")
            for saved_path in saved_paths:
                video_name = os.path.basename(saved_path)
                with open(saved_path, "rb") as saved_file:
                    raw_bytes = saved_file.read()
                # The whole file is embedded in the link as a base64 data URI.
                b64_video = base64.b64encode(raw_bytes).decode()
                href = f'<a href="data:video/mp4;base64,{b64_video}" download="{video_name}">Download {video_name}</a>'
                st.sidebar.markdown(href, unsafe_allow_html=True)


if __name__ == "__main__":
    main()
|
@ -7,7 +7,6 @@ torchvision==0.19.0
|
||||
sentencepiece==0.2.0
|
||||
SwissArmyTransformer>=0.4.12
|
||||
gradio>=4.44.0
|
||||
streamlit>=1.38.0
|
||||
imageio>=2.35.1
|
||||
imageio-ffmpeg>=0.5.1
|
||||
openai>=1.45.0
|
||||
|
@ -8,16 +8,15 @@ will acknowledge your contribution on the project homepage.
|
||||
|
||||
## Model Algorithms
|
||||
|
||||
- Support for model quantization inference (Int4, Int8, etc. quantization engineering)
|
||||
- Support for multi-card inference / model inference concurrency engineering
|
||||
- Support for non-CUDA architecture inference devices
|
||||
- Support for model quantization inference (Int4 quantization project)
|
||||
- Optimization of model fine-tuning data loading (replacing the existing decord tool)
|
||||
|
||||
## Model Engineering / Secondary Development
|
||||
## Model Engineering
|
||||
|
||||
- Model fine-tuning examples / best prompt practices
|
||||
- Video super-resolution/frame interpolation for enhancing video generation quality.
|
||||
- Any peripheral tools for the model
|
||||
- Any minimal complete open-source projects using the CogVideoX open-source model
|
||||
- Model fine-tuning examples / Best prompt practices
|
||||
- Inference adaptation on different devices (e.g., MLX framework)
|
||||
- Any tools related to the model
|
||||
- Any minimal fully open-source project using the CogVideoX open-source model
|
||||
|
||||
## Code Standards
|
||||
|
||||
|
@ -4,14 +4,13 @@
|
||||
|
||||
## 模型算法
|
||||
|
||||
- 模型量化推理支持 (Int4,Int8等量化工程)
|
||||
- 模型多卡推理支持 / 模型推理并发工程
|
||||
- 非 CUDA 架构 推理设备支持
|
||||
- 模型量化推理支持 (Int4量化工程)
|
||||
- 模型微调数据载入优化支持(替换现有的decord工具)
|
||||
|
||||
## 模型工程 / 模型二次开发
|
||||
## 模型工程
|
||||
|
||||
- 模型微调示例 / 最佳提示词实践
|
||||
- 视频超分/插帧,用于美化视频生成效果。
|
||||
- 不同设备上的推理适配(MLX等框架)
|
||||
- 任何模型周边工具
|
||||
- 任何使用CogVideoX开源模型制作的最小完整开源项目
|
||||
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 597 KiB |
Binary file not shown.
Before Width: | Height: | Size: 4.6 MiB After Width: | Height: | Size: 605 KiB |
Loading…
x
Reference in New Issue
Block a user