diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index 1910a78..f89e63a 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -21,26 +21,28 @@
 import argparse
 from typing import Literal
 
 import torch
-from diffusers import (CogVideoXPipeline,
-                       CogVideoXDDIMScheduler,
-                       CogVideoXDPMScheduler,
-                       CogVideoXImageToVideoPipeline,
-                       CogVideoXVideoToVideoPipeline)
+from diffusers import (
+    CogVideoXPipeline,
+    CogVideoXDDIMScheduler,
+    CogVideoXDPMScheduler,
+    CogVideoXImageToVideoPipeline,
+    CogVideoXVideoToVideoPipeline,
+)
 from diffusers.utils import export_to_video, load_image, load_video
 
 
 def generate_video(
-        prompt: str,
-        model_path: str,
-        output_path: str = "./output.mp4",
-        image_or_video_path: str = "",
-        num_inference_steps: int = 50,
-        guidance_scale: float = 6.0,
-        num_videos_per_prompt: int = 1,
-        dtype: torch.dtype = torch.bfloat16,
-        generate_type: str = Literal["t2v", "i2v", "v2v"],  # i2v: image to video, v2v: video to video
-        seed: int = 42,
+    prompt: str,
+    model_path: str,
+    output_path: str = "./output.mp4",
+    image_or_video_path: str = "",
+    num_inference_steps: int = 50,
+    guidance_scale: float = 6.0,
+    num_videos_per_prompt: int = 1,
+    dtype: torch.dtype = torch.bfloat16,
+    generate_type: str = Literal["t2v", "i2v", "v2v"],  # i2v: image to video, v2v: video to video
+    seed: int = 42,
 ):
     """
     Generates a video based on the given prompt and saves it to the specified path.
@@ -53,7 +55,7 @@ def generate_video(
     - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt.
     - num_videos_per_prompt (int): Number of videos to generate per prompt.
    - dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
-    - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
+    - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
     - seed (int): The seed for reproducibility.
     """
 
@@ -97,13 +99,13 @@ def generate_video(
             image=image,  # The path of the image to be used as the background of the video
             num_videos_per_prompt=num_videos_per_prompt,  # Number of videos to generate per prompt
             num_inference_steps=num_inference_steps,  # Number of inference steps
-            num_frames=49,  # Number of frames to generate,changed to 49 for diffusers version `0.31.0` and after.
+            num_frames=49,  # Number of frames to generate,changed to 49 for diffusers version `0.30.3` and after.
             use_dynamic_cfg=True,  ## This id used for DPM Sechduler, for DDIM scheduler, it should be False
             guidance_scale=guidance_scale,
             generator=torch.Generator().manual_seed(seed),  # Set the seed for reproducibility
         ).frames[0]
     elif generate_type == "t2v":
-        video_generate = pipe(
+        video_generate = pipe(
             prompt=prompt,
             num_videos_per_prompt=num_videos_per_prompt,
             num_inference_steps=num_inference_steps,
@@ -130,19 +132,29 @@ def generate_video(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX")
     parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated")
-    parser.add_argument("--image_or_video_path", type=str, default=None,
-                        help="The path of the image to be used as the background of the video")
-    parser.add_argument("--model_path", type=str, default="THUDM/CogVideoX-5b",
-                        help="The path of the pre-trained model to be used")
-    parser.add_argument("--output_path", type=str, default="./output.mp4",
-                        help="The path where the generated video will be saved")
+    parser.add_argument(
+        "--image_or_video_path",
+        type=str,
+        default=None,
+        help="The path of the image to be used as the background of the video",
+    )
+    parser.add_argument(
+        "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
+    )
+    parser.add_argument(
+        "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
+    )
     parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
-    parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of steps for the inference process")
+    parser.add_argument(
+        "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
+    )
     parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
-    parser.add_argument("--generate_type", type=str, default="t2v",
-                        help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')")
-    parser.add_argument("--dtype", type=str, default="bfloat16",
-                        help="The data type for computation (e.g., 'float16' or 'bfloat16')")
+    parser.add_argument(
+        "--generate_type", type=str, default="t2v", help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')"
+    )
+    parser.add_argument(
+        "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16' or 'bfloat16')"
+    )
     parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
 
     args = parser.parse_args()
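Note on the cli_demo.py hunks above: the argparse defaults feed straight into `generate_video`, which dispatches to one of the three pipelines. A minimal text-to-video sketch of that path, using only arguments visible in this diff (the prompt, the offload call, and the fps value are illustrative assumptions, not part of the patch):

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # assumed here to keep VRAM use modest; not shown in this diff

video = pipe(
    prompt="A placeholder prompt",     # --prompt is required by the CLI
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,                     # 49 frames for diffusers `0.30.3` and later, per the comment above
    guidance_scale=6.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(video, "./output.mp4", fps=8)  # fps is an assumption; the export call is outside this diff
```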
diff --git a/inference/gradio_composite_demo/rife_model.py b/inference/gradio_composite_demo/rife_model.py
index 455cfff..0a69ca6 100644
--- a/inference/gradio_composite_demo/rife_model.py
+++ b/inference/gradio_composite_demo/rife_model.py
@@ -8,7 +8,7 @@ import numpy as np
 import logging
 import skvideo.io
 from rife.RIFE_HDv3 import Model
-from huggingface_hub import hf_hub_download, snapshot_download
+
 logger = logging.getLogger(__name__)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -19,9 +19,8 @@ def pad_image(img, scale):
     tmp = max(32, int(32 / scale))
     ph = ((h - 1) // tmp + 1) * tmp
     pw = ((w - 1) // tmp + 1) * tmp
-    padding = (0, pw - w, 0, ph - h)
-
-    return F.pad(img, padding), padding
+    padding = (0, 0, pw - w, ph - h)
+    return F.pad(img, padding)
 
 
 def make_inference(model, I0, I1, upscale_amount, n):
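Side note on the `pad_image` hunk above: `F.pad` reads a 4-element tuple as `(w_left, w_right, h_top, h_bottom)`, pairing entries with the last dimensions first. A standalone check with made-up shapes:

```python
import torch
import torch.nn.functional as F

x = torch.zeros(1, 3, 10, 20)          # hypothetical NCHW frame: h=10, w=20
print(F.pad(x, (0, 4, 0, 6)).shape)    # width 20 -> 24, height 10 -> 16
print(F.pad(x, (0, 0, 4, 6)).shape)    # width unchanged, height 10 -> 20
```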
@@ -45,15 +44,9 @@ def ssim_interpolation_rife(model, samples, exp=1, upscale_amount=1, output_devi
     for b in range(samples.shape[0]):
         frame = samples[b : b + 1]
         _, _, h, w = frame.shape
-
         I0 = samples[b : b + 1]
         I1 = samples[b + 1 : b + 2] if b + 2 < samples.shape[0] else samples[-1:]
-
-        I0, padding = pad_image(I0, upscale_amount)
-        I0 = I0.to(torch.float)
-        I1, _ = pad_image(I1, upscale_amount)
-        I1 = I1.to(torch.float)
-
+        I1 = pad_image(I1, upscale_amount)
         # [c, h, w]
         I0_small = F.interpolate(I0, (32, 32), mode="bilinear", align_corners=False)
         I1_small = F.interpolate(I1, (32, 32), mode="bilinear", align_corners=False)
@@ -61,24 +54,14 @@ def ssim_interpolation_rife(model, samples, exp=1, upscale_amount=1, output_devi
         ssim = ssim_matlab(I0_small[:, :3], I1_small[:, :3])
 
         if ssim > 0.996:
-            I1 = samples[b : b + 1]
-            # print(f'upscale_amount:{upscale_amount}')
-            # print(f'ssim:{upscale_amount}')
-            # print(f'I0 shape:{I0.shape}')
-            # print(f'I1 shape:{I1.shape}')
-            I1, padding = pad_image(I1, upscale_amount)
-            # print(f'I0 shape:{I0.shape}')
-            # print(f'I1 shape:{I1.shape}')
+            I1 = I0
+            I1 = pad_image(I1, upscale_amount)
             I1 = make_inference(model, I0, I1, upscale_amount, 1)
-
-            # print(f'I0 shape:{I0.shape}')
-            # print(f'I1[0] shape:{I1[0].shape}')
-            I1 = I1[0]
-
-            # print(f'I1[0] unpadded shape:{I1.shape}')
-            I1_small = F.interpolate(I1, (32, 32), mode="bilinear", align_corners=False)
+
+            I1_small = F.interpolate(I1[0], (32, 32), mode="bilinear", align_corners=False)
             ssim = ssim_matlab(I0_small[:, :3], I1_small[:, :3])
-            frame = I1[padding[0]:, padding[2]:, padding[3]:,padding[1]:]
+            frame = I1[0]
+            I1 = I1[0]
 
         tmp_output = []
         if ssim < 0.2:
@@ -88,13 +71,9 @@ def ssim_interpolation_rife(model, samples, exp=1, upscale_amount=1, output_devi
         else:
             tmp_output = make_inference(model, I0, I1, upscale_amount, 2**exp - 1) if exp else []
 
-        frame, _ = pad_image(frame, upscale_amount)
-        print(f'frame shape:{frame.shape}')
-        print(f'tmp_output[0] shape:{tmp_output[0].shape}')
+        frame = pad_image(frame, upscale_amount)
         tmp_output = [frame] + tmp_output
-
-        for i, frame in enumerate(tmp_output):
-            frame = F.interpolate(frame, size=(h, w))
+        for i, frame in enumerate(tmp_output):
             output.append(frame.to(output_device))
     return output
 
@@ -117,26 +96,14 @@ def frame_generator(video_capture):
 
 
 def rife_inference_with_path(model, video_path):
-    # Open the video file
     video_capture = cv2.VideoCapture(video_path)
-    fps = video_capture.get(cv2.CAP_PROP_FPS)  # Get the frames per second
-    tot_frame = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))  # Total frames in the video
+    tot_frame = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
     pt_frame_data = []
     pt_frame = skvideo.io.vreader(video_path)
-    # Cyclic reading of the video frames
-    while video_capture.isOpened():
-        ret, frame = video_capture.read()
-
-        if not ret:
-            break
-
-        # BGR to RGB
-        frame_rgb = frame[..., ::-1]
-        frame_rgb = frame_rgb.copy()
-        tensor = torch.from_numpy(frame_rgb).float().to("cpu", non_blocking=True).float() / 255.0
+    for frame in pt_frame:
         pt_frame_data.append(
-            tensor.permute(2, 0, 1)
-        )  # to [c, h, w,]
+            torch.from_numpy(np.transpose(frame, (2, 0, 1))).to("cpu", non_blocking=True).float() / 255.0
+        )
 
     pt_frame = torch.from_numpy(np.stack(pt_frame_data))
     pt_frame = pt_frame.to(device)
@@ -164,11 +131,3 @@ def rife_inference_with_latents(model, latents):
         rife_results.append(pt_image)
 
     return torch.stack(rife_results)
-
-
-if __name__ == "__main__":
-    snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")
-    model = load_rife_model("model_rife")
-
-    video_path = rife_inference_with_path(model, "/mnt/ceph/develop/jiawei/CogVideo/sat/configs/outputs/1_In_the_heart_of_a_bustling_city,_a_young_woman_with_long,_flowing_brown_hair_and_a_radiant_smile_stands_out._She's_donne/0/000000.mp4")
-    print(video_path)
\ No newline at end of file
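Because the ad-hoc `__main__` test block is removed above, downloading the RIFE weights and driving the module is now left to the caller. A minimal sketch modeled on the removed block (the input path is a placeholder and it assumes `rife_model.py` is on the import path):

```python
from huggingface_hub import snapshot_download  # no longer imported by rife_model.py itself
from rife_model import load_rife_model, rife_inference_with_path

snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")  # fetch the RIFE weights
model = load_rife_model("model_rife")
result = rife_inference_with_path(model, "input.mp4")  # placeholder input video
print(result)
```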
"/mnt/ceph/develop/jiawei/CogVideo/sat/configs/outputs/1_In_the_heart_of_a_bustling_city,_a_young_woman_with_long,_flowing_brown_hair_and_a_radiant_smile_stands_out._She's_donne/0/000000.mp4") - print(video_path) \ No newline at end of file diff --git a/inference/gradio_web_demo.py b/inference/gradio_web_demo.py index 9c5520e..a00d1b8 100644 --- a/inference/gradio_web_demo.py +++ b/inference/gradio_web_demo.py @@ -19,8 +19,9 @@ from openai import OpenAI import moviepy.editor as mp dtype = torch.bfloat16 +device = "cuda" # Need to use cuda -pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype) +pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype).to(device) pipe.enable_model_cpu_offload() pipe.enable_sequential_cpu_offload() pipe.vae.enable_slicing() diff --git a/requirements.txt b/requirements.txt index 4fe6ddc..492fee6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -diffusers>=0.30.1 #git+https://github.com/huggingface/diffusers.git@main#egg=diffusers is suggested -transformers>=4.44.2 # The development team is working on version 4.44.2 -accelerate>=0.33.0 #git+https://github.com/huggingface/accelerate.git@main#egg=accelerate is suggested -sentencepiece>=0.2.0 # T5 used -SwissArmyTransformer>=0.4.12 +diffusers>=0.30.3 +accelerate>=0.34.2 +transformers>=4.44.2 numpy==1.26.0 -torch>=2.4.0 # Tested in 2.2 2.3 2.4 and 2.5, The development team is working on version 2.4.0. -torchvision>=0.19.0 # The development team is working on version 0.19.0. -gradio>=4.42.0 # For HF gradio demo -streamlit>=1.38.0 # For streamlit web demo -imageio==2.34.2 # For diffusers inference export video -imageio-ffmpeg==0.5.1 # For diffusers inference export video -openai>=1.42.0 # For prompt refiner -moviepy==1.0.3 # For export video +torch==2.4.0 +torchvision==0.19.0 +sentencepiece==0.2.0 +SwissArmyTransformer>=0.4.12 +gradio>=4.44.0 +streamlit>=1.38.0 +imageio>=2.35.1 +imageio-ffmpeg>=0.5.1 +openai>=1.45.0 +moviepy==1.0.3 pillow==9.5.0 \ No newline at end of file diff --git a/sat/diffusion_video.py b/sat/diffusion_video.py index 8329e9d..963038b 100644 --- a/sat/diffusion_video.py +++ b/sat/diffusion_video.py @@ -323,7 +323,6 @@ class SATVideoDiffusionEngine(nn.Module): if isinstance(c[k], torch.Tensor): c[k], uc[k] = map(lambda y: y[k][:N].to(self.device), (c, uc)) - if self.noised_image_input: image = x[:, :, 0:1] image = self.add_noise_to_first_frame(image) diff --git a/sat/finetune_multi_gpus.sh b/sat/finetune_multi_gpus.sh index a9a8ad2..a33f7cc 100644 --- a/sat/finetune_multi_gpus.sh +++ b/sat/finetune_multi_gpus.sh @@ -1,8 +1,8 @@ #! /bin/bash -echo "RUN on $(hostname), CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" +echo "RUN on $(hostname), CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_5b_i2v_lora.yaml configs/sft.yaml --seed $RANDOM" +run_cmd="PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --standalone --nproc_per_node=8 train_video.py --base configs/test_cogvideox_5b_i2v_lora.yaml configs/test_sft.yaml --seed $RANDOM" echo ${run_cmd} eval ${run_cmd}