Mirror of https://github.com/THUDM/CogVideo.git

fix convert
Commit 487a815219, parent 8e8275d2e8
@@ -11,10 +11,30 @@ Run the script:
 """
 
 import argparse
+import tempfile
+from typing import Union, List
 
+import PIL
+import imageio
+import numpy as np
 import torch
 from diffusers import CogVideoXPipeline
-from diffusers.utils import export_to_video
 
 
+def export_to_video_imageio(
+    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], output_video_path: str = None, fps: int = 8
+) -> str:
+    """
+    Export the video frames to a video file using the imageio library, to avoid the "green screen" issue (for example with CogVideoX)
+    """
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+    if isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+    with imageio.get_writer(output_video_path, fps=fps) as writer:
+        for frame in video_frames:
+            writer.append_data(frame)
+    return output_video_path
+
+
 def generate_video(
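For reference, a minimal usage sketch of the helper added above, assuming export_to_video_imageio is in scope (e.g. imported from the demo script); the frame data and output filename are placeholder values, not part of the commit:

    import numpy as np

    # Placeholder frames: 49 black 480x720 RGB images. A real call would pass
    # the list returned by pipe(...).frames[0] (PIL images or numpy arrays).
    frames = [np.zeros((480, 720, 3), dtype=np.uint8) for _ in range(49)]

    # fps stays at 8, matching the "fps must be 8" comment in the hunk below.
    out_path = export_to_video_imageio(frames, "output.mp4", fps=8)
    print(out_path)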
@@ -43,7 +63,7 @@ def generate_video(
 
     # Load the pre-trained CogVideoX pipeline with the specified precision (float16) and move it to the specified device
     pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)
-    pipe.enable_sequential_cpu_offload()  # Enable sequential CPU offload for faster inference
+
     # Encode the prompt to get the prompt embeddings
     prompt_embeds, _ = pipe.encode_prompt(
         prompt=prompt,  # The textual description for video generation
@@ -64,7 +84,7 @@ def generate_video(
     ).frames[0]
 
     # Export the generated frames to a video file. fps must be 8
-    export_to_video(video, output_path, fps=8)
+    export_to_video_imageio(video, output_path, fps=8)
 
 
 if __name__ == "__main__":
@@ -36,16 +36,18 @@ def vae_demo(model_path, video_path, dtype, device):
     model = AutoencoderKLCogVideoX.from_pretrained(model_path, torch_dtype=dtype).to(device)
 
     # Load video frames
-    video_reader = imageio.get_reader(video_path, 'ffmpeg')
+    video_reader = imageio.get_reader(video_path, "ffmpeg")
     frames = []
     for frame in video_reader:
         frames.append(frame)
     video_reader.close()
 
     # Transform frames to Tensor
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-    ])
+    transform = transforms.Compose(
+        [
+            transforms.ToTensor(),
+        ]
+    )
     frames_tensor = torch.stack([transform(frame) for frame in frames]).to(device)
 
     # Add batch dimension and reshape to [1, 3, 49, 480, 720]
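As a side note, a small sketch of the reshape that the "[1, 3, 49, 480, 720]" comment above refers to, assuming 49 RGB frames at 480x720; the tensor here is synthetic and the permute/unsqueeze calls are an illustration, not lines from this commit:

    import torch

    # Stand-in for the stacked ToTensor output: [num_frames, channels, height, width]
    frames_tensor = torch.rand(49, 3, 480, 720)

    # Move channels ahead of the frame axis, then add a batch dimension:
    # [49, 3, 480, 720] -> [3, 49, 480, 720] -> [1, 3, 49, 480, 720]
    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
    print(frames_tensor.shape)  # torch.Size([1, 3, 49, 480, 720])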
@@ -84,9 +86,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Convert a CogVideoX model to Diffusers")
     parser.add_argument("--model_path", type=str, required=True, help="The path to the CogVideoX model")
     parser.add_argument("--video_path", type=str, required=True, help="The path to the video file")
-    parser.add_argument(
-        "--output_path", type=str, default="./", help="The path to save the output video"
-    )
+    parser.add_argument("--output_path", type=str, default="./", help="The path to save the output video")
     parser.add_argument(
         "--dtype", type=str, default="float16", help="The data type for computation (e.g., 'float16' or 'float32')"
     )
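The --dtype flag is parsed as a string; a minimal sketch, under the assumption that the script maps it to a torch dtype before calling from_pretrained (the mapping below is illustrative, not shown in this hunk):

    import torch

    # Translate the --dtype string (e.g. args.dtype) into a torch dtype.
    dtype_str = "float16"
    dtype = torch.float16 if dtype_str == "float16" else torch.float32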
@@ -6,5 +6,5 @@ opencv-python>=4.10
 imageio-ffmpeg>=0.5.1
 openai>=1.38.0
 transformers>=4.43.3
-accelerate>=0.33.0
+sentencepiece>=0.2.0
 pillow==9.5.0