From b410841bcf074144864d9565d894159348bf4a4a Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Mon, 16 Sep 2024 23:01:37 +0800 Subject: [PATCH] llm-flux-cogvideox-i2v-tools --- inference/gradio_composite_demo/app.py | 23 +- .../example_images/beach.png | Bin 0 -> 385407 bytes .../example_images/camping.png | Bin 0 -> 484057 bytes .../example_images/street.png | Bin 0 -> 477803 bytes .../example_videos/horse.mp4 | 3 + .../example_videos/kitten.mp4 | Bin 0 -> 881636 bytes .../example_videos/train_running.mp4 | Bin 0 -> 576715 bytes .../gradio_composite_demo/requirements.txt | 2 +- sat/README_zh.md | 61 +++-- tools/llm_flux_cogvideox/generate.sh | 24 ++ .../llm_flux_cogvideox/llm_flux_cogvideox.py | 256 ++++++++++++++++++ 11 files changed, 329 insertions(+), 40 deletions(-) create mode 100644 inference/gradio_composite_demo/example_images/beach.png create mode 100644 inference/gradio_composite_demo/example_images/camping.png create mode 100644 inference/gradio_composite_demo/example_images/street.png create mode 100644 inference/gradio_composite_demo/example_videos/horse.mp4 create mode 100644 inference/gradio_composite_demo/example_videos/kitten.mp4 create mode 100644 inference/gradio_composite_demo/example_videos/train_running.mp4 create mode 100644 tools/llm_flux_cogvideox/generate.sh create mode 100644 tools/llm_flux_cogvideox/llm_flux_cogvideox.py diff --git a/inference/gradio_composite_demo/app.py b/inference/gradio_composite_demo/app.py index b180e01..9b164c1 100644 --- a/inference/gradio_composite_demo/app.py +++ b/inference/gradio_composite_demo/app.py @@ -53,7 +53,7 @@ pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained( ).to(device) pipe_image = CogVideoXImageToVideoPipeline.from_pretrained( - "THUDM/CogVideoX-5b", + "THUDM/CogVideoX-5b-I2V", transformer=CogVideoXTransformer3DModel.from_pretrained( "THUDM/CogVideoX-5b-I2V", subfolder="transformer", torch_dtype=torch.bfloat16 ), @@ -65,10 +65,10 @@ pipe_image = CogVideoXImageToVideoPipeline.from_pretrained( ).to(device) -pipe.transformer.to(memory_format=torch.channels_last) -pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True) -pipe_image.transformer.to(memory_format=torch.channels_last) -pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True) +# pipe.transformer.to(memory_format=torch.channels_last) +# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True) +# pipe_image.transformer.to(memory_format=torch.channels_last) +# pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True) os.makedirs("./output", exist_ok=True) os.makedirs("./gradio_tmp", exist_ok=True) @@ -294,7 +294,8 @@ def delete_old_files(): threading.Thread(target=delete_old_files, daemon=True).start() -examples = [["horse.mp4"], ["kitten.mp4"], ["train_running.mp4"]] +examples_videos = [["example_videos/horse.mp4"], ["example_videos/kitten.mp4"], ["example_videos/train_running.mp4"]] +examples_images = [["example_images/beach.png"], ["example_images/street.png"], ["example_images/camping.png"]] with gr.Blocks() as demo: gr.Markdown(""" @@ -302,7 +303,8 @@ with gr.Blocks() as demo: CogVideoX-5B Huggingface Space🤗
@@ -320,10 +322,11 @@ with gr.Blocks() as demo:
         with gr.Column():
             with gr.Accordion("I2V: Image Input (cannot be used simultaneously with video input)", open=False):
                 image_input = gr.Image(label="Input Image (will be cropped to 720 * 480)")
+                examples_component_images = gr.Examples(examples_images, inputs=[image_input], cache_examples=False)
             with gr.Accordion("V2V: Video Input (cannot be used simultaneously with image input)", open=False):
                 video_input = gr.Video(label="Input Video (will be cropped to 49 frames, 6 seconds at 8fps)")
                 strength = gr.Slider(0.1, 1.0, value=0.8, step=0.01, label="Strength")
-            examples_component = gr.Examples(examples, inputs=[video_input], cache_examples=False)
+            examples_component_videos = gr.Examples(examples_videos, inputs=[video_input], cache_examples=False)
             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)

             with gr.Row():
@@ -338,7 +341,7 @@ with gr.Blocks() as demo:
                     label="Inference Seed (Enter a positive number, -1 for random)", value=-1
                 )
             with gr.Row():
-                enable_scale = gr.Checkbox(label="Super-Resolution (720 × 480 -> 1440 × 960)", value=False)
+                enable_scale = gr.Checkbox(label="Super-Resolution (720 × 480 -> 2880 × 1920)", value=False)
                 enable_rife = gr.Checkbox(label="Frame Interpolation (8fps -> 16fps)", value=False)
             gr.Markdown(
                 "✨In this demo, we use [RIFE](https://github.com/hzwer/ECCV2022-RIFE) for frame interpolation and [Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN) for upscaling (Super-Resolution).<br>
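Note on the compile fast path disabled above: `channels_last` plus `torch.compile(mode="max-autotune", fullgraph=True)` trades a long first-call warm-up for faster steady-state denoising, which suits local inference better than a Hugging Face Space that must boot quickly. A minimal sketch of re-enabling it locally (assumes a CUDA device and a recent diffusers release; only the I2V pipeline is shown):

```python
import torch
from diffusers import CogVideoXImageToVideoPipeline

# Same pipeline the demo builds as `pipe_image`.
pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
).to("cuda")

# The optimization the patch comments out: channels_last memory layout plus
# a fully graphed, autotuned compile of the transformer. The first generation
# call is slow while kernels compile; later calls run noticeably faster.
pipe_image.transformer.to(memory_format=torch.channels_last)
pipe_image.transformer = torch.compile(
    pipe_image.transformer, mode="max-autotune", fullgraph=True
)
```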
diff --git a/inference/gradio_composite_demo/example_images/beach.png b/inference/gradio_composite_demo/example_images/beach.png
new file mode 100644
index 0000000000000000000000000000000000000000..8ea6b60ac52df227ccdbe689c64848a36bd95fae
GIT binary patch
literal 385407
[base85-encoded PNG payload truncated in this excerpt; omitted]
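The diff for tools/llm_flux_cogvideox/llm_flux_cogvideox.py is not part of this excerpt. Going only by the commit title and file names, the tool chains three stages: an LLM expands a short idea into a detailed prompt, FLUX renders a first frame, and CogVideoX-5b-I2V animates it. A rough sketch of that chain under those assumptions follows; the LLM choice, prompt wording, and sampling parameters are illustrative, not the actual script:

```python
import torch
from diffusers import CogVideoXImageToVideoPipeline, FluxPipeline
from diffusers.utils import export_to_video
from transformers import pipeline

idea = "a corgi surfing a small wave at sunset"

# 1) Prompt expansion. Any instruction-tuned LLM works; this model choice
#    is a placeholder, not necessarily the one the tool uses.
llm = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
prompt = llm(
    f"Expand this idea into one vivid, detailed video description: {idea}",
    max_new_tokens=120,
    return_full_text=False,
)[0]["generated_text"].strip()
del llm
torch.cuda.empty_cache()  # free VRAM before loading the next model

# 2) First frame with FLUX at CogVideoX's native 720 x 480 resolution.
flux = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")
image = flux(prompt, height=480, width=720, num_inference_steps=28).images[0]
del flux
torch.cuda.empty_cache()

# 3) Image-to-video: 49 frames is about 6 seconds at 8 fps.
i2v = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
).to("cuda")
video = i2v(
    prompt=prompt, image=image,
    num_frames=49, num_inference_steps=50, guidance_scale=6.0,
).frames[0]
export_to_video(video, "output.mp4", fps=8)
```

Loading the three models one at a time and freeing each before the next keeps peak VRAM near the footprint of the largest single model.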