diff --git a/inference/cli_demo.py b/inference/cli_demo.py
index 1910a78..f89e63a 100644
--- a/inference/cli_demo.py
+++ b/inference/cli_demo.py
@@ -21,26 +21,28 @@
 import argparse
 from typing import Literal
 
 import torch
-from diffusers import (CogVideoXPipeline,
-                       CogVideoXDDIMScheduler,
-                       CogVideoXDPMScheduler,
-                       CogVideoXImageToVideoPipeline,
-                       CogVideoXVideoToVideoPipeline)
+from diffusers import (
+    CogVideoXPipeline,
+    CogVideoXDDIMScheduler,
+    CogVideoXDPMScheduler,
+    CogVideoXImageToVideoPipeline,
+    CogVideoXVideoToVideoPipeline,
+)
 from diffusers.utils import export_to_video, load_image, load_video
 
 
 def generate_video(
-        prompt: str,
-        model_path: str,
-        output_path: str = "./output.mp4",
-        image_or_video_path: str = "",
-        num_inference_steps: int = 50,
-        guidance_scale: float = 6.0,
-        num_videos_per_prompt: int = 1,
-        dtype: torch.dtype = torch.bfloat16,
-        generate_type: str = Literal["t2v", "i2v", "v2v"],  # i2v: image to video, v2v: video to video
-        seed: int = 42,
+    prompt: str,
+    model_path: str,
+    output_path: str = "./output.mp4",
+    image_or_video_path: str = "",
+    num_inference_steps: int = 50,
+    guidance_scale: float = 6.0,
+    num_videos_per_prompt: int = 1,
+    dtype: torch.dtype = torch.bfloat16,
+    generate_type: str = Literal["t2v", "i2v", "v2v"],  # i2v: image to video, v2v: video to video
+    seed: int = 42,
 ):
     """
     Generates a video based on the given prompt and saves it to the specified path.
@@ -53,7 +55,7 @@ def generate_video(
     - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt.
     - num_videos_per_prompt (int): Number of videos to generate per prompt.
    - dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
-    - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
+    - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
     - seed (int): The seed for reproducibility.
     """
 
@@ -97,13 +99,13 @@ def generate_video(
             image=image,  # The path of the image to be used as the background of the video
             num_videos_per_prompt=num_videos_per_prompt,  # Number of videos to generate per prompt
             num_inference_steps=num_inference_steps,  # Number of inference steps
-            num_frames=49,  # Number of frames to generate,changed to 49 for diffusers version `0.31.0` and after.
+            num_frames=49,  # Number of frames to generate,changed to 49 for diffusers version `0.30.3` and after.
             use_dynamic_cfg=True,  ## This id used for DPM Sechduler, for DDIM scheduler, it should be False
             guidance_scale=guidance_scale,
             generator=torch.Generator().manual_seed(seed),  # Set the seed for reproducibility
         ).frames[0]
     elif generate_type == "t2v":
-        video_generate = pipe(
+        video_generate = pipe(
             prompt=prompt,
             num_videos_per_prompt=num_videos_per_prompt,
             num_inference_steps=num_inference_steps,
@@ -130,19 +132,29 @@ def generate_video(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX")
     parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated")
-    parser.add_argument("--image_or_video_path", type=str, default=None,
-                        help="The path of the image to be used as the background of the video")
-    parser.add_argument("--model_path", type=str, default="THUDM/CogVideoX-5b",
-                        help="The path of the pre-trained model to be used")
-    parser.add_argument("--output_path", type=str, default="./output.mp4",
-                        help="The path where the generated video will be saved")
+    parser.add_argument(
+        "--image_or_video_path",
+        type=str,
+        default=None,
+        help="The path of the image to be used as the background of the video",
+    )
+    parser.add_argument(
+        "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
+    )
+    parser.add_argument(
+        "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
+    )
     parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
-    parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of steps for the inference process")
+    parser.add_argument(
+        "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
+    )
     parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
-    parser.add_argument("--generate_type", type=str, default="t2v",
-                        help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')")
-    parser.add_argument("--dtype", type=str, default="bfloat16",
-                        help="The data type for computation (e.g., 'float16' or 'bfloat16')")
+    parser.add_argument(
+        "--generate_type", type=str, default="t2v", help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')"
+    )
+    parser.add_argument(
+        "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16' or 'bfloat16')"
+    )
     parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
 
     args = parser.parse_args()
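Note on the cli_demo.py hunks above: the argparse defaults feed straight into `generate_video`, which dispatches to one of the three pipelines. A minimal text-to-video sketch of that path, using only arguments visible in this diff (the prompt, the offload call, and the fps value are illustrative assumptions, not part of the patch):

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # assumed here to keep VRAM use modest; not shown in this diff

video = pipe(
    prompt="A placeholder prompt",     # --prompt is required by the CLI
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,                     # 49 frames for diffusers `0.30.3` and later, per the comment above
    guidance_scale=6.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(video, "./output.mp4", fps=8)  # fps is an assumption; the export call is outside this diff
```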
diff --git a/inference/gradio_composite_demo/rife_model.py b/inference/gradio_composite_demo/rife_model.py
index 455cfff..0a69ca6 100644
--- a/inference/gradio_composite_demo/rife_model.py
+++ b/inference/gradio_composite_demo/rife_model.py
@@ -8,7 +8,7 @@ import numpy as np
 import logging
 import skvideo.io
 from rife.RIFE_HDv3 import Model
-from huggingface_hub import hf_hub_download, snapshot_download
+
 logger = logging.getLogger(__name__)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -19,9 +19,8 @@ def pad_image(img, scale):
     tmp = max(32, int(32 / scale))
     ph = ((h - 1) // tmp + 1) * tmp
     pw = ((w - 1) // tmp + 1) * tmp
-    padding = (0, pw - w, 0, ph - h)
-
-    return F.pad(img, padding), padding
+    padding = (0, 0, pw - w, ph - h)
+    return F.pad(img, padding)
 
 
 def make_inference(model, I0, I1, upscale_amount, n):
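Side note on the `pad_image` hunk above: `F.pad` reads a 4-element tuple as `(w_left, w_right, h_top, h_bottom)`, pairing entries with the last dimensions first. A standalone check with made-up shapes:

```python
import torch
import torch.nn.functional as F

x = torch.zeros(1, 3, 10, 20)          # hypothetical NCHW frame: h=10, w=20
print(F.pad(x, (0, 4, 0, 6)).shape)    # width 20 -> 24, height 10 -> 16
print(F.pad(x, (0, 0, 4, 6)).shape)    # width unchanged, height 10 -> 20
```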
@@ -45,15 +44,9 @@ def ssim_interpolation_rife(model, samples, exp=1, upscale_amount=1, output_devi
     for b in range(samples.shape[0]):
         frame = samples[b : b + 1]
         _, _, h, w = frame.shape
-
         I0 = samples[b : b + 1]
         I1 = samples[b + 1 : b + 2] if b + 2 < samples.shape[0] else samples[-1:]
-
-        I0, padding = pad_image(I0, upscale_amount)
-        I0 = I0.to(torch.float)
-        I1, _ = pad_image(I1, upscale_amount)
-        I1 = I1.to(torch.float)
-
+        I1 = pad_image(I1, upscale_amount)
         # [c, h, w]
         I0_small = F.interpolate(I0, (32, 32), mode="bilinear", align_corners=False)
         I1_small = F.interpolate(I1, (32, 32), mode="bilinear", align_corners=False)
@@ -61,24 +54,14 @@ def ssim_interpolation_rife(model, samples, exp=1, upscale_amount=1, output_devi
         ssim = ssim_matlab(I0_small[:, :3], I1_small[:, :3])
 
         if ssim > 0.996:
-            I1 = samples[b : b + 1]
-            # print(f'upscale_amount:{upscale_amount}')
-            # print(f'ssim:{upscale_amount}')
-            # print(f'I0 shape:{I0.shape}')
-            # print(f'I1 shape:{I1.shape}')
-            I1, padding = pad_image(I1, upscale_amount)
-            # print(f'I0 shape:{I0.shape}')
-            # print(f'I1 shape:{I1.shape}')
+            I1 = I0
+            I1 = pad_image(I1, upscale_amount)
             I1 = make_inference(model, I0, I1, upscale_amount, 1)
-
-            # print(f'I0 shape:{I0.shape}')
-            # print(f'I1[0] shape:{I1[0].shape}')
-            I1 = I1[0]
-
-            # print(f'I1[0] unpadded shape:{I1.shape}')
-            I1_small = F.interpolate(I1, (32, 32), mode="bilinear", align_corners=False)
+
+            I1_small = F.interpolate(I1[0], (32, 32), mode="bilinear", align_corners=False)
             ssim = ssim_matlab(I0_small[:, :3], I1_small[:, :3])
-            frame = I1[padding[0]:, padding[2]:, padding[3]:,padding[1]:]
+            frame = I1[0]
+            I1 = I1[0]
 
         tmp_output = []
         if ssim < 0.2:
@@ -88,13 +71,9 @@ def ssim_interpolation_rife(model, samples, exp=1, upscale_amount=1, output_devi
         else:
             tmp_output = make_inference(model, I0, I1, upscale_amount, 2**exp - 1) if exp else []
 
-        frame, _ = pad_image(frame, upscale_amount)
-        print(f'frame shape:{frame.shape}')
-        print(f'tmp_output[0] shape:{tmp_output[0].shape}')
+        frame = pad_image(frame, upscale_amount)
         tmp_output = [frame] + tmp_output
-
-        for i, frame in enumerate(tmp_output):
-            frame = F.interpolate(frame, size=(h, w))
+        for i, frame in enumerate(tmp_output):
             output.append(frame.to(output_device))
     return output
 
@@ -117,26 +96,14 @@ def frame_generator(video_capture):
 
 
 def rife_inference_with_path(model, video_path):
-    # Open the video file
     video_capture = cv2.VideoCapture(video_path)
-    fps = video_capture.get(cv2.CAP_PROP_FPS)  # Get the frames per second
-    tot_frame = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))  # Total frames in the video
+    tot_frame = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
     pt_frame_data = []
     pt_frame = skvideo.io.vreader(video_path)
-    # Cyclic reading of the video frames
-    while video_capture.isOpened():
-        ret, frame = video_capture.read()
-
-        if not ret:
-            break
-
-        # BGR to RGB
-        frame_rgb = frame[..., ::-1]
-        frame_rgb = frame_rgb.copy()
-        tensor = torch.from_numpy(frame_rgb).float().to("cpu", non_blocking=True).float() / 255.0
+    for frame in pt_frame:
         pt_frame_data.append(
-            tensor.permute(2, 0, 1)
-        )  # to [c, h, w,]
+            torch.from_numpy(np.transpose(frame, (2, 0, 1))).to("cpu", non_blocking=True).float() / 255.0
+        )
 
     pt_frame = torch.from_numpy(np.stack(pt_frame_data))
     pt_frame = pt_frame.to(device)
@@ -164,11 +131,3 @@ def rife_inference_with_latents(model, latents):
         rife_results.append(pt_image)
 
     return torch.stack(rife_results)
-
-
-if __name__ == "__main__":
-    snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")
-    model = load_rife_model("model_rife")
-
-    video_path = rife_inference_with_path(model, "/mnt/ceph/develop/jiawei/CogVideo/sat/configs/outputs/1_In_the_heart_of_a_bustling_city,_a_young_woman_with_long,_flowing_brown_hair_and_a_radiant_smile_stands_out._She's_donne/0/000000.mp4")
-    print(video_path)
\ No newline at end of file
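Because the ad-hoc `__main__` test block is removed above, downloading the RIFE weights and driving the module is now left to the caller. A minimal sketch modeled on the removed block (the input path is a placeholder and it assumes `rife_model.py` is on the import path):

```python
from huggingface_hub import snapshot_download  # no longer imported by rife_model.py itself
from rife_model import load_rife_model, rife_inference_with_path

snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")  # fetch the RIFE weights
model = load_rife_model("model_rife")
result = rife_inference_with_path(model, "input.mp4")  # placeholder input video
print(result)
```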
"/mnt/ceph/develop/jiawei/CogVideo/sat/configs/outputs/1_In_the_heart_of_a_bustling_city,_a_young_woman_with_long,_flowing_brown_hair_and_a_radiant_smile_stands_out._She's_donne/0/000000.mp4") - print(video_path) \ No newline at end of file diff --git a/inference/gradio_web_demo.py b/inference/gradio_web_demo.py index 9c5520e..a00d1b8 100644 --- a/inference/gradio_web_demo.py +++ b/inference/gradio_web_demo.py @@ -19,8 +19,9 @@ from openai import OpenAI import moviepy.editor as mp dtype = torch.bfloat16 +device = "cuda" # Need to use cuda -pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype) +pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype).to(device) pipe.enable_model_cpu_offload() pipe.enable_sequential_cpu_offload() pipe.vae.enable_slicing() diff --git a/requirements.txt b/requirements.txt index 4fe6ddc..492fee6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -diffusers>=0.30.1 #git+https://github.com/huggingface/diffusers.git@main#egg=diffusers is suggested -transformers>=4.44.2 # The development team is working on version 4.44.2 -accelerate>=0.33.0 #git+https://github.com/huggingface/accelerate.git@main#egg=accelerate is suggested -sentencepiece>=0.2.0 # T5 used -SwissArmyTransformer>=0.4.12 +diffusers>=0.30.3 +accelerate>=0.34.2 +transformers>=4.44.2 numpy==1.26.0 -torch>=2.4.0 # Tested in 2.2 2.3 2.4 and 2.5, The development team is working on version 2.4.0. -torchvision>=0.19.0 # The development team is working on version 0.19.0. -gradio>=4.42.0 # For HF gradio demo -streamlit>=1.38.0 # For streamlit web demo -imageio==2.34.2 # For diffusers inference export video -imageio-ffmpeg==0.5.1 # For diffusers inference export video -openai>=1.42.0 # For prompt refiner -moviepy==1.0.3 # For export video +torch==2.4.0 +torchvision==0.19.0 +sentencepiece==0.2.0 +SwissArmyTransformer>=0.4.12 +gradio>=4.44.0 +streamlit>=1.38.0 +imageio>=2.35.1 +imageio-ffmpeg>=0.5.1 +openai>=1.45.0 +moviepy==1.0.3 pillow==9.5.0 \ No newline at end of file diff --git a/sat/diffusion_video.py b/sat/diffusion_video.py index 8329e9d..963038b 100644 --- a/sat/diffusion_video.py +++ b/sat/diffusion_video.py @@ -323,7 +323,6 @@ class SATVideoDiffusionEngine(nn.Module): if isinstance(c[k], torch.Tensor): c[k], uc[k] = map(lambda y: y[k][:N].to(self.device), (c, uc)) - if self.noised_image_input: image = x[:, :, 0:1] image = self.add_noise_to_first_frame(image) diff --git a/sat/finetune_multi_gpus.sh b/sat/finetune_multi_gpus.sh index a9a8ad2..a33f7cc 100644 --- a/sat/finetune_multi_gpus.sh +++ b/sat/finetune_multi_gpus.sh @@ -1,8 +1,8 @@ #! /bin/bash -echo "RUN on $(hostname), CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" +echo "RUN on $(hostname), CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_5b_i2v_lora.yaml configs/sft.yaml --seed $RANDOM" +run_cmd="PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --standalone --nproc_per_node=8 train_video.py --base configs/test_cogvideox_5b_i2v_lora.yaml configs/test_sft.yaml --seed $RANDOM" echo ${run_cmd} eval ${run_cmd}