diff --git a/GPT_SoVITS/Accelerate/PyTorch/__init__.py b/GPT_SoVITS/Accelerate/PyTorch/__init__.py
index 7cef76e7..d2f73b3b 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/__init__.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/__init__.py
@@ -1,4 +1,5 @@
 import importlib.util
+import os
 
 import torch
 
@@ -25,8 +26,12 @@ torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
 torch.backends.cudnn.benchmark = True
 torch.backends.cudnn.enabled = True
 
+cpu_count = os.cpu_count() or 1
+torch.set_num_threads(cpu_count)
+torch.set_num_interop_threads(cpu_count)
+
 backends = ["torch_varlen"]
-if torch.cuda.is_available():
+if torch.cuda.is_available() and torch.version.cuda is not None:
     backends.append("torch_static_cuda_graph")
     # if importlib.util.find_spec("sageattention") is not None:
     #     for i in range(torch.cuda.device_count()):
@@ -44,7 +49,7 @@ if torch.cuda.is_available():
 #     backends.append("mps_flash_attn_varlen")
 
 BLACKWELL = False
-if torch.cuda.is_available():
+if torch.cuda.is_available() and torch.version.cuda is not None:
     for i in range(torch.cuda.device_count()):
         major, minor = torch.cuda.get_device_capability(i)
         sm_version = major + minor / 10.0
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
index adf7fceb..e3a49ce1 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
@@ -12,6 +12,7 @@ from ..structs import T2SSession
 from ..t2s_model_abc import (
     AttentionABC,
     CUDAGraphCacheABC,
+    CUDAGraphStateABC,
     FeedForward,
     KVCacheNHD,
     KVCacheProtocol,
@@ -114,6 +115,8 @@ class T2SDecoder(T2SDecoderABC):
 
         self.kv_class = KVCacheNHD
 
+        self.graph_cache_class = CUDAGraphCache
+
     def compile(self, *args, **kwds):
         pass
 
@@ -124,40 +127,39 @@ class T2SDecoder(T2SDecoderABC):
         return super().pre_forward(session)
 
 
-class CUDAGraphCache(CUDAGraphCacheABC):
+class CUDAGraphState(CUDAGraphStateABC):
+    applicable: bool = True
+
     def __init__(
         self,
-        decoder: T2SDecoder,
+        bsz: int,
+        decoder: T2SDecoderABC,
     ) -> None:
-        self.is_applicable = True
-        super().__init__(decoder)
+        super().__init__(bsz, decoder)
 
-    def release_graph(self, session: T2SSession):
-        if session.id == self.id:
-            self.assigned = False
-        else:
-            assert session.graph
-            session.graph.reset()
-            del session.graph, session.xy_pos_, session.xy_dec_, session.input_pos, session.kv_cache
+    def capture(self):
+        graph = self.decoder.capture(
+            self.input_pos,
+            self.xy_pos,
+            self.xy_dec,
+            self.kv_cache,
+        )
+        self.graph = graph
+        self.stream = torch.cuda.Stream()
 
-    def get_cache_graph(self, session: T2SSession):
-        assert self.graph
-        session.graph = self.graph
-        session.stream = self.stream
 
-        session.xy_pos_ = self.xy_pos
-        session.xy_dec_ = self.xy_dec
-        session.input_pos = self.input_pos.copy_(session.input_pos)
+class CUDAGraphCache(CUDAGraphCacheABC):
+    is_applicable = True
 
-        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
-            cache.sync_cache(cache_)
+    def __init__(
+        self,
+        decoder,
+        cache_size: int = 5,
+    ) -> None:
+        super().__init__(decoder, cache_size)
 
-    def capture_new_graph(self, session: T2SSession):
-        session.xy_pos_ = self.xy_pos.clone()
-        session.xy_dec_ = self.xy_dec.clone()
-        session.input_pos = self.input_pos.clone().copy_(session.input_pos)
-
-        args, kwds = self.decoder.pre_forward(session)
-        graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
-        session.graph = graph
-        session.stream = torch.cuda.Stream()  # type: ignore
+    def create_graph_cache(self, bsz: int):
+        for _ in range(self.cache_size):
+            state = CUDAGraphState(bsz, self.decoder)
+            state.capture()
+            self.graph_cache[bsz].put(state)
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
index a499e63e..a3ef0a3f 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
@@ -6,6 +6,7 @@ from ..structs import KVCacheProtocol, T2SSession
 from ..t2s_model_abc import (
     AttentionABC,
     CUDAGraphCacheABC,
+    CUDAGraphStateABC,
     FeedForward,
     KVCacheHND,
     T2SDecoderABC,
@@ -91,6 +92,8 @@ class T2SDecoder(T2SDecoderABC):
 
         self.kv_class = KVCacheHND
 
+        self.graph_cache_class = CUDAGraphCache
+
     def pre_forward(self, session: T2SSession):
         attn_mask = session.attn_mask
         return list(), dict(attn_mask=attn_mask)
@@ -111,57 +114,54 @@ class T2SDecoder(T2SDecoderABC):
         attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
 
 
+class CUDAGraphState(CUDAGraphStateABC):
+    applicable: bool = False
+
+    def __init__(
+        self,
+        bsz: int,
+        decoder: T2SDecoderABC,
+    ) -> None:
+        self.attn_mask: Tensor = (
+            torch.randint(
+                0,
+                2,
+                (bsz, decoder.n_head, 1, decoder.max_seq_length),
+            )
+            .bool()
+            .to(decoder.device)
+        )
+
+        super().__init__(bsz, decoder)
+
+    def capture(self):
+        graph = self.decoder.capture(
+            self.input_pos,
+            self.xy_pos,
+            self.xy_dec,
+            self.kv_cache,
+            attn_mask=self.attn_mask,
+        )
+        self.graph = graph
+        self.stream = torch.cuda.Stream()
+
+    def assign_graph(self, session: T2SSession):
+        session.attn_mask = self.attn_mask
+        return super().assign_graph(session)
+
+
 class CUDAGraphCache(CUDAGraphCacheABC):
+    is_applicable = True
+
     def __init__(
         self,
         decoder,
+        cache_size: int = 5,
     ) -> None:
-        self.is_applicable = True
-        super().__init__(decoder)
-        if torch.cuda.is_available():
-            self.attn_mask = (
-                torch.randint(0, 2, (decoder.max_batch_size, decoder.n_head, 1, decoder.max_seq_length))
-                .bool()
-                .to(self.device, self.dtype)
-            )
+        super().__init__(decoder, cache_size)
 
-    def release_graph(self, session: T2SSession):
-        if session.id == self.id:
-            self.assigned = False
-        else:
-            assert session.graph
-            session.graph.reset()
-            del (
-                session.graph,
-                session.xy_pos_,
-                session.xy_dec_,
-                session.input_pos,
-                session.kv_cache,
-                session.attn_mask,
-            )
-
-    def get_cache_graph(self, session: T2SSession):
-        assert self.graph
-        session.graph = self.graph
-        session.stream = self.stream
-
-        session.xy_pos_ = self.xy_pos
-        session.xy_dec_ = self.xy_dec
-        session.input_pos = self.input_pos.copy_(session.input_pos)
-
-        session.attn_mask = self.attn_mask
-
-        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
-            cache.sync_cache(cache_)
-
-    def capture_new_graph(self, session: T2SSession):
-        session.xy_pos_ = self.xy_pos.clone()
-        session.xy_dec_ = self.xy_dec.clone()
-        session.input_pos = self.input_pos.clone().copy_(session.input_pos)
-
-        session.attn_mask = self.attn_mask.clone().copy_(session.attn_mask)
-
-        args, kwds = self.decoder.pre_forward(session)
-        graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
-        session.graph = graph
-        session.stream = torch.cuda.Stream()  # type: ignore
+    def create_graph_cache(self, bsz: int):
+        for _ in range(self.cache_size):
+            state = CUDAGraphState(bsz, self.decoder)
+            state.capture()
+            self.graph_cache[bsz].put(state)
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py
index 5aee3e61..9f687ac4 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py
@@ -99,6 +99,8 @@ class T2SDecoder(T2SDecoderABC):
 
         self.kv_class = KVCacheHNDVarlen
 
+        self.graph_cache_class = CUDAGraphCache
+
     def capture(
         self,
         *args,
@@ -127,18 +129,13 @@ class T2SDecoder(T2SDecoderABC):
 
 
 class CUDAGraphCache(CUDAGraphCacheABC):
+    is_applicable = False
+
     def __init__(
         self,
         decoder,
     ) -> None:
-        self.is_applicable = False
         super().__init__(decoder)
 
-    def release_graph(self, session: T2SSession):
-        raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
-
-    def get_cache_graph(self, session: T2SSession):
-        raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
-
-    def capture_new_graph(self, session: T2SSession):
+    def create_graph_cache(self, bsz: int) -> NoReturn:
         raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
diff --git a/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py b/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py
index ee5f2f79..9c560666 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py
@@ -1,6 +1,6 @@
 import contextlib
+import gc
 import os
-import sys
 import time
 import traceback
 from importlib import import_module
@@ -34,7 +34,7 @@ class T2SEngine(T2SEngineProtocol):
         self.decoder_model: T2SDecoderABC = decoder_model.to(self.device, self.dtype)
         # self.decoder_model.compile()
 
-        self.graphcache: CUDAGraphCacheABC = self.init_cache()
+        self.graphcache: CUDAGraphCacheABC = decoder_model.graph_cache_class(self.decoder_model)
 
     def _handle_request(self, request: T2SRequest):
         with self.device:
@@ -47,6 +47,7 @@ class T2SEngine(T2SEngineProtocol):
             infer_speed = 0.0
             infer_time = 0.0
             idx = 0
+            graph_state = None
 
             torch_profiler = TorchProfiler(debug)
             with (
@@ -64,105 +65,132 @@ class T2SEngine(T2SEngineProtocol):
                 max_token = min(int(1500 - session.input_pos.max()), 1000) * session.bsz
                 task = progress.add_task("T2S Decoding", total=max_token)
 
-                for idx in range(max_token):
-                    progress.update(task, advance=session.bsz)
-                    if idx == 0:
-                        with torch_profiler.record("Prefill"), timer("Torch.Prefill", debug=debug):
-                            session.kv_cache = decoder.init_cache(session.bsz)
-                            t1 = time.perf_counter()
-                            xy_dec = decoder.h.prefill(session.xy_pos, session.kv_cache, session.attn_mask)
-                            xy_dec = xy_dec[batch_idx, None, session.input_pos - 1]
-                    else:
-                        if (
-                            request.use_cuda_graph
-                            and session.graph is None
-                            and self.graphcache.is_applicable
-                            and torch.cuda.is_available()
-                        ):
-                            self.graphcache.assign_graph(session)
-
-                        with torch_profiler.record("Decode"), timer("Torch.Decode", debug=debug):
-                            if session.graph:
-                                assert session.stream
-                                session.stream.wait_stream(torch.cuda.default_stream())
-                                with torch.cuda.stream(session.stream):
-                                    session.xy_pos_.copy_(session.xy_pos)
-                                    session.graph.replay()
-                                    xy_dec = session.xy_dec_.clone()
-                            else:
-                                args, kwds = decoder.pre_forward(session)
-                                xy_dec = decoder.h(
-                                    session.input_pos,
-                                    session.xy_pos,
-                                    session.kv_cache,
-                                    *args,
-                                    **kwds,
-                                )
-
-                    with torch.cuda.stream(session.stream) if session.stream is not None else contextlib.nullcontext():
-                        decoder.post_forward(idx, session)
-                        logits = decoder.ar_predict_layer(xy_dec.squeeze(1))
-
+                try:
+                    for idx in range(max_token):
+                        progress.update(task, advance=session.bsz)
                         if idx == 0:
-                            logits[:, -1] = float("-inf")
+                            with torch_profiler.record("Prefill"), timer("Torch.Prefill", debug=debug):
+                                session.kv_cache = decoder.init_cache(session.bsz)
+                                t1 = time.perf_counter()
+                                xy_dec = decoder.h.prefill(session.xy_pos, session.kv_cache, session.attn_mask)
+                                xy_dec = xy_dec[batch_idx, None, session.input_pos - 1]
+                        else:
+                            if (
+                                idx == 1
+                                and request.use_cuda_graph
+                                and self.graphcache.is_applicable
+                                and torch.cuda.is_available()
+                                and torch.version.cuda is not None
+                                and os.environ.get("CUDAGraph", "1") != "0"
+                            ):
+                                graph_state = self.graphcache[session.bsz].assign_graph(session)
 
-                        with torch_profiler.record("Sampling"), timer("Torch.Sampling", debug=debug):
-                            samples = session.sample(
-                                logits=logits,
-                                previous_tokens=session.y[:, : session.y_len + idx],
-                                top_k=request.top_k,
-                                top_p=request.top_p,
-                                repetition_penalty=request.repetition_penalty,
-                                temperature=request.temperature,
-                            )
-                            session.y[batch_idx.reshape(-1, 1), session.y_len + idx] = samples
-                            session.input_pos.add_(1)
-
-                        with torch_profiler.record("EOS"), timer("Torch.EOS", debug=debug):
-                            argmax_token = torch.argmax(logits, dim=-1)
-                            sample_token = samples.squeeze(1)
-                            EOS_mask = (argmax_token == decoder.EOS) | (sample_token == decoder.EOS)
-
-                            newly_done_mask = EOS_mask & (~session.completed)
-                            newly_done_indices = newly_done_mask.nonzero()
-
-                            if newly_done_indices.numel() > 0:
-                                for i in newly_done_indices:
-                                    session.y_results[i] = session.y[i, session.y_len : session.y_len + idx].squeeze(0)
-                                    session.completed[newly_done_indices] = True
-
-                        if torch.all(session.completed).item():
-                            logger.info(
-                                f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> {[i.size(-1) for i in session.y_results].__str__().strip('[]')}"
-                            )
-                            logger.info(
-                                f"Infer Speed: {(idx + 1) * session.bsz / (time.perf_counter() - t1):.2f} token/s"
-                            )
-                            infer_time = time.perf_counter() - t1
-                            infer_speed = (idx + 1) * session.bsz / infer_time
-                            break
-
-                        if (request.early_stop_num != -1 and idx >= request.early_stop_num) or idx == max_token - 1:
-                            for i in range(session.bsz):
-                                if not session.completed[i].item():
-                                    session.y_results[i] = session.y[[i], session.y_len : session.y_len + idx].squeeze(
-                                        0
+                            with torch_profiler.record("Decode"), timer("Torch.Decode", debug=debug):
+                                if session.graph:
+                                    assert session.stream
+                                    session.stream.wait_stream(torch.cuda.default_stream())
+                                    with torch.cuda.stream(session.stream):
+                                        session.xy_pos_.copy_(session.xy_pos)
+                                        session.graph.replay()
+                                        xy_dec = session.xy_dec_.clone()
+                                else:
+                                    args, kwds = decoder.pre_forward(session)
+                                    xy_dec = decoder.h(
+                                        session.input_pos,
+                                        session.xy_pos,
+                                        session.kv_cache,
+                                        *args,
+                                        **kwds,
                                     )
-                                    session.completed[i] = True
-                                logger.error("Bad Full Prediction")
+
+                        with (
+                            torch.cuda.stream(session.stream)
+                            if session.stream is not None
+                            else contextlib.nullcontext()
+                        ):
+                            decoder.post_forward(idx, session)
+                            logits = decoder.ar_predict_layer(xy_dec.squeeze(1))
+
+                            if idx == 0:
+                                logits[:, -1] = float("-inf")
+
+                            with torch_profiler.record("Sampling"), timer("Torch.Sampling", debug=debug):
+                                samples = session.sample(
+                                    logits=logits,
+                                    previous_tokens=session.y[:, : session.y_len + idx],
+                                    top_k=request.top_k,
+                                    top_p=request.top_p,
+                                    repetition_penalty=request.repetition_penalty,
+                                    temperature=request.temperature,
+                                )
+                                session.y[batch_idx.reshape(-1, 1), session.y_len + idx] = samples
+                                session.input_pos.add_(1)
+
+                            with torch_profiler.record("EOS"), timer("Torch.EOS", debug=debug):
+                                argmax_token = torch.argmax(logits, dim=-1)
+                                sample_token = samples.squeeze(1)
+                                EOS_mask = (argmax_token == decoder.EOS) | (sample_token == decoder.EOS)
+
+                                newly_done_mask = EOS_mask & (~session.completed)
+                                newly_done_indices = newly_done_mask.nonzero()
+
+                                if newly_done_indices.numel() > 0:
+                                    for i in newly_done_indices:
+                                        session.y_results[i] = session.y[
+                                            i, session.y_len : session.y_len + idx
+                                        ].squeeze(0)
+                                        session.completed[newly_done_indices] = True
+
+                            if torch.all(session.completed).item():
+                                logger.info(
+                                    f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> {[i.size(-1) for i in session.y_results].__str__().strip('[]')}"
+                                )
+                                logger.info(
+                                    f"Infer Speed: {(idx + 1) * session.bsz / (time.perf_counter() - t1):.2f} token/s"
+                                )
                                 infer_time = time.perf_counter() - t1
                                 infer_speed = (idx + 1) * session.bsz / infer_time
-                            break
+                                break
 
-                        with torch_profiler.record("NextPos"), timer("Torch.NextPos", debug=debug):
-                            y_emb = decoder.ar_audio_embedding(samples)
-                            session.xy_pos = decoder.ar_audio_position(session.input_pos - session.x_lens, y_emb)
+                            if (request.early_stop_num != -1 and idx >= request.early_stop_num) or idx == max_token - 1:
+                                for i in range(session.bsz):
+                                    if not session.completed[i].item():
+                                        session.y_results[i] = session.y[
+                                            [i], session.y_len : session.y_len + idx
+                                        ].squeeze(0)
+                                        session.completed[i] = True
+                                    logger.error("Bad Full Prediction")
+                                    infer_time = time.perf_counter() - t1
+                                    infer_speed = (idx + 1) * session.bsz / infer_time
+                                break
 
-                        if idx == 10:
-                            torch_profiler.end()
+                            with torch_profiler.record("NextPos"), timer("Torch.NextPos", debug=debug):
+                                y_emb = decoder.ar_audio_embedding(samples)
+                                session.xy_pos = decoder.ar_audio_position(session.input_pos - session.x_lens, y_emb)
 
-            if request.use_cuda_graph and self.graphcache.is_applicable:
-                self.graphcache.release_graph(session)
+                            if idx == 10:
+                                torch_profiler.end()
+                finally:
+                    if (
+                        request.use_cuda_graph
+                        and self.graphcache.is_applicable
+                        and torch.cuda.is_available()
+                        and torch.version.cuda is not None
+                        and os.environ.get("CUDAGraph", "1") != "0"
+                    ):
+                        self.graphcache.release_graph(graph_state)
+
+                        match decoder.device.type:
+                            case "cuda":
+                                torch.cuda.empty_cache()
+                            case "mps":
+                                torch.mps.empty_cache()
+                            case "xpu":
+                                torch.xpu.empty_cache()
+                            case "mtia":
+                                torch.mtia.empty_cache()
+                            case "cpu":
+                                gc.collect(1)
 
             return session.y_results[: request.valid_length], infer_speed, infer_time, (idx + 1) * session.bsz
 
@@ -205,14 +233,3 @@ class T2SEngine(T2SEngineProtocol):
             logger.info(f"Quantized by {quantize_mode} Quantization")
 
         return decoder.eval()
-
-    def init_cache(self):
-        assert self.decoder_model
-
-        module_name = self.decoder_model.__class__.__module__
-        module = sys.modules.get(module_name)
-        assert module
-
-        target_class: type[CUDAGraphCacheABC] = getattr(module, "CUDAGraphCache")
-
-        return target_class(self.decoder_model)
diff --git a/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py b/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
index 4a808877..39b831cc 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
@@ -8,11 +8,11 @@ import math
 import os
 import pickle
 import platform
-import random
 import time
 from abc import ABC, abstractmethod
 from contextlib import nullcontext
 from pathlib import Path
+from queue import Queue
 from typing import Literal, MutableSequence
 
 import torch
@@ -479,6 +479,8 @@ class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
             max_seq_length=max_seq_length,
         )
 
+        self.graph_cache_class: type[CUDAGraphCacheABC]
+
         self.bits: int
         self.group_size: int
 
@@ -639,56 +641,78 @@ class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
                 raise ValueError(f"Unsupported Quantization Mode for PyTorch: {mode}")
 
 
-class CUDAGraphCacheABC(ABC):
+class CUDAGraphStateABC(ABC):
     def __init__(
         self,
+        bsz: int,
         decoder: T2SDecoderABC,
     ) -> None:
-        self.is_applicable: bool
+        self.bsz = bsz
+        self.embedding_dim = decoder.embedding_dim
+        self.dtype = decoder.bert_proj.bias.dtype
+        self.device = decoder.device
 
-        if torch.cuda.is_available() and self.is_applicable:
-            self.device: torch.device = decoder.device
-            self.dtype = decoder.bert_proj.bias.dtype
+        self.decoder: T2SDecoderABC = decoder
+        self.graph: torch.cuda.CUDAGraph | None = None
+        self.stream: torch.cuda.Stream | None = None
 
-            self.assigned: bool = False
+        self.xy_pos = torch.rand(size=(self.bsz, 1, self.embedding_dim), device=self.device).to(self.dtype)
+        self.kv_cache: MutableSequence[KVCacheProtocol] = decoder.init_cache(bsz)
+        self.xy_dec = self.xy_pos.clone()
+        self.input_pos = torch.tensor([10] * self.bsz, device=self.device).to(torch.int32)
 
-            self.decoder: T2SDecoderABC = decoder
-            self.kv_cache: MutableSequence[KVCacheProtocol] = decoder.init_cache(decoder.max_batch_size)
-            self.xy_pos = torch.rand(size=(decoder.max_batch_size, 1, decoder.embedding_dim), device=self.device).to(
-                self.dtype
-            )
-            self.xy_dec = self.xy_pos.clone()
+        self.capture()
 
-            self.input_pos = torch.tensor([10] * decoder.max_batch_size, device=self.device).int()
-            self.graph: torch.cuda.CUDAGraph | None = None
-            self.stream: torch.cuda.Stream | None
+    @abstractmethod
+    def capture(self): ...
 
-            self.id: int = random.randint(1, 2**32 - 1)
+    def assign_graph(self, session: T2SSession) -> CUDAGraphStateABC:
+        assert self.graph
+        session.graph = self.graph
+        session.stream = self.stream
 
-    def assign_graph(self, session: T2SSession):
-        if self.graph is None:
-            args, kwds = self.decoder.pre_forward(session)
-            graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
-            self.graph = graph
-            self.stream = torch.cuda.Stream()
+        session.xy_pos_ = self.xy_pos
+        session.xy_dec_ = self.xy_dec
+        session.input_pos = self.input_pos.copy_(session.input_pos)
 
-        if self.assigned is False:
-            self.get_cache_graph(session)
-            session.id = self.id
-            self.assigned = True
+        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
+            cache.sync_cache(cache_)
+
+        return self
+
+
+class CUDAGraphCacheABC(ABC):
+    is_applicable: bool
+
+    def __init__(self, decoder: T2SDecoderABC, cache_size: int = 5) -> None:
+        self.decoder = decoder
+        self.max_batch_size = decoder.max_batch_size
+        self.cache_size = cache_size
+
+        self.graph_cache: dict[int, Queue[CUDAGraphStateABC]] = {}
+
+        if torch.cuda.is_available() and torch.version.cuda is not None and os.environ.get("CUDAGraph", "1") != "0":
+            self.create_graph_cache(1)
+
+    def __getitem__(self, bsz: int) -> CUDAGraphStateABC:
+        if self.is_applicable:
+            assert bsz <= self.max_batch_size
+            if self.graph_cache.get(bsz) is None:
+                self.create_graph_cache(bsz)
+            return self.graph_cache[bsz].get()
         else:
-            self.capture_new_graph(session)
+            raise RuntimeError("CUDAGraph Is Not Applicable")
 
     @abstractmethod
-    def release_graph(self, session: T2SSession): ...
+    def create_graph_cache(self, bsz: int): ...
 
-    @abstractmethod
-    def get_cache_graph(self, session: T2SSession):
-        pass
-
-    @abstractmethod
-    def capture_new_graph(self, session: T2SSession):
-        pass
+    def release_graph(self, graph_state: CUDAGraphStateABC | None):
+        if graph_state is None:
+            return
+        bsz = graph_state.bsz
+        assert bsz <= self.max_batch_size
+        assert self.graph_cache.get(bsz) is not None
+        self.graph_cache[bsz].put(graph_state)
 
 
 class TorchProfiler:
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index f245773e..68476762 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -955,7 +955,7 @@ class TTS:
         """
         ########## variables initialization ###########
         torch.set_grad_enabled(False)
-        ttfb_time = time.perf_counter()
+        ttft_time = time.perf_counter()
         self.stop_flag: bool = False
         text: str = inputs.get("text", "")
         text_lang: str = inputs.get("text_lang", "")
@@ -1141,7 +1141,9 @@ class TTS:
                     temperature=temperature,
                     repetition_penalty=repetition_penalty,
                     debug=os.environ.get("DEBUG", "0") == "1",
-                    use_cuda_graph=torch.cuda.is_available(),
+                    use_cuda_graph=torch.cuda.is_available()
+                    and torch.version.cuda is not None
+                    and os.environ.get("CUDAGraph", "1") != "0",
                 )
                 t2s_result = self.t2s_model.generate(t2s_request)
 
@@ -1151,7 +1153,7 @@ class TTS:
 
                 pred_semantic_list = t2s_result.result
                 assert pred_semantic_list
-                pred_semantic_list = [semantic.squeeze(0) for semantic in pred_semantic_list]
+                pred_semantic_list = [semantic.squeeze(0).to(self.configs.device) for semantic in pred_semantic_list]
                 infer_len.append(t2s_result.total_tokens)
                 infer_time.append(t2s_result.infer_speed[-1])
 
@@ -1243,7 +1245,7 @@ class TTS:
                             )
                             batch_audio_fragment.append(audio_fragment)
                 if idx == 0:
-                    ttfb_time = time.perf_counter() - ttfb_time
+                    ttft_time = time.perf_counter() - ttft_time
                 t5 = time.perf_counter()
                 t_45 += t5 - t4
                 if return_fragment:
@@ -1307,10 +1309,10 @@ class TTS:
             console.print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
             console.print(f">> RTF: {rtf_value:.2f}")
 
-            if ttfb_time > 2:
-                console.print(f">> TTFB: {ttfb_time:.3f} s")
+            if ttft_time > 2:
+                console.print(f">> TTFT: {ttft_time:.3f} s")
             else:
-                console.print(f">> TTFB: {ttfb_time * 1000:.3f} ms")
+                console.print(f">> TTFT: {ttft_time * 1000:.3f} ms")
 
             self.empty_cache()
 
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 08803e43..d7bfda5a 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -708,7 +708,7 @@ async def get_tts_wav(
     torch.set_grad_enabled(False)
     progress(0, desc="Inferencing...")
     debug = os.getenv("DEBUG") == "1"
-    ttfb_time = ttime()
+    ttft_time = ttime()
 
     if ref_wav_path:
         pass
@@ -829,7 +829,9 @@ async def get_tts_wav(
                 top_p=top_p,
                 temperature=temperature,
                 early_stop_num=1500,
-                use_cuda_graph=torch.cuda.is_available(),  # Try to use CUDA Graph for all backend, fallback to normal if not applicapble
+                use_cuda_graph=torch.cuda.is_available()
+                and torch.version.cuda is not None
+                and os.environ.get("CUDAGraph", "1") != "0",
                 debug=debug,
             )
             assert t2s_engine
@@ -938,7 +940,7 @@ async def get_tts_wav(
                 wav_gen = vocoder_model(cfm_res)  # type: ignore
                 audio = wav_gen[0][0]
         if i_text == 0:
-            ttfb_time = ttime() - ttfb_time
+            ttft_time = ttime() - ttft_time
         max_audio = torch.abs(audio).max()  # 简单防止16bit爆音
         if max_audio > 1:
             audio = audio / max_audio
@@ -980,12 +982,12 @@ async def get_tts_wav(
     gr.Info(f"{infer_speed_avg:.2f} Token/s", title="Infer Speed")
     gr.Info(f"{rtf_value:.2f}", title="RTF")
 
-    if ttfb_time > 2:
-        console.print(f">> TTFB: {ttfb_time:.3f} s")
-        gr.Info(f"{ttfb_time:.3f} s", title="TTFB")
+    if ttft_time > 2:
+        console.print(f">> TTFT: {ttft_time:.3f} s")
+        gr.Info(f"{ttft_time:.3f} s", title="TTFT")
     else:
-        console.print(f">> TTFB: {ttfb_time * 1000:.3f} ms")
-        gr.Info(f"{ttfb_time * 1000:.3f} ms", title="TTFB")
+        console.print(f">> TTFT: {ttft_time * 1000:.3f} ms")
+        gr.Info(f"{ttft_time * 1000:.3f} ms", title="TTFT")
 
     progress(1, desc="Done")
     yield opt_sr, (audio_opt_n * 32767).astype(np.int16)
diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index b752d06c..fd99311d 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -958,7 +958,7 @@ class SynthesizerTrn(nn.Module):
                     ge = self.prelu(ge)
             return ge
 
-        if type(refer) == list:
+        if isinstance(refer, list):
             ges = []
             for idx, _refer in enumerate(refer):
                 ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None)
diff --git a/README.md b/README.md
index 36ebebb5..22a03746 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ Unseen speakers few-shot fine-tuning demo:
 
 ## Infer Speed
 
-|   Device    |  RTF  |  TTFB  | Batch Size |           Backend           |
+|   Device    |  RTF  |  TTFT  | Batch Size |           Backend           |
 | :---------: | :---: | :----: | :--------: | :-------------------------: |
 |  RTX 5090   | 0.05  | 150 ms |     1      | Flash Attn Varlen CUDAGraph |
 |  RTX 4090   | 0.014 |  UNK   |     24     | Flash Attn Varlen CUDAGraph |
@@ -138,13 +138,13 @@ pip install -r requirements.txt
 
 ```bash
 conda activate GPTSoVits
-conda install ffmpeg=7 -c conda-forge
+conda install ffmpeg -c conda-forge
 ```
 
 ##### Ubuntu/Debian Users
 
 ```bash
-sudo apt install ffmpeg=7
+sudo apt install ffmpeg
 sudo apt install libsox-dev
 ```
 
diff --git a/docs/cn/README.md b/docs/cn/README.md
index d9f71654..c20b4846 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -51,7 +51,7 @@
 
 ## 推理速度
 
-|   Device    |  RTF  |  TTFB  | Batch Size |           Backend           |
+|   Device    |  RTF  |  TTFT  | Batch Size |           Backend           |
 | :---------: | :---: | :----: | :--------: | :-------------------------: |
 |  RTX 5090   | 0.05  | 150 ms |     1      | Flash Attn Varlen CUDAGraph |
 |  RTX 4090   | 0.014 |  UNK   |     24     | Flash Attn Varlen CUDAGraph |
@@ -136,13 +136,13 @@ pip install -r requirements.txt
 
 ```bash
 conda activate GPTSoVits
-conda install ffmpeg=7 -c conda-forge
+conda install ffmpeg -c conda-forge
 ```
 
 ##### Ubuntu/Debian 用户
 
 ```bash
-sudo apt install ffmpeg=7
+sudo apt install ffmpeg
 sudo apt install libsox-dev
 ```
 
diff --git a/docs/ja/README.md b/docs/ja/README.md
index ef3446a4..682ec42b 100644
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@@ -51,7 +51,7 @@
 
 ## 推論速度
 
-|   Device    |  RTF  |  TTFB  | Batch Size |           Backend           |
+|   Device    |  RTF  |  TTFT  | Batch Size |           Backend           |
 | :---------: | :---: | :----: | :--------: | :-------------------------: |
 |  RTX 5090   | 0.05  | 150 ms |     1      | Flash Attn Varlen CUDAGraph |
 |  RTX 4090   | 0.014 |  UNK   |     24     | Flash Attn Varlen CUDAGraph |
@@ -126,13 +126,13 @@ pip install -r requirements.txt
 
 ```bash
 conda activate GPTSoVits
-conda install ffmpeg=7 -c conda-forge
+conda install ffmpeg -c conda-forge
 ```
 
 ##### Ubuntu/Debian ユーザー
 
 ```bash
-sudo apt install ffmpeg=7
+sudo apt install ffmpeg
 sudo apt install libsox-dev
 ```
 
diff --git a/docs/ko/README.md b/docs/ko/README.md
index 809debf1..2b97b0c4 100644
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@@ -51,7 +51,7 @@
 
 ## 추론 속도
 
-|   Device    |  RTF  |  TTFB  | Batch Size |           Backend           |
+|   Device    |  RTF  |  TTFT  | Batch Size |           Backend           |
 | :---------: | :---: | :----: | :--------: | :-------------------------: |
 |  RTX 5090   | 0.05  | 150 ms |     1      | Flash Attn Varlen CUDAGraph |
 |  RTX 4090   | 0.014 |  UNK   |     24     | Flash Attn Varlen CUDAGraph |
@@ -132,13 +132,13 @@ pip install -r requirements.txt
 
 ```bash
 conda activate GPTSoVits
-conda install ffmpeg=7 -c conda-forge
+conda install ffmpeg -c conda-forge
 ```
 
 ##### Ubuntu/Debian 사용자
 
 ```bash
-sudo apt install ffmpeg=7
+sudo apt install ffmpeg
 sudo apt install libsox-dev
 ```
 
diff --git a/docs/tr/README.md b/docs/tr/README.md
index 88c7a9ae..a8ee23df 100644
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@@ -51,7 +51,7 @@ Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:
 
 ## çıkarım hızı
 
-|   Device    |  RTF  |  TTFB  | Batch Size |           Backend           |
+|   Device    |  RTF  |  TTFT  | Batch Size |           Backend           |
 | :---------: | :---: | :----: | :--------: | :-------------------------: |
 |  RTX 5090   | 0.05  | 150 ms |     1      | Flash Attn Varlen CUDAGraph |
 |  RTX 4090   | 0.014 |  UNK   |     24     | Flash Attn Varlen CUDAGraph |
@@ -132,13 +132,13 @@ pip install -r requirements.txt
 
 ```bash
 conda activate GPTSoVits
-conda install ffmpeg=7 -c conda-forge
+conda install ffmpeg -c conda-forge
 ```
 
 ##### Ubuntu/Debian Kullanıcıları
 
 ```bash
-sudo apt install ffmpeg=7
+sudo apt install ffmpeg
 sudo apt install libsox-dev
 ```
 
diff --git a/install.sh b/install.sh
index 5fe5dcb6..2276ac44 100644
--- a/install.sh
+++ b/install.sh
@@ -227,7 +227,7 @@ else
 fi
 
 echo -e "${INFO}Installing FFmpeg & CMake..."
-run_conda_quiet ffmpeg=7 cmake make
+run_conda_quiet ffmpeg cmake make
 echo -e "${SUCCESS}FFmpeg & CMake Installed"
 
 echo -e "${INFO}Installing unzip..."
diff --git a/test.py b/test.py
index 098beade..423a94d6 100644
--- a/test.py
+++ b/test.py
@@ -552,7 +552,7 @@ def get_tts_wav(
 ):
     torch.set_grad_enabled(False)
     debug = os.getenv("DEBUG") == "1"
-    ttfb_time = ttime()
+    ttft_time = ttime()
 
     if ref_wav_path:
         pass
@@ -698,7 +698,7 @@ def get_tts_wav(
         )[0][0]  # type: ignore
 
         if i_text == 0:
-            ttfb_time = ttime() - ttfb_time
+            ttft_time = ttime() - ttft_time
         max_audio = torch.abs(audio).max()  # 简单防止16bit爆音
         if max_audio > 1:
             audio = audio / max_audio
@@ -729,10 +729,10 @@ def get_tts_wav(
     console.print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
     console.print(f">> RTF: {rtf_value:.2f}")
 
-    if ttfb_time > 2:
-        console.print(f">> TTFB: {ttfb_time:.3f} s")
+    if ttft_time > 2:
+        console.print(f">> TTFT: {ttft_time:.3f} s")
     else:
-        console.print(f">> TTFB: {ttfb_time * 1000:.3f} ms")
+        console.print(f">> TTFT: {ttft_time * 1000:.3f} ms")
 
     yield opt_sr, (audio_opt_n * 32767).astype(np.int16)