diff --git a/.github/build_windows_packages.ps1 b/.github/build_windows_packages.ps1
index 2e4acb2a..d527c5bd 100644
--- a/.github/build_windows_packages.ps1
+++ b/.github/build_windows_packages.ps1
@@ -115,12 +115,17 @@ Remove-Item $ffDir.FullName -Recurse -Force
 Write-Host "[INFO] Installing PyTorch..."
 & ".\runtime\python.exe" -m ensurepip
 & ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location
+
 switch ($cuda) {
     "cu124" {
-        & ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install psutil ninja packaging wheel "setuptools>=42" --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
     }
     "cu128" {
+        & ".\runtime\python.exe" -m pip install psutil ninja packaging wheel "setuptools>=42" --no-warn-script-location
         & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
     }
     default {
         Write-Error "Unsupported CUDA version: $cuda"
diff --git a/.github/workflows/build_windows_packages.yaml b/.github/workflows/build_windows_packages.yaml
index 32861463..95e9d0ff 100644
--- a/.github/workflows/build_windows_packages.yaml
+++ b/.github/workflows/build_windows_packages.yaml
@@ -31,6 +31,15 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Install Windows CUDA 12.9
+        if: ${{ runner.os == 'Windows' && matrix.torch_cuda == '12.8' }}
+        uses: Jimver/cuda-toolkit
+        id: cuda-toolkit-win-129
+        with:
+          cuda: 12.9.1
+          method: "network"
+          sub-packages: '["nvcc", "cudart", "visual_studio_integration"]'
+
       - name: Run Build and Upload Script
         shell: pwsh
         run: |
diff --git a/Docker/miniconda_install.sh b/Docker/miniconda_install.sh
index 001a2a46..cf2e3d6f 100644
--- a/Docker/miniconda_install.sh
+++ b/Docker/miniconda_install.sh
@@ -23,8 +23,10 @@ fi
 
 if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
     "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh
+    SYSROOT_PKG="sysroot_linux-64>=2.28"
 elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then
     "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh
+    SYSROOT_PKG="sysroot_linux-aarch64>=2.28"
 else
     exit 1
 fi
@@ -45,20 +47,36 @@ rm miniconda.sh
 
 source "$HOME/miniconda3/etc/profile.d/conda.sh"
 
+"$HOME/miniconda3/bin/conda" init bash
+
+source "$HOME/.bashrc"
+
 "$HOME/miniconda3/bin/conda" config --add channels conda-forge
 
 "$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null
 
 "$HOME/miniconda3/bin/conda" install python=3.11 -q -y
 
-"$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y
+"$HOME/miniconda3/bin/conda" install gcc=11 gxx ffmpeg cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -q -y
 
 if [ "$CUDA_VERSION" = "12.8" ]; then
     "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
+    "$HOME/miniconda3/bin/conda" install cuda-nvcc=12.8 -c nvidia
 elif [ "$CUDA_VERSION" = "12.6" ]; then
-    "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
+    "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
+    "$HOME/miniconda3/bin/conda" install cuda-nvcc=12.6 -c nvidia
 fi
 
+CUDA_PATH=$(echo "$HOME/miniconda3/targets/"*-linux | awk '{print $1}')
+
+export CUDA_HOME=$CUDA_PATH
+export PATH="$HOME/miniconda3/bin:$PATH"
+export PATH="$CUDA_HOME/bin:$PATH"
+export PATH="$CUDA_HOME/nvvm/bin:$PATH"
+
+"$HOME/miniconda3/bin/pip" install psutil ninja packaging wheel "setuptools>=42"
+"$HOME/miniconda3/bin/pip" install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
+
 "$HOME/miniconda3/bin/pip" cache purge
 
 rm $LOG_PATH
diff --git a/GPT_SoVITS/AR/text_processing/__init__.py b/GPT_SoVITS/AR/text_processing/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py
deleted file mode 100644
index 1003040e..00000000
--- a/GPT_SoVITS/AR/text_processing/phonemizer.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
-# reference: https://github.com/lifeiteng/vall-e
-import itertools
-import re
-from typing import Dict
-from typing import List
-
-import regex
-from gruut import sentences
-from gruut.const import Sentence
-from gruut.const import Word
-from AR.text_processing.symbols import SYMBOL_TO_ID
-
-
-class GruutPhonemizer:
-    def __init__(self, language: str):
-        self._phonemizer = sentences
-        self.lang = language
-        self.symbol_to_id = SYMBOL_TO_ID
-        self._special_cases_dict: Dict[str] = {
-            r"\.\.\.": "... ",
-            ";": "; ",
-            ":": ": ",
-            ",": ", ",
-            r"\.": ". ",
-            "!": "! ",
-            r"\?": "? ",
-            "—": "—",
-            "…": "… ",
-            "«": "«",
-            "»": "»",
-        }
-        self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])"
-
-    def _normalize_punctuation(self, text: str) -> str:
-        text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text)
-        text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
-        text = regex.sub(r"\pZ+", r" ", text)
-        return text.strip()
-
-    def _convert_punctuation(self, word: Word) -> str:
-        if not word.phonemes:
-            return ""
-        if word.phonemes[0] in ["‖", "|"]:
-            return word.text.strip()
-
-        phonemes = "".join(word.phonemes)
-        # remove modifier characters ˈˌː with regex
-        phonemes = re.sub(r"[ˈˌː͡]", "", phonemes)
-        return phonemes.strip()
-
-    def phonemize(self, text: str, espeak: bool = False) -> str:
-        text_to_phonemize: str = self._normalize_punctuation(text)
-        sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)]
-        words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)]
-        return " ".join(words)
-
-    def transform(self, phonemes):
-        # convert phonemes to ids
-        # dictionary is in symbols.py
-        return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()]
-
-
-if __name__ == "__main__":
-    phonemizer = GruutPhonemizer("en-us")
-    # text -> IPA
-    phonemes = phonemizer.phonemize("Hello, wor-ld ?")
-    print("phonemes:", phonemes)
-    print("len(phonemes):", len(phonemes))
-    phoneme_ids = phonemizer.transform(phonemes)
-    print("phoneme_ids:", phoneme_ids)
-    print("len(phoneme_ids):", len(phoneme_ids))
diff --git a/GPT_SoVITS/AR/text_processing/symbols.py b/GPT_SoVITS/AR/text_processing/symbols.py
deleted file mode 100644
index f7ef57fa..00000000
--- a/GPT_SoVITS/AR/text_processing/symbols.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
-# reference: https://github.com/lifeiteng/vall-e
-PAD = "_"
-PUNCTUATION = ';:,.!?¡¿—…"«»“” '
-LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-IPA_LETTERS = (
-    "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
-)
-SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
-SPACE_ID = SYMBOLS.index(" ")
-SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
-ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
diff --git a/GPT_SoVITS/Accelerate/MLX/__init__.py b/GPT_SoVITS/Accelerate/MLX/__init__.py
new file mode 100644
index 00000000..4042c0f1
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/__init__.py
@@ -0,0 +1,11 @@
+import importlib.util
+
+if importlib.util.find_spec("mlx") is not None:
+    from .sample_funcs_mlx import sample_naive as sample_naive_mlx
+    from .t2s_engine_mlx import T2SEngine as T2SEngineMLX
+
+    backends = ["mlx_static", "mlx_quantized", "mlx_varlen"]
+else:
+    backends = []
+
+__all__ = ["T2SEngineMLX", "sample_naive_mlx", "backends"]
diff --git a/GPT_SoVITS/Accelerate/MLX/backends/mlx_quantized.py b/GPT_SoVITS/Accelerate/MLX/backends/mlx_quantized.py
new file mode 100644
index 00000000..fbd10af4
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/backends/mlx_quantized.py
@@ -0,0 +1,172 @@
+from __future__ import annotations
+
+from typing import cast
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from ..structs_mlx import KVCacheQ
+from ..t2s_model_abc import (
+    AttentionABC,
+    KVCache,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Array = mx.array
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+        self.kc_class = KVCacheHND
+
+    @staticmethod
+    def quantized_scaled_dot_product_attention(
+        queries: Array,
+        q_keys: tuple[Array, Array, Array],
+        q_values: tuple[Array, Array, Array],
+        scale: float,
+        mask: Array,
+        group_size: int = 32,
+        bits: int = 8,
+    ) -> Array:
+        queries *= scale
+
+        scores = mx.quantized_matmul(queries, *q_keys, transpose=True, group_size=group_size, bits=bits)
+        scores = mx.where(mask, scores, -mx.inf)
+        scores = mx.softmax(scores, axis=-1, precise=True)  # type: ignore
+        out = mx.quantized_matmul(scores, *q_values, transpose=False, group_size=group_size, bits=bits)
+
+        return out
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+        assert len(kv_cache) == 2
+
+        max_idx = int(input_pos.max())
+
+        q, k, v = map(lambda x: x[..., :max_idx, :], (q, *kv_cache))
+
+        mask = attn_mask[..., :max_idx]
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=mask)
+
+        attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+    # def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+    #     bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+    #     q, k, v = self.in_proj(x).split(3, axis=-1)
+
+    #     q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+    #     q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+    #     kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+
+    #     assert len(kv_cache) == 3
+    #     (k_q, k_s, k_b), (v_q, v_s, v_b), (group_size, bits) = kv_cache
+
+    #     k_q, k_s, k_b, v_q, v_s, v_b = map(lambda x: x[..., : int(input_pos.max()), :], (k_q, k_s, k_b, v_q, v_s, v_b))
+
+    #     mask = attn_mask[..., : int(input_pos.max())]
+
+    #     attn = Attention.quantized_scaled_dot_product_attention(
+    #         q,
+    #         (k_q, k_s, k_b),
+    #         (v_q, v_s, v_b),
+    #         self.scale,
+    #         mask,
+    #         group_size,
+    #         bits,
+    #     )
+
+    #     attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+    #     output = self.out_proj(attn)
+
+    #     return output
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int, *args, **kwds) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length, *args, **kwds)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length, *args, **kwds)
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+        *args,
+        **kwds,
+    ) -> None:
+        super().__init__(
+            hidden_dim,
+            n_layer,
+            n_head,
+            ffn_dim,
+            vocab_size,
+            max_seq_length,
+            max_batch_size,
+            *args,
+            **kwds,
+        )
+
+        self.layers = [
+            TransformerBlock(
+                n_head,
+                ffn_dim,
+                hidden_dim,
+                max_seq_length,
+                *args,
+                **kwds,
+            )
+            for _ in range(n_layer)
+        ]
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 1800,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.h = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
+        self.group_size = 32
+        self.bits = 8
+
+    # def init_cache(self, bsz: int = 0):
+    #     return super().init_cache(bsz, group_size=self.group_size, bits=self.bits)
+
+    def quantized(self):
+        nn.quantize(self.h, self.group_size, self.bits)
diff --git a/GPT_SoVITS/Accelerate/MLX/backends/mlx_static.py b/GPT_SoVITS/Accelerate/MLX/backends/mlx_static.py
new file mode 100644
index 00000000..6716fb32
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/backends/mlx_static.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from typing import cast
+
+import mlx.core as mx
+
+from ..structs_mlx import KVCache, KVCacheQ
+from ..t2s_model_abc import (
+    AttentionABC,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Array = mx.array
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+        self.kc_class = KVCacheHND
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+        assert len(kv_cache) == 2
+
+        k, v = kv_cache
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=attn_mask)
+
+        attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+    ) -> None:
+        super().__init__(
+            hidden_dim,
+            n_layer,
+            n_head,
+            ffn_dim,
+            vocab_size,
+            max_seq_length,
+            max_batch_size,
+        )
+
+        self.layers = [
+            TransformerBlock(
+                n_head,
+                ffn_dim,
+                hidden_dim,
+                max_seq_length,
+            )
+            for _ in range(n_layer)
+        ]
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 1800,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.h = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
diff --git a/GPT_SoVITS/Accelerate/MLX/backends/mlx_varlen.py b/GPT_SoVITS/Accelerate/MLX/backends/mlx_varlen.py
new file mode 100644
index 00000000..3f07f6e2
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/backends/mlx_varlen.py
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+from typing import cast
+
+import mlx.core as mx
+
+from ..structs_mlx import KVCache, KVCacheQ
+from ..t2s_model_abc import (
+    AttentionABC,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Array = mx.array
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+        self.kc_class = KVCacheHND
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(x).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        kv_cache = self.kc_class.update_cache(input_pos, k, v, kv_cache, cache_idx)
+        assert len(kv_cache) == 2
+
+        max_idx = int(input_pos.max())
+
+        q, k, v = map(lambda x: x[..., :max_idx, :], (q, *kv_cache))
+
+        mask = attn_mask[..., :max_idx]
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale, mask=mask)
+
+        attn = attn.swapaxes(1, 2).reshape(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+    ) -> None:
+        super().__init__(
+            hidden_dim,
+            n_layer,
+            n_head,
+            ffn_dim,
+            vocab_size,
+            max_seq_length,
+            max_batch_size,
+        )
+
+        self.layers = [
+            TransformerBlock(
+                n_head,
+                ffn_dim,
+                hidden_dim,
+                max_seq_length,
+            )
+            for _ in range(n_layer)
+        ]
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 1800,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.h = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
diff --git a/GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py b/GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py
new file mode 100644
index 00000000..2ec6f8d9
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py
@@ -0,0 +1,64 @@
+from functools import partial
+from typing import Protocol, cast
+
+import mlx.core as mx
+
+Array = mx.array
+
+
+class SampleProtocolMLX(Protocol):
+    @staticmethod
+    def __call__(
+        logits: Array,
+        previous_tokens: Array,
+        temperature: float,
+        top_k: int,
+        top_p: float,
+        repetition_penalty: float,
+    ) -> Array: ...
+
+
+class sample_naive(SampleProtocolMLX):
+    @partial(mx.compile)
+    @staticmethod
+    def __call__(
+        logits,
+        previous_tokens,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+    ):
+        if temperature <= 1e-5:
+            probs = mx.softmax(logits, axis=-1)
+            return mx.argmax(probs, axis=-1, keepdims=True)
+
+        if repetition_penalty != 1.0:
+            batch_idx = mx.arange(cast(tuple[int, ...], previous_tokens.shape)[0])
+            previous_tokens = previous_tokens.astype(mx.int64)
+            selected_logists = logits[batch_idx, previous_tokens]
+            selected_logists = mx.where(
+                selected_logists < 0, selected_logists * repetition_penalty, selected_logists / repetition_penalty
+            )
+            logits[batch_idx, previous_tokens] = selected_logists
+
+        sorted_indices = mx.argsort(-logits, axis=-1)
+        sorted_logits = mx.take_along_axis(logits, sorted_indices, axis=-1)
+        cum_probs = mx.cumsum(mx.softmax(sorted_logits, axis=-1), axis=-1)
+        sorted_indices_to_remove = cum_probs > top_p
+        sorted_indices_to_remove[:, 0] = False
+        indices_to_remove = mx.zeros_like(logits).astype(mx.bool_)
+        batch_indices = mx.arange(cast(tuple[int, ...], logits.shape)[0])[:, None]
+        indices_to_remove[batch_indices, sorted_indices] = sorted_indices_to_remove
+        logits = mx.where(indices_to_remove, -mx.inf, logits)
+
+        logits = logits / temperature
+
+        v = mx.topk(logits, top_k)
+        pivot = mx.expand_dims(v[:, -1], -1)
+        logits = mx.where(logits < pivot, -mx.inf, logits)
+
+        gumbel_noise = mx.random.gumbel(shape=cast(tuple[int, ...], logits.shape), dtype=logits.dtype)
+        idx_next = mx.argmax(logits + gumbel_noise, axis=-1, keepdims=True).astype(mx.int32)
+
+        return idx_next
diff --git a/GPT_SoVITS/Accelerate/MLX/structs_mlx.py b/GPT_SoVITS/Accelerate/MLX/structs_mlx.py
new file mode 100644
index 00000000..ab6643a0
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/structs_mlx.py
@@ -0,0 +1,164 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import List, MutableSequence, Protocol, TypeAlias, cast
+
+import mlx.core as mx
+import torch
+
+from ..PyTorch.structs import T2SRequest, T2SResult
+from .sample_funcs_mlx import SampleProtocolMLX, sample_naive
+
+Tensor = torch.Tensor
+Array = mx.array
+
+
+@dataclass(slots=True)
+class T2SRequestMLX:
+    x: List[Array]
+    x_lens: Array
+    prompts: Array
+    bert_feature: List[Array]
+    valid_length: int
+    top_k: int = 5
+    top_p: float = 1
+    early_stop_num: int = -1
+    temperature: float = 1.0
+    repetition_penalty: float = 1.35
+
+    @classmethod
+    def from_torch(cls, request: T2SRequest) -> T2SRequestMLX:
+        x = list(map(lambda tensor: mx.array(tensor.cpu()), request.x))
+        x_lens = mx.array(request.x_lens.cpu())
+        prompts = mx.array(request.prompts.cpu())
+        bert_feature = list(map(lambda tensor: mx.array(tensor.cpu()), request.bert_feature))
+
+        return cls(
+            x,
+            x_lens,
+            prompts,
+            bert_feature,
+            request.valid_length,
+            request.top_k,
+            request.top_p,
+            request.early_stop_num,
+            request.temperature,
+            request.repetition_penalty,
+        )
+
+
+KVCache: TypeAlias = tuple[Array, Array]
+KVCacheQ: TypeAlias = tuple[tuple[Array, Array, Array], tuple[Array, Array, Array], tuple[int, int]]
+
+
+class KVCacheProtocol(Protocol):
+    @staticmethod
+    def empty(kv_cache: KVCache | KVCacheQ) -> None: ...
+
+    @staticmethod
+    def update_cache(
+        input_pos: Array, k_val: Array, v_val: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array
+    ) -> KVCache | KVCacheQ: ...
+
+    @staticmethod
+    def prefill_kv(k_val: Array, v_val: Array, kv_cache: KVCache | KVCacheQ) -> None: ...
+
+    @staticmethod
+    def init_cache(
+        batch_size: int, max_seq_length: int, n_heads: int, head_dim: int, dtype: mx.Dtype, *args, **kwds
+    ) -> KVCache | KVCacheQ: ...
+
+
+class T2SDecoderProtocol(Protocol):
+    max_seq_length: int
+    EOS: int
+    n_head: int
+
+    def embed(self, x: list[Array], y: Array, bert_features: list[Array]) -> Array: ...
+
+
+class T2SEngineProtocol(Protocol):
+    def _handle_request(self, request: T2SRequest) -> tuple[list[Array], float]: ...
+
+    def generate(self, request: T2SRequest) -> T2SResult: ...
+
+    @staticmethod
+    def load_decoder(
+        weights_path: os.PathLike, max_batch_size: int = 1, implement: str = "MLX"
+    ) -> T2SDecoderProtocol: ...
+
+
+class T2SSessionMLX:
+    def __init__(
+        self,
+        decoder: T2SDecoderProtocol,
+        request_torch: T2SRequest,
+        sample_func: type[SampleProtocolMLX] = sample_naive,
+        device: mx.Device = mx.Device(mx.cpu),
+        dtype: mx.Dtype = mx.float32,
+    ):
+        with mx.stream(device):
+            request = T2SRequestMLX.from_torch(request_torch)
+
+            self.decoder = decoder
+            self.request = request
+            self.device = device
+            self.dtype = dtype
+
+            bsz = len(request.x)
+            y_len: int = cast(tuple[int, ...], request.prompts.shape)[-1]
+            self.bsz = bsz
+            self.y_len = y_len
+
+            # Cache
+            self.kv_cache: MutableSequence[KVCache | KVCacheQ]
+            self.sample = sample_func()
+
+            # Forward args
+            self.x = [i.astype(mx.int32) for i in request.x]
+            self.x_lens = request.x_lens.astype(mx.int32)
+            self.y = mx.zeros((bsz, decoder.max_seq_length)).astype(mx.int32)
+            self.y[:, : cast(tuple[int, ...], request.prompts.shape)[-1]] = request.prompts.astype(mx.int32)
+            self.bert_feature = [i.astype(dtype) for i in request.bert_feature]
+
+            self.prefill_len = self.x_lens + cast(tuple[int, ...], request.prompts.shape)[1]
+
+            self.input_pos = mx.zeros_like(self.prefill_len)
+            self.input_pos += self.prefill_len
+
+            # EOS
+            self.completed = mx.array([False] * len(self.x)).astype(mx.bool_)
+            self.y_results: List[Array] = [None] * len(self.x)  # type: ignore
+
+            self.xy_pos = decoder.embed(self.x, request.prompts, self.bert_feature)
+
+            max_len = int(self.prefill_len.max(-1))
+            attn_mask = mx.zeros(shape=(bsz, max_len, max_len), dtype=mx.bool_)
+
+            for bs in range(bsz):
+                pos = int(self.x_lens[bs])
+                seq_len = pos + y_len
+
+                attn_mask[bs, :seq_len, :pos] = True
+
+                ar_mask = ~mx.triu(
+                    x=mx.ones(
+                        shape=(
+                            y_len,
+                            y_len,
+                        ),
+                        dtype=mx.bool_,
+                    ),
+                    k=1,
+                )
+                attn_mask[bs, pos:seq_len, pos:seq_len] = ar_mask
+
+            attn_mask = mx.repeat(mx.expand_dims(attn_mask, 1), decoder.n_head, 1)
+            self.attn_mask = attn_mask
+
+            mx.eval(self.attn_mask)
diff --git a/GPT_SoVITS/Accelerate/MLX/t2s_engine_mlx.py b/GPT_SoVITS/Accelerate/MLX/t2s_engine_mlx.py
new file mode 100644
index 00000000..7f2bdb3f
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/t2s_engine_mlx.py
@@ -0,0 +1,231 @@
+import gc
+import os
+import time
+import traceback
+from typing import cast
+
+import mlx.core as mx
+import torch
+from rich.progress import BarColumn, Progress, TextColumn, TimeRemainingColumn
+from tqdm import tqdm
+
+from ..PyTorch.structs import T2SEngineProtocol, T2SRequest
+from .backends import mlx_quantized, mlx_static, mlx_varlen
+from .structs_mlx import T2SResult, T2SSessionMLX
+from .t2s_model_abc import T2SDecoderABC
+
+Array = mx.array
+Tensor = torch.Tensor
+
+
+class T2SEngine(T2SEngineProtocol):
+    def __init__(
+        self,
+        decoder_model: T2SDecoderABC,
+        device: mx.Device | str = mx.Device(mx.cpu),
+        dtype: torch.dtype | mx.Dtype = torch.float32,
+    ) -> None:
+        if isinstance(device, str):
+            match device:
+                case "mx.cpu":
+                    device = mx.Device(mx.cpu)
+                case "mx.gpu":
+                    device = mx.Device(mx.gpu)
+
+        match dtype:
+            case torch.float32:
+                dtype = mx.float32
+            case torch.float16:
+                dtype = mx.float16
+            case torch.bfloat16:
+                dtype = mx.bfloat16
+
+        device = cast(mx.Device, device)
+        dtype = cast(mx.Dtype, dtype)
+
+        assert device.type.value in {0, 1}
+        assert dtype in {mx.float16, mx.bfloat16, mx.float32}
+
+        self.device = device
+        self.dtype = dtype
+
+        mx.set_default_device(device)
+        decoder_model.set_dtype(self.dtype)
+
+        self.decoder_model: T2SDecoderABC = decoder_model
+        self.decoder_model.compile()
+
+    def _handle_request(self, request: T2SRequest):
+        decoder = self.decoder_model
+        session = T2SSessionMLX(decoder, request, device=self.device, dtype=self.dtype)
+        batch_idx = mx.arange(session.bsz)
+
+        t1 = 0.0
+        infer_speed = 0.0
+        infer_time = 0.0
+
+        with (
+            mx.stream(session.device),
+            Progress(
+                TextColumn("[cyan]{task.description}"),
+                BarColumn(),
+                TextColumn("{task.completed}/{task.total}"),
+                TimeRemainingColumn(),
+                transient=True,
+            ) as progress,
+        ):
+            task = progress.add_task("T2S Decoding", total=1500)
+            for idx in range(1500):
+                progress.update(task, advance=1)
+                if idx == 0:
+                    session.kv_cache = decoder.init_cache(session.bsz)
+                    xy_dec = decoder.h.prefill(
+                        session.xy_pos,
+                        session.attn_mask,
+                        session.kv_cache,
+                    )  # bs, seq_len, embed_dim
+                    xy_dec = xy_dec[None, batch_idx, session.input_pos - 1]
+                else:
+                    args, kwds = decoder.pre_forward(session)
+                    xy_dec = decoder.h(
+                        session.input_pos,
+                        session.xy_pos,
+                        session.kv_cache,
+                        batch_idx,
+                        *args,
+                        **kwds,
+                    )
+
+                decoder.post_forward(idx, session)
+                logits = decoder.ar_predict_layer(xy_dec[:, -1])
+                session.input_pos += 1
+
+                if idx == 0:
+                    logits[:, -1] = -mx.inf
+
+                samples = session.sample(
+                    logits=logits,
+                    previous_tokens=session.y[:, : session.y_len + idx],
+                    top_k=request.top_k,
+                    top_p=request.top_p,
+                    repetition_penalty=request.repetition_penalty,
+                    temperature=request.temperature,
+                )
+
+                session.y[batch_idx, session.y_len + idx] = samples
+
+                argmax_token = mx.argmax(logits, axis=-1)
+                sample_token = samples.squeeze(1)
+                EOS_mask = (cast(Array, argmax_token == decoder.EOS)) | (sample_token == decoder.EOS)
+
+                newly_done_mask = EOS_mask & (~session.completed)
+                newly_done_indices = mx.where(newly_done_mask, batch_idx, -1)
+                pos = mx.where(newly_done_indices != -1, batch_idx, session.bsz)
+                pos_sorted = mx.sort(pos, axis=0)
+                valid_count = session.bsz - mx.sum(cast(Array, pos_sorted == session.bsz))
+                pos_final = pos_sorted[: int(valid_count)]
+                newly_done_indices = mx.expand_dims(newly_done_indices[pos_final], 0)
+
+                if newly_done_indices.size > 0:
+                    for i in newly_done_indices:
+                        session.y_results[int(i)] = session.y[i, session.y_len : session.y_len + idx]
+                        session.completed[newly_done_indices] = True
+
+                if mx.all(session.completed).item():
+                    if session.y.sum() == 0:
+                        session.y_results = [mx.array([0]) for _ in range(session.bsz)]
+                        tqdm.write("Bad Zero Prediction")
+                    else:
+                        tqdm.write(
+                            f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> {[cast(tuple[int, ...], i.shape)[-1] for i in session.y_results].__str__().strip('[]')}"
+                        )
+                        tqdm.write(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
+                        infer_time = time.perf_counter() - t1
+                        infer_speed = (idx - 1) / infer_time
+                    break
+
+                if (request.early_stop_num != -1 and idx >= request.early_stop_num) or idx == 1499:
+                    for j in range(session.bsz):
+                        if not session.completed[j].item():
+                            session.y_results[j] = session.y[[j], session.y_len : session.y_len + 1499]
+                            session.completed[j] = True
+                    tqdm.write(
+                        f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> {[cast(tuple[int, ...], i.shape)[-1] for i in session.y_results].__str__().strip('[]')}"
+                    )
+                    tqdm.write(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
+                    infer_time = time.perf_counter() - t1
+                    infer_speed = (idx - 1) / infer_time
+                    break
+
+                y_emb = decoder.ar_audio_embedding(samples)
+                session.xy_pos = decoder.ar_audio_position(session.input_pos - session.x_lens, y_emb)
+                mx.eval(session.xy_pos, session.y)
+
+                if idx == 1:
+                    t1 = time.perf_counter()
+
+                if idx % 100 == 0:
+                    mx.clear_cache()
+
+        match session.device:
+            case mx.gpu:
+                mx.clear_cache()
+            case mx.cpu:
+                gc.collect()
+
+        result_mlx = session.y_results[: request.valid_length]
+        mx.eval(result_mlx)
+        result = [torch.tensor(k) for k in result_mlx]
+        return result, infer_speed, infer_time
+
+    def generate(self, request: T2SRequest):
+        try:
+            result, infer_speed, infer_time = self._handle_request(request)
+            t2s_result = T2SResult(result=result, infer_speed=(infer_speed, infer_time), status="Success")
+        except Exception as e:
+            t2s_result = T2SResult(status="Error", exception=e, traceback=traceback.format_exc())
+        return t2s_result
+
+    @staticmethod
+    def replace_key(state_dict: dict[str, Tensor]):
+        state_dict_mlx: list[tuple[str, Array]] = []
+        for key, value in state_dict.items():
+            key = (
+                key.replace("model.", "")
+                .replace("in_proj_", "in_proj.")
+                .replace("self_attn", "attention")
+                .replace("linear", "feed_forward.linear")
+                .replace("norm1", "attention_norm")
+                .replace("norm2", "ffn_norm")
+            )
+            value_mlx = mx.array(value)
+            state_dict_mlx.append((key, value_mlx))
+        return state_dict_mlx
+
+    @staticmethod
+    def load_decoder(weights_path: os.PathLike, max_batch_size: int = 1, backend: str = "MLX-Varlen"):
+        print(f"Loading Text2Semantic Weights from {weights_path} with {backend} Backend")
+        dict_s1 = torch.load(weights_path, map_location="cpu", weights_only=False, mmap=True)
+        config = dict_s1["config"]
+        match backend:
+            case "MLX-Varlen":
+                decoder_cls: type[T2SDecoderABC] = mlx_varlen.T2SDecoder
+            case "MLX-Static":
+                decoder_cls = mlx_static.T2SDecoder
+            case "MLX-Quantized":
+                decoder_cls = mlx_quantized.T2SDecoder
+            case _:
+                raise RuntimeError(f"Backend {backend} Not Found")
+
+        decoder: T2SDecoderABC = decoder_cls(config, max_batch_size=max_batch_size)
+        state_dict = dict_s1["weight"]
+        state_dict_mlx = T2SEngine.replace_key(state_dict)
+        decoder.load_weights(state_dict_mlx)
+        decoder.eval()
+        mx.eval(decoder)
+
+        if "Quantized" in backend and isinstance(decoder, mlx_quantized.T2SDecoder):
+            decoder.quantized()
+            mx.eval(decoder)
+
+        return decoder
diff --git a/GPT_SoVITS/Accelerate/MLX/t2s_model_abc.py b/GPT_SoVITS/Accelerate/MLX/t2s_model_abc.py
new file mode 100644
index 00000000..fb295179
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/MLX/t2s_model_abc.py
@@ -0,0 +1,530 @@
+from __future__ import annotations
+
+import math
+from abc import ABC, abstractmethod
+from typing import MutableSequence, cast
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from .structs_mlx import KVCache, KVCacheProtocol, KVCacheQ, T2SDecoderProtocol, T2SSessionMLX
+
+Array = mx.array
+
+
+class TokenEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        vocab_size: int,
+    ):
+        super().__init__()
+
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+
+        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
+
+    @property
+    def weight(self):
+        return self.word_embeddings.weight
+
+    def embedding(self, index: int):
+        return self.word_embeddings.weight[index : index + 1]
+
+    def __call__(self, x: Array):
+        x = self.word_embeddings(x)
+        return x
+
+
+class SinePositionalEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        scale: bool = False,
+        max_batch_size: int = 10,
+        max_seq_len: int = 1800,
+    ):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
+        self.alpha = mx.ones(1)
+        self.max_batch_size = max_batch_size
+        self.max_seq_len = max_seq_len
+
+        self.reverse = False
+        self._pe = mx.zeros((max_batch_size, max_seq_len, embedding_dim))
+        self.compute_pe()
+
+    def compute_pe(self):
+        """Reset the positional encodings."""
+
+        if self.reverse:
+            position = mx.expand_dims(mx.arange(self.max_seq_len - 1, -1, -1.0), axis=1)
+        else:
+            position = mx.expand_dims(mx.arange(self.max_seq_len), axis=1)
+        div_term = mx.exp(
+            mx.arange(
+                0,
+                self.embedding_dim,
+                2,
+            )
+            * -(math.log(10000.0) / self.embedding_dim)
+        )
+        pe = self._pe
+        pe[:, :, 0::2] = mx.sin(position * div_term)
+        pe[:, :, 1::2] = mx.cos(position * div_term)
+
+    def __call__(self, input_pos: Array, x: Array):
+        """
+        Args:
+            input_pos (Array): [batch_size, ]
+            x (Array): [batch_size, 1, embed_dim]
+
+        Returns:
+            embedded_x (Array): [batch_size, 1, embed_dim]
+        """
+
+        batch_size = cast(tuple[int, ...], x.shape)[0]
+        pe_values = self._pe[mx.arange(batch_size), input_pos - 1]  # (batch_size, embed_dim)
+
+        return x * self.x_scale + self.alpha * mx.expand_dims(pe_values, 1)  # (batch_size, 1, embed_dim)
+
+    def prefill(self, x: Array):
+        """
+        Args:
+            x (Array): [batch_size, seq_len, embed_dim]
+
+        Returns:
+            embedded_x (Array): [batch_size, seq_len, embed_dim]
+        """
+        pe_values = self._pe[:, : cast(tuple[int, ...], x.shape)[-2]]
+        return x * self.x_scale + self.alpha * pe_values
+
+
+class KVCacheHND(KVCacheProtocol):
+    @staticmethod
+    def empty(kv_cache):
+        assert len(kv_cache) == 2
+        k_cache, v_cache = kv_cache
+
+        k_cache[:] = 0
+        v_cache[:] = 0
+
+    @staticmethod
+    def update_cache(input_pos, k_val, v_val, kv_cache, cache_idx):
+        # input_pos: [B, ], k_val: [B, H, 1, D]
+        assert len(kv_cache) == 2
+        k_out, v_out = kv_cache
+        ip0 = input_pos - 1
+
+        k_out[cache_idx, :, ip0, None] = k_val
+        v_out[cache_idx, :, ip0, None] = v_val
+
+        return k_out, v_out
+
+    @staticmethod
+    def prefill_kv(k_val, v_val, kv_cache):
+        # k_val: [B, S, H, D]
+        assert len(kv_cache) == 2
+        k_cache, v_cache = kv_cache
+
+        k_cache[..., : cast(tuple[int, ...], k_val.shape)[1], :] = k_val.swapaxes(1, 2)
+        v_cache[..., : cast(tuple[int, ...], v_val.shape)[1], :] = v_val.swapaxes(1, 2)
+
+    @staticmethod
+    def init_cache(batch_size: int, max_seq_length: int, n_heads: int, head_dim: int, dtype: mx.Dtype) -> KVCache:
+        cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
+
+        return (mx.zeros(cache_shape, dtype=dtype), mx.zeros(cache_shape, dtype=dtype))
+
+
+class KVCacheHNDQuantized(KVCacheProtocol):
+    @staticmethod
+    def _el_per_int(bits: int) -> int:
+        return 32 // bits
+
+    @staticmethod
+    def _packed_dim(head_dim: int, bits: int = 8) -> int:
+        el_per_int = KVCacheHNDQuantized._el_per_int(bits)
+        if head_dim % el_per_int != 0:
+            raise ValueError(f"{head_dim=} is not divisible by {el_per_int=} ({bits=})")
+        return head_dim // el_per_int
+
+    @staticmethod
+    def _group_count(head_dim: int, group_size: int = 32) -> int:
+        assert group_size in {32, 64, 128}
+        if head_dim % group_size != 0:
+            raise ValueError(f"{head_dim} is not divisible by {group_size=}")
+        return head_dim // group_size
+
+    @staticmethod
+    def empty(kv_cache) -> None:
+        assert len(kv_cache) == 3
+        (k_q, k_s, k_b), (v_q, v_s, v_b), (_, __) = kv_cache
+
+        k_q[:] = 0
+        k_s[:] = 0
+        k_b[:] = 0
+        v_q[:] = 0
+        v_s[:] = 0
+        v_b[:] = 0
+
+    @staticmethod
+    def update_cache(
+        input_pos,
+        k_val,
+        v_val,
+        kv_cache,
+        cache_idx,
+    ):
+        # input_pos: [B, ], k_val: [B, H, 1, D]
+
+        assert len(kv_cache) == 3
+        (k_q_out, k_s_out, k_b_out), (v_q_out, v_s_out, v_b_out), (group_size, bits) = kv_cache
+
+        k_q, k_s, k_b = mx.quantize(k_val, group_size=group_size, bits=bits)
+        v_q, v_s, v_b = mx.quantize(v_val, group_size=group_size, bits=bits)
+
+        ip0 = input_pos - 1
+
+        k_q_out[cache_idx, :, ip0, None] = k_q
+        k_s_out[cache_idx, :, ip0, None] = k_s
+        k_b_out[cache_idx, :, ip0, None] = k_b
+
+        v_q_out[cache_idx, :, ip0, None] = v_q
+        v_s_out[cache_idx, :, ip0, None] = v_s
+        v_b_out[cache_idx, :, ip0, None] = v_b
+
+        return (k_q_out, k_s_out, k_b_out), (v_q_out, v_s_out, v_b_out), (group_size, bits)
+
+    @staticmethod
+    def prefill_kv(
+        k_val,
+        v_val,
+        kv_cache,
+    ) -> None:
+        assert len(kv_cache) == 3
+        (k_q_out, k_s_out, k_b_out), (v_q_out, v_s_out, v_b_out), (group_size, bits) = kv_cache
+
+        S = cast(tuple[int, ...], k_val.shape)[1]
+
+        k_sw = k_val.swapaxes(1, 2)
+        v_sw = v_val.swapaxes(1, 2)
+
+        k_q, k_s, k_b = mx.quantize(k_sw, group_size=group_size, bits=bits)
+        v_q, v_s, v_b = mx.quantize(v_sw, group_size=group_size, bits=bits)
+
+        k_q_out[..., :S, :] = k_q
+        k_s_out[..., :S, :] = k_s
+        k_b_out[..., :S, :] = k_b
+
+        v_q_out[..., :S, :] = v_q
+        v_s_out[..., :S, :] = v_s
+        v_b_out[..., :S, :] = v_b
+
+    @staticmethod
+    def init_cache(
+        batch_size: int,
+        max_seq_length: int,
+        n_heads: int,
+        head_dim: int,
+        dtype: mx.Dtype,
+        *,
+        group_size: int = 32,
+        bits: int = 8,
+    ) -> KVCacheQ:
+        packed_dim = KVCacheHNDQuantized._packed_dim(head_dim, bits=bits)
+        group_cnt = KVCacheHNDQuantized._group_count(head_dim, group_size=group_size)
+
+        packed_shape = (batch_size, n_heads, max_seq_length, packed_dim)
+        group_shape = (batch_size, n_heads, max_seq_length, group_cnt)
+
+        k_q = mx.zeros(packed_shape, dtype=mx.uint32)
+        k_s = mx.zeros(group_shape, dtype=dtype)
+        k_b = mx.zeros(group_shape, dtype=dtype)
+
+        v_q = mx.zeros(packed_shape, dtype=mx.uint32)
+        v_s = mx.zeros(group_shape, dtype=dtype)
+        v_b = mx.zeros(group_shape, dtype=dtype)
+
+        return (k_q, k_s, k_b), (v_q, v_s, v_b), (group_size, bits)
+
+
+class AttentionABC(ABC, nn.Module):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int, *args, **kwds):
+        super().__init__()
+
+        self.n_head = n_head
+        self.hidden_dim = hidden_dim
+        assert hidden_dim % n_head == 0
+        self.head_dim = hidden_dim // n_head
+
+        self.max_seq_length = max_seq_length
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+        self.scale = 1 / math.sqrt(self.head_dim)
+
+        self.kc_class: KVCacheProtocol
+
+    @abstractmethod
+    def __call__(
+        self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array
+    ) -> Array: ...
+
+    def prefill(self, x: Array, kv_cache: KVCache | KVCacheQ, attn_mask: Array):
+        bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
+
+        q, k, v = self.in_proj(mx.expand_dims(x, 0)).split(3, axis=-1)
+
+        q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        self.kc_class.prefill_kv(k, v, kv_cache)
+
+        q, k, v = map(lambda x: x.swapaxes(1, 2), (q, k, v))
+
+        attn = mx.fast.scaled_dot_product_attention(q, k, v, mask=attn_mask, scale=self.scale)
+
+        attn = mx.nan_to_num(attn)
+
+        attn = attn.swapaxes(1, 2).reshape(1, -1, self.hidden_dim)
+
+        output = self.out_proj(attn)
+
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int) -> None:
+        super().__init__()
+
+        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
+        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
+
+    def __call__(self, x: Array):
+        return self.linear2(nn.relu(self.linear1(x)))
+
+
+class TransformerBlockABC(nn.Module):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int, *args, **kwds) -> None:
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.max_seq_length = max_seq_length
+
+        self.attention: AttentionABC
+
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm(self.hidden_dim)
+        self.ffn_norm = nn.LayerNorm(self.hidden_dim)
+
+    def __call__(self, x: Array, input_pos: Array, kv_cache: KVCache | KVCacheQ, cache_idx: Array, attn_mask: Array):
+        h = self.attention_norm(
+            x
+            + self.attention(
+                x,
+                input_pos,
+                kv_cache,
+                cache_idx,
+                attn_mask,
+            )
+        )
+        out = self.ffn_norm(h + self.feed_forward(h))
+        return out
+
+    def prefill(self, x: Array, attn_mask: Array, kv_cache: KVCache | KVCacheQ):
+        h = self.attention_norm(
+            x
+            + self.attention.prefill(
+                x,
+                kv_cache,
+                attn_mask,
+            )
+        )
+        out = self.ffn_norm(h + self.feed_forward(h))
+
+        return out
+
+
+class TransformerDecoderABC(nn.Module):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+        *args,
+        **kwds,
+    ) -> None:
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.n_head = n_head
+        assert hidden_dim % n_head == 0
+
+        self.head_dim = hidden_dim // n_head
+        self.vocab_size = vocab_size
+
+        self.n_layer = n_layer
+
+        self.layers: MutableSequence[TransformerBlockABC]
+
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+
+    def __call__(
+        self,
+        input_pos: Array,
+        x: Array,
+        kv_caches: MutableSequence[KVCache | KVCacheQ],
+        cache_idx: Array,
+        *args,
+        **kwds,
+    ):
+        for layer, kv_cache in zip(self.layers, kv_caches):
+            x = layer(
+                x,
+                input_pos,
+                kv_cache,
+                cache_idx,
+                *args,
+                **kwds,
+            )
+
+        return x
+
+    def prefill(self, x: Array, mask: Array, kv_caches: MutableSequence[KVCache | KVCacheQ]):
+        for layer, kv_cache in zip(self.layers, kv_caches):
+            x = layer.prefill(
+                x,
+                mask,
+                kv_cache,
+            )
+        return x
+
+
+class T2SDecoderABC(nn.Module, T2SDecoderProtocol):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 1800,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__()
+
+        hidden_dim: int = config["model"]["hidden_dim"]
+        embedding_dim: int = config["model"]["embedding_dim"]
+        n_head: int = config["model"]["head"]
+        n_layer: int = config["model"]["n_layer"]
+        vocab_size: int = config["model"]["vocab_size"]
+        phoneme_vocab_size: int = config["model"]["phoneme_vocab_size"]
+        EOS: int = config["model"]["EOS"]
+        ffn_dim: int = hidden_dim * 4
+
+        self.n_layer = int(n_layer)
+        self.hidden_dim = int(hidden_dim)
+        self.n_head = int(n_head)
+        assert hidden_dim % n_head == 0
+
+        self.head_dim = int(hidden_dim // n_head)
+        self.embedding_dim = int(embedding_dim)
+        self.ffn_dim = int(ffn_dim)
+        self.vocab_size = int(vocab_size)
+        self.phoneme_vocab_size = int(phoneme_vocab_size)
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+        self.EOS = EOS
+        assert self.EOS == self.vocab_size - 1
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC
+
+        self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size)
+        self.ar_text_position = SinePositionalEmbedding(
+            self.embedding_dim,
+            scale=False,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_length,
+        )
+        self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size)
+        self.ar_audio_position = SinePositionalEmbedding(
+            self.embedding_dim,
+            scale=False,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_length,
+        )
+
+        self.kv_class: KVCacheProtocol
+
+    def init_cache(self, bsz: int = 0, *args, **kwds) -> MutableSequence[KVCache | KVCacheQ]:
+        bsz = bsz or self.h.max_batch_size
+        assert bsz <= self.h.max_batch_size
+        seq_lens = self.h.max_seq_length
+        dtype = self.bert_proj.bias.dtype
+        cache: MutableSequence[KVCache | KVCacheQ] = [
+            self.kv_class.init_cache(bsz, seq_lens, self.n_head, self.head_dim, dtype, *args, **kwds)
+            for _ in range(self.n_layer)
+        ]
+        mx.eval(cache)
+        return cache
+
+    def embed(
+        self,
+        x: list[Array],
+        y: Array,
+        bert_features: list[Array],
+    ):
+        x_len: list[int] = [cast(tuple[int, ...], i.shape)[0] for i in x]
+        x_len_max = max(x_len)
+        xy_pos = mx.zeros((len(x), x_len_max + cast(tuple[int, ...], y.shape)[1], self.embedding_dim)).astype(
+            bert_features[0].dtype
+        )
+
+        bert_features = list(map(lambda x: x.swapaxes(0, 1), bert_features))
+
+        y_len = cast(tuple[int, ...], y.shape)[1]
+        y_emb = self.ar_audio_embedding(y)
+        y_pos = self.ar_audio_position.prefill(y_emb)
+
+        for bs, (x_, len_, bert_feature) in enumerate(zip(x, x_len, bert_features)):
+            x_emb = self.ar_text_embedding(x_)
+            bert = self.bert_proj(bert_feature)
+            x_emb = x_emb + bert
+            x_pos = self.ar_text_position.prefill(mx.expand_dims(x_emb, 0))
+            xy_pos[[bs], :len_] = x_pos
+            xy_pos[[bs], len_ : len_ + y_len] = y_pos
+
+        mx.eval(xy_pos)
+        return xy_pos
+
+    def compile(self):
+        setattr(self.h, "__call__", mx.compile(self.h.__call__))
+        # setattr(self.h, "prefill", mx.compile(self.h.prefill, shapeless=True))
+
+    def pre_forward(self, session: T2SSessionMLX):
+        attn_mask = session.attn_mask
+        return list(), dict(attn_mask=attn_mask)
+
+    def post_forward(self, idx: int, session: T2SSessionMLX) -> None:
+        if idx == 0:
+            prefill_len = session.prefill_len
+            bsz = session.bsz
+
+            range_tensor = mx.arange(self.max_seq_length).reshape(1, 1, 1, self.max_seq_length)
+            prefill_len_expanded = prefill_len.reshape(bsz, 1, 1, 1)
+            attn_mask = range_tensor < prefill_len_expanded
+            attn_mask = mx.repeat(attn_mask, self.n_head, 1)
+
+            session.attn_mask = attn_mask
+
+        attn_mask = session.attn_mask
+        input_pos = session.input_pos
+        attn_mask[mx.arange(session.bsz), :, :, input_pos] = True
+        mx.eval(attn_mask)
diff --git a/GPT_SoVITS/Accelerate/PyTorch/__init__.py b/GPT_SoVITS/Accelerate/PyTorch/__init__.py
new file mode 100644
index 00000000..2997b7ff
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/__init__.py
@@ -0,0 +1,28 @@
+import importlib.util
+
+import torch
+
+from .sample_funcs import sample_naive
+from .structs import T2SRequest, T2SResult
+from .t2s_engine import T2SEngine as T2SEngineTorch
+
+backends = ["torch_varlen"]
+if torch.cuda.is_available():
+    backends.append("torch_static_cuda_graph")
+    if importlib.util.find_spec("sageattention") is not None:
+        for i in range(torch.cuda.device_count()):
+            major, minor = torch.cuda.get_device_capability(i)
+            sm_version = major + minor / 10.0
+            if sm_version >= 7.0:
+                backends.append("sage_attn_varlen_cuda_graph")
+    if importlib.util.find_spec("flash_attn") is not None:
+        for i in range(torch.cuda.device_count()):
+            major, minor = torch.cuda.get_device_capability(i)
+            sm_version = major + minor / 10.0
+            if sm_version >= 7.5:
+                backends.append("flash_attn_varlen_cuda_graph")
+if torch.mps.is_available():
+    backends.append("mps_flash_attn_varlen")
+
+
+__all__ = ["T2SEngineTorch", "T2SRequest", "sample_naive", "T2SResult", "backends"]
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
new file mode 100644
index 00000000..666c1b63
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
@@ -0,0 +1,157 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
+from typing import Dict, List, Tuple
+
+import kernels
+import torch
+
+from .. import nn
+from ..structs import T2SSession
+from ..t2s_model_abc import (
+    AttentionABC,
+    CUDAGraphCacheABC,
+    FeedForward,
+    KVCacheNHD,
+    KVCacheProtocol,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+flash_attn_kernel = None
+try:
+    import flash_attn_interface as flash_attn  # type: ignore
+
+    flash_attn_kernel = flash_attn.flash_attn_with_kvcache
+except ModuleNotFoundError:
+    try:
+        import flash_attn  # type: ignore
+
+        flash_attn_kernel = flash_attn.flash_attn_with_kvcache
+
+    except ModuleNotFoundError:
+        pass
+
+if flash_attn_kernel is None:
+    flash_attn_kernel = kernels.get_kernel("kernels-community/flash-attn").flash_attn_with_kvcache
+
+
+Tensor = torch.Tensor
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head, hidden_dim, max_seq_length):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds) -> Tensor:
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.in_proj(x).chunk(3, dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
+
+        attn: Tensor = flash_attn.flash_attn_with_kvcache(  # type: ignore
+            q, kv_cache.k_cache, kv_cache.v_cache, k, v, cache_seqlens=input_pos - 1
+        )
+
+        attn = attn.view(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head, ffn_dim, hidden_dim, max_seq_length) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm([self.hidden_dim])
+        self.ffn_norm = nn.LayerNorm([self.hidden_dim])
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim,
+        n_layer,
+        n_head,
+        ffn_dim,
+        vocab_size,
+        max_seq_length,
+        max_batch_size,
+    ) -> None:
+        super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
+
+        self.layers = nn.ModuleList(  # type: ignore
+            TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
+        )
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config,
+        max_seq_length=1800,
+        max_batch_size=10,
+    ) -> None:
+        assert torch.cuda.is_available()
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheNHD
+
+    def post_forward(self, idx: int, session: T2SSession) -> None:
+        return super().post_forward(idx, session)
+
+    def pre_forward(self, session: T2SSession) -> Tuple[List, Dict]:
+        return super().pre_forward(session)
+
+
+class CUDAGraphCache(CUDAGraphCacheABC):
+    def __init__(
+        self,
+        decoder: T2SDecoder,
+    ) -> None:
+        super().__init__(decoder)
+
+    def release_graph(self, session: T2SSession):
+        if session.id != self.id:
+            self.assigned = False
+        else:
+            del session.graph, session.xy_pos_, session.xy_dec_, session.input_pos, session.kv_cache
+
+    def get_cache_graph(self, session: T2SSession):
+        assert self.graph
+        session.graph = self.graph
+        session.stream = self.stream
+
+        session.xy_pos_ = self.xy_pos
+        session.xy_dec_ = self.xy_dec
+        session.input_pos = self.input_pos.copy_(session.input_pos)
+
+        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
+            cache.sync_cache(cache_)
+
+    def capture_new_graph(self, session: T2SSession):
+        session.xy_pos_ = self.xy_pos.clone()
+        session.xy_dec_ = self.xy_dec.clone()
+        session.input_pos = self.input_pos.clone().copy_(session.input_pos)
+
+        args, kwds = self.decoder.pre_forward(session)
+        graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
+        session.graph = graph
+        session.stream = torch.cuda.Stream()  # type: ignore
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py b/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py
new file mode 100644
index 00000000..6f15f51e
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py
@@ -0,0 +1,165 @@
+import torch
+from torch.nn import functional as F
+
+from .. import nn
+from ..structs import KVCacheProtocol, T2SSession
+from ..t2s_model_abc import (
+    AttentionABC,
+    CUDAGraphCacheABC,
+    FeedForward,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Tensor = torch.Tensor
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head, hidden_dim, max_seq_length):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor):
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.in_proj(x).chunk(3, dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        k, v = kv_cache.update(input_pos, k, v)
+
+        attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+
+        attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm([self.hidden_dim])
+        self.ffn_norm = nn.LayerNorm([self.hidden_dim])
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim,
+        n_layer,
+        n_head,
+        ffn_dim,
+        vocab_size,
+        max_seq_length,
+        max_batch_size,
+    ) -> None:
+        super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
+
+        self.layers = nn.ModuleList(  # type: ignore
+            TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
+        )
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config,
+        max_seq_length=1800,
+        max_batch_size=10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
+
+    def pre_forward(self, session: T2SSession):
+        attn_mask = session.attn_mask
+        return list(), dict(attn_mask=attn_mask)
+
+    def post_forward(self, idx: int, session: T2SSession) -> None:
+        if idx == 0:
+            prefill_len = session.prefill_len
+            bsz = session.bsz
+
+            range_tensor = torch.arange(self.max_seq_length).view(1, 1, 1, self.max_seq_length)
+            prefill_len_expanded = prefill_len.view(bsz, 1, 1, 1)
+            attn_mask = range_tensor < prefill_len_expanded
+            attn_mask = attn_mask.expand(-1, self.n_head, -1, -1)
+
+            session.attn_mask = attn_mask
+
+        attn_mask = session.attn_mask
+        input_pos = session.input_pos
+        attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
+
+
+class CUDAGraphCache(CUDAGraphCacheABC):
+    def __init__(
+        self,
+        decoder,
+    ) -> None:
+        super().__init__(decoder)
+        if torch.cuda.is_available():
+            self.attn_mask = (
+                torch.randint(0, 2, (decoder.max_batch_size, decoder.n_head, 1, decoder.max_seq_length))
+                .bool()
+                .to(self.device, self.dtype)
+            )
+
+    def release_graph(self, session: T2SSession):
+        if session.id != self.id:
+            self.assigned = False
+        else:
+            del (
+                session.graph,
+                session.xy_pos_,
+                session.xy_dec_,
+                session.input_pos,
+                session.kv_cache,
+                session.attn_mask,
+            )
+
+    def get_cache_graph(self, session: T2SSession):
+        assert self.graph
+        session.graph = self.graph
+        session.stream = self.stream
+
+        session.xy_pos_ = self.xy_pos
+        session.xy_dec_ = self.xy_dec
+        session.input_pos = self.input_pos.copy_(session.input_pos)
+
+        session.attn_mask = self.attn_mask
+
+        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
+            cache.sync_cache(cache_)
+
+    def capture_new_graph(self, session: T2SSession):
+        session.xy_pos_ = self.xy_pos.clone()
+        session.xy_dec_ = self.xy_dec.clone()
+        session.input_pos = self.input_pos.clone().copy_(session.input_pos)
+
+        session.attn_mask = self.attn_mask.clone().copy_(session.attn_mask)
+
+        args, kwds = self.decoder.pre_forward(session)
+        graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
+        session.graph = graph
+        session.stream = torch.cuda.Stream()  # type: ignore
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py
new file mode 100644
index 00000000..fd363147
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py
@@ -0,0 +1,176 @@
+from typing import MutableSequence
+
+import sageattention  # type: ignore
+import torch
+
+from .. import nn
+from ..structs import T2SSession
+from ..t2s_model_abc import (
+    AttentionABC,
+    CUDAGraphCacheABC,
+    FeedForward,
+    KVCacheHND,
+    KVCacheProtocol,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Tensor = torch.Tensor
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head, hidden_dim, max_seq_length):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+    def __call__(
+        self,
+        x: Tensor,
+        input_pos: Tensor,
+        kv_cache: KVCacheProtocol,
+        cu_seqlens_q: Tensor,
+        cu_seqlens_kv: Tensor,
+    ) -> Tensor:
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.in_proj(x).chunk(3, dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        k, v = kv_cache.update(input_pos, k, v)
+
+        attn: Tensor = sageattention.sageattn_varlen(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_kv=cu_seqlens_kv,
+            max_seqlen_q=1,
+            max_seqlen_k=self.max_seq_length,
+        )
+
+        attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head, ffn_dim, hidden_dim, max_seq_length) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm([self.hidden_dim])
+        self.ffn_norm = nn.LayerNorm([self.hidden_dim])
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim,
+        n_layer,
+        n_head,
+        ffn_dim,
+        vocab_size,
+        max_seq_length,
+        max_batch_size,
+    ) -> None:
+        super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
+
+        self.layers = nn.ModuleList(  # type: ignore
+            TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
+        )
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config,
+        max_seq_length=1800,
+        max_batch_size=10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
+
+    def pre_forward(self, session: T2SSession) -> tuple[list[Tensor], dict[str, Tensor]]:
+        return list(), dict(cu_seqlens_q=session.cu_seqlens_q, cu_seqlens_kv=session.cu_seqlens_kv)
+
+    def post_forward(self, idx: int, session: T2SSession):
+        if idx == 0:
+            session.cu_seqlens_q = torch.arange(0, session.bsz + 1, dtype=torch.int32)
+            session.cu_seqlens_kv = torch.cat([torch.tensor(0, dtype=torch.int32), session.input_pos])
+        else:
+            cu_seqlens_q = session.cu_seqlens_q
+            cu_seqlens_kv = session.cu_seqlens_kv
+            cu_seqlens_kv.add_(cu_seqlens_q)
+
+
+class CUDAGraphCache(CUDAGraphCacheABC):
+    def __init__(
+        self,
+        decoder: T2SDecoder,
+    ) -> None:
+        super().__init__(decoder)
+
+        if torch.cuda.is_available():
+            self.cu_seqlens_q = torch.arange(0, decoder.max_batch_size + 1, dtype=torch.int32).to(self.device)
+            self.cu_seqlens_kv = torch.cat([torch.tensor(0, dtype=torch.int32), self.input_pos]).to(self.device)
+
+    def release_graph(self, session: T2SSession):
+        if session.id != self.id:
+            self.assigned = False
+        else:
+            del (
+                session.graph,
+                session.xy_pos_,
+                session.xy_dec_,
+                session.input_pos,
+                session.kv_cache,
+                session.cu_seqlens_q,
+                session.cu_seqlens_kv,
+            )
+
+    def get_cache_graph(self, session: T2SSession):
+        assert self.graph
+        session.graph = self.graph
+        session.stream = self.stream
+
+        session.xy_pos_ = self.xy_pos
+        session.xy_dec_ = self.xy_dec
+        session.input_pos = self.input_pos.copy_(session.input_pos)
+
+        session.cu_seqlens_q = self.cu_seqlens_q
+        session.cu_seqlens_kv = self.cu_seqlens_kv
+
+        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
+            cache.sync_cache(cache_)
+
+    def capture_new_graph(self, session: T2SSession):
+        session.xy_pos_ = self.xy_pos.clone()
+        session.xy_dec_ = self.xy_dec.clone()
+        session.input_pos = self.input_pos.clone().copy_(session.input_pos)
+
+        session.cu_seqlens_q = self.cu_seqlens_q.clone().copy_(session.cu_seqlens_q)
+        session.cu_seqlens_kv = self.cu_seqlens_kv.clone().copy_(session.cu_seqlens_kv)
+
+        args, kwds = self.decoder.pre_forward(session)
+        graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
+        session.graph = graph
+        session.stream = torch.cuda.Stream()  # type: ignore
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
new file mode 100644
index 00000000..6f15f51e
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
@@ -0,0 +1,165 @@
+import torch
+from torch.nn import functional as F
+
+from .. import nn
+from ..structs import KVCacheProtocol, T2SSession
+from ..t2s_model_abc import (
+    AttentionABC,
+    CUDAGraphCacheABC,
+    FeedForward,
+    KVCacheHND,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Tensor = torch.Tensor
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head, hidden_dim, max_seq_length):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor):
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.in_proj(x).chunk(3, dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        k, v = kv_cache.update(input_pos, k, v)
+
+        attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+
+        attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm([self.hidden_dim])
+        self.ffn_norm = nn.LayerNorm([self.hidden_dim])
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim,
+        n_layer,
+        n_head,
+        ffn_dim,
+        vocab_size,
+        max_seq_length,
+        max_batch_size,
+    ) -> None:
+        super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
+
+        self.layers = nn.ModuleList(  # type: ignore
+            TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
+        )
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config,
+        max_seq_length=1800,
+        max_batch_size=10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHND
+
+    def pre_forward(self, session: T2SSession):
+        attn_mask = session.attn_mask
+        return list(), dict(attn_mask=attn_mask)
+
+    def post_forward(self, idx: int, session: T2SSession) -> None:
+        if idx == 0:
+            prefill_len = session.prefill_len
+            bsz = session.bsz
+
+            range_tensor = torch.arange(self.max_seq_length).view(1, 1, 1, self.max_seq_length)
+            prefill_len_expanded = prefill_len.view(bsz, 1, 1, 1)
+            attn_mask = range_tensor < prefill_len_expanded
+            attn_mask = attn_mask.expand(-1, self.n_head, -1, -1)
+
+            session.attn_mask = attn_mask
+
+        attn_mask = session.attn_mask
+        input_pos = session.input_pos
+        attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
+
+
+class CUDAGraphCache(CUDAGraphCacheABC):
+    def __init__(
+        self,
+        decoder,
+    ) -> None:
+        super().__init__(decoder)
+        if torch.cuda.is_available():
+            self.attn_mask = (
+                torch.randint(0, 2, (decoder.max_batch_size, decoder.n_head, 1, decoder.max_seq_length))
+                .bool()
+                .to(self.device, self.dtype)
+            )
+
+    def release_graph(self, session: T2SSession):
+        if session.id != self.id:
+            self.assigned = False
+        else:
+            del (
+                session.graph,
+                session.xy_pos_,
+                session.xy_dec_,
+                session.input_pos,
+                session.kv_cache,
+                session.attn_mask,
+            )
+
+    def get_cache_graph(self, session: T2SSession):
+        assert self.graph
+        session.graph = self.graph
+        session.stream = self.stream
+
+        session.xy_pos_ = self.xy_pos
+        session.xy_dec_ = self.xy_dec
+        session.input_pos = self.input_pos.copy_(session.input_pos)
+
+        session.attn_mask = self.attn_mask
+
+        for cache, cache_ in zip(self.kv_cache, session.kv_cache):
+            cache.sync_cache(cache_)
+
+    def capture_new_graph(self, session: T2SSession):
+        session.xy_pos_ = self.xy_pos.clone()
+        session.xy_dec_ = self.xy_dec.clone()
+        session.input_pos = self.input_pos.clone().copy_(session.input_pos)
+
+        session.attn_mask = self.attn_mask.clone().copy_(session.attn_mask)
+
+        args, kwds = self.decoder.pre_forward(session)
+        graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
+        session.graph = graph
+        session.stream = torch.cuda.Stream()  # type: ignore
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py
new file mode 100644
index 00000000..d079e9af
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_varlen.py
@@ -0,0 +1,144 @@
+from typing import NoReturn
+
+import torch
+from torch.nn import functional as F
+
+from .. import nn
+from ..structs import KVCacheProtocol, T2SSession
+from ..t2s_model_abc import (
+    AttentionABC,
+    CUDAGraphCacheABC,
+    FeedForward,
+    KVCacheHNDVarlen,
+    T2SDecoderABC,
+    TransformerBlockABC,
+    TransformerDecoderABC,
+)
+
+Tensor = torch.Tensor
+
+
+class Attention(AttentionABC):
+    def __init__(self, n_head, hidden_dim, max_seq_length):
+        super().__init__(n_head, hidden_dim, max_seq_length)
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
+        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor):
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.in_proj(x).chunk(3, dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        k, v = kv_cache.update(input_pos, k, v)
+
+        max_idx = input_pos.max()
+
+        q, k, v = map(lambda x: x[..., :max_idx, :], (q, k, v))
+
+        mask = attn_mask[..., :max_idx]
+
+        attn = F.scaled_dot_product_attention(q, k, v, mask)
+
+        attn = attn.transpose(1, 2).contiguous().view(bsz, seqlen, self.hidden_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn
+
+
+class TransformerBlock(TransformerBlockABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__(n_head, ffn_dim, hidden_dim, max_seq_length)
+
+        self.attention = Attention(n_head, hidden_dim, max_seq_length)
+        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
+        self.attention_norm = nn.LayerNorm([self.hidden_dim])
+        self.ffn_norm = nn.LayerNorm([self.hidden_dim])
+
+
+class TransformerDecoder(TransformerDecoderABC):
+    def __init__(
+        self,
+        hidden_dim,
+        n_layer,
+        n_head,
+        ffn_dim,
+        vocab_size,
+        max_seq_length,
+        max_batch_size,
+    ) -> None:
+        super().__init__(hidden_dim, n_layer, n_head, ffn_dim, vocab_size, max_seq_length, max_batch_size)
+
+        self.layers = nn.ModuleList(  # type: ignore
+            TransformerBlock(n_head, ffn_dim, hidden_dim, max_seq_length) for _ in range(n_layer)
+        )
+
+
+class T2SDecoder(T2SDecoderABC):
+    def __init__(
+        self,
+        config,
+        max_seq_length=1800,
+        max_batch_size=10,
+    ) -> None:
+        super().__init__(config, max_seq_length, max_batch_size)
+
+        self.bert_proj = nn.Linear(1024, self.embedding_dim)
+        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
+        self.h: TransformerDecoderABC = TransformerDecoder(
+            self.hidden_dim, self.n_layer, self.n_head, self.ffn_dim, self.vocab_size, max_seq_length, max_batch_size
+        )
+
+        self.kv_class = KVCacheHNDVarlen
+
+    def capture(
+        self,
+        *args,
+        **kwds,
+    ) -> NoReturn:
+        raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
+
+    def pre_forward(self, session: T2SSession):
+        attn_mask = session.attn_mask
+        return list(), dict(attn_mask=attn_mask)
+
+    def post_forward(self, idx: int, session: T2SSession) -> None:
+        if idx == 0:
+            prefill_len = session.prefill_len
+            bsz = session.bsz
+
+            range_tensor = torch.arange(self.max_seq_length).view(1, 1, 1, self.max_seq_length)
+            prefill_len_expanded = prefill_len.view(bsz, 1, 1, 1)
+            attn_mask = range_tensor < prefill_len_expanded
+            attn_mask = attn_mask.expand(-1, self.n_head, -1, -1)
+
+            session.attn_mask = attn_mask
+
+        attn_mask = session.attn_mask
+        input_pos = session.input_pos
+        attn_mask[torch.arange(session.bsz), :, :, input_pos] = True
+
+
+class CUDAGraphCache(CUDAGraphCacheABC):
+    def __init__(
+        self,
+        decoder,
+    ) -> None:
+        super().__init__(decoder, False)
+
+    def release_graph(self, session: T2SSession):
+        raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
+
+    def get_cache_graph(self, session: T2SSession):
+        raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
+
+    def capture_new_graph(self, session: T2SSession):
+        raise NotImplementedError("Cuda Graph Is Not Supported For Varlen Model")
diff --git a/GPT_SoVITS/Accelerate/PyTorch/nn.py b/GPT_SoVITS/Accelerate/PyTorch/nn.py
new file mode 100644
index 00000000..1e69e056
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/nn.py
@@ -0,0 +1,69 @@
+"""
+Enhanced Type Hint nn.Module
+Modified From https://github.com/labmlai/labml/blob/master/helpers/labml_helpers/module.py
+"""
+
+from typing import Any
+
+import torch.nn
+from torch.nn import (
+    functional as functional,
+)
+from torch.nn import (
+    utils as utils,
+)
+from torch.nn.modules import *  # type: ignore # noqa: F403
+from torch.nn.parameter import (
+    Parameter as Parameter,
+)
+
+Tensor = torch.Tensor
+
+
+class Module(torch.nn.Module):
+    r"""
+    Wraps ``torch.nn.Module`` to overload ``__call__`` instead of
+    ``forward`` for better type checking.
+
+    `PyTorch Github issue for clarification <https://github.com/pytorch/pytorch/issues/44605>`_
+    """
+
+    def _forward_unimplemented(self, *input: Any) -> None:
+        # To stop PyTorch from giving abstract methods warning
+        pass
+
+    def __init_subclass__(cls, **kwargs):
+        if cls.__dict__.get("__call__", None) is None:
+            return
+
+        setattr(cls, "forward", cls.__dict__["__call__"])
+        delattr(cls, "__call__")
+
+    @property
+    def device(self) -> torch.device:
+        params = self.parameters()
+        try:
+            sample_param = next(params)
+            return sample_param.device
+        except StopIteration:
+            raise RuntimeError(f"Unable to determine device of {self.__class__.__name__}") from None
+
+
+class Linear(torch.nn.Linear):
+    def __call__(self, input: Tensor) -> Tensor:
+        return super().__call__(input)
+
+
+class Dropout(torch.nn.Dropout):
+    def __call__(self, input: Tensor) -> Tensor:
+        return super().__call__(input)
+
+
+class Embedding(torch.nn.Embedding):
+    def __call__(self, input: Tensor) -> Tensor:
+        return super().__call__(input)
+
+
+class LayerNorm(torch.nn.LayerNorm):
+    def __call__(self, input: Tensor) -> Tensor:
+        return super().__call__(input)
diff --git a/GPT_SoVITS/Accelerate/PyTorch/sample_funcs.py b/GPT_SoVITS/Accelerate/PyTorch/sample_funcs.py
new file mode 100644
index 00000000..1cf25a9e
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/sample_funcs.py
@@ -0,0 +1,62 @@
+from typing import Protocol
+
+import torch
+import torch.nn.functional as F
+
+Tensor = torch.Tensor
+
+
+class SampleProtocol(Protocol):
+    @staticmethod
+    def __call__(
+        logits: Tensor,
+        previous_tokens: Tensor,
+        temperature: float,
+        top_k: int,
+        top_p: float,
+        repetition_penalty: float,
+    ) -> Tensor: ...
+
+
+class sample_naive(SampleProtocol):
+    @staticmethod
+    def __call__(
+        logits: Tensor,
+        previous_tokens: Tensor,
+        temperature: float,
+        top_k: int,
+        top_p: float,
+        repetition_penalty: float,
+    ):
+        if temperature <= 1e-5:
+            probs = F.softmax(logits, dim=-1)
+            return torch.argmax(probs, dim=-1, keepdim=True)
+
+        if repetition_penalty != 1.0:
+            previous_tokens = previous_tokens.long()
+            score = torch.gather(logits, dim=1, index=previous_tokens)
+            score = torch.where(
+                score < 0,
+                score * repetition_penalty,
+                score / repetition_penalty,
+            )
+            logits.scatter_(dim=1, index=previous_tokens, src=score)
+
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+        sorted_indices_to_remove = cum_probs > top_p
+        sorted_indices_to_remove[:, 0] = False  # keep at least one option
+        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
+        logits = logits.masked_fill(indices_to_remove, -float("Inf"))
+
+        logits /= temperature
+
+        v, _ = torch.topk(logits, top_k)
+        pivot = v[:, -1].unsqueeze(-1)
+        logits = torch.where(logits < pivot, -float("Inf"), logits)
+
+        probs = F.softmax(logits, dim=-1)
+        q = torch.empty_like(probs).exponential_(1.0)
+        idx_next = torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int32)
+
+        return idx_next
diff --git a/GPT_SoVITS/Accelerate/PyTorch/structs.py b/GPT_SoVITS/Accelerate/PyTorch/structs.py
new file mode 100644
index 00000000..8b48068b
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/structs.py
@@ -0,0 +1,151 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal, MutableSequence, Optional, Protocol
+
+import torch
+
+from .sample_funcs import SampleProtocol, sample_naive
+
+Tensor = torch.Tensor
+
+
+@dataclass
+class T2SResult:
+    result: list[Tensor] | None = None
+    infer_speed: tuple[float, float] = (0.0, 0.0)
+    status: Literal["Success", "Error"] = "Success"
+    exception: Optional[Exception] = None
+    traceback: Optional[str] = None
+
+
+@dataclass
+class T2SRequest:
+    x: list[torch.Tensor]
+    x_lens: Tensor
+    prompts: torch.Tensor
+    bert_feature: list[Tensor]
+    valid_length: int
+    top_k: int = 5
+    top_p: float = 1
+    early_stop_num: int = -1
+    temperature: float = 1.0
+    repetition_penalty: float = 1.35
+    use_cuda_graph: bool = False
+    debug: bool = False
+
+
+class KVCacheProtocol(Protocol):
+    k_cache: Tensor
+    v_cache: Tensor
+
+    def __init__(self, batch_size: int, max_seq_length: int, n_heads: int, head_dim: int) -> None: ...
+
+    def empty(self) -> None: ...
+
+    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor, *args, **kwds) -> tuple[Tensor, Tensor]: ...
+
+    def prefill_kv(self, k_val: Tensor, v_val: Tensor) -> None: ...
+
+    def sync_cache(self, kv_cache: KVCacheProtocol) -> None: ...
+
+
+class T2SDecoderProtocol(Protocol):
+    max_seq_length: int
+    EOS: int
+    n_head: int
+
+    @property
+    def device(self) -> torch.device: ...
+
+    def embed(self, x: list[Tensor], y: Tensor, bert_features: list[Tensor]) -> Tensor: ...
+
+
+class T2SEngineProtocol(Protocol):
+    def _handle_request(self, request: T2SRequest) -> tuple[list[Tensor], float]: ...
+
+    def generate(self, request: T2SRequest) -> T2SResult: ...
+
+
+class T2SSession:
+    def __init__(
+        self,
+        decoder: T2SDecoderProtocol,
+        request: T2SRequest,
+        sapmle_func: type[SampleProtocol] = sample_naive,
+        device: torch.device = torch.device("cpu"),
+        dtype: torch.dtype = torch.float32,
+    ):
+        with device:
+            self.decoder = decoder
+            self.request = request
+            self.device = device
+            self.dtype = dtype
+
+            bsz = len(request.x)
+            y_len = request.prompts.size(-1)
+            self.bsz = bsz
+            self.y_len = y_len
+            request.prompts = request.prompts.to(device, torch.int32)
+
+            # Cache
+            self.kv_cache: MutableSequence[KVCacheProtocol]
+            self.sample = sapmle_func()
+
+            # Forward args
+            self.x = [i.to(device) for i in request.x]
+            self.x_lens = request.x_lens.to(torch.int32)
+            self.y = torch.zeros((bsz, decoder.max_seq_length)).to(torch.int32)
+            self.y[:, : request.prompts.shape[-1]] = request.prompts
+            self.bert_feature = [i.to(device, dtype) for i in request.bert_feature]
+
+            self.prefill_len = self.x_lens + request.prompts.size(1)
+
+            self.input_pos = torch.zeros_like(self.prefill_len)
+            self.input_pos.add_(self.prefill_len)
+
+            # CUDA Graph
+            self.stream: Optional[torch.cuda.Stream] = None
+            self.graph: Optional[torch.cuda.CUDAGraph] = None
+            self.xy_pos_: Tensor
+            self.xy_dec_: Tensor
+
+            # EOS
+            self.completed = torch.Tensor([False] * len(self.x)).bool().to(device)
+            self.y_results: list[Tensor] = [None] * len(self.x)  # type: ignore
+
+            self.xy_pos = decoder.embed(self.x, request.prompts, self.bert_feature)
+
+            max_len = int(self.prefill_len.max().item())
+            attn_mask = torch.zeros(size=(bsz, max_len, max_len), dtype=torch.bool)
+
+            for bs in range(bsz):
+                pos = int(self.x_lens[bs])
+                seq_len = pos + y_len
+
+                attn_mask[bs, :seq_len, :pos] = True
+
+                ar_mask = ~torch.triu(
+                    input=torch.ones(
+                        size=(
+                            y_len,
+                            y_len,
+                        ),
+                        dtype=torch.bool,
+                    ),
+                    diagonal=1,
+                )
+                attn_mask[bs, pos:seq_len, pos:seq_len] = ar_mask
+
+            self.attn_mask = attn_mask
+            self.attn_mask = attn_mask.unsqueeze(0).expand(-1, decoder.n_head, -1, -1)
+
+            self.id: int = -1
+
+            # Sage Attn & Transformer Engine Impl
+            self.cu_seqlens_q: Tensor
+            self.cu_seqlens_kv: Tensor
diff --git a/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py b/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py
new file mode 100644
index 00000000..06445f77
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/t2s_engine.py
@@ -0,0 +1,202 @@
+import contextlib
+import gc
+import os
+import sys
+import time
+import traceback
+from importlib import import_module
+
+import torch
+from tqdm import tqdm
+
+from .structs import T2SEngineProtocol, T2SRequest, T2SResult, T2SSession
+from .t2s_model_abc import (
+    CUDAGraphCacheABC,
+    T2SDecoderABC,
+    TorchProfiler,
+)
+
+torch.set_grad_enabled(False)
+
+
+class T2SEngine(T2SEngineProtocol):
+    def __init__(
+        self,
+        decoder_model: T2SDecoderABC,
+        device: torch.device = torch.device("cpu"),
+        dtype: torch.dtype = torch.float32,
+    ) -> None:
+        assert device.type in {"cpu", "cuda", "mps", "xpu", "mtia"}
+        assert dtype in {torch.float16, torch.bfloat16, torch.float32}
+
+        self.device = device
+        self.dtype = dtype
+
+        self.decoder_model: T2SDecoderABC = decoder_model.to(self.device, self.dtype)
+
+        self.graphcache: CUDAGraphCacheABC = self.init_cache()
+
+    def _handle_request(self, request: T2SRequest):
+        with self.device:
+            decoder = self.decoder_model
+            session = T2SSession(decoder, request, device=self.device, dtype=self.dtype)
+            batch_idx = torch.arange(session.bsz)
+
+            t1 = 0.0
+            infer_speed = 0.0
+
+            torch_profiler = TorchProfiler(request.debug)
+            with torch_profiler.profiler():
+                for idx in tqdm(range(1500)):
+                    if idx == 0:
+                        session.kv_cache = decoder.init_cache(session.bsz)
+                        xy_dec = decoder.h.prefill(session.xy_pos, session.kv_cache, session.attn_mask)
+                        xy_dec = xy_dec[None, batch_idx, session.input_pos - 1]
+                    else:
+                        if request.use_cuda_graph and session.graph is None and torch.cuda.is_available():
+                            self.graphcache.assign_graph(session)
+
+                        with torch_profiler.record("AR"):
+                            if session.graph:
+                                assert session.stream
+                                session.stream.wait_stream(torch.cuda.default_stream())
+                                with torch.cuda.stream(session.stream):
+                                    session.xy_pos_.copy_(session.xy_pos)
+                                    session.graph.replay()
+                                    xy_dec = session.xy_dec_.clone()
+                            else:
+                                args, kwds = decoder.pre_forward(session)
+                                xy_dec = decoder.h(
+                                    session.input_pos,
+                                    session.xy_pos,
+                                    session.kv_cache,
+                                    *args,
+                                    **kwds,
+                                )
+
+                    with torch.cuda.stream(session.stream) if session.stream is not None else contextlib.nullcontext():
+                        decoder.post_forward(idx, session)
+                        logits = decoder.ar_predict_layer(xy_dec[:, -1])
+
+                        if idx == 0:
+                            logits[:, -1] = float("-inf")
+
+                        with torch_profiler.record("Sampling"):
+                            samples = session.sample(
+                                logits=logits,
+                                previous_tokens=session.y[:, : session.y_len + idx],
+                                top_k=request.top_k,
+                                top_p=request.top_p,
+                                repetition_penalty=request.repetition_penalty,
+                                temperature=request.temperature,
+                            )
+                            session.y[batch_idx, session.y_len + idx] = samples
+                            session.input_pos.add_(1)
+
+                        with torch_profiler.record("EOS"):
+                            argmax_token = torch.argmax(logits, dim=-1)
+                            sample_token = samples.squeeze(1)
+                            EOS_mask = (argmax_token == decoder.EOS) | (sample_token == decoder.EOS)
+
+                            newly_done_mask = EOS_mask & (~session.completed)
+                            newly_done_indices = newly_done_mask.nonzero()
+
+                            if newly_done_indices.numel() > 0:
+                                for i in newly_done_indices:
+                                    print(i, i.shape, newly_done_indices, newly_done_indices.shape)
+                                    session.y_results[i] = session.y[i, session.y_len : session.y_len + idx]
+                                    session.completed[newly_done_indices] = True
+
+                            if torch.all(session.completed).item():
+                                if session.y.sum() == 0:
+                                    session.y_results = [torch.tensor(0) for _ in range(session.bsz)]
+                                    tqdm.write("Bad Zero Prediction")
+                                else:
+                                    tqdm.write(
+                                        f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> \n{[i.size(-1) for i in session.y_results].__str__().strip('[]')}"
+                                    )
+                                    tqdm.write(f"Infer Speed: {(idx - 1) / (time.perf_counter() - t1):.2f} token/s")
+                                    infer_speed = (idx - 1) / (time.perf_counter() - t1)
+                                break
+
+                            if (request.early_stop_num != -1 and idx >= request.early_stop_num) or idx == 1499:
+                                for i in range(session.bsz):
+                                    if not session.completed[i].item():
+                                        session.y_results[i] = session.y[i, session.y_len : session.y_len + 1499]
+                                        session.completed[i] = True
+                                break
+
+                        with torch_profiler.record("NextPos"):
+                            y_emb = decoder.ar_audio_embedding(samples)
+                            session.xy_pos = decoder.ar_audio_position(session.input_pos - session.x_lens, y_emb)
+
+                        if idx == 1:
+                            torch_profiler.start()
+                            t1 = time.perf_counter()
+
+                        if idx == 51:
+                            torch_profiler.end()
+
+                        if idx % 100 == 0:
+                            match session.device.type:
+                                case "cuda":
+                                    torch.cuda.empty_cache()
+                                case "mps":
+                                    torch.mps.empty_cache()
+                                case "xpu":
+                                    torch.xpu.empty_cache()
+                                case "mtia":
+                                    torch.mtia.empty_cache()
+
+            match session.device.type:
+                case "cuda":
+                    if session.stream is not None:
+                        torch.cuda.current_stream().wait_stream(session.stream)
+                    torch.cuda.empty_cache()
+                case "mps":
+                    torch.mps.empty_cache()
+                case "xpu":
+                    torch.xpu.empty_cache()
+                case "mtia":
+                    torch.mtia.empty_cache()
+                case "cpu":
+                    gc.collect()
+
+            torch_profiler.end()
+            if request.use_cuda_graph and torch.cuda.is_available():
+                self.graphcache.release_graph(session)
+            return session.y_results[: request.valid_length], infer_speed
+
+    def generate(self, request: T2SRequest):
+        try:
+            result, infer_speed = self._handle_request(request)
+            t2s_result = T2SResult(result=result, infer_speed=infer_speed, status="Success")
+        except Exception as e:
+            t2s_result = T2SResult(status="Error", exception=e, traceback=traceback.format_exc())
+        return t2s_result
+
+    @staticmethod
+    def load_decoder(weights_path: os.PathLike, max_batch_size: int = 1, backend: str = "Flash Attn CUDAGraph"):
+        print(f"Loading Text2Semantic Weights from {weights_path} with {backend} Backend")
+        module_path = f".backends.{backend.lower().replace('-', '_')}"
+        decoder_cls_name = "T2SDecoder"
+        decoder_mod = import_module(module_path, package=__package__)
+        decoder_cls: type[T2SDecoderABC] = getattr(decoder_mod, decoder_cls_name)
+        dict_s1 = torch.load(weights_path, map_location="cpu", weights_only=False, mmap=True)
+        config = dict_s1["config"]
+        decoder: T2SDecoderABC = decoder_cls(config, max_batch_size=max_batch_size)
+        state_dict = dict_s1["weight"]
+        decoder.load_state_dict(state_dict)
+
+        return decoder.eval()
+
+    def init_cache(self):
+        assert self.decoder_model
+
+        module_name = self.decoder_model.__class__.__module__
+        module = sys.modules.get(module_name)
+        assert module
+
+        target_class: type[CUDAGraphCacheABC] = getattr(module, "CUDAGraphCache")
+
+        return target_class(self.decoder_model)
diff --git a/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py b/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
new file mode 100644
index 00000000..ccf7aee8
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
@@ -0,0 +1,668 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
+from __future__ import annotations
+
+import math
+import os
+import random
+from abc import ABC, abstractmethod
+from contextlib import nullcontext
+from typing import MutableSequence
+
+import torch
+import torch._inductor.config
+import torch.nn.functional as F
+from torch.cuda.graphs import CUDAGraph
+from torch.profiler import ProfilerAction, tensorboard_trace_handler
+
+from . import nn
+from .structs import KVCacheProtocol, T2SDecoderProtocol, T2SSession
+
+Tensor = torch.Tensor
+
+
+class TokenEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        vocab_size: int,
+    ):
+        super().__init__()
+
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+
+        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
+
+    @property
+    def weight(self) -> Tensor:
+        return self.word_embeddings.weight
+
+    def embedding(self, index: int) -> Tensor:
+        return self.word_embeddings.weight[index : index + 1]
+
+    def __call__(self, x: Tensor):
+        x = self.word_embeddings(x)
+        return x
+
+
+class SinePositionalEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        scale: bool = False,
+        alpha: bool = False,
+        max_batch_size: int = 10,
+        max_seq_len: int = 1800,
+    ):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
+        self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
+        self.max_batch_size = max_batch_size
+        self.max_seq_len = max_seq_len
+
+        self.reverse = False
+        self.register_buffer("pe", torch.zeros(max_batch_size, max_seq_len, embedding_dim), persistent=False)
+        self.pe: torch.Tensor
+        self.compute_pe()
+
+    def compute_pe(self):
+        """Reset the positional encodings."""
+        if self.reverse:
+            position = torch.arange(self.max_seq_len - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
+        else:
+            position = torch.arange(self.max_seq_len, dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim)
+        )
+        pe = self.pe
+        pe[:, :, 0::2] = torch.sin(position * div_term)
+        pe[:, :, 1::2] = torch.cos(position * div_term)
+
+    def __call__(self, input_pos: Tensor, x: Tensor) -> Tensor:
+        """
+        Args:
+            input_pos (Tensor): [batch_size, ]
+            x (Tensor): [batch_size, 1, embed_dim]
+
+        Returns:
+            embedded_x (Tensor): [batch_size, 1, embed_dim]
+        """
+
+        batch_size = x.shape[0]
+        pe_values = self.pe[torch.arange(batch_size), input_pos - 1]  # (batch_size, embed_dim)
+
+        return x * self.x_scale + self.alpha * pe_values.unsqueeze(1)  # (batch_size, 1, embed_dim)
+
+    def prefill(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): [batch_size, seq_len, embed_dim]
+
+        Returns:
+            embedded_x (Tensor): [batch_size, seq_len, embed_dim]
+        """
+
+        pe_values = self.pe[:, : x.shape[-2]]
+        return x * self.x_scale + self.alpha.item() * pe_values
+
+
+class KVCacheABC(nn.Module, ABC, KVCacheProtocol):
+    def __init__(self, batch_size: int, max_seq_length: int, n_heads: int, head_dim: int) -> None:
+        super().__init__()
+
+        self.n_head = n_heads
+        self.head_dim = head_dim
+        self.batch_size = batch_size
+        self.max_seq_length = max_seq_length
+
+        self.k_cache: Tensor
+        self.v_cache: Tensor
+
+    def empty(self):
+        self.k_cache.zero_()
+        self.v_cache.zero_()
+
+    @abstractmethod
+    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor, *args, **kwds) -> tuple[Tensor, Tensor]: ...
+
+    @abstractmethod
+    def prefill_kv(self, k_val: Tensor, v_val: Tensor) -> None: ...
+
+    def sync_cache(self, kv_cache: KVCacheProtocol):
+        self.k_cache.copy_(kv_cache.k_cache)
+        self.v_cache.copy_(kv_cache.v_cache)
+
+
+class KVCacheNHD(KVCacheABC):
+    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
+        super().__init__(batch_size, max_seq_length, n_heads, head_dim)
+
+        assert batch_size > 0
+        cache_shape = (batch_size, max_seq_length, n_heads, head_dim)
+
+        self.register_buffer("k_cache", torch.zeros(size=cache_shape), persistent=False)
+        self.register_buffer("v_cache", torch.zeros(size=cache_shape), persistent=False)
+
+    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
+        # input_pos: [B, ], k_val: [B, 1, H, D]
+
+        index = (
+            (input_pos - 1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+            .expand(
+                -1,
+                -1,
+                self.n_head,
+                self.head_dim,
+            )
+            .to(torch.int64)
+        )  # (bs, 1, num_head, head_dim)
+
+        k_out = self.k_cache
+        v_out = self.v_cache
+        k_out.scatter_(1, index, k_val)
+        v_out.scatter_(1, index, v_val)
+
+        return k_out, v_out
+
+    def empty(self):
+        self.k_cache.zero_()
+        self.v_cache.zero_()
+
+    def prefill_kv(self, k_val: Tensor, v_val: Tensor):
+        # input_pos: int, k_val: [B, S, H, D]
+
+        self.k_cache[:, : k_val.shape[1]] = k_val
+        self.v_cache[:, : v_val.shape[1]] = v_val
+
+
+class KVCacheHND(KVCacheABC):
+    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
+        super().__init__(batch_size, max_seq_length, n_heads, head_dim)
+
+        cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
+
+        self.register_buffer("k_cache", torch.zeros(size=cache_shape), persistent=False)
+        self.register_buffer("v_cache", torch.zeros(size=cache_shape), persistent=False)
+
+    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
+        # input_pos: [B, ], k_val: [B, H, 1, D]
+
+        index = (
+            (input_pos - 1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+            .expand(
+                -1,
+                self.n_head,
+                -1,
+                self.head_dim,
+            )
+            .to(torch.int64)
+        )  # (bs, num_head, 1, head_dim)
+
+        k_out = self.k_cache
+        v_out = self.v_cache
+        k_out.scatter_(2, index, k_val)
+        v_out.scatter_(2, index, v_val)
+
+        return k_out, v_out
+
+    def empty(self):
+        self.k_cache.zero_()
+        self.v_cache.zero_()
+
+    def prefill_kv(self, k_val: Tensor, v_val: Tensor):
+        # input_pos: int, k_val: [B, S, H, D]
+
+        self.k_cache[..., : k_val.shape[1], :] = k_val.transpose(1, 2)
+        self.v_cache[..., : v_val.shape[1], :] = v_val.transpose(1, 2)
+
+
+class KVCacheHNDVarlen(KVCacheABC):
+    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
+        super().__init__(batch_size, max_seq_length, n_heads, head_dim)
+
+        cache_shape = (batch_size, n_heads, max_seq_length, head_dim)
+        self.cache_idx: Tensor
+
+        self.register_buffer("cache_idx", torch.arange(batch_size), persistent=False)
+        self.register_buffer("k_cache", torch.zeros(size=cache_shape), persistent=False)
+        self.register_buffer("v_cache", torch.zeros(size=cache_shape), persistent=False)
+
+    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
+        # input_pos: [B, ], k_val: [B, H, 1, D]
+
+        k_out = self.k_cache
+        v_out = self.v_cache
+
+        k_out[self.cache_idx, :, input_pos - 1, :] = k_val
+        v_out[self.cache_idx, :, input_pos - 1, :] = v_val
+
+        return k_out, v_out
+
+    def empty(self):
+        self.k_cache.zero_()
+        self.v_cache.zero_()
+
+    def prefill_kv(self, k_val: Tensor, v_val: Tensor):
+        # input_pos: int, k_val: [B, S, H, D]
+
+        self.k_cache[..., : k_val.shape[1], :] = k_val.transpose(1, 2)
+        self.v_cache[..., : v_val.shape[1], :] = v_val.transpose(1, 2)
+
+
+class AttentionABC(nn.Module, ABC):
+    def __init__(self, n_head: int, hidden_dim: int, max_seq_length: int):
+        super().__init__()
+
+        self.n_head = n_head
+        self.hidden_dim = hidden_dim
+        assert hidden_dim % n_head == 0
+        self.head_dim = hidden_dim // n_head
+
+        self.max_seq_length = max_seq_length
+
+        # key, query, value projections for all heads, but in a batch
+        self.in_proj: nn.Linear
+        self.out_proj: nn.Linear
+
+        self._register_load_state_dict_pre_hook(self.load_hook)
+
+    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
+        keys_to_modify = [key for key in state_dict if "in_proj_" in key]
+        for key in keys_to_modify:
+            new_key = key.replace("in_proj_", "in_proj.")  # in_proj_ -> in_proj.
+            state_dict[new_key] = state_dict.pop(key)
+
+    @abstractmethod
+    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds) -> Tensor: ...
+
+    def prefill(self, x: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor) -> Tensor:
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.in_proj(x.unsqueeze(0)).chunk(3, dim=-1)
+
+        q, k, v = map(lambda x: x.contiguous().view(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
+
+        kv_cache.prefill_kv(k, v)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        attn = F.scaled_dot_product_attention(q, k, v, attn_mask)
+
+        attn = attn.transpose(1, 2).contiguous().view(1, -1, self.hidden_dim)
+
+        output = self.out_proj(attn)
+
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int) -> None:
+        super().__init__()
+
+        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
+        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
+
+    def __call__(self, x: Tensor):
+        return self.linear2(F.relu(self.linear1(x), inplace=True))
+
+
+class TransformerBlockABC(nn.Module, ABC):
+    def __init__(self, n_head: int, ffn_dim: int, hidden_dim: int, max_seq_length: int) -> None:
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.max_seq_length = max_seq_length
+
+        self.attention: AttentionABC
+        self.feed_forward: FeedForward
+        self.attention_norm: nn.LayerNorm
+        self.ffn_norm: nn.LayerNorm
+
+        self._register_load_state_dict_pre_hook(self.load_hook)
+
+    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
+        for key in list(state_dict.keys()):
+            new_key = (
+                key.replace("self_attn", "attention")
+                .replace("linear", "feed_forward.linear")
+                .replace("norm1", "attention_norm")
+                .replace("norm2", "ffn_norm")
+            )
+            state_dict[new_key] = state_dict.pop(key)
+
+    def __call__(self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheProtocol, *args, **kwds):
+        h = self.attention_norm(
+            x
+            + self.attention(
+                x,
+                input_pos,
+                kv_cache,
+                *args,
+                **kwds,
+            )
+        )
+        out = self.ffn_norm(h + self.feed_forward(h))
+        return out
+
+    def prefill(
+        self,
+        x: Tensor,
+        kv_cache: KVCacheProtocol,
+        attn_mask: Tensor,
+    ) -> Tensor:
+        h = self.attention_norm(
+            x
+            + self.attention.prefill(
+                x,
+                kv_cache,
+                attn_mask,
+            )
+        )
+        out = self.ffn_norm(h + self.feed_forward(h))
+        return out
+
+
+class TransformerDecoderABC(nn.Module, ABC):
+    def __init__(
+        self,
+        hidden_dim: int,
+        n_layer: int,
+        n_head: int,
+        ffn_dim: int,
+        vocab_size: int,
+        max_seq_length: int,
+        max_batch_size: int,
+    ) -> None:
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.n_head = n_head
+        assert hidden_dim % n_head == 0
+
+        self.head_dim = hidden_dim // n_head
+        self.vocab_size = vocab_size
+
+        self.n_layer = n_layer
+
+        self.layers: MutableSequence[TransformerBlockABC]
+
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+
+    def __call__(self, input_pos: Tensor, x: Tensor, kv_caches: MutableSequence[KVCacheProtocol], *args, **kwds):
+        for layer, kv_cache in zip(self.layers, kv_caches):
+            x = layer(x, input_pos, kv_cache, *args, **kwds)
+        return x
+
+    def prefill(self, x: Tensor, kv_caches: MutableSequence[KVCacheProtocol], attn_mask: Tensor):
+        for layer, kv_cache in zip(self.layers, kv_caches):
+            x = layer.prefill(x, kv_cache, attn_mask)
+        return x
+
+
+class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
+    def __init__(
+        self,
+        config: dict,
+        max_seq_length: int = 1800,
+        max_batch_size: int = 10,
+    ) -> None:
+        super().__init__()
+
+        hidden_dim: int = config["model"]["hidden_dim"]
+        embedding_dim: int = config["model"]["embedding_dim"]
+        n_head: int = config["model"]["head"]
+        n_layer: int = config["model"]["n_layer"]
+        vocab_size: int = config["model"]["vocab_size"]
+        phoneme_vocab_size: int = config["model"]["phoneme_vocab_size"]
+        EOS: int = config["model"]["EOS"]
+        ffn_dim: int = hidden_dim * 4
+
+        self.n_layer = int(n_layer)
+        self.hidden_dim = int(hidden_dim)
+        self.n_head = int(n_head)
+        assert hidden_dim % n_head == 0
+
+        self.head_dim = int(hidden_dim // n_head)
+        self.embedding_dim = int(embedding_dim)
+        self.ffn_dim = int(ffn_dim)
+        self.vocab_size = int(vocab_size)
+        self.phoneme_vocab_size = int(phoneme_vocab_size)
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+        self.EOS = EOS
+        assert self.EOS == self.vocab_size - 1
+
+        self.bert_proj: nn.Linear
+        self.ar_predict_layer: nn.Linear
+        self.h: TransformerDecoderABC
+
+        self.kv_class: type[KVCacheABC]
+
+        self.GraphCache: CUDAGraphCacheABC | None
+
+        self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size)
+        self.ar_text_position = SinePositionalEmbedding(
+            self.embedding_dim,
+            scale=False,
+            alpha=True,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_length,
+        )
+        self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size)
+        self.ar_audio_position = SinePositionalEmbedding(
+            self.embedding_dim,
+            scale=False,
+            alpha=True,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_length,
+        )
+
+        self._register_load_state_dict_pre_hook(self.load_hook)
+
+    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
+        model_keys = [key for key in state_dict if key.startswith("model.")]
+        for key in model_keys:
+            new_key = key[len("model.") :]
+            state_dict[new_key] = state_dict.pop(key)
+
+    def init_cache(self, bsz: int = 0) -> MutableSequence[KVCacheProtocol]:
+        bsz = bsz or self.h.max_batch_size
+        assert bsz <= self.h.max_batch_size
+        seq_lens = self.h.max_seq_length
+        dtype = self.bert_proj.bias.dtype
+        kvclass = self.kv_class
+
+        return nn.ModuleList(
+            [kvclass(bsz, seq_lens, self.n_head, self.head_dim) for _ in range(self.n_layer)],
+        ).to(self.device, dtype)  # type: ignore
+
+    def embed(
+        self,
+        x: list[torch.Tensor],
+        y: torch.Tensor,
+        bert_features: list[torch.Tensor],
+    ):
+        x_len: list[int] = [i.shape[0] for i in x]
+        x_len_max = max(x_len)
+        xy_pos = torch.zeros((len(x), x_len_max + y.shape[1], self.embedding_dim)).to(bert_features[0].dtype)
+
+        bert_features = list(map(lambda x: x.transpose(0, 1), bert_features))
+
+        y_len = y.shape[1]
+        y_emb = self.ar_audio_embedding(y)
+        y_pos = self.ar_audio_position.prefill(y_emb)
+
+        for bs, (x_, len_, bert_feature) in enumerate(zip(x, x_len, bert_features)):
+            x_emb = self.ar_text_embedding(x_)
+            bert = self.bert_proj(bert_feature)
+            x_emb = x_emb + bert
+            x_pos = self.ar_text_position.prefill(x_emb.unsqueeze(0))
+            xy_pos[[bs], :len_] = x_pos
+            xy_pos[[bs], len_ : len_ + y_len] = y_pos
+
+        return xy_pos
+
+    def compile(self, *args, **kwds):
+        # Experimental features to reduce compilation times, will be on by default in future
+        torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = True
+        torch._inductor.config.coordinate_descent_tuning = True
+        torch._inductor.config.triton.unique_kernel_names = True
+        torch._inductor.config.fx_graph_cache = True
+        torch._inductor.config.triton.cudagraph_trees = True
+        torch._inductor.config.triton.cudagraph_support_input_mutation = True
+        self.h.compile(fullgraph=True, mode="reduce-overhead")
+
+    def capture(
+        self, input_pos: Tensor, x: Tensor, x_dec: Tensor, kv_caches: MutableSequence[KVCacheProtocol], *args, **kwds
+    ) -> CUDAGraph:
+        assert torch.cuda.is_available()
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+
+        graph = torch.cuda.CUDAGraph()
+
+        with torch.cuda.stream(s):  # type: ignore
+            for _ in range(5):
+                self.h(input_pos, x, kv_caches, *args, **kwds)
+        torch.cuda.current_stream().wait_stream(s)
+
+        with torch.cuda.graph(graph):
+            x_dec.copy_(self.h(input_pos, x, kv_caches, *args, **kwds))
+        torch.cuda.synchronize()
+
+        return graph
+
+    @abstractmethod
+    def pre_forward(self, session: T2SSession) -> tuple[list[Tensor], dict[str, Tensor]]:
+        return list(), dict()
+
+    @abstractmethod
+    def post_forward(self, idx: int, session: T2SSession) -> None:
+        return
+
+
+class CUDAGraphCacheABC(ABC):
+    def __init__(
+        self,
+        decoder: T2SDecoderABC,
+        enabled: bool = False,
+    ) -> None:
+        if torch.cuda.is_available() and enabled:
+            self.device: torch.device = decoder.device
+            self.dtype = decoder.bert_proj.bias.dtype
+
+            self.assigned: bool = False
+
+            self.decoder: T2SDecoderABC = decoder
+            self.kv_cache: MutableSequence[KVCacheProtocol] = decoder.init_cache(decoder.max_batch_size)
+            self.xy_pos = torch.rand(size=(decoder.max_batch_size, 1, decoder.embedding_dim), device=self.device).to(
+                self.dtype
+            )
+            self.xy_dec = self.xy_pos.clone()
+
+            self.input_pos = torch.tensor([10] * decoder.max_batch_size, device=self.device).int()
+            self.graph: torch.cuda.CUDAGraph | None = None
+            self.stream: torch.cuda.Stream | None
+
+            self.id: int = random.randint(1, 2**32 - 1)
+
+    def assign_graph(self, session: T2SSession):
+        if self.graph is None:
+            args, kwds = self.decoder.pre_forward(session)
+            graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
+            self.graph = graph
+            self.stream = torch.cuda.Stream()  # type: ignore
+
+        if self.assigned is False:
+            self.get_cache_graph(session)
+            session.id = self.id
+            self.assigned = True
+        else:
+            self.capture_new_graph(session)
+
+    @abstractmethod
+    def release_graph(self, session: T2SSession): ...
+
+    @abstractmethod
+    def get_cache_graph(self, session: T2SSession):
+        pass
+
+    @abstractmethod
+    def capture_new_graph(self, session: T2SSession):
+        pass
+
+
+class TorchProfiler:
+    def __init__(self, debug: bool, log_dir: str = "./profiler") -> None:
+        self.debug = debug
+        self.log_dir = log_dir
+        self.__profiler: torch.profiler.profile
+
+        if self.debug and not os.path.exists(self.log_dir):
+            os.makedirs(self.log_dir)
+
+        self.tensorboard_handler = tensorboard_trace_handler(self.log_dir)
+
+    def profiler_callback(self, prof: torch.profiler.profile):
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
+        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=30))
+        self.tensorboard_handler(prof)
+
+    @staticmethod
+    def three_step_schedule(step: int) -> ProfilerAction:
+        if step == 0:
+            return ProfilerAction.NONE
+        elif step == 1:
+            return ProfilerAction.RECORD
+        elif step == 2:
+            return ProfilerAction.RECORD_AND_SAVE
+        else:
+            return ProfilerAction.NONE
+
+    def start(self):
+        if not self.debug:
+            return
+        assert self.__profiler is not None
+        self.__profiler.step()
+
+    def end(self):
+        if not self.debug:
+            return
+        assert self.__profiler is not None
+        self.__profiler.step()
+
+    def profiler(self):
+        if self.debug:
+            activities_list = [torch.profiler.ProfilerActivity.CPU]
+            if torch.cuda.is_available():
+                activities_list.append(torch.profiler.ProfilerActivity.CUDA)
+
+            self.__profiler = torch.profiler.profile(
+                activities=activities_list,
+                record_shapes=True,
+                with_stack=True,
+                with_modules=True,
+                profile_memory=True,
+                schedule=self.three_step_schedule,
+                on_trace_ready=self.profiler_callback,
+            )
+            return self.__profiler
+        else:
+            return nullcontext()
+
+    def record(self, func_name: str):
+        if self.debug:
+            return torch.profiler.record_function(func_name)
+        else:
+            return nullcontext()
diff --git a/GPT_SoVITS/Accelerate/__init__.py b/GPT_SoVITS/Accelerate/__init__.py
new file mode 100644
index 00000000..c7d68a17
--- /dev/null
+++ b/GPT_SoVITS/Accelerate/__init__.py
@@ -0,0 +1,11 @@
+from . import MLX, PyTorch
+from .PyTorch import T2SEngineTorch, T2SRequest, T2SResult
+
+backends = PyTorch.backends + MLX.backends
+
+backends = [
+    b.replace("_", "-").title().replace("Mlx", "MLX").replace("Mps", "MPS").replace("Cuda", "CUDA") for b in backends
+]
+
+
+__all__ = ["T2SEngineTorch", "T2SRequest", "T2SResult", "backends", "MLX", "PyTorch"]
diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py
deleted file mode 100644
index 459a3d36..00000000
--- a/GPT_SoVITS/inference_cli.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import argparse
-import os
-import soundfile as sf
-
-from tools.i18n.i18n import I18nAuto
-from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
-
-i18n = I18nAuto()
-
-
-def synthesize(
-    GPT_model_path,
-    SoVITS_model_path,
-    ref_audio_path,
-    ref_text_path,
-    ref_language,
-    target_text_path,
-    target_language,
-    output_path,
-):
-    # Read reference text
-    with open(ref_text_path, "r", encoding="utf-8") as file:
-        ref_text = file.read()
-
-    # Read target text
-    with open(target_text_path, "r", encoding="utf-8") as file:
-        target_text = file.read()
-
-    # Change model weights
-    change_gpt_weights(gpt_path=GPT_model_path)
-    change_sovits_weights(sovits_path=SoVITS_model_path)
-
-    # Synthesize audio
-    synthesis_result = get_tts_wav(
-        ref_wav_path=ref_audio_path,
-        prompt_text=ref_text,
-        prompt_language=i18n(ref_language),
-        text=target_text,
-        text_language=i18n(target_language),
-        top_p=1,
-        temperature=1,
-    )
-
-    result_list = list(synthesis_result)
-
-    if result_list:
-        last_sampling_rate, last_audio_data = result_list[-1]
-        output_wav_path = os.path.join(output_path, "output.wav")
-        sf.write(output_wav_path, last_audio_data, last_sampling_rate)
-        print(f"Audio saved to {output_wav_path}")
-
-
-def main():
-    parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
-    parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
-    parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file")
-    parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file")
-    parser.add_argument("--ref_text", required=True, help="Path to the reference text file")
-    parser.add_argument(
-        "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio"
-    )
-    parser.add_argument("--target_text", required=True, help="Path to the target text file")
-    parser.add_argument(
-        "--target_language",
-        required=True,
-        choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"],
-        help="Language of the target text",
-    )
-    parser.add_argument("--output_path", required=True, help="Path to the output directory")
-
-    args = parser.parse_args()
-
-    synthesize(
-        args.gpt_model,
-        args.sovits_model,
-        args.ref_audio,
-        args.ref_text,
-        args.ref_language,
-        args.target_text,
-        args.target_language,
-        args.output_path,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py
deleted file mode 100644
index 379f7fa8..00000000
--- a/GPT_SoVITS/inference_gui.py
+++ /dev/null
@@ -1,316 +0,0 @@
-import os
-import sys
-from PyQt5.QtCore import QEvent
-from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit
-from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox
-import soundfile as sf
-
-from tools.i18n.i18n import I18nAuto
-
-i18n = I18nAuto()
-
-from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav
-
-
-class GPTSoVITSGUI(QMainWindow):
-    GPT_Path = gpt_path
-    SoVITS_Path = sovits_path
-
-    def __init__(self):
-        super().__init__()
-
-        self.setWindowTitle("GPT-SoVITS GUI")
-        self.setGeometry(800, 450, 950, 850)
-
-        self.setStyleSheet("""
-            QWidget {
-                background-color: #a3d3b1; 
-            }
-
-            QTabWidget::pane {
-                background-color: #a3d3b1;  
-            }
-
-            QTabWidget::tab-bar {
-                alignment: left;
-            }
-
-            QTabBar::tab {
-                background: #8da4bf; 
-                color: #ffffff;  
-                padding: 8px;
-            }
-
-            QTabBar::tab:selected {
-                background: #2a3f54; 
-            }
-
-            QLabel {
-                color: #000000;  
-            }
-
-            QPushButton {
-                background-color: #4CAF50; 
-                color: white;  
-                padding: 8px;
-                border: 1px solid #4CAF50;
-                border-radius: 4px;
-            }
-
-            QPushButton:hover {
-                background-color: #45a049;  
-                border: 1px solid #45a049;
-                box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1);
-            }
-        """)
-
-        license_text = (
-            "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. "
-            "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE."
-        )
-        license_label = QLabel(license_text)
-        license_label.setWordWrap(True)
-
-        self.GPT_model_label = QLabel("选择GPT模型:")
-        self.GPT_model_input = QLineEdit()
-        self.GPT_model_input.setPlaceholderText("拖拽或选择文件")
-        self.GPT_model_input.setText(self.GPT_Path)
-        self.GPT_model_input.setReadOnly(True)
-        self.GPT_model_button = QPushButton("选择GPT模型文件")
-        self.GPT_model_button.clicked.connect(self.select_GPT_model)
-
-        self.SoVITS_model_label = QLabel("选择SoVITS模型:")
-        self.SoVITS_model_input = QLineEdit()
-        self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件")
-        self.SoVITS_model_input.setText(self.SoVITS_Path)
-        self.SoVITS_model_input.setReadOnly(True)
-        self.SoVITS_model_button = QPushButton("选择SoVITS模型文件")
-        self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model)
-
-        self.ref_audio_label = QLabel("上传参考音频:")
-        self.ref_audio_input = QLineEdit()
-        self.ref_audio_input.setPlaceholderText("拖拽或选择文件")
-        self.ref_audio_input.setReadOnly(True)
-        self.ref_audio_button = QPushButton("选择音频文件")
-        self.ref_audio_button.clicked.connect(self.select_ref_audio)
-
-        self.ref_text_label = QLabel("参考音频文本:")
-        self.ref_text_input = QLineEdit()
-        self.ref_text_input.setPlaceholderText("直接输入文字或上传文本")
-        self.ref_text_button = QPushButton("上传文本")
-        self.ref_text_button.clicked.connect(self.upload_ref_text)
-
-        self.ref_language_label = QLabel("参考音频语言:")
-        self.ref_language_combobox = QComboBox()
-        self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"])
-        self.ref_language_combobox.setCurrentText("多语种混合")
-
-        self.target_text_label = QLabel("合成目标文本:")
-        self.target_text_input = QLineEdit()
-        self.target_text_input.setPlaceholderText("直接输入文字或上传文本")
-        self.target_text_button = QPushButton("上传文本")
-        self.target_text_button.clicked.connect(self.upload_target_text)
-
-        self.target_language_label = QLabel("合成音频语言:")
-        self.target_language_combobox = QComboBox()
-        self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"])
-        self.target_language_combobox.setCurrentText("多语种混合")
-
-        self.output_label = QLabel("输出音频路径:")
-        self.output_input = QLineEdit()
-        self.output_input.setPlaceholderText("拖拽或选择文件")
-        self.output_input.setReadOnly(True)
-        self.output_button = QPushButton("选择文件夹")
-        self.output_button.clicked.connect(self.select_output_path)
-
-        self.output_text = QTextEdit()
-        self.output_text.setReadOnly(True)
-
-        self.add_drag_drop_events(
-            [
-                self.GPT_model_input,
-                self.SoVITS_model_input,
-                self.ref_audio_input,
-                self.ref_text_input,
-                self.target_text_input,
-                self.output_input,
-            ]
-        )
-
-        self.synthesize_button = QPushButton("合成")
-        self.synthesize_button.clicked.connect(self.synthesize)
-
-        self.clear_output_button = QPushButton("清空输出")
-        self.clear_output_button.clicked.connect(self.clear_output)
-
-        self.status_bar = QStatusBar()
-
-        main_layout = QVBoxLayout()
-
-        input_layout = QGridLayout(self)
-        input_layout.setSpacing(10)
-
-        input_layout.addWidget(license_label, 0, 0, 1, 3)
-
-        input_layout.addWidget(self.GPT_model_label, 1, 0)
-        input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2)
-        input_layout.addWidget(self.GPT_model_button, 2, 2)
-
-        input_layout.addWidget(self.SoVITS_model_label, 3, 0)
-        input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2)
-        input_layout.addWidget(self.SoVITS_model_button, 4, 2)
-
-        input_layout.addWidget(self.ref_audio_label, 5, 0)
-        input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2)
-        input_layout.addWidget(self.ref_audio_button, 6, 2)
-
-        input_layout.addWidget(self.ref_language_label, 7, 0)
-        input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1)
-        input_layout.addWidget(self.ref_text_label, 9, 0)
-        input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2)
-        input_layout.addWidget(self.ref_text_button, 10, 2)
-
-        input_layout.addWidget(self.target_language_label, 11, 0)
-        input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1)
-        input_layout.addWidget(self.target_text_label, 13, 0)
-        input_layout.addWidget(self.target_text_input, 14, 0, 1, 2)
-        input_layout.addWidget(self.target_text_button, 14, 2)
-
-        input_layout.addWidget(self.output_label, 15, 0)
-        input_layout.addWidget(self.output_input, 16, 0, 1, 2)
-        input_layout.addWidget(self.output_button, 16, 2)
-
-        main_layout.addLayout(input_layout)
-
-        output_layout = QVBoxLayout()
-        output_layout.addWidget(self.output_text)
-        main_layout.addLayout(output_layout)
-
-        main_layout.addWidget(self.synthesize_button)
-
-        main_layout.addWidget(self.clear_output_button)
-
-        main_layout.addWidget(self.status_bar)
-
-        self.central_widget = QWidget()
-        self.central_widget.setLayout(main_layout)
-        self.setCentralWidget(self.central_widget)
-
-    def dragEnterEvent(self, event):
-        if event.mimeData().hasUrls():
-            event.acceptProposedAction()
-
-    def dropEvent(self, event):
-        if event.mimeData().hasUrls():
-            file_paths = [url.toLocalFile() for url in event.mimeData().urls()]
-            if len(file_paths) == 1:
-                self.update_ref_audio(file_paths[0])
-            else:
-                self.update_ref_audio(", ".join(file_paths))
-
-    def add_drag_drop_events(self, widgets):
-        for widget in widgets:
-            widget.setAcceptDrops(True)
-            widget.installEventFilter(self)
-
-    def eventFilter(self, obj, event):
-        if event.type() in (QEvent.DragEnter, QEvent.Drop):
-            mime_data = event.mimeData()
-            if mime_data.hasUrls():
-                event.acceptProposedAction()
-
-        return super().eventFilter(obj, event)
-
-    def select_GPT_model(self):
-        file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)")
-        if file_path:
-            self.GPT_model_input.setText(file_path)
-
-    def select_SoVITS_model(self):
-        file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)")
-        if file_path:
-            self.SoVITS_model_input.setText(file_path)
-
-    def select_ref_audio(self):
-        file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)")
-        if file_path:
-            self.update_ref_audio(file_path)
-
-    def upload_ref_text(self):
-        file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
-        if file_path:
-            with open(file_path, "r", encoding="utf-8") as file:
-                content = file.read()
-                self.ref_text_input.setText(content)
-
-    def upload_target_text(self):
-        file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
-        if file_path:
-            with open(file_path, "r", encoding="utf-8") as file:
-                content = file.read()
-                self.target_text_input.setText(content)
-
-    def select_output_path(self):
-        options = QFileDialog.Options()
-        options |= QFileDialog.DontUseNativeDialog
-        options |= QFileDialog.ShowDirsOnly
-
-        folder_dialog = QFileDialog()
-        folder_dialog.setOptions(options)
-        folder_dialog.setFileMode(QFileDialog.Directory)
-
-        if folder_dialog.exec_():
-            folder_path = folder_dialog.selectedFiles()[0]
-            self.output_input.setText(folder_path)
-
-    def update_ref_audio(self, file_path):
-        self.ref_audio_input.setText(file_path)
-
-    def clear_output(self):
-        self.output_text.clear()
-
-    def synthesize(self):
-        GPT_model_path = self.GPT_model_input.text()
-        SoVITS_model_path = self.SoVITS_model_input.text()
-        ref_audio_path = self.ref_audio_input.text()
-        language_combobox = self.ref_language_combobox.currentText()
-        language_combobox = i18n(language_combobox)
-        ref_text = self.ref_text_input.text()
-        target_language_combobox = self.target_language_combobox.currentText()
-        target_language_combobox = i18n(target_language_combobox)
-        target_text = self.target_text_input.text()
-        output_path = self.output_input.text()
-
-        if GPT_model_path != self.GPT_Path:
-            change_gpt_weights(gpt_path=GPT_model_path)
-            self.GPT_Path = GPT_model_path
-        if SoVITS_model_path != self.SoVITS_Path:
-            change_sovits_weights(sovits_path=SoVITS_model_path)
-            self.SoVITS_Path = SoVITS_model_path
-
-        synthesis_result = get_tts_wav(
-            ref_wav_path=ref_audio_path,
-            prompt_text=ref_text,
-            prompt_language=language_combobox,
-            text=target_text,
-            text_language=target_language_combobox,
-        )
-
-        result_list = list(synthesis_result)
-
-        if result_list:
-            last_sampling_rate, last_audio_data = result_list[-1]
-            output_wav_path = os.path.join(output_path, "output.wav")
-            sf.write(output_wav_path, last_audio_data, last_sampling_rate)
-
-            result = "Audio saved to " + output_wav_path
-
-        self.status_bar.showMessage("合成完成！输出路径：" + output_wav_path, 5000)
-        self.output_text.append("处理结果：\n" + result)
-
-
-if __name__ == "__main__":
-    app = QApplication(sys.argv)
-    mainWin = GPTSoVITSGUI()
-    mainWin.show()
-    sys.exit(app.exec_())
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index a361ed58..c9feeb52 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -1,36 +1,48 @@
-"""
-按中英混合识别
-按日英混合识别
-多语种启动切分识别语种
-全部按中文识别
-全部按英文识别
-全部按日文识别
-"""
-import psutil
-import os
-
-def set_high_priority():
-    """把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
-    if os.name != "nt":
-        return # 仅 Windows 有效
-    p = psutil.Process(os.getpid())
-    try:
-        p.nice(psutil.HIGH_PRIORITY_CLASS)
-        print("已将进程优先级设为 High")
-    except psutil.AccessDenied:
-        print("权限不足，无法修改优先级（请用管理员运行）")
-set_high_priority()
+import argparse
+import contextlib
 import json
 import logging
 import os
+import random
 import re
-import sys
 import traceback
 import warnings
+from pathlib import Path
+from time import time as ttime
 
+import gradio as gr
+import librosa
+import numpy as np
+import psutil
 import torch
 import torchaudio
-from text.LangSegmenter import LangSegmenter
+from peft import LoraConfig, get_peft_model
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+from config import (
+    change_choices,
+    get_weights_names,
+    infer_device,
+    is_half,
+    name2gpt_path,
+    name2sovits_path,
+    pretrained_sovits_name,
+)
+from GPT_SoVITS.Accelerate import MLX, PyTorch, T2SRequest, backends
+from GPT_SoVITS.feature_extractor import cnhubert
+from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spectrogram_torch
+from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
+from GPT_SoVITS.process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
+from GPT_SoVITS.sv import SV
+from GPT_SoVITS.text import cleaned_text_to_sequence
+from GPT_SoVITS.text.cleaner import clean_text
+from GPT_SoVITS.text.LangSegmenter import LangSegmenter
+from tools.assets import css, js, top_html
+from tools.i18n.i18n import I18nAuto, scan_language_list
+
+with contextlib.suppress(ImportError):
+    import mlx.core as mx
+    import mlx.utils as mxutils
 
 logging.getLogger("markdown_it").setLevel(logging.ERROR)
 logging.getLogger("urllib3").setLevel(logging.ERROR)
@@ -42,100 +54,74 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
 warnings.simplefilter(action="ignore", category=FutureWarning)
 
-version = model_version = os.environ.get("version", "v2")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
+
+def set_high_priority():
+    if os.name != "nt":
+        return
+    p = psutil.Process(os.getpid())
+    with contextlib.suppress(psutil.AccessDenied):
+        p.nice(psutil.HIGH_PRIORITY_CLASS)
+        print("已将进程优先级设为 High")
+
+
+set_high_priority()
+
+_LANG_RE = re.compile(r"^[a-z]{2}[_-][A-Z]{2}$")
+
+
+def lang_type(text: str) -> str:
+    if text == "Auto":
+        return text
+    if not _LANG_RE.match(text):
+        raise argparse.ArgumentTypeError(f"Unspported Format: {text}, Expected ll_CC/ll-CC")
+    ll, cc = re.split(r"[_-]", text)
+    language = f"{ll}_{cc}"
+    if language in scan_language_list():
+        return language
+    else:
+        return "Auto"
+
+
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="inference_webui",
+        description="python -s inference_webui.py zh_CN -i naive",
+    )
+    p.add_argument(
+        "language",
+        nargs="?",
+        default="Auto",
+        type=lang_type,
+        help="Language Code, Such as zh_CN, en-US",
+    )
+    p.add_argument(
+        "--backends",
+        "-b",
+        choices=backends,
+        default="naive",
+        help="AR Inference Backend",
+        required=False,
+    )
+    return p
+
+
+args = build_parser().parse_args()
 
 SoVITS_names, GPT_names = get_weights_names()
-from config import pretrained_sovits_name
 
+
+version = model_version = os.environ.get("version", "v2")
 path_sovits_v3 = pretrained_sovits_name["v3"]
 path_sovits_v4 = pretrained_sovits_name["v4"]
 is_exist_s2gv3 = os.path.exists(path_sovits_v3)
 is_exist_s2gv4 = os.path.exists(path_sovits_v4)
 
-if os.path.exists("./weight.json"):
-    pass
-else:
-    with open("./weight.json", "w", encoding="utf-8") as file:
-        json.dump({"GPT": {}, "SoVITS": {}}, file)
 
-with open("./weight.json", "r", encoding="utf-8") as file:
-    weight_data = file.read()
-    weight_data = json.loads(weight_data)
-    gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
-    sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0]))
-    if isinstance(gpt_path, list):
-        gpt_path = gpt_path[0]
-    if isinstance(sovits_path, list):
-        sovits_path = sovits_path[0]
+i18n = I18nAuto(language=args.language)
+ar_backend: str = args.backends
 
-# print(2333333)
-# print(os.environ["gpt_path"])
-# print(gpt_path)
-# print(GPT_names)
-# print(weight_data)
-# print(weight_data.get("GPT", {}))
-# print(version)###GPT version里没有s2的v2pro
-# print(weight_data.get("GPT", {}).get(version, GPT_names[-1]))
-
-cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base")
-bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large")
-infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
-infer_ttswebui = int(infer_ttswebui)
-is_share = os.environ.get("is_share", "False")
-is_share = eval(is_share)
-if "_CUDA_VISIBLE_DEVICES" in os.environ:
-    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
-is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
-# is_half=False
-punctuation = set(["!", "?", "…", ",", ".", "-", " "])
-import gradio as gr
-import librosa
-import numpy as np
-from feature_extractor import cnhubert
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-
-cnhubert.cnhubert_base_path = cnhubert_base_path
-
-import random
-
-from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
-
-
-def set_seed(seed):
-    if seed == -1:
-        seed = random.randint(0, 1000000)
-    seed = int(seed)
-    random.seed(seed)
-    os.environ["PYTHONHASHSEED"] = str(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-
-
-# set_seed(42)
-
-from time import time as ttime
-
-from AR.models.t2s_lightning_module import Text2SemanticLightningModule
-from peft import LoraConfig, get_peft_model
-from text import cleaned_text_to_sequence
-from text.cleaner import clean_text
-
-from tools.assets import css, js, top_html
-from tools.i18n.i18n import I18nAuto, scan_language_list
-
-language = os.environ.get("language", "Auto")
-language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
-i18n = I18nAuto(language=language)
-
-# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # 确保直接启动推理UI时也能够设置。
-
-if torch.cuda.is_available():
-    device = "cuda"
-else:
-    device = "cpu"
 
 dict_language_v1 = {
     i18n("中文"): "all_zh",  # 全部按中文识别
@@ -160,12 +146,98 @@ dict_language_v2 = {
 }
 dict_language = dict_language_v1 if version == "v1" else dict_language_v2
 
+
+cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base")
+bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large")
+infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
+infer_ttswebui = int(infer_ttswebui)
+is_share = os.environ.get("is_share", "False")
+is_share = eval(is_share)
+punctuation = set(["!", "?", "…", ",", ".", "-", " "])
+splits = {"，", "。", "？", "！", ",", ".", "?", "!", "~", ":", "：", "—", "…"}
+v3v4set = {"v3", "v4"}
+
+
+device = infer_device
+if "_CUDA_VISIBLE_DEVICES" in os.environ:
+    device = torch.device(f"cuda:{os.environ['_CUDA_VISIBLE_DEVICES'][0]}") if torch.cuda.is_available() else device
+
+dtype = torch.float32
+if is_half is True:
+    dtype = torch.float16
+
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
-bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
-if is_half == True:
-    bert_model = bert_model.half().to(device)
+bert_model = AutoModelForMaskedLM.from_pretrained(bert_path).to(device, dtype)
+
+cnhubert.cnhubert_base_path = cnhubert_base_path
+ssl_model = cnhubert.get_model().to(device, dtype)
+
+spec_min = -12
+spec_max = 2
+
+
+def norm_spec(x):
+    return (x - spec_min) / (spec_max - spec_min) * 2 - 1
+
+
+def denorm_spec(x):
+    return (x + 1) / 2 * (spec_max - spec_min) + spec_min
+
+
+def mel_fn(x):
+    return mel_spectrogram_torch(
+        y=x,
+        n_fft=1024,
+        num_mels=100,
+        sampling_rate=24000,
+        hop_size=256,
+        win_size=1024,
+        fmin=0,
+        fmax=None,
+        center=False,
+    )
+
+
+def mel_fn_v4(x):
+    return mel_spectrogram_torch(
+        y=x,
+        n_fft=1280,
+        num_mels=100,
+        sampling_rate=32000,
+        hop_size=320,
+        win_size=1280,
+        fmin=0,
+        fmax=None,
+        center=False,
+    )
+
+
+if os.path.exists("./weight.json"):
+    pass
 else:
-    bert_model = bert_model.to(device)
+    with open("./weight.json", "w", encoding="utf-8") as file:
+        json.dump({"GPT": {}, "SoVITS": {}}, file)
+
+with open("./weight.json", "r", encoding="utf-8") as file:
+    weight_data = file.read()
+    weight_data = json.loads(weight_data)
+    gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
+    sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0]))
+    if isinstance(gpt_path, list):
+        gpt_path = gpt_path[0]
+    if isinstance(sovits_path, list):
+        sovits_path = sovits_path[0]
+
+
+def set_seed(seed):
+    if seed == -1:
+        seed = random.randint(0, 1000000)
+    seed = int(seed)
+    random.seed(seed)
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
 
 
 def get_bert_feature(text, word2ph):
@@ -212,20 +284,6 @@ class DictToAttrRecursive(dict):
             raise AttributeError(f"Attribute {item} not found")
 
 
-ssl_model = cnhubert.get_model()
-if is_half == True:
-    ssl_model = ssl_model.half().to(device)
-else:
-    ssl_model = ssl_model.to(device)
-
-
-###todo:put them to process_ckpt and modify my_save func (save sovits weights), gpt save weights use my_save in process_ckpt
-# symbol_version-model_version-if_lora_v3
-from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
-
-v3v4set = {"v3", "v4"}
-
-
 def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
     if "！" in sovits_path or "!" in sovits_path:
         sovits_path = name2sovits_path[sovits_path]
@@ -234,10 +292,10 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
     print(sovits_path, version, model_version, if_lora_v3)
     is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
     path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
-    if if_lora_v3 == True and is_exist == False:
-        info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失，无法加载相应 LoRA 权重")
+    if if_lora_v3 is True and is_exist is False:
+        info = f"{path_sovits} SoVITS {model_version} {i18n('底模缺失，无法加载相应 LoRA 权重')}"
         gr.Warning(info)
-        raise FileExistsError(info)
+        raise FileNotFoundError(info)
     dict_language = dict_language_v1 if version == "v1" else dict_language_v2
     if prompt_language is not None and text_language is not None:
         if prompt_language in list(dict_language.keys()):
@@ -291,7 +349,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
     version = hps.model.version
     # print("sovits版本:",hps.model.version)
     if model_version not in v3v4set:
-        if "Pro" not in model_version:
+        if "Pro" not in model_version:  # type: ignore
             model_version = version
         else:
             hps.model.version = model_version
@@ -308,23 +366,19 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
             hps.train.segment_size // hps.data.hop_length,
             n_speakers=hps.data.n_speakers,
             **hps.model,
-        )
+        ).eval()
     if "pretrained" not in sovits_path:
         try:
             del vq_model.enc_q
-        except:
+        finally:
             pass
-    if is_half == True:
-        vq_model = vq_model.half().to(device)
-    else:
-        vq_model = vq_model.to(device)
-    vq_model.eval()
-    if if_lora_v3 == False:
-        print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False))
+
+    if if_lora_v3 is False:
+        print(f">> loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
     else:
         path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
         print(
-            "loading sovits_%spretrained_G" % model_version,
+            ">> loading sovits_%spretrained_G" % model_version,
             vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
         )
         lora_rank = dict_s2["lora_rank"]
@@ -334,27 +388,29 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
             lora_alpha=lora_rank,
             init_lora_weights=True,
         )
-        vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
-        print("loading sovits_%s_lora%s" % (model_version, lora_rank))
+        vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)  # type: ignore
+        print(f">> loading sovits_{model_version}_lora{model_version}")
         vq_model.load_state_dict(dict_s2["weight"], strict=False)
-        vq_model.cfm = vq_model.cfm.merge_and_unload()
+        vq_model.cfm = vq_model.cfm.merge_and_unload()  # pyright: ignore[reportAttributeAccessIssue, reportCallIssue]
         # torch.save(vq_model.state_dict(),"merge_win.pth")
         vq_model.eval()
 
+    vq_model = vq_model.to(device, dtype)
+
     yield (
         {"__type__": "update", "choices": list(dict_language.keys())},
         {"__type__": "update", "choices": list(dict_language.keys())},
-        prompt_text_update,
-        prompt_language_update,
-        text_update,
-        text_language_update,
+        prompt_text_update,  # type: ignore
+        prompt_language_update,  # type: ignore
+        text_update,  # type: ignore
+        text_language_update,  # type: ignore
         {
             "__type__": "update",
-            "visible": visible_sample_steps,
+            "visible": visible_sample_steps,  # type: ignore
             "value": 32 if model_version == "v3" else 8,
             "choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
         },
-        {"__type__": "update", "visible": visible_inp_refs},
+        {"__type__": "update", "visible": visible_inp_refs},  # type: ignore
         {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False},
         {"__type__": "update", "visible": True if model_version == "v3" else False},
         {"__type__": "update", "value": i18n("合成语音"), "interactive": True},
@@ -367,28 +423,33 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
         f.write(json.dumps(data))
 
 
-try:
+with contextlib.suppress(UnboundLocalError):
     next(change_sovits_weights(sovits_path))
-except:
-    pass
 
 
 def change_gpt_weights(gpt_path):
     if "！" in gpt_path or "!" in gpt_path:
         gpt_path = name2gpt_path[gpt_path]
-    global hz, max_sec, t2s_model, config
-    hz = 50
-    dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
-    config = dict_s1["config"]
-    max_sec = config["data"]["max_sec"]
-    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
-    t2s_model.load_state_dict(dict_s1["weight"])
-    if is_half == True:
-        t2s_model = t2s_model.half()
-    t2s_model = t2s_model.to(device)
-    t2s_model.eval()
-    # total = sum([param.nelement() for param in t2s_model.parameters()])
-    # print("Number of parameter: %.2fM" % (total / 1e6))
+    global t2s_engine, config
+
+    if "mlx" in ar_backend.lower():
+        t2s_engine = MLX.T2SEngineMLX(
+            MLX.T2SEngineMLX.load_decoder(Path(gpt_path), backend=ar_backend),
+            "mx.gpu",
+            dtype=dtype,
+        )
+        # t2s_engine.decoder_model.compile()
+        total = sum((p[-1].size for p in mxutils.tree_flatten(t2s_engine.decoder_model.parameters())))  # type: ignore
+    else:
+        t2s_engine = PyTorch.T2SEngineTorch(
+            PyTorch.T2SEngineTorch.load_decoder(Path(gpt_path), backend=ar_backend),
+            device,
+            dtype=dtype,
+        )
+        # t2s_engine.decoder_model.compile()
+        total = sum(p.numel() for p in t2s_engine.decoder_model.parameters())
+    print(">> Number of parameter: %.2fM" % (total / 1e6))
+
     with open("./weight.json") as f:
         data = f.read()
         data = json.loads(data)
@@ -398,43 +459,36 @@ def change_gpt_weights(gpt_path):
 
 
 change_gpt_weights(gpt_path)
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
-import torch
-
-now_dir = os.getcwd()
 
 
 def clean_hifigan_model():
     global hifigan_model
     if hifigan_model:
         hifigan_model = hifigan_model.cpu()
-        hifigan_model = None
-        try:
+        del hifigan_model
+        if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        except:
-            pass
+        hifigan_model = None
 
 
 def clean_bigvgan_model():
     global bigvgan_model
     if bigvgan_model:
         bigvgan_model = bigvgan_model.cpu()
-        bigvgan_model = None
-        try:
+        del bigvgan_model
+        if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        except:
-            pass
+        bigvgan_model = None
 
 
 def clean_sv_cn_model():
     global sv_cn_model
     if sv_cn_model:
         sv_cn_model.embedding_model = sv_cn_model.embedding_model.cpu()
-        sv_cn_model = None
-        try:
+        del sv_cn_model
+        if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        except:
-            pass
+        sv_cn_model = None
 
 
 def init_bigvgan():
@@ -442,18 +496,14 @@ def init_bigvgan():
     from BigVGAN import bigvgan
 
     bigvgan_model = bigvgan.BigVGAN.from_pretrained(
-        "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
+        "./GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
         use_cuda_kernel=False,
     )  # if True, RuntimeError: Ninja is required to load C++ extensions
     # remove weight norm in the model and set to eval mode
     bigvgan_model.remove_weight_norm()
-    bigvgan_model = bigvgan_model.eval()
+    bigvgan_model = bigvgan_model.to(device, dtype).eval()
     clean_hifigan_model()
     clean_sv_cn_model()
-    if is_half == True:
-        bigvgan_model = bigvgan_model.half().to(device)
-    else:
-        bigvgan_model = bigvgan_model.to(device)
 
 
 def init_hifigan():
@@ -472,20 +522,15 @@ def init_hifigan():
     hifigan_model.eval()
     hifigan_model.remove_weight_norm()
     state_dict_g = torch.load(
-        "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+        "./GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
         map_location="cpu",
         weights_only=False,
     )
-    print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
+    print(">> loading vocoder", hifigan_model.load_state_dict(state_dict_g))
     clean_bigvgan_model()
     clean_sv_cn_model()
-    if is_half == True:
-        hifigan_model = hifigan_model.half().to(device)
-    else:
-        hifigan_model = hifigan_model.to(device)
 
-
-from sv import SV
+    hifigan_model = hifigan_model.to(device, dtype)
 
 
 def init_sv_cn():
@@ -508,18 +553,13 @@ resample_transform_dict = {}
 
 def resample(audio_tensor, sr0, sr1, device):
     global resample_transform_dict
-    key = "%s-%s-%s" % (sr0, sr1, str(device))
+    key = f"{sr0}-{sr1}-{str(device)}"
     if key not in resample_transform_dict:
         resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
     return resample_transform_dict[key](audio_tensor)
 
 
 def get_spepc(hps, filename, dtype, device, is_v2pro=False):
-    # audio = load_audio(filename, int(hps.data.sampling_rate))
-
-    # audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate))
-    # audio = torch.FloatTensor(audio)
-
     sr1 = int(hps.data.sampling_rate)
     audio, sr0 = torchaudio.load(filename)
     if sr0 != sr1:
@@ -532,7 +572,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
         if audio.shape[0] == 2:
             audio = audio.mean(0).unsqueeze(0)
 
-    maxx = audio.abs().max()
+    maxx = float(audio.abs().max())
     if maxx > 1:
         audio /= min(2, maxx)
     spec = spectrogram_torch(
@@ -544,7 +584,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
         center=False,
     )
     spec = spec.to(dtype)
-    if is_v2pro == True:
+    if is_v2pro is True:
         audio = resample(audio, sr1, 16000, device).to(dtype)
     return spec, audio
 
@@ -556,9 +596,6 @@ def clean_text_inf(text, language, version):
     return phones, word2ph, norm_text
 
 
-dtype = torch.float16 if is_half == True else torch.float32
-
-
 def get_bert_inf(phones, word2ph, norm_text, language):
     language = language.replace("all_", "")
     if language == "zh":
@@ -566,58 +603,38 @@ def get_bert_inf(phones, word2ph, norm_text, language):
     else:
         bert = torch.zeros(
             (1024, len(phones)),
-            dtype=torch.float16 if is_half == True else torch.float32,
+            dtype=torch.float16 if is_half is True else torch.float32,
         ).to(device)
 
     return bert
 
 
-splits = {
-    "，",
-    "。",
-    "？",
-    "！",
-    ",",
-    ".",
-    "?",
-    "!",
-    "~",
-    ":",
-    "：",
-    "—",
-    "…",
-}
-
-
 def get_first(text):
     pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
     text = re.split(pattern, text)[0].strip()
     return text
 
 
-from text import chinese
-
-
 def get_phones_and_bert(text, language, version, final=False):
-    text = re.sub(r' {2,}', ' ', text)
+    text = re.sub(r" {2,}", " ", text)
     textlist = []
     langlist = []
     if language == "all_zh":
-        for tmp in LangSegmenter.getTexts(text,"zh"):
+        for tmp in LangSegmenter.getTexts(text, "zh"):
             langlist.append(tmp["lang"])
             textlist.append(tmp["text"])
     elif language == "all_yue":
-        for tmp in LangSegmenter.getTexts(text,"zh"):
+        for tmp in LangSegmenter.getTexts(text, "zh"):
             if tmp["lang"] == "zh":
                 tmp["lang"] = "yue"
             langlist.append(tmp["lang"])
             textlist.append(tmp["text"])
     elif language == "all_ja":
-        for tmp in LangSegmenter.getTexts(text,"ja"):
+        for tmp in LangSegmenter.getTexts(text, "ja"):
             langlist.append(tmp["lang"])
             textlist.append(tmp["text"])
     elif language == "all_ko":
-        for tmp in LangSegmenter.getTexts(text,"ko"):
+        for tmp in LangSegmenter.getTexts(text, "ko"):
             langlist.append(tmp["lang"])
             textlist.append(tmp["text"])
     elif language == "en":
@@ -667,48 +684,6 @@ def get_phones_and_bert(text, language, version, final=False):
     return phones, bert.to(dtype), norm_text
 
 
-from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
-
-spec_min = -12
-spec_max = 2
-
-
-def norm_spec(x):
-    return (x - spec_min) / (spec_max - spec_min) * 2 - 1
-
-
-def denorm_spec(x):
-    return (x + 1) / 2 * (spec_max - spec_min) + spec_min
-
-
-mel_fn = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1024,
-        "win_size": 1024,
-        "hop_size": 256,
-        "num_mels": 100,
-        "sampling_rate": 24000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
-mel_fn_v4 = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1280,
-        "win_size": 1280,
-        "hop_size": 320,
-        "num_mels": 100,
-        "sampling_rate": 32000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
-
-
 def merge_short_text_in_array(texts, threshold):
     if (len(texts)) < 2:
         return texts
@@ -732,7 +707,7 @@ sr_model = None
 
 def audio_sr(audio, sr):
     global sr_model
-    if sr_model == None:
+    if sr_model is None:
         from tools.audio_sr import AP_BWE
 
         try:
@@ -743,8 +718,6 @@ def audio_sr(audio, sr):
     return sr_model(audio, sr)
 
 
-##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature
-# cache_tokens={}#暂未实现清理机制
 cache = {}
 
 
@@ -794,17 +767,17 @@ def get_tts_wav(
         prompt_text = prompt_text.strip("\n")
         if prompt_text[-1] not in splits:
             prompt_text += "。" if prompt_language != "en" else "."
-        print(i18n("实际输入的参考文本:"), prompt_text)
+        print(">>", i18n("实际输入的参考文本:"), prompt_text)
     text = text.strip("\n")
     # if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
 
-    print(i18n("实际输入的目标文本:"), text)
+    print(">>", i18n("实际输入的目标文本:"), text)
     zero_wav = np.zeros(
         int(hps.data.sampling_rate * pause_second),
-        dtype=np.float16 if is_half == True else np.float32,
+        dtype=np.float16 if is_half is True else np.float32,
     )
     zero_wav_torch = torch.from_numpy(zero_wav)
-    if is_half == True:
+    if is_half is True:
         zero_wav_torch = zero_wav_torch.half().to(device)
     else:
         zero_wav_torch = zero_wav_torch.to(device)
@@ -815,7 +788,7 @@ def get_tts_wav(
                 gr.Warning(i18n("参考音频在3~10秒范围外，请更换！"))
                 raise OSError(i18n("参考音频在3~10秒范围外，请更换！"))
             wav16k = torch.from_numpy(wav16k)
-            if is_half == True:
+            if is_half is True:
                 wav16k = wav16k.half().to(device)
             else:
                 wav16k = wav16k.to(device)
@@ -824,6 +797,8 @@ def get_tts_wav(
             codes = vq_model.extract_latent(ssl_content)
             prompt_semantic = codes[0, 0]
             prompt = prompt_semantic.unsqueeze(0).to(device)
+    else:
+        prompt = torch.zeros((1, 0)).to(device, torch.int32)
 
     t1 = ttime()
     t.append(t1 - t0)
@@ -840,14 +815,19 @@ def get_tts_wav(
         text = cut5(text)
     while "\n\n" in text:
         text = text.replace("\n\n", "\n")
-    print(i18n("实际输入的目标文本(切句后):"), text)
+    print(">>", i18n("实际输入的目标文本(切句后):"), text)
     texts = text.split("\n")
     texts = process_text(texts)
     texts = merge_short_text_in_array(texts, 5)
     audio_opt = []
     ###s2v3暂不支持ref_free
     if not ref_free:
-        phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language, version)
+        phones1, bert1, _ = get_phones_and_bert(prompt_text, prompt_language, version)
+    else:
+        phones1, bert1 = [], torch.zeros(1024, 0)
+
+    infer_len: list[int] = []
+    infer_time: list[float] = []
 
     for i_text, text in enumerate(texts):
         # 解决输入目标文本的空行导致报错的问题
@@ -855,76 +835,89 @@ def get_tts_wav(
             continue
         if text[-1] not in splits:
             text += "。" if text_language != "en" else "."
-        print(i18n("实际输入的目标文本(每句):"), text)
+        print(">>", i18n("实际输入的目标文本(每句):"), text)
         phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version)
-        print(i18n("前端处理后的文本(每句):"), norm_text2)
-        if not ref_free:
-            bert = torch.cat([bert1, bert2], 1)
-            all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
-        else:
-            bert = bert2
-            all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0)
+        print(">>", i18n("前端处理后的文本(每句):"), norm_text2)
+
+        bert = torch.cat([bert1, bert2], 1)
+        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
 
         bert = bert.to(device).unsqueeze(0)
         all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
 
         t2 = ttime()
-        # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature)
-        # print(cache.keys(),if_freeze)
-        if i_text in cache and if_freeze == True:
+        if i_text in cache and if_freeze is True:
             pred_semantic = cache[i_text]
         else:
-            with torch.no_grad():
-                pred_semantic, idx = t2s_model.model.infer_panel(
-                    all_phoneme_ids,
-                    all_phoneme_len,
-                    None if ref_free else prompt,
-                    bert,
-                    # prompt_phone_len=ph_offset,
-                    top_k=top_k,
-                    top_p=top_p,
-                    temperature=temperature,
-                    early_stop_num=hz * max_sec,
-                )
-                pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
-                cache[i_text] = pred_semantic
+            t2s_request = T2SRequest(
+                [all_phoneme_ids.squeeze(0)],
+                all_phoneme_len,
+                prompt,
+                [bert.squeeze(0)],
+                valid_length=1,
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature,
+                early_stop_num=1500,
+                use_cuda_graph=torch.cuda.is_available(),
+                # debug=True,
+            )
+            t2s_result = t2s_engine.generate(t2s_request)
+            if t2s_result.exception is not None:
+                print(t2s_result.traceback)
+                raise RuntimeError()
+            pred_semantic_list = t2s_result.result
+            assert pred_semantic_list, t2s_result.traceback
+            pred_semantic = pred_semantic_list[0].unsqueeze(0)
+            infer_len.append(pred_semantic.shape[-1])
+            infer_time.append(t2s_result.infer_speed[-1])
+
+            cache[i_text] = pred_semantic
         t3 = ttime()
         is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
-        # print(23333,is_v2pro,model_version)
         ###v3不存在以下逻辑和inp_refs
+        sv_emb = []
         if model_version not in v3v4set:
             refers = []
-            if is_v2pro:
-                sv_emb = []
-                if sv_cn_model == None:
-                    init_sv_cn()
+            if is_v2pro and sv_cn_model is None:
+                init_sv_cn()
             if inp_refs:
                 for path in inp_refs:
                     try:  #####这里加上提取sv的逻辑，要么一堆sv一堆refer，要么单个sv单个refer
                         refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
                         refers.append(refer)
                         if is_v2pro:
+                            assert sv_cn_model
                             sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
-                    except:
+                    except Exception as e:
+                        print(e)
                         traceback.print_exc()
             if len(refers) == 0:
                 refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
                 refers = [refers]
                 if is_v2pro:
+                    assert sv_cn_model
                     sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
             if is_v2pro:
                 audio = vq_model.decode(
-                    pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb
-                )[0][0]
+                    pred_semantic,
+                    torch.LongTensor(phones2).to(device).unsqueeze(0),
+                    refers,
+                    speed=speed,
+                    sv_emb=sv_emb,
+                )[0][0]  # type: ignore
             else:
                 audio = vq_model.decode(
-                    pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
-                )[0][0]
+                    pred_semantic,
+                    torch.LongTensor(phones2).to(device).unsqueeze(0),
+                    refers,
+                    speed=speed,
+                )[0][0]  # type: ignore
         else:
             refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
             phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
             phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0)
-            fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
+            fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)  # type: ignore
             ref_audio, sr = torchaudio.load(ref_wav_path)
             ref_audio = ref_audio.to(device).float()
             if ref_audio.shape[0] == 2:
@@ -932,7 +925,6 @@ def get_tts_wav(
             tgt_sr = 24000 if model_version == "v3" else 32000
             if sr != tgt_sr:
                 ref_audio = resample(ref_audio, sr, tgt_sr, device)
-            # print("ref_audio",ref_audio.abs().mean())
             mel2 = mel_fn(ref_audio) if model_version == "v3" else mel_fn_v4(ref_audio)
             mel2 = norm_spec(mel2)
             T_min = min(mel2.shape[2], fea_ref.shape[2])
@@ -946,7 +938,7 @@ def get_tts_wav(
                 T_min = Tref
             chunk_len = Tchunk - T_min
             mel2 = mel2.to(dtype)
-            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
+            fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)  # type: ignore
             cfm_resss = []
             idx = 0
             while 1:
@@ -955,7 +947,7 @@ def get_tts_wav(
                     break
                 idx += chunk_len
                 fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
-                cfm_res = vq_model.cfm.inference(
+                cfm_res = vq_model.cfm.inference(  # type: ignore
                     fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
                 )
                 cfm_res = cfm_res[:, :, mel2.shape[2] :]
@@ -972,7 +964,7 @@ def get_tts_wav(
                     init_hifigan()
             vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model
             with torch.inference_mode():
-                wav_gen = vocoder_model(cfm_res)
+                wav_gen = vocoder_model(cfm_res)  # type: ignore
                 audio = wav_gen[0][0]  # .cpu().detach().numpy()
         max_audio = torch.abs(audio).max()  # 简单防止16bit爆音
         if max_audio > 1:
@@ -982,7 +974,7 @@ def get_tts_wav(
         t4 = ttime()
         t.extend([t2 - t1, t3 - t2, t4 - t3])
         t1 = ttime()
-    print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
+
     audio_opt = torch.cat(audio_opt, 0)  # np.concatenate
     if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
         opt_sr = 32000
@@ -990,14 +982,30 @@ def get_tts_wav(
         opt_sr = 24000
     else:
         opt_sr = 48000  # v4
-    if if_sr == True and opt_sr == 24000:
-        print(i18n("音频超分中"))
+    if if_sr is True and opt_sr == 24000:
+        print(">>", i18n("音频超分中"))
         audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
         max_audio = np.abs(audio_opt).max()
         if max_audio > 1:
             audio_opt /= max_audio
     else:
         audio_opt = audio_opt.cpu().detach().numpy()
+
+    t0 = t[0]
+    t1 = sum(t[1::3])
+    t2 = sum(t[2::3])
+    t3 = sum(t[3::3])
+
+    infer_speed_avg = sum(infer_len) / sum(infer_time)
+    rtf_value = sum(t) / (audio_opt.__len__() / opt_sr)
+
+    print(f">> Time Stamps: {t0:.3f}\t{t1:.3f}\t{t2:.3f}\t{t3:.3f}")
+    print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
+    print(f">> RTF: {rtf_value:.2f}")
+
+    gr.Info(f"{infer_speed_avg:.2f} Token/s", title="Infer Speed")
+    gr.Info(f"{rtf_value:.2f}", title="RTF")
+
     yield opt_sr, (audio_opt * 32767).astype(np.int16)
 
 
@@ -1052,7 +1060,6 @@ def cut2(inp):
             tmp_str = ""
     if tmp_str != "":
         opts.append(tmp_str)
-    # print(opts)
     if len(opts) > 1 and len(opts[-1]) < 50:  ##如果最后一个太短了，和前一个合一起
         opts[-2] = opts[-2] + opts[-1]
         opts = opts[:-1]
@@ -1062,7 +1069,7 @@ def cut2(inp):
 
 def cut3(inp):
     inp = inp.strip("\n")
-    opts = ["%s" % item for item in inp.strip("。").split("。")]
+    opts = inp.strip("。").split("。")
     opts = [item for item in opts if not set(item).issubset(punctuation)]
     return "\n".join(opts)
 
@@ -1141,7 +1148,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
     )
     with gr.Group():
         gr.Markdown(html_center(i18n("模型切换"), "h3"))
-        with gr.Row():
+        with gr.Row(equal_height=True):
             GPT_dropdown = gr.Dropdown(
                 label=i18n("GPT模型列表"),
                 choices=sorted(GPT_names, key=custom_sort_key),
@@ -1159,8 +1166,15 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
             refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14)
             refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
         gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3"))
-        with gr.Row():
-            inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频，超过会报错！"), type="filepath", scale=13)
+        with gr.Row(equal_height=True):
+            inp_ref = gr.Audio(
+                label=i18n("请上传3~10秒内参考音频，超过会报错！"),
+                type="filepath",
+                sources="upload",
+                scale=13,
+                editable=False,
+                waveform_options={"show_recording_waveform": False},
+            )
             with gr.Column(scale=13):
                 ref_text_free = gr.Checkbox(
                     label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")
@@ -1223,7 +1237,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                     visible=False if model_version != "v3" else True,
                 )
         gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
-        with gr.Row():
+        with gr.Row(equal_height=True):
             with gr.Column(scale=13):
                 text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26)
             with gr.Column(scale=7):
@@ -1255,7 +1269,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                     show_label=True,
                     scale=1,
                 )
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     speed = gr.Slider(
                         minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1
                     )
@@ -1278,13 +1292,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                 temperature = gr.Slider(
                     minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
                 )
-            # with gr.Column():
-            #     gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理，无视目标文本框。"))
-            #     phoneme=gr.Textbox(label=i18n("音素框"), value="")
-            #     get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
-        with gr.Row():
+        with gr.Row(equal_height=True):
             inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25)
-            output = gr.Audio(label=i18n("输出的语音"), scale=14)
+            output = gr.Audio(
+                label=i18n("输出的语音"),
+                scale=14,
+                waveform_options={"show_recording_waveform": False},
+                editable=False,
+            )
 
         inference_button.click(
             get_tts_wav,
@@ -1327,24 +1342,9 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
         )
         GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
 
-        # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好，所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
-        # with gr.Row():
-        #     text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
-        #     button1 = gr.Button(i18n("凑四句一切"), variant="primary")
-        #     button2 = gr.Button(i18n("凑50字一切"), variant="primary")
-        #     button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
-        #     button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
-        #     button5 = gr.Button(i18n("按标点符号切"), variant="primary")
-        #     text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
-        #     button1.click(cut1, [text_inp], [text_opt])
-        #     button2.click(cut2, [text_inp], [text_opt])
-        #     button3.click(cut3, [text_inp], [text_opt])
-        #     button4.click(cut4, [text_inp], [text_opt])
-        #     button5.click(cut5, [text_inp], [text_opt])
-        # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")))
 
 if __name__ == "__main__":
-    app.queue().launch(  # concurrency_count=511, max_size=1022
+    app.queue(api_open=False, default_concurrency_limit=511).launch(  # concurrency_count=511, max_size=1022
         server_name="0.0.0.0",
         inbrowser=True,
         share=is_share,
diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py
index 51a120f1..92315981 100644
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@@ -315,7 +315,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
     with gr.Column():
         # with gr.Group():
         gr.Markdown(value=i18n("模型切换"))
-        with gr.Row():
+        with gr.Row(equal_height=True):
             GPT_dropdown = gr.Dropdown(
                 label=i18n("GPT模型列表"),
                 choices=sorted(GPT_names, key=custom_sort_key),
@@ -331,18 +331,22 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
             refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
             refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
 
-    with gr.Row():
+    with gr.Row(equal_height=True):
         with gr.Column():
             gr.Markdown(value=i18n("*请上传并填写参考信息"))
-            with gr.Row():
-                inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频，超过会报错！)"), type="filepath")
+            with gr.Row(equal_height=True):
+                inp_ref = gr.Audio(
+                    label=i18n("主参考音频(请上传3~10秒内参考音频，超过会报错！)"),
+                    type="filepath",
+                    waveform_options={"show_recording_waveform": False},
+                )
                 inp_refs = gr.File(
                     label=i18n("辅参考音频(可选多个，或不选)"),
                     file_count="multiple",
                     visible=True if model_version != "v3" else False,
                 )
             prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
-            with gr.Row():
+            with gr.Row(equal_height=True):
                 prompt_language = gr.Dropdown(
                     label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
                 )
@@ -368,26 +372,26 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
 
     with gr.Group():
         gr.Markdown(value=i18n("推理设置"))
-        with gr.Row():
+        with gr.Row(equal_height=True):
             with gr.Column():
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     batch_size = gr.Slider(
                         minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True
                     )
                     sample_steps = gr.Radio(
                         label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32, 64, 128], visible=True
                     )
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     fragment_interval = gr.Slider(
                         minimum=0.01, maximum=1, step=0.01, label=i18n("分段间隔(秒)"), value=0.3, interactive=True
                     )
                     speed_factor = gr.Slider(
                         minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
                     )
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
                     top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     temperature = gr.Slider(
                         minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True
                     )
@@ -396,7 +400,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                     )
 
             with gr.Column():
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     how_to_cut = gr.Dropdown(
                         label=i18n("怎么切"),
                         choices=[
@@ -415,7 +419,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                         label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True
                     )
 
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     parallel_infer = gr.Checkbox(label=i18n("并行推理"), value=True, interactive=True, show_label=True)
                     split_bucket = gr.Checkbox(
                         label=i18n("数据分桶(并行推理时会降低一点计算量)"),
@@ -424,12 +428,15 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                         show_label=True,
                     )
 
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     seed = gr.Number(label=i18n("随机种子"), value=-1)
                     keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True)
 
-                output = gr.Audio(label=i18n("输出的语音"))
-                with gr.Row():
+                output = gr.Audio(
+                    label=i18n("输出的语音"),
+                    waveform_options={"show_recording_waveform": False},
+                )
+                with gr.Row(equal_height=True):
                     inference_button = gr.Button(i18n("合成语音"), variant="primary")
                     stop_infer = gr.Button(i18n("终止合成"), variant="primary")
 
@@ -485,7 +492,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                 "文本切分工具。太长的文本合成出来效果不一定好，所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"
             )
         )
-        with gr.Row():
+        with gr.Row(equal_height=True):
             text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="", lines=4)
             with gr.Column():
                 _how_to_cut = gr.Radio(
diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py
index 1176f0bc..e05dff93 100644
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@@ -1,29 +1,32 @@
 # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py
-import os
-
-if "_CUDA_VISIBLE_DEVICES" in os.environ:
-    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 import argparse
 import logging
+import os
 import platform
+from collections import OrderedDict
 from pathlib import Path
 
 import torch
-from AR.data.data_module import Text2SemanticDataModule
-from AR.models.t2s_lightning_module import Text2SemanticLightningModule
-from AR.utils.io import load_yaml_config
 from pytorch_lightning import Trainer, seed_everything
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger  # WandbLogger
-from pytorch_lightning.strategies import DDPStrategy
+from pytorch_lightning.strategies import DDPStrategy, SingleDeviceStrategy
+
+from GPT_SoVITS.AR.data.data_module import Text2SemanticDataModule
+from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from GPT_SoVITS.AR.utils import get_newest_ckpt
+from GPT_SoVITS.AR.utils.io import load_yaml_config
+from GPT_SoVITS.process_ckpt import my_save
 
 logging.getLogger("numba").setLevel(logging.WARNING)
 logging.getLogger("matplotlib").setLevel(logging.WARNING)
 torch.set_float32_matmul_precision("high")
-from collections import OrderedDict
 
-from AR.utils import get_newest_ckpt
-from process_ckpt import my_save
+
+if "_CUDA_VISIBLE_DEVICES" in os.environ:
+    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
+os.environ["MASTER_ADDR"] = "localhost"
+os.environ["USE_LIBUV"] = "0"
 
 
 class my_model_ckpt(ModelCheckpoint):
@@ -49,35 +52,30 @@ class my_model_ckpt(ModelCheckpoint):
             monitor_candidates = self._monitor_candidates(trainer)
             if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0:
                 if (
-                    self.if_save_latest == True
+                    self.if_save_latest is True
                 ):  ####如果设置只保存最后一个ckpt，在保存下一个ckpt后要清理掉之前的所有ckpt
                     to_clean = list(os.listdir(self.dirpath))
                 self._save_topk_checkpoint(trainer, monitor_candidates)
-                if self.if_save_latest == True:
+                if self.if_save_latest is True:
                     for name in to_clean:
                         try:
-                            os.remove("%s/%s" % (self.dirpath, name))
-                        except:
+                            os.remove(f"{self.dirpath}/{name}")
+                        except Exception as _:
                             pass
-                if self.if_save_every_weights == True:
+                if self.if_save_every_weights is True:
                     to_save_od = OrderedDict()
                     to_save_od["weight"] = OrderedDict()
                     dictt = trainer.strategy._lightning_module.state_dict()
                     for key in dictt:
                         to_save_od["weight"][key] = dictt[key].half()
                     to_save_od["config"] = self.config
-                    to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1)
+                    to_save_od["info"] = f"GPT-e{trainer.current_epoch + 1}"
                     # torch.save(
                     # print(os.environ)
                     if os.environ.get("LOCAL_RANK", "0") == "0":
                         my_save(
                             to_save_od,
-                            "%s/%s-e%s.ckpt"
-                            % (
-                                self.half_weights_save_dir,
-                                self.exp_name,
-                                trainer.current_epoch + 1,
-                            ),
+                            f"{self.half_weights_save_dir}/{self.exp_name}-e{trainer.current_epoch + 1}.ckpt",
                         )
             self._save_last_checkpoint(trainer, monitor_candidates)
 
@@ -91,6 +89,14 @@ def main(args):
     ckpt_dir = output_dir / "ckpt"
     ckpt_dir.mkdir(parents=True, exist_ok=True)
 
+    if torch.cuda.is_available():
+        if torch.cuda.device_count() > 1:
+            strategy = DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo")
+        else:
+            strategy = SingleDeviceStrategy("cuda")
+    else:
+        strategy = SingleDeviceStrategy("cpu")
+
     seed_everything(config["train"]["seed"], workers=True)
     ckpt_callback: ModelCheckpoint = my_model_ckpt(
         config=config,
@@ -106,8 +112,7 @@ def main(args):
         dirpath=ckpt_dir,
     )
     logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir)
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["USE_LIBUV"] = "0"
+
     trainer: Trainer = Trainer(
         max_epochs=config["train"]["epochs"],
         accelerator="gpu" if torch.cuda.is_available() else "cpu",
@@ -117,9 +122,7 @@ def main(args):
         devices=-1 if torch.cuda.is_available() else 1,
         benchmark=False,
         fast_dev_run=False,
-        strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo")
-        if torch.cuda.is_available()
-        else "auto",
+        strategy=strategy,
         precision=config["train"]["precision"],
         logger=logger,
         num_sanity_val_steps=0,
diff --git a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
index 99b3a422..17e70689 100644
--- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
+++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
@@ -1,40 +1,41 @@
 import logging
 import re
+from pathlib import Path
 
-# jieba静音
+import fast_langdetect
 import jieba
+from split_lang import LangSplitter
+
 jieba.setLogLevel(logging.CRITICAL)
 
-# 更改fast_langdetect大模型位置
-from pathlib import Path
-import fast_langdetect
-fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
-
-
-from split_lang import LangSplitter
+fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
+    fast_langdetect.infer.LangDetectConfig(
+        cache_dir=str(Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect")
+    )
+)
 
 
 def full_en(text):
-    pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
+    pattern = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
     return bool(re.match(pattern, text))
 
 
 def full_cjk(text):
     # 来自wiki
     cjk_ranges = [
-        (0x4E00, 0x9FFF),        # CJK Unified Ideographs
-        (0x3400, 0x4DB5),        # CJK Extension A
-        (0x20000, 0x2A6DD),      # CJK Extension B
-        (0x2A700, 0x2B73F),      # CJK Extension C
-        (0x2B740, 0x2B81F),      # CJK Extension D
-        (0x2B820, 0x2CEAF),      # CJK Extension E
-        (0x2CEB0, 0x2EBEF),      # CJK Extension F
-        (0x30000, 0x3134A),      # CJK Extension G
-        (0x31350, 0x323AF),      # CJK Extension H
-        (0x2EBF0, 0x2EE5D),      # CJK Extension H
+        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
+        (0x3400, 0x4DB5),  # CJK Extension A
+        (0x20000, 0x2A6DD),  # CJK Extension B
+        (0x2A700, 0x2B73F),  # CJK Extension C
+        (0x2B740, 0x2B81F),  # CJK Extension D
+        (0x2B820, 0x2CEAF),  # CJK Extension E
+        (0x2CEB0, 0x2EBEF),  # CJK Extension F
+        (0x30000, 0x3134A),  # CJK Extension G
+        (0x31350, 0x323AF),  # CJK Extension H
+        (0x2EBF0, 0x2EE5D),  # CJK Extension H
     ]
 
-    pattern = r'[0-9、-〜。！？.!?… /]+$'
+    pattern = r"[0-9、-〜。！？.!?… /]+$"
 
     cjk_text = ""
     for char in text:
@@ -45,7 +46,7 @@ def full_cjk(text):
     return cjk_text
 
 
-def split_jako(tag_lang,item):
+def split_jako(tag_lang, item):
     if tag_lang == "ja":
         pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。！？.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
     else:
@@ -53,41 +54,42 @@ def split_jako(tag_lang,item):
 
     lang_list: list[dict] = []
     tag = 0
-    for match in re.finditer(pattern, item['text']):
+    for match in re.finditer(pattern, item["text"]):
         if match.start() > tag:
-            lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
+            lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
 
         tag = match.end()
-        lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
+        lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
 
-    if tag < len(item['text']):
-        lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
+    if tag < len(item["text"]):
+        lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
 
     return lang_list
 
 
 def merge_lang(lang_list, item):
-    if lang_list and item['lang'] == lang_list[-1]['lang']:
-        lang_list[-1]['text'] += item['text']
+    if lang_list and item["lang"] == lang_list[-1]["lang"]:
+        lang_list[-1]["text"] += item["text"]
     else:
         lang_list.append(item)
     return lang_list
 
 
-class LangSegmenter():
+class LangSegmenter:
     # 默认过滤器, 基于gsv目前四种语言
     DEFAULT_LANG_MAP = {
         "zh": "zh",
         "yue": "zh",  # 粤语
         "wuu": "zh",  # 吴语
         "zh-cn": "zh",
-        "zh-tw": "x", # 繁体设置为x
+        "zh-tw": "x",  # 繁体设置为x
         "ko": "ko",
         "ja": "ja",
         "en": "en",
     }
 
-    def getTexts(text,default_lang = ""):
+    @staticmethod
+    def getTexts(text, default_lang=""):
         lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
         lang_splitter.merge_across_digit = False
         substr = lang_splitter.split_by_lang(text=text)
@@ -97,31 +99,31 @@ class LangSegmenter():
         have_num = False
 
         for _, item in enumerate(substr):
-            dict_item = {'lang':item.lang,'text':item.text}
+            dict_item = {"lang": item.lang, "text": item.text}
 
-            if dict_item['lang'] == 'digit':
+            if dict_item["lang"] == "digit":
                 if default_lang != "":
-                    dict_item['lang'] = default_lang
+                    dict_item["lang"] = default_lang
                 else:
                     have_num = True
-                lang_list = merge_lang(lang_list,dict_item)
+                lang_list = merge_lang(lang_list, dict_item)
                 continue
 
             # 处理短英文被识别为其他语言的问题
-            if full_en(dict_item['text']):  
-                dict_item['lang'] = 'en'
-                lang_list = merge_lang(lang_list,dict_item)
+            if full_en(dict_item["text"]):
+                dict_item["lang"] = "en"
+                lang_list = merge_lang(lang_list, dict_item)
                 continue
 
             if default_lang != "":
-                dict_item['lang'] = default_lang
-                lang_list = merge_lang(lang_list,dict_item)
+                dict_item["lang"] = default_lang
+                lang_list = merge_lang(lang_list, dict_item)
                 continue
             else:
                 # 处理非日语夹日文的问题(不包含CJK)
                 ja_list: list[dict] = []
-                if dict_item['lang'] != 'ja':
-                    ja_list = split_jako('ja',dict_item)
+                if dict_item["lang"] != "ja":
+                    ja_list = split_jako("ja", dict_item)
 
                 if not ja_list:
                     ja_list.append(dict_item)
@@ -130,8 +132,8 @@ class LangSegmenter():
                 ko_list: list[dict] = []
                 temp_list: list[dict] = []
                 for _, ko_item in enumerate(ja_list):
-                    if ko_item["lang"] != 'ko':
-                        ko_list = split_jako('ko',ko_item)
+                    if ko_item["lang"] != "ko":
+                        ko_list = split_jako("ko", ko_item)
 
                     if ko_list:
                         temp_list.extend(ko_list)
@@ -141,77 +143,76 @@ class LangSegmenter():
                 # 未存在非日韩文夹日韩文
                 if len(temp_list) == 1:
                     # 未知语言检查是否为CJK
-                    if dict_item['lang'] == 'x':
-                        cjk_text = full_cjk(dict_item['text'])
+                    if dict_item["lang"] == "x":
+                        cjk_text = full_cjk(dict_item["text"])
                         if cjk_text:
-                            dict_item = {'lang':'zh','text':cjk_text}
-                            lang_list = merge_lang(lang_list,dict_item)
+                            dict_item = {"lang": "zh", "text": cjk_text}
+                            lang_list = merge_lang(lang_list, dict_item)
                         else:
-                            lang_list = merge_lang(lang_list,dict_item)
+                            lang_list = merge_lang(lang_list, dict_item)
                         continue
                     else:
-                        lang_list = merge_lang(lang_list,dict_item)
+                        lang_list = merge_lang(lang_list, dict_item)
                         continue
 
                 # 存在非日韩文夹日韩文
                 for _, temp_item in enumerate(temp_list):
                     # 未知语言检查是否为CJK
-                    if temp_item['lang'] == 'x':
-                        cjk_text = full_cjk(temp_item['text'])
+                    if temp_item["lang"] == "x":
+                        cjk_text = full_cjk(temp_item["text"])
                         if cjk_text:
-                            lang_list = merge_lang(lang_list,{'lang':'zh','text':cjk_text})
+                            lang_list = merge_lang(lang_list, {"lang": "zh", "text": cjk_text})
                         else:
-                            lang_list = merge_lang(lang_list,temp_item)
+                            lang_list = merge_lang(lang_list, temp_item)
                     else:
-                        lang_list = merge_lang(lang_list,temp_item)
+                        lang_list = merge_lang(lang_list, temp_item)
 
         # 有数字
         if have_num:
             temp_list = lang_list
             lang_list = []
             for i, temp_item in enumerate(temp_list):
-                if temp_item['lang'] == 'digit':
+                if temp_item["lang"] == "digit":
                     if default_lang:
-                        temp_item['lang'] = default_lang
+                        temp_item["lang"] = default_lang
                     elif lang_list and i == len(temp_list) - 1:
-                        temp_item['lang'] = lang_list[-1]['lang']
+                        temp_item["lang"] = lang_list[-1]["lang"]
                     elif not lang_list and i < len(temp_list) - 1:
-                        temp_item['lang'] = temp_list[1]['lang']
+                        temp_item["lang"] = temp_list[1]["lang"]
                     elif lang_list and i < len(temp_list) - 1:
-                        if lang_list[-1]['lang'] == temp_list[i + 1]['lang']:
-                            temp_item['lang'] = lang_list[-1]['lang']
-                        elif lang_list[-1]['text'][-1] in [",",".","!","?","，","。","！","？"]:
-                            temp_item['lang'] = temp_list[i + 1]['lang']
-                        elif temp_list[i + 1]['text'][0] in [",",".","!","?","，","。","！","？"]:
-                            temp_item['lang'] = lang_list[-1]['lang']
-                        elif temp_item['text'][-1] in ["。","."]:
-                            temp_item['lang'] = lang_list[-1]['lang']
-                        elif len(lang_list[-1]['text']) >= len(temp_list[i + 1]['text']):
-                            temp_item['lang'] = lang_list[-1]['lang']
+                        if lang_list[-1]["lang"] == temp_list[i + 1]["lang"]:
+                            temp_item["lang"] = lang_list[-1]["lang"]
+                        elif lang_list[-1]["text"][-1] in [",", ".", "!", "?", "，", "。", "！", "？"]:
+                            temp_item["lang"] = temp_list[i + 1]["lang"]
+                        elif temp_list[i + 1]["text"][0] in [",", ".", "!", "?", "，", "。", "！", "？"]:
+                            temp_item["lang"] = lang_list[-1]["lang"]
+                        elif temp_item["text"][-1] in ["。", "."]:
+                            temp_item["lang"] = lang_list[-1]["lang"]
+                        elif len(lang_list[-1]["text"]) >= len(temp_list[i + 1]["text"]):
+                            temp_item["lang"] = lang_list[-1]["lang"]
                         else:
-                            temp_item['lang'] = temp_list[i + 1]['lang']
+                            temp_item["lang"] = temp_list[i + 1]["lang"]
                     else:
-                        temp_item['lang'] = 'zh'
-
-                lang_list = merge_lang(lang_list,temp_item)
+                        temp_item["lang"] = "zh"
 
+                lang_list = merge_lang(lang_list, temp_item)
 
         # 筛X
         temp_list = lang_list
         lang_list = []
         for _, temp_item in enumerate(temp_list):
-            if temp_item['lang'] == 'x':
+            if temp_item["lang"] == "x":
                 if lang_list:
-                    temp_item['lang'] = lang_list[-1]['lang']
+                    temp_item["lang"] = lang_list[-1]["lang"]
                 elif len(temp_list) > 1:
-                    temp_item['lang'] = temp_list[1]['lang']
+                    temp_item["lang"] = temp_list[1]["lang"]
                 else:
-                    temp_item['lang'] = 'zh'
+                    temp_item["lang"] = "zh"
 
-            lang_list = merge_lang(lang_list,temp_item)
+            lang_list = merge_lang(lang_list, temp_item)
 
         return lang_list
-    
+
 
 if __name__ == "__main__":
     text = "MyGO?,你也喜欢まいご吗？"
@@ -221,5 +222,5 @@ if __name__ == "__main__":
     print(LangSegmenter.getTexts(text))
 
     text = "当时ThinkPad T60刚刚发布，一同推出的还有一款名为Advanced Dock的扩展坞配件。这款扩展坞通过连接T60底部的插槽，扩展出包括PCIe在内的一大堆接口，并且自带电源，让T60可以安装桌面显卡来提升性能。"
-    print(LangSegmenter.getTexts(text,"zh"))
-    print(LangSegmenter.getTexts(text))
\ No newline at end of file
+    print(LangSegmenter.getTexts(text, "zh"))
+    print(LangSegmenter.getTexts(text))
diff --git a/README.md b/README.md
index 86d50ea2..8dd4ea7f 100644
--- a/README.md
+++ b/README.md
@@ -248,13 +248,13 @@ if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-
 #### Others
 
 ```bash
-python webui.py <language(optional)>
+PYTHONPATH=. python webui.py <language(optional)>
 ```
 
 if you want to switch to V1,then
 
 ```bash
-python webui.py v1 <language(optional)>
+PYTHONPATH=. python webui.py v1 <language(optional)>
 ```
 
 Or maunally switch version in WebUI
@@ -285,7 +285,7 @@ python GPT_SoVITS/inference_webui.py <language(optional)>
 OR
 
 ```bash
-python webui.py
+PYTHONPATH=. python webui.py
 ```
 
 then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
diff --git a/config.py b/config.py
index fdc11c0a..879c90d6 100644
--- a/config.py
+++ b/config.py
@@ -161,7 +161,7 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
     is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
     if mem_gb < 4 or sm_version < 5.3:
         return cpu, torch.float32, 0.0, 0.0
-    if sm_version == 6.1 or is_16_series == True:
+    if sm_version == 6.1 or is_16_series is True:
         return cuda, torch.float32, sm_version, mem_gb
     if sm_version > 6.1:
         return cuda, torch.float16, sm_version, mem_gb
@@ -216,3 +216,22 @@ class Config:
         self.webui_port_subfix = webui_port_subfix
 
         self.api_port = api_port
+
+
+def get_implement(device: torch.device):
+    if torch.cuda.is_available():
+        idx = device.index
+        capability = torch.cuda.get_device_capability(idx)
+        major, minor = capability
+        sm_version = major + minor / 10.0
+        if sm_version >= 7.5:
+            return "flash_attn"
+        else:
+            if sys.platform == "linux":
+                return "sage_attn"
+            else:
+                return "naive"
+    elif torch.mps.is_available():
+        return "mlx"
+    else:
+        return "naive"
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 793734d8..a297cdf8 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -236,13 +236,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
 #### 其他
 
 ```bash
-python webui.py <language(optional)>
+PYTHONPATH=. python webui.py <language(optional)>
 ```
 
 若想使用 V1,则
 
 ```bash
-python webui.py v1 <language(optional)>
+PYTHONPATH=. python webui.py v1 <language(optional)>
 ```
 
 或者在 webUI 内动态切换
@@ -273,7 +273,7 @@ python GPT_SoVITS/inference_webui.py <language(optional)>
 或者
 
 ```bash
-python webui.py
+PYTHONPATH=. python webui.py
 ```
 
 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
diff --git a/docs/ja/README.md b/docs/ja/README.md
index 1ee9abb4..b4761263 100644
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@@ -222,13 +222,13 @@ V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
 #### その他
 
 ```bash
-python webui.py <言語(オプション)>
+PYTHONPATH=. python webui.py <言語(オプション)>
 ```
 
 V1 に切り替えたい場合は
 
 ```bash
-python webui.py v1 <言語(オプション)>
+PYTHONPATH=. python webui.py v1 <言語(オプション)>
 ```
 
 または WebUI で手動でバージョンを切り替えてください.
@@ -259,7 +259,7 @@ python GPT_SoVITS/inference_webui.py <言語(オプション)>
 または
 
 ```bash
-python webui.py
+PYTHONPATH=. python webui.py
 ```
 
 その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
diff --git a/docs/ko/README.md b/docs/ko/README.md
index 9ff32f83..e69927ed 100644
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@@ -228,13 +228,13 @@ V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-
 #### 기타
 
 ```bash
-python webui.py <언어(옵션)>
+PYTHONPATH=. python webui.py <언어(옵션)>
 ```
 
 V1으로 전환하려면,
 
 ```bash
-python webui.py v1 <언어(옵션)>
+PYTHONPATH=. python webui.py v1 <언어(옵션)>
 ```
 
 또는 WebUI에서 수동으로 버전을 전환하십시오.
@@ -265,7 +265,7 @@ python GPT_SoVITS/inference_webui.py <언어(옵션)>
 또는
 
 ```bash
-python webui.py
+PYTHONPATH=. python webui.py
 ```
 
 그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
diff --git a/docs/tr/README.md b/docs/tr/README.md
index b80c764b..e93b3746 100644
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@@ -229,13 +229,13 @@ V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `
 #### Diğerleri
 
 ```bash
-python webui.py <dil(isteğe bağlı)>
+PYTHONPATH=. python webui.py <dil(isteğe bağlı)>
 ```
 
 V1'e geçmek istiyorsanız,
 
 ```bash
-python webui.py v1 <dil(isteğe bağlı)>
+PYTHONPATH=. python webui.py v1 <dil(isteğe bağlı)>
 ```
 
 veya WebUI'de manuel olarak sürüm değiştirin.
@@ -266,7 +266,7 @@ python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
 VEYA
 
 ```bash
-python webui.py
+PYTHONPATH=. python webui.py
 ```
 
 ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
diff --git a/go-webui.bat b/go-webui.bat
index c1c81089..6918cc5f 100644
--- a/go-webui.bat
+++ b/go-webui.bat
@@ -1,6 +1,7 @@
 set "SCRIPT_DIR=%~dp0"
 set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
 cd /d "%SCRIPT_DIR%"
-set "PATH=%SCRIPT_DIR%\runtime;%PATH%"
+set "PATH=%SCRIPT_DIR%\runtime"
+set "PYTHONPATH=%SCRIPT_DIR%"
 runtime\python.exe -I webui.py zh_CN
 pause
diff --git a/go-webui.ps1 b/go-webui.ps1
index 09103425..42d336e3 100644
--- a/go-webui.ps1
+++ b/go-webui.ps1
@@ -2,6 +2,7 @@ $ErrorActionPreference = "SilentlyContinue"
 chcp 65001
 Set-Location $PSScriptRoot
 $runtimePath = Join-Path $PSScriptRoot "runtime"
-$env:PATH = "$runtimePath;$env:PATH"
+$env:PATH = "$runtimePath"
+$env:PYTHONPATH = "$runtimePath"
 & "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
 pause
diff --git a/gpt-sovits_kaggle.ipynb b/gpt-sovits_kaggle.ipynb
deleted file mode 100644
index 764c23c4..00000000
--- a/gpt-sovits_kaggle.ipynb
+++ /dev/null
@@ -1,243 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "9fd922fb",
-   "metadata": {},
-   "source": [
-    "# Deprecated"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "45857cb2",
-   "metadata": {
-    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
-    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
-    "execution": {
-     "iopub.execute_input": "2024-02-18T14:43:46.735480Z",
-     "iopub.status.busy": "2024-02-18T14:43:46.735183Z",
-     "iopub.status.idle": "2024-02-18T14:48:10.724175Z",
-     "shell.execute_reply": "2024-02-18T14:48:10.723059Z"
-    },
-    "papermill": {
-     "duration": 263.994935,
-     "end_time": "2024-02-18T14:48:10.726613",
-     "exception": false,
-     "start_time": "2024-02-18T14:43:46.731678",
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
-    "%cd GPT-SoVITS\n",
-    "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
-    "!pip install -r requirements.txt\n",
-    "!pip install -r extra-req.txt --no-deps"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b9d346b4",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-02-18T14:48:10.815802Z",
-     "iopub.status.busy": "2024-02-18T14:48:10.814899Z",
-     "iopub.status.idle": "2024-02-18T14:50:31.253276Z",
-     "shell.execute_reply": "2024-02-18T14:50:31.252024Z"
-    },
-    "papermill": {
-     "duration": 140.484893,
-     "end_time": "2024-02-18T14:50:31.255720",
-     "exception": false,
-     "start_time": "2024-02-18T14:48:10.770827",
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# @title Download pretrained models 下载预训练模型\n",
-    "!mkdir -p /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
-    "!mkdir -p /kaggle/working/GPT-SoVITS/tools/asr/models\n",
-    "!mkdir -p /kaggle/working/GPT-SoVITS/tools/uvr5\n",
-    "%cd /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
-    "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
-    "%cd /kaggle/working/GPT-SoVITS/tools/asr/models\n",
-    "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
-    "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
-    "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
-    "# # @title UVR5 pretrains 安装uvr5模型\n",
-    "%cd /kaggle/working/GPT-SoVITS/tools/uvr5\n",
-    "!git clone https://huggingface.co/Delik/uvr5_weights\n",
-    "!git config core.sparseCheckout true\n",
-    "!mv /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea94d245",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-02-18T14:29:01.071549Z",
-     "iopub.status.busy": "2024-02-18T14:29:01.070592Z",
-     "iopub.status.idle": "2024-02-18T14:40:45.318368Z",
-     "shell.execute_reply": "2024-02-18T14:40:45.317130Z",
-     "shell.execute_reply.started": "2024-02-18T14:29:01.071512Z"
-    },
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": false,
-     "start_time": "2024-02-18T14:50:31.309013",
-     "status": "running"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# @title launch WebUI 启动WebUI\n",
-    "%cd /kaggle/working/GPT-SoVITS/\n",
-    "!npm install -g localtunnel\n",
-    "import subprocess\n",
-    "import threading\n",
-    "import time\n",
-    "import socket\n",
-    "import urllib.request\n",
-    "\n",
-    "\n",
-    "def iframe_thread(port):\n",
-    "    while True:\n",
-    "        time.sleep(0.5)\n",
-    "        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
-    "        result = sock.connect_ex((\"127.0.0.1\", port))\n",
-    "        if result == 0:\n",
-    "            break\n",
-    "        sock.close()\n",
-    "\n",
-    "        from colorama import Fore, Style\n",
-    "    print(\n",
-    "        Fore.GREEN + \"\\nIP: \",\n",
-    "        Fore.RED,\n",
-    "        urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n",
-    "        \"\\n\",\n",
-    "        Style.RESET_ALL,\n",
-    "    )\n",
-    "    p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n",
-    "    for line in p.stdout:\n",
-    "        print(line.decode(), end=\"\")\n",
-    "\n",
-    "\n",
-    "threading.Thread(target=iframe_thread, daemon=True, args=(9874,)).start()\n",
-    "\n",
-    "!python  webui.py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dda88a6d",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-02-18T14:40:56.880608Z",
-     "iopub.status.busy": "2024-02-18T14:40:56.879879Z"
-    },
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "pending"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# 开启推理页面\n",
-    "%cd /kaggle/working/GPT-SoVITS/\n",
-    "!npm install -g localtunnel\n",
-    "import threading\n",
-    "\n",
-    "\n",
-    "def iframe_thread(port):\n",
-    "    while True:\n",
-    "        time.sleep(0.5)\n",
-    "        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
-    "        result = sock.connect_ex((\"127.0.0.1\", port))\n",
-    "        if result == 0:\n",
-    "            break\n",
-    "        sock.close()\n",
-    "\n",
-    "        from colorama import Fore, Style\n",
-    "    print(\n",
-    "        Fore.GREEN + \"\\nIP: \",\n",
-    "        Fore.RED,\n",
-    "        urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n",
-    "        \"\\n\",\n",
-    "        Style.RESET_ALL,\n",
-    "    )\n",
-    "    p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n",
-    "    for line in p.stdout:\n",
-    "        print(line.decode(), end=\"\")\n",
-    "\n",
-    "\n",
-    "threading.Thread(target=iframe_thread, daemon=True, args=(9872,)).start()\n",
-    "\n",
-    "!python  ./GPT_SoVITS/inference_webui.py"
-   ]
-  }
- ],
- "metadata": {
-  "kaggle": {
-   "accelerator": "nvidiaTeslaT4",
-   "dataSources": [
-    {
-     "datasetId": 4459328,
-     "sourceId": 7649639,
-     "sourceType": "datasetVersion"
-    }
-   ],
-   "dockerImageVersionId": 30646,
-   "isGpuEnabled": true,
-   "isInternetEnabled": true,
-   "language": "python",
-   "sourceType": "notebook"
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.13"
-  },
-  "papermill": {
-   "default_parameters": {},
-   "duration": null,
-   "end_time": null,
-   "environment_variables": {},
-   "exception": null,
-   "input_path": "__notebook__.ipynb",
-   "output_path": "__notebook__.ipynb",
-   "parameters": {},
-   "start_time": "2024-02-18T14:43:44.011910",
-   "version": "2.5.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/install.ps1 b/install.ps1
index 9c33ace8..ab761ba8 100644
--- a/install.ps1
+++ b/install.ps1
@@ -40,6 +40,10 @@ function Write-Info($msg) {
     Write-Host "[INFO]:" -ForegroundColor Green -NoNewline
     Write-Host " $msg"
 }
+function Write-Warning($msg) {
+    Write-Host "[Warning]:" -ForegroundColor Yellow -NoNewline
+    Write-Host " $msg"
+}
 function Write-Success($msg) {
     Write-Host "[SUCCESS]:" -ForegroundColor Blue -NoNewline
     Write-Host " $msg"
@@ -137,7 +141,7 @@ chcp 65001
 Set-Location $PSScriptRoot
 
 Write-Info "Installing FFmpeg & CMake..."
-Invoke-Conda  ffmpeg cmake
+Invoke-Conda  ffmpeg cmake vc14_runtime
 Write-Success "FFmpeg & CMake Installed"
 
 $PretrainedURL  = ""
@@ -208,12 +212,30 @@ if ($DownloadUVR5) {
 
 switch ($Device) {
     "CU128" {
+        $cudaLine = nvidia-smi | Select-String "CUDA Version"
+        $version = ($cudaLine -split "CUDA Version:")[1].Trim()
+        Write-Info "Maximum CUDA Version Supported By Current Driver: $version"
+        if ([version](nvidia-smi | Select-String "CUDA Version" | ForEach-Object { ($_ -split "CUDA Version:")[1].Trim() }) -ge [version]"12.8") {
+            Write-Warning "CUDA 12.8 Is Not Supported By Current Driver"
+        }
         Write-Info "Installing PyTorch For CUDA 12.8..."
         Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cu128"
+        Invoke-Conda cuda-nvcc=12.8
+        Invoke-Pip psutil ninja packaging wheel "setuptools>=42"
+        Invoke-Pip flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
     }
     "CU126" {
+        $cudaLine = nvidia-smi | Select-String "CUDA Version"
+        $version = ($cudaLine -split "CUDA Version:")[1].Trim()
+        Write-Info "Maximum CUDA Version Supported By Current Driver: $version"
+        if ([version](nvidia-smi | Select-String "CUDA Version" | ForEach-Object { ($_ -split "CUDA Version:")[1].Trim() }) -ge [version]"12.8") {
+            Write-Warning "CUDA 12.6 Is Not Supported By Current Driver"
+        }
         Write-Info "Installing PyTorch For CUDA 12.6..."
         Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cu126"
+        Invoke-Conda cuda-nvcc=12.6
+        Invoke-Pip psutil ninja packaging wheel "setuptools>=42"
+        Invoke-Pip flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
     }
     "CPU" {
         Write-Info "Installing PyTorch For CPU..."
diff --git a/install.sh b/install.sh
index 7d80ec28..c0cf3543 100644
--- a/install.sh
+++ b/install.sh
@@ -127,7 +127,7 @@ while [[ $# -gt 0 ]]; do
             USE_ROCM=true
             ;;
         MPS)
-            USE_CPU=true
+            USE_MPS=true
             ;;
         CPU)
             USE_CPU=true
@@ -157,7 +157,7 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then
+if ! $USE_CUDA && ! $USE_ROCM && ! $USE_MPS && ! $USE_CPU; then
     echo -e "${ERROR}Error: Device is REQUIRED"
     echo ""
     print_help
@@ -322,13 +322,29 @@ if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
 fi
 
 if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
+    CUDAVERSION=$(nvidia-smi | grep "CUDA Version" | sed -E 's/.*CUDA Version: ([0-9]+\.[0-9]+).*/\1/')
+    echo -e "${INFO}Maximum CUDA Version Supported By Current Driver: $CUDAVERSION"
     if [ "$CUDA" = 128 ]; then
+        if awk "BEGIN {exit !($CUDAVERSION < 12.8)}"; then
+            echo -r "${WARNING}CUDA 12.8 Is Not Supported By Current Driver"
+        fi
         echo -e "${INFO}Installing PyTorch For CUDA 12.8..."
         run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cu128"
+        run_conda_quiet cuda-nvcc=12.8
     elif [ "$CUDA" = 126 ]; then
+        if awk "BEGIN {exit !($CUDAVERSION < 12.6)}"; then
+            echo -r "${WARNING}CUDA 12.6 Is Not Supported By Current Driver"
+        fi
         echo -e "${INFO}Installing PyTorch For CUDA 12.6..."
         run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cu126"
+        run_conda_quiet cuda-nvcc=12.6
     fi
+    run_pip_quiet psutil ninja packaging wheel "setuptools>=42"
+    run_pip_quiet flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
+elif [ "$USE_MPS" = true ] && [ "$WORKFLOW" = false ]; then
+    echo -e "${INFO}Installing PyTorch For MPS..."
+    run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cpu"
+    run_pip_quiet mlx
 elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
     echo -e "${INFO}Installing PyTorch For ROCm 6.2..."
     run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/rocm6.2"
diff --git a/requirements.txt b/requirements.txt
index 90e4957d..b49126dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ tensorboard
 librosa==0.10.2
 numba
 pytorch-lightning>=2.4
-gradio<5
+gradio==5.25.0
 ffmpeg-python
 onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"
 onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64"
@@ -16,9 +16,11 @@ pypinyin
 pyopenjtalk>=0.4.1
 g2p_en
 torchaudio
-modelscope==1.10.0
+modelscope
 sentencepiece
-transformers>=4.43,<=4.50
+transformers
+huggingface_hub
+kernels
 peft
 chardet
 PyYAML
@@ -39,7 +41,6 @@ x_transformers
 torchmetrics<=1.5
 pydantic<=2.10.6
 ctranslate2>=4.0,<5
-huggingface_hub>=0.13
 tokenizers>=0.13,<1
 av>=11
 tqdm
diff --git a/tools/i18n/locale/en_US.json b/tools/i18n/locale/en_US.json
index 24d24de4..c1d7b5cb 100644
--- a/tools/i18n/locale/en_US.json
+++ b/tools/i18n/locale/en_US.json
@@ -222,5 +222,6 @@
     "预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path",
     "预训练SoVITS-G模型路径": "Pretrained SoVITS-G Model Path",
     "预训练中文BERT模型路径": "Pretrained Chinese BERT Model Path",
-    "预训练模型路径": "Pretrained Model Path"
+    "预训练模型路径": "Pretrained Model Path",
+    "推理后端": "Inference Backend"
 }
diff --git a/tools/i18n/locale/zh_CN.json b/tools/i18n/locale/zh_CN.json
index 8ede7647..8b530d33 100644
--- a/tools/i18n/locale/zh_CN.json
+++ b/tools/i18n/locale/zh_CN.json
@@ -222,5 +222,6 @@
     "预训练SoVITS-D模型路径": "预训练SoVITS-D模型路径",
     "预训练SoVITS-G模型路径": "预训练SoVITS-G模型路径",
     "预训练中文BERT模型路径": "预训练中文BERT模型路径",
-    "预训练模型路径": "预训练模型路径"
+    "预训练模型路径": "预训练模型路径",
+    "推理后端": "推理后端"
 }
diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py
index 51a7dfad..6a815019 100644
--- a/tools/subfix_webui.py
+++ b/tools/subfix_webui.py
@@ -1,4 +1,5 @@
 import sys
+
 from tools.i18n.i18n import I18nAuto, scan_language_list
 
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
@@ -314,7 +315,7 @@ if __name__ == "__main__":
                 "Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮，你再翻回来就回滚了，白忙活。)"
             )
         )
-        with gr.Row():
+        with gr.Row(equal_height=True):
             btn_change_index = gr.Button("Change Index")
             btn_submit_change = gr.Button("Submit Text")
             btn_merge_audio = gr.Button("Merge Audio")
@@ -322,7 +323,7 @@ if __name__ == "__main__":
             btn_previous_index = gr.Button("Previous Index")
             btn_next_index = gr.Button("Next Index")
 
-        with gr.Row():
+        with gr.Row(equal_height=True):
             index_slider = gr.Slider(minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3)
             splitpoint_slider = gr.Slider(
                 minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3
@@ -331,18 +332,23 @@ if __name__ == "__main__":
             btn_save_json = gr.Button("Save File", visible=True, scale=1)
             btn_invert_selection = gr.Button("Invert Selection", scale=1)
 
-        with gr.Row():
+        with gr.Row(equal_height=True):
             with gr.Column():
                 for _ in range(0, g_batch):
-                    with gr.Row():
+                    with gr.Row(equal_height=True):
                         text = gr.Textbox(label="Text", visible=True, scale=5)
-                        audio_output = gr.Audio(label="Output Audio", visible=True, scale=5)
+                        audio_output = gr.Audio(
+                            label="Output Audio",
+                            visible=True,
+                            scale=5,
+                            waveform_options={"show_recording_waveform": False},
+                        )
                         audio_check = gr.Checkbox(label="Yes", show_label=True, info="Choose Audio", scale=1)
                         g_text_list.append(text)
                         g_audio_list.append(audio_output)
                         g_checkbox_list.append(audio_check)
 
-        with gr.Row():
+        with gr.Row(equal_height=True):
             batchsize_slider = gr.Slider(
                 minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False
             )
diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py
index f5f8d3f6..98a720a9 100644
--- a/tools/uvr5/webui.py
+++ b/tools/uvr5/webui.py
@@ -168,7 +168,7 @@ with gr.Blocks(title="UVR5 WebUI", analytics_enabled=False) as app:
                     "h4",
                 )
             )
-            with gr.Row():
+            with gr.Row(equal_height=True):
                 with gr.Column():
                     model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names)
                     dir_wav_input = gr.Textbox(
@@ -197,9 +197,9 @@ with gr.Blocks(title="UVR5 WebUI", analytics_enabled=False) as app:
                         interactive=True,
                     )
                     with gr.Column():
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             but2 = gr.Button(i18n("转换"), variant="primary")
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             vc_output4 = gr.Textbox(label=i18n("输出信息"), lines=3)
                 but2.click(
                     uvr,
diff --git a/webui.py b/webui.py
index 9a6aae5f..ed0e2930 100644
--- a/webui.py
+++ b/webui.py
@@ -1,22 +1,96 @@
-import os
-import sys
-
-os.environ["version"] = version = "v2Pro"
-now_dir = os.getcwd()
-sys.path.insert(0, now_dir)
-import warnings
-
-warnings.filterwarnings("ignore")
+import argparse
+import contextlib
 import json
+import os
 import platform
+import re
 import shutil
 import signal
+import site
+import subprocess
+import traceback
+import warnings
+from multiprocessing import cpu_count
+from subprocess import Popen
 
+import gradio as gr
 import psutil
 import torch
 import yaml
 
+from config import (
+    GPU_INDEX,
+    GPU_INFOS,
+    IS_GPU,
+    GPT_weight_root,
+    GPT_weight_version2root,
+    SoVITS_weight_root,
+    SoVITS_weight_version2root,
+    change_choices,
+    exp_root,
+    get_weights_names,
+    infer_device,
+    is_half,
+    is_share,
+    memset,
+    pretrained_gpt_name,
+    pretrained_sovits_name,
+    python_exec,
+    webui_port_infer_tts,
+    webui_port_main,
+    webui_port_subfix,
+    webui_port_uvr5,
+)
+from GPT_SoVITS.Accelerate import backends
+from tools import my_utils
+from tools.asr.config import asr_dict
+from tools.assets import css, js, top_html
+from tools.i18n.i18n import I18nAuto, scan_language_list
+from tools.my_utils import check_details, check_for_existance
+
+os.environ["PYTHONPATH"] = now_dir = os.getcwd()
+
+backends_gradio = [(b.replace("-", " "), b) for b in backends]
+
+_LANG_RE = re.compile(r"^[a-z]{2}[_-][A-Z]{2}$")
+
+
+def lang_type(text: str) -> str:
+    if text == "Auto":
+        return text
+    if not _LANG_RE.match(text):
+        raise argparse.ArgumentTypeError(f"Unspported Format: {text}, Expected ll_CC/ll-CC")
+    ll, cc = re.split(r"[_-]", text)
+    language = f"{ll}_{cc}"
+    if language in scan_language_list():
+        return language
+    else:
+        return "en_US"
+
+
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="train_webui",
+        description="python -s webui.py zh_CN",
+    )
+    p.add_argument(
+        "language",
+        nargs="?",
+        default="Auto",
+        type=lang_type,
+        help="Language Code, Such as zh_CN, en-US",
+    )
+    return p
+
+
+args = build_parser().parse_args()
+
+os.environ["version"] = version = "v2Pro"
 os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
+
+warnings.filterwarnings("ignore")
+
+
 torch.manual_seed(233333)
 tmp = os.path.join(now_dir, "TEMP")
 os.makedirs(tmp, exist_ok=True)
@@ -32,8 +106,6 @@ if os.path.exists(tmp):
         except Exception as e:
             print(str(e))
             pass
-import site
-import traceback
 
 site_packages_roots = []
 for path in site.getsitepackages():
@@ -41,7 +113,6 @@ for path in site.getsitepackages():
         site_packages_roots.append(path)
 if site_packages_roots == []:
     site_packages_roots = ["%s/runtime/Lib/site-packages" % now_dir]
-# os.environ["OPENBLAS_NUM_THREADS"] = "4"
 os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
 os.environ["all_proxy"] = ""
 for site_packages_root in site_packages_roots:
@@ -56,41 +127,10 @@ for site_packages_root in site_packages_roots:
             break
         except PermissionError:
             traceback.print_exc()
-import shutil
-import subprocess
-from subprocess import Popen
 
-from tools.assets import css, js, top_html
-from tools.i18n.i18n import I18nAuto, scan_language_list
 
-language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
-os.environ["language"] = language
+language = args.language
 i18n = I18nAuto(language=language)
-from multiprocessing import cpu_count
-
-from config import (
-    GPU_INDEX,
-    GPU_INFOS,
-    IS_GPU,
-    exp_root,
-    infer_device,
-    is_half,
-    is_share,
-    memset,
-    python_exec,
-    webui_port_infer_tts,
-    webui_port_main,
-    webui_port_subfix,
-    webui_port_uvr5,
-)
-from tools import my_utils
-from tools.my_utils import check_details, check_for_existance
-
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
-
-# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
-import gradio as gr
 
 n_cpu = cpu_count()
 
@@ -147,7 +187,7 @@ def fix_gpu_number(input):  # 将越界的number强制改到界内
     try:
         if int(input) not in set_gpu_numbers:
             return default_gpu_numbers
-    except:
+    except Exception as _:
         return input
     return input
 
@@ -158,13 +198,10 @@ def fix_gpu_numbers(inputs):
         for input in inputs.split(","):
             output.append(str(fix_gpu_number(input)))
         return ",".join(output)
-    except:
+    except Exception as _:
         return inputs
 
 
-from config import pretrained_gpt_name, pretrained_sovits_name
-
-
 def check_pretrained_is_exist(version):
     pretrained_model_list = (
         pretrained_sovits_name[version],
@@ -189,14 +226,6 @@ for key in pretrained_gpt_name.keys():
     if os.path.exists(pretrained_gpt_name[key]) == False:
         pretrained_gpt_name[key] = ""
 
-from config import (
-    GPT_weight_root,
-    GPT_weight_version2root,
-    SoVITS_weight_root,
-    SoVITS_weight_version2root,
-    change_choices,
-    get_weights_names,
-)
 
 for root in SoVITS_weight_root + GPT_weight_root:
     os.makedirs(root, exist_ok=True)
@@ -218,15 +247,11 @@ def kill_proc_tree(pid, including_parent=True):
 
     children = parent.children(recursive=True)
     for child in children:
-        try:
+        with contextlib.suppress(OSError):
             os.kill(child.pid, signal.SIGTERM)  # or signal.SIGKILL
-        except OSError:
-            pass
     if including_parent:
-        try:
+        with contextlib.suppress(OSError):
             os.kill(parent.pid, signal.SIGTERM)  # or signal.SIGKILL
-        except OSError:
-            pass
 
 
 system = platform.system()
@@ -329,21 +354,20 @@ def change_uvr5():
 process_name_tts = i18n("TTS推理WebUI")
 
 
-def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled):
+def change_tts_inference(
+    bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled, backends_dropdown
+):
     global p_tts_inference
     if batched_infer_enabled:
-        cmd = '"%s" -s GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language)
+        cmd = f"'{python_exec}' -s GPT_SoVITS/inference_webui_fast.py {language}"
     else:
-        cmd = '"%s" -s GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language)
-    # #####v3暂不支持加速推理
-    # if version=="v3":
-    #     cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
+        cmd = f"'{python_exec}' -s GPT_SoVITS/inference_webui.py {language} -b {backends_dropdown}"
     if p_tts_inference is None:
         os.environ["gpt_path"] = gpt_path
         os.environ["sovits_path"] = sovits_path
         os.environ["cnhubert_base_path"] = cnhubert_base_path
         os.environ["bert_path"] = bert_path
-        os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_number(gpu_number)
+        os.environ["_CUDA_VISIBLE_DEVICES"] = str(fix_gpu_number(gpu_number))
         os.environ["is_half"] = str(is_half)
         os.environ["infer_ttswebui"] = str(webui_port_infer_tts)
         os.environ["is_share"] = str(is_share)
@@ -364,8 +388,6 @@ def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, so
         )
 
 
-from tools.asr.config import asr_dict
-
 process_name_asr = i18n("语音识别")
 
 
@@ -764,7 +786,7 @@ def close_slice():
         for p_slice in ps_slice:
             try:
                 kill_process(p_slice.pid, process_name_slice)
-            except:
+            except Exception as _:
                 traceback.print_exc()
         ps_slice = []
     return (
@@ -853,7 +875,7 @@ def close1a():
         for p1a in ps1a:
             try:
                 kill_process(p1a.pid, process_name_1a)
-            except:
+            except Exception as _:
                 traceback.print_exc()
         ps1a = []
     return (
@@ -944,7 +966,7 @@ def close1b():
         for p1b in ps1b:
             try:
                 kill_process(p1b.pid, process_name_1b)
-            except:
+            except Exception as _:
                 traceback.print_exc()
         ps1b = []
     return (
@@ -1030,7 +1052,7 @@ def close1c():
         for p1c in ps1c:
             try:
                 kill_process(p1c.pid, process_name_1c)
-            except:
+            except Exception as _:
                 traceback.print_exc()
         ps1c = []
     return (
@@ -1230,7 +1252,7 @@ def open1abc(
                 {"__type__": "update", "visible": True},
                 {"__type__": "update", "visible": False},
             )
-        except:
+        except Exception as _:
             traceback.print_exc()
             close1abc()
             yield (
@@ -1252,7 +1274,7 @@ def close1abc():
         for p1abc in ps1abc:
             try:
                 kill_process(p1abc.pid, process_name_1abc)
-            except:
+            except Exception as _:
                 traceback.print_exc()
         ps1abc = []
     return (
@@ -1303,6 +1325,14 @@ def sync(text):
     return {"__type__": "update", "value": text}
 
 
+def changeBackend(flag: bool):
+    if flag:
+        return gr.update(choices=["Torch Varlen"], value="Torch Varlen")
+    else:
+        return gr.update(choices=backends_gradio, value=backends_gradio[-1][-1])
+
+
+GPU_INDEX.add(0)
 with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
     gr.HTML(
         top_html.format(
@@ -1315,9 +1345,9 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
     with gr.Tabs():
         with gr.TabItem("0-" + i18n("前置数据集获取工具")):  # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
             with gr.Accordion(label="0a-" + i18n("UVR5人声伴奏分离&去混响去延迟工具")):
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Column(scale=3):
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             uvr5_info = gr.Textbox(label=process_info(process_name_uvr5, "info"))
                     open_uvr5 = gr.Button(
                         value=process_info(process_name_uvr5, "open"), variant="primary", visible=True
@@ -1327,14 +1357,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                     )
 
             with gr.Accordion(label="0b-" + i18n("语音切分工具")):
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Column(scale=3):
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             slice_inp_path = gr.Textbox(label=i18n("音频自动切分输入路径，可文件可文件夹"), value="")
                             slice_opt_root = gr.Textbox(
                                 label=i18n("切分后的子音频的输出根目录"), value="output/slicer_opt"
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             threshold = gr.Textbox(
                                 label=i18n("threshold:音量小于这个值视作静音的备选切割点"), value="-34"
                             )
@@ -1348,7 +1378,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                                 value="10",
                             )
                             max_sil_kept = gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"), value="500")
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             _max = gr.Slider(
                                 minimum=0,
                                 maximum=1,
@@ -1365,7 +1395,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                                 value=0.25,
                                 interactive=True,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             n_process = gr.Slider(
                                 minimum=1,
                                 maximum=n_cpu,
@@ -1385,10 +1415,10 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
             # gr.Markdown(value="0bb-" + i18n("语音降噪工具")+i18n("(不稳定，先别用，可能劣化模型效果！)"))
             with gr.Row(visible=False):
                 with gr.Column(scale=3):
-                    with gr.Row():
+                    with gr.Row(equal_height=True):
                         denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="")
                         denoise_output_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/denoise_opt")
-                    with gr.Row():
+                    with gr.Row(equal_height=True):
                         denoise_info = gr.Textbox(label=process_info(process_name_denoise, "info"))
                 open_denoise_button = gr.Button(
                     value=process_info(process_name_denoise, "open"), variant="primary", visible=True
@@ -1398,16 +1428,16 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                 )
 
             with gr.Accordion(label="0c-" + i18n("语音识别工具")):
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Column(scale=3):
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             asr_inp_dir = gr.Textbox(
                                 label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True
                             )
                             asr_opt_dir = gr.Textbox(
                                 label=i18n("输出文件夹路径"), value="output/asr_opt", interactive=True
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             asr_model = gr.Dropdown(
                                 label=i18n("ASR 模型"),
                                 choices=list(asr_dict.keys()),
@@ -1423,7 +1453,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                             asr_precision = gr.Dropdown(
                                 label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32"
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             asr_info = gr.Textbox(label=process_info(process_name_asr, "info"))
                     open_asr_button = gr.Button(
                         value=process_info(process_name_asr, "open"), variant="primary", visible=True
@@ -1455,9 +1485,9 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                 asr_model.change(change_precision_choices, [asr_model], [asr_precision])
 
             with gr.Accordion(label="0d-" + i18n("语音文本校对标注工具")):
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Column(scale=3):
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             path_list = gr.Textbox(
                                 label=i18n("标注文件路径 (含文件后缀 *.list)"),
                                 value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list",
@@ -1478,7 +1508,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
 
         with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
             with gr.Accordion(i18n("微调模型信息")):
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Row(equal_height=True):
                         exp_name = gr.Textbox(
                             label=i18n("*实验/模型名"),
@@ -1500,7 +1530,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                             scale=5,
                         )
             with gr.Accordion(label=i18n("预训练模型路径"), open=False):
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Row(equal_height=True):
                         pretrained_s1 = gr.Textbox(
                             label=i18n("预训练GPT模型路径"),
@@ -1529,15 +1559,15 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
 
             with gr.TabItem("1A-" + i18n("训练集格式化工具")):
                 with gr.Accordion(label=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹")):
-                    with gr.Row():
-                        with gr.Row():
+                    with gr.Row(equal_height=True):
+                        with gr.Row(equal_height=True):
                             inp_text = gr.Textbox(
                                 label=i18n("*文本标注文件"),
                                 value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",
                                 interactive=True,
                                 scale=10,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             inp_wav_dir = gr.Textbox(
                                 label=i18n("*训练集音频文件目录"),
                                 # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",
@@ -1549,90 +1579,90 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                             )
 
                 with gr.Accordion(label="1Aa-" + process_name_1a):
-                    with gr.Row():
-                        with gr.Row():
+                    with gr.Row(equal_height=True):
+                        with gr.Row(equal_height=True):
                             gpu_numbers1a = gr.Textbox(
                                 label=i18n("GPU卡号以-分割，每个卡号一个进程"),
                                 value="%s-%s" % (gpus, gpus),
                                 interactive=True,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             bert_pretrained_dir = gr.Textbox(
                                 label=i18n("预训练中文BERT模型路径"),
                                 value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
                                 interactive=False,
                                 lines=2,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             button1a_open = gr.Button(
                                 value=process_info(process_name_1a, "open"), variant="primary", visible=True
                             )
                             button1a_close = gr.Button(
                                 value=process_info(process_name_1a, "close"), variant="primary", visible=False
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             info1a = gr.Textbox(label=process_info(process_name_1a, "info"))
 
                 with gr.Accordion(label="1Ab-" + process_name_1b):
-                    with gr.Row():
-                        with gr.Row():
+                    with gr.Row(equal_height=True):
+                        with gr.Row(equal_height=True):
                             gpu_numbers1Ba = gr.Textbox(
                                 label=i18n("GPU卡号以-分割，每个卡号一个进程"),
                                 value="%s-%s" % (gpus, gpus),
                                 interactive=True,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             cnhubert_base_dir = gr.Textbox(
                                 label=i18n("预训练SSL模型路径"),
                                 value="GPT_SoVITS/pretrained_models/chinese-hubert-base",
                                 interactive=False,
                                 lines=2,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             button1b_open = gr.Button(
                                 value=process_info(process_name_1b, "open"), variant="primary", visible=True
                             )
                             button1b_close = gr.Button(
                                 value=process_info(process_name_1b, "close"), variant="primary", visible=False
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             info1b = gr.Textbox(label=process_info(process_name_1b, "info"))
 
                 with gr.Accordion(label="1Ac-" + process_name_1c):
-                    with gr.Row():
-                        with gr.Row():
+                    with gr.Row(equal_height=True):
+                        with gr.Row(equal_height=True):
                             gpu_numbers1c = gr.Textbox(
                                 label=i18n("GPU卡号以-分割，每个卡号一个进程"),
                                 value="%s-%s" % (gpus, gpus),
                                 interactive=True,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             pretrained_s2G_ = gr.Textbox(
                                 label=i18n("预训练SoVITS-G模型路径"),
                                 value=pretrained_sovits_name[version],
                                 interactive=False,
                                 lines=2,
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             button1c_open = gr.Button(
                                 value=process_info(process_name_1c, "open"), variant="primary", visible=True
                             )
                             button1c_close = gr.Button(
                                 value=process_info(process_name_1c, "close"), variant="primary", visible=False
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             info1c = gr.Textbox(label=process_info(process_name_1c, "info"))
 
                 with gr.Accordion(label="1Aabc-" + process_name_1abc):
-                    with gr.Row():
-                        with gr.Row():
+                    with gr.Row(equal_height=True):
+                        with gr.Row(equal_height=True):
                             button1abc_open = gr.Button(
                                 value=process_info(process_name_1abc, "open"), variant="primary", visible=True
                             )
                             button1abc_close = gr.Button(
                                 value=process_info(process_name_1abc, "close"), variant="primary", visible=False
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=True):
                             info1abc = gr.Textbox(label=process_info(process_name_1abc, "info"))
 
             pretrained_s2G.change(sync, [pretrained_s2G], [pretrained_s2G_])
@@ -1704,149 +1734,146 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
 
             with gr.TabItem("1B-" + i18n("微调训练")):
                 with gr.Accordion(label="1Ba-" + i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")):
-                    with gr.Row():
+                    with gr.Row(equal_height=True):
+                        batch_size = gr.Slider(
+                            minimum=1,
+                            maximum=default_max_batch_size,
+                            step=1,
+                            label=i18n("每张显卡的batch_size"),
+                            value=default_batch_size,
+                            interactive=True,
+                        )
+                        total_epoch = gr.Slider(
+                            minimum=1,
+                            maximum=max_sovits_epoch,
+                            step=1,
+                            label=i18n("总训练轮数total_epoch，不建议太高"),
+                            value=default_sovits_epoch,
+                            interactive=True,
+                        )
+                        with gr.Column(scale=2):
+                            if_save_latest = gr.Checkbox(
+                                label=i18n("是否仅保存最新的权重文件以节省硬盘空间"),
+                                value=True,
+                                interactive=True,
+                                show_label=True,
+                            )
+                            if_save_every_weights = gr.Checkbox(
+                                label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"),
+                                value=True,
+                                interactive=True,
+                                show_label=True,
+                            )
+                            if_grad_ckpt = gr.Checkbox(
+                                label="v3是否开启梯度检查点节省显存占用",
+                                value=False,
+                                interactive=True if version in v3v4set else False,
+                                show_label=True,
+                                visible=False,
+                            )  # 只有V3s2可以用
+                    with gr.Row(equal_height=True):
+                        text_low_lr_rate = gr.Slider(
+                            minimum=0.2,
+                            maximum=0.6,
+                            step=0.05,
+                            label=i18n("文本模块学习率权重"),
+                            value=0.4,
+                            visible=True if version not in v3v4set else False,
+                        )  # v3v4 not need
+                        lora_rank = gr.Radio(
+                            label=i18n("LoRA秩"),
+                            value="32",
+                            choices=["16", "32", "64", "128"],
+                            visible=True if version in v3v4set else False,
+                        )  # v1v2 not need
+                        save_every_epoch = gr.Slider(
+                            minimum=1,
+                            maximum=max_sovits_save_every_epoch,
+                            step=1,
+                            label=i18n("保存频率save_every_epoch"),
+                            value=default_sovits_save_every_epoch,
+                            interactive=True,
+                        )
+                        with gr.Column(scale=3):
+                            gpu_numbers1Ba = gr.Textbox(
+                                label=i18n("GPU卡号以-分割，每个卡号一个进程"),
+                                value="%s" % (gpus),
+                                interactive=True,
+                            )
+                    with gr.Row(equal_height=True):
                         with gr.Column():
-                            with gr.Row():
-                                batch_size = gr.Slider(
-                                    minimum=1,
-                                    maximum=default_max_batch_size,
-                                    step=1,
-                                    label=i18n("每张显卡的batch_size"),
-                                    value=default_batch_size,
-                                    interactive=True,
-                                )
-                                total_epoch = gr.Slider(
-                                    minimum=1,
-                                    maximum=max_sovits_epoch,
-                                    step=1,
-                                    label=i18n("总训练轮数total_epoch，不建议太高"),
-                                    value=default_sovits_epoch,
-                                    interactive=True,
-                                )
-                            with gr.Row():
-                                text_low_lr_rate = gr.Slider(
-                                    minimum=0.2,
-                                    maximum=0.6,
-                                    step=0.05,
-                                    label=i18n("文本模块学习率权重"),
-                                    value=0.4,
-                                    visible=True if version not in v3v4set else False,
-                                )  # v3v4 not need
-                                lora_rank = gr.Radio(
-                                    label=i18n("LoRA秩"),
-                                    value="32",
-                                    choices=["16", "32", "64", "128"],
-                                    visible=True if version in v3v4set else False,
-                                )  # v1v2 not need
-                                save_every_epoch = gr.Slider(
-                                    minimum=1,
-                                    maximum=max_sovits_save_every_epoch,
-                                    step=1,
-                                    label=i18n("保存频率save_every_epoch"),
-                                    value=default_sovits_save_every_epoch,
-                                    interactive=True,
-                                )
-                        with gr.Column():
-                            with gr.Column():
-                                if_save_latest = gr.Checkbox(
-                                    label=i18n("是否仅保存最新的权重文件以节省硬盘空间"),
-                                    value=True,
-                                    interactive=True,
-                                    show_label=True,
-                                )
-                                if_save_every_weights = gr.Checkbox(
-                                    label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"),
-                                    value=True,
-                                    interactive=True,
-                                    show_label=True,
-                                )
-                                if_grad_ckpt = gr.Checkbox(
-                                    label="v3是否开启梯度检查点节省显存占用",
-                                    value=False,
-                                    interactive=True if version in v3v4set else False,
-                                    show_label=True,
-                                    visible=False,
-                                )  # 只有V3s2可以用
-                            with gr.Row():
-                                gpu_numbers1Ba = gr.Textbox(
-                                    label=i18n("GPU卡号以-分割，每个卡号一个进程"),
-                                    value="%s" % (gpus),
-                                    interactive=True,
-                                )
-                    with gr.Row():
-                        with gr.Row():
                             button1Ba_open = gr.Button(
                                 value=process_info(process_name_sovits, "open"), variant="primary", visible=True
                             )
                             button1Ba_close = gr.Button(
                                 value=process_info(process_name_sovits, "close"), variant="primary", visible=False
                             )
-                        with gr.Row():
+                        with gr.Column():
                             info1Ba = gr.Textbox(label=process_info(process_name_sovits, "info"))
                 with gr.Accordion(label="1Bb-" + i18n("GPT 训练: 模型权重文件在 GPT_weights/")):
-                    with gr.Row():
+                    with gr.Row(equal_height=True):
+                        batch_size1Bb = gr.Slider(
+                            minimum=1,
+                            maximum=40,
+                            step=1,
+                            label=i18n("每张显卡的batch_size"),
+                            value=default_batch_size_s1,
+                            interactive=True,
+                        )
+                        total_epoch1Bb = gr.Slider(
+                            minimum=2,
+                            maximum=50,
+                            step=1,
+                            label=i18n("总训练轮数total_epoch"),
+                            value=15,
+                            interactive=True,
+                        )
+                        with gr.Column(scale=2):
+                            if_save_latest1Bb = gr.Checkbox(
+                                label=i18n("是否仅保存最新的权重文件以节省硬盘空间"),
+                                value=True,
+                                interactive=True,
+                                show_label=True,
+                            )
+                            if_save_every_weights1Bb = gr.Checkbox(
+                                label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"),
+                                value=True,
+                                interactive=True,
+                                show_label=True,
+                            )
+                    with gr.Row(equal_height=True):
+                        # with gr.Column():
+                        save_every_epoch1Bb = gr.Slider(
+                            minimum=1,
+                            maximum=50,
+                            step=1,
+                            label=i18n("保存频率save_every_epoch"),
+                            value=5,
+                            interactive=True,
+                        )
+                        # with gr.Column():
+                        if_dpo = gr.Checkbox(
+                            label=i18n("是否开启DPO训练选项(实验性)"),
+                            value=False,
+                            interactive=True,
+                            show_label=True,
+                        )
+                        with gr.Column(scale=2):
+                            gpu_numbers1Bb = gr.Textbox(
+                                label=i18n("GPU卡号以-分割，每个卡号一个进程"),
+                                value="%s" % (gpus),
+                                interactive=True,
+                            )
+                    with gr.Row(equal_height=True):
                         with gr.Column():
-                            with gr.Row():
-                                batch_size1Bb = gr.Slider(
-                                    minimum=1,
-                                    maximum=40,
-                                    step=1,
-                                    label=i18n("每张显卡的batch_size"),
-                                    value=default_batch_size_s1,
-                                    interactive=True,
+                            with gr.Row(equal_height=True):
+                                button1Bb_open = gr.Button(
+                                    value=process_info(process_name_gpt, "open"), variant="primary", visible=True
                                 )
-                                total_epoch1Bb = gr.Slider(
-                                    minimum=2,
-                                    maximum=50,
-                                    step=1,
-                                    label=i18n("总训练轮数total_epoch"),
-                                    value=15,
-                                    interactive=True,
-                                )
-                            with gr.Row():
-                                save_every_epoch1Bb = gr.Slider(
-                                    minimum=1,
-                                    maximum=50,
-                                    step=1,
-                                    label=i18n("保存频率save_every_epoch"),
-                                    value=5,
-                                    interactive=True,
-                                )
-                                if_dpo = gr.Checkbox(
-                                    label=i18n("是否开启DPO训练选项(实验性)"),
-                                    value=False,
-                                    interactive=True,
-                                    show_label=True,
+                                button1Bb_close = gr.Button(
+                                    value=process_info(process_name_gpt, "close"), variant="primary", visible=False
                                 )
                         with gr.Column():
-                            with gr.Column():
-                                if_save_latest1Bb = gr.Checkbox(
-                                    label=i18n("是否仅保存最新的权重文件以节省硬盘空间"),
-                                    value=True,
-                                    interactive=True,
-                                    show_label=True,
-                                )
-                                if_save_every_weights1Bb = gr.Checkbox(
-                                    label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"),
-                                    value=True,
-                                    interactive=True,
-                                    show_label=True,
-                                )
-                            with gr.Row():
-                                gpu_numbers1Bb = gr.Textbox(
-                                    label=i18n("GPU卡号以-分割，每个卡号一个进程"),
-                                    value="%s" % (gpus),
-                                    interactive=True,
-                                )
-                    with gr.Row():
-                        with gr.Row():
-                            button1Bb_open = gr.Button(
-                                value=process_info(process_name_gpt, "open"), variant="primary", visible=True
-                            )
-                            button1Bb_close = gr.Button(
-                                value=process_info(process_name_gpt, "close"), variant="primary", visible=False
-                            )
-                        with gr.Row():
                             info1Bb = gr.Textbox(label=process_info(process_name_gpt, "info"))
 
             button1Ba_close.click(close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close])
@@ -1858,41 +1885,60 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                         "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模，体验5秒Zero Shot TTS不训练推理用。"
                     )
                 )
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     with gr.Column(scale=2):
-                        with gr.Row():
-                            GPT_dropdown = gr.Dropdown(
-                                label=i18n("GPT模型列表"),
-                                choices=GPT_names,
-                                value=GPT_names[-1],
-                                interactive=True,
-                            )
-                            SoVITS_dropdown = gr.Dropdown(
-                                label=i18n("SoVITS模型列表"),
-                                choices=SoVITS_names,
-                                value=SoVITS_names[0],
-                                interactive=True,
-                            )
+                        with gr.Row(equal_height=True):
+                            with gr.Column():
+                                GPT_dropdown = gr.Dropdown(
+                                    label=i18n("GPT模型列表"),
+                                    choices=GPT_names,
+                                    value=GPT_names[-1],
+                                    interactive=True,
+                                )
+                            with gr.Column():
+                                SoVITS_dropdown = gr.Dropdown(
+                                    label=i18n("SoVITS模型列表"),
+                                    choices=SoVITS_names,
+                                    value=SoVITS_names[0],
+                                    interactive=True,
+                                )
                     with gr.Column(scale=2):
-                        with gr.Row():
-                            gpu_number_1C = gr.Textbox(
-                                label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True
+                        with gr.Row(equal_height=True):
+                            gpu_number_1C = gr.Dropdown(
+                                choices=sorted(list(GPU_INDEX)),
+                                value=sorted(list(GPU_INDEX))[0],
+                                label=i18n("GPU卡号,只能填1个整数"),
+                                interactive=True,
                             )
                             refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
                     refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
                 with gr.Row(equal_height=True):
-                    with gr.Row():
-                        batched_infer_enabled = gr.Checkbox(
-                            label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True
-                        )
+                    with gr.Row(equal_height=True):
+                        with gr.Column():
+                            batched_infer_enabled = gr.Checkbox(
+                                label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True
+                            )
+                        with gr.Column():
+                            backends_dropdown = gr.Dropdown(
+                                choices=backends_gradio,
+                                label=i18n("推理后端"),
+                                value=backends_gradio[-1][-1],
+                                interactive=True,
+                            )
+                    with gr.Row(equal_height=True):
+                        tts_info = gr.Textbox(label=process_info(process_name_tts, "info"))
                         open_tts = gr.Button(
                             value=process_info(process_name_tts, "open"), variant="primary", visible=True
                         )
                         close_tts = gr.Button(
                             value=process_info(process_name_tts, "close"), variant="primary", visible=False
                         )
-                    with gr.Column():
-                        tts_info = gr.Textbox(label=process_info(process_name_tts, "info"), scale=2)
+
+                    batched_infer_enabled.change(
+                        changeBackend,
+                        [batched_infer_enabled],
+                        [backends_dropdown],
+                    )
                     open_tts.click(
                         change_tts_inference,
                         [
@@ -1902,6 +1948,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                             GPT_dropdown,
                             SoVITS_dropdown,
                             batched_infer_enabled,
+                            backends_dropdown,
                         ],
                         [tts_info, open_tts, close_tts],
                     )
@@ -1914,6 +1961,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                             GPT_dropdown,
                             SoVITS_dropdown,
                             batched_infer_enabled,
+                            backends_dropdown,
                         ],
                         [tts_info, open_tts, close_tts],
                     )