mirror of https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-09-29 00:30:15 +08:00

commit 26d5eaf1b4 (parent 9f89f679c1)
.gitignore (vendored): 2 lines changed

@@ -18,7 +18,7 @@ speakers.json
 ref_audios
 tools/AP_BWE/24kto48k/*
 !tools/AP_BWE/24kto48k/readme.txt
-onnx
+onnx_export
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

@@ -11,10 +11,10 @@ import pandas as pd
 import torch
 from torch.utils.data import DataLoader, Dataset
 
-version = os.environ.get("version", None)
 
 from GPT_SoVITS.text import cleaned_text_to_sequence
 
+version = os.environ.get("version", None)
 
 # from config import exp_dir

@@ -152,7 +152,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config: dict,
-        max_seq_length: int = 1800,
+        max_seq_length: int = 2000,
         max_batch_size: int = 10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -87,7 +87,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config: dict,
-        max_seq_length: int = 1800,
+        max_seq_length: int = 2000,
         max_batch_size: int = 10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -91,7 +91,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config: dict,
-        max_seq_length: int = 1800,
+        max_seq_length: int = 2000,
         max_batch_size: int = 10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -75,7 +75,7 @@ class T2SEngine(T2SEngineProtocol):
                 transient=True,
             ) as progress,
         ):
-            max_token = min(1800 - int(session.input_pos.max()), 1500)
+            max_token = min(2000 - int(session.input_pos.max()), 1500)
 
             task = progress.add_task("T2S Decoding", total=max_token)
             for idx in range(1500):

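This and several later hunks raise the decoder's sequence ceiling from 1800 to 2000 tokens while leaving the per-run decoding cap at 1500. A minimal sketch of the resulting token budget (the constant names are illustrative; the repository hard-codes the values inline):

```python
MAX_SEQ_LENGTH = 2000  # raised from 1800 in this commit
MAX_NEW_TOKENS = 1500  # unchanged per-run decoding cap

def generation_budget(prompt_len: int) -> int:
    """Tokens the T2S decoder may still emit after a prompt of the given length."""
    return min(MAX_SEQ_LENGTH - prompt_len, MAX_NEW_TOKENS)

assert generation_budget(300) == 1500  # short prompt: the decoding cap binds
assert generation_budget(700) == 1300  # long prompt: the sequence ceiling binds
```

With the old 1800 ceiling, any prompt longer than 300 tokens ate into the generation budget; raising it to 2000 preserves the full 1500-token budget for prompts up to 500 tokens.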
@@ -43,7 +43,7 @@ class SinePositionalEmbedding(nn.Module):
         embedding_dim: int,
         scale: bool = False,
         max_batch_size: int = 10,
-        max_seq_len: int = 1800,
+        max_seq_len: int = 2000,
     ):
         super().__init__()
         self.embedding_dim = embedding_dim

@@ -278,7 +278,7 @@ class AttentionABC(ABC, nn.Module):
     def prefill(self, x: Array, kv_cache: KVCache | KVCacheQ, attn_mask: Array):
         bsz, seqlen, _ = cast(tuple[int, ...], x.shape)
 
-        q, k, v = self.in_proj(mx.expand_dims(x, 0)).split(3, axis=-1)
+        q, k, v = self.in_proj(x).split(3, axis=-1)
 
         q, k, v = map(lambda x: x.reshape(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
 

@@ -413,7 +413,7 @@ class T2SDecoderABC(nn.Module, T2SDecoderProtocol):
     def __init__(
         self,
         config: dict,
-        max_seq_length: int = 1800,
+        max_seq_length: int = 2000,
         max_batch_size: int = 10,
     ) -> None:
         super().__init__()

@@ -100,7 +100,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config,
-        max_seq_length=1800,
+        max_seq_length=2000,
         max_batch_size=10,
     ) -> None:
         assert torch.cuda.is_available()

@@ -78,7 +78,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config,
-        max_seq_length=1800,
+        max_seq_length=2000,
         max_batch_size=10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -94,7 +94,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config,
-        max_seq_length=1800,
+        max_seq_length=2000,
         max_batch_size=10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -78,7 +78,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config,
-        max_seq_length=1800,
+        max_seq_length=2000,
         max_batch_size=10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -86,7 +86,7 @@ class T2SDecoder(T2SDecoderABC):
     def __init__(
         self,
         config,
-        max_seq_length=1800,
+        max_seq_length=2000,
         max_batch_size=10,
     ) -> None:
         super().__init__(config, max_seq_length, max_batch_size)

@@ -57,7 +57,7 @@ class T2SEngine(T2SEngineProtocol):
                 transient=True,
             ) as progress,
         ):
-            max_token = int(min(1800 - session.input_pos.max(), 1500))
+            max_token = int(min(2000 - session.input_pos.max(), 1500))
             task = progress.add_task("T2S Decoding", total=max_token)
 
             for idx in range(max_token):

@@ -55,7 +55,7 @@ class SinePositionalEmbedding(nn.Module):
         scale: bool = False,
         alpha: bool = False,
         max_batch_size: int = 10,
-        max_seq_len: int = 1800,
+        max_seq_len: int = 2000,
     ):
         super().__init__()
         self.embedding_dim = embedding_dim

@@ -106,8 +106,9 @@ class SinePositionalEmbedding(nn.Module):
             embedded_x (Tensor): [batch_size, seq_len, embed_dim]
         """
 
-        pe_values = self.pe[:, : x.shape[-2]]
-        return x * self.x_scale + self.alpha.item() * pe_values
+        batch_size = x.shape[0]
+        pe_values = self.pe[:batch_size, : x.shape[-2]]
+        return x * self.x_scale + self.alpha * pe_values
 
 
 class KVCacheABC(nn.Module, ABC, KVCacheProtocol):

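The forward-pass fix above does two things: it slices the positional table along the batch axis so `pe_values` matches `x` exactly, and it keeps `alpha` as a tensor rather than collapsing it to a Python float with `.item()`, which detached the learned scale from autograd and from any captured graph. A standalone sketch, assuming `self.pe` is preallocated as `[max_batch_size, max_seq_len, embedding_dim]` (suggested by the constructor arguments, not shown in this diff):

```python
import torch

max_batch_size, max_seq_len, dim = 10, 2000, 4
pe = torch.zeros(max_batch_size, max_seq_len, dim)  # assumed precomputed table
alpha = torch.nn.Parameter(torch.ones(1))
x_scale = 1.0

x = torch.randn(3, 7, dim)  # bsz=3, seqlen=7

# Old: pe[:, :7] has shape [10, 7, dim] and cannot broadcast against [3, 7, dim];
# alpha.item() also froze the scale into a plain constant.
pe_values = pe[: x.shape[0], : x.shape[-2]]  # new: [3, 7, dim], matches x
out = x * x_scale + alpha * pe_values        # alpha stays a differentiable tensor
print(out.shape)  # torch.Size([3, 7, 4])
```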
@@ -290,7 +291,7 @@ class AttentionABC(nn.Module, ABC):
     def prefill(self, x: Tensor, kv_cache: KVCacheProtocol, attn_mask: Tensor) -> Tensor:
         bsz, seqlen, _ = x.shape
 
-        q, k, v = self.in_proj(x.unsqueeze(0)).chunk(3, dim=-1)
+        q, k, v = self.in_proj(x).chunk(3, dim=-1)
 
         q, k, v = map(lambda x: x.contiguous().view(bsz, seqlen, self.n_head, self.head_dim), (q, k, v))
 

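Both attention backends receive the same one-line fix: `prefill` is handed `x` already shaped `[bsz, seqlen, dim]`, so inserting an extra leading dimension (`mx.expand_dims(x, 0)` in the MLX hunk, `x.unsqueeze(0)` here) before the fused QKV projection was redundant and at odds with the `bsz, seqlen, _ = x.shape` unpacking two lines earlier. A quick shape check of the corrected path, with toy sizes:

```python
import torch

bsz, seqlen, dim, n_head = 2, 5, 8, 2
head_dim = dim // n_head
in_proj = torch.nn.Linear(dim, 3 * dim)  # fused QKV projection

x = torch.randn(bsz, seqlen, dim)
q, k, v = in_proj(x).chunk(3, dim=-1)  # no unsqueeze(0): x already has a batch dim
q = q.contiguous().view(bsz, seqlen, n_head, head_dim)
print(q.shape)  # torch.Size([2, 5, 2, 4])
```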
@@ -416,7 +417,7 @@ class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
     def __init__(
         self,
         config: dict,
-        max_seq_length: int = 1800,
+        max_seq_length: int = 2000,
         max_batch_size: int = 10,
     ) -> None:
         super().__init__()

@@ -60,6 +60,7 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 
 
 def set_high_priority():

@@ -90,7 +91,7 @@ def lang_type(text: str) -> str:
 def build_parser() -> argparse.ArgumentParser:
     p = argparse.ArgumentParser(
         prog="inference_webui",
-        description=f"PYTHONPATH=. python -s GPT_SoVITS/inference_webui.py zh_CN -b {backends[-1]}",
+        description=f"python -s -m GPT_SoVITS.inference_webui zh_CN -b {backends[-1]}",
     )
     p.add_argument(
         "language",

@@ -691,6 +692,8 @@ def get_tts_wav(
     pause_second=0.3,
 ):
     torch.set_grad_enabled(False)
+    ttfb_time = ttime()
 
     if ref_wav_path:
         pass
     else:

@@ -918,6 +921,8 @@ def get_tts_wav(
         with torch.inference_mode():
             wav_gen = vocoder_model(cfm_res)  # type: ignore
         audio = wav_gen[0][0]
+        if i_text == 0:
+            ttfb_time = ttime() - ttfb_time
         max_audio = torch.abs(audio).max()  # 简单防止16bit爆音
         if max_audio > 1:
             audio = audio / max_audio

@@ -954,6 +959,10 @@ def get_tts_wav(
         console.print(f">> Time Stamps: {t0:.3f}\t{t1:.3f}\t{t2:.3f}\t{t3:.3f}")
         console.print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
         console.print(f">> RTF: {rtf_value:.2f}")
+        if ttfb_time > 2:
+            console.print(f">> TTFB: {ttfb_time:.3f} s")
+        else:
+            console.print(f">> TTFB: {ttfb_time * 1000:.3f} ms")
 
         gr.Info(f"{infer_speed_avg:.2f} Token/s", title="Infer Speed")
         gr.Info(f"{rtf_value:.2f}", title="RTF")

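Taken together, the three `get_tts_wav` hunks implement a simple time-to-first-byte metric: stamp the wall clock on entry, close the measurement when the first text segment (`i_text == 0`) leaves the vocoder, and report seconds or milliseconds depending on magnitude. The bookkeeping in isolation:

```python
from time import sleep, time as ttime

ttfb_time = ttime()
sleep(0.05)  # stands in for synthesis of the first text segment
ttfb_time = ttime() - ttfb_time

if ttfb_time > 2:
    print(f">> TTFB: {ttfb_time:.3f} s")
else:
    print(f">> TTFB: {ttfb_time * 1000:.3f} ms")
```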
@@ -1,7 +1,6 @@
 import enum
 import os
 import os.path as osp
-import platform
 import queue
 import sys
 import time

@@ -16,7 +15,7 @@ from rich.progress import BarColumn, Progress, TextColumn, TimeRemainingColumn
 from torch.multiprocessing.spawn import spawn
 from transformers import BertForMaskedLM, BertTokenizerFast
 
-from GPT_SoVITS.Accelerate.logger import console, logger, SpeedColumnIteration
+from GPT_SoVITS.Accelerate.logger import SpeedColumnIteration, console, logger
 from GPT_SoVITS.text.cleaner import clean_text
 from tools.my_utils import clean_path
 

@@ -302,16 +301,8 @@ def is_powershell_env(env: dict) -> bool:
 
 
 def get_prog_name() -> str:
-    system = platform.system()
-    env = os.environ.copy()
-    script_rel = osp.join("GPT_SoVITS", "prepare_datasets", osp.basename(__file__))
-    if system == "Windows":
-        if is_powershell_env(env):
-            return rf"$env:PYTHONPATH='.'; python -s {script_rel}"
-        else:
-            return rf"set PYTHONPATH=. && python -s {script_rel}"
-    else:
-        return f"PYTHONPATH=. python -s {script_rel}"
+    script_rel = ".".join(["GPT_SoVITS", "prepare_datasets", osp.basename(__file__)]).strip(".py")
+    return f"python -s -m {script_rel}"
 
 
 if __name__ == "__main__":

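The rewritten helper drops the `platform` checks because `python -m` resolves the package from the current working directory, so no shell-specific `PYTHONPATH` incantation is needed. One caveat worth noting: `str.strip(".py")` removes *characters* from both ends rather than a suffix. It happens to work for these script names, but `removesuffix` (Python 3.9+) is the safe spelling:

```python
import os.path as osp

# strip(".py") strips any of the characters '.', 'p', 'y' from both ends:
assert "copy.py".strip(".py") == "co"  # over-strips

# removesuffix drops only an exact trailing ".py":
name = "3_get_semantic.py"  # one of the script names from this commit
module = ".".join(["GPT_SoVITS", "prepare_datasets", osp.basename(name)]).removesuffix(".py")
print(f"python -s -m {module}")  # python -s -m GPT_SoVITS.prepare_datasets.3_get_semantic
```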
@@ -1,7 +1,6 @@
 import enum
 import os
 import os.path as osp
-import platform
 import queue
 import sys
 import time

@@ -405,16 +404,8 @@ def is_powershell_env(env: dict) -> bool:
 
 
 def get_prog_name() -> str:
-    system = platform.system()
-    env = os.environ.copy()
-    script_rel = os.path.join("GPT_SoVITS", "prepare_datasets", os.path.basename(__file__))
-    if system == "Windows":
-        if is_powershell_env(env):
-            return rf"$env:PYTHONPATH='.'; python -s {script_rel}"
-        else:
-            return rf"set PYTHONPATH=. && python -s {script_rel}"
-    else:
-        return f"PYTHONPATH=. python -s {script_rel}"
+    script_rel = ".".join(["GPT_SoVITS", "prepare_datasets", osp.basename(__file__)]).strip(".py")
+    return f"python -s -m {script_rel}"
 
 
 if __name__ == "__main__":

@@ -2,7 +2,6 @@ import enum
 import gc
 import os
 import os.path as osp
-import platform
 import queue
 import sys
 import time

@@ -12,10 +11,10 @@ from typing import List, Tuple
 import torch
 import torch.multiprocessing as tmp
 import typer
-from rich.progress import BarColumn, Progress, TimeRemainingColumn, TextColumn
+from rich.progress import BarColumn, Progress, TextColumn, TimeRemainingColumn
 from torch.multiprocessing.spawn import spawn
 
-from GPT_SoVITS.Accelerate.logger import console, logger, SpeedColumnIteration
+from GPT_SoVITS.Accelerate.logger import SpeedColumnIteration, console, logger
 from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3
 from GPT_SoVITS.process_ckpt import inspect_version
 from tools.my_utils import DictToAttrRecursive, clean_path

@@ -295,16 +294,8 @@ def is_powershell_env(env: dict) -> bool:
 
 
 def get_prog_name() -> str:
-    system = platform.system()
-    env = os.environ.copy()
-    script_rel = osp.join("GPT_SoVITS", "prepare_datasets", osp.basename(__file__))
-    if system == "Windows":
-        if is_powershell_env(env):
-            return rf"$env:PYTHONPATH='.'; python -s {script_rel}"
-        else:
-            return rf"set PYTHONPATH=. && python -s {script_rel}"
-    else:
-        return f"PYTHONPATH=. python -s {script_rel}"
+    script_rel = ".".join(["GPT_SoVITS", "prepare_datasets", osp.basename(__file__)]).strip(".py")
+    return f"python -s -m {script_rel}"
 
 
 if __name__ == "__main__":

@@ -273,13 +273,13 @@ Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference
 #### Others
 
 ```bash
-PYTHONPATH=. python GPT_SoVITS/inference_webui.py <language(optional)> -b <backend> -p <port>
+python -m GPT_SoVITS.inference_webui <language(optional)> -b <backend> -p <port>
 ```
 
 OR
 
 ```bash
-PYTHONPATH=. python webui.py
+python webui.py
 ```
 
 then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`

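The README switch from `PYTHONPATH=. python GPT_SoVITS/inference_webui.py ...` to `python -m GPT_SoVITS.inference_webui ...` works because script and module execution initialize `sys.path` differently: running a file puts the file's own directory first, so the top-level `GPT_SoVITS` package does not resolve from inside it, while `-m` prepends the current working directory. A small probe (hypothetical file `subdir/probe.py`) makes the difference visible:

```python
import sys

# From the repo root:
#   python subdir/probe.py   -> prints .../subdir (the script's directory)
#   python -m subdir.probe   -> prints the repo root (the current working directory)
print(sys.path[0])
```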
@@ -265,7 +265,7 @@ python webui.py <language(optional)>
 #### 其他
 
 ```bash
-PYTHONPATH=. python GPT_SoVITS/inference_webui.py <language(optional)> -b <backend> -p <port>
+python -m GPT_SoVITS.inference_webui <language(optional)> -b <backend> -p <port>
 ```
 
 或者

@@ -251,13 +251,13 @@ python webui.py <言語(オプション)>
 #### その他
 
 ```bash
-PYTHONPATH=. python GPT_SoVITS/inference_webui.py <language(optional)> -b <backend> -p <port>
+python -m GPT_SoVITS.inference_webui <language(optional)> -b <backend> -p <port>
 ```
 
 または
 
 ```bash
-PYTHONPATH=. python webui.py
+python webui.py
 ```
 
 その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.

@@ -259,7 +259,7 @@ python webui.py <언어(옵션)>
 #### 기타
 
 ```bash
-PYTHONPATH=. python GPT_SoVITS/inference_webui.py <language(optional)> -b <backend> -p <port>
+python -m GPT_SoVITS.inference_webui <language(optional)> -b <backend> -p <port>
 ```
 
 또는

@@ -259,7 +259,7 @@ python webui.py <dil(isteğe bağlı)>
 #### Diğerleri
 
 ```text
-PYTHONPATH=. python GPT_SoVITS/inference_webui.py <language(optional)> -b <backend> -p <port>
+python -m GPT_SoVITS.inference_webui <language(optional)> -b <backend> -p <port>
 ```
 
 VEYA

@@ -2,6 +2,5 @@ set "SCRIPT_DIR=%~dp0"
 set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
 cd /d "%SCRIPT_DIR%"
 set "PATH=%SCRIPT_DIR%\runtime"
-set "PYTHONPATH=%SCRIPT_DIR%"
 runtime\python.exe -I webui.py zh_CN
 pause

@@ -3,6 +3,5 @@ chcp 65001
 Set-Location $PSScriptRoot
 $runtimePath = Join-Path $PSScriptRoot "runtime"
 $env:PATH = "$runtimePath"
-$env:PYTHONPATH = "$runtimePath"
 & "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
 pause

webui.py: 20 lines changed

@@ -44,11 +44,11 @@ from tools.assets import css, js, top_html
 from tools.i18n.i18n import I18nAuto, scan_language_list
 from tools.my_utils import check_details, check_for_existance
 
-os.environ["PYTHONPATH"] = now_dir = os.getcwd()
 os.environ["version"] = version = "v2Pro"
 os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
 os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
 os.environ["all_proxy"] = ""
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 
 
 backends_gradio = [(b.replace("-", " "), b) for b in backends]

@@ -86,7 +86,7 @@ def build_parser() -> argparse.ArgumentParser:
 
 args = build_parser().parse_args()
 
-tmp = os.path.join(now_dir, "TEMP")
+tmp = "TEMP"
 os.makedirs(tmp, exist_ok=True)
 os.environ["TEMP"] = tmp
 if os.path.exists(tmp):

@@ -794,11 +794,10 @@ def open1a(
     opt_dir = f"{exp_root}/{exp_name}"
 
     env = os.environ.copy()
-    env["PYTHONPATH"] = os.getcwd()
 
     # fmt: off
     cmd = [
-        python_exec, "-s", "GPT_SoVITS/prepare_datasets/1-get-text.py",
+        python_exec, "-s", "-m", "GPT_SoVITS.prepare_datasets.1_get_text",
         "--inp-list", inp_text,
         "--opt", opt_dir,
         "--bert", bert_pretrained_dir,

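With `-m`, the first dataset-prep stage now runs as a package module, so the child process needs no `PYTHONPATH` override as long as it is launched from the repository root. A hedged sketch of the launch (the argument values below are placeholders, not the webui's real wiring):

```python
import os
import subprocess
import sys

env = os.environ.copy()  # note: no env["PYTHONPATH"] = os.getcwd() anymore
cmd = [
    sys.executable, "-s", "-m", "GPT_SoVITS.prepare_datasets.1_get_text",
    "--inp-list", "path/to/filelist.txt",  # placeholder
    "--opt", "logs/experiment",            # placeholder
]
subprocess.run(cmd, env=env, check=True)  # run from the repo root
```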
@@ -884,11 +883,10 @@ def open1b(
     opt_dir = f"{exp_root}/{exp_name}"
 
     env = os.environ.copy()
-    env["PYTHONPATH"] = os.getcwd()
 
     # fmt: off
     cmd = [
-        python_exec, "-s", "GPT_SoVITS/prepare_datasets/2-get-hubert-sv-wav32k.py",
+        python_exec, "-s", "GPT_SoVITS/prepare_datasets/2_get_hubert_sv_wav32k.py",
         "--inp-list", inp_text,
         "--opt", opt_dir,
         "--cnhubert", ssl_pretrained_dir,

@@ -977,11 +975,10 @@ def open1c(
     opt_dir = f"{exp_root}/{exp_name}"
 
     env = os.environ.copy()
-    env["PYTHONPATH"] = os.getcwd()
 
     # fmt: off
     cmd = [
-        python_exec, "-s", "GPT_SoVITS/prepare_datasets/3-get-semantic.py",
+        python_exec, "-s", "GPT_SoVITS/prepare_datasets/3_get_semantic.py",
         "--inp-list", inp_text,
         "--opt", opt_dir,
         "--pretrained-s2g", pretrained_s2G_path,

@@ -1073,12 +1070,11 @@ def open1abc(
     opt_dir = f"{exp_root}/{exp_name}"
 
     env = os.environ.copy()
-    env["PYTHONPATH"] = os.getcwd()
 
     # Step 1
     # fmt: off
     cmd_1 = [
-        python_exec, "-s", "GPT_SoVITS/prepare_datasets/1-get-text.py",
+        python_exec, "-s", "GPT_SoVITS/prepare_datasets/1_get_text.py",
         "--inp-list", inp_text,
         "--opt", opt_dir,
         "--bert", bert_pretrained_dir,

@@ -1124,7 +1120,7 @@ def open1abc(
     # Step 2
     # fmt: off
     cmd_2 = [
-        python_exec, "-s", "GPT_SoVITS/prepare_datasets/2-get-hubert-sv-wav32k.py",
+        python_exec, "-s", "GPT_SoVITS/prepare_datasets/2_get_hubert_sv_wav32k.py",
         "--inp-list", inp_text,
         "--opt", opt_dir,
         "--cnhubert", ssl_pretrained_dir,

@@ -1175,7 +1171,7 @@ def open1abc(
     # Step 3
     # fmt: off
     cmd_3 = [
-        python_exec, "-s", "GPT_SoVITS/prepare_datasets/3-get-semantic.py",
+        python_exec, "-s", "GPT_SoVITS/prepare_datasets/3_get_semantic.py",
         "--inp-list", inp_text,
        "--opt", opt_dir,
         "--pretrained-s2g", pretrained_s2G_path,
