From 99a2e356f2bfdb5cd30fda35a03a00f96384b9af Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Fri, 13 Mar 2026 21:35:24 +0800
Subject: [PATCH 01/15] feat: remove "-q" option of conda installation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 install.ps1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index 7017524f..847ad900 100644
--- a/install.ps1
+++ b/install.ps1
@@ -52,7 +52,7 @@ function Invoke-Conda {
         [string[]]$Args
     )
 
-    $output = & conda install -y -q -c conda-forge @Args 2>&1
+    $output = & conda install -y -c conda-forge @Args 2>&1
     $exitCode = $LASTEXITCODE
 
     if ($exitCode -ne 0) {

From 0e83383544e65ab7950b81651379c37e06bb911c Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 09:32:11 +0800
Subject: [PATCH 02/15] feat: added bat file for launching webui with conda

---
 conda-go-webui.bat | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 conda-go-webui.bat

diff --git a/conda-go-webui.bat b/conda-go-webui.bat
new file mode 100644
index 00000000..271ccf30
--- /dev/null
+++ b/conda-go-webui.bat
@@ -0,0 +1,4 @@
+chcp 65001
+cd /d %~dp0
+conda activate %1
+python -I webui.py zh_CN
\ No newline at end of file

From 6e3db0126c55e0c3833585641d386221ac3cc8f5 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 12:59:09 +0800
Subject: [PATCH 03/15] fix: Fixed conda-go-webui.bat

---
 conda-go-webui.bat | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conda-go-webui.bat b/conda-go-webui.bat
index 271ccf30..30b811dc 100644
--- a/conda-go-webui.bat
+++ b/conda-go-webui.bat
@@ -1,4 +1,3 @@
 chcp 65001
 cd /d %~dp0
-conda activate %1
-python -I webui.py zh_CN
\ No newline at end of file
+conda activate %1 | python -I webui.py zh_CN
\ No newline at end of file

From eedb06b303a559c17796049909db25a2dde65561 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 13:01:11 +0800
Subject: [PATCH 04/15] fix: Fixed config.json loader in config.py

---
 config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 76965027..cc706b90 100644
--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ def merge_dir_txt2(*TXT):
 config_json_location = merge_dir_txt2(current_dir,"config.json")
 with open(str(config_json_location),"r") as f:
     __info__ = f.read()
-
+__info__ = json.loads(__info__)
 
 
 i18n = I18nAuto(language=os.environ.get("language", "Auto"))

From e49d396b18664edfca5808838898e1db502afa39 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 13:28:46 +0800
Subject: [PATCH 05/15] fix: add inst.bat and inst2.ps1 to work around the
 "The script failed due to call depth overflow." error that can occur when
 running install.ps1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md |   9 +++
 inst.bat  |   3 +
 inst2.ps1 | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)
 create mode 100644 inst.bat
 create mode 100644 inst2.ps1

diff --git a/README.md b/README.md
index 923f9a0a..76bc868b 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,15 @@ conda activate GPTSoVits
 pwsh -F install.ps1 --Device
--Source [--DownloadUVR5] ``` +If install.ps1 fails, you can try again or run the following commands: + +```pwsh +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits +inst.bat +pwsh -F inst2.ps1 --Device --Source [--DownloadUVR5] +``` + ### Linux ```bash diff --git a/inst.bat b/inst.bat new file mode 100644 index 00000000..050faa83 --- /dev/null +++ b/inst.bat @@ -0,0 +1,3 @@ +chcp 65001 +conda install -y -c conda-forge ffmpeg +conda install -y -c conda-forge cmake \ No newline at end of file diff --git a/inst2.ps1 b/inst2.ps1 new file mode 100644 index 00000000..4a877e82 --- /dev/null +++ b/inst2.ps1 @@ -0,0 +1,209 @@ +Param ( + [Parameter(Mandatory=$true)][ValidateSet("CU126", "CU128", "CPU")][string]$Device, + [Parameter(Mandatory=$true)][ValidateSet("HF", "HF-Mirror", "ModelScope")][string]$Source, + [switch]$DownloadUVR5 +) + +$global:ErrorActionPreference = 'Stop' + +trap { + Write-ErrorLog $_ +} + +function Write-ErrorLog { + param ( + [System.Management.Automation.ErrorRecord]$ErrorRecord + ) + + Write-Host "`n[ERROR] Command failed:" -ForegroundColor Red + if (-not $ErrorRecord.Exception.Message){ + } else { + Write-Host "Message:" -ForegroundColor Red + $ErrorRecord.Exception.Message -split "`n" | ForEach-Object { + Write-Host " $_" + } + } + + Write-Host "Command:" -ForegroundColor Red -NoNewline + Write-Host " $($ErrorRecord.InvocationInfo.Line)".Replace("`r", "").Replace("`n", "") + Write-Host "Location:" -ForegroundColor Red -NoNewline + Write-Host " $($ErrorRecord.InvocationInfo.ScriptName):$($ErrorRecord.InvocationInfo.ScriptLineNumber)" + Write-Host "Call Stack:" -ForegroundColor DarkRed + $ErrorRecord.ScriptStackTrace -split "`n" | ForEach-Object { + Write-Host " $_" -ForegroundColor DarkRed + } + + exit 1 +} + +function Write-Info($msg) { + Write-Host "[INFO]:" -ForegroundColor Green -NoNewline + Write-Host " $msg" +} +function Write-Success($msg) { + Write-Host "[SUCCESS]:" -ForegroundColor Blue -NoNewline + Write-Host " $msg" +} + +function Invoke-Pip { + param ( + [Parameter(ValueFromRemainingArguments = $true)] + [string[]]$Args + ) + + $output = & pip install @Args 2>&1 + $exitCode = $LASTEXITCODE + + if ($exitCode -ne 0) { + $errorMessages = @() + Write-Host "Pip Install $Args Failed" -ForegroundColor Red + foreach ($item in $output) { + if ($item -is [System.Management.Automation.ErrorRecord]) { + $msg = $item.Exception.Message + Write-Host "$msg" -ForegroundColor Red + $errorMessages += $msg + } + else { + Write-Host $item + $errorMessages += $item + } + } + throw [System.Exception]::new(($errorMessages -join "`n")) + } +} + +function Invoke-Download { + param ( + [Parameter(Mandatory = $true)] + [string]$Uri, + + [Parameter()] + [string]$OutFile + ) + + try { + $params = @{ + Uri = $Uri + } + + if ($OutFile) { + $params["OutFile"] = $OutFile + } + + $null = Invoke-WebRequest @params -ErrorAction Stop + + } catch { + Write-Host "Failed to download:" -ForegroundColor Red + Write-Host " $Uri" + throw + } +} + +function Invoke-Unzip { + param($ZipPath, $DestPath) + Expand-Archive -Path $ZipPath -DestinationPath $DestPath -Force + Remove-Item $ZipPath -Force +} + +chcp 65001 +Set-Location $PSScriptRoot + +$PretrainedURL = "" +$G2PWURL = "" +$UVR5URL = "" +$NLTKURL = "" +$OpenJTalkURL = "" + +switch ($Source) { + "HF" { + Write-Info "Download Model From HuggingFace" + $PretrainedURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + $G2PWURL = 
"https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + $UVR5URL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" + $NLTKURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + $OpenJTalkURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" + } + "HF-Mirror" { + Write-Info "Download Model From HuggingFace-Mirror" + $PretrainedURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + $G2PWURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + $UVR5URL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" + $NLTKURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + $OpenJTalkURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" + } + "ModelScope" { + Write-Info "Download Model From ModelScope" + $PretrainedURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip" + $G2PWURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip" + $UVR5URL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip" + $NLTKURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip" + $OpenJTalkURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz" + } +} + +if (-not (Test-Path "GPT_SoVITS/pretrained_models/sv")) { + Write-Info "Downloading Pretrained Models..." + Invoke-Download -Uri $PretrainedURL -OutFile "pretrained_models.zip" + Invoke-Unzip "pretrained_models.zip" "GPT_SoVITS" + Write-Success "Pretrained Models Downloaded" +} else { + Write-Info "Pretrained Model Exists" + Write-Info "Skip Downloading Pretrained Models" +} + + +if (-not (Test-Path "GPT_SoVITS/text/G2PWModel")) { + Write-Info "Downloading G2PWModel..." + Invoke-Download -Uri $G2PWURL -OutFile "G2PWModel.zip" + Invoke-Unzip "G2PWModel.zip" "GPT_SoVITS/text" + Write-Success "G2PWModel Downloaded" +} else { + Write-Info "G2PWModel Exists" + Write-Info "Skip Downloading G2PWModel" +} + +if ($DownloadUVR5) { + if (-not (Test-Path "tools/uvr5/uvr5_weights")) { + Write-Info "Downloading UVR5 Models..." + Invoke-Download -Uri $UVR5URL -OutFile "uvr5_weights.zip" + Invoke-Unzip "uvr5_weights.zip" "tools/uvr5" + Write-Success "UVR5 Models Downloaded" + } else { + Write-Info "UVR5 Models Exists" + Write-Info "Skip Downloading UVR5 Models" + } +} + +switch ($Device) { + "CU128" { + Write-Info "Installing PyTorch For CUDA 12.8..." + Invoke-Pip torch --index-url "https://download.pytorch.org/whl/cu128" + } + "CU126" { + Write-Info "Installing PyTorch For CUDA 12.6..." + Invoke-Pip torch --index-url "https://download.pytorch.org/whl/cu126" + } + "CPU" { + Write-Info "Installing PyTorch For CPU..." + Invoke-Pip torch --index-url "https://download.pytorch.org/whl/cpu" + } +} +Write-Success "PyTorch Installed" + +Write-Info "Installing Python Dependencies From requirements.txt..." +Invoke-Pip -r extra-req.txt --no-deps +Invoke-Pip -r requirements.txt +Write-Success "Python Dependencies Installed" + +Write-Info "Downloading NLTK Data..." 
+Invoke-Download -Uri $NLTKURL -OutFile "nltk_data.zip" +Invoke-Unzip "nltk_data.zip" (python -c "import sys; print(sys.prefix)").Trim() + +Write-Info "Downloading Open JTalk Dict..." +Invoke-Download -Uri $OpenJTalkURL -OutFile "open_jtalk_dic_utf_8-1.11.tar.gz" +$target = (python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))").Trim() +tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C $target +Remove-Item "open_jtalk_dic_utf_8-1.11.tar.gz" -Force +Write-Success "Open JTalk Dic Downloaded" + +Write-Success "Installation Completed" From 86ac5555e1b8fc745b2494f6df25eff0eb1fbb15 Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Sat, 14 Mar 2026 15:28:50 +0800 Subject: [PATCH 06/15] feat: Added webUI entries --- GPT_SoVITS/inference_webui.py | 76 +++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 3031b9ba..81a18cfd 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1329,6 +1329,70 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css visible=False, ) ) + + SaveSvEmb = gr.Checkbox( + label=i18n("保存参考音频的语义向量"), + interactive=True, + show_label=True, + value = False, + visible=False if model_version not in {"v2Pro","v2ProPlus"} else True + ) + SaveRefers = gr.Checkbox( + label=i18n("保存参考音频的声纹特征"), + interactive=True, + show_label=True, + value = False, + visible=True + + ) + SaveSvEmbName = gr.Textbox( + label=i18n("保存的语义向量文件名,默认保存在output/sv_emb_opt目录下"), + value="sv_emb.voice", + interactive=True, + visible=True, + ) + SaveRefersName = gr.Textbox( + label=i18n("保存的声纹特征文件名,默认保存在output/refers_opt目录下"), + value="refers.voice", + interactive=True, + visible=True, + ) + + InjectSvEmb = gr.Checkbox( + label=i18n("注入参考音频的语义向量"), + interactive=True, + show_label=True, + value = False, + visible=False if model_version not in {"v2Pro","v2ProPlus"} else True + ) + InjectRefers = gr.Checkbox( + label=i18n("注入参考音频的声纹特征"), + interactive=True, + show_label=True, + value = False, + visible=True + ) + + InjectSvEmbName = gr.Textbox( + label=i18n("注入的语义向量文件名,默认保存在output/sv_emb_opt目录下"), + value="sv_emb.voice", + interactive=True, + visible=True, + ) + InjectRefersName = gr.Textbox( + label=i18n("注入的声纹特征文件名,默认保存在output/refers_opt目录下"), + value="refers.voice", + interactive=True, + visible=True, + ) + + EnableAudioLoad = gr.Checkbox( + label=i18n("启用音频加载。开启后会加载参考音频"), + value=True, + interactive=True, + show_label=True, + visible=True, + ) sample_steps = ( gr.Radio( label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), @@ -1434,8 +1498,20 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css sample_steps, if_sr_Checkbox, pause_second_slider, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + EnableAudioLoad, + ], [output], + ) SoVITS_dropdown.change( change_sovits_weights, From 5450922d8d03063ce4267b6342897cfbab82cfd5 Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Thu, 19 Mar 2026 17:39:55 +0800 Subject: [PATCH 07/15] feat:Added entry to get value "ge" of class SynthesizerTrn --- GPT_SoVITS/module/models.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 348ddb3f..9b47ef90 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -989,10 +989,8 @@ class SynthesizerTrn(nn.Module): o = self.dec((z * 
y_mask)[:, :, :], g=ge) return o, y_mask, (z, z_p, m_p, logs_p) - - @torch.no_grad() - def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None): + def ge_(self, refer, sv_emb, InjectGE=False, GE=None, LoadGE=True): def get_ge(refer, sv_emb): ge = None if refer is not None: @@ -1007,15 +1005,28 @@ class SynthesizerTrn(nn.Module): ge += sv_emb.unsqueeze(-1) ge = self.prelu(ge) return ge - - if type(refer) == list: - ges = [] - for idx, _refer in enumerate(refer): - ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) - ges.append(ge) - ge = torch.stack(ges, 0).mean(0) + + if LoadGE: + if type(refer) == list: + ges = [] + for idx, _refer in enumerate(refer): + ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) + ges.append(ge) + ge = torch.stack(ges, 0).mean(0) + else: + ge = get_ge(refer, sv_emb) else: - ge = get_ge(refer, sv_emb) + if InjectGE: + if type(GE) == list: + GE = torch.stack(GE, 0).mean(0) + ge = GE + else: + raise ValueError + return ge + @torch.no_grad() + def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, + InjectGE=False,GE=None,LoadGE=True): + ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) From f3a9603eb06af95e77b6e4928ca70d606e7a8eee Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Sat, 21 Mar 2026 13:19:48 +0800 Subject: [PATCH 08/15] style: move new entries to the middle of the page --- GPT_SoVITS/inference_webui.py | 49 +++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 81a18cfd..e7dc34d2 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1307,28 +1307,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css ) ) prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1) - with gr.Column(scale=14): - prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"), - choices=list(dict_language.keys()), - value=i18n("中文"), - ) - inp_refs = ( - gr.File( - label=i18n( - "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" - ), - file_count="multiple", - ) - if model_version not in v3v4set - else gr.File( - label=i18n( - "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" - ), - file_count="multiple", - visible=False, - ) - ) + SaveSvEmb = gr.Checkbox( label=i18n("保存参考音频的语义向量"), @@ -1393,6 +1372,30 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css show_label=True, visible=True, ) + + with gr.Column(scale=14): + prompt_language = gr.Dropdown( + label=i18n("参考音频的语种"), + choices=list(dict_language.keys()), + value=i18n("中文"), + ) + inp_refs = ( + gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + ) + if model_version not in v3v4set + else gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + visible=False, + ) + ) + sample_steps = ( gr.Radio( label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), @@ -1498,7 +1501,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css sample_steps, if_sr_Checkbox, pause_second_slider, - + SaveSvEmb, SaveRefers, SaveSvEmbName, From 
47170fd555316564322df41ad5bf9d4c2c69d678 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sun, 29 Mar 2026 11:10:28 +0800
Subject: [PATCH 09/15] feat: add the ability to append tensors to a
 tensor-group file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/VoiceSave/__init__.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/GPT_SoVITS/VoiceSave/__init__.py b/GPT_SoVITS/VoiceSave/__init__.py
index dadc19e7..4e175d51 100644
--- a/GPT_SoVITS/VoiceSave/__init__.py
+++ b/GPT_SoVITS/VoiceSave/__init__.py
@@ -99,7 +99,12 @@ class ZIP_File:
         fl.delete_dir(self.temp_write)
         POOL.remove(self.name)
 
-def save_tensor(path: str, tensors: Union[torch.Tensor, list],name:str,MySet:set=set(),file_names:Union[str,list,None]=None,**info_save) -> None:
+def save_tensor(path: str,
+                tensors: Union[torch.Tensor, list],
+                name:str,
+                MySet:set=set(),
+                file_names:Union[str,list,None]=None,
+                **info_save,) -> None:
     if isinstance(tensors, torch.Tensor):
         tensors = [tensors]
     if not file_names:
@@ -128,7 +133,10 @@ def save_tensor(path: str, tensors: Union[torch.Tensor, list],name:str,MySet:set
         zf.close()
         del zf
 
-def load_tensor(path: str,name:str,find_func,MySet:set=set()) -> list[torch.Tensor]:
+def load_tensor(path: str,
+                name:str,
+                find_func,
+                MySet:set=set(),) -> list[torch.Tensor]:
     zf = ZIP_File(path, name, MySet=MySet)
     zf.release()
     voice_path = find_func(zf,il)
@@ -140,4 +148,16 @@ def load_tensor(path: str,name:str,find_func,MySet:set=set()) -> list[torch.Tens
         tensors.append(tensor)
     zf.close()
     del zf
-    return tensors
\ No newline at end of file
+    return tensors
+
+def add_tensor(add:list[torch.Tensor],
+               path: str,
+               name:str,
+               find_func,
+               MySet:set=set(),
+               file_names:Union[str,list,None]=None,
+               **info_save,):
+    tensors = load_tensor(path,name,find_func,MySet=MySet)
+    tensors.extend(add)
+    save_tensor(path,tensors,name,MySet=MySet,file_names=file_names,**info_save)
+    
\ No newline at end of file

From 46ae12bf17cb1a9f84624f9b17e9c694aa023008 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Thu, 2 Apr 2026 17:24:19 +0800
Subject: [PATCH 10/15] feat: add an entry for shutting down the TTS WebUI and
 entries for saving intermediates such as ge, for distribution and use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/config.json        |   3 +
 GPT_SoVITS/inference_webui.py | 126 ++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 GPT_SoVITS/config.json

diff --git a/GPT_SoVITS/config.json b/GPT_SoVITS/config.json
new file mode 100644
index 00000000..0965b480
--- /dev/null
+++ b/GPT_SoVITS/config.json
@@ -0,0 +1,3 @@
+{
+    "running_on" : "local"
+}
\ No newline at end of file

diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index e7dc34d2..3eed1b35 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -9,7 +9,11 @@ import psutil
 import os
 import sys
+import json
 from pathlib import Path
+import uuid
+
+
 
 
 def get_my_dir():
     return os.path.dirname(os.path.abspath(__file__))
@@ -23,6 +27,11 @@ def get_parent_dir(dir_path,depth=1):
 
 def merge_dir_txt2(*TXT):
     return
Path(os.path.join(*TXT)) +with open(merge_dir_txt2(get_my_dir(), "config.json"), "r", encoding="utf-8") as f: + config_json = f.read() + config_json = json.loads(config_json) + running_on = config_json["running_on"] + ROOT_DIR = str(get_parent_dir(get_my_dir())) sys.path.append(get_my_dir()) import VoiceSave @@ -816,12 +825,19 @@ def get_tts_wav( SaveSvEmbName="sv_emb.voice", SaveRefersName="refers.voice", + SaveGE=False, + SaveGEName="ge.voice", + InjectSvEmb=False, InjectRefers=False, InjectSvEmbName="sv_emb.voice", InjectRefersName="refers.voice", EnableAudioLoad=True, + + SaveOutputAsUndecoded=False, + SaveOutputAsUndecodedName="output.voice", + AddRandomSaltToSaveOutputAsUndecodedName=False, ): global cache if ref_wav_path: @@ -1041,6 +1057,60 @@ def get_tts_wav( #print("注入后refers数量:", len(refers)) #print("注入后sv_emb数量:", len(sv_emb) if is_v2pro else "无sv_emb") + try: + ges = [] + for i in range(len(refers)): + if is_v2pro: + ge_ = vq_model.ge_(refers[i],sv_emb[i]) + else: + ge_ = vq_model.ge_(refers[i]) + ges.append(ge_) + if SaveGE: + names = [] + for i in ges: + names.append(_get_unique_name(str(i.shape))+".npy") + ge_path = merge_dir_txt2(ROOT_DIR,"output","ge_opt") + if not os.path.exists(ge_path): + os.makedirs(ge_path,exist_ok=True) + if not os.path.exists(SaveGEName): + _pth_ = str(merge_dir_txt2(ROOT_DIR,"output","ge_opt",SaveGEName)) + else: + _pth_ = SaveGEName + VoiceSave.save_tensor(_pth_,ges,SaveGEName,file_names=names,access_list=names) + except: + traceback.print_exc() + + if AddRandomSaltToSaveOutputAsUndecodedName: + ranA = uuid.uuid4() + ranB = uuid.uuid4() + SaveOutputAsUndecodedName = f"{SaveOutputAsUndecodedName}_{ranA}_{ranB}.voice" + try: + if SaveOutputAsUndecoded: + if is_v2pro: + z_p,mask,ge = vq_model.decode2( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), + refers, speed=speed, sv_emb=sv_emb) + else: + z_p,mask,ge = vq_model.decode2( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), + refers, speed=speed) + ret = [z_p.cpu().detach(), + mask.cpu().detach(), + ge.cpu().detach()] + names = [f"z_p_{str(ret[0].shape)}", + f"mask_{str(ret[1].shape)}", + f"ge_{str(ret[2].shape)}"] + undecoded_path = merge_dir_txt2(ROOT_DIR,"output","undecoded_opt") + if not os.path.exists(undecoded_path): + os.makedirs(undecoded_path,exist_ok=True) + if not os.path.exists(SaveOutputAsUndecodedName): + _pth_ = str(merge_dir_txt2(ROOT_DIR,"output","undecoded_opt",SaveOutputAsUndecodedName)) + else: + _pth_ = SaveOutputAsUndecodedName + VoiceSave.save_tensor(_pth_,ret,SaveOutputAsUndecodedName,file_names=names,access_list=names) + except: + traceback.print_exc() + if is_v2pro: audio = vq_model.decode( pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb @@ -1129,6 +1199,11 @@ def get_tts_wav( audio_opt = audio_opt.cpu().detach().numpy() yield opt_sr, (audio_opt * 32767).astype(np.int16) +def close_serv(): + if running_on == "local" + sys.exit(0) + else: + gr.Warning(i18n("服务器环境下该功能不可用")) def split(todo_text): todo_text = todo_text.replace("……", "。").replace("——", ",") @@ -1372,7 +1447,47 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css show_label=True, visible=True, ) + + SaveGE = gr.Checkbox( + label = i18n("保存GE"), + value = True, + interactive = True, + show_label = True, + visible = True, + ) + SaveGEName = gr.Textbox( + label = i18n("保存的GE文件名,默认保存在output/ge_opt目录下"), + value = "ge.voice", + interactive = True, + show_label = True, + visible = True, + ) + + 
SaveOutputAsUndecoded = gr.Checkbox( + label = i18n("保存未解码的输出"), + value = False, + interactive = True, + show_label = True, + visible = True, + ) + + SaveOutputAsUndecodedName = gr.Textbox( + label = i18n("保存的未解码输出文件名,默认保存在output/undecoded_opt目录下"), + value = "output.voice", + interactive = True, + show_label = True, + visible = True, + ) + + AddRandomSaltToSaveOutputAsUndecodedName = gr.Checkbox( + label = i18n("给未解码输出文件名添加随机盐,防止覆盖"), + value = False, + interactive = True, + show_label = True, + visible = True, + ) + with gr.Column(scale=14): prompt_language = gr.Dropdown( label=i18n("参考音频的语种"), @@ -1482,6 +1597,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) output = gr.Audio(label=i18n("输出的语音"), scale=14) + with gr.Row(): + close_button = gr.Button(value=i18n("关闭服务器"), variant="danger", size="lg", scale=25) + + close_button.click(close_serv) + inference_button.click( get_tts_wav, [ @@ -1506,12 +1626,18 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css SaveRefers, SaveSvEmbName, SaveRefersName, + SaveGE, + SaveGEName, InjectSvEmb, InjectRefers, InjectSvEmbName, InjectRefersName, EnableAudioLoad, + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + ], [output], From 5c03499fcf95e7de3306a46417adc98625023d15 Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Thu, 2 Apr 2026 17:26:08 +0800 Subject: [PATCH 11/15] =?UTF-8?q?feat:=E5=90=91=20VoiceSave=20=E6=A8=A1?= =?UTF-8?q?=E5=9D=97=E4=B8=AD=E6=B7=BB=E5=8A=A0=20find=5Ffunc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/VoiceSave/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/GPT_SoVITS/VoiceSave/__init__.py b/GPT_SoVITS/VoiceSave/__init__.py index 4e175d51..621a7293 100644 --- a/GPT_SoVITS/VoiceSave/__init__.py +++ b/GPT_SoVITS/VoiceSave/__init__.py @@ -114,6 +114,7 @@ def save_tensor(path: str, else: files = file_names + print(f"length of tensors: {len(tensors)}, length of files: {len(files)}") if len(tensors) != len(files): raise ValueError("The number of tensors and files must be the same.") np_arrays = [] @@ -160,4 +161,18 @@ def add_tensor(add:list[torch.Tensor], tensors = load_tensor(path,name,find_func,MySet=MySet) tensors.extend(add) save_tensor(path,tensors,name,MySet=MySet,file_names=file_names,**info_save) - \ No newline at end of file + +def __find_func__(zf,il): + f = zf.get_file_path("voice.json") + info = il.load_info(f) + if info is None: + return None + list_names = info["access_list"] + ret = [] + for name in list_names: + try: + a = zf.get_file_path(name) + ret.append(a) + except FileNotFoundError: + continue + return ret \ No newline at end of file From cb2b844f45e9186734a5ae50d856dd81e367c1ed Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Sat, 4 Apr 2026 14:17:07 +0800 Subject: [PATCH 12/15] feat: Added ReturnWay option to get_tts_wav --- GPT_SoVITS/inference_webui.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 3eed1b35..709b5d56 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -838,6 +838,8 @@ def get_tts_wav( SaveOutputAsUndecoded=False, SaveOutputAsUndecodedName="output.voice", AddRandomSaltToSaveOutputAsUndecodedName=False, + + ReturnWay = "yield", # "yield" or "return" ): 
global cache if ref_wav_path: @@ -1197,10 +1199,15 @@ def get_tts_wav( audio_opt /= max_audio else: audio_opt = audio_opt.cpu().detach().numpy() - yield opt_sr, (audio_opt * 32767).astype(np.int16) + + if ReturnWay == "yield": + yield opt_sr, (audio_opt * 32767).astype(np.int16) + else: + return opt_sr, (audio_opt * 32767).astype(np.int16) + def close_serv(): - if running_on == "local" + if running_on == "local": sys.exit(0) else: gr.Warning(i18n("服务器环境下该功能不可用")) From fb50fc090f84d4336e5920ea6f2e88a781a9814d Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Mon, 6 Apr 2026 12:58:00 +0800 Subject: [PATCH 13/15] feat:Added batch tts option --- GPT_SoVITS/config.json | 6 +- GPT_SoVITS/feature_extractor/cnhubert.py | 2 + GPT_SoVITS/inference_webui.py | 276 ++++++++++++++++++++++- GPT_SoVITS/sv.py | 5 +- 4 files changed, 283 insertions(+), 6 deletions(-) diff --git a/GPT_SoVITS/config.json b/GPT_SoVITS/config.json index 0965b480..73825583 100644 --- a/GPT_SoVITS/config.json +++ b/GPT_SoVITS/config.json @@ -1,3 +1,7 @@ { - "running_on" : "local" + "running_on" : "local", + "Default":{ + "GPT_Path": "不训练直接推v3底模!", + "SoVITS_Path": "不训练直接推v2ProPlus底模!" + } } \ No newline at end of file diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index f22b8d09..a81de48e 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -24,6 +24,7 @@ class CNHubert(nn.Module): super().__init__() if base_path is None: base_path = cnhubert_base_path + print(f"Loading CN-Hubert from \"{base_path}\"") if os.path.exists(base_path): ... else: @@ -69,6 +70,7 @@ class CNHubert(nn.Module): def get_model(): + print("cnhubert_base_path:", cnhubert_base_path) model = CNHubert() model.eval() return model diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 709b5d56..ebc18c9a 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -12,6 +12,7 @@ import sys import json from pathlib import Path import uuid +from scipy.io.wavfile import write @@ -31,6 +32,7 @@ with open(merge_dir_txt2(get_my_dir(), "config.json"), "r", encoding="utf-8") as config_json = f.read() config_json = json.loads(config_json) running_on = config_json["running_on"] + Default = config_json["Default"] ROOT_DIR = str(get_parent_dir(get_my_dir())) sys.path.append(get_my_dir()) @@ -124,6 +126,7 @@ with open("./weight.json", "r", encoding="utf-8") as file: if isinstance(sovits_path, list): sovits_path = sovits_path[0] + # print(2333333) # print(os.environ["gpt_path"]) # print(gpt_path) @@ -150,7 +153,7 @@ import numpy as np from feature_extractor import cnhubert from transformers import AutoModelForMaskedLM, AutoTokenizer -cnhubert.cnhubert_base_path = cnhubert_base_path +cnhubert.cnhubert_base_path = merge_dir_txt2(ROOT_DIR, cnhubert_base_path) import random @@ -184,6 +187,12 @@ language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) + +if gpt_path in [None, "",]: + gpt_path = str(merge_dir_txt2(ROOT_DIR, name2gpt_path[i18n(Default["GPT_Path"])])) +if sovits_path in [None, "",]: + sovits_path = str(merge_dir_txt2(ROOT_DIR, name2sovits_path[i18n(Default["SoVITS_Path"])])) + # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 if torch.cuda.is_available(): @@ -214,8 +223,8 @@ dict_language_v2 = { } dict_language = dict_language_v1 if version == "v1" else dict_language_v2 -tokenizer = 
AutoTokenizer.from_pretrained(bert_path) -bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +tokenizer = AutoTokenizer.from_pretrained(str(merge_dir_txt2(ROOT_DIR,bert_path))) +bert_model = AutoModelForMaskedLM.from_pretrained(str(merge_dir_txt2(ROOT_DIR,bert_path))) if is_half == True: bert_model = bert_model.half().to(device) else: @@ -428,6 +437,7 @@ except: def change_gpt_weights(gpt_path): + print("gpt_path:", gpt_path) if "!" in gpt_path or "!" in gpt_path: gpt_path = name2gpt_path[gpt_path] global hz, max_sec, t2s_model, config @@ -1205,7 +1215,204 @@ def get_tts_wav( else: return opt_sr, (audio_opt * 32767).astype(np.int16) +def batched_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + texts, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, + SaveSvEmb=False, + SaveRefers=False, + SaveSvEmbName="sv_emb.voice", + SaveRefersName="refers.voice", + + SaveGE=False, + SaveGEName="ge.voice", + + InjectSvEmb=False, + InjectRefers=False, + InjectSvEmbName="sv_emb.voice", + InjectRefersName="refers.voice", + + EnableAudioLoad=True, + + SaveOutputAsUndecoded=False, + SaveOutputAsUndecodedName="output.voice", + AddRandomSaltToSaveOutputAsUndecodedName=False, + + ReturnWay = "yield", # "yield" or "return" +): + count = 0 + out = [] + SaveDir = merge_dir_txt2(ROOT_DIR,"output","tts_output",f"batch_{uuid.uuid4()}") + if not os.path.exists(SaveDir): + os.makedirs(SaveDir,exist_ok=True) + for text in texts: + if text in [None, " ", ""]: + gr.Warning(i18n(f"输入文本第{count}行中有空行,已跳过")) + continue + else: + unparsed = get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr, + pause_second, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + + SaveGE, + SaveGEName, + + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + + EnableAudioLoad, + + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + "yield", + ) + unparsed = list(unparsed) + print(unparsed) + a = text.strip().replace(' ','_').replace('\n','_') + wav_path = os.path.join(SaveDir,f"tts_output_{a}_{str(uuid.uuid4())}.wav") + write(wav_path, unparsed[0][0], unparsed[0][1]) + out.append(wav_path) + count += 1 + if ReturnWay == "yield": + yield SaveDir + else: + return SaveDir + +def read_tts_batch_file(file_path): + ret = [] + with open(file_path, 'r', encoding='utf-8') as f: + lines = f.readlines() + for l in lines: + if l.strip() in [None, " ", ""]: + continue + else: + ret.append(l) + return ret + +def batch_tts( + ref_wav_path, + prompt_text, + prompt_language, + text_paths, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, + + SaveSvEmb=False, + SaveRefers=False, + SaveSvEmbName="sv_emb.voice", + SaveRefersName="refers.voice", + + SaveGE=False, + SaveGEName="ge.voice", + + InjectSvEmb=False, + InjectRefers=False, + InjectSvEmbName="sv_emb.voice", + InjectRefersName="refers.voice", + + EnableAudioLoad=True, + + SaveOutputAsUndecoded=False, + SaveOutputAsUndecodedName="output.voice", + AddRandomSaltToSaveOutputAsUndecodedName=False, + + ReturnWay = "yield", # "yield" or "return" +): + 
print(text_paths) + text_list = [] + for i in text_paths: + text_list.extend(read_tts_batch_file(i)) + out = batched_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text_list, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr, + pause_second, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + + SaveGE, + SaveGEName, + + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + + EnableAudioLoad, + + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + + "yield" + ) + out = list(out) + + if ReturnWay == "yield": + yield out + else: + return out def close_serv(): if running_on == "local": sys.exit(0) @@ -1540,6 +1747,25 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css show_label=True, visible=False if model_version != "v3" else True, ) + with gr.Row(): + gr.Markdown(html_center(i18n("批量语音合成参数"), "h3")) + with gr.Column(scale=13): + txt_paths = gr.File(label=i18n("批量语音合成文本文件,每行一个文本"), + file_types=[".txt"], + interactive=True, + file_count="multiple", + scale=13) + with gr.Column(scale=7): + out = gr.File(label=i18n("批量合成输出的语音文件"), + file_types=[".wav"], + file_count="directory",) + start_batch_btn = gr.Button(i18n("开始批量合成"), + variant="primary", + size="lg", + interactive=True, + scale=25) + + gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) with gr.Row(): with gr.Column(scale=13): @@ -1648,7 +1874,51 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css ], [output], + api_name="get_tts_wav", ) + + start_batch_btn.click( + batch_tts, + [ + inp_ref, + prompt_text, + prompt_language, + txt_paths, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_text_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr_Checkbox, + pause_second_slider, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + SaveGE, + SaveGEName, + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + EnableAudioLoad, + + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + + ], + [out], + + api_name="batch_tts", + ) + SoVITS_dropdown.change( change_sovits_weights, [SoVITS_dropdown, prompt_language, text_language], diff --git a/GPT_SoVITS/sv.py b/GPT_SoVITS/sv.py index 22e70369..7fab06aa 100644 --- a/GPT_SoVITS/sv.py +++ b/GPT_SoVITS/sv.py @@ -1,9 +1,10 @@ import sys import os import torch +from pathlib import Path -sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net") -sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" +sys.path.append(f"{str(Path(os.path.dirname(os.path.abspath(__file__))).parent)}/GPT_SoVITS/eres2net") +sv_path = f"{str(Path(os.path.dirname(os.path.abspath(__file__))).parent)}/GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" from ERes2NetV2 import ERes2NetV2 import kaldi as Kaldi From 24d7290c116032e0e615cdf52c877d5c0a13261d Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Mon, 6 Apr 2026 12:59:31 +0800 Subject: [PATCH 14/15] feat: Added VoiceChange.py --- GPT_SoVITS/module/VoiceChange.py | 175 +++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 GPT_SoVITS/module/VoiceChange.py diff --git a/GPT_SoVITS/module/VoiceChange.py b/GPT_SoVITS/module/VoiceChange.py new file mode 100644 index 00000000..ca0a1dda --- /dev/null +++ b/GPT_SoVITS/module/VoiceChange.py @@ -0,0 +1,175 @@ +import torch +import 
torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import torchaudio
+import math
+from torchaudio.transforms import Resample
+import VoiceSave
+import uuid
+
+def get_train_set(voice_file_path):
+    if type(voice_file_path) == str:
+        voice_file_path = [voice_file_path]
+    ret = []
+    for i in voice_file_path:
+        tensors_ = VoiceSave.load_tensor(i,
+                                         f"get_{uuid.uuid4()}",
+                                         find_func=VoiceSave.__find_func__,
+                                         MySet=set())
+        ret.append(tensors_)
+    return ret
+
+class MelSpectrogram(nn.Module):
+    def __init__(self, hps):
+        super().__init__()
+        self.filter_length = hps.data.filter_length
+        self.hop_length = hps.data.hop_length
+        self.win_length = hps.data.win_length
+        self.sampling_rate = hps.data.sampling_rate
+        self.n_mel_channels = hps.data.n_mel_channels
+        self.mel_fmin = hps.data.mel_fmin if hasattr(hps.data, 'mel_fmin') else 0
+        self.mel_fmax = hps.data.mel_fmax if hasattr(hps.data, 'mel_fmax') else None
+
+        # Build the mel-spectrogram transform
+        self.mel_transform = torchaudio.transforms.MelSpectrogram(
+            sample_rate=self.sampling_rate,
+            n_fft=self.filter_length,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            f_min=self.mel_fmin,
+            f_max=self.mel_fmax,
+            n_mels=192, # self.n_mel_channels,
+            window_fn=torch.hann_window,
+            center=False,
+            power=1.0,
+        )
+
+    def forward(self, audio):
+        """
+        Input: audio [B, 1, T] or [1, T] (mono audio)
+        Output: mel_spec [B, n_mel_channels, T']
+        """
+        if len(audio.shape) == 2:
+            audio = audio.unsqueeze(0)  # [1, T] → [1, 1, T]
+
+        # Extract the mel spectrogram
+        mel_spec = self.mel_transform(audio.squeeze(1))  # [B, n_mel, T']
+
+        # Log scaling (standard practice for TTS)
+        mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
+
+        return mel_spec
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_seq_length=5000):
+        super(PositionalEncoding, self).__init__()
+        self.pe = torch.zeros(max_seq_length, d_model)  # initialize the positional-encoding matrix
+        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+        self.pe[:, 0::2] = torch.sin(position * div_term)  # even positions use sine
+        self.pe[:, 1::2] = torch.cos(position * div_term)  # odd positions use cosine
+        self.register_buffer('pe', self.pe.unsqueeze(0))  # register as a buffer
+
+    def forward(self, x):
+        # Add the positional encoding to the input
+        return x + self.pe[:, :x.size(1)]
+
+class Spliter(nn.Module):
+    '''output: z_p shape: torch.Size([1, 192, x]), y_mask shape: torch.Size([1, 1, x]), ge shape: torch.Size([1, 1024, 1])'''
+    def __init__(self,
+                 hps,
+                 ge,
+                 device):
+        super().__init__()
+        self.hps = hps
+
+        self.ge = ge
+        self.device = device
+        #TODO: feed mel_spec and ge into the Transformer model
+        self.mel_dim = 192
+        self.ge_dim = 1024
+        self.transformer_dim = 512
+        self.ge_proj = nn.Linear(self.ge_dim, self.transformer_dim).to(self.device)
+        self.mel_proj = nn.Linear(self.mel_dim, self.transformer_dim).to(self.device)
+        self.pos_encoder = PositionalEncoding(self.transformer_dim).to(self.device)
+        self.transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                d_model=self.transformer_dim,
+                nhead=hps.model.nhead,
+                dim_feedforward=hps.model.ffn_dim,
+                batch_first=False,
+                dropout=0.1
+            ),
+            num_layers=hps.model.num_layers
+        ).to(self.device)
+
+        self.out_proj = nn.Linear(self.transformer_dim, self.mel_dim).to(self.device)
+
+    @torch.no_grad()
+    def mel_(self,audio_path, hps, device, dtype):
+        sr_target = int(hps.data.sampling_rate)
+        audio, sr_origin = torchaudio.load(audio_path)
+        if audio.shape[0] > 1:
+            audio = audio.mean(0, keepdim=True)
+        if sr_origin != sr_target:
+            resampler = Resample(sr_origin, sr_target).to(device)
+            audio = resampler(audio.to(device))
+        else:
+            audio = audio.to(device)
+        max_audio = audio.abs().max()
+        if max_audio > 1.0:
+            audio = audio / max_audio
+        mel_extractor = MelSpectrogram(hps).to(device)
+        mel_spec = mel_extractor(audio).to(dtype)
+        return mel_spec
+
+    def forward(self, audio_path, ge,device,dtype):
+        # Inputs: audio_path, ge
+        # Outputs: z_p, y_mask, ge
+        ge_ = ge
+        mel = self.mel_(audio_path, self.hps, device, dtype)
+
+        mel = mel.permute(2, 0, 1)
+        # Project the mel spectrogram to the Transformer dimension: [T, 1, 512]
+        mel_feat = self.mel_proj(mel)
+
+        # Process the global (emotion) feature GE: [1,1024,1] → [1,1024] → [1,1,512]
+        ge = ge.to(device, dtype=dtype)
+        ge_squeeze = ge.squeeze(-1)  # [1, 1024]
+        ge_feat = self.ge_proj(ge_squeeze).unsqueeze(0)  # [1, 1, 512]
+
+        # ===================== 3. Feature fusion and Transformer input =====================
+        # Prepend the GE feature to the mel-spectrogram sequence: [T+1, 1, 512]
+        self.transformer_input = torch.cat([ge_feat, mel_feat], dim=0)
+        # Add positional encoding
+        self.transformer_input = self.pos_encoder(self.transformer_input)
+
+        # ===================== 4. Transformer encoding =====================
+        transformer_out = self.transformer(self.transformer_input)  # [T+1, 1, 512]
+
+        # ===================== 5. Output feature reconstruction =====================
+        # Drop the leading GE position and keep the mel-spectrogram outputs: [T, 1, 512]
+        mel_out = transformer_out[1:, :, :]
+        # Project back to the original mel dimension: [T, 1, 192]
+        mel_out = self.out_proj(mel_out)
+        # Convert to the target layout: [1, 192, T] → z_p
+        z_p = mel_out.permute(1, 2, 0)
+
+        # ===================== 6. Generate the mask =====================
+        T = z_p.shape[-1]  # number of mel-spectrogram time steps
+        y_mask = torch.ones(1, 1, T, device=device, dtype=dtype)  # [1,1,T] all-ones mask
+
+        # ===================== 7. Output (strictly matching the format in the class docstring) =====================
+        return z_p, y_mask, ge_
+
+class SpliterDataset(torch.utils.data.Dataset):
+    def __init__(self, voice_file_paths):
+        self.voice_file_paths = voice_file_paths
+        self.datas = get_train_set(voice_file_paths)
+
+    def __len__(self):
+        return len(self.datas)
+
+    def __getitem__(self, idx):
+        return self.datas[idx]
\ No newline at end of file

From e6a67650fffbfa499e9b8bcd11814a5afa5ff040 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Mon, 6 Apr 2026 13:01:32 +0800
Subject: [PATCH 15/15] feat: add the ability to export intermediate tensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/module/models.py  | 125 +++++++++++++++++++++++++++++++++--
 GPT_SoVITS/module/modules.py |   2 +
 2 files changed, 122 insertions(+), 5 deletions(-)

diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index 9b47ef90..ad2a4e43 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -25,6 +25,53 @@ import contextlib
 import random
 
+import torchaudio
+from torchaudio.transforms import Resample
+import os
+from pathlib import Path
+def merge_dir_txt2(*TXT):
+    return Path(os.path.join(*TXT))
+
+def get_my_dir():
+    return os.path.dirname(os.path.abspath(__file__))
+
+def get_parent_dir(dir_path,depth=1):
+    parent_path = Path(dir_path)
+    for _ in range(depth):
+        parent_path = parent_path.parent
+    return parent_path
+
+POOL:set = set()
+def _get_unique_name(name,MySet:set=set()):
+    _id = 1
+    if name not in POOL and name not in MySet:
+        POOL.add(name)
+        return name
+    while name in POOL or name in MySet:
+        _id += 1
+        name = f'{name}_{_id}'
+    POOL.add(name)
+    return name
+
+def find_func(zf,il):
+    f = zf.get_file_path("voice.json")
+    info = il.load_info(f)
+    if info is None:
+        return None
+    list_names = info["access_list"]
+    global POOL
+    POOL.update(list_names)
+    ret = []
+    for name
in list_names: + try: + a = zf.get_file_path(name) + ret.append(a) + except FileNotFoundError: + continue + return ret + +ROOT_DIR = str(get_parent_dir(get_my_dir())) + class StochasticDurationPredictor(nn.Module): def __init__( self, @@ -153,7 +200,7 @@ class DurationPredictor(nn.Module): WINDOW = {} -class TextEncoder(nn.Module): +class TextEncoder(nn.Module): def __init__( self, out_channels, @@ -990,7 +1037,7 @@ class SynthesizerTrn(nn.Module): o = self.dec((z * y_mask)[:, :, :], g=ge) return o, y_mask, (z, z_p, m_p, logs_p) @torch.no_grad() - def ge_(self, refer, sv_emb, InjectGE=False, GE=None, LoadGE=True): + def ge_(self, refer, sv_emb=None, InjectGE=False, GE=None, LoadGE=True): def get_ge(refer, sv_emb): ge = None if refer is not None: @@ -1004,6 +1051,7 @@ class SynthesizerTrn(nn.Module): sv_emb = self.sv_emb(sv_emb) # B*20480->B*512 ge += sv_emb.unsqueeze(-1) ge = self.prelu(ge) + print(f"ge.shape : {ge.shape}") return ge if LoadGE: @@ -1021,11 +1069,17 @@ class SynthesizerTrn(nn.Module): GE = torch.stack(GE, 0).mean(0) ge = GE else: - raise ValueError + raise ValueError("No GE stream provided!") return ge + @torch.no_grad() def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, - InjectGE=False,GE=None,LoadGE=True): + InjectGE=False,GE=None,LoadGE=True, + InjectZP=False,ZP=None,LoadZP=True, + OverWrite_Mask=False,Mask=None, + SaveGE=False,SaveZP=False,SaveMask=False, + GE_Name=None, ZP_Name=None, Mask_Name=None, + VoiceSave=None): ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) @@ -1042,14 +1096,75 @@ class SynthesizerTrn(nn.Module): self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge, speed, ) - z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + if InjectZP: + if type(ZP) == list: + ZP = torch.stack(ZP, 0).mean(0) + else: + ZP = ZP + z_p = ZP + else: + if LoadZP: + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + else: + raise ValueError("No z_p stream provided!") + + if OverWrite_Mask: + if type(Mask) == list: + Mask = torch.stack(Mask, 0).mean(0) + if Mask is None: + raise ValueError("No mask stream provided!") + y_mask = Mask + print(f"z_p shape: {z_p.shape}, y_mask shape: {y_mask.shape}, ge shape: {ge.shape}") z = self.flow(z_p, y_mask, g=ge, reverse=True) o = self.dec((z * y_mask)[:, :, :], g=ge) return o + @torch.no_grad() + def decode2(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, + InjectGE=False,GE=None,LoadGE=True, + InjectZP=False,ZP=None,LoadZP=True, + OverWrite_Mask=False,Mask=None,): + ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE) + + y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) + text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) + + quantized = self.quantizer.decode(codes) + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + x, m_p, logs_p, y_mask, _, _ = self.enc_p( + quantized, + y_lengths, + text, + text_lengths, + self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge, + speed, + ) + + if InjectZP: + if type(ZP) == list: + ZP = torch.stack(ZP, 0).mean(0) + else: + ZP = ZP + z_p = ZP + else: + if LoadZP: + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + else: + raise ValueError("No z_p stream provided!") + + if OverWrite_Mask: + if type(Mask) == list: + Mask = torch.stack(Mask, 0).mean(0) + if Mask is None: + raise 
ValueError("No mask stream provided!") + y_mask = Mask + print(f"z_p shape: {z_p.shape}, y_mask shape: {y_mask.shape}, ge shape: {ge.shape}") + return z_p, y_mask, ge + @torch.no_grad() def decode_streaming(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, result_length:int=None, overlap_frames:torch.Tensor=None, padding_length:int=None): def get_ge(refer, sv_emb): diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index 6fa84a43..2ff7e8db 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -432,6 +432,8 @@ class ResidualCouplingLayer(nn.Module): self.post.bias.data.zero_() def forward(self, x, x_mask, g=None, reverse=False): + + print(f"x.shape: {x.shape}, x_mask.shape: {x_mask.shape}") x0, x1 = torch.split(x, [self.half_channels] * 2, 1) h = self.pre(x0) * x_mask h = self.enc(h, x_mask, g=g)