From 99a2e356f2bfdb5cd30fda35a03a00f96384b9af Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Fri, 13 Mar 2026 21:35:24 +0800
Subject: [PATCH 01/15] feat: remove "-q" option of conda installation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 install.ps1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index 7017524f..847ad900 100644
--- a/install.ps1
+++ b/install.ps1
@@ -52,7 +52,7 @@ function Invoke-Conda {
         [string[]]$Args
     )
 
-    $output = & conda install -y -q -c conda-forge @Args 2>&1
+    $output = & conda install -y -c conda-forge @Args 2>&1
     $exitCode = $LASTEXITCODE
 
     if ($exitCode -ne 0) {

From 0e83383544e65ab7950b81651379c37e06bb911c Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 09:32:11 +0800
Subject: [PATCH 02/15] feat: added bat file for launching webui with conda

---
 conda-go-webui.bat | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 conda-go-webui.bat

diff --git a/conda-go-webui.bat b/conda-go-webui.bat
new file mode 100644
index 00000000..271ccf30
--- /dev/null
+++ b/conda-go-webui.bat
@@ -0,0 +1,4 @@
+chcp 65001
+cd /d %~dp0
+conda activate %1
+python -I webui.py zh_CN
\ No newline at end of file

From 6e3db0126c55e0c3833585641d386221ac3cc8f5 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 12:59:09 +0800
Subject: [PATCH 03/15] fix: Fixed conda-go-webui.bat

---
 conda-go-webui.bat | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conda-go-webui.bat b/conda-go-webui.bat
index 271ccf30..30b811dc 100644
--- a/conda-go-webui.bat
+++ b/conda-go-webui.bat
@@ -1,4 +1,3 @@
 chcp 65001
 cd /d %~dp0
-conda activate %1
-python -I webui.py zh_CN
\ No newline at end of file
+conda activate %1 | python -I webui.py zh_CN
\ No newline at end of file

From eedb06b303a559c17796049909db25a2dde65561 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 13:01:11 +0800
Subject: [PATCH 04/15] fix: Fixed config.json loader in config.py

---
 config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 76965027..cc706b90 100644
--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ def merge_dir_txt2(*TXT):
 config_json_location = merge_dir_txt2(current_dir,"config.json")
 with open(str(config_json_location),"r") as f:
     __info__ = f.read()
-
+__info__ = json.loads(__info__)
 
 
 i18n = I18nAuto(language=os.environ.get("language", "Auto"))

From e49d396b18664edfca5808838898e1db502afa39 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sat, 14 Mar 2026 13:28:46 +0800
Subject: [PATCH 05/15] fix: add inst.bat and inst2.ps1 to work around the
 "The script failed due to call depth overflow." error that can occur when
 running install.ps1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md |   9 +++
 inst.bat  |   3 +
 inst2.ps1 | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)
 create mode 100644 inst.bat
 create mode 100644 inst2.ps1

diff --git a/README.md b/README.md
index 923f9a0a..76bc868b 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,15 @@ conda activate GPTSoVits
 pwsh -F install.ps1 --Device
--Source [--DownloadUVR5] ``` +If install.ps1 fails, you can try again or run the following commands: + +```pwsh +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits +inst.bat +pwsh -F inst2.ps1 --Device --Source [--DownloadUVR5] +``` + ### Linux ```bash diff --git a/inst.bat b/inst.bat new file mode 100644 index 00000000..050faa83 --- /dev/null +++ b/inst.bat @@ -0,0 +1,3 @@ +chcp 65001 +conda install -y -c conda-forge ffmpeg +conda install -y -c conda-forge cmake \ No newline at end of file diff --git a/inst2.ps1 b/inst2.ps1 new file mode 100644 index 00000000..4a877e82 --- /dev/null +++ b/inst2.ps1 @@ -0,0 +1,209 @@ +Param ( + [Parameter(Mandatory=$true)][ValidateSet("CU126", "CU128", "CPU")][string]$Device, + [Parameter(Mandatory=$true)][ValidateSet("HF", "HF-Mirror", "ModelScope")][string]$Source, + [switch]$DownloadUVR5 +) + +$global:ErrorActionPreference = 'Stop' + +trap { + Write-ErrorLog $_ +} + +function Write-ErrorLog { + param ( + [System.Management.Automation.ErrorRecord]$ErrorRecord + ) + + Write-Host "`n[ERROR] Command failed:" -ForegroundColor Red + if (-not $ErrorRecord.Exception.Message){ + } else { + Write-Host "Message:" -ForegroundColor Red + $ErrorRecord.Exception.Message -split "`n" | ForEach-Object { + Write-Host " $_" + } + } + + Write-Host "Command:" -ForegroundColor Red -NoNewline + Write-Host " $($ErrorRecord.InvocationInfo.Line)".Replace("`r", "").Replace("`n", "") + Write-Host "Location:" -ForegroundColor Red -NoNewline + Write-Host " $($ErrorRecord.InvocationInfo.ScriptName):$($ErrorRecord.InvocationInfo.ScriptLineNumber)" + Write-Host "Call Stack:" -ForegroundColor DarkRed + $ErrorRecord.ScriptStackTrace -split "`n" | ForEach-Object { + Write-Host " $_" -ForegroundColor DarkRed + } + + exit 1 +} + +function Write-Info($msg) { + Write-Host "[INFO]:" -ForegroundColor Green -NoNewline + Write-Host " $msg" +} +function Write-Success($msg) { + Write-Host "[SUCCESS]:" -ForegroundColor Blue -NoNewline + Write-Host " $msg" +} + +function Invoke-Pip { + param ( + [Parameter(ValueFromRemainingArguments = $true)] + [string[]]$Args + ) + + $output = & pip install @Args 2>&1 + $exitCode = $LASTEXITCODE + + if ($exitCode -ne 0) { + $errorMessages = @() + Write-Host "Pip Install $Args Failed" -ForegroundColor Red + foreach ($item in $output) { + if ($item -is [System.Management.Automation.ErrorRecord]) { + $msg = $item.Exception.Message + Write-Host "$msg" -ForegroundColor Red + $errorMessages += $msg + } + else { + Write-Host $item + $errorMessages += $item + } + } + throw [System.Exception]::new(($errorMessages -join "`n")) + } +} + +function Invoke-Download { + param ( + [Parameter(Mandatory = $true)] + [string]$Uri, + + [Parameter()] + [string]$OutFile + ) + + try { + $params = @{ + Uri = $Uri + } + + if ($OutFile) { + $params["OutFile"] = $OutFile + } + + $null = Invoke-WebRequest @params -ErrorAction Stop + + } catch { + Write-Host "Failed to download:" -ForegroundColor Red + Write-Host " $Uri" + throw + } +} + +function Invoke-Unzip { + param($ZipPath, $DestPath) + Expand-Archive -Path $ZipPath -DestinationPath $DestPath -Force + Remove-Item $ZipPath -Force +} + +chcp 65001 +Set-Location $PSScriptRoot + +$PretrainedURL = "" +$G2PWURL = "" +$UVR5URL = "" +$NLTKURL = "" +$OpenJTalkURL = "" + +switch ($Source) { + "HF" { + Write-Info "Download Model From HuggingFace" + $PretrainedURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + $G2PWURL = 
"https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + $UVR5URL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" + $NLTKURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + $OpenJTalkURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" + } + "HF-Mirror" { + Write-Info "Download Model From HuggingFace-Mirror" + $PretrainedURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + $G2PWURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + $UVR5URL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" + $NLTKURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + $OpenJTalkURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" + } + "ModelScope" { + Write-Info "Download Model From ModelScope" + $PretrainedURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip" + $G2PWURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip" + $UVR5URL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip" + $NLTKURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip" + $OpenJTalkURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz" + } +} + +if (-not (Test-Path "GPT_SoVITS/pretrained_models/sv")) { + Write-Info "Downloading Pretrained Models..." + Invoke-Download -Uri $PretrainedURL -OutFile "pretrained_models.zip" + Invoke-Unzip "pretrained_models.zip" "GPT_SoVITS" + Write-Success "Pretrained Models Downloaded" +} else { + Write-Info "Pretrained Model Exists" + Write-Info "Skip Downloading Pretrained Models" +} + + +if (-not (Test-Path "GPT_SoVITS/text/G2PWModel")) { + Write-Info "Downloading G2PWModel..." + Invoke-Download -Uri $G2PWURL -OutFile "G2PWModel.zip" + Invoke-Unzip "G2PWModel.zip" "GPT_SoVITS/text" + Write-Success "G2PWModel Downloaded" +} else { + Write-Info "G2PWModel Exists" + Write-Info "Skip Downloading G2PWModel" +} + +if ($DownloadUVR5) { + if (-not (Test-Path "tools/uvr5/uvr5_weights")) { + Write-Info "Downloading UVR5 Models..." + Invoke-Download -Uri $UVR5URL -OutFile "uvr5_weights.zip" + Invoke-Unzip "uvr5_weights.zip" "tools/uvr5" + Write-Success "UVR5 Models Downloaded" + } else { + Write-Info "UVR5 Models Exists" + Write-Info "Skip Downloading UVR5 Models" + } +} + +switch ($Device) { + "CU128" { + Write-Info "Installing PyTorch For CUDA 12.8..." + Invoke-Pip torch --index-url "https://download.pytorch.org/whl/cu128" + } + "CU126" { + Write-Info "Installing PyTorch For CUDA 12.6..." + Invoke-Pip torch --index-url "https://download.pytorch.org/whl/cu126" + } + "CPU" { + Write-Info "Installing PyTorch For CPU..." + Invoke-Pip torch --index-url "https://download.pytorch.org/whl/cpu" + } +} +Write-Success "PyTorch Installed" + +Write-Info "Installing Python Dependencies From requirements.txt..." +Invoke-Pip -r extra-req.txt --no-deps +Invoke-Pip -r requirements.txt +Write-Success "Python Dependencies Installed" + +Write-Info "Downloading NLTK Data..." 
+Invoke-Download -Uri $NLTKURL -OutFile "nltk_data.zip" +Invoke-Unzip "nltk_data.zip" (python -c "import sys; print(sys.prefix)").Trim() + +Write-Info "Downloading Open JTalk Dict..." +Invoke-Download -Uri $OpenJTalkURL -OutFile "open_jtalk_dic_utf_8-1.11.tar.gz" +$target = (python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))").Trim() +tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C $target +Remove-Item "open_jtalk_dic_utf_8-1.11.tar.gz" -Force +Write-Success "Open JTalk Dic Downloaded" + +Write-Success "Installation Completed" From 86ac5555e1b8fc745b2494f6df25eff0eb1fbb15 Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Sat, 14 Mar 2026 15:28:50 +0800 Subject: [PATCH 06/15] feat: Added webUI entries --- GPT_SoVITS/inference_webui.py | 76 +++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 3031b9ba..81a18cfd 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1329,6 +1329,70 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css visible=False, ) ) + + SaveSvEmb = gr.Checkbox( + label=i18n("保存参考音频的语义向量"), + interactive=True, + show_label=True, + value = False, + visible=False if model_version not in {"v2Pro","v2ProPlus"} else True + ) + SaveRefers = gr.Checkbox( + label=i18n("保存参考音频的声纹特征"), + interactive=True, + show_label=True, + value = False, + visible=True + + ) + SaveSvEmbName = gr.Textbox( + label=i18n("保存的语义向量文件名,默认保存在output/sv_emb_opt目录下"), + value="sv_emb.voice", + interactive=True, + visible=True, + ) + SaveRefersName = gr.Textbox( + label=i18n("保存的声纹特征文件名,默认保存在output/refers_opt目录下"), + value="refers.voice", + interactive=True, + visible=True, + ) + + InjectSvEmb = gr.Checkbox( + label=i18n("注入参考音频的语义向量"), + interactive=True, + show_label=True, + value = False, + visible=False if model_version not in {"v2Pro","v2ProPlus"} else True + ) + InjectRefers = gr.Checkbox( + label=i18n("注入参考音频的声纹特征"), + interactive=True, + show_label=True, + value = False, + visible=True + ) + + InjectSvEmbName = gr.Textbox( + label=i18n("注入的语义向量文件名,默认保存在output/sv_emb_opt目录下"), + value="sv_emb.voice", + interactive=True, + visible=True, + ) + InjectRefersName = gr.Textbox( + label=i18n("注入的声纹特征文件名,默认保存在output/refers_opt目录下"), + value="refers.voice", + interactive=True, + visible=True, + ) + + EnableAudioLoad = gr.Checkbox( + label=i18n("启用音频加载。开启后会加载参考音频"), + value=True, + interactive=True, + show_label=True, + visible=True, + ) sample_steps = ( gr.Radio( label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), @@ -1434,8 +1498,20 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css sample_steps, if_sr_Checkbox, pause_second_slider, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + EnableAudioLoad, + ], [output], + ) SoVITS_dropdown.change( change_sovits_weights, From 5450922d8d03063ce4267b6342897cfbab82cfd5 Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Thu, 19 Mar 2026 17:39:55 +0800 Subject: [PATCH 07/15] feat:Added entry to get value "ge" of class SynthesizerTrn --- GPT_SoVITS/module/models.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 348ddb3f..9b47ef90 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -989,10 +989,8 @@ class SynthesizerTrn(nn.Module): o = self.dec((z * 
y_mask)[:, :, :], g=ge) return o, y_mask, (z, z_p, m_p, logs_p) - - @torch.no_grad() - def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None): + def ge_(self, refer, sv_emb, InjectGE=False, GE=None, LoadGE=True): def get_ge(refer, sv_emb): ge = None if refer is not None: @@ -1007,15 +1005,28 @@ class SynthesizerTrn(nn.Module): ge += sv_emb.unsqueeze(-1) ge = self.prelu(ge) return ge - - if type(refer) == list: - ges = [] - for idx, _refer in enumerate(refer): - ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) - ges.append(ge) - ge = torch.stack(ges, 0).mean(0) + + if LoadGE: + if type(refer) == list: + ges = [] + for idx, _refer in enumerate(refer): + ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) + ges.append(ge) + ge = torch.stack(ges, 0).mean(0) + else: + ge = get_ge(refer, sv_emb) else: - ge = get_ge(refer, sv_emb) + if InjectGE: + if type(GE) == list: + GE = torch.stack(GE, 0).mean(0) + ge = GE + else: + raise ValueError + return ge + @torch.no_grad() + def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, + InjectGE=False,GE=None,LoadGE=True): + ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) From f3a9603eb06af95e77b6e4928ca70d606e7a8eee Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Sat, 21 Mar 2026 13:19:48 +0800 Subject: [PATCH 08/15] style: move new entries to the middle of the page --- GPT_SoVITS/inference_webui.py | 49 +++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 81a18cfd..e7dc34d2 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1307,28 +1307,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css ) ) prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1) - with gr.Column(scale=14): - prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"), - choices=list(dict_language.keys()), - value=i18n("中文"), - ) - inp_refs = ( - gr.File( - label=i18n( - "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" - ), - file_count="multiple", - ) - if model_version not in v3v4set - else gr.File( - label=i18n( - "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" - ), - file_count="multiple", - visible=False, - ) - ) + SaveSvEmb = gr.Checkbox( label=i18n("保存参考音频的语义向量"), @@ -1393,6 +1372,30 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css show_label=True, visible=True, ) + + with gr.Column(scale=14): + prompt_language = gr.Dropdown( + label=i18n("参考音频的语种"), + choices=list(dict_language.keys()), + value=i18n("中文"), + ) + inp_refs = ( + gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + ) + if model_version not in v3v4set + else gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + visible=False, + ) + ) + sample_steps = ( gr.Radio( label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), @@ -1498,7 +1501,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css sample_steps, if_sr_Checkbox, pause_second_slider, - + SaveSvEmb, SaveRefers, SaveSvEmbName, From 
47170fd555316564322df41ad5bf9d4c2c69d678 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Sun, 29 Mar 2026 11:10:28 +0800
Subject: [PATCH 09/15] feat: add the ability to append tensors to a
 tensor-group file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/VoiceSave/__init__.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/GPT_SoVITS/VoiceSave/__init__.py b/GPT_SoVITS/VoiceSave/__init__.py
index dadc19e7..4e175d51 100644
--- a/GPT_SoVITS/VoiceSave/__init__.py
+++ b/GPT_SoVITS/VoiceSave/__init__.py
@@ -99,7 +99,12 @@ class ZIP_File:
         fl.delete_dir(self.temp_write)
         POOL.remove(self.name)
 
-def save_tensor(path: str, tensors: Union[torch.Tensor, list],name:str,MySet:set=set(),file_names:Union[str,list,None]=None,**info_save) -> None:
+def save_tensor(path: str,
+                tensors: Union[torch.Tensor, list],
+                name:str,
+                MySet:set=set(),
+                file_names:Union[str,list,None]=None,
+                **info_save,) -> None:
     if isinstance(tensors, torch.Tensor):
         tensors = [tensors]
     if not file_names:
@@ -128,7 +133,10 @@ def save_tensor(path: str, tensors: Union[torch.Tensor, list],name:str,MySet:set
         zf.close()
         del zf
 
-def load_tensor(path: str,name:str,find_func,MySet:set=set()) -> list[torch.Tensor]:
+def load_tensor(path: str,
+                name:str,
+                find_func,
+                MySet:set=set(),) -> list[torch.Tensor]:
     zf = ZIP_File(path, name, MySet=MySet)
     zf.release()
     voice_path = find_func(zf,il)
@@ -140,4 +148,16 @@ def load_tensor(path: str,name:str,find_func,MySet:set=set()) -> list[torch.Tens
         tensors.append(tensor)
     zf.close()
     del zf
-    return tensors
\ No newline at end of file
+    return tensors
+
+def add_tensor(add:list[torch.Tensor],
+               path: str,
+               name:str,
+               find_func,
+               MySet:set=set(),
+               file_names:Union[str,list,None]=None,
+               **info_save,):
+    tensors = load_tensor(path,name,find_func,MySet=MySet)
+    tensors.extend(add)
+    save_tensor(path,tensors,name,MySet=MySet,file_names=file_names,**info_save)
+    
\ No newline at end of file

From 46ae12bf17cb1a9f84624f9b17e9c694aa023008 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Thu, 2 Apr 2026 17:24:19 +0800
Subject: [PATCH 10/15] feat: add an entry for shutting down the TTS WebUI and
 entries for saving intermediates such as ge, for distribution and use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/config.json        |   3 +
 GPT_SoVITS/inference_webui.py | 126 ++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 GPT_SoVITS/config.json

diff --git a/GPT_SoVITS/config.json b/GPT_SoVITS/config.json
new file mode 100644
index 00000000..0965b480
--- /dev/null
+++ b/GPT_SoVITS/config.json
@@ -0,0 +1,3 @@
+{
+    "running_on" : "local"
+}
\ No newline at end of file

diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index e7dc34d2..3eed1b35 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -9,7 +9,11 @@ import psutil
 import os
 import sys
+import json
 from pathlib import Path
+import uuid
+
+
 
 
 def get_my_dir():
     return os.path.dirname(os.path.abspath(__file__))
@@ -23,6 +27,11 @@ def get_parent_dir(dir_path,depth=1):
 
 def merge_dir_txt2(*TXT):
     return
Path(os.path.join(*TXT)) +with open(merge_dir_txt2(get_my_dir(), "config.json"), "r", encoding="utf-8") as f: + config_json = f.read() + config_json = json.loads(config_json) + running_on = config_json["running_on"] + ROOT_DIR = str(get_parent_dir(get_my_dir())) sys.path.append(get_my_dir()) import VoiceSave @@ -816,12 +825,19 @@ def get_tts_wav( SaveSvEmbName="sv_emb.voice", SaveRefersName="refers.voice", + SaveGE=False, + SaveGEName="ge.voice", + InjectSvEmb=False, InjectRefers=False, InjectSvEmbName="sv_emb.voice", InjectRefersName="refers.voice", EnableAudioLoad=True, + + SaveOutputAsUndecoded=False, + SaveOutputAsUndecodedName="output.voice", + AddRandomSaltToSaveOutputAsUndecodedName=False, ): global cache if ref_wav_path: @@ -1041,6 +1057,60 @@ def get_tts_wav( #print("注入后refers数量:", len(refers)) #print("注入后sv_emb数量:", len(sv_emb) if is_v2pro else "无sv_emb") + try: + ges = [] + for i in range(len(refers)): + if is_v2pro: + ge_ = vq_model.ge_(refers[i],sv_emb[i]) + else: + ge_ = vq_model.ge_(refers[i]) + ges.append(ge_) + if SaveGE: + names = [] + for i in ges: + names.append(_get_unique_name(str(i.shape))+".npy") + ge_path = merge_dir_txt2(ROOT_DIR,"output","ge_opt") + if not os.path.exists(ge_path): + os.makedirs(ge_path,exist_ok=True) + if not os.path.exists(SaveGEName): + _pth_ = str(merge_dir_txt2(ROOT_DIR,"output","ge_opt",SaveGEName)) + else: + _pth_ = SaveGEName + VoiceSave.save_tensor(_pth_,ges,SaveGEName,file_names=names,access_list=names) + except: + traceback.print_exc() + + if AddRandomSaltToSaveOutputAsUndecodedName: + ranA = uuid.uuid4() + ranB = uuid.uuid4() + SaveOutputAsUndecodedName = f"{SaveOutputAsUndecodedName}_{ranA}_{ranB}.voice" + try: + if SaveOutputAsUndecoded: + if is_v2pro: + z_p,mask,ge = vq_model.decode2( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), + refers, speed=speed, sv_emb=sv_emb) + else: + z_p,mask,ge = vq_model.decode2( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), + refers, speed=speed) + ret = [z_p.cpu().detach(), + mask.cpu().detach(), + ge.cpu().detach()] + names = [f"z_p_{str(ret[0].shape)}", + f"mask_{str(ret[1].shape)}", + f"ge_{str(ret[2].shape)}"] + undecoded_path = merge_dir_txt2(ROOT_DIR,"output","undecoded_opt") + if not os.path.exists(undecoded_path): + os.makedirs(undecoded_path,exist_ok=True) + if not os.path.exists(SaveOutputAsUndecodedName): + _pth_ = str(merge_dir_txt2(ROOT_DIR,"output","undecoded_opt",SaveOutputAsUndecodedName)) + else: + _pth_ = SaveOutputAsUndecodedName + VoiceSave.save_tensor(_pth_,ret,SaveOutputAsUndecodedName,file_names=names,access_list=names) + except: + traceback.print_exc() + if is_v2pro: audio = vq_model.decode( pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb @@ -1129,6 +1199,11 @@ def get_tts_wav( audio_opt = audio_opt.cpu().detach().numpy() yield opt_sr, (audio_opt * 32767).astype(np.int16) +def close_serv(): + if running_on == "local" + sys.exit(0) + else: + gr.Warning(i18n("服务器环境下该功能不可用")) def split(todo_text): todo_text = todo_text.replace("……", "。").replace("——", ",") @@ -1372,7 +1447,47 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css show_label=True, visible=True, ) + + SaveGE = gr.Checkbox( + label = i18n("保存GE"), + value = True, + interactive = True, + show_label = True, + visible = True, + ) + SaveGEName = gr.Textbox( + label = i18n("保存的GE文件名,默认保存在output/ge_opt目录下"), + value = "ge.voice", + interactive = True, + show_label = True, + visible = True, + ) + + 
SaveOutputAsUndecoded = gr.Checkbox( + label = i18n("保存未解码的输出"), + value = False, + interactive = True, + show_label = True, + visible = True, + ) + + SaveOutputAsUndecodedName = gr.Textbox( + label = i18n("保存的未解码输出文件名,默认保存在output/undecoded_opt目录下"), + value = "output.voice", + interactive = True, + show_label = True, + visible = True, + ) + + AddRandomSaltToSaveOutputAsUndecodedName = gr.Checkbox( + label = i18n("给未解码输出文件名添加随机盐,防止覆盖"), + value = False, + interactive = True, + show_label = True, + visible = True, + ) + with gr.Column(scale=14): prompt_language = gr.Dropdown( label=i18n("参考音频的语种"), @@ -1482,6 +1597,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) output = gr.Audio(label=i18n("输出的语音"), scale=14) + with gr.Row(): + close_button = gr.Button(value=i18n("关闭服务器"), variant="danger", size="lg", scale=25) + + close_button.click(close_serv) + inference_button.click( get_tts_wav, [ @@ -1506,12 +1626,18 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css SaveRefers, SaveSvEmbName, SaveRefersName, + SaveGE, + SaveGEName, InjectSvEmb, InjectRefers, InjectSvEmbName, InjectRefersName, EnableAudioLoad, + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + ], [output], From 5c03499fcf95e7de3306a46417adc98625023d15 Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Thu, 2 Apr 2026 17:26:08 +0800 Subject: [PATCH 11/15] =?UTF-8?q?feat:=E5=90=91=20VoiceSave=20=E6=A8=A1?= =?UTF-8?q?=E5=9D=97=E4=B8=AD=E6=B7=BB=E5=8A=A0=20find=5Ffunc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/VoiceSave/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/GPT_SoVITS/VoiceSave/__init__.py b/GPT_SoVITS/VoiceSave/__init__.py index 4e175d51..621a7293 100644 --- a/GPT_SoVITS/VoiceSave/__init__.py +++ b/GPT_SoVITS/VoiceSave/__init__.py @@ -114,6 +114,7 @@ def save_tensor(path: str, else: files = file_names + print(f"length of tensors: {len(tensors)}, length of files: {len(files)}") if len(tensors) != len(files): raise ValueError("The number of tensors and files must be the same.") np_arrays = [] @@ -160,4 +161,18 @@ def add_tensor(add:list[torch.Tensor], tensors = load_tensor(path,name,find_func,MySet=MySet) tensors.extend(add) save_tensor(path,tensors,name,MySet=MySet,file_names=file_names,**info_save) - \ No newline at end of file + +def __find_func__(zf,il): + f = zf.get_file_path("voice.json") + info = il.load_info(f) + if info is None: + return None + list_names = info["access_list"] + ret = [] + for name in list_names: + try: + a = zf.get_file_path(name) + ret.append(a) + except FileNotFoundError: + continue + return ret \ No newline at end of file From cb2b844f45e9186734a5ae50d856dd81e367c1ed Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Sat, 4 Apr 2026 14:17:07 +0800 Subject: [PATCH 12/15] feat: Added ReturnWay option to get_tts_wav --- GPT_SoVITS/inference_webui.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 3eed1b35..709b5d56 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -838,6 +838,8 @@ def get_tts_wav( SaveOutputAsUndecoded=False, SaveOutputAsUndecodedName="output.voice", AddRandomSaltToSaveOutputAsUndecodedName=False, + + ReturnWay = "yield", # "yield" or "return" ): 
global cache if ref_wav_path: @@ -1197,10 +1199,15 @@ def get_tts_wav( audio_opt /= max_audio else: audio_opt = audio_opt.cpu().detach().numpy() - yield opt_sr, (audio_opt * 32767).astype(np.int16) + + if ReturnWay == "yield": + yield opt_sr, (audio_opt * 32767).astype(np.int16) + else: + return opt_sr, (audio_opt * 32767).astype(np.int16) + def close_serv(): - if running_on == "local" + if running_on == "local": sys.exit(0) else: gr.Warning(i18n("服务器环境下该功能不可用")) From fb50fc090f84d4336e5920ea6f2e88a781a9814d Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Mon, 6 Apr 2026 12:58:00 +0800 Subject: [PATCH 13/15] feat:Added batch tts option --- GPT_SoVITS/config.json | 6 +- GPT_SoVITS/feature_extractor/cnhubert.py | 2 + GPT_SoVITS/inference_webui.py | 276 ++++++++++++++++++++++- GPT_SoVITS/sv.py | 5 +- 4 files changed, 283 insertions(+), 6 deletions(-) diff --git a/GPT_SoVITS/config.json b/GPT_SoVITS/config.json index 0965b480..73825583 100644 --- a/GPT_SoVITS/config.json +++ b/GPT_SoVITS/config.json @@ -1,3 +1,7 @@ { - "running_on" : "local" + "running_on" : "local", + "Default":{ + "GPT_Path": "不训练直接推v3底模!", + "SoVITS_Path": "不训练直接推v2ProPlus底模!" + } } \ No newline at end of file diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index f22b8d09..a81de48e 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -24,6 +24,7 @@ class CNHubert(nn.Module): super().__init__() if base_path is None: base_path = cnhubert_base_path + print(f"Loading CN-Hubert from \"{base_path}\"") if os.path.exists(base_path): ... else: @@ -69,6 +70,7 @@ class CNHubert(nn.Module): def get_model(): + print("cnhubert_base_path:", cnhubert_base_path) model = CNHubert() model.eval() return model diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 709b5d56..ebc18c9a 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -12,6 +12,7 @@ import sys import json from pathlib import Path import uuid +from scipy.io.wavfile import write @@ -31,6 +32,7 @@ with open(merge_dir_txt2(get_my_dir(), "config.json"), "r", encoding="utf-8") as config_json = f.read() config_json = json.loads(config_json) running_on = config_json["running_on"] + Default = config_json["Default"] ROOT_DIR = str(get_parent_dir(get_my_dir())) sys.path.append(get_my_dir()) @@ -124,6 +126,7 @@ with open("./weight.json", "r", encoding="utf-8") as file: if isinstance(sovits_path, list): sovits_path = sovits_path[0] + # print(2333333) # print(os.environ["gpt_path"]) # print(gpt_path) @@ -150,7 +153,7 @@ import numpy as np from feature_extractor import cnhubert from transformers import AutoModelForMaskedLM, AutoTokenizer -cnhubert.cnhubert_base_path = cnhubert_base_path +cnhubert.cnhubert_base_path = merge_dir_txt2(ROOT_DIR, cnhubert_base_path) import random @@ -184,6 +187,12 @@ language = os.environ.get("language", "Auto") language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) + +if gpt_path in [None, "",]: + gpt_path = str(merge_dir_txt2(ROOT_DIR, name2gpt_path[i18n(Default["GPT_Path"])])) +if sovits_path in [None, "",]: + sovits_path = str(merge_dir_txt2(ROOT_DIR, name2sovits_path[i18n(Default["SoVITS_Path"])])) + # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 if torch.cuda.is_available(): @@ -214,8 +223,8 @@ dict_language_v2 = { } dict_language = dict_language_v1 if version == "v1" else dict_language_v2 -tokenizer = 
AutoTokenizer.from_pretrained(bert_path) -bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +tokenizer = AutoTokenizer.from_pretrained(str(merge_dir_txt2(ROOT_DIR,bert_path))) +bert_model = AutoModelForMaskedLM.from_pretrained(str(merge_dir_txt2(ROOT_DIR,bert_path))) if is_half == True: bert_model = bert_model.half().to(device) else: @@ -428,6 +437,7 @@ except: def change_gpt_weights(gpt_path): + print("gpt_path:", gpt_path) if "!" in gpt_path or "!" in gpt_path: gpt_path = name2gpt_path[gpt_path] global hz, max_sec, t2s_model, config @@ -1205,7 +1215,204 @@ def get_tts_wav( else: return opt_sr, (audio_opt * 32767).astype(np.int16) +def batched_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + texts, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, + SaveSvEmb=False, + SaveRefers=False, + SaveSvEmbName="sv_emb.voice", + SaveRefersName="refers.voice", + + SaveGE=False, + SaveGEName="ge.voice", + + InjectSvEmb=False, + InjectRefers=False, + InjectSvEmbName="sv_emb.voice", + InjectRefersName="refers.voice", + + EnableAudioLoad=True, + + SaveOutputAsUndecoded=False, + SaveOutputAsUndecodedName="output.voice", + AddRandomSaltToSaveOutputAsUndecodedName=False, + + ReturnWay = "yield", # "yield" or "return" +): + count = 0 + out = [] + SaveDir = merge_dir_txt2(ROOT_DIR,"output","tts_output",f"batch_{uuid.uuid4()}") + if not os.path.exists(SaveDir): + os.makedirs(SaveDir,exist_ok=True) + for text in texts: + if text in [None, " ", ""]: + gr.Warning(i18n(f"输入文本第{count}行中有空行,已跳过")) + continue + else: + unparsed = get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr, + pause_second, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + + SaveGE, + SaveGEName, + + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + + EnableAudioLoad, + + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + "yield", + ) + unparsed = list(unparsed) + print(unparsed) + a = text.strip().replace(' ','_').replace('\n','_') + wav_path = os.path.join(SaveDir,f"tts_output_{a}_{str(uuid.uuid4())}.wav") + write(wav_path, unparsed[0][0], unparsed[0][1]) + out.append(wav_path) + count += 1 + if ReturnWay == "yield": + yield SaveDir + else: + return SaveDir + +def read_tts_batch_file(file_path): + ret = [] + with open(file_path, 'r', encoding='utf-8') as f: + lines = f.readlines() + for l in lines: + if l.strip() in [None, " ", ""]: + continue + else: + ret.append(l) + return ret + +def batch_tts( + ref_wav_path, + prompt_text, + prompt_language, + text_paths, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, + + SaveSvEmb=False, + SaveRefers=False, + SaveSvEmbName="sv_emb.voice", + SaveRefersName="refers.voice", + + SaveGE=False, + SaveGEName="ge.voice", + + InjectSvEmb=False, + InjectRefers=False, + InjectSvEmbName="sv_emb.voice", + InjectRefersName="refers.voice", + + EnableAudioLoad=True, + + SaveOutputAsUndecoded=False, + SaveOutputAsUndecodedName="output.voice", + AddRandomSaltToSaveOutputAsUndecodedName=False, + + ReturnWay = "yield", # "yield" or "return" +): + 
print(text_paths) + text_list = [] + for i in text_paths: + text_list.extend(read_tts_batch_file(i)) + out = batched_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text_list, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr, + pause_second, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + + SaveGE, + SaveGEName, + + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + + EnableAudioLoad, + + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + + "yield" + ) + out = list(out) + + if ReturnWay == "yield": + yield out + else: + return out def close_serv(): if running_on == "local": sys.exit(0) @@ -1540,6 +1747,25 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css show_label=True, visible=False if model_version != "v3" else True, ) + with gr.Row(): + gr.Markdown(html_center(i18n("批量语音合成参数"), "h3")) + with gr.Column(scale=13): + txt_paths = gr.File(label=i18n("批量语音合成文本文件,每行一个文本"), + file_types=[".txt"], + interactive=True, + file_count="multiple", + scale=13) + with gr.Column(scale=7): + out = gr.File(label=i18n("批量合成输出的语音文件"), + file_types=[".wav"], + file_count="directory",) + start_batch_btn = gr.Button(i18n("开始批量合成"), + variant="primary", + size="lg", + interactive=True, + scale=25) + + gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) with gr.Row(): with gr.Column(scale=13): @@ -1648,7 +1874,51 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css ], [output], + api_name="get_tts_wav", ) + + start_batch_btn.click( + batch_tts, + [ + inp_ref, + prompt_text, + prompt_language, + txt_paths, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_text_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr_Checkbox, + pause_second_slider, + + SaveSvEmb, + SaveRefers, + SaveSvEmbName, + SaveRefersName, + SaveGE, + SaveGEName, + InjectSvEmb, + InjectRefers, + InjectSvEmbName, + InjectRefersName, + EnableAudioLoad, + + SaveOutputAsUndecoded, + SaveOutputAsUndecodedName, + AddRandomSaltToSaveOutputAsUndecodedName, + + ], + [out], + + api_name="batch_tts", + ) + SoVITS_dropdown.change( change_sovits_weights, [SoVITS_dropdown, prompt_language, text_language], diff --git a/GPT_SoVITS/sv.py b/GPT_SoVITS/sv.py index 22e70369..7fab06aa 100644 --- a/GPT_SoVITS/sv.py +++ b/GPT_SoVITS/sv.py @@ -1,9 +1,10 @@ import sys import os import torch +from pathlib import Path -sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net") -sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" +sys.path.append(f"{str(Path(os.path.dirname(os.path.abspath(__file__))).parent)}/GPT_SoVITS/eres2net") +sv_path = f"{str(Path(os.path.dirname(os.path.abspath(__file__))).parent)}/GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" from ERes2NetV2 import ERes2NetV2 import kaldi as Kaldi From 24d7290c116032e0e615cdf52c877d5c0a13261d Mon Sep 17 00:00:00 2001 From: Kaning123 Date: Mon, 6 Apr 2026 12:59:31 +0800 Subject: [PATCH 14/15] feat: Added VoiceChange.py --- GPT_SoVITS/module/VoiceChange.py | 175 +++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 GPT_SoVITS/module/VoiceChange.py diff --git a/GPT_SoVITS/module/VoiceChange.py b/GPT_SoVITS/module/VoiceChange.py new file mode 100644 index 00000000..ca0a1dda --- /dev/null +++ b/GPT_SoVITS/module/VoiceChange.py @@ -0,0 +1,175 @@ +import torch +import 
torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import torchaudio
+import math
+from torchaudio.transforms import Resample
+import VoiceSave
+import uuid
+
+def get_train_set(voice_file_path):
+    if type(voice_file_path) == str:
+        voice_file_path = [voice_file_path]
+    ret = []
+    for i in voice_file_path:
+        tensors_ = VoiceSave.load_tensor(i,
+                                         f"get_{uuid.uuid4()}",
+                                         find_func=VoiceSave.__find_func__,
+                                         MySet=set())
+        ret.append(tensors_)
+    return ret
+
+class MelSpectrogram(nn.Module):
+    def __init__(self, hps):
+        super().__init__()
+        self.filter_length = hps.data.filter_length
+        self.hop_length = hps.data.hop_length
+        self.win_length = hps.data.win_length
+        self.sampling_rate = hps.data.sampling_rate
+        self.n_mel_channels = hps.data.n_mel_channels
+        self.mel_fmin = hps.data.mel_fmin if hasattr(hps.data, 'mel_fmin') else 0
+        self.mel_fmax = hps.data.mel_fmax if hasattr(hps.data, 'mel_fmax') else None
+
+        # Build the mel-spectrogram transform
+        self.mel_transform = torchaudio.transforms.MelSpectrogram(
+            sample_rate=self.sampling_rate,
+            n_fft=self.filter_length,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            f_min=self.mel_fmin,
+            f_max=self.mel_fmax,
+            n_mels=192, # self.n_mel_channels,
+            window_fn=torch.hann_window,
+            center=False,
+            power=1.0,
+        )
+
+    def forward(self, audio):
+        """
+        Input: audio [B, 1, T] or [1, T] (mono audio)
+        Output: mel_spec [B, n_mel_channels, T']
+        """
+        if len(audio.shape) == 2:
+            audio = audio.unsqueeze(0)  # [1, T] → [1, 1, T]
+
+        # Extract the mel spectrogram
+        mel_spec = self.mel_transform(audio.squeeze(1))  # [B, n_mel, T']
+
+        # Log scaling (standard practice for TTS)
+        mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
+
+        return mel_spec
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_seq_length=5000):
+        super(PositionalEncoding, self).__init__()
+        self.pe = torch.zeros(max_seq_length, d_model)  # initialize the positional-encoding matrix
+        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+        self.pe[:, 0::2] = torch.sin(position * div_term)  # even positions use sine
+        self.pe[:, 1::2] = torch.cos(position * div_term)  # odd positions use cosine
+        self.register_buffer('pe', self.pe.unsqueeze(0))  # register as a buffer
+
+    def forward(self, x):
+        # Add the positional encoding to the input
+        return x + self.pe[:, :x.size(1)]
+
+class Spliter(nn.Module):
+    '''output: z_p shape: torch.Size([1, 192, x]), y_mask shape: torch.Size([1, 1, x]), ge shape: torch.Size([1, 1024, 1])'''
+    def __init__(self,
+                 hps,
+                 ge,
+                 device):
+        super().__init__()
+        self.hps = hps
+
+        self.ge = ge
+        self.device = device
+        #TODO: feed mel_spec and ge into the Transformer model
+        self.mel_dim = 192
+        self.ge_dim = 1024
+        self.transformer_dim = 512
+        self.ge_proj = nn.Linear(self.ge_dim, self.transformer_dim).to(self.device)
+        self.mel_proj = nn.Linear(self.mel_dim, self.transformer_dim).to(self.device)
+        self.pos_encoder = PositionalEncoding(self.transformer_dim).to(self.device)
+        self.transformer = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                d_model=self.transformer_dim,
+                nhead=hps.model.nhead,
+                dim_feedforward=hps.model.ffn_dim,
+                batch_first=False,
+                dropout=0.1
+            ),
+            num_layers=hps.model.num_layers
+        ).to(self.device)
+
+        self.out_proj = nn.Linear(self.transformer_dim, self.mel_dim).to(self.device)
+
+    @torch.no_grad()
+    def mel_(self,audio_path, hps, device, dtype):
+        sr_target = int(hps.data.sampling_rate)
+        audio, sr_origin = torchaudio.load(audio_path)
+        if audio.shape[0] > 1:
+            audio = audio.mean(0, keepdim=True)
+        if sr_origin != sr_target:
+            resampler = Resample(sr_origin, sr_target).to(device)
+            audio = resampler(audio.to(device))
+        else:
+            audio = audio.to(device)
+        max_audio = audio.abs().max()
+        if max_audio > 1.0:
+            audio = audio / max_audio
+        mel_extractor = MelSpectrogram(hps).to(device)
+        mel_spec = mel_extractor(audio).to(dtype)
+        return mel_spec
+
+    def forward(self, audio_path, ge,device,dtype):
+        # Inputs: audio_path, ge
+        # Outputs: z_p, y_mask, ge
+        ge_ = ge
+        mel = self.mel_(audio_path, self.hps, device, dtype)
+
+        mel = mel.permute(2, 0, 1)
+        # Project the mel spectrogram to the Transformer dimension: [T, 1, 512]
+        mel_feat = self.mel_proj(mel)
+
+        # Process the global (emotion) feature GE: [1,1024,1] → [1,1024] → [1,1,512]
+        ge = ge.to(device, dtype=dtype)
+        ge_squeeze = ge.squeeze(-1)  # [1, 1024]
+        ge_feat = self.ge_proj(ge_squeeze).unsqueeze(0)  # [1, 1, 512]
+
+        # ===================== 3. Feature fusion and Transformer input =====================
+        # Prepend the GE feature to the mel-spectrogram sequence: [T+1, 1, 512]
+        self.transformer_input = torch.cat([ge_feat, mel_feat], dim=0)
+        # Add positional encoding
+        self.transformer_input = self.pos_encoder(self.transformer_input)
+
+        # ===================== 4. Transformer encoding =====================
+        transformer_out = self.transformer(self.transformer_input)  # [T+1, 1, 512]
+
+        # ===================== 5. Output feature reconstruction =====================
+        # Drop the leading GE position and keep the mel-spectrogram outputs: [T, 1, 512]
+        mel_out = transformer_out[1:, :, :]
+        # Project back to the original mel dimension: [T, 1, 192]
+        mel_out = self.out_proj(mel_out)
+        # Convert to the target layout: [1, 192, T] → z_p
+        z_p = mel_out.permute(1, 2, 0)
+
+        # ===================== 6. Generate the mask =====================
+        T = z_p.shape[-1]  # number of mel-spectrogram time steps
+        y_mask = torch.ones(1, 1, T, device=device, dtype=dtype)  # [1,1,T] all-ones mask
+
+        # ===================== 7. Output (strictly matching the format in the class docstring) =====================
+        return z_p, y_mask, ge_
+
+class SpliterDataset(torch.utils.data.Dataset):
+    def __init__(self, voice_file_paths):
+        self.voice_file_paths = voice_file_paths
+        self.datas = get_train_set(voice_file_paths)
+
+    def __len__(self):
+        return len(self.datas)
+
+    def __getitem__(self, idx):
+        return self.datas[idx]
\ No newline at end of file

From e6a67650fffbfa499e9b8bcd11814a5afa5ff040 Mon Sep 17 00:00:00 2001
From: Kaning123
Date: Mon, 6 Apr 2026 13:01:32 +0800
Subject: [PATCH 15/15] feat: add the ability to export intermediate tensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPT_SoVITS/module/models.py  | 125 +++++++++++++++++++++++++++++++++--
 GPT_SoVITS/module/modules.py |   2 +
 2 files changed, 122 insertions(+), 5 deletions(-)

diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index 9b47ef90..ad2a4e43 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -25,6 +25,53 @@ import contextlib
 import random
 
+import torchaudio
+from torchaudio.transforms import Resample
+import os
+from pathlib import Path
+def merge_dir_txt2(*TXT):
+    return Path(os.path.join(*TXT))
+
+def get_my_dir():
+    return os.path.dirname(os.path.abspath(__file__))
+
+def get_parent_dir(dir_path,depth=1):
+    parent_path = Path(dir_path)
+    for _ in range(depth):
+        parent_path = parent_path.parent
+    return parent_path
+
+POOL:set = set()
+def _get_unique_name(name,MySet:set=set()):
+    _id = 1
+    if name not in POOL and name not in MySet:
+        POOL.add(name)
+        return name
+    while name in POOL or name in MySet:
+        _id += 1
+        name = f'{name}_{_id}'
+    POOL.add(name)
+    return name
+
+def find_func(zf,il):
+    f = zf.get_file_path("voice.json")
+    info = il.load_info(f)
+    if info is None:
+        return None
+    list_names = info["access_list"]
+    global POOL
+    POOL.update(list_names)
+    ret = []
+    for name
in list_names: + try: + a = zf.get_file_path(name) + ret.append(a) + except FileNotFoundError: + continue + return ret + +ROOT_DIR = str(get_parent_dir(get_my_dir())) + class StochasticDurationPredictor(nn.Module): def __init__( self, @@ -153,7 +200,7 @@ class DurationPredictor(nn.Module): WINDOW = {} -class TextEncoder(nn.Module): +class TextEncoder(nn.Module): def __init__( self, out_channels, @@ -990,7 +1037,7 @@ class SynthesizerTrn(nn.Module): o = self.dec((z * y_mask)[:, :, :], g=ge) return o, y_mask, (z, z_p, m_p, logs_p) @torch.no_grad() - def ge_(self, refer, sv_emb, InjectGE=False, GE=None, LoadGE=True): + def ge_(self, refer, sv_emb=None, InjectGE=False, GE=None, LoadGE=True): def get_ge(refer, sv_emb): ge = None if refer is not None: @@ -1004,6 +1051,7 @@ class SynthesizerTrn(nn.Module): sv_emb = self.sv_emb(sv_emb) # B*20480->B*512 ge += sv_emb.unsqueeze(-1) ge = self.prelu(ge) + print(f"ge.shape : {ge.shape}") return ge if LoadGE: @@ -1021,11 +1069,17 @@ class SynthesizerTrn(nn.Module): GE = torch.stack(GE, 0).mean(0) ge = GE else: - raise ValueError + raise ValueError("No GE stream provided!") return ge + @torch.no_grad() def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, - InjectGE=False,GE=None,LoadGE=True): + InjectGE=False,GE=None,LoadGE=True, + InjectZP=False,ZP=None,LoadZP=True, + OverWrite_Mask=False,Mask=None, + SaveGE=False,SaveZP=False,SaveMask=False, + GE_Name=None, ZP_Name=None, Mask_Name=None, + VoiceSave=None): ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) @@ -1042,14 +1096,75 @@ class SynthesizerTrn(nn.Module): self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge, speed, ) - z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + if InjectZP: + if type(ZP) == list: + ZP = torch.stack(ZP, 0).mean(0) + else: + ZP = ZP + z_p = ZP + else: + if LoadZP: + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + else: + raise ValueError("No z_p stream provided!") + + if OverWrite_Mask: + if type(Mask) == list: + Mask = torch.stack(Mask, 0).mean(0) + if Mask is None: + raise ValueError("No mask stream provided!") + y_mask = Mask + print(f"z_p shape: {z_p.shape}, y_mask shape: {y_mask.shape}, ge shape: {ge.shape}") z = self.flow(z_p, y_mask, g=ge, reverse=True) o = self.dec((z * y_mask)[:, :, :], g=ge) return o + @torch.no_grad() + def decode2(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, + InjectGE=False,GE=None,LoadGE=True, + InjectZP=False,ZP=None,LoadZP=True, + OverWrite_Mask=False,Mask=None,): + ge = self.ge_(refer, sv_emb, InjectGE, GE, LoadGE) + + y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) + text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) + + quantized = self.quantizer.decode(codes) + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + x, m_p, logs_p, y_mask, _, _ = self.enc_p( + quantized, + y_lengths, + text, + text_lengths, + self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge, + speed, + ) + + if InjectZP: + if type(ZP) == list: + ZP = torch.stack(ZP, 0).mean(0) + else: + ZP = ZP + z_p = ZP + else: + if LoadZP: + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + else: + raise ValueError("No z_p stream provided!") + + if OverWrite_Mask: + if type(Mask) == list: + Mask = torch.stack(Mask, 0).mean(0) + if Mask is None: + raise 
ValueError("No mask stream provided!") + y_mask = Mask + print(f"z_p shape: {z_p.shape}, y_mask shape: {y_mask.shape}, ge shape: {ge.shape}") + return z_p, y_mask, ge + @torch.no_grad() def decode_streaming(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None, result_length:int=None, overlap_frames:torch.Tensor=None, padding_length:int=None): def get_ge(refer, sv_emb): diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index 6fa84a43..2ff7e8db 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -432,6 +432,8 @@ class ResidualCouplingLayer(nn.Module): self.post.bias.data.zero_() def forward(self, x, x_mask, g=None, reverse=False): + + print(f"x.shape: {x.shape}, x_mask.shape: {x_mask.shape}") x0, x1 = torch.split(x, [self.half_channels] * 2, 1) h = self.pre(x0) * x_mask h = self.enc(h, x_mask, g=g)