diff --git a/tools/my_utils.py b/tools/my_utils.py
index d26a372e..04f1a98a 100644
--- a/tools/my_utils.py
+++ b/tools/my_utils.py
@@ -109,7 +109,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
if os.path.exists(wav_path):
...
else:
- gr.Warning(wav_path+i18n("路径错误"))
+ gr.Warning(wav_path + i18n("路径错误"))
return
if is_train:
path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py
index 3f2fd03e..51a7dfad 100644
--- a/tools/subfix_webui.py
+++ b/tools/subfix_webui.py
@@ -1,5 +1,6 @@
import sys
from tools.i18n.i18n import I18nAuto, scan_language_list
+
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
i18n = I18nAuto(language=language)
import argparse
@@ -309,7 +310,9 @@ if __name__ == "__main__":
with gr.Blocks(analytics_enabled=False) as demo:
gr.Markdown(
- value=i18n("Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)")
+ value=i18n(
+ "Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)"
+ )
)
with gr.Row():
btn_change_index = gr.Button("Change Index")
diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py
index 6548ce24..98c75c1f 100644
--- a/tools/uvr5/mdxnet.py
+++ b/tools/uvr5/mdxnet.py
@@ -190,14 +190,14 @@ class Predictor:
opt_path_vocal = path_vocal[:-4] + ".%s" % format
opt_path_other = path_other[:-4] + ".%s" % format
if os.path.exists(path_vocal):
- os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_vocal, opt_path_vocal))
+ os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_vocal, opt_path_vocal))
if os.path.exists(opt_path_vocal):
try:
os.remove(path_vocal)
except:
pass
if os.path.exists(path_other):
- os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_other, opt_path_other))
+ os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_other, opt_path_other))
if os.path.exists(opt_path_other):
try:
os.remove(path_other)
diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py
index 8f24ca6a..45429cca 100644
--- a/tools/uvr5/vr.py
+++ b/tools/uvr5/vr.py
@@ -140,7 +140,7 @@ class AudioPre:
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
- cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+ cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
print(cmd)
os.system(cmd)
if os.path.exists(opt_format_path):
@@ -177,7 +177,7 @@ class AudioPre:
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
- cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+ cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
print(cmd)
os.system(cmd)
if os.path.exists(opt_format_path):
@@ -307,7 +307,7 @@ class AudioPreDeEcho:
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
- cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+ cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
print(cmd)
os.system(cmd)
if os.path.exists(opt_format_path):
@@ -340,7 +340,7 @@ class AudioPreDeEcho:
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
- cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+ cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
print(cmd)
os.system(cmd)
if os.path.exists(opt_format_path):
diff --git a/webui.py b/webui.py
index 0e34987a..9981cfcc 100644
--- a/webui.py
+++ b/webui.py
@@ -507,7 +507,7 @@ def open1Ba(
):
global p_train_SoVITS
if p_train_SoVITS == None:
- exp_name=exp_name.rstrip(" ")
+ exp_name = exp_name.rstrip(" ")
config_file = (
"GPT_SoVITS/configs/s2.json"
if version not in {"v2Pro", "v2ProPlus"}
@@ -604,7 +604,7 @@ def open1Bb(
):
global p_train_GPT
if p_train_GPT == None:
- exp_name=exp_name.rstrip(" ")
+ exp_name = exp_name.rstrip(" ")
with open(
"GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml"
) as f:
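The hunks above only change quoting style, but the pattern they touch — interpolating file paths into a shell string for `os.system` — is easier to see in isolation. Below is a minimal sketch of the same ffmpeg conversion using `subprocess.run` with an argument list, which avoids shell quoting altogether; this is an illustrative alternative, not what the patch changes, and `convert_to_format` is a hypothetical helper name.

```python
import os
import subprocess

def convert_to_format(path: str, fmt: str) -> str:
    """Re-encode an audio file with ffmpeg, mirroring the `-vn -q:a 2 -y` flags above."""
    opt_path = path[:-4] + ".%s" % fmt
    # An argument list needs no manual quoting, even for paths containing spaces.
    subprocess.run(["ffmpeg", "-i", path, "-vn", "-q:a", "2", "-y", opt_path], check=True)
    if os.path.exists(opt_path):
        try:
            os.remove(path)  # drop the source file once conversion succeeded, as the original code does
        except OSError:
            pass
    return opt_path
```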
From d46c069e52caa0d75b5d1119b7dd918322e7b590 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Wed, 18 Jun 2025 10:38:54 +0800
Subject: [PATCH 02/27] Remove Debug Code (#2471)
---
install.sh | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/install.sh b/install.sh
index e6112bd3..c3d20852 100644
--- a/install.sh
+++ b/install.sh
@@ -33,7 +33,7 @@ on_error() {
run_conda_quiet() {
local output
- output=$(conda install --yes --quiet "$@" 2>&1) || {
+ output=$(conda install --yes --quiet -c conda-forge "$@" 2>&1) || {
echo -e "${ERROR} Conda install failed:\n$output"
exit 1
}
@@ -60,8 +60,6 @@ if ! command -v conda &>/dev/null; then
exit 1
fi
-run_conda_quiet gcc
-
USE_CUDA=false
USE_ROCM=false
USE_CPU=false
From 4987df5a71696cb1333fc4c2108cc3491c4e64f5 Mon Sep 17 00:00:00 2001
From: Ella Zhang <144317607+EllaZhangCA@users.noreply.github.com>
Date: Thu, 19 Jun 2025 15:34:11 +0800
Subject: [PATCH 03/27] fixed syntax errors in api_v2.py (#2473)
---
api_v2.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/api_v2.py b/api_v2.py
index 87082074..5947df53 100644
--- a/api_v2.py
+++ b/api_v2.py
@@ -33,14 +33,14 @@ POST:
"text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details.
"batch_size": 1, # int. batch size for inference
"batch_threshold": 0.75, # float. threshold for batch splitting.
- "split_bucket: True, # bool. whether to split the batch into multiple buckets.
+ "split_bucket": True, # bool. whether to split the batch into multiple buckets.
"speed_factor":1.0, # float. control the speed of the synthesized audio.
"streaming_mode": False, # bool. whether to return a streaming response.
"seed": -1, # int. random seed for reproducibility.
"parallel_infer": True, # bool. whether to use parallel inference.
- "repetition_penalty": 1.35 # float. repetition penalty for T2S model.
+ "repetition_penalty": 1.35, # float. repetition penalty for T2S model.
"sample_steps": 32, # int. number of sampling steps for VITS model V3.
- "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
+ "super_sampling": False # bool. whether to use super-sampling for audio when using VITS model V3.
}
```
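Patch 03 only repairs the JSON syntax in this docstring, but the corrected body is easiest to sanity-check as an actual payload. A minimal client sketch follows; the host, port, `/tts` route, and the `text`/`ref_audio_path`-style fields are assumptions for illustration and are not shown in this excerpt.

```python
import json

import requests  # assumption: a plain HTTP client is available

# Corrected request body from the docstring above; the first few fields are
# assumed required parameters that the excerpt does not show.
payload = {
    "text": "Hello, world.",
    "text_lang": "en",
    "ref_audio_path": "ref.wav",
    "prompt_text": "reference transcript",
    "prompt_lang": "en",
    "text_split_method": "cut0",
    "batch_size": 1,
    "batch_threshold": 0.75,
    "split_bucket": True,
    "speed_factor": 1.0,
    "streaming_mode": False,
    "seed": -1,
    "parallel_infer": True,
    "repetition_penalty": 1.35,
    "sample_steps": 32,
    "super_sampling": False,
}
json.loads(json.dumps(payload))  # round-trips cleanly now that the syntax errors are fixed

# Hypothetical endpoint; adjust host, port and route to how api_v2.py is launched.
resp = requests.post("http://127.0.0.1:9880/tts", json=payload)
```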
From 37f5abfcb4a6553652235909db2e124b6f8ff3a5 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Wed, 25 Jun 2025 14:52:27 +0800
Subject: [PATCH 04/27] Fix Issues with libstdcxx and conda sysroot (#2482)
---
install.sh | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/install.sh b/install.sh
index c3d20852..ea8d2e2d 100644
--- a/install.sh
+++ b/install.sh
@@ -170,7 +170,13 @@ if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then
exit 1
fi
-# 安装构建工具
+case "$(uname -m)" in
+ x86_64|amd64) SYSROOT_PKG="sysroot_linux-64>=2.28" ;;
+ aarch64|arm64) SYSROOT_PKG="sysroot_linux-aarch64>=2.28" ;;
+ ppc64le) SYSROOT_PKG="sysroot_linux-ppc64le>=2.28" ;;
+ *) echo "Unsupported architecture: $(uname -m)"; exit 1 ;;
+esac
+
# Install build tools
echo -e "${INFO}Detected system: $(uname -s) $(uname -r) $(uname -m)"
if [ "$(uname)" != "Darwin" ]; then
@@ -178,10 +184,14 @@ if [ "$(uname)" != "Darwin" ]; then
if [ "$gcc_major_version" -lt 11 ]; then
echo -e "${INFO}Installing GCC & G++..."
run_conda_quiet gcc=11 gxx=11
+ run_conda_quiet "$SYSROOT_PKG"
echo -e "${SUCCESS}GCC & G++ Installed..."
else
echo -e "${INFO}Detected GCC Version: $gcc_major_version"
echo -e "${INFO}Skip Installing GCC & G++ From Conda-Forge"
+ echo -e "${INFO}Installing libstdcxx-ng From Conda-Forge"
+ run_conda_quiet "libstdcxx-ng>=$gcc_major_version"
+ echo -e "${SUCCESS}libstdcxx-ng=$gcc_major_version Installed..."
fi
else
if ! xcode-select -p &>/dev/null; then
From 4839e8214862808bea45b86c7c26ff643b0175ee Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Fri, 27 Jun 2025 01:04:18 +0800
Subject: [PATCH 05/27] Add Windows Install Powershell Scripts (#2487)
---
README.md | 8 ++
docs/cn/README.md | 6 ++
docs/ko/README.md | 6 ++
docs/tr/README.md | 6 ++
install.ps1 | 241 ++++++++++++++++++++++++++++++++++++++++++++++
install.sh | 28 +++---
6 files changed, 283 insertions(+), 12 deletions(-)
create mode 100644 install.ps1
diff --git a/README.md b/README.md
index 978bb3ca..e67288ea 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,14 @@ If you are a Windows user (tested with win>=10), you can [download the integrate
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
+Install the program by running the following commands:
+
+```pwsh
+conda create -n GPTSoVits python=3.10
+conda activate GPTSoVits
+pwsh -F install.ps1 --Device --Source [--DownloadUVR5]
+```
+
### Linux
```bash
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 70ce4285..77841b99 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -62,6 +62,12 @@
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
+```pwsh
+conda create -n GPTSoVits python=3.10
+conda activate GPTSoVits
+pwsh -F install.ps1 --Device --Source [--DownloadUVR5]
+```
+
### Linux
```bash
diff --git a/docs/ko/README.md b/docs/ko/README.md
index bfc70395..1028c00a 100644
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@@ -58,6 +58,12 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
+```pwsh
+conda create -n GPTSoVits python=3.10
+conda activate GPTSoVits
+pwsh -F install.ps1 --Device --Source [--DownloadUVR5]
+```
+
### Linux
```bash
diff --git a/docs/tr/README.md b/docs/tr/README.md
index 2ce02b60..dd5d79b3 100644
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@@ -58,6 +58,12 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
+```pwsh
+conda create -n GPTSoVits python=3.10
+conda activate GPTSoVits
+pwsh -F install.ps1 --Device --Source [--DownloadUVR5]
+```
+
### Linux
```bash
diff --git a/install.ps1 b/install.ps1
new file mode 100644
index 00000000..9c33ace8
--- /dev/null
+++ b/install.ps1
@@ -0,0 +1,241 @@
+Param (
+ [Parameter(Mandatory=$true)][ValidateSet("CU126", "CU128", "CPU")][string]$Device,
+ [Parameter(Mandatory=$true)][ValidateSet("HF", "HF-Mirror", "ModelScope")][string]$Source,
+ [switch]$DownloadUVR5
+)
+
+$global:ErrorActionPreference = 'Stop'
+
+trap {
+ Write-ErrorLog $_
+}
+
+function Write-ErrorLog {
+ param (
+ [System.Management.Automation.ErrorRecord]$ErrorRecord
+ )
+
+ Write-Host "`n[ERROR] Command failed:" -ForegroundColor Red
+ if (-not $ErrorRecord.Exception.Message){
+ } else {
+ Write-Host "Message:" -ForegroundColor Red
+ $ErrorRecord.Exception.Message -split "`n" | ForEach-Object {
+ Write-Host " $_"
+ }
+ }
+
+ Write-Host "Command:" -ForegroundColor Red -NoNewline
+ Write-Host " $($ErrorRecord.InvocationInfo.Line)".Replace("`r", "").Replace("`n", "")
+ Write-Host "Location:" -ForegroundColor Red -NoNewline
+ Write-Host " $($ErrorRecord.InvocationInfo.ScriptName):$($ErrorRecord.InvocationInfo.ScriptLineNumber)"
+ Write-Host "Call Stack:" -ForegroundColor DarkRed
+ $ErrorRecord.ScriptStackTrace -split "`n" | ForEach-Object {
+ Write-Host " $_" -ForegroundColor DarkRed
+ }
+
+ exit 1
+}
+
+function Write-Info($msg) {
+ Write-Host "[INFO]:" -ForegroundColor Green -NoNewline
+ Write-Host " $msg"
+}
+function Write-Success($msg) {
+ Write-Host "[SUCCESS]:" -ForegroundColor Blue -NoNewline
+ Write-Host " $msg"
+}
+
+
+function Invoke-Conda {
+ param (
+ [Parameter(ValueFromRemainingArguments = $true)]
+ [string[]]$Args
+ )
+
+ $output = & conda install -y -q -c conda-forge @Args 2>&1
+ $exitCode = $LASTEXITCODE
+
+ if ($exitCode -ne 0) {
+ Write-Host "Conda Install $Args Failed" -ForegroundColor Red
+ $errorMessages = @()
+ foreach ($item in $output) {
+ if ($item -is [System.Management.Automation.ErrorRecord]) {
+ $msg = $item.Exception.Message
+ Write-Host "$msg" -ForegroundColor Red
+ $errorMessages += $msg
+ }
+ else {
+ Write-Host $item
+ $errorMessages += $item
+ }
+ }
+ throw [System.Exception]::new(($errorMessages -join "`n"))
+ }
+}
+
+function Invoke-Pip {
+ param (
+ [Parameter(ValueFromRemainingArguments = $true)]
+ [string[]]$Args
+ )
+
+ $output = & pip install @Args 2>&1
+ $exitCode = $LASTEXITCODE
+
+ if ($exitCode -ne 0) {
+ $errorMessages = @()
+ Write-Host "Pip Install $Args Failed" -ForegroundColor Red
+ foreach ($item in $output) {
+ if ($item -is [System.Management.Automation.ErrorRecord]) {
+ $msg = $item.Exception.Message
+ Write-Host "$msg" -ForegroundColor Red
+ $errorMessages += $msg
+ }
+ else {
+ Write-Host $item
+ $errorMessages += $item
+ }
+ }
+ throw [System.Exception]::new(($errorMessages -join "`n"))
+ }
+}
+
+function Invoke-Download {
+ param (
+ [Parameter(Mandatory = $true)]
+ [string]$Uri,
+
+ [Parameter()]
+ [string]$OutFile
+ )
+
+ try {
+ $params = @{
+ Uri = $Uri
+ }
+
+ if ($OutFile) {
+ $params["OutFile"] = $OutFile
+ }
+
+ $null = Invoke-WebRequest @params -ErrorAction Stop
+
+ } catch {
+ Write-Host "Failed to download:" -ForegroundColor Red
+ Write-Host " $Uri"
+ throw
+ }
+}
+
+function Invoke-Unzip {
+ param($ZipPath, $DestPath)
+ Expand-Archive -Path $ZipPath -DestinationPath $DestPath -Force
+ Remove-Item $ZipPath -Force
+}
+
+chcp 65001
+Set-Location $PSScriptRoot
+
+Write-Info "Installing FFmpeg & CMake..."
+Invoke-Conda ffmpeg cmake
+Write-Success "FFmpeg & CMake Installed"
+
+$PretrainedURL = ""
+$G2PWURL = ""
+$UVR5URL = ""
+$NLTKURL = ""
+$OpenJTalkURL = ""
+
+switch ($Source) {
+ "HF" {
+ Write-Info "Download Model From HuggingFace"
+ $PretrainedURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
+ $G2PWURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
+ $UVR5URL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
+ $NLTKURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
+ $OpenJTalkURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
+ }
+ "HF-Mirror" {
+ Write-Info "Download Model From HuggingFace-Mirror"
+ $PretrainedURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
+ $G2PWURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
+ $UVR5URL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
+ $NLTKURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
+ $OpenJTalkURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
+ }
+ "ModelScope" {
+ Write-Info "Download Model From ModelScope"
+ $PretrainedURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip"
+ $G2PWURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip"
+ $UVR5URL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip"
+ $NLTKURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip"
+ $OpenJTalkURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
+ }
+}
+
+if (-not (Test-Path "GPT_SoVITS/pretrained_models/sv")) {
+ Write-Info "Downloading Pretrained Models..."
+ Invoke-Download -Uri $PretrainedURL -OutFile "pretrained_models.zip"
+ Invoke-Unzip "pretrained_models.zip" "GPT_SoVITS"
+ Write-Success "Pretrained Models Downloaded"
+} else {
+ Write-Info "Pretrained Model Exists"
+ Write-Info "Skip Downloading Pretrained Models"
+}
+
+
+if (-not (Test-Path "GPT_SoVITS/text/G2PWModel")) {
+ Write-Info "Downloading G2PWModel..."
+ Invoke-Download -Uri $G2PWURL -OutFile "G2PWModel.zip"
+ Invoke-Unzip "G2PWModel.zip" "GPT_SoVITS/text"
+ Write-Success "G2PWModel Downloaded"
+} else {
+ Write-Info "G2PWModel Exists"
+ Write-Info "Skip Downloading G2PWModel"
+}
+
+if ($DownloadUVR5) {
+ if (-not (Test-Path "tools/uvr5/uvr5_weights")) {
+ Write-Info "Downloading UVR5 Models..."
+ Invoke-Download -Uri $UVR5URL -OutFile "uvr5_weights.zip"
+ Invoke-Unzip "uvr5_weights.zip" "tools/uvr5"
+ Write-Success "UVR5 Models Downloaded"
+ } else {
+ Write-Info "UVR5 Models Exists"
+ Write-Info "Skip Downloading UVR5 Models"
+ }
+}
+
+switch ($Device) {
+ "CU128" {
+ Write-Info "Installing PyTorch For CUDA 12.8..."
+ Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cu128"
+ }
+ "CU126" {
+ Write-Info "Installing PyTorch For CUDA 12.6..."
+ Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cu126"
+ }
+ "CPU" {
+ Write-Info "Installing PyTorch For CPU..."
+ Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cpu"
+ }
+}
+Write-Success "PyTorch Installed"
+
+Write-Info "Installing Python Dependencies From requirements.txt..."
+Invoke-Pip -r extra-req.txt --no-deps
+Invoke-Pip -r requirements.txt
+Write-Success "Python Dependencies Installed"
+
+Write-Info "Downloading NLTK Data..."
+Invoke-Download -Uri $NLTKURL -OutFile "nltk_data.zip"
+Invoke-Unzip "nltk_data.zip" (python -c "import sys; print(sys.prefix)").Trim()
+
+Write-Info "Downloading Open JTalk Dict..."
+Invoke-Download -Uri $OpenJTalkURL -OutFile "open_jtalk_dic_utf_8-1.11.tar.gz"
+$target = (python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))").Trim()
+tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C $target
+Remove-Item "open_jtalk_dic_utf_8-1.11.tar.gz" -Force
+Write-Success "Open JTalk Dic Downloaded"
+
+Write-Success "Installation Completed"
diff --git a/install.sh b/install.sh
index ea8d2e2d..a2fa751e 100644
--- a/install.sh
+++ b/install.sh
@@ -48,11 +48,12 @@ run_pip_quiet() {
}
run_wget_quiet() {
- local output
- output=$(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$@" 2>&1) || {
- echo -e "${ERROR} Wget failed:\n$output"
+ if wget --tries=25 --wait=5 --read-timeout=40 -q --show-progress "$@" 2>&1; then
+ tput cuu1 && tput el
+ else
+ echo -e "${ERROR} Wget failed"
exit 1
- }
+ fi
}
if ! command -v conda &>/dev/null; then
@@ -171,10 +172,13 @@ if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then
fi
case "$(uname -m)" in
- x86_64|amd64) SYSROOT_PKG="sysroot_linux-64>=2.28" ;;
- aarch64|arm64) SYSROOT_PKG="sysroot_linux-aarch64>=2.28" ;;
- ppc64le) SYSROOT_PKG="sysroot_linux-ppc64le>=2.28" ;;
- *) echo "Unsupported architecture: $(uname -m)"; exit 1 ;;
+x86_64 | amd64) SYSROOT_PKG="sysroot_linux-64>=2.28" ;;
+aarch64 | arm64) SYSROOT_PKG="sysroot_linux-aarch64>=2.28" ;;
+ppc64le) SYSROOT_PKG="sysroot_linux-ppc64le>=2.28" ;;
+*)
+ echo "Unsupported architecture: $(uname -m)"
+ exit 1
+ ;;
esac
# Install build tools
@@ -248,10 +252,7 @@ elif [ "$USE_MODELSCOPE" = "true" ]; then
PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
fi
-if find -L "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then
- echo -e "${INFO}Pretrained Model Exists"
- echo -e "${INFO}Skip Downloading Pretrained Models"
-else
+if [ ! -d "GPT_SoVITS/pretrained_models/sv" ]; then
echo -e "${INFO}Downloading Pretrained Models..."
rm -rf pretrained_models.zip
run_wget_quiet "$PRETRINED_URL"
@@ -259,6 +260,9 @@ else
unzip -q -o pretrained_models.zip -d GPT_SoVITS
rm -rf pretrained_models.zip
echo -e "${SUCCESS}Pretrained Models Downloaded"
+else
+ echo -e "${INFO}Pretrained Model Exists"
+ echo -e "${INFO}Skip Downloading Pretrained Models"
fi
if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then
From 90ebefa78fd544da36eebe0b2003620879c921b0 Mon Sep 17 00:00:00 2001
From: KamioRinn <63162909+KamioRinn@users.noreply.github.com>
Date: Fri, 27 Jun 2025 10:41:52 +0800
Subject: [PATCH 06/27] make sure ort providers available (#2489)
---
GPT_SoVITS/text/g2pw/onnx_api.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py
index 52eed443..1d5e4231 100644
--- a/GPT_SoVITS/text/g2pw/onnx_api.py
+++ b/GPT_SoVITS/text/g2pw/onnx_api.py
@@ -93,13 +93,13 @@ class G2PWOnnxConverter:
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
- try:
+ if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
- except:
+ else:
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
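The replacement above swaps a bare `try/except` around session creation for an explicit provider-availability check. A standalone sketch of that pattern, assuming a local `g2pW.onnx` path (the real code resolves it under `uncompress_path`):

```python
import onnxruntime

sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

# Prefer CUDA only when onnxruntime actually ships the provider; otherwise fall
# back to CPU instead of relying on an exception from session creation.
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
    providers = ["CPUExecutionProvider"]

session = onnxruntime.InferenceSession(
    "g2pW.onnx",  # assumed local path for illustration
    sess_options=sess_options,
    providers=providers,
)
```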
From 6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23 Mon Sep 17 00:00:00 2001
From: KamioRinn <63162909+KamioRinn@users.noreply.github.com>
Date: Fri, 27 Jun 2025 11:58:41 +0800
Subject: [PATCH 07/27] Language segmentation and formatting optimization (#2488)
* better LangSegmenter
* add version num2str
* better version num2str
* sync fast infer
* sync api
* remove duplicate spaces
* remove unnecessary code
---------
Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
---
GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 126 +++++------
GPT_SoVITS/inference_webui.py | 121 +++++-----
.../text/LangSegmenter/langsegmenter.py | 209 +++++++++++-------
GPT_SoVITS/text/chinese.py | 14 --
GPT_SoVITS/text/chinese2.py | 14 --
GPT_SoVITS/text/zh_normalization/num.py | 24 +-
.../zh_normalization/text_normlization.py | 3 +
api.py | 117 +++++-----
8 files changed, 332 insertions(+), 296 deletions(-)
diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
index 9a478d43..283e91c3 100644
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -121,71 +121,67 @@ class TextPreprocessor:
def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
with self.bert_lock:
- if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
- # language = language.replace("all_","")
- formattext = text
- while " " in formattext:
- formattext = formattext.replace(" ", " ")
- if language == "all_zh":
- if re.search(r"[A-Za-z]", formattext):
- formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
- formattext = chinese.mix_text_normalize(formattext)
- return self.get_phones_and_bert(formattext, "zh", version)
+ text = re.sub(r' {2,}', ' ', text)
+ textlist = []
+ langlist = []
+ if language == "all_zh":
+ for tmp in LangSegmenter.getTexts(text,"zh"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_yue":
+ for tmp in LangSegmenter.getTexts(text,"zh"):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_ja":
+ for tmp in LangSegmenter.getTexts(text,"ja"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_ko":
+ for tmp in LangSegmenter.getTexts(text,"ko"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "en":
+ langlist.append("en")
+ textlist.append(text)
+ elif language == "auto":
+ for tmp in LangSegmenter.getTexts(text):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "auto_yue":
+ for tmp in LangSegmenter.getTexts(text):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ else:
+ for tmp in LangSegmenter.getTexts(text):
+ if langlist:
+ if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+ textlist[-1] += tmp["text"]
+ continue
+ if tmp["lang"] == "en":
+ langlist.append(tmp["lang"])
else:
- phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
- bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
- elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
- formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
- formattext = chinese.mix_text_normalize(formattext)
- return self.get_phones_and_bert(formattext, "yue", version)
- else:
- phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
- bert = torch.zeros(
- (1024, len(phones)),
- dtype=torch.float32,
- ).to(self.device)
- elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
- textlist = []
- langlist = []
- if language == "auto":
- for tmp in LangSegmenter.getTexts(text):
- langlist.append(tmp["lang"])
- textlist.append(tmp["text"])
- elif language == "auto_yue":
- for tmp in LangSegmenter.getTexts(text):
- if tmp["lang"] == "zh":
- tmp["lang"] = "yue"
- langlist.append(tmp["lang"])
- textlist.append(tmp["text"])
- else:
- for tmp in LangSegmenter.getTexts(text):
- if langlist:
- if (tmp["lang"] == "en" and langlist[-1] == "en") or (
- tmp["lang"] != "en" and langlist[-1] != "en"
- ):
- textlist[-1] += tmp["text"]
- continue
- if tmp["lang"] == "en":
- langlist.append(tmp["lang"])
- else:
- # 因无法区别中日韩文汉字,以用户输入为准
- langlist.append(language)
- textlist.append(tmp["text"])
- # print(textlist)
- # print(langlist)
- phones_list = []
- bert_list = []
- norm_text_list = []
- for i in range(len(textlist)):
- lang = langlist[i]
- phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
- bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
- phones_list.append(phones)
- norm_text_list.append(norm_text)
- bert_list.append(bert)
- bert = torch.cat(bert_list, dim=1)
- phones = sum(phones_list, [])
- norm_text = "".join(norm_text_list)
+ # 因无法区别中日韩文汉字,以用户输入为准
+ langlist.append(language)
+ textlist.append(tmp["text"])
+ # print(textlist)
+ # print(langlist)
+ phones_list = []
+ bert_list = []
+ norm_text_list = []
+ for i in range(len(textlist)):
+ lang = langlist[i]
+ phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
+ bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
+ phones_list.append(phones)
+ norm_text_list.append(norm_text)
+ bert_list.append(bert)
+ bert = torch.cat(bert_list, dim=1)
+ phones = sum(phones_list, [])
+ norm_text = "".join(norm_text_list)
if not final and len(phones) < 6:
return self.get_phones_and_bert("." + text, language, version, final=True)
@@ -240,4 +236,4 @@ class TextPreprocessor:
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
- return result
+ return result
\ No newline at end of file
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 5c7d0103..643e4054 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -586,68 +586,67 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False):
- if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
- formattext = text
- while " " in formattext:
- formattext = formattext.replace(" ", " ")
- if language == "all_zh":
- if re.search(r"[A-Za-z]", formattext):
- formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
- formattext = chinese.mix_text_normalize(formattext)
- return get_phones_and_bert(formattext, "zh", version)
+ text = re.sub(r' {2,}', ' ', text)
+ textlist = []
+ langlist = []
+ if language == "all_zh":
+ for tmp in LangSegmenter.getTexts(text,"zh"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_yue":
+ for tmp in LangSegmenter.getTexts(text,"zh"):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_ja":
+ for tmp in LangSegmenter.getTexts(text,"ja"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_ko":
+ for tmp in LangSegmenter.getTexts(text,"ko"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "en":
+ langlist.append("en")
+ textlist.append(text)
+ elif language == "auto":
+ for tmp in LangSegmenter.getTexts(text):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "auto_yue":
+ for tmp in LangSegmenter.getTexts(text):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ else:
+ for tmp in LangSegmenter.getTexts(text):
+ if langlist:
+ if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+ textlist[-1] += tmp["text"]
+ continue
+ if tmp["lang"] == "en":
+ langlist.append(tmp["lang"])
else:
- phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
- bert = get_bert_feature(norm_text, word2ph).to(device)
- elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
- formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
- formattext = chinese.mix_text_normalize(formattext)
- return get_phones_and_bert(formattext, "yue", version)
- else:
- phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
- bert = torch.zeros(
- (1024, len(phones)),
- dtype=torch.float16 if is_half == True else torch.float32,
- ).to(device)
- elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
- textlist = []
- langlist = []
- if language == "auto":
- for tmp in LangSegmenter.getTexts(text):
- langlist.append(tmp["lang"])
- textlist.append(tmp["text"])
- elif language == "auto_yue":
- for tmp in LangSegmenter.getTexts(text):
- if tmp["lang"] == "zh":
- tmp["lang"] = "yue"
- langlist.append(tmp["lang"])
- textlist.append(tmp["text"])
- else:
- for tmp in LangSegmenter.getTexts(text):
- if langlist:
- if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
- textlist[-1] += tmp["text"]
- continue
- if tmp["lang"] == "en":
- langlist.append(tmp["lang"])
- else:
- # 因无法区别中日韩文汉字,以用户输入为准
- langlist.append(language)
- textlist.append(tmp["text"])
- print(textlist)
- print(langlist)
- phones_list = []
- bert_list = []
- norm_text_list = []
- for i in range(len(textlist)):
- lang = langlist[i]
- phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
- bert = get_bert_inf(phones, word2ph, norm_text, lang)
- phones_list.append(phones)
- norm_text_list.append(norm_text)
- bert_list.append(bert)
- bert = torch.cat(bert_list, dim=1)
- phones = sum(phones_list, [])
- norm_text = "".join(norm_text_list)
+ # 因无法区别中日韩文汉字,以用户输入为准
+ langlist.append(language)
+ textlist.append(tmp["text"])
+ print(textlist)
+ print(langlist)
+ phones_list = []
+ bert_list = []
+ norm_text_list = []
+ for i in range(len(textlist)):
+ lang = langlist[i]
+ phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
+ bert = get_bert_inf(phones, word2ph, norm_text, lang)
+ phones_list.append(phones)
+ norm_text_list.append(norm_text)
+ bert_list.append(bert)
+ bert = torch.cat(bert_list, dim=1)
+ phones = sum(phones_list, [])
+ norm_text = "".join(norm_text_list)
if not final and len(phones) < 6:
return get_phones_and_bert("." + text, language, version, final=True)
diff --git a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
index 0187ea69..99b3a422 100644
--- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
+++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
@@ -3,44 +3,38 @@ import re
# jieba静音
import jieba
-
jieba.setLogLevel(logging.CRITICAL)
# 更改fast_langdetect大模型位置
from pathlib import Path
import fast_langdetect
-
-fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
- fast_langdetect.infer.LangDetectConfig(
- cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
- )
-)
+fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
from split_lang import LangSplitter
def full_en(text):
- pattern = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
+ pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
return bool(re.match(pattern, text))
def full_cjk(text):
# 来自wiki
cjk_ranges = [
- (0x4E00, 0x9FFF), # CJK Unified Ideographs
- (0x3400, 0x4DB5), # CJK Extension A
- (0x20000, 0x2A6DD), # CJK Extension B
- (0x2A700, 0x2B73F), # CJK Extension C
- (0x2B740, 0x2B81F), # CJK Extension D
- (0x2B820, 0x2CEAF), # CJK Extension E
- (0x2CEB0, 0x2EBEF), # CJK Extension F
- (0x30000, 0x3134A), # CJK Extension G
- (0x31350, 0x323AF), # CJK Extension H
- (0x2EBF0, 0x2EE5D), # CJK Extension H
+ (0x4E00, 0x9FFF), # CJK Unified Ideographs
+ (0x3400, 0x4DB5), # CJK Extension A
+ (0x20000, 0x2A6DD), # CJK Extension B
+ (0x2A700, 0x2B73F), # CJK Extension C
+ (0x2B740, 0x2B81F), # CJK Extension D
+ (0x2B820, 0x2CEAF), # CJK Extension E
+ (0x2CEB0, 0x2EBEF), # CJK Extension F
+ (0x30000, 0x3134A), # CJK Extension G
+ (0x31350, 0x323AF), # CJK Extension H
+ (0x2EBF0, 0x2EE5D), # CJK Extension H
]
- pattern = r"[0-9、-〜。!?.!?… /]+$"
+ pattern = r'[0-9、-〜。!?.!?… /]+$'
cjk_text = ""
for char in text:
@@ -51,7 +45,7 @@ def full_cjk(text):
return cjk_text
-def split_jako(tag_lang, item):
+def split_jako(tag_lang,item):
if tag_lang == "ja":
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
else:
@@ -59,118 +53,165 @@ def split_jako(tag_lang, item):
lang_list: list[dict] = []
tag = 0
- for match in re.finditer(pattern, item["text"]):
+ for match in re.finditer(pattern, item['text']):
if match.start() > tag:
- lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
+ lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
tag = match.end()
- lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
+ lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
- if tag < len(item["text"]):
- lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
+ if tag < len(item['text']):
+ lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
return lang_list
def merge_lang(lang_list, item):
- if lang_list and item["lang"] == lang_list[-1]["lang"]:
- lang_list[-1]["text"] += item["text"]
+ if lang_list and item['lang'] == lang_list[-1]['lang']:
+ lang_list[-1]['text'] += item['text']
else:
lang_list.append(item)
return lang_list
-class LangSegmenter:
+class LangSegmenter():
# 默认过滤器, 基于gsv目前四种语言
DEFAULT_LANG_MAP = {
"zh": "zh",
"yue": "zh", # 粤语
"wuu": "zh", # 吴语
"zh-cn": "zh",
- "zh-tw": "x", # 繁体设置为x
+ "zh-tw": "x", # 繁体设置为x
"ko": "ko",
"ja": "ja",
"en": "en",
}
- def getTexts(text):
+ def getTexts(text,default_lang = ""):
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
+ lang_splitter.merge_across_digit = False
substr = lang_splitter.split_by_lang(text=text)
lang_list: list[dict] = []
- for _, item in enumerate(substr):
- dict_item = {"lang": item.lang, "text": item.text}
+ have_num = False
- # 处理短英文被识别为其他语言的问题
- if full_en(dict_item["text"]):
- dict_item["lang"] = "en"
- lang_list = merge_lang(lang_list, dict_item)
+ for _, item in enumerate(substr):
+ dict_item = {'lang':item.lang,'text':item.text}
+
+ if dict_item['lang'] == 'digit':
+ if default_lang != "":
+ dict_item['lang'] = default_lang
+ else:
+ have_num = True
+ lang_list = merge_lang(lang_list,dict_item)
continue
- # 处理非日语夹日文的问题(不包含CJK)
- ja_list: list[dict] = []
- if dict_item["lang"] != "ja":
- ja_list = split_jako("ja", dict_item)
+ # 处理短英文被识别为其他语言的问题
+ if full_en(dict_item['text']):
+ dict_item['lang'] = 'en'
+ lang_list = merge_lang(lang_list,dict_item)
+ continue
- if not ja_list:
- ja_list.append(dict_item)
+ if default_lang != "":
+ dict_item['lang'] = default_lang
+ lang_list = merge_lang(lang_list,dict_item)
+ continue
+ else:
+ # 处理非日语夹日文的问题(不包含CJK)
+ ja_list: list[dict] = []
+ if dict_item['lang'] != 'ja':
+ ja_list = split_jako('ja',dict_item)
- # 处理非韩语夹韩语的问题(不包含CJK)
- ko_list: list[dict] = []
- temp_list: list[dict] = []
- for _, ko_item in enumerate(ja_list):
- if ko_item["lang"] != "ko":
- ko_list = split_jako("ko", ko_item)
+ if not ja_list:
+ ja_list.append(dict_item)
- if ko_list:
- temp_list.extend(ko_list)
- else:
- temp_list.append(ko_item)
+ # 处理非韩语夹韩语的问题(不包含CJK)
+ ko_list: list[dict] = []
+ temp_list: list[dict] = []
+ for _, ko_item in enumerate(ja_list):
+ if ko_item["lang"] != 'ko':
+ ko_list = split_jako('ko',ko_item)
- # 未存在非日韩文夹日韩文
- if len(temp_list) == 1:
- # 未知语言检查是否为CJK
- if dict_item["lang"] == "x":
- cjk_text = full_cjk(dict_item["text"])
- if cjk_text:
- dict_item = {"lang": "zh", "text": cjk_text}
- lang_list = merge_lang(lang_list, dict_item)
+ if ko_list:
+ temp_list.extend(ko_list)
else:
- lang_list = merge_lang(lang_list, dict_item)
- continue
- else:
- lang_list = merge_lang(lang_list, dict_item)
- continue
+ temp_list.append(ko_item)
- # 存在非日韩文夹日韩文
- for _, temp_item in enumerate(temp_list):
- # 未知语言检查是否为CJK
- if temp_item["lang"] == "x":
- cjk_text = full_cjk(dict_item["text"])
- if cjk_text:
- dict_item = {"lang": "zh", "text": cjk_text}
- lang_list = merge_lang(lang_list, dict_item)
+ # 未存在非日韩文夹日韩文
+ if len(temp_list) == 1:
+ # 未知语言检查是否为CJK
+ if dict_item['lang'] == 'x':
+ cjk_text = full_cjk(dict_item['text'])
+ if cjk_text:
+ dict_item = {'lang':'zh','text':cjk_text}
+ lang_list = merge_lang(lang_list,dict_item)
+ else:
+ lang_list = merge_lang(lang_list,dict_item)
+ continue
else:
- lang_list = merge_lang(lang_list, dict_item)
- else:
- lang_list = merge_lang(lang_list, temp_item)
+ lang_list = merge_lang(lang_list,dict_item)
+ continue
+ # 存在非日韩文夹日韩文
+ for _, temp_item in enumerate(temp_list):
+ # 未知语言检查是否为CJK
+ if temp_item['lang'] == 'x':
+ cjk_text = full_cjk(temp_item['text'])
+ if cjk_text:
+ lang_list = merge_lang(lang_list,{'lang':'zh','text':cjk_text})
+ else:
+ lang_list = merge_lang(lang_list,temp_item)
+ else:
+ lang_list = merge_lang(lang_list,temp_item)
+
+ # 有数字
+ if have_num:
+ temp_list = lang_list
+ lang_list = []
+ for i, temp_item in enumerate(temp_list):
+ if temp_item['lang'] == 'digit':
+ if default_lang:
+ temp_item['lang'] = default_lang
+ elif lang_list and i == len(temp_list) - 1:
+ temp_item['lang'] = lang_list[-1]['lang']
+ elif not lang_list and i < len(temp_list) - 1:
+ temp_item['lang'] = temp_list[1]['lang']
+ elif lang_list and i < len(temp_list) - 1:
+ if lang_list[-1]['lang'] == temp_list[i + 1]['lang']:
+ temp_item['lang'] = lang_list[-1]['lang']
+ elif lang_list[-1]['text'][-1] in [",",".","!","?",",","。","!","?"]:
+ temp_item['lang'] = temp_list[i + 1]['lang']
+ elif temp_list[i + 1]['text'][0] in [",",".","!","?",",","。","!","?"]:
+ temp_item['lang'] = lang_list[-1]['lang']
+ elif temp_item['text'][-1] in ["。","."]:
+ temp_item['lang'] = lang_list[-1]['lang']
+ elif len(lang_list[-1]['text']) >= len(temp_list[i + 1]['text']):
+ temp_item['lang'] = lang_list[-1]['lang']
+ else:
+ temp_item['lang'] = temp_list[i + 1]['lang']
+ else:
+ temp_item['lang'] = 'zh'
+
+ lang_list = merge_lang(lang_list,temp_item)
+
+
+ # 筛X
temp_list = lang_list
lang_list = []
for _, temp_item in enumerate(temp_list):
- if temp_item["lang"] == "x":
+ if temp_item['lang'] == 'x':
if lang_list:
- temp_item["lang"] = lang_list[-1]["lang"]
+ temp_item['lang'] = lang_list[-1]['lang']
elif len(temp_list) > 1:
- temp_item["lang"] = temp_list[1]["lang"]
+ temp_item['lang'] = temp_list[1]['lang']
else:
- temp_item["lang"] = "zh"
+ temp_item['lang'] = 'zh'
- lang_list = merge_lang(lang_list, temp_item)
+ lang_list = merge_lang(lang_list,temp_item)
return lang_list
-
+
if __name__ == "__main__":
text = "MyGO?,你也喜欢まいご吗?"
@@ -178,3 +219,7 @@ if __name__ == "__main__":
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
print(LangSegmenter.getTexts(text))
+
+ text = "当时ThinkPad T60刚刚发布,一同推出的还有一款名为Advanced Dock的扩展坞配件。这款扩展坞通过连接T60底部的插槽,扩展出包括PCIe在内的一大堆接口,并且自带电源,让T60可以安装桌面显卡来提升性能。"
+ print(LangSegmenter.getTexts(text,"zh"))
+ print(LangSegmenter.getTexts(text))
\ No newline at end of file
diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py
index ce44215f..944c9cb7 100644
--- a/GPT_SoVITS/text/chinese.py
+++ b/GPT_SoVITS/text/chinese.py
@@ -181,20 +181,6 @@ def text_normalize(text):
return dest_text
-# 不排除英文的文本格式化
-def mix_text_normalize(text):
- # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
- tx = TextNormalizer()
- sentences = tx.normalize(text)
- dest_text = ""
- for sentence in sentences:
- dest_text += replace_punctuation_with_en(sentence)
-
- # 避免重复标点引起的参考泄露
- dest_text = replace_consecutive_punctuation(dest_text)
- return dest_text
-
-
if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?"
diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py
index 612aa3a5..dcce0d96 100644
--- a/GPT_SoVITS/text/chinese2.py
+++ b/GPT_SoVITS/text/chinese2.py
@@ -326,20 +326,6 @@ def text_normalize(text):
return dest_text
-# 不排除英文的文本格式化
-def mix_text_normalize(text):
- # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
- tx = TextNormalizer()
- sentences = tx.normalize(text)
- dest_text = ""
- for sentence in sentences:
- dest_text += replace_punctuation_with_en(sentence)
-
- # 避免重复标点引起的参考泄露
- dest_text = replace_consecutive_punctuation(dest_text)
- return dest_text
-
-
if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?"
diff --git a/GPT_SoVITS/text/zh_normalization/num.py b/GPT_SoVITS/text/zh_normalization/num.py
index c3af4d6a..14d602b0 100644
--- a/GPT_SoVITS/text/zh_normalization/num.py
+++ b/GPT_SoVITS/text/zh_normalization/num.py
@@ -256,6 +256,24 @@ def replace_to_range(match) -> str:
return result
+RE_VERSION_NUM = re.compile(r"((\d+)(\.\d+)(\.\d+)?(\.\d+)+)")
+def replace_vrsion_num(match) -> str:
+ """
+ Args:
+ match (re.Match)
+ Returns:
+ str
+ """
+ result = ""
+ for c in match.group(1):
+ if c == ".":
+ result += "点"
+ else:
+ result += num2str(c)
+ return result
+
+
+
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
stripped = value_string.lstrip("0")
if len(stripped) == 0:
@@ -308,7 +326,11 @@ def num2str(value_string: str) -> str:
result = verbalize_cardinal(integer)
- decimal = decimal.rstrip("0")
+ if decimal.endswith("0"):
+ decimal = decimal.rstrip("0") + "0"
+ else:
+ decimal = decimal.rstrip("0")
+
if decimal:
# '.22' is verbalized as '零点二二'
# '3.20' is verbalized as '三点二
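The new `RE_VERSION_NUM` rule reads version-like strings digit by digit instead of treating them as decimals. Below is a standalone sketch of the idea with a simplified digit table; the module itself reuses its `num2str` helper, and its function name is spelled `replace_vrsion_num` as added above.

```python
import re

RE_VERSION_NUM = re.compile(r"((\d+)(\.\d+)(\.\d+)?(\.\d+)+)")  # matches e.g. 1.2.3 or 1.2.3.4

# Simplified digit table for illustration only.
DIGITS = dict(zip("0123456789", "零一二三四五六七八九"))

def replace_version_num(match: re.Match) -> str:
    # Verbalize character by character: dots become 点, digits are read out one at a time.
    return "".join("点" if c == "." else DIGITS[c] for c in match.group(1))

print(RE_VERSION_NUM.sub(replace_version_num, "升级到 2.0.1.3 版本"))  # 升级到 二点零点一点三 版本
```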
diff --git a/GPT_SoVITS/text/zh_normalization/text_normlization.py b/GPT_SoVITS/text/zh_normalization/text_normlization.py
index 099b01bd..1a6cdd0f 100644
--- a/GPT_SoVITS/text/zh_normalization/text_normlization.py
+++ b/GPT_SoVITS/text/zh_normalization/text_normlization.py
@@ -25,6 +25,7 @@ from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
+from .num import RE_VERSION_NUM
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
@@ -36,6 +37,7 @@ from .num import RE_RANGE
from .num import RE_TO_RANGE
from .num import RE_ASMD
from .num import RE_POWER
+from .num import replace_vrsion_num
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
@@ -158,6 +160,7 @@ class TextNormalizer:
sentence = RE_RANGE.sub(replace_range, sentence)
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
+ sentence = RE_VERSION_NUM.sub(replace_vrsion_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
diff --git a/api.py b/api.py
index dc2e8826..cc0896a2 100644
--- a/api.py
+++ b/api.py
@@ -543,66 +543,65 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False):
- if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
- formattext = text
- while " " in formattext:
- formattext = formattext.replace(" ", " ")
- if language == "all_zh":
- if re.search(r"[A-Za-z]", formattext):
- formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
- formattext = chinese.mix_text_normalize(formattext)
- return get_phones_and_bert(formattext, "zh", version)
+ text = re.sub(r' {2,}', ' ', text)
+ textlist = []
+ langlist = []
+ if language == "all_zh":
+ for tmp in LangSegmenter.getTexts(text,"zh"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_yue":
+ for tmp in LangSegmenter.getTexts(text,"zh"):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_ja":
+ for tmp in LangSegmenter.getTexts(text,"ja"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "all_ko":
+ for tmp in LangSegmenter.getTexts(text,"ko"):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "en":
+ langlist.append("en")
+ textlist.append(text)
+ elif language == "auto":
+ for tmp in LangSegmenter.getTexts(text):
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ elif language == "auto_yue":
+ for tmp in LangSegmenter.getTexts(text):
+ if tmp["lang"] == "zh":
+ tmp["lang"] = "yue"
+ langlist.append(tmp["lang"])
+ textlist.append(tmp["text"])
+ else:
+ for tmp in LangSegmenter.getTexts(text):
+ if langlist:
+ if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+ textlist[-1] += tmp["text"]
+ continue
+ if tmp["lang"] == "en":
+ langlist.append(tmp["lang"])
else:
- phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
- bert = get_bert_feature(norm_text, word2ph).to(device)
- elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
- formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
- formattext = chinese.mix_text_normalize(formattext)
- return get_phones_and_bert(formattext, "yue", version)
- else:
- phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
- bert = torch.zeros(
- (1024, len(phones)),
- dtype=torch.float16 if is_half == True else torch.float32,
- ).to(device)
- elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
- textlist = []
- langlist = []
- if language == "auto":
- for tmp in LangSegmenter.getTexts(text):
- langlist.append(tmp["lang"])
- textlist.append(tmp["text"])
- elif language == "auto_yue":
- for tmp in LangSegmenter.getTexts(text):
- if tmp["lang"] == "zh":
- tmp["lang"] = "yue"
- langlist.append(tmp["lang"])
- textlist.append(tmp["text"])
- else:
- for tmp in LangSegmenter.getTexts(text):
- if langlist:
- if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
- textlist[-1] += tmp["text"]
- continue
- if tmp["lang"] == "en":
- langlist.append(tmp["lang"])
- else:
- # 因无法区别中日韩文汉字,以用户输入为准
- langlist.append(language)
- textlist.append(tmp["text"])
- phones_list = []
- bert_list = []
- norm_text_list = []
- for i in range(len(textlist)):
- lang = langlist[i]
- phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
- bert = get_bert_inf(phones, word2ph, norm_text, lang)
- phones_list.append(phones)
- norm_text_list.append(norm_text)
- bert_list.append(bert)
- bert = torch.cat(bert_list, dim=1)
- phones = sum(phones_list, [])
- norm_text = "".join(norm_text_list)
+ # 因无法区别中日韩文汉字,以用户输入为准
+ langlist.append(language)
+ textlist.append(tmp["text"])
+ phones_list = []
+ bert_list = []
+ norm_text_list = []
+ for i in range(len(textlist)):
+ lang = langlist[i]
+ phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
+ bert = get_bert_inf(phones, word2ph, norm_text, lang)
+ phones_list.append(phones)
+ norm_text_list.append(norm_text)
+ bert_list.append(bert)
+ bert = torch.cat(bert_list, dim=1)
+ phones = sum(phones_list, [])
+ norm_text = "".join(norm_text_list)
if not final and len(phones) < 6:
return get_phones_and_bert("." + text, language, version, final=True)
From 8c579d46dda62ae5bdf04f96796c1acc79d44b59 Mon Sep 17 00:00:00 2001
From: Yixiao Chen <113320753+Bobchenyx@users.noreply.github.com>
Date: Wed, 2 Jul 2025 10:48:28 -0400
Subject: [PATCH 08/27] Update export_torch_script.py (#2494)
Avoid dtype inconsistency when exporting
---
GPT_SoVITS/export_torch_script.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py
index a3a48277..e4406f28 100644
--- a/GPT_SoVITS/export_torch_script.py
+++ b/GPT_SoVITS/export_torch_script.py
@@ -474,6 +474,10 @@ class T2SModel(nn.Module):
bert = bert.unsqueeze(0)
x = self.ar_text_embedding(all_phoneme_ids)
+
+ # avoid dtype inconsistency when exporting
+ bert = bert.to(dtype=self.bert_proj.weight.dtype)
+
x = x + self.bert_proj(bert.transpose(1, 2))
x: torch.Tensor = self.ar_text_position(x)
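The added cast aligns the BERT features with the projection layer's weight dtype before the matmul, which is what trips up TorchScript export when the two disagree. A minimal reproduction under assumed dimensions (1024-dim features, 512-dim projection; the real sizes come from the model config):

```python
import torch
import torch.nn as nn

bert_proj = nn.Linear(1024, 512)                       # exported module weights (fp32 here)
bert = torch.randn(1, 1024, 13, dtype=torch.float16)   # features produced under half precision

# Mixing fp16 inputs with fp32 weights raises a dtype mismatch; casting to the
# weight dtype, as the patch does, keeps the traced graph consistent.
bert = bert.to(dtype=bert_proj.weight.dtype)
x = bert_proj(bert.transpose(1, 2))                    # shape (1, 13, 512), dtype fp32
print(x.dtype)
```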
From 0899b7e432561c6db3b33b2556fbb0bde25f2725 Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 17:59:49 +0800
Subject: [PATCH 09/27] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index e67288ea..6bcb2844 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
From 3a75f5023feea209b256c2943cfb42418313ba28 Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:05:03 +0800
Subject: [PATCH 10/27] Update README.md
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 6bcb2844..6cb19092 100644
--- a/README.md
+++ b/README.md
@@ -10,13 +10,13 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://lj1995-gpt-sovits-proplus.hf.space/)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[](https://rentry.co/GPT-SoVITS-guide#/)
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
From 97e37c74d8dc8704bc8192caf61cea67156af1dc Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:06:04 +0800
Subject: [PATCH 11/27] Update README.md
---
docs/cn/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 77841b99..39e37a42 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -8,13 +8,13 @@
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[](https://rentry.co/GPT-SoVITS-guide#/)
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
From e63e0901fd62a8ce471507828fa281e895f32b7a Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:12:24 +0800
Subject: [PATCH 12/27] Update assets.py
---
tools/assets.py | 89 +++++++++++++++++++++++++++++++++++++------------
1 file changed, 67 insertions(+), 22 deletions(-)
diff --git a/tools/assets.py b/tools/assets.py
index 28d0f6b4..533a020c 100644
--- a/tools/assets.py
+++ b/tools/assets.py
@@ -1,38 +1,81 @@
js = """
-function deleteTheme() {
+function createGradioAnimation() {
-const params = new URLSearchParams(window.location.search);
-if (params.has('__theme')) {
- params.delete('__theme');
- const newUrl = `${window.location.pathname}?${params.toString()}`;
- window.location.replace(newUrl);
-}
+ const params = new URLSearchParams(window.location.search);
+ if (params.get('__theme') !== 'light') {
+ params.set('__theme', 'light'); // 仅当 __theme 不是 'light' 时设置为 'light'
+ window.location.search = params.toString(); // 更新 URL,触发页面刷新
+ }
+
+ var container = document.createElement('div');
+ container.id = 'gradio-animation';
+ container.style.fontSize = '2em';
+ container.style.fontWeight = '500';
+ container.style.textAlign = 'center';
+ container.style.marginBottom = '20px';
+ container.style.fontFamily = '-apple-system, sans-serif, Arial, Calibri';
+ var text = 'Welcome to GPT-SoVITS !';
+ for (var i = 0; i < text.length; i++) {
+ (function(i){
+ setTimeout(function(){
+ var letter = document.createElement('span');
+ letter.style.opacity = '0';
+ letter.style.transition = 'opacity 0.5s';
+ letter.innerText = text[i];
+
+ container.appendChild(letter);
+
+ setTimeout(function() {
+ letter.style.opacity = '1';
+ }, 50);
+ }, i * 250);
+ })(i);
+ }
+ return 'Animation created';
}
"""
+
css = """
/* CSSStyleRule */
+
.markdown {
+ background-color: lightblue;
padding: 6px 10px;
}
-@media (prefers-color-scheme: light) {
- .markdown {
- background-color: lightblue;
- color: #000;
- }
-}
-
-@media (prefers-color-scheme: dark) {
- .markdown {
- background-color: #4b4b4b;
- color: rgb(244, 244, 245);
- }
+.checkbox_info {
+ color: var(--block-title-text-color) !important;
+ font-size: var(--block-title-text-size) !important;
+ font-weight: var(--block-title-text-weight) !important;
+ height: 22px;
+ margin-bottom: 8px !important;
}
::selection {
- background: #ffc078 !important;
+ background: #ffc078; !important;
+}
+
+#checkbox_train_dpo input[type="checkbox"]{
+ margin-top: 6px;
+}
+
+#checkbox_train_dpo span {
+ margin-top: 6px;
+}
+
+#checkbox_align_train {
+ padding-top: 18px;
+ padding-bottom: 18px;
+}
+
+#checkbox_align_infer input[type="checkbox"] {
+ margin-top: 10px;
+}
+
+#checkbox_align_infer span {
+ margin-top: 10px;
}
footer {
@@ -48,14 +91,16 @@ footer * {
}
"""
-
top_html = """
{}
-
+
+
+
+
From 4e3c69043cf0772f02125a63e70af835ca702904 Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:16:24 +0800
Subject: [PATCH 13/27] Update inference_webui.py
---
GPT_SoVITS/inference_webui.py | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 643e4054..a361ed58 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -6,7 +6,20 @@
全部按英文识别
全部按日文识别
"""
+import psutil
+import os
+def set_high_priority():
+ """把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
+ if os.name != "nt":
+ return # 仅 Windows 有效
+ p = psutil.Process(os.getpid())
+ try:
+ p.nice(psutil.HIGH_PRIORITY_CLASS)
+ print("已将进程优先级设为 High")
+ except psutil.AccessDenied:
+ print("权限不足,无法修改优先级(请用管理员运行)")
+set_high_priority()
import json
import logging
import os
From 426e1a2bb43614af2479b877c37acfb0591e952f Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:16:45 +0800
Subject: [PATCH 14/27] Raise the inference process priority
---
GPT_SoVITS/inference_webui_fast.py | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py
index 470b7bbd..2c159d81 100644
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@@ -6,7 +6,20 @@
全部按英文识别
全部按日文识别
"""
+import psutil
+import os
+def set_high_priority():
+ """把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
+ if os.name != "nt":
+ return # 仅 Windows 有效
+ p = psutil.Process(os.getpid())
+ try:
+ p.nice(psutil.HIGH_PRIORITY_CLASS)
+ print("已将进程优先级设为 High")
+ except psutil.AccessDenied:
+ print("权限不足,无法修改优先级(请用管理员运行)")
+set_high_priority()
import json
import logging
import os
From fec515dcce2053164e6793a1c2ffe8fecc32d6ad Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:33:18 +0800
Subject: [PATCH 15/27] Update Changelog_CN.md
---
docs/cn/Changelog_CN.md | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md
index 6c6dcdbe..f68e23a9 100644
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@@ -594,3 +594,33 @@
- 内容: 修复实验名结尾出现空格在win中路径不正确的问题
- 类型: 修复
- 提交: RVC-Boss
+- 2025.06.10 [Commit#746cb536](https://github.com/RVC-Boss/GPT-SoVITS/commit/746cb536c68b1fe6ce3ca7e882235375b8a8dd89)
+ - 内容: 语种分割优化
+ - 类型: 优化
+ - 提交: KamioRinn
+- 2025.06.11 [Commit#dd2b9253](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2b9253aabb09db32db7a3344570ed9df043351)
+ - 内容: 修复并行推理对v2pro支持bug
+ - 类型: 修复
+ - 提交: YYuX-1145
+- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
+ - 内容: v2pro对ge提取时会出现数值溢出的问题修复
+ - 类型: 修复
+ - 提交: RVC-Boss
+- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9)[Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
+ - 内容: install.sh逻辑优化
+ - 类型: 优化
+ - 提交: XXXXRT666
+- 2025.06.27 [Commit#90ebefa7](https://github.com/RVC-Boss/GPT-SoVITS/commit/90ebefa78fd544da36eebe0b2003620879c921b0)
+ - 内容: onnxruntime加载逻辑优化(对gpu/cpu的判断)
+ - 类型: 优化
+ - 提交: KamioRinn
+- 2025.06.27 [Commit#6df61f58](https://github.com/RVC-Boss/GPT-SoVITS/commit/6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23)
+ - 内容: 语言分割及格式化优化
+ - 类型: 优化
+ - 提交: KamioRinn
+- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
+ - 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
+ - 类型: 修复
+ - 提交: XianYue0125
+
+
From ec1218893e09389b23e477bf685ced7e5ec2fb08 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:10:07 +0800
Subject: [PATCH 16/27] Update Badge (#2518)
* Update README.md
* Update README.md
* Update Badges
* specify ranges
---
README.md | 5 ++++-
docs/cn/README.md | 8 ++++++--
docs/ja/README.md | 10 +++++++---
docs/ko/README.md | 10 +++++++---
docs/tr/README.md | 9 ++++++---
5 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index 6cb19092..be807a08 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,11 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
+[](https://www.python.org)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://lj1995-gpt-sovits-proplus.hf.space/)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 39e37a42..793734d8 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -7,15 +7,19 @@

+[](https://www.python.org)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://lj1995-gpt-sovits-proplus.hf.space/)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[](https://rentry.co/GPT-SoVITS-guide#/)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
diff --git a/docs/ja/README.md b/docs/ja/README.md
index c3c42cd8..1ee9abb4 100644
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@@ -7,14 +7,18 @@

+[](https://www.python.org)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[](https://rentry.co/GPT-SoVITS-guide#/)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ja/Changelog_JA.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
diff --git a/docs/ko/README.md b/docs/ko/README.md
index 1028c00a..9ff32f83 100644
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@@ -7,14 +7,18 @@

+[](https://www.python.org)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[](https://rentry.co/GPT-SoVITS-guide#/)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ko/Changelog_KO.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
diff --git a/docs/tr/README.md b/docs/tr/README.md
index dd5d79b3..b80c764b 100644
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@@ -7,14 +7,17 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz

+[](https://www.python.org)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[](https://lj1995-gpt-sovits-proplus.hf.space/)
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[](https://rentry.co/GPT-SoVITS-guide#/)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/tr/Changelog_TR.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
From 706bec74f8b80f9ca915b746a960bd72c9a1c4f9 Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:11:08 +0800
Subject: [PATCH 17/27] Update assets.py
---
tools/assets.py | 90 +++++++++++++------------------------------------
1 file changed, 24 insertions(+), 66 deletions(-)
diff --git a/tools/assets.py b/tools/assets.py
index 533a020c..6851c064 100644
--- a/tools/assets.py
+++ b/tools/assets.py
@@ -1,81 +1,38 @@
js = """
-function createGradioAnimation() {
-
- const params = new URLSearchParams(window.location.search);
- if (params.get('__theme') !== 'light') {
- params.set('__theme', 'light'); // 仅当 __theme 不是 'light' 时设置为 'light'
- window.location.search = params.toString(); // 更新 URL,触发页面刷新
- }
-
- var container = document.createElement('div');
- container.id = 'gradio-animation';
- container.style.fontSize = '2em';
- container.style.fontWeight = '500';
- container.style.textAlign = 'center';
- container.style.marginBottom = '20px';
- container.style.fontFamily = '-apple-system, sans-serif, Arial, Calibri';
+function deleteTheme() {
- var text = 'Welcome to GPT-SoVITS !';
- for (var i = 0; i < text.length; i++) {
- (function(i){
- setTimeout(function(){
- var letter = document.createElement('span');
- letter.style.opacity = '0';
- letter.style.transition = 'opacity 0.5s';
- letter.innerText = text[i];
+const params = new URLSearchParams(window.location.search);
+if (params.has('__theme')) {
+ params.delete('__theme');
+ const newUrl = `${window.location.pathname}?${params.toString()}`;
+ window.location.replace(newUrl);
+}
- container.appendChild(letter);
-
- setTimeout(function() {
- letter.style.opacity = '1';
- }, 50);
- }, i * 250);
- })(i);
- }
- return 'Animation created';
}
"""
-
css = """
/* CSSStyleRule */
-
.markdown {
- background-color: lightblue;
padding: 6px 10px;
}
-.checkbox_info {
- color: var(--block-title-text-color) !important;
- font-size: var(--block-title-text-size) !important;
- font-weight: var(--block-title-text-weight) !important;
- height: 22px;
- margin-bottom: 8px !important;
+@media (prefers-color-scheme: light) {
+ .markdown {
+ background-color: lightblue;
+ color: #000;
+ }
+}
+
+@media (prefers-color-scheme: dark) {
+ .markdown {
+ background-color: #4b4b4b;
+ color: rgb(244, 244, 245);
+ }
}
::selection {
- background: #ffc078; !important;
-}
-
-#checkbox_train_dpo input[type="checkbox"]{
- margin-top: 6px;
-}
-
-#checkbox_train_dpo span {
- margin-top: 6px;
-}
-
-#checkbox_align_train {
- padding-top: 18px;
- padding-bottom: 18px;
-}
-
-#checkbox_align_infer input[type="checkbox"] {
- margin-top: 10px;
-}
-
-#checkbox_align_infer span {
- margin-top: 10px;
+ background: #ffc078 !important;
}
footer {
@@ -91,6 +48,7 @@ footer * {
}
"""
+
top_html = """
{}
@@ -98,12 +56,12 @@ top_html = """
-
-
-
+
+
+
From 85035f7ac09b8b90146cc4cd6bf334673e9baac7 Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Mon, 14 Jul 2025 18:56:22 +0800
Subject: [PATCH 18/27] add RTF performance figures
add RTF performance figures
---
README.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/README.md b/README.md
index be807a08..f9feb80d 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,11 @@ Unseen speakers few-shot fine-tuning demo:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+**RTF(inference speed) of GPT-SoVITS**:
+0.028 tested in 4060Ti, 0.014 tested in 4090. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference .
+
+请不要尬黑GPT-SoVITS推理速度慢,谢谢!
+
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
## Installation
From 42586e20f7e2d58026ed2fa68ac1ddf41fc48346 Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Mon, 14 Jul 2025 19:01:26 +0800
Subject: [PATCH 19/27] add RTF performance figures
add RTF performance figures
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index f9feb80d..86d50ea2 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,8 @@ Unseen speakers few-shot fine-tuning demo:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
-**RTF(inference speed) of GPT-SoVITS**:
-0.028 tested in 4060Ti, 0.014 tested in 4090. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference .
+**RTF(inference speed) of GPT-SoVITS v2 ProPlus**:
+0.028 tested in 4060Ti, 0.014 tested in 4090 (1400words~=4min, inference time is 3.36s), 0.526 in M4 CPU. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference .
请不要尬黑GPT-SoVITS推理速度慢,谢谢!
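As a quick sanity check of the figures quoted above (a standalone sketch, not repository code): RTF is the ratio of synthesis time to audio duration, so 3.36 s of inference for roughly 4 minutes of speech works out to about 0.014.

audio_duration_s = 4 * 60   # ~1400 words of speech is roughly 4 minutes
inference_time_s = 3.36     # RTX 4090 figure quoted in the README text
rtf = inference_time_s / audio_duration_s
print(f"RTF = {rtf:.3f}")   # 0.014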
From e476b01f30312139555d45a78cbd830f557d892c Mon Sep 17 00:00:00 2001
From: jiangsier-xyz <126842484+jiangsier-xyz@users.noreply.github.com>
Date: Wed, 16 Jul 2025 15:42:36 +0800
Subject: [PATCH 20/27] Fix TTS.py failing to recognize the actually supported versions v2Pro and v2ProPlus (#2490)
Also updates the default configuration.
Co-authored-by: jiangsier-xyz
---
GPT_SoVITS/TTS_infer_pack/TTS.py | 2 +-
GPT_SoVITS/configs/tts_infer.yaml | 16 ++++++++++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 795b55dd..98c10328 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -304,7 +304,7 @@ class TTS_Config:
configs: dict = self._load_configs(self.configs_path)
assert isinstance(configs, dict)
- version = configs.get("version", "v2").lower()
+ version = configs.get("version", "v2")
assert version in ["v1", "v2", "v3", "v4", "v2Pro", "v2ProPlus"]
self.default_configs[version] = configs.get(version, self.default_configs[version])
self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))
diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml
index 20c41a20..f31061cc 100644
--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@@ -22,6 +22,22 @@ v2:
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
version: v2
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
+v2Pro:
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+ device: cpu
+ is_half: false
+ t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
+ version: v2Pro
+ vits_weights_path: GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth
+v2ProPlus:
+ bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
+ cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
+ device: cpu
+ is_half: false
+ t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
+ version: v2ProPlus
+ vits_weights_path: GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth
v3:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
From 4d8ebf85233d4f1166d7cc02fdc595602975ca8f Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Wed, 16 Jul 2025 15:43:26 +0800
Subject: [PATCH 21/27] Update TTS.py
---
GPT_SoVITS/TTS_infer_pack/TTS.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 98c10328..da6a6dfe 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -304,8 +304,8 @@ class TTS_Config:
configs: dict = self._load_configs(self.configs_path)
assert isinstance(configs, dict)
- version = configs.get("version", "v2")
- assert version in ["v1", "v2", "v3", "v4", "v2Pro", "v2ProPlus"]
+ version = configs.get("version", "v2").lower()
+ assert version in ["v1", "v2", "v3", "v4", "v2pro", "v2proplus"]
self.default_configs[version] = configs.get(version, self.default_configs[version])
self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))
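The change above lower-cases the version string read from the config, so the accepted list must be lower-case as well. Below is a standalone sketch of the same case-insensitive matching (illustrative only; the CANONICAL table simply maps back to the mixed-case section names used in tts_infer.yaml).

CANONICAL = {
    "v1": "v1", "v2": "v2", "v3": "v3", "v4": "v4",
    "v2pro": "v2Pro", "v2proplus": "v2ProPlus",
}

def normalize_version(raw: str, default: str = "v2") -> str:
    key = (raw or default).lower()          # accept any casing from the YAML
    if key not in CANONICAL:
        raise ValueError(f"Invalid version: {raw!r}")
    return CANONICAL[key]                   # canonical, mixed-case section name

print(normalize_version("V2PROPLUS"))       # v2ProPlus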
From 2d09bbe63ab6e33656e7d44b76a96e0072ae994b Mon Sep 17 00:00:00 2001
From: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Date: Wed, 16 Jul 2025 15:44:04 +0800
Subject: [PATCH 22/27] Update tts_infer.yaml
---
GPT_SoVITS/configs/tts_infer.yaml | 1 +
1 file changed, 1 insertion(+)
diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml
index f31061cc..531aeb51 100644
--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@@ -1,3 +1,4 @@
+version: v2ProPlus
custom:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
From cefafee32cfc08f0f622ef460578b09485cc189e Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Thu, 17 Jul 2025 20:28:25 +0800
Subject: [PATCH 23/27] Add Distil (#2531)
---
tools/asr/config.py | 24 +++++++----
tools/asr/fasterwhisper_asr.py | 78 +++++++++++++++++++++++++---------
webui.py | 9 ++--
3 files changed, 77 insertions(+), 34 deletions(-)
diff --git a/tools/asr/config.py b/tools/asr/config.py
index c04069b2..9c26a4f6 100644
--- a/tools/asr/config.py
+++ b/tools/asr/config.py
@@ -6,15 +6,10 @@ def check_fw_local_models():
启动时检查本地是否有 Faster Whisper 模型.
"""
model_size_list = [
- "tiny",
- "tiny.en",
- "base",
- "base.en",
- "small",
- "small.en",
"medium",
"medium.en",
- "large",
+ "distil-large-v2",
+ "distil-large-v3",
"large-v1",
"large-v2",
"large-v3",
@@ -25,11 +20,24 @@ def check_fw_local_models():
return model_size_list
+def get_models():
+ model_size_list = [
+ "medium",
+ "medium.en",
+ "distil-large-v2",
+ "distil-large-v3",
+ "large-v1",
+ "large-v2",
+ "large-v3",
+ ]
+ return model_size_list
+
+
asr_dict = {
"达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
"Faster Whisper (多语种)": {
"lang": ["auto", "zh", "en", "ja", "ko", "yue"],
- "size": check_fw_local_models(),
+ "size": get_models(),
"path": "fasterwhisper_asr.py",
"precision": ["float32", "float16", "int8"],
},
diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py
index 27cabbc2..a2ebe975 100644
--- a/tools/asr/fasterwhisper_asr.py
+++ b/tools/asr/fasterwhisper_asr.py
@@ -1,15 +1,16 @@
import argparse
import os
+import time
import traceback
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
-
import torch
from faster_whisper import WhisperModel
+from huggingface_hub import snapshot_download
+from huggingface_hub.errors import LocalEntryNotFoundError
from tqdm import tqdm
-from tools.asr.config import check_fw_local_models
+from tools.asr.config import get_models
+from tools.asr.funasr_asr import only_asr
from tools.my_utils import load_cudnn
# fmt: off
@@ -38,20 +39,54 @@ language_code_list = [
# fmt: on
-def execute_asr(input_folder, output_folder, model_size, language, precision):
- if "-local" in model_size:
- model_size = model_size[:-6]
- model_path = f"tools/asr/models/faster-whisper-{model_size}"
+def download_model(model_size: str):
+ if "distil" in model_size:
+ repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
else:
- model_path = model_size
+ repo_id = f"Systran/faster-whisper-{model_size}"
+ model_path = f"tools/asr/models/{repo_id.strip('Systran/')}"
+
+ files: list[str] = [
+ "config.json",
+ "model.bin",
+ "tokenizer.json",
+ "vocabulary.txt",
+ ]
+ if model_size == "large-v3" or "distil" in model_size:
+ files.append("preprocessor_config.json")
+ files.append("vocabulary.json")
+
+ files.remove("vocabulary.txt")
+
+ for attempt in range(2):
+ try:
+ snapshot_download(
+ repo_id=repo_id,
+ allow_patterns=files,
+ local_dir=model_path,
+ )
+ break
+ except LocalEntryNotFoundError:
+ if attempt < 1:
+ time.sleep(2)
+ else:
+ print("[ERROR] LocalEntryNotFoundError and no fallback.")
+ traceback.print_exc()
+ exit(1)
+ except Exception as e:
+ print(f"[ERROR] Unexpected error on attempt {attempt + 1}: {e}")
+ traceback.print_exc()
+ exit(1)
+
+ return model_path
+
+
+def execute_asr(input_folder, output_folder, model_path, language, precision):
if language == "auto":
language = None # 不设置语种由模型自动输出概率最高的语种
- print("loading faster whisper model:", model_size, model_path)
+ print("loading faster whisper model:", model_path, model_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
- try:
- model = WhisperModel(model_path, device=device, compute_type=precision)
- except:
- return print(traceback.format_exc())
+ model = WhisperModel(model_path, device=device, compute_type=precision)
input_file_names = os.listdir(input_folder)
input_file_names.sort()
@@ -73,16 +108,15 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
if info.language == "zh":
print("检测为中文文本, 转 FunASR 处理")
- if "only_asr" not in globals():
- from tools.asr.funasr_asr import only_asr # 如果用英文就不需要导入下载模型
text = only_asr(file_path, language=info.language.lower())
if text == "":
for segment in segments:
text += segment.text
output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
- except:
- print(traceback.format_exc())
+ except Exception as e:
+ print(e)
+ traceback.print_exc()
output_folder = output_folder or "output/asr_opt"
os.makedirs(output_folder, exist_ok=True)
@@ -107,7 +141,7 @@ if __name__ == "__main__":
"--model_size",
type=str,
default="large-v3",
- choices=check_fw_local_models(),
+ choices=get_models(),
help="Model Size of Faster Whisper",
)
parser.add_argument(
@@ -123,10 +157,14 @@ if __name__ == "__main__":
)
cmd = parser.parse_args()
+ model_size = cmd.model_size
+ if model_size == "large":
+ model_size = "large-v3"
+ model_path = download_model(model_size)
output_file_path = execute_asr(
input_folder=cmd.input_folder,
output_folder=cmd.output_folder,
- model_size=cmd.model_size,
+ model_path=model_path,
language=cmd.language,
precision=cmd.precision,
)
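The repo-id construction in the new download_model helper is compact; the standalone sketch below (illustrative only) spells out how a distil size such as "distil-large-v3" maps onto the Systran repository naming scheme used by the patch.

def repo_id_for(model_size: str) -> str:
    # "distil-large-v3" -> ("distil", "large-v3") -> Systran/faster-distil-whisper-large-v3
    if "distil" in model_size:
        prefix, rest = model_size.split("-", maxsplit=1)
        return f"Systran/faster-{prefix}-whisper-{rest}"
    # plain sizes keep the usual faster-whisper naming
    return f"Systran/faster-whisper-{model_size}"

print(repo_id_for("distil-large-v3"))  # Systran/faster-distil-whisper-large-v3
print(repo_id_for("large-v2"))         # Systran/faster-whisper-large-v2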
diff --git a/webui.py b/webui.py
index 9981cfcc..9a6aae5f 100644
--- a/webui.py
+++ b/webui.py
@@ -86,13 +86,10 @@ from config import (
from tools import my_utils
from tools.my_utils import check_details, check_for_existance
-# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
-try:
- import gradio.analytics as analytics
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
- analytics.version_check = lambda: None
-except:
- ...
+# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
import gradio as gr
n_cpu = cpu_count()
From b9211657d8dfe8cd46f6b6eb9cfc55d5989e6548 Mon Sep 17 00:00:00 2001
From: ChasonJiang <46401978+ChasonJiang@users.noreply.github.com>
Date: Fri, 18 Jul 2025 11:54:40 +0800
Subject: [PATCH 24/27] Optimize the TTS_Config code logic (#2536)
* Optimize the TTS_Config code logic
* Save tts_config after loading the VITS weights
---
GPT_SoVITS/TTS_infer_pack/TTS.py | 20 +++++++++++++-------
GPT_SoVITS/configs/tts_infer.yaml | 1 -
GPT_SoVITS/inference_webui_fast.py | 3 ++-
3 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index da6a6dfe..0c1d2484 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -304,10 +304,10 @@ class TTS_Config:
configs: dict = self._load_configs(self.configs_path)
assert isinstance(configs, dict)
- version = configs.get("version", "v2").lower()
- assert version in ["v1", "v2", "v3", "v4", "v2pro", "v2proplus"]
- self.default_configs[version] = configs.get(version, self.default_configs[version])
- self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))
+ configs_ = deepcopy(self.default_configs)
+ configs_.update(configs)
+ self.configs: dict = configs_.get("custom", configs_["v2"])
+ self.default_configs = deepcopy(configs_)
self.device = self.configs.get("device", torch.device("cpu"))
if "cuda" in str(self.device) and not torch.cuda.is_available():
@@ -315,11 +315,13 @@ class TTS_Config:
self.device = torch.device("cpu")
self.is_half = self.configs.get("is_half", False)
- # if str(self.device) == "cpu" and self.is_half:
- # print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
- # self.is_half = False
+ if str(self.device) == "cpu" and self.is_half:
+ print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
+ self.is_half = False
+ version = self.configs.get("version", None)
self.version = version
+ assert self.version in ["v1", "v2", "v3", "v4", "v2Pro", "v2ProPlus"], "Invalid version!"
self.t2s_weights_path = self.configs.get("t2s_weights_path", None)
self.vits_weights_path = self.configs.get("vits_weights_path", None)
self.bert_base_path = self.configs.get("bert_base_path", None)
@@ -576,6 +578,10 @@ class TTS:
if self.configs.is_half and str(self.configs.device) != "cpu":
self.vits_model = self.vits_model.half()
+ self.configs.save_configs()
+
+
+
def init_t2s_weights(self, weights_path: str):
print(f"Loading Text2Semantic weights from {weights_path}")
self.configs.t2s_weights_path = weights_path
diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml
index 531aeb51..f31061cc 100644
--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@@ -1,4 +1,3 @@
-version: v2ProPlus
custom:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py
index 2c159d81..51a120f1 100644
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@@ -125,7 +125,8 @@ is_exist_s2gv4 = os.path.exists(path_sovits_v4)
tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")
tts_config.device = device
tts_config.is_half = is_half
-tts_config.version = version
+# tts_config.version = version
+tts_config.update_version(version)
if gpt_path is not None:
if "!" in gpt_path or "!" in gpt_path:
gpt_path = name2gpt_path[gpt_path]
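The TTS_Config change in this patch replaces per-version lookups with a layered merge over the built-in defaults. A simplified sketch of that strategy follows (illustrative only, using a toy defaults table rather than the real preset dictionaries).

from copy import deepcopy

def resolve_configs(default_configs: dict, loaded: dict) -> dict:
    merged = deepcopy(default_configs)          # keep the built-in defaults intact
    merged.update(loaded)                       # file entries override per top-level key
    return merged.get("custom", merged["v2"])   # "custom" wins, otherwise fall back to v2

defaults = {"v2": {"device": "cpu", "version": "v2"}}
loaded = {"custom": {"device": "cuda", "version": "v2ProPlus"}}
print(resolve_configs(defaults, loaded))        # {'device': 'cuda', 'version': 'v2ProPlus'}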
From b5a67e62470fb87e7cea28ecad7c7c3bf7a58afd Mon Sep 17 00:00:00 2001
From: ChasonJiang <46401978+ChasonJiang@users.noreply.github.com>
Date: Fri, 18 Jul 2025 14:59:59 +0800
Subject: [PATCH 25/27] Fix the GPT loss computation issue (#2537)
* Fix the GPT loss computation issue
* fallback tts config
---
GPT_SoVITS/AR/models/t2s_model.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py
index 4725b7a3..7196d6ab 100644
--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@@ -356,7 +356,7 @@ class Text2SemanticDecoder(nn.Module):
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_position(x)
- x_mask = make_pad_mask(x_lens)
+ x_mask = make_pad_mask_left(x_lens)
y_mask = make_pad_mask(y_lens)
y_mask_int = y_mask.type(torch.int64)
@@ -420,7 +420,7 @@ class Text2SemanticDecoder(nn.Module):
mask=xy_attn_mask,
)
x_len = x_lens.max()
- logits = self.ar_predict_layer(xy_dec[:, x_len:])
+ logits = self.ar_predict_layer(xy_dec[:, x_len-1:])
###### DPO #############
reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(
@@ -432,7 +432,7 @@ class Text2SemanticDecoder(nn.Module):
mask=reject_xy_attn_mask,
)
x_len = x_lens.max()
- reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len:])
+ reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len-1:])
# loss
# from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum
@@ -455,7 +455,7 @@ class Text2SemanticDecoder(nn.Module):
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_position(x)
- x_mask = make_pad_mask(x_lens)
+ x_mask = make_pad_mask_left(x_lens)
y_mask = make_pad_mask(y_lens)
y_mask_int = y_mask.type(torch.int64)
@@ -502,7 +502,7 @@ class Text2SemanticDecoder(nn.Module):
(xy_pos, None),
mask=xy_attn_mask,
)
- logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1)
+ logits = self.ar_predict_layer(xy_dec[:, x_len-1:]).permute(0, 2, 1)
# loss
# from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum
loss = F.cross_entropy(logits, targets, reduction="sum")
@@ -578,7 +578,7 @@ class Text2SemanticDecoder(nn.Module):
def pad_y_eos(self, y, y_mask_int, eos_id):
targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(y_mask_int, (0, 1), value=1)
# 错位
- return targets[:, :-1], targets[:, 1:]
+ return targets[:, :-1], targets
def infer_panel_batch_infer(
self,
From 0be59c8043a12112934d474ff4cc65658d848e8f Mon Sep 17 00:00:00 2001
From: 多玩幻灵qwq
Date: Sat, 19 Jul 2025 00:29:48 +0800
Subject: [PATCH 26/27] fix: correct a link (#2539)
---
tools/assets.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/assets.py b/tools/assets.py
index 6851c064..b2c302fe 100644
--- a/tools/assets.py
+++ b/tools/assets.py
@@ -59,7 +59,7 @@ top_html = """
-
+
From fdf794e31d1fd6f91c5cb4fbb0396094491a31ac Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Sat, 2 Aug 2025 17:47:15 +0800
Subject: [PATCH 27/27] Update WSL Rocm (#2561)
---
install.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/install.sh b/install.sh
index a2fa751e..7d80ec28 100644
--- a/install.sh
+++ b/install.sh
@@ -373,7 +373,7 @@ if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
cd "${location}"/torch/lib/ || exit
rm libhsa-runtime64.so*
- cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
+ cp "$(readlink -f /opt/rocm/lib/libhsa-runtime64.so)" libhsa-runtime64.so
echo -e "${SUCCESS}ROCm Runtime Lib Updated..."
fi
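The install.sh change copies whatever file the libhsa-runtime64.so symlink actually points to, instead of hard-coding the .1.2 version. A Python equivalent of that resolution step, purely illustrative since the patch itself relies on readlink -f:

import os

lib = "/opt/rocm/lib/libhsa-runtime64.so"
# os.path.realpath follows the symlink chain, like `readlink -f`,
# so the copied file tracks whatever runtime version ROCm ships.
print(os.path.realpath(lib))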