Compare commits


No commits in common. "main" and "20250422v4" have entirely different histories.

114 changed files with 3749 additions and 10757 deletions

@@ -1,198 +1,8 @@
-GPT_SoVITS/pretrained_models/*
+docs
-tools/asr/models/*
-tools/uvr5/uvr5_weights/*
-.git
-.DS_Store
-.vscode
-*.pyc
-env
-runtime
-.idea
-output
 logs
-SoVITS_weights*/
+output
-GPT_weights*/
+reference
 SoVITS_weights
 GPT_weights
 TEMP
-weight.json
+.git
-ffmpeg*
-ffprobe*
-cfg.json
-speakers.json
-ref_audios
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc

@@ -1,194 +0,0 @@
$ErrorActionPreference = "Stop"
Write-Host "Current location: $(Get-Location)"
$cuda = $env:TORCH_CUDA
if (-not $cuda) {
Write-Error "Missing TORCH_CUDA env (cu124 or cu128)"
exit 1
}
$date = $env:DATE_SUFFIX
if ([string]::IsNullOrWhiteSpace($date)) {
$date = Get-Date -Format "MMdd"
}
$pkgName = "GPT-SoVITS-$date"
$tmpDir = "tmp"
$srcDir = $PWD
$suffix = $env:PKG_SUFFIX
if (-not [string]::IsNullOrWhiteSpace($suffix)) {
$pkgName = "$pkgName$suffix"
}
$pkgName = "$pkgName-$cuda"
$baseHF = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main"
$PRETRAINED_URL = "$baseHF/pretrained_models.zip"
$G2PW_URL = "$baseHF/G2PWModel.zip"
$UVR5_URL = "$baseHF/uvr5_weights.zip"
$NLTK_URL = "$baseHF/nltk_data.zip"
$JTALK_URL = "$baseHF/open_jtalk_dic_utf_8-1.11.tar.gz"
$PYTHON_VERSION = "3.11.12"
$PY_RELEASE_VERSION = "20250409"
Write-Host "[INFO] Cleaning .git..."
Remove-Item "$srcDir\.git" -Recurse -Force -ErrorAction SilentlyContinue
Write-Host "[INFO] Creating tmp dir..."
New-Item -ItemType Directory -Force -Path $tmpDir
Write-Host "[INFO] System Python version:"
python --version
python -m site
Write-Host "[INFO] Downloading Python $PYTHON_VERSION..."
$zst = "$tmpDir\python.tar.zst"
Invoke-WebRequest "https://github.com/astral-sh/python-build-standalone/releases/download/$PY_RELEASE_VERSION/cpython-$PYTHON_VERSION+$PY_RELEASE_VERSION-x86_64-pc-windows-msvc-pgo-full.tar.zst" -OutFile $zst
& "C:\Program Files\7-Zip\7z.exe" e $zst -o"$tmpDir" -aoa
$tar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1
& "C:\Program Files\7-Zip\7z.exe" x $tar.FullName -o"$tmpDir\extracted" -aoa
Move-Item "$tmpDir\extracted\python\install" "$srcDir\runtime"
Write-Host "[INFO] Copying Redistributing Visual C++ Runtime..."
$vswhere = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe"
$vsPath = & $vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
$redistRoot = Join-Path $vsPath "VC\Redist\MSVC"
$targetVer = Get-ChildItem -Path $redistRoot -Directory |
Where-Object { $_.Name -match "^14\." } |
Sort-Object Name -Descending |
Select-Object -First 1
$x64Path = Join-Path $targetVer.FullName "x64"
Get-ChildItem -Path $x64Path -Directory | Where-Object {
$_.Name -match '^Microsoft\..*\.(CRT|OpenMP)$'
} | ForEach-Object {
Get-ChildItem -Path $_.FullName -Filter "*.dll" | ForEach-Object {
Copy-Item -Path $_.FullName -Destination "$srcDir\runtime" -Force
}
}
function DownloadAndUnzip($url, $targetRelPath) {
$filename = Split-Path $url -Leaf
$tmpZip = "$tmpDir\$filename"
Invoke-WebRequest $url -OutFile $tmpZip
Expand-Archive -Path $tmpZip -DestinationPath $tmpDir -Force
$subdirName = $filename -replace '\.zip$', ''
$sourcePath = Join-Path $tmpDir $subdirName
$destRoot = Join-Path $srcDir $targetRelPath
$destPath = Join-Path $destRoot $subdirName
if (Test-Path $destPath) {
Remove-Item $destPath -Recurse -Force
}
Move-Item $sourcePath $destRoot
Remove-Item $tmpZip
}
Write-Host "[INFO] Download pretrained_models..."
DownloadAndUnzip $PRETRAINED_URL "GPT_SoVITS"
Write-Host "[INFO] Download G2PWModel..."
DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"
Write-Host "[INFO] Download UVR5 model..."
DownloadAndUnzip $UVR5_URL "tools\uvr5"
Write-Host "[INFO] Downloading funasr..."
$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
$funasrZip = "$tmpDir\funasr.zip"
Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force
Remove-Item $funasrZip
Write-Host "[INFO] Download ffmpeg..."
$ffUrl = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
$ffZip = "$tmpDir\ffmpeg.zip"
Invoke-WebRequest -Uri $ffUrl -OutFile $ffZip
Expand-Archive $ffZip -DestinationPath $tmpDir -Force
$ffDir = Get-ChildItem -Directory "$tmpDir" | Where-Object { $_.Name -like "ffmpeg*" } | Select-Object -First 1
Move-Item "$($ffDir.FullName)\bin\ffmpeg.exe" "$srcDir\runtime"
Move-Item "$($ffDir.FullName)\bin\ffprobe.exe" "$srcDir\runtime"
Remove-Item $ffZip
Remove-Item $ffDir.FullName -Recurse -Force
Write-Host "[INFO] Installing PyTorch..."
& ".\runtime\python.exe" -m ensurepip
& ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location
switch ($cuda) {
"cu124" {
& ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
}
"cu128" {
& ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location
}
default {
Write-Error "Unsupported CUDA version: $cuda"
exit 1
}
}
Write-Host "[INFO] Installing dependencies..."
& ".\runtime\python.exe" -m pip install -r extra-req.txt --no-deps --no-warn-script-location
& ".\runtime\python.exe" -m pip install -r requirements.txt --no-warn-script-location
Write-Host "[INFO] Downloading NLTK and pyopenjtalk dictionary..."
$PYTHON = ".\runtime\python.exe"
$prefix = & $PYTHON -c "import sys; print(sys.prefix)"
$jtalkPath = & $PYTHON -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))"
$nltkZip = "$tmpDir\nltk_data.zip"
$jtalkTar = "$tmpDir\open_jtalk_dic_utf_8-1.11.tar.gz"
Invoke-WebRequest -Uri $NLTK_URL -OutFile $nltkZip
Expand-Archive -Path $nltkZip -DestinationPath $prefix -Force
Remove-Item $nltkZip
Invoke-WebRequest -Uri $JTALK_URL -OutFile $jtalkTar
& "C:\Program Files\7-Zip\7z.exe" e $jtalkTar -o"$tmpDir" -aoa
$innerTar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1
& "C:\Program Files\7-Zip\7z.exe" x $innerTar.FullName -o"$jtalkPath" -aoa
Remove-Item $jtalkTar
Remove-Item $innerTar.FullName
Write-Host "[INFO] Preparing final directory $pkgName ..."
$items = @(Get-ChildItem -Filter "*.sh") +
@(Get-ChildItem -Filter "*.ipynb") +
@("$tmpDir", ".github", "Docker", "docs", ".gitignore", ".dockerignore", "README.md")
Remove-Item $items -Force -Recurse -ErrorAction SilentlyContinue
$curr = Get-Location
Set-Location ../
Get-ChildItem .
Copy-Item -Path $curr -Destination $pkgName -Recurse
$7zPath = "$pkgName.7z"
$start = Get-Date
Write-Host "Compress Starting at $start"
& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1
$end = Get-Date
Write-Host "Elapsed time: $($end - $start)"
Get-ChildItem .
python -m pip install --upgrade pip
python -m pip install "modelscope" "huggingface_hub[hf_transfer]" --no-warn-script-location
Write-Host "[INFO] Uploading to ModelScope..."
$msUser = $env:MODELSCOPE_USERNAME
$msToken = $env:MODELSCOPE_TOKEN
if (-not $msUser -or -not $msToken) {
Write-Error "Missing MODELSCOPE_USERNAME or MODELSCOPE_TOKEN"
exit 1
}
modelscope upload "$msUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $msToken
Write-Host "[SUCCESS] Uploaded: $7zPath to ModelScope"
Write-Host "[INFO] Uploading to HuggingFace..."
$hfUser = $env:HUGGINGFACE_USERNAME
$hfToken = $env:HUGGINGFACE_TOKEN
if (-not $hfUser -or -not $hfToken) {
Write-Error "Missing HUGGINGFACE_USERNAME or HUGGINGFACE_TOKEN"
exit 1
}
$env:HF_HUB_ENABLE_HF_TRANSFER = "1"
huggingface-cli upload "$hfUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $hfToken
Write-Host "[SUCCESS] Uploaded: $7zPath to HuggingFace"

@@ -1,38 +0,0 @@
name: Build and Upload Windows Package
on:
workflow_dispatch:
inputs:
date:
description: "Date suffix (optional)"
required: false
default: ""
suffix:
description: "Package name suffix (optional)"
required: false
default: ""
jobs:
build:
runs-on: windows-latest
strategy:
matrix:
torch_cuda: [cu124, cu128]
env:
TORCH_CUDA: ${{ matrix.torch_cuda }}
MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }}
MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }}
HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }}
HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
DATE_SUFFIX: ${{ github.event.inputs.date }}
PKG_SUFFIX: ${{ github.event.inputs.suffix }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Run Build and Upload Script
shell: pwsh
run: |
Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1
../build_windows_packages.ps1

@@ -1,276 +0,0 @@
name: Build and Publish Docker Image
on:
workflow_dispatch:
jobs:
generate-meta:
runs-on: ubuntu-22.04
outputs:
tag: ${{ steps.meta.outputs.tag }}
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Generate Tag
id: meta
run: |
DATE=$(date +'%Y%m%d')
COMMIT=$(git rev-parse --short=6 HEAD)
echo "tag=${DATE}-${COMMIT}" >> $GITHUB_OUTPUT
build-amd64:
needs: generate-meta
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- cuda_version: 12.6
lite: true
torch_base: lite
tag_prefix: cu126-lite
- cuda_version: 12.6
lite: false
torch_base: full
tag_prefix: cu126
- cuda_version: 12.8
lite: true
torch_base: lite
tag_prefix: cu128-lite
- cuda_version: 12.8
lite: false
torch_base: full
tag_prefix: cu128
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Free up disk space
run: |
echo "Before cleanup:"
df -h
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /opt/hostedtoolcache/PyPy
sudo rm -rf /opt/hostedtoolcache/go
sudo rm -rf /opt/hostedtoolcache/node
sudo rm -rf /opt/hostedtoolcache/Ruby
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/pipx
sudo rm -rf /opt/az
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/lib/google-cloud-sdk
sudo rm -rf /usr/lib/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /usr/local/julia1.11.5
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/share/miniconda
sudo rm -rf /usr/share/az_12.1.0
sudo rm -rf /usr/share/dotnet
echo "After cleanup:"
df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Build and Push Docker Image (amd64)
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile
push: true
platforms: linux/amd64
build-args: |
LITE=${{ matrix.lite }}
TORCH_BASE=${{ matrix.torch_base }}
CUDA_VERSION=${{ matrix.cuda_version }}
WORKFLOW=true
tags: |
xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-amd64
xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-amd64
build-arm64:
needs: generate-meta
runs-on: ubuntu-22.04-arm
strategy:
matrix:
include:
- cuda_version: 12.6
lite: true
torch_base: lite
tag_prefix: cu126-lite
- cuda_version: 12.6
lite: false
torch_base: full
tag_prefix: cu126
- cuda_version: 12.8
lite: true
torch_base: lite
tag_prefix: cu128-lite
- cuda_version: 12.8
lite: false
torch_base: full
tag_prefix: cu128
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Free up disk space
run: |
echo "Before cleanup:"
df -h
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /opt/hostedtoolcache/PyPy
sudo rm -rf /opt/hostedtoolcache/go
sudo rm -rf /opt/hostedtoolcache/node
sudo rm -rf /opt/hostedtoolcache/Ruby
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/pipx
sudo rm -rf /opt/az
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/lib/google-cloud-sdk
sudo rm -rf /usr/lib/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /usr/local/julia1.11.5
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/share/miniconda
sudo rm -rf /usr/share/az_12.1.0
sudo rm -rf /usr/share/dotnet
echo "After cleanup:"
df -h
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Build and Push Docker Image (arm64)
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile
push: true
platforms: linux/arm64
build-args: |
LITE=${{ matrix.lite }}
TORCH_BASE=${{ matrix.torch_base }}
CUDA_VERSION=${{ matrix.cuda_version }}
WORKFLOW=true
tags: |
xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-arm64
xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-arm64
merge-and-clean:
needs:
- build-amd64
- build-arm64
- generate-meta
runs-on: ubuntu-latest
strategy:
matrix:
include:
- tag_prefix: cu126-lite
- tag_prefix: cu126
- tag_prefix: cu128-lite
- tag_prefix: cu128
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Merge amd64 and arm64 into multi-arch image
run: |
DATE_TAG=${{ needs.generate-meta.outputs.tag }}
TAG_PREFIX=${{ matrix.tag_prefix }}
docker buildx imagetools create \
--tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG} \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-amd64 \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-arm64
docker buildx imagetools create \
--tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX} \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-amd64 \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-arm64
- name: Delete old platform-specific tags via Docker Hub API
env:
DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }}
DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_PASSWORD }}
TAG_PREFIX: ${{ matrix.tag_prefix }}
DATE_TAG: ${{ needs.generate-meta.outputs.tag }}
run: |
sudo apt-get update && sudo apt-get install -y jq
TOKEN=$(curl -s -u $DOCKER_HUB_USERNAME:$DOCKER_HUB_TOKEN \
"https://auth.docker.io/token?service=registry.docker.io&scope=repository:$DOCKER_HUB_USERNAME/gpt-sovits:pull,push,delete" \
| jq -r .token)
for PLATFORM in amd64 arm64; do
SAFE_PLATFORM=$(echo $PLATFORM | sed 's/\//-/g')
TAG="${TAG_PREFIX}-${DATE_TAG}-${SAFE_PLATFORM}"
LATEST_TAG="latest-${TAG_PREFIX}-${SAFE_PLATFORM}"
for DEL_TAG in "$TAG" "$LATEST_TAG"; do
echo "Deleting tag: $DEL_TAG"
curl -X DELETE -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/$DOCKER_HUB_USERNAME/gpt-sovits/manifests/$DEL_TAG
done
done
create-default:
runs-on: ubuntu-latest
needs:
- merge-and-clean
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}
- name: Create Default Tag
run: |
docker buildx imagetools create \
--tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest \
${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-cu126-lite

.gitignore

@@ -7,8 +7,13 @@ runtime
 .idea
 output
 logs
-SoVITS_weights*/
-GPT_weights*/
+reference
+GPT_weights
+SoVITS_weights
+GPT_weights_v2
+SoVITS_weights_v2
+GPT_weights_v3
+SoVITS_weights_v3
 TEMP
 weight.json
 ffmpeg*

@@ -1,15 +0,0 @@
ci:
autoupdate_schedule: monthly
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
# Run the linter.
- id: ruff
types_or: [ python, pyi ]
args: [ --fix , "--exit-zero" ]
# Run the formatter.
- id: ruff-format
types_or: [ python, pyi ]
args: [ --line-length, "120", --target-version, "py310" ]

@@ -1,191 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# GPT-SoVITS Infer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Env Setup (Run Once Only)\n",
"## 环境配置, 只需运行一次"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"%%writefile /content/setup.sh\n",
"set -e\n",
"\n",
"cd /content\n",
"\n",
"git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"\n",
"cd GPT-SoVITS\n",
"\n",
"mkdir -p GPT_weights\n",
"\n",
"mkdir -p SoVITS_weights\n",
"\n",
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
" :\n",
"else\n",
" conda create -n GPTSoVITS python=3.10 -y\n",
"fi\n",
"\n",
"source activate GPTSoVITS\n",
"\n",
"pip install ipykernel\n",
"\n",
"bash install.sh --device CU126 --source HF"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"%pip install -q condacolab\n",
"import condacolab\n",
"condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n",
"!cd /content && bash setup.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download From HuggingFace"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"outputs": [],
"source": [
"# Modify These\n",
"USER_ID = \"AkitoP\"\n",
"REPO_NAME = \"GPT-SoVITS-v2-aegi\"\n",
"BRANCH = \"main\"\n",
"GPT_PATH = \"new_aegigoe-e100.ckpt\"\n",
"SOVITS_PATH = \"new_aegigoe_e60_s32220.pth\"\n",
"\n",
"# Do Not Modify\n",
"HF_BASE = \"https://huggingface.co\"\n",
"REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n",
"GPT_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{GPT_PATH}\"\n",
"SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{SOVITS_PATH}\"\n",
"\n",
"!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n",
"!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download From ModelScope"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Modify These\n",
"USER_ID = \"aihobbyist\"\n",
"REPO_NAME = \"GPT-SoVits-V2-models\"\n",
"BRANCH = \"master\"\n",
"GPT_PATH = \"Genshin_Impact/EN/GPT_GenshinImpact_EN_5.1.ckpt\"\n",
"SOVITS_PATH = \"Wuthering_Waves/CN/SV_WutheringWaves_CN_1.3.pth\"\n",
"\n",
"# Do Not Modify\n",
"HF_BASE = \"https://www.modelscope.cn/models\"\n",
"REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n",
"GPT_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{GPT_PATH}\"\n",
"SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{SOVITS_PATH}\"\n",
"\n",
"!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n",
"!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Launch WebUI\n",
"# 启动 WebUI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Docker/damo.sha256

@@ -0,0 +1,3 @@
5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb

Docker/download.py

@@ -0,0 +1,8 @@
# Download moda ASR related models
from modelscope import snapshot_download
model_dir = snapshot_download(
"damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", revision="v2.0.4"
)
model_dir = snapshot_download("damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", revision="v2.0.4")
model_dir = snapshot_download("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", revision="v2.0.4")

Docker/download.sh

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -Eeuo pipefail
echo "Downloading models..."
aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue
echo "Checking SHA256..."
parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
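The manifest consumed by sha256sum -c above pairs one expected digest with one absolute path per line (see Docker/damo.sha256 above and Docker/links.sha256 below). For reference, a minimal standalone Python sketch of the same verification step (a hypothetical helper, not part of the repository):

import hashlib
import pathlib
import sys

def verify_manifest(manifest_path: str) -> bool:
    # Each manifest line is "<sha256 digest> <absolute path>".
    ok = True
    for line in pathlib.Path(manifest_path).read_text().splitlines():
        if not line.strip():
            continue
        expected, target = line.split(maxsplit=1)
        digest = hashlib.sha256(pathlib.Path(target.strip()).read_bytes()).hexdigest()
        if digest != expected:
            print(f"FAILED: {target}", file=sys.stderr)
            ok = False
    return ok

if __name__ == "__main__":
    sys.exit(0 if verify_manifest("/workspace/Docker/links.sha256") else 1)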

@@ -1,33 +0,0 @@
#!/bin/bash
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "$SCRIPT_DIR" || exit 1
cd .. || exit 1
set -e
source "$HOME/miniconda3/etc/profile.d/conda.sh"
mkdir -p GPT_SoVITS
mkdir -p GPT_SoVITS/text
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
bash install.sh --device "CU${CUDA_VERSION//./}" --source HF
pip cache purge
pip show torch
rm -rf /tmp/* /var/tmp/*
rm -rf "$HOME/miniconda3/pkgs"
mkdir -p "$HOME/miniconda3/pkgs"
rm -rf /root/.conda /root/.cache

Docker/links.sha256

@@ -0,0 +1,12 @@
b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx

Docker/links.txt

@@ -0,0 +1,34 @@
# GPT-SoVITS models
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
out=GPT_SoVITS/pretrained_models/s2D488k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
out=GPT_SoVITS/pretrained_models/s2G488k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
# UVR5
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx

@@ -1,70 +0,0 @@
#!/bin/bash
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "$SCRIPT_DIR" || exit 1
cd .. || exit 1
if [ -d "$HOME/miniconda3" ]; then
exit 0
fi
WORKFLOW=${WORKFLOW:-"false"}
TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"}
if [ "$WORKFLOW" = "true" ]; then
WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
else
WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
fi
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
"${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh
elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then
"${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh
else
exit 1
fi
LOG_PATH="/tmp/miniconda-install.log"
bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1
if [ $? -eq 0 ]; then
echo "== Miniconda Installed =="
else
echo "Failed to Install miniconda"
tail -n 50 "$LOG_PATH"
exit 1
fi
rm miniconda.sh
source "$HOME/miniconda3/etc/profile.d/conda.sh"
"$HOME/miniconda3/bin/conda" config --add channels conda-forge
"$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null
"$HOME/miniconda3/bin/conda" install python=3.11 -q -y
"$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y
if [ "$CUDA_VERSION" = "12.8" ]; then
"$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
elif [ "$CUDA_VERSION" = "12.6" ]; then
"$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
fi
"$HOME/miniconda3/bin/pip" cache purge
rm $LOG_PATH
rm -rf "$HOME/miniconda3/pkgs"
mkdir -p "$HOME/miniconda3/pkgs"
rm -rf "$HOME/.conda" "$HOME/.cache"

@@ -1,62 +1,42 @@
-ARG CUDA_VERSION=12.6
-ARG TORCH_BASE=full
-FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}
-LABEL maintainer="XXXXRT"
-LABEL version="V4"
+# Base CUDA image
+FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
+LABEL maintainer="breakstring@hotmail.com"
+LABEL version="dev-20240209"
 LABEL description="Docker image for GPT-SoVITS"
-ARG CUDA_VERSION=12.6
-ENV CUDA_VERSION=${CUDA_VERSION}
-SHELL ["/bin/bash", "-c"]
-WORKDIR /workspace/GPT-SoVITS
-COPY Docker /workspace/GPT-SoVITS/Docker/
-ARG LITE=false
-ENV LITE=${LITE}
-ARG WORKFLOW=false
-ENV WORKFLOW=${WORKFLOW}
-ARG TARGETPLATFORM
-ENV TARGETPLATFORM=${TARGETPLATFORM}
-RUN bash Docker/miniconda_install.sh
-COPY extra-req.txt /workspace/GPT-SoVITS/
-COPY requirements.txt /workspace/GPT-SoVITS/
-COPY install.sh /workspace/GPT-SoVITS/
-RUN bash Docker/install_wrapper.sh
+# Install 3rd party apps
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
+    git lfs install && \
+    rm -rf /var/lib/apt/lists/*
+# Copy only requirements.txt initially to leverage Docker cache
+WORKDIR /workspace
+COPY requirements.txt /workspace/
+RUN pip install --no-cache-dir -r requirements.txt
+# Define a build-time argument for image type
+ARG IMAGE_TYPE=full
+# Conditional logic based on the IMAGE_TYPE argument
+# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite"
+COPY ./Docker /workspace/Docker
+# elite 类型的镜像里面不包含额外的模型
+RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
+    chmod +x /workspace/Docker/download.sh && \
+    /workspace/Docker/download.sh && \
+    python /workspace/Docker/download.py && \
+    python -m nltk.downloader averaged_perceptron_tagger cmudict; \
+    fi
+# Copy the rest of the application
+COPY . /workspace
 EXPOSE 9871 9872 9873 9874 9880
-ENV PYTHONPATH="/workspace/GPT-SoVITS"
-RUN conda init bash && echo "conda activate base" >> ~/.bashrc
-WORKDIR /workspace
-RUN rm -rf /workspace/GPT-SoVITS
-WORKDIR /workspace/GPT-SoVITS
-COPY . /workspace/GPT-SoVITS
-CMD ["/bin/bash", "-c", "\
-    rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
-    rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
-    rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
-    rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
-    ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
-    ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
-    ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
-    ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
-    exec bash"]
+CMD ["python", "webui.py"]

@@ -29,7 +29,6 @@ class Text2SemanticLightningModule(LightningModule):
                 torch.load(
                     pretrained_s1,
                     map_location="cpu",
-                    weights_only=False,
                 )["weight"],
             )
         )

@@ -356,7 +356,7 @@ class Text2SemanticDecoder(nn.Module):
         x = self.ar_text_embedding(x)
         x = x + self.bert_proj(bert_feature.transpose(1, 2))
         x = self.ar_text_position(x)
-        x_mask = make_pad_mask_left(x_lens)
+        x_mask = make_pad_mask(x_lens)
         y_mask = make_pad_mask(y_lens)
         y_mask_int = y_mask.type(torch.int64)
@@ -420,7 +420,7 @@ class Text2SemanticDecoder(nn.Module):
             mask=xy_attn_mask,
         )
         x_len = x_lens.max()
-        logits = self.ar_predict_layer(xy_dec[:, x_len-1:])
+        logits = self.ar_predict_layer(xy_dec[:, x_len:])
         ###### DPO #############
         reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(
@@ -432,7 +432,7 @@ class Text2SemanticDecoder(nn.Module):
             mask=reject_xy_attn_mask,
         )
         x_len = x_lens.max()
-        reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len-1:])
+        reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len:])
         # loss
         # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum
@@ -455,7 +455,7 @@ class Text2SemanticDecoder(nn.Module):
         x = self.ar_text_embedding(x)
         x = x + self.bert_proj(bert_feature.transpose(1, 2))
         x = self.ar_text_position(x)
-        x_mask = make_pad_mask_left(x_lens)
+        x_mask = make_pad_mask(x_lens)
         y_mask = make_pad_mask(y_lens)
         y_mask_int = y_mask.type(torch.int64)
@@ -502,7 +502,7 @@ class Text2SemanticDecoder(nn.Module):
             (xy_pos, None),
             mask=xy_attn_mask,
         )
-        logits = self.ar_predict_layer(xy_dec[:, x_len-1:]).permute(0, 2, 1)
+        logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1)
         # loss
         # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum
         loss = F.cross_entropy(logits, targets, reduction="sum")
@@ -578,7 +578,7 @@ class Text2SemanticDecoder(nn.Module):
     def pad_y_eos(self, y, y_mask_int, eos_id):
         targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(y_mask_int, (0, 1), value=1)
         # 错位
-        return targets[:, :-1], targets
+        return targets[:, :-1], targets[:, 1:]
     def infer_panel_batch_infer(
         self,
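The hunks above replace make_pad_mask_left with make_pad_mask and move the logits slice from x_len-1 to x_len. For orientation, a small self-contained sketch of what the two padding-mask conventions usually denote (assumed semantics for illustration, not the repository's own implementation):

import torch

def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    # True marks padded positions at the end of each sequence (right padding).
    max_len = max(max_len, int(lengths.max()))
    idx = torch.arange(max_len, device=lengths.device)
    return idx.unsqueeze(0) >= lengths.unsqueeze(1)

def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    # True marks padded positions at the start of each sequence (left padding).
    max_len = max(max_len, int(lengths.max()))
    idx = torch.arange(max_len, device=lengths.device)
    return idx.unsqueeze(0) < (max_len - lengths).unsqueeze(1)

lengths = torch.tensor([2, 4])
print(make_pad_mask(lengths))       # [[False, False, True, True], [False, False, False, False]]
print(make_pad_mask_left(lengths))  # [[True, True, False, False], [False, False, False, False]]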

@@ -354,7 +354,7 @@ class ScaledAdam(BatchedOptimizer):
                 if ans < 1.0:
                     first_state["num_clipped"] += 1
                     if ans < 0.1:
-                        logging.warning(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}")
+                        logging.warn(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}")
                         if self.show_dominant_parameters:
                             assert p.shape[0] == len(param_names)
                             self._show_gradient_dominating_parameter(tuples, tot_sumsq)
@@ -362,7 +362,7 @@ class ScaledAdam(BatchedOptimizer):
     def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor):
         """
-        Show information of parameter which dominating tot_sumsq.
+        Show information of parameter wihch dominanting tot_sumsq.
         Args:
             tuples: a list of tuples of (param, state, param_names)
@@ -415,7 +415,7 @@ class ScaledAdam(BatchedOptimizer):
                 dominant_grad,
             ) = sorted_by_proportion[dominant_param_name]
             logging.info(
-                f"Parameter Dominating tot_sumsq {dominant_param_name}"
+                f"Parameter Dominanting tot_sumsq {dominant_param_name}"
                 f" with proportion {dominant_proportion:.2f},"
                 f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
                 f"={dominant_sumsq:.3e},"

@@ -32,20 +32,9 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 from tools.audio_sr import AP_BWE
 from tools.i18n.i18n import I18nAuto, scan_language_list
+from tools.my_utils import load_audio
 from TTS_infer_pack.text_segmentation_method import splits
 from TTS_infer_pack.TextPreprocessor import TextPreprocessor
-from sv import SV
-resample_transform_dict = {}
-def resample(audio_tensor, sr0, sr1, device):
-    global resample_transform_dict
-    key = "%s-%s-%s" % (sr0, sr1, str(device))
-    if key not in resample_transform_dict:
-        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
-    return resample_transform_dict[key](audio_tensor)
 language = os.environ.get("language", "Auto")
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@@ -114,6 +103,17 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
     return processed_audio
+resample_transform_dict = {}
+def resample(audio_tensor, sr0, sr1, device):
+    global resample_transform_dict
+    key="%s-%s"%(sr0,sr1)
+    if key not in resample_transform_dict:
+        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
+    return resample_transform_dict[key](audio_tensor)
 class DictToAttrRecursive(dict):
     def __init__(self, input_dict):
         super().__init__(input_dict)
@@ -252,24 +252,7 @@ class TTS_Config:
             "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
             "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
         },
-        "v2Pro": {
-            "device": "cpu",
-            "is_half": False,
-            "version": "v2Pro",
-            "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
-            "vits_weights_path": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
-            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
-            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
-        },
-        "v2ProPlus": {
-            "device": "cpu",
-            "is_half": False,
-            "version": "v2ProPlus",
-            "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
-            "vits_weights_path": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
-            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
-            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
-        },
     }
     configs: dict = None
     v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
@@ -304,10 +287,10 @@ class TTS_Config:
         configs: dict = self._load_configs(self.configs_path)
         assert isinstance(configs, dict)
-        configs_ = deepcopy(self.default_configs)
-        configs_.update(configs)
-        self.configs: dict = configs_.get("custom", configs_["v2"])
-        self.default_configs = deepcopy(configs_)
+        version = configs.get("version", "v2").lower()
+        assert version in ["v1", "v2", "v3", "v4"]
+        self.default_configs[version] = configs.get(version, self.default_configs[version])
+        self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))
         self.device = self.configs.get("device", torch.device("cpu"))
         if "cuda" in str(self.device) and not torch.cuda.is_available():
@@ -315,13 +298,11 @@ class TTS_Config:
             self.device = torch.device("cpu")
         self.is_half = self.configs.get("is_half", False)
-        if str(self.device) == "cpu" and self.is_half:
-            print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
-            self.is_half = False
+        # if str(self.device) == "cpu" and self.is_half:
+        #     print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
+        #     self.is_half = False
-        version = self.configs.get("version", None)
         self.version = version
-        assert self.version in ["v1", "v2", "v3", "v4", "v2Pro", "v2ProPlus"], "Invalid version!"
         self.t2s_weights_path = self.configs.get("t2s_weights_path", None)
         self.vits_weights_path = self.configs.get("vits_weights_path", None)
         self.bert_base_path = self.configs.get("bert_base_path", None)
@@ -423,7 +404,6 @@ class TTS:
         self.cnhuhbert_model: CNHubert = None
         self.vocoder = None
         self.sr_model: AP_BWE = None
-        self.sv_model = None
         self.sr_model_not_exist: bool = False
         self.vocoder_configs: dict = {
@@ -452,6 +432,7 @@ class TTS:
             "aux_ref_audio_paths": [],
         }
         self.stop_flag: bool = False
         self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32
@@ -484,17 +465,16 @@ class TTS:
     def init_vits_weights(self, weights_path: str):
         self.configs.vits_weights_path = weights_path
         version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
-        if "Pro" in model_version:
-            self.init_sv_model()
         path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
         if if_lora_v3 == True and os.path.exists(path_sovits) == False:
-            info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
+            info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version)
             raise FileExistsError(info)
         # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
         dict_s2 = load_sovits_new(weights_path)
         hps = dict_s2["config"]
         hps["model"]["semantic_frame_rate"] = "25hz"
         if "enc_p.text_embedding.weight" not in dict_s2["weight"]:
             hps["model"]["version"] = "v2"  # v3model,v2sybomls
@@ -502,15 +482,7 @@ class TTS:
             hps["model"]["version"] = "v1"
         else:
             hps["model"]["version"] = "v2"
-        version = hps["model"]["version"]
+        # version = hps["model"]["version"]
-        v3v4set = {"v3", "v4"}
-        if model_version not in v3v4set:
-            if "Pro" not in model_version:
-                model_version = version
-            else:
-                hps["model"]["version"] = model_version
-        else:
-            hps["model"]["version"] = model_version
         self.configs.filter_length = hps["data"]["filter_length"]
         self.configs.segment_size = hps["train"]["segment_size"]
@@ -526,7 +498,7 @@ class TTS:
         # print(f"model_version:{model_version}")
        # print(f'hps["model"]["version"]:{hps["model"]["version"]}')
-        if model_version not in v3v4set:
+        if model_version not in {"v3", "v4"}:
             vits_model = SynthesizerTrn(
                 self.configs.filter_length // 2 + 1,
                 self.configs.segment_size // self.configs.hop_length,
@@ -535,7 +507,7 @@ class TTS:
             )
             self.configs.use_vocoder = False
         else:
-            kwargs["version"] = model_version
+            kwargs["version"]=model_version
             vits_model = SynthesizerTrnV3(
                 self.configs.filter_length // 2 + 1,
                 self.configs.segment_size // self.configs.hop_length,
@@ -547,8 +519,6 @@ class TTS:
         if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
             del vits_model.enc_q
-        self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
         if if_lora_v3 == False:
             print(
                 f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
@@ -578,16 +548,12 @@ class TTS:
         if self.configs.is_half and str(self.configs.device) != "cpu":
             self.vits_model = self.vits_model.half()
-        self.configs.save_configs()
     def init_t2s_weights(self, weights_path: str):
         print(f"Loading Text2Semantic weights from {weights_path}")
         self.configs.t2s_weights_path = weights_path
         self.configs.save_configs()
         self.configs.hz = 50
-        dict_s1 = torch.load(weights_path, map_location=self.configs.device, weights_only=False)
+        dict_s1 = torch.load(weights_path, map_location=self.configs.device)
         config = dict_s1["config"]
         self.configs.max_sec = config["data"]["max_sec"]
         t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
@@ -636,16 +602,11 @@ class TTS:
                 upsample_rates=[10, 6, 2, 2, 2],
                 upsample_initial_channel=512,
                 upsample_kernel_sizes=[20, 12, 4, 4, 4],
-                gin_channels=0,
-                is_bias=True,
+                gin_channels=0, is_bias=True
             )
             self.vocoder.remove_weight_norm()
-            state_dict_g = torch.load(
-                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
-                map_location="cpu",
-                weights_only=False,
-            )
-            print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
+            state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu")
+            print("loading vocoder",self.vocoder.load_state_dict(state_dict_g))
             self.vocoder_configs["sr"] = 48000
             self.vocoder_configs["T_ref"] = 500
@@ -653,6 +614,9 @@ class TTS:
             self.vocoder_configs["upsample_rate"] = 480
             self.vocoder_configs["overlapped_len"] = 12
         self.vocoder = self.vocoder.eval()
         if self.configs.is_half == True:
             self.vocoder = self.vocoder.half().to(self.configs.device)
@@ -669,11 +633,6 @@ class TTS:
             print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好"))
             self.sr_model_not_exist = True
-    def init_sv_model(self):
-        if self.sv_model is not None:
-            return
-        self.sv_model = SV(self.configs.device, self.configs.is_half)
     def enable_half_precision(self, enable: bool = True, save: bool = True):
         """
         To enable half precision for the TTS model.
@@ -749,11 +708,11 @@ class TTS:
         self.prompt_cache["ref_audio_path"] = ref_audio_path
     def _set_ref_spec(self, ref_audio_path):
-        spec_audio = self._get_ref_spec(ref_audio_path)
+        spec = self._get_ref_spec(ref_audio_path)
         if self.prompt_cache["refer_spec"] in [[], None]:
-            self.prompt_cache["refer_spec"] = [spec_audio]
+            self.prompt_cache["refer_spec"] = [spec]
         else:
-            self.prompt_cache["refer_spec"][0] = spec_audio
+            self.prompt_cache["refer_spec"][0] = spec
     def _get_ref_spec(self, ref_audio_path):
         raw_audio, raw_sr = torchaudio.load(ref_audio_path)
@@ -761,36 +720,25 @@ class TTS:
         self.prompt_cache["raw_audio"] = raw_audio
         self.prompt_cache["raw_sr"] = raw_sr
-        if raw_sr != self.configs.sampling_rate:
-            audio = raw_audio.to(self.configs.device)
-            if audio.shape[0] == 2:
-                audio = audio.mean(0).unsqueeze(0)
-            audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
-        else:
-            audio = raw_audio.to(self.configs.device)
-            if audio.shape[0] == 2:
-                audio = audio.mean(0).unsqueeze(0)
+        audio = load_audio(ref_audio_path, int(self.configs.sampling_rate))
+        audio = torch.FloatTensor(audio)
         maxx = audio.abs().max()
         if maxx > 1:
             audio /= min(2, maxx)
+        audio_norm = audio
+        audio_norm = audio_norm.unsqueeze(0)
         spec = spectrogram_torch(
-            audio,
+            audio_norm,
             self.configs.filter_length,
             self.configs.sampling_rate,
             self.configs.hop_length,
             self.configs.win_length,
             center=False,
         )
+        spec = spec.to(self.configs.device)
         if self.configs.is_half:
             spec = spec.half()
-        if self.is_v2pro == True:
-            audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
-            if self.configs.is_half:
-                audio = audio.half()
-        else:
-            audio = None
-        return spec, audio
+        return spec
     def _set_prompt_semantic(self, ref_wav_path: str):
         zero_wav = np.zeros(
@@ -1087,10 +1035,7 @@ class TTS:
         ###### setting reference audio and prompt text preprocessing ########
         t0 = time.perf_counter()
-        if (ref_audio_path is not None) and (
-            ref_audio_path != self.prompt_cache["ref_audio_path"]
-            or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)
-        ):
+        if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
             if not os.path.exists(ref_audio_path):
                 raise ValueError(f"{ref_audio_path} not exists")
             self.set_ref_audio(ref_audio_path)
@@ -1228,14 +1173,10 @@ class TTS:
                 t4 = time.perf_counter()
                 t_34 += t4 - t3
-                refer_audio_spec = []
-                if self.is_v2pro:
-                    sv_emb = []
-                for spec, audio_tensor in self.prompt_cache["refer_spec"]:
-                    spec = spec.to(dtype=self.precision, device=self.configs.device)
-                    refer_audio_spec.append(spec)
-                    if self.is_v2pro:
-                        sv_emb.append(self.sv_model.compute_embedding3(audio_tensor))
+                refer_audio_spec: torch.Tensor = [
+                    item.to(dtype=self.precision, device=self.configs.device)
+                    for item in self.prompt_cache["refer_spec"]
+                ]
                 batch_audio_fragment = []
@@ -1267,14 +1208,9 @@ class TTS:
                         torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
                     )
                     _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
-                    if self.is_v2pro != True:
-                        _batch_audio_fragment = self.vits_model.decode(
-                            all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
-                        ).detach()[0, 0, :]
-                    else:
-                        _batch_audio_fragment = self.vits_model.decode(
-                            all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
-                        ).detach()[0, 0, :]
+                    _batch_audio_fragment = self.vits_model.decode(
+                        all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
+                    ).detach()[0, 0, :]
                     audio_frag_end_idx.insert(0, 0)
                     batch_audio_fragment = [
                         _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
@@ -1287,14 +1223,9 @@ class TTS:
                         _pred_semantic = (
                             pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                         )  # .unsqueeze(0)#mq要多unsqueeze一次
-                        if self.is_v2pro != True:
-                            audio_fragment = self.vits_model.decode(
-                                _pred_semantic, phones, refer_audio_spec, speed=speed_factor
-                            ).detach()[0, 0, :]
-                        else:
-                            audio_fragment = self.vits_model.decode(
-                                _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
-                            ).detach()[0, 0, :]
+                        audio_fragment = self.vits_model.decode(
+                            _pred_semantic, phones, refer_audio_spec, speed=speed_factor
+                        ).detach()[0, 0, :]
                         batch_audio_fragment.append(audio_fragment)  ###试试重建不带上prompt部分
                 else:
                     if parallel_infer:
@@ -1433,10 +1364,7 @@ class TTS:
     ):
         prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
         prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
-        raw_entry = self.prompt_cache["refer_spec"][0]
-        if isinstance(raw_entry, tuple):
-            raw_entry = raw_entry[0]
-        refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
+        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
         fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
         ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
@@ -1503,10 +1431,7 @@ class TTS:
     ) -> List[torch.Tensor]:
         prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
         prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
-        raw_entry = self.prompt_cache["refer_spec"][0]
-        if isinstance(raw_entry, tuple):
-            raw_entry = raw_entry[0]
-        refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
+        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
         fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
         ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]


@ -121,31 +121,33 @@ class TextPreprocessor:
     def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
         with self.bert_lock:
-            text = re.sub(r' {2,}', ' ', text)
+            if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
+                # language = language.replace("all_","")
+                formattext = text
+                while "  " in formattext:
+                    formattext = formattext.replace("  ", " ")
+                if language == "all_zh":
+                    if re.search(r"[A-Za-z]", formattext):
+                        formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
+                        formattext = chinese.mix_text_normalize(formattext)
+                        return self.get_phones_and_bert(formattext, "zh", version)
+                    else:
+                        phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+                        bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
+                elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
+                    formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
+                    formattext = chinese.mix_text_normalize(formattext)
+                    return self.get_phones_and_bert(formattext, "yue", version)
+                else:
+                    phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+                    bert = torch.zeros(
+                        (1024, len(phones)),
+                        dtype=torch.float32,
+                    ).to(self.device)
+            elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
                 textlist = []
                 langlist = []
-            if language == "all_zh":
-                for tmp in LangSegmenter.getTexts(text,"zh"):
-                    langlist.append(tmp["lang"])
-                    textlist.append(tmp["text"])
-            elif language == "all_yue":
-                for tmp in LangSegmenter.getTexts(text,"zh"):
-                    if tmp["lang"] == "zh":
-                        tmp["lang"] = "yue"
-                    langlist.append(tmp["lang"])
-                    textlist.append(tmp["text"])
-            elif language == "all_ja":
-                for tmp in LangSegmenter.getTexts(text,"ja"):
-                    langlist.append(tmp["lang"])
-                    textlist.append(tmp["text"])
-            elif language == "all_ko":
-                for tmp in LangSegmenter.getTexts(text,"ko"):
-                    langlist.append(tmp["lang"])
-                    textlist.append(tmp["text"])
-            elif language == "en":
-                langlist.append("en")
-                textlist.append(text)
-            elif language == "auto":
+                if language == "auto":
                     for tmp in LangSegmenter.getTexts(text):
                         langlist.append(tmp["lang"])
                         textlist.append(tmp["text"])
@ -157,10 +159,6 @@ class TextPreprocessor:
                         textlist.append(tmp["text"])
                 else:
                     for tmp in LangSegmenter.getTexts(text):
-                        if langlist:
-                            if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
-                                textlist[-1] += tmp["text"]
-                                continue
                         if tmp["lang"] == "en":
                             langlist.append(tmp["lang"])
                         else:
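For context, the four lines removed by the second hunk merge consecutive LangSegmenter segments whenever both are English or both are non-English. A self-contained illustration of that rule; the segment dicts below are made up, but follow the {"lang": ..., "text": ...} shape the loops above expect:

segments = [
    {"lang": "zh", "text": "你好"},
    {"lang": "ja", "text": "こんにちは"},  # non-English after non-English -> merged into the previous entry
    {"lang": "en", "text": "hello "},
    {"lang": "en", "text": "world"},       # English after English -> merged
]
textlist, langlist = [], []
for tmp in segments:
    if langlist:
        if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
            textlist[-1] += tmp["text"]
            continue
    langlist.append(tmp["lang"])
    textlist.append(tmp["text"])
print(langlist, textlist)  # ['zh', 'en'] ['你好こんにちは', 'hello world']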


@ -1,91 +0,0 @@
{
"train": {
"log_interval": 100,
"eval_interval": 500,
"seed": 1234,
"epochs": 100,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 32,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 20480,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"text_low_lr_rate": 0.4,
"grad_ckpt": false
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 2048,
"hop_length": 640,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 300,
"cleaned_text": true
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
8,
2,
2
],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 1024,
"semantic_frame_rate": "25hz",
"freeze_quantizer": true
},
"s2_ckpt_dir": "logs/s2/big2k1",
"content_module": "cnhubert"
}
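The data section above pins the spectrogram front end to a 32 kHz sample rate with a 640-sample hop, i.e. 50 spectrogram frames per second against the 25 Hz semantic tokens declared in the model section. A small self-contained check of that arithmetic, using only the values shown above:

import json

# Values copied from the "data" and "model" sections of the config above.
cfg = json.loads('{"data": {"sampling_rate": 32000, "hop_length": 640}, "model": {"semantic_frame_rate": "25hz"}}')

spec_fps = cfg["data"]["sampling_rate"] / cfg["data"]["hop_length"]      # 50.0 spectrogram frames per second
semantic_fps = float(cfg["model"]["semantic_frame_rate"].rstrip("hz"))   # 25.0 semantic tokens per second
print(spec_fps, spec_fps / semantic_fps)                                 # 50.0 2.0 -> two spec frames per token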


@ -1,91 +0,0 @@
{
"train": {
"log_interval": 100,
"eval_interval": 500,
"seed": 1234,
"epochs": 100,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 32,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 20480,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"text_low_lr_rate": 0.4,
"grad_ckpt": false
},
"data": {
"max_wav_value": 32768.0,
"sampling_rate": 32000,
"filter_length": 2048,
"hop_length": 640,
"win_length": 2048,
"n_mel_channels": 128,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 300,
"cleaned_text": true
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.0,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2,
2
],
"upsample_initial_channel": 768,
"upsample_kernel_sizes": [
20,
16,
8,
2,
2
],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 1024,
"semantic_frame_rate": "25hz",
"freeze_quantizer": true
},
"s2_ckpt_dir": "logs/s2/big2k1",
"content_module": "cnhubert"
}


@ -22,22 +22,6 @@ v2:
   t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
   version: v2
   vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
-v2Pro:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
-  device: cpu
-  is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
-  version: v2Pro
-  vits_weights_path: GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth
-v2ProPlus:
-  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
-  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
-  device: cpu
-  is_half: false
-  t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
-  version: v2ProPlus
-  vits_weights_path: GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth
 v3:
   bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
   cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base


@ -1,264 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
"""
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import pooling_layers as pooling_layers
from fusion import AFF
class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
inplace_str = "inplace" if self.inplace else ""
return self.__class__.__name__ + " (" + inplace_str + ")"
class BasicBlockERes2Net(nn.Module):
expansion = 2
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
super(BasicBlockERes2Net, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
convs = []
bns = []
for i in range(self.nums):
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes),
)
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class BasicBlockERes2Net_diff_AFF(nn.Module):
expansion = 2
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
super(BasicBlockERes2Net_diff_AFF, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
convs = []
fuse_models = []
bns = []
for i in range(self.nums):
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes),
)
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = self.fuse_models[i - 1](sp, spx[i])
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class ERes2Net(nn.Module):
def __init__(
self,
block=BasicBlockERes2Net,
block_fuse=BasicBlockERes2Net_diff_AFF,
num_blocks=[3, 4, 6, 3],
m_channels=32,
feat_dim=80,
embedding_size=192,
pooling_func="TSTP",
two_emb_layer=False,
):
super(ERes2Net, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
self.embedding_size = embedding_size
self.stats_dim = int(feat_dim / 8) * m_channels * 8
self.two_emb_layer = two_emb_layer
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
# Downsampling module for each layer
self.layer1_downsample = nn.Conv2d(
m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False
)
self.layer2_downsample = nn.Conv2d(
m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
)
self.layer3_downsample = nn.Conv2d(
m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
)
# Bottom-up fusion module
self.fuse_mode12 = AFF(channels=m_channels * 4)
self.fuse_mode123 = AFF(channels=m_channels * 8)
self.fuse_mode1234 = AFF(channels=m_channels * 16)
self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
self.seg_2 = nn.Linear(embedding_size, embedding_size)
else:
self.seg_bn_1 = nn.Identity()
self.seg_2 = nn.Identity()
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
stats = self.pool(fuse_out1234)
embed_a = self.seg_1(stats)
if self.two_emb_layer:
out = F.relu(embed_a)
out = self.seg_bn_1(out)
embed_b = self.seg_2(out)
return embed_b
else:
return embed_a
def forward3(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
return fuse_out1234
if __name__ == "__main__":
x = torch.zeros(10, 300, 80)
model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func="TSTP")
model.eval()
out = model(x)
print(out.shape) # torch.Size([10, 192])
num_params = sum(param.numel() for param in model.parameters())
print("{} M".format(num_params / 1e6)) # 6.61M


@ -1,272 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
within each stage. However, this modification also increases the number of model parameters and computational complexity.
To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
both the model parameters and its computational cost.
"""
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import pooling_layers as pooling_layers
from fusion import AFF
class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
inplace_str = "inplace" if self.inplace else ""
return self.__class__.__name__ + " (" + inplace_str + ")"
class BasicBlockERes2NetV2(nn.Module):
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
super(BasicBlockERes2NetV2, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
self.expansion = expansion
convs = []
bns = []
for i in range(self.nums):
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes),
)
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class BasicBlockERes2NetV2AFF(nn.Module):
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
super(BasicBlockERes2NetV2AFF, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
self.expansion = expansion
convs = []
fuse_models = []
bns = []
for i in range(self.nums):
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width, r=4))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes),
)
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = self.fuse_models[i - 1](sp, spx[i])
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class ERes2NetV2(nn.Module):
def __init__(
self,
block=BasicBlockERes2NetV2,
block_fuse=BasicBlockERes2NetV2AFF,
num_blocks=[3, 4, 6, 3],
m_channels=64,
feat_dim=80,
embedding_size=192,
baseWidth=26,
scale=2,
expansion=2,
pooling_func="TSTP",
two_emb_layer=False,
):
super(ERes2NetV2, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
self.embedding_size = embedding_size
self.stats_dim = int(feat_dim / 8) * m_channels * 8
self.two_emb_layer = two_emb_layer
self.baseWidth = baseWidth
self.scale = scale
self.expansion = expansion
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
# Downsampling module
self.layer3_ds = nn.Conv2d(
m_channels * 4 * self.expansion,
m_channels * 8 * self.expansion,
kernel_size=3,
padding=1,
stride=2,
bias=False,
)
# Bottom-up fusion module
self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * self.expansion)
self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embedding_size)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
self.seg_2 = nn.Linear(embedding_size, embedding_size)
else:
self.seg_bn_1 = nn.Identity()
self.seg_2 = nn.Identity()
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(
block(
self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion
)
)
self.in_planes = planes * self.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out3 = self.layer3(out2)
out4 = self.layer4(out3)
out3_ds = self.layer3_ds(out3)
fuse_out34 = self.fuse34(out4, out3_ds)
stats = self.pool(fuse_out34)
embed_a = self.seg_1(stats)
if self.two_emb_layer:
out = F.relu(embed_a)
out = self.seg_bn_1(out)
embed_b = self.seg_2(out)
return embed_b
else:
return embed_a
def forward3(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out3 = self.layer3(out2)
out4 = self.layer4(out3)
out3_ds = self.layer3_ds(out3)
fuse_out34 = self.fuse34(out4, out3_ds)
# print(111111111,fuse_out34.shape)#111111111 torch.Size([16, 2048, 10, 72])
return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)
# stats = self.pool(fuse_out34)
#
# embed_a = self.seg_1(stats)
# if self.two_emb_layer:
# out = F.relu(embed_a)
# out = self.seg_bn_1(out)
# embed_b = self.seg_2(out)
# return embed_b
# else:
# return embed_a
if __name__ == "__main__":
x = torch.randn(1, 300, 80)
model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
model.eval()
y = model(x)
print(y.size())
    from thop import profile  # not imported at the top of this file; thop's profile matches the (macs, params) unpacking below
    macs, num_params = profile(model, inputs=(x,))
print("Params: {} M".format(num_params / 1e6)) # 17.86 M
print("MACs: {} G".format(macs / 1e9)) # 12.69 G


@ -1,289 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
"""
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import pooling_layers as pooling_layers
from fusion import AFF
class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
inplace_str = "inplace" if self.inplace else ""
return self.__class__.__name__ + " (" + inplace_str + ")"
class BasicBlockERes2Net(nn.Module):
expansion = 4
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
super(BasicBlockERes2Net, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
convs = []
bns = []
for i in range(self.nums):
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes),
)
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class BasicBlockERes2Net_diff_AFF(nn.Module):
expansion = 4
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
super(BasicBlockERes2Net_diff_AFF, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
convs = []
fuse_models = []
bns = []
for i in range(self.nums):
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes),
)
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = self.fuse_models[i - 1](sp, spx[i])
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class ERes2Net(nn.Module):
def __init__(
self,
block=BasicBlockERes2Net,
block_fuse=BasicBlockERes2Net_diff_AFF,
num_blocks=[3, 4, 6, 3],
m_channels=64,
feat_dim=80,
embedding_size=192,
pooling_func="TSTP",
two_emb_layer=False,
):
super(ERes2Net, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
self.embedding_size = embedding_size
self.stats_dim = int(feat_dim / 8) * m_channels * 8
self.two_emb_layer = two_emb_layer
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
self.layer1_downsample = nn.Conv2d(
m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
)
self.layer2_downsample = nn.Conv2d(
m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
)
self.layer3_downsample = nn.Conv2d(
m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False
)
self.fuse_mode12 = AFF(channels=m_channels * 8)
self.fuse_mode123 = AFF(channels=m_channels * 16)
self.fuse_mode1234 = AFF(channels=m_channels * 32)
self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
self.seg_2 = nn.Linear(embedding_size, embedding_size)
else:
self.seg_bn_1 = nn.Identity()
self.seg_2 = nn.Identity()
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
stats = self.pool(fuse_out1234)
embed_a = self.seg_1(stats)
if self.two_emb_layer:
out = F.relu(embed_a)
out = self.seg_bn_1(out)
embed_b = self.seg_2(out)
return embed_b
else:
return embed_a
def forward2(self, x, if_mean):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2) # bs,20480,T
if if_mean == False:
mean = fuse_out1234[0].transpose(1, 0) # (T,20480),bs=T
else:
mean = fuse_out1234.mean(2) # bs,20480
mean_std = torch.cat([mean, torch.zeros_like(mean)], 1)
return self.seg_1(mean_std) # (T,192)
# stats = self.pool(fuse_out1234)
# if self.two_emb_layer:
# out = F.relu(embed_a)
# out = self.seg_bn_1(out)
# embed_b = self.seg_2(out)
# return embed_b
# else:
# return embed_a
def forward3(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
return fuse_out1234
# print(fuse_out1234.shape)
# print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape)
# pdb.set_trace()


@ -1,27 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import torch
import torch.nn as nn
class AFF(nn.Module):
def __init__(self, channels=64, r=4):
super(AFF, self).__init__()
inter_channels = int(channels // r)
self.local_att = nn.Sequential(
nn.Conv2d(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0),
nn.BatchNorm2d(inter_channels),
nn.SiLU(inplace=True),
nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
nn.BatchNorm2d(channels),
)
def forward(self, x, ds_y):
xa = torch.cat((x, ds_y), dim=1)
x_att = self.local_att(xa)
x_att = 1.0 + torch.tanh(x_att)
xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
return xo
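A minimal usage sketch for the AFF block defined above. The tensor shapes are illustrative only (batch, channels, height, width), chosen so that both inputs share the same shape as the forward pass requires:

import torch

aff = AFF(channels=64, r=4)
x = torch.randn(2, 64, 10, 25)     # features from one branch, (B, C, H, W)
ds_y = torch.randn(2, 64, 10, 25)  # downsampled features from the other branch, same shape
out = aff(x, ds_y)                 # attention map in (0, 2) gates x, and (2 - attention) gates ds_y
print(out.shape)                   # torch.Size([2, 64, 10, 25])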


@ -1,844 +0,0 @@
import math
from typing import Tuple
import torch
import torchaudio
from torch import Tensor
__all__ = [
"get_mel_banks",
"inverse_mel_scale",
"inverse_mel_scale_scalar",
"mel_scale",
"mel_scale_scalar",
"spectrogram",
"fbank",
"mfcc",
"vtln_warp_freq",
"vtln_warp_mel_freq",
]
# numeric_limits<float>::epsilon() 1.1920928955078125e-07
EPSILON = torch.tensor(torch.finfo(torch.float).eps)
# 1 milliseconds = 0.001 seconds
MILLISECONDS_TO_SECONDS = 0.001
# window types
HAMMING = "hamming"
HANNING = "hanning"
POVEY = "povey"
RECTANGULAR = "rectangular"
BLACKMAN = "blackman"
WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN]
def _get_epsilon(device, dtype):
return EPSILON.to(device=device, dtype=dtype)
def _next_power_of_2(x: int) -> int:
r"""Returns the smallest power of 2 that is greater than x"""
return 1 if x == 0 else 2 ** (x - 1).bit_length()
def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor:
r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``)
representing how the window is shifted along the waveform. Each row is a frame.
Args:
waveform (Tensor): Tensor of size ``num_samples``
window_size (int): Frame length
window_shift (int): Frame shift
snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
in the file, and the number of frames depends on the frame_length. If False, the number of frames
depends only on the frame_shift, and we reflect the data at the ends.
Returns:
Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame
"""
assert waveform.dim() == 1
num_samples = waveform.size(0)
strides = (window_shift * waveform.stride(0), waveform.stride(0))
if snip_edges:
if num_samples < window_size:
return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device)
else:
m = 1 + (num_samples - window_size) // window_shift
else:
reversed_waveform = torch.flip(waveform, [0])
m = (num_samples + (window_shift // 2)) // window_shift
pad = window_size // 2 - window_shift // 2
pad_right = reversed_waveform
if pad > 0:
# torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect'
# but we want [2, 1, 0, 0, 1, 2]
pad_left = reversed_waveform[-pad:]
waveform = torch.cat((pad_left, waveform, pad_right), dim=0)
else:
# pad is negative so we want to trim the waveform at the front
waveform = torch.cat((waveform[-pad:], pad_right), dim=0)
sizes = (m, window_size)
return waveform.as_strided(sizes, strides)
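A quick check of the snip_edges framing rule described in the docstring: with a 10-sample signal, a window of 4 and a shift of 2, only frames that fit entirely inside the signal are produced, so m = 1 + (10 - 4) // 2 = 4:

import torch

wave = torch.arange(10.0)
frames = _get_strided(wave, window_size=4, window_shift=2, snip_edges=True)
print(frames.shape)  # torch.Size([4, 4])
print(frames[1])     # tensor([2., 3., 4., 5.]) -- the second frame starts window_shift samples in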
def _feature_window_function(
window_type: str,
window_size: int,
blackman_coeff: float,
device: torch.device,
dtype: int,
) -> Tensor:
r"""Returns a window function with the given type and size"""
if window_type == HANNING:
return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
elif window_type == HAMMING:
return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
elif window_type == POVEY:
# like hanning but goes to zero at edges
return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
elif window_type == RECTANGULAR:
return torch.ones(window_size, device=device, dtype=dtype)
elif window_type == BLACKMAN:
a = 2 * math.pi / (window_size - 1)
window_function = torch.arange(window_size, device=device, dtype=dtype)
# can't use torch.blackman_window as they use different coefficients
return (
blackman_coeff
- 0.5 * torch.cos(a * window_function)
+ (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)
).to(device=device, dtype=dtype)
else:
raise Exception("Invalid window type " + window_type)
def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor:
r"""Returns the log energy of size (m) for a strided_input (m,*)"""
device, dtype = strided_input.device, strided_input.dtype
log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m)
if energy_floor == 0.0:
return log_energy
return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype))
def _get_waveform_and_window_properties(
waveform: Tensor,
channel: int,
sample_frequency: float,
frame_shift: float,
frame_length: float,
round_to_power_of_two: bool,
preemphasis_coefficient: float,
) -> Tuple[Tensor, int, int, int]:
r"""Gets the waveform and window properties"""
channel = max(channel, 0)
assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0))
waveform = waveform[channel, :] # size (n)
window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
window_size, len(waveform)
)
assert 0 < window_shift, "`window_shift` must be greater than 0"
assert padded_window_size % 2 == 0, (
"the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`"
)
assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
return waveform, window_shift, window_size, padded_window_size
def _get_window(
waveform: Tensor,
padded_window_size: int,
window_size: int,
window_shift: int,
window_type: str,
blackman_coeff: float,
snip_edges: bool,
raw_energy: bool,
energy_floor: float,
dither: float,
remove_dc_offset: bool,
preemphasis_coefficient: float,
) -> Tuple[Tensor, Tensor]:
r"""Gets a window and its log energy
Returns:
(Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m)
"""
device, dtype = waveform.device, waveform.dtype
epsilon = _get_epsilon(device, dtype)
# size (m, window_size)
strided_input = _get_strided(waveform, window_size, window_shift, snip_edges)
if dither != 0.0:
rand_gauss = torch.randn(strided_input.shape, device=device, dtype=dtype)
strided_input = strided_input + rand_gauss * dither
if remove_dc_offset:
# Subtract each row/frame by its mean
row_means = torch.mean(strided_input, dim=1).unsqueeze(1) # size (m, 1)
strided_input = strided_input - row_means
if raw_energy:
# Compute the log energy of each row/frame before applying preemphasis and
# window function
signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
if preemphasis_coefficient != 0.0:
# strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j
offset_strided_input = torch.nn.functional.pad(strided_input.unsqueeze(0), (1, 0), mode="replicate").squeeze(
0
) # size (m, window_size + 1)
strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]
# Apply window_function to each row/frame
window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze(
0
) # size (1, window_size)
strided_input = strided_input * window_function # size (m, window_size)
# Pad columns with zero until we reach size (m, padded_window_size)
if padded_window_size != window_size:
padding_right = padded_window_size - window_size
strided_input = torch.nn.functional.pad(
strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0
).squeeze(0)
# Compute energy after window function (not the raw one)
if not raw_energy:
signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
return strided_input, signal_log_energy
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
# subtracts the column mean of the tensor size (m, n) if subtract_mean=True
# it returns size (m, n)
if subtract_mean:
col_means = torch.mean(tensor, dim=0).unsqueeze(0)
tensor = tensor - col_means
return tensor
def spectrogram(
waveform: Tensor,
blackman_coeff: float = 0.42,
channel: int = -1,
dither: float = 0.0,
energy_floor: float = 1.0,
frame_length: float = 25.0,
frame_shift: float = 10.0,
min_duration: float = 0.0,
preemphasis_coefficient: float = 0.97,
raw_energy: bool = True,
remove_dc_offset: bool = True,
round_to_power_of_two: bool = True,
sample_frequency: float = 16000.0,
snip_edges: bool = True,
subtract_mean: bool = False,
window_type: str = POVEY,
) -> Tensor:
r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
compute-spectrogram-feats.
Args:
waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
this floor is applied to the zeroth component, representing the total signal energy. The floor on the
individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. (Default: ``True``)
sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
specified there) (Default: ``16000.0``)
snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
in the file, and the number of frames depends on the frame_length. If False, the number of frames
depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
it this way. (Default: ``False``)
window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
(Default: ``'povey'``)
Returns:
Tensor: A spectrogram identical to what Kaldi would output. The shape is
(m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided
"""
device, dtype = waveform.device, waveform.dtype
epsilon = _get_epsilon(device, dtype)
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
)
if len(waveform) < min_duration * sample_frequency:
# signal is too short
return torch.empty(0)
strided_input, signal_log_energy = _get_window(
waveform,
padded_window_size,
window_size,
window_shift,
window_type,
blackman_coeff,
snip_edges,
raw_energy,
energy_floor,
dither,
remove_dc_offset,
preemphasis_coefficient,
)
# size (m, padded_window_size // 2 + 1, 2)
fft = torch.fft.rfft(strided_input)
# Convert the FFT into a power spectrum
power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log() # size (m, padded_window_size // 2 + 1)
power_spectrum[:, 0] = signal_log_energy
power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
return power_spectrum
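A minimal usage sketch for spectrogram above, with one second of random mono audio at the default 16 kHz. With 25 ms windows, a 10 ms shift, and snip_edges=True this gives 1 + (16000 - 400) // 160 = 98 frames, and the 400-sample window is zero-padded to 512, hence 257 frequency bins:

import torch

wav = torch.randn(1, 16000)                        # (channels, samples), mono
spec = spectrogram(wav, sample_frequency=16000.0)
print(spec.shape)                                  # torch.Size([98, 257])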
def inverse_mel_scale_scalar(mel_freq: float) -> float:
return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
def inverse_mel_scale(mel_freq: Tensor) -> Tensor:
return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
def mel_scale_scalar(freq: float) -> float:
return 1127.0 * math.log(1.0 + freq / 700.0)
def mel_scale(freq: Tensor) -> Tensor:
return 1127.0 * (1.0 + freq / 700.0).log()
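The two scalar helpers above are exact inverses of each other; a quick round trip through the Kaldi-style mel scale mel(f) = 1127 * ln(1 + f / 700):

import math

f = 700.0
m = 1127.0 * math.log(1.0 + f / 700.0)         # ~781.18 mel
f_back = 700.0 * (math.exp(m / 1127.0) - 1.0)  # ~700.0 Hz again
print(round(m, 2), round(f_back, 2))           # 781.18 700.0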
def vtln_warp_freq(
vtln_low_cutoff: float,
vtln_high_cutoff: float,
low_freq: float,
high_freq: float,
vtln_warp_factor: float,
freq: Tensor,
) -> Tensor:
r"""This computes a VTLN warping function that is not the same as HTK's one,
but has similar inputs (this function has the advantage of never producing
empty bins).
This function computes a warp function F(freq), defined between low_freq
and high_freq inclusive, with the following properties:
F(low_freq) == low_freq
F(high_freq) == high_freq
The function is continuous and piecewise linear with two inflection
points.
The lower inflection point (measured in terms of the unwarped
frequency) is at frequency l, determined as described below.
The higher inflection point is at a frequency h, determined as
described below.
If l <= f <= h, then F(f) = f/vtln_warp_factor.
If the higher inflection point (measured in terms of the unwarped
frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
Since (by the last point) F(h) == h/vtln_warp_factor, then
max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
= vtln_high_cutoff * min(1, vtln_warp_factor).
If the lower inflection point (measured in terms of the unwarped
frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
= vtln_low_cutoff * max(1, vtln_warp_factor)
Args:
vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
low_freq (float): Lower frequency cutoffs in mel computation
high_freq (float): Upper frequency cutoffs in mel computation
vtln_warp_factor (float): Vtln warp factor
freq (Tensor): given frequency in Hz
Returns:
Tensor: Freq after vtln warp
"""
assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq"
assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]"
l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
scale = 1.0 / vtln_warp_factor
Fl = scale * l # F(l)
Fh = scale * h # F(h)
assert l > low_freq and h < high_freq
# slope of left part of the 3-piece linear function
scale_left = (Fl - low_freq) / (l - low_freq)
# [slope of center part is just "scale"]
# slope of right part of the 3-piece linear function
scale_right = (high_freq - Fh) / (high_freq - h)
res = torch.empty_like(freq)
outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq
before_l = torch.lt(freq, l) # freq < l
before_h = torch.lt(freq, h) # freq < h
after_h = torch.ge(freq, h) # freq >= h
# order of operations matter here (since there is overlapping frequency regions)
res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
res[before_h] = scale * freq[before_h]
res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
res[outside_low_high_freq] = freq[outside_low_high_freq]
return res
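As a sanity check on the piecewise-linear warp described in the docstring, a warp factor of 1.0 makes both outer slopes equal to 1, so the function reduces to the identity on [low_freq, high_freq]. The cutoff values below are arbitrary but satisfy the asserts:

import torch

freq = torch.tensor([50.0, 300.0, 4000.0, 7500.0])
warped = vtln_warp_freq(
    vtln_low_cutoff=100.0, vtln_high_cutoff=7000.0,
    low_freq=20.0, high_freq=7800.0,
    vtln_warp_factor=1.0, freq=freq,
)
print(torch.allclose(warped, freq))  # True: warp factor 1.0 leaves frequencies unchanged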
def vtln_warp_mel_freq(
vtln_low_cutoff: float,
vtln_high_cutoff: float,
low_freq,
high_freq: float,
vtln_warp_factor: float,
mel_freq: Tensor,
) -> Tensor:
r"""
Args:
vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
low_freq (float): Lower frequency cutoffs in mel computation
high_freq (float): Upper frequency cutoffs in mel computation
vtln_warp_factor (float): Vtln warp factor
mel_freq (Tensor): Given frequency in Mel
Returns:
Tensor: ``mel_freq`` after vtln warp
"""
return mel_scale(
vtln_warp_freq(
vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq)
)
)
def get_mel_banks(
num_bins: int,
window_length_padded: int,
sample_freq: float,
low_freq: float,
high_freq: float,
vtln_low: float,
vtln_high: float,
vtln_warp_factor: float,
device=None,
dtype=None,
) -> Tuple[Tensor, Tensor]:
"""
Returns:
(Tensor, Tensor): The tuple consists of ``bins`` (which is
melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is
center frequencies of bins of size (``num_bins``)).
"""
assert num_bins > 3, "Must have at least 3 mel bins"
assert window_length_padded % 2 == 0
num_fft_bins = window_length_padded / 2
nyquist = 0.5 * sample_freq
if high_freq <= 0.0:
high_freq += nyquist
assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), (
"Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
)
# fft-bin width [think of it as Nyquist-freq / half-window-length]
fft_bin_width = sample_freq / window_length_padded
mel_low_freq = mel_scale_scalar(low_freq)
mel_high_freq = mel_scale_scalar(high_freq)
# divide by num_bins+1 in next line because of end-effects where the bins
# spread out to the sides.
mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
if vtln_high < 0.0:
vtln_high += nyquist
assert vtln_warp_factor == 1.0 or (
(low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format(
vtln_low, vtln_high, low_freq, high_freq
)
bin = torch.arange(num_bins).unsqueeze(1)
left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1)
center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # size(num_bins, 1)
right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1)
if vtln_warp_factor != 1.0:
left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel)
center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel)
right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel)
# center_freqs = inverse_mel_scale(center_mel) # size (num_bins)
# size(1, num_fft_bins)
mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0)
# size (num_bins, num_fft_bins)
up_slope = (mel - left_mel) / (center_mel - left_mel)
down_slope = (right_mel - mel) / (right_mel - center_mel)
if vtln_warp_factor == 1.0:
# left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
else:
# warping can move the order of left_mel, center_mel, right_mel anywhere
bins = torch.zeros_like(up_slope)
up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel) # left_mel < mel <= center_mel
down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel) # center_mel < mel < right_mel
bins[up_idx] = up_slope[up_idx]
bins[down_idx] = down_slope[down_idx]
return bins.to(device=device, dtype=dtype) # , center_freqs
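A small shape check for get_mel_banks above: a 512-sample padded window has 256 FFT bins, and high_freq=0.0 is interpreted as an offset from the 8 kHz Nyquist frequency. The vtln arguments below are the defaults that fbank further down passes through:

import torch

bins = get_mel_banks(
    num_bins=23, window_length_padded=512, sample_freq=16000.0,
    low_freq=20.0, high_freq=0.0,   # <= 0.0 means "Nyquist + high_freq", i.e. 8000 Hz here
    vtln_low=100.0, vtln_high=-500.0, vtln_warp_factor=1.0,
)
print(bins.shape)                   # torch.Size([23, 256]), one triangular filter per mel bin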
cache = {}
def fbank(
waveform: Tensor,
blackman_coeff: float = 0.42,
channel: int = -1,
dither: float = 0.0,
energy_floor: float = 1.0,
frame_length: float = 25.0,
frame_shift: float = 10.0,
high_freq: float = 0.0,
htk_compat: bool = False,
low_freq: float = 20.0,
min_duration: float = 0.0,
num_mel_bins: int = 23,
preemphasis_coefficient: float = 0.97,
raw_energy: bool = True,
remove_dc_offset: bool = True,
round_to_power_of_two: bool = True,
sample_frequency: float = 16000.0,
snip_edges: bool = True,
subtract_mean: bool = False,
use_energy: bool = False,
use_log_fbank: bool = True,
use_power: bool = True,
vtln_high: float = -500.0,
vtln_low: float = 100.0,
vtln_warp: float = 1.0,
window_type: str = POVEY,
) -> Tensor:
r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
compute-fbank-feats.
Args:
waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
this floor is applied to the zeroth component, representing the total signal energy. The floor on the
individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
(Default: ``0.0``)
htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features
(need to change other parameters). (Default: ``False``)
low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. (Default: ``True``)
sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
specified there) (Default: ``16000.0``)
snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
in the file, and the number of frames depends on the frame_length. If False, the number of frames
depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
it this way. (Default: ``False``)
use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
use_log_fbank (bool, optional):If true, produce log-filterbank, else produce linear. (Default: ``True``)
use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``)
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
negative, offset from high-mel-freq) (Default: ``-500.0``)
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
(Default: ``'povey'``)
Returns:
Tensor: A fbank identical to what Kaldi would output. The shape is (m, ``num_mel_bins + use_energy``)
where m is calculated in _get_strided
"""
device, dtype = waveform.device, waveform.dtype
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
)
if len(waveform) < min_duration * sample_frequency:
# signal is too short
return torch.empty(0, device=device, dtype=dtype)
# strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
strided_input, signal_log_energy = _get_window(
waveform,
padded_window_size,
window_size,
window_shift,
window_type,
blackman_coeff,
snip_edges,
raw_energy,
energy_floor,
dither,
remove_dc_offset,
preemphasis_coefficient,
)
# size (m, padded_window_size // 2 + 1)
spectrum = torch.fft.rfft(strided_input).abs()
if use_power:
spectrum = spectrum.pow(2.0)
# size (num_mel_bins, padded_window_size // 2)
# print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
num_mel_bins,
padded_window_size,
sample_frequency,
low_freq,
high_freq,
vtln_low,
vtln_high,
vtln_warp,
device,
dtype,
)
if cache_key not in cache:
mel_energies = get_mel_banks(
num_mel_bins,
padded_window_size,
sample_frequency,
low_freq,
high_freq,
vtln_low,
vtln_high,
vtln_warp,
device,
dtype,
)
cache[cache_key] = mel_energies
else:
mel_energies = cache[cache_key]
# pad right column with zeros, size (num_mel_bins, padded_window_size // 2 + 1)
mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)
# apply the mel filterbanks to the power spectrum, size (m, num_mel_bins)
mel_energies = torch.mm(spectrum, mel_energies.T)
if use_log_fbank:
# avoid log of zero (which should be prevented anyway by dithering)
mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log()
# if use_energy then add it as the last column for htk_compat == true else first column
if use_energy:
signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1)
# returns size (m, num_mel_bins + 1)
if htk_compat:
mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1)
else:
mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)
mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
return mel_energies
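# Illustrative usage sketch (not part of the original file): compute 80-dimensional
# log-mel features from a mono 16 kHz waveform, mirroring how ExportERes2NetV2 calls
# this function later in this diff. "ref_16k.wav" is a hypothetical file name.
#
#   import torchaudio
#   waveform, sr = torchaudio.load("ref_16k.wav")  # shape (1, n), sr == 16000
#   feats = fbank(waveform, num_mel_bins=80, sample_frequency=16000, dither=0.0)
#   print(feats.shape)  # (num_frames, 80)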
def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor:
# returns a dct matrix of size (num_mel_bins, num_ceps)
# size (num_mel_bins, num_mel_bins)
dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho")
# kaldi expects the first cepstral coefficient to be a weighted sum with factor sqrt(1/num_mel_bins);
# this is the first column of the dct_matrix for torchaudio, which right-multiplies
# (feature @ dct_matrix), and corresponds to the first row of kaldi's dct_matrix, since
# kaldi left-multiplies (dct_matrix @ vector).
dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins))
dct_matrix = dct_matrix[:, :num_ceps]
return dct_matrix
def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor:
# returns size (num_ceps)
# Compute liftering coefficients (scaling on cepstral coeffs)
# coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
i = torch.arange(num_ceps)
return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter)
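# Note (added for clarity): with L = cepstral_lifter (22.0 by default), mfcc() below
# scales cepstral coefficient i by 1 + (L / 2) * sin(pi * i / L); C0 (i = 0) gets a
# factor of 1 and is therefore unchanged, matching Kaldi's cepstral liftering.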
def mfcc(
waveform: Tensor,
blackman_coeff: float = 0.42,
cepstral_lifter: float = 22.0,
channel: int = -1,
dither: float = 0.0,
energy_floor: float = 1.0,
frame_length: float = 25.0,
frame_shift: float = 10.0,
high_freq: float = 0.0,
htk_compat: bool = False,
low_freq: float = 20.0,
num_ceps: int = 13,
min_duration: float = 0.0,
num_mel_bins: int = 23,
preemphasis_coefficient: float = 0.97,
raw_energy: bool = True,
remove_dc_offset: bool = True,
round_to_power_of_two: bool = True,
sample_frequency: float = 16000.0,
snip_edges: bool = True,
subtract_mean: bool = False,
use_energy: bool = False,
vtln_high: float = -500.0,
vtln_low: float = 100.0,
vtln_warp: float = 1.0,
window_type: str = POVEY,
) -> Tensor:
r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's
compute-mfcc-feats.
Args:
waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``)
channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
this floor is applied to the zeroth component, representing the total signal energy. The floor on the
individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
(Default: ``0.0``)
htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible
features (need to change other parameters). (Default: ``False``)
low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``)
min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. (Default: ``True``)
sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
specified there) (Default: ``16000.0``)
snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
in the file, and the number of frames depends on the frame_length. If False, the number of frames
depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
it this way. (Default: ``False``)
use_energy (bool, optional): Add an extra energy dimension to the MFCC output. (Default: ``False``)
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
negative, offset from high-mel-freq) (Default: ``-500.0``)
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
(Default: ``"povey"``)
Returns:
Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``)
where m is calculated in _get_strided
"""
assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins)
device, dtype = waveform.device, waveform.dtype
# The mel_energies should use power (use_power=True), not have the mean subtracted
# (subtract_mean=False), and be log-scaled (use_log_fbank=True).
# size (m, num_mel_bins + use_energy)
feature = fbank(
waveform=waveform,
blackman_coeff=blackman_coeff,
channel=channel,
dither=dither,
energy_floor=energy_floor,
frame_length=frame_length,
frame_shift=frame_shift,
high_freq=high_freq,
htk_compat=htk_compat,
low_freq=low_freq,
min_duration=min_duration,
num_mel_bins=num_mel_bins,
preemphasis_coefficient=preemphasis_coefficient,
raw_energy=raw_energy,
remove_dc_offset=remove_dc_offset,
round_to_power_of_two=round_to_power_of_two,
sample_frequency=sample_frequency,
snip_edges=snip_edges,
subtract_mean=False,
use_energy=use_energy,
use_log_fbank=True,
use_power=True,
vtln_high=vtln_high,
vtln_low=vtln_low,
vtln_warp=vtln_warp,
window_type=window_type,
)
if use_energy:
# size (m)
signal_log_energy = feature[:, num_mel_bins if htk_compat else 0]
# offset is 0 if htk_compat==True else 1
mel_offset = int(not htk_compat)
feature = feature[:, mel_offset : (num_mel_bins + mel_offset)]
# size (num_mel_bins, num_ceps)
dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device)
# size (m, num_ceps)
feature = feature.matmul(dct_matrix)
if cepstral_lifter != 0.0:
# size (1, num_ceps)
lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0)
feature *= lifter_coeffs.to(device=device, dtype=dtype)
# if use_energy then replace the last column for htk_compat == true else first column
if use_energy:
feature[:, 0] = signal_log_energy
if htk_compat:
energy = feature[:, 0].unsqueeze(1) # size (m, 1)
feature = feature[:, 1:] # size (m, num_ceps - 1)
if not use_energy:
# scale on C0 (actually removing a scale we previously added that's
# part of one common definition of the cosine transform.)
energy *= math.sqrt(2)
feature = torch.cat((feature, energy), dim=1)
feature = _subtract_column_mean(feature, subtract_mean)
return feature
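# Illustrative usage sketch (not part of the original file): 13 Kaldi-style MFCCs from
# a mono 16 kHz waveform with the default 23 mel bins. "ref_16k.wav" is hypothetical.
#
#   import torchaudio
#   waveform, sr = torchaudio.load("ref_16k.wav")  # shape (1, n), sr == 16000
#   coeffs = mfcc(waveform, num_ceps=13, num_mel_bins=23, sample_frequency=16000.0)
#   print(coeffs.shape)  # (num_frames, 13)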

View File

@ -1,101 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
import torch
import torch.nn as nn
class TAP(nn.Module):
"""
Temporal average pooling, only first-order mean is considered
"""
def __init__(self, **kwargs):
super(TAP, self).__init__()
def forward(self, x):
pooling_mean = x.mean(dim=-1)
# To be compatible with 2D input
pooling_mean = pooling_mean.flatten(start_dim=1)
return pooling_mean
class TSDP(nn.Module):
"""
Temporal standard deviation pooling, only second-order std is considered
"""
def __init__(self, **kwargs):
super(TSDP, self).__init__()
def forward(self, x):
# The last dimension is the temporal axis
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
pooling_std = pooling_std.flatten(start_dim=1)
return pooling_std
class TSTP(nn.Module):
"""
Temporal statistics pooling, concatenate mean and std, which is used in
x-vector
Comment: simple concatenation cannot make full use of both statistics
"""
def __init__(self, **kwargs):
super(TSTP, self).__init__()
def forward(self, x):
# The last dimension is the temporal axis
pooling_mean = x.mean(dim=-1)
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
pooling_mean = pooling_mean.flatten(start_dim=1)
pooling_std = pooling_std.flatten(start_dim=1)
stats = torch.cat((pooling_mean, pooling_std), 1)
return stats
class ASTP(nn.Module):
"""Attentive statistics pooling: Channel- and context-dependent
statistics pooling, first used in ECAPA_TDNN.
"""
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
super(ASTP, self).__init__()
self.global_context_att = global_context_att
# Use Conv1d with stride == 1 rather than Linear, then we don't
# need to transpose inputs.
if global_context_att:
self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1) # equals W and b in the paper
else:
self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper
self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper
def forward(self, x):
"""
x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
or a 4-dimensional tensor in resnet architecture (B,C,F,T)
0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
"""
if len(x.shape) == 4:
x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
assert len(x.shape) == 3
if self.global_context_att:
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
x_in = torch.cat((x, context_mean, context_std), dim=1)
else:
x_in = x
# Don't use ReLU here! With ReLU, the attention weights may fail to converge.
alpha = torch.tanh(self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
alpha = torch.softmax(self.linear2(alpha), dim=2)
mean = torch.sum(alpha * x, dim=2)
var = torch.sum(alpha * (x**2), dim=2) - mean**2
std = torch.sqrt(var.clamp(min=1e-10))
return torch.cat([mean, std], dim=1)
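# Illustrative usage sketch (not part of the original file): the pooling layers above
# collapse the time axis of frame-level features into one utterance-level vector.
# For a (B, F, T) input, TSTP and ASTP both return (B, 2 * F), i.e. mean and std concatenated.
#
#   feats = torch.randn(4, 80, 200)      # batch of 4, 80-dim features, 200 frames
#   print(TSTP()(feats).shape)           # torch.Size([4, 160])
#   print(ASTP(in_dim=80)(feats).shape)  # torch.Size([4, 160])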

View File

@ -1,7 +1,6 @@
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
# reference: https://github.com/lifeiteng/vall-e # reference: https://github.com/lifeiteng/vall-e
import argparse import argparse
from io import BytesIO
from typing import Optional from typing import Optional
from my_utils import load_audio from my_utils import load_audio
import torch import torch
@ -18,9 +17,6 @@ from module.models_onnx import SynthesizerTrn
from inference_webui import get_phones_and_bert from inference_webui import get_phones_and_bert
from sv import SV
import kaldi as Kaldi
import os import os
import soundfile import soundfile
@ -36,25 +32,6 @@ default_config = {
"EOS": 1024, "EOS": 1024,
} }
sv_cn_model = None
def init_sv_cn(device, is_half):
global sv_cn_model
sv_cn_model = SV(device, is_half)
def load_sovits_new(sovits_path):
f = open(sovits_path, "rb")
meta = f.read(2)
if meta != b"PK":
data = b"PK" + f.read()
bio = BytesIO()
bio.write(data)
bio.seek(0)
return torch.load(bio, map_location="cpu", weights_only=False)
return torch.load(sovits_path, map_location="cpu", weights_only=False)
def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule:
config = dict_s1["config"] config = dict_s1["config"]
@ -106,7 +83,7 @@ def logits_to_probs(
@torch.jit.script @torch.jit.script
def multinomial_sample_one_no_sync(probs_sort): def multinomial_sample_one_no_sync(probs_sort):
# Does multinomial sampling without a cuda synchronization # Does multinomial sampling without a cuda synchronization
q = torch.empty_like(probs_sort).exponential_(1.0) q = torch.randn_like(probs_sort)
return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
@ -117,7 +94,7 @@ def sample(
temperature: float = 1.0, temperature: float = 1.0,
top_k: Optional[int] = None, top_k: Optional[int] = None,
top_p: Optional[int] = None, top_p: Optional[int] = None,
repetition_penalty: float = 1.35, repetition_penalty: float = 1.0,
): ):
probs = logits_to_probs( probs = logits_to_probs(
logits=logits, logits=logits,
@ -132,10 +109,8 @@ def sample(
@torch.jit.script @torch.jit.script
def spectrogram_torch( def spectrogram_torch(y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False):
hann_window: Tensor, y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype)
):
# hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype)
y = torch.nn.functional.pad( y = torch.nn.functional.pad(
y.unsqueeze(1), y.unsqueeze(1),
(int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
@ -314,9 +289,8 @@ class T2SBlock:
attn = F.scaled_dot_product_attention(q, k, v) attn = F.scaled_dot_product_attention(q, k, v)
# attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim)
# attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1)
attn = F.linear(attn, self.out_w, self.out_b) attn = F.linear(attn, self.out_w, self.out_b)
x = x + attn x = x + attn
@ -354,22 +328,15 @@ class T2STransformer:
class VitsModel(nn.Module): class VitsModel(nn.Module):
def __init__(self, vits_path, version=None, is_half=True, device="cpu"): def __init__(self, vits_path):
super().__init__() super().__init__()
# dict_s2 = torch.load(vits_path,map_location="cpu") # dict_s2 = torch.load(vits_path,map_location="cpu")
dict_s2 = load_sovits_new(vits_path) dict_s2 = torch.load(vits_path)
self.hps = dict_s2["config"] self.hps = dict_s2["config"]
if version is None:
if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
self.hps["model"]["version"] = "v1" self.hps["model"]["version"] = "v1"
else: else:
self.hps["model"]["version"] = "v2" self.hps["model"]["version"] = "v2"
else:
if version in ["v1", "v2", "v3", "v4", "v2Pro", "v2ProPlus"]:
self.hps["model"]["version"] = version
else:
raise ValueError(f"Unsupported version: {version}")
self.hps = DictToAttrRecursive(self.hps) self.hps = DictToAttrRecursive(self.hps)
self.hps.model.semantic_frame_rate = "25hz" self.hps.model.semantic_frame_rate = "25hz"
@ -379,19 +346,11 @@ class VitsModel(nn.Module):
n_speakers=self.hps.data.n_speakers, n_speakers=self.hps.data.n_speakers,
**self.hps.model, **self.hps.model,
) )
self.vq_model.load_state_dict(dict_s2["weight"], strict=False)
self.vq_model.dec.remove_weight_norm()
if is_half:
self.vq_model = self.vq_model.half()
self.vq_model = self.vq_model.to(device)
self.vq_model.eval() self.vq_model.eval()
self.hann_window = torch.hann_window( self.vq_model.load_state_dict(dict_s2["weight"], strict=False)
self.hps.data.win_length, device=device, dtype=torch.float16 if is_half else torch.float32
)
def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0, sv_emb=None): def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0):
refer = spectrogram_torch( refer = spectrogram_torch(
self.hann_window,
ref_audio, ref_audio,
self.hps.data.filter_length, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.sampling_rate,
@ -399,7 +358,7 @@ class VitsModel(nn.Module):
self.hps.data.win_length, self.hps.data.win_length,
center=False, center=False,
) )
return self.vq_model(pred_semantic, text_seq, refer, speed=speed, sv_emb=sv_emb)[0, 0] return self.vq_model(pred_semantic, text_seq, refer, speed)[0, 0]
class T2SModel(nn.Module): class T2SModel(nn.Module):
@ -474,10 +433,6 @@ class T2SModel(nn.Module):
bert = bert.unsqueeze(0) bert = bert.unsqueeze(0)
x = self.ar_text_embedding(all_phoneme_ids) x = self.ar_text_embedding(all_phoneme_ids)
# avoid dtype inconsistency when exporting
bert = bert.to(dtype=self.bert_proj.weight.dtype)
x = x + self.bert_proj(bert.transpose(1, 2)) x = x + self.bert_proj(bert.transpose(1, 2))
x: torch.Tensor = self.ar_text_position(x) x: torch.Tensor = self.ar_text_position(x)
@ -677,9 +632,7 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
ref_seq = torch.LongTensor([ref_seq_id]).to(device) ref_seq = torch.LongTensor([ref_seq_id]).to(device)
ref_bert = ref_bert_T.T.to(ref_seq.device) ref_bert = ref_bert_T.T.to(ref_seq.device)
text_seq_id, text_bert_T, norm_text = get_phones_and_bert( text_seq_id, text_bert_T, norm_text = get_phones_and_bert(
"这是一个简单的示例真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", "这是一条测试语音,说什么无所谓,只是给它一个例子", "all_zh", "v2"
"auto",
"v2",
) )
text_seq = torch.LongTensor([text_seq_id]).to(device) text_seq = torch.LongTensor([text_seq_id]).to(device)
text_bert = text_bert_T.T.to(text_seq.device) text_bert = text_bert_T.T.to(text_seq.device)
@ -687,12 +640,12 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
ssl_content = ssl(ref_audio).to(device) ssl_content = ssl(ref_audio).to(device)
# vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
vits = VitsModel(vits_path, device=device, is_half=False) vits = VitsModel(vits_path).to(device)
vits.eval() vits.eval()
# gpt_path = "GPT_weights_v2/xw-e15.ckpt" # gpt_path = "GPT_weights_v2/xw-e15.ckpt"
# dict_s1 = torch.load(gpt_path, map_location=device) # dict_s1 = torch.load(gpt_path, map_location=device)
dict_s1 = torch.load(gpt_path, weights_only=False) dict_s1 = torch.load(gpt_path)
raw_t2s = get_raw_t2s_model(dict_s1).to(device) raw_t2s = get_raw_t2s_model(dict_s1).to(device)
print("#### get_raw_t2s_model ####") print("#### get_raw_t2s_model ####")
print(raw_t2s.config) print(raw_t2s.config)
@ -726,124 +679,6 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
print("#### exported gpt_sovits ####") print("#### exported gpt_sovits ####")
def export_prov2(
gpt_path,
vits_path,
version,
ref_audio_path,
ref_text,
output_path,
export_bert_and_ssl=False,
device="cpu",
is_half=True,
):
if sv_cn_model == None:
init_sv_cn(device, is_half)
if not os.path.exists(output_path):
os.makedirs(output_path)
print(f"目录已创建: {output_path}")
else:
print(f"目录已存在: {output_path}")
ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float()
ssl = SSLModel()
if export_bert_and_ssl:
s = ExportSSLModel(torch.jit.trace(ssl, example_inputs=(ref_audio)))
ssl_path = os.path.join(output_path, "ssl_model.pt")
torch.jit.script(s).save(ssl_path)
print("#### exported ssl ####")
export_bert(output_path)
else:
s = ExportSSLModel(ssl)
print(f"device: {device}")
ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2")
ref_seq = torch.LongTensor([ref_seq_id]).to(device)
ref_bert = ref_bert_T.T
if is_half:
ref_bert = ref_bert.half()
ref_bert = ref_bert.to(ref_seq.device)
text_seq_id, text_bert_T, norm_text = get_phones_and_bert(
"这是一个简单的示例真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
"auto",
"v2",
)
text_seq = torch.LongTensor([text_seq_id]).to(device)
text_bert = text_bert_T.T
if is_half:
text_bert = text_bert.half()
text_bert = text_bert.to(text_seq.device)
ssl_content = ssl(ref_audio)
if is_half:
ssl_content = ssl_content.half()
ssl_content = ssl_content.to(device)
sv_model = ExportERes2NetV2(sv_cn_model)
# vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
vits = VitsModel(vits_path, version, is_half=is_half, device=device)
vits.eval()
# gpt_path = "GPT_weights_v2/xw-e15.ckpt"
# dict_s1 = torch.load(gpt_path, map_location=device)
dict_s1 = torch.load(gpt_path, weights_only=False)
raw_t2s = get_raw_t2s_model(dict_s1).to(device)
print("#### get_raw_t2s_model ####")
print(raw_t2s.config)
if is_half:
raw_t2s = raw_t2s.half()
t2s_m = T2SModel(raw_t2s)
t2s_m.eval()
t2s = torch.jit.script(t2s_m).to(device)
print("#### script t2s_m ####")
print("vits.hps.data.sampling_rate:", vits.hps.data.sampling_rate)
gpt_sovits = GPT_SoVITS_V2Pro(t2s, vits, sv_model).to(device)
gpt_sovits.eval()
ref_audio_sr = s.resample(ref_audio, 16000, 32000)
if is_half:
ref_audio_sr = ref_audio_sr.half()
ref_audio_sr = ref_audio_sr.to(device)
torch._dynamo.mark_dynamic(ssl_content, 2)
torch._dynamo.mark_dynamic(ref_audio_sr, 1)
torch._dynamo.mark_dynamic(ref_seq, 1)
torch._dynamo.mark_dynamic(text_seq, 1)
torch._dynamo.mark_dynamic(ref_bert, 0)
torch._dynamo.mark_dynamic(text_bert, 0)
# torch._dynamo.mark_dynamic(sv_emb, 0)
top_k = torch.LongTensor([5]).to(device)
# Run sv_model once first so it loads its cache; see L880 for details
gpt_sovits.sv_model(ref_audio_sr)
with torch.no_grad():
gpt_sovits_export = torch.jit.trace(
gpt_sovits,
example_inputs=(
ssl_content,
ref_audio_sr,
ref_seq,
text_seq,
ref_bert,
text_bert,
top_k,
),
)
gpt_sovits_path = os.path.join(output_path, "gpt_sovits_model.pt")
gpt_sovits_export.save(gpt_sovits_path)
print("#### exported gpt_sovits ####")
audio = gpt_sovits_export(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, text_bert, top_k)
print("start write wav")
soundfile.write("out.wav", audio.float().detach().cpu().numpy(), 32000)
@torch.jit.script @torch.jit.script
def parse_audio(ref_audio): def parse_audio(ref_audio):
ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() # .to(ref_audio.device) ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() # .to(ref_audio.device)
@ -882,67 +717,6 @@ class GPT_SoVITS(nn.Module):
return audio return audio
class ExportERes2NetV2(nn.Module):
def __init__(self, sv_cn_model: SV):
super(ExportERes2NetV2, self).__init__()
self.bn1 = sv_cn_model.embedding_model.bn1
self.conv1 = sv_cn_model.embedding_model.conv1
self.layer1 = sv_cn_model.embedding_model.layer1
self.layer2 = sv_cn_model.embedding_model.layer2
self.layer3 = sv_cn_model.embedding_model.layer3
self.layer4 = sv_cn_model.embedding_model.layer4
self.layer3_ds = sv_cn_model.embedding_model.layer3_ds
self.fuse34 = sv_cn_model.embedding_model.fuse34
# audio_16k.shape: [1,N]
def forward(self, audio_16k):
# This fbank function keeps a cache, but that is fine: the cache does not depend on the length of audio_16k,
# only on the device and dtype
x = Kaldi.fbank(audio_16k, num_mel_bins=80, sample_frequency=16000, dither=0)
x = torch.stack([x])
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
out2 = self.layer2(out1)
out3 = self.layer3(out2)
out4 = self.layer4(out3)
out3_ds = self.layer3_ds(out3)
fuse_out34 = self.fuse34(out4, out3_ds)
return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)
class GPT_SoVITS_V2Pro(nn.Module):
def __init__(self, t2s: T2SModel, vits: VitsModel, sv_model: ExportERes2NetV2):
super().__init__()
self.t2s = t2s
self.vits = vits
self.sv_model = sv_model
def forward(
self,
ssl_content: torch.Tensor,
ref_audio_sr: torch.Tensor,
ref_seq: Tensor,
text_seq: Tensor,
ref_bert: Tensor,
text_bert: Tensor,
top_k: LongTensor,
speed=1.0,
):
codes = self.vits.vq_model.extract_latent(ssl_content)
prompt_semantic = codes[0, 0]
prompts = prompt_semantic.unsqueeze(0)
audio_16k = resamplex(ref_audio_sr, 32000, 16000).to(ref_audio_sr.dtype)
sv_emb = self.sv_model(audio_16k)
pred_semantic = self.t2s(prompts, ref_seq, text_seq, ref_bert, text_bert, top_k)
audio = self.vits(text_seq, pred_semantic, ref_audio_sr, speed, sv_emb)
return audio
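# Note (added for clarity): GPT_SoVITS_V2Pro.forward chains the exported components:
#   ssl_content -> vq_model.extract_latent -> prompt codes
#   ref_audio_sr (32 kHz) -> resample to 16 kHz -> sv_model -> speaker embedding sv_emb
#   t2s(prompts, ref_seq, text_seq, ref_bert, text_bert, top_k) -> pred_semantic
#   vits(text_seq, pred_semantic, ref_audio_sr, speed, sv_emb) -> output waveform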
def test(): def test():
parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
@ -1065,25 +839,8 @@ def main():
parser.add_argument("--output_path", required=True, help="Path to the output directory") parser.add_argument("--output_path", required=True, help="Path to the output directory")
parser.add_argument("--export_common_model", action="store_true", help="Export Bert and SSL model") parser.add_argument("--export_common_model", action="store_true", help="Export Bert and SSL model")
parser.add_argument("--device", help="Device to use") parser.add_argument("--device", help="Device to use")
parser.add_argument("--version", help="version of the model", default="v2")
parser.add_argument("--no-half", action="store_true", help="Do not use half precision for model weights")
args = parser.parse_args() args = parser.parse_args()
if args.version in ["v2Pro", "v2ProPlus"]:
is_half = not args.no_half
print(f"Using half precision: {is_half}")
export_prov2(
gpt_path=args.gpt_model,
vits_path=args.sovits_model,
version=args.version,
ref_audio_path=args.ref_audio,
ref_text=args.ref_text,
output_path=args.output_path,
export_bert_and_ssl=args.export_common_model,
device=args.device,
is_half=is_half,
)
else:
export( export(
gpt_path=args.gpt_model, gpt_path=args.gpt_model,
vits_path=args.sovits_model, vits_path=args.sovits_model,
@ -1095,7 +852,10 @@ def main():
) )
import inference_webui
if __name__ == "__main__": if __name__ == "__main__":
with torch.no_grad(): inference_webui.is_half = False
inference_webui.dtype = torch.float32
main() main()
# test() # test()

View File

@ -10,7 +10,7 @@ from inference_webui import get_phones_and_bert
import librosa import librosa
from module import commons from module import commons
from module.mel_processing import mel_spectrogram_torch from module.mel_processing import mel_spectrogram_torch
from module.models_onnx import CFM, Generator, SynthesizerTrnV3 from module.models_onnx import CFM, SynthesizerTrnV3
import numpy as np import numpy as np
import torch._dynamo.config import torch._dynamo.config
import torchaudio import torchaudio
@ -46,7 +46,7 @@ class MelSpectrgram(torch.nn.Module):
center=False, center=False,
): ):
super().__init__() super().__init__()
self.hann_window = torch.hann_window(win_size).to(device=device, dtype=dtype) self.hann_window = torch.hann_window(1024).to(device=device, dtype=dtype)
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device) self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device)
self.n_fft: int = n_fft self.n_fft: int = n_fft
@ -189,19 +189,6 @@ mel_fn = lambda x: mel_spectrogram_torch(
"center": False, "center": False,
}, },
) )
mel_fn_v4 = lambda x: mel_spectrogram_torch(
x,
**{
"n_fft": 1280,
"win_size": 1280,
"hop_size": 320,
"num_mels": 100,
"sampling_rate": 32000,
"fmin": 0,
"fmax": None,
"center": False,
},
)
spec_min = -12 spec_min = -12
spec_max = 2 spec_max = 2
@ -243,7 +230,6 @@ class ExportGPTSovitsHalf(torch.nn.Module):
self.sampling_rate: int = hps.data.sampling_rate self.sampling_rate: int = hps.data.sampling_rate
self.hop_length: int = hps.data.hop_length self.hop_length: int = hps.data.hop_length
self.win_length: int = hps.data.win_length self.win_length: int = hps.data.win_length
self.hann_window = torch.hann_window(self.win_length, device=device, dtype=torch.float32)
def forward( def forward(
self, self,
@ -256,7 +242,6 @@ class ExportGPTSovitsHalf(torch.nn.Module):
top_k, top_k,
): ):
refer = spectrogram_torch( refer = spectrogram_torch(
self.hann_window,
ref_audio_32k, ref_audio_32k,
self.filter_length, self.filter_length,
self.sampling_rate, self.sampling_rate,
@ -300,86 +285,6 @@ class ExportGPTSovitsHalf(torch.nn.Module):
return fea_ref, fea_todo, mel2 return fea_ref, fea_todo, mel2
class ExportGPTSovitsV4Half(torch.nn.Module):
def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3):
super().__init__()
self.hps = hps
self.t2s_m = t2s_m
self.vq_model = vq_model
self.mel2 = MelSpectrgram(
dtype=torch.float32,
device=device,
n_fft=1280,
num_mels=100,
sampling_rate=32000,
hop_size=320,
win_size=1280,
fmin=0,
fmax=None,
center=False,
)
# self.dtype = dtype
self.filter_length: int = hps.data.filter_length
self.sampling_rate: int = hps.data.sampling_rate
self.hop_length: int = hps.data.hop_length
self.win_length: int = hps.data.win_length
self.hann_window = torch.hann_window(self.win_length, device=device, dtype=torch.float32)
def forward(
self,
ssl_content,
ref_audio_32k: torch.FloatTensor,
phoneme_ids0,
phoneme_ids1,
bert1,
bert2,
top_k,
):
refer = spectrogram_torch(
self.hann_window,
ref_audio_32k,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
).to(ssl_content.dtype)
codes = self.vq_model.extract_latent(ssl_content)
prompt_semantic = codes[0, 0]
prompt = prompt_semantic.unsqueeze(0)
# print('extract_latent',codes.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
pred_semantic = self.t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k)
# print('t2s_m',pred_semantic.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
ge = self.vq_model.create_ge(refer)
# print('create_ge',datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
prompt_ = prompt.unsqueeze(0)
fea_ref = self.vq_model(prompt_, phoneme_ids0, ge)
# print('fea_ref',datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# print(prompt_.shape, phoneme_ids0.shape, ge.shape)
# print(fea_ref.shape)
ref_32k = ref_audio_32k
mel2 = norm_spec(self.mel2(ref_32k)).to(ssl_content.dtype)
T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min]
if T_min > 500:
mel2 = mel2[:, :, -500:]
fea_ref = fea_ref[:, :, -500:]
T_min = 500
fea_todo = self.vq_model(pred_semantic, phoneme_ids1, ge)
# print('fea_todo',datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# print(pred_semantic.shape, phoneme_ids1.shape, ge.shape)
# print(fea_todo.shape)
return fea_ref, fea_todo, mel2
class GPTSoVITSV3(torch.nn.Module): class GPTSoVITSV3(torch.nn.Module):
def __init__(self, gpt_sovits_half, cfm, bigvgan): def __init__(self, gpt_sovits_half, cfm, bigvgan):
super().__init__() super().__init__()
@ -406,7 +311,6 @@ class GPTSoVITSV3(torch.nn.Module):
chunk_len = 934 - fea_ref.shape[2] chunk_len = 934 - fea_ref.shape[2]
wav_gen_list = [] wav_gen_list = []
idx = 0 idx = 0
fea_todo = fea_todo[:, :, :-5]
wav_gen_length = fea_todo.shape[2] * 256 wav_gen_length = fea_todo.shape[2] * 256
while 1: while 1:
# current_time = datetime.now() # current_time = datetime.now()
@ -440,66 +344,6 @@ class GPTSoVITSV3(torch.nn.Module):
return wav_gen[0][0][:wav_gen_length] return wav_gen[0][0][:wav_gen_length]
class GPTSoVITSV4(torch.nn.Module):
def __init__(self, gpt_sovits_half, cfm, hifigan):
super().__init__()
self.gpt_sovits_half = gpt_sovits_half
self.cfm = cfm
self.hifigan = hifigan
def forward(
self,
ssl_content,
ref_audio_32k: torch.FloatTensor,
phoneme_ids0: torch.LongTensor,
phoneme_ids1: torch.LongTensor,
bert1,
bert2,
top_k: torch.LongTensor,
sample_steps: torch.LongTensor,
):
# current_time = datetime.now()
# print("gpt_sovits_half",current_time.strftime("%Y-%m-%d %H:%M:%S"))
fea_ref, fea_todo, mel2 = self.gpt_sovits_half(
ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k
)
chunk_len = 1000 - fea_ref.shape[2]
wav_gen_list = []
idx = 0
fea_todo = fea_todo[:, :, :-10]
wav_gen_length = fea_todo.shape[2] * 480
while 1:
# current_time = datetime.now()
# print("idx:",idx,current_time.strftime("%Y-%m-%d %H:%M:%S"))
fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len]
if fea_todo_chunk.shape[-1] == 0:
break
# The exported model recompiles (or otherwise stalls for ~10 s) whenever the input shape changes,
# so pad with zeros here to keep the shape constant.
# This makes the generated audio too long, so trim it at the end.
# After hifigan, the audio length is fea_todo.shape[2] * 480
complete_len = chunk_len - fea_todo_chunk.shape[-1]
if complete_len != 0:
fea_todo_chunk = torch.cat(
[
fea_todo_chunk,
torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype),
],
2,
)
cfm_res, fea_ref, mel2 = self.cfm(fea_ref, fea_todo_chunk, mel2, sample_steps)
idx += chunk_len
cfm_res = denorm_spec(cfm_res)
hifigan_res = self.hifigan(cfm_res)
wav_gen_list.append(hifigan_res)
wav_gen = torch.cat(wav_gen_list, 2)
return wav_gen[0][0][:wav_gen_length]
def init_bigvgan(): def init_bigvgan():
global bigvgan_model global bigvgan_model
from BigVGAN import bigvgan from BigVGAN import bigvgan
@ -517,31 +361,6 @@ def init_bigvgan():
bigvgan_model = bigvgan_model.to(device) bigvgan_model = bigvgan_model.to(device)
def init_hifigan():
global hifigan_model, bigvgan_model
hifigan_model = Generator(
initial_channel=100,
resblock="1",
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
upsample_rates=[10, 6, 2, 2, 2],
upsample_initial_channel=512,
upsample_kernel_sizes=[20, 12, 4, 4, 4],
gin_channels=0,
is_bias=True,
)
hifigan_model.eval()
hifigan_model.remove_weight_norm()
state_dict_g = torch.load(
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu"
)
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
if is_half == True:
hifigan_model = hifigan_model.half().to(device)
else:
hifigan_model = hifigan_model.to(device)
class Sovits: class Sovits:
def __init__(self, vq_model: SynthesizerTrnV3, cfm: CFM, hps): def __init__(self, vq_model: SynthesizerTrnV3, cfm: CFM, hps):
self.vq_model = vq_model self.vq_model = vq_model
@ -580,8 +399,6 @@ class DictToAttrRecursive(dict):
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
v3v4set = {"v3", "v4"}
def get_sovits_weights(sovits_path): def get_sovits_weights(sovits_path):
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
@ -602,8 +419,8 @@ def get_sovits_weights(sovits_path):
else: else:
hps.model.version = "v2" hps.model.version = "v2"
if model_version in v3v4set: if model_version == "v3":
hps.model.version = model_version hps.model.version = "v3"
logger.info(f"hps: {hps}") logger.info(f"hps: {hps}")
@ -705,13 +522,10 @@ def export_cfm(
return export_cfm return export_cfm
def export_1(ref_wav_path, ref_wav_text, version="v3"): def export():
if version == "v3":
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
init_bigvgan() init_bigvgan()
else:
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
init_hifigan()
dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt")
raw_t2s = get_raw_t2s_model(dict_s1).to(device) raw_t2s = get_raw_t2s_model(dict_s1).to(device)
@ -726,9 +540,9 @@ def export_1(ref_wav_path, ref_wav_text, version="v3"):
script_t2s = torch.jit.script(t2s_m).to(device) script_t2s = torch.jit.script(t2s_m).to(device)
hps = sovits.hps hps = sovits.hps
# ref_wav_path = "onnx/ad/ref.wav" ref_wav_path = "onnx/ad/ref.wav"
speed = 1.0 speed = 1.0
sample_steps = 8 sample_steps = 32
dtype = torch.float16 if is_half == True else torch.float32 dtype = torch.float16 if is_half == True else torch.float32
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
zero_wav = np.zeros( zero_wav = np.zeros(
@ -753,10 +567,9 @@ def export_1(ref_wav_path, ref_wav_text, version="v3"):
prompt_semantic = codes[0, 0] prompt_semantic = codes[0, 0]
prompt = prompt_semantic.unsqueeze(0).to(device) prompt = prompt_semantic.unsqueeze(0).to(device)
# phones1, bert1, norm_text1 = get_phones_and_bert( phones1, bert1, norm_text1 = get_phones_and_bert(
# "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3" "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3"
# ) )
phones1, bert1, norm_text1 = get_phones_and_bert(ref_wav_text, "auto", "v3")
phones2, bert2, norm_text2 = get_phones_and_bert( phones2, bert2, norm_text2 = get_phones_and_bert(
"这是一个简单的示例真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", "这是一个简单的示例真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
"auto", "auto",
@ -821,33 +634,25 @@ def export_1(ref_wav_path, ref_wav_text, version="v3"):
# vq_model = sovits.vq_model # vq_model = sovits.vq_model
vq_model = trace_vq_model vq_model = trace_vq_model
if version == "v3":
gpt_sovits_half = ExportGPTSovitsHalf(sovits.hps, script_t2s, trace_vq_model) gpt_sovits_half = ExportGPTSovitsHalf(sovits.hps, script_t2s, trace_vq_model)
torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v3_half.pt") torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v3_half.pt")
else:
gpt_sovits_half = ExportGPTSovitsV4Half(sovits.hps, script_t2s, trace_vq_model)
torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v4_half.pt")
ref_audio, sr = torchaudio.load(ref_wav_path) ref_audio, sr = torchaudio.load(ref_wav_path)
ref_audio = ref_audio.to(device).float() ref_audio = ref_audio.to(device).float()
if ref_audio.shape[0] == 2: if ref_audio.shape[0] == 2:
ref_audio = ref_audio.mean(0).unsqueeze(0) ref_audio = ref_audio.mean(0).unsqueeze(0)
tgt_sr = 24000 if version == "v3" else 32000 if sr != 24000:
if sr != tgt_sr: ref_audio = resample(ref_audio, sr)
ref_audio = resample(ref_audio, sr, tgt_sr)
# mel2 = mel_fn(ref_audio) # mel2 = mel_fn(ref_audio)
mel2 = mel_fn(ref_audio) if version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel_fn(ref_audio))
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2]) T_min = min(mel2.shape[2], fea_ref.shape[2])
fea_ref = fea_ref[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min]
print("fea_ref:", fea_ref.shape, T_min) print("fea_ref:", fea_ref.shape, T_min)
Tref = 468 if version == "v3" else 500 if T_min > 468:
Tchunk = 934 if version == "v3" else 1000 mel2 = mel2[:, :, -468:]
if T_min > Tref: fea_ref = fea_ref[:, :, -468:]
mel2 = mel2[:, :, -Tref:] T_min = 468
fea_ref = fea_ref[:, :, -Tref:] chunk_len = 934 - T_min
T_min = Tref
chunk_len = Tchunk - T_min
mel2 = mel2.to(dtype) mel2 = mel2.to(dtype)
# fea_todo, ge = sovits.vq_model(pred_semantic,y_lengths, phoneme_ids1, ge) # fea_todo, ge = sovits.vq_model(pred_semantic,y_lengths, phoneme_ids1, ge)
@ -909,19 +714,13 @@ def export_1(ref_wav_path, ref_wav_text, version="v3"):
with torch.inference_mode(): with torch.inference_mode():
cmf_res_rand = torch.randn(1, 100, 934).to(device).to(dtype) cmf_res_rand = torch.randn(1, 100, 934).to(device).to(dtype)
torch._dynamo.mark_dynamic(cmf_res_rand, 2) torch._dynamo.mark_dynamic(cmf_res_rand, 2)
if version == "v3":
bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,)) bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,))
bigvgan_model_.save("onnx/ad/bigvgan_model.pt") bigvgan_model_.save("onnx/ad/bigvgan_model.pt")
wav_gen = bigvgan_model(cmf_res) wav_gen = bigvgan_model(cmf_res)
else:
hifigan_model_ = torch.jit.trace(hifigan_model, optimize=True, example_inputs=(cmf_res_rand,))
hifigan_model_.save("onnx/ad/hifigan_model.pt")
wav_gen = hifigan_model(cmf_res)
print("wav_gen:", wav_gen.shape, wav_gen.dtype) print("wav_gen:", wav_gen.shape, wav_gen.dtype)
audio = wav_gen[0][0].cpu().detach().numpy() audio = wav_gen[0][0].cpu().detach().numpy()
sr = 24000 if version == "v3" else 48000 sr = 24000
soundfile.write("out.export.wav", (audio * 32768).astype(np.int16), sr) soundfile.write("out.export.wav", (audio * 32768).astype(np.int16), sr)
@ -1047,11 +846,10 @@ def test_export(
soundfile.write(output, (audio * 32768).astype(np.int16), sr) soundfile.write(output, (audio * 32768).astype(np.int16), sr)
def test_export( def test_export1(
todo_text, todo_text,
gpt_sovits_v3v4, gpt_sovits_v3,
output, output,
out_sr=24000,
): ):
# hps = sovits.hps # hps = sovits.hps
ref_wav_path = "onnx/ad/ref.wav" ref_wav_path = "onnx/ad/ref.wav"
@ -1061,7 +859,7 @@ def test_export(
dtype = torch.float16 if is_half == True else torch.float32 dtype = torch.float16 if is_half == True else torch.float32
zero_wav = np.zeros( zero_wav = np.zeros(
int(out_sr * 0.3), int(24000 * 0.3),
dtype=np.float16 if is_half == True else np.float32, dtype=np.float16 if is_half == True else np.float32,
) )
@ -1096,7 +894,7 @@ def test_export(
bert1 = bert1.T.to(device) bert1 = bert1.T.to(device)
bert2 = bert2.T.to(device) bert2 = bert2.T.to(device)
top_k = torch.LongTensor([20]).to(device) top_k = torch.LongTensor([15]).to(device)
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logger.info("start inference %s", current_time) logger.info("start inference %s", current_time)
@ -1109,26 +907,22 @@ def test_export(
bert2.shape, bert2.shape,
top_k.shape, top_k.shape,
) )
wav_gen = gpt_sovits_v3v4(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps) wav_gen = gpt_sovits_v3(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps)
print("wav_gen:", wav_gen.shape, wav_gen.dtype) print("wav_gen:", wav_gen.shape, wav_gen.dtype)
wav_gen = torch.cat([wav_gen, zero_wav_torch], 0) wav_gen = torch.cat([wav_gen, zero_wav_torch], 0)
audio = wav_gen.cpu().detach().numpy() audio = wav_gen.cpu().detach().numpy()
logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
soundfile.write(output, (audio * 32768).astype(np.int16), out_sr) sr = 24000
soundfile.write(output, (audio * 32768).astype(np.int16), sr)
import time import time
def export_2(version="v3"): def test_():
if version == "v3":
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
# init_bigvgan()
else:
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
# init_hifigan()
# cfm = ExportCFM(sovits.cfm) # cfm = ExportCFM(sovits.cfm)
# cfm.cfm.estimator = dit # cfm.cfm.estimator = dit
@ -1152,7 +946,7 @@ def export_2(version="v3"):
raw_t2s = raw_t2s.half().to(device) raw_t2s = raw_t2s.half().to(device)
t2s_m = T2SModel(raw_t2s).half().to(device) t2s_m = T2SModel(raw_t2s).half().to(device)
t2s_m.eval() t2s_m.eval()
t2s_m = torch.jit.script(t2s_m).to(device) t2s_m = torch.jit.script(t2s_m)
t2s_m.eval() t2s_m.eval()
# t2s_m.top_k = 15 # t2s_m.top_k = 15
logger.info("t2s_m ok") logger.info("t2s_m ok")
@ -1169,9 +963,9 @@ def export_2(version="v3"):
# gpt_sovits_v3_half = gpt_sovits_v3_half.half() # gpt_sovits_v3_half = gpt_sovits_v3_half.half()
# gpt_sovits_v3_half = gpt_sovits_v3_half.cuda() # gpt_sovits_v3_half = gpt_sovits_v3_half.cuda()
# gpt_sovits_v3_half.eval() # gpt_sovits_v3_half.eval()
if version == "v3":
gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model) gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model)
logger.info("gpt_sovits_v3_half ok") logger.info("gpt_sovits_v3_half ok")
# init_bigvgan() # init_bigvgan()
# global bigvgan_model # global bigvgan_model
bigvgan_model = torch.jit.load("onnx/ad/bigvgan_model.pt") bigvgan_model = torch.jit.load("onnx/ad/bigvgan_model.pt")
@ -1181,28 +975,13 @@ def export_2(version="v3"):
bigvgan_model.eval() bigvgan_model.eval()
logger.info("bigvgan ok") logger.info("bigvgan ok")
gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model) gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model)
gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3) gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3)
gpt_sovits_v3.save("onnx/ad/gpt_sovits_v3.pt") gpt_sovits_v3.save("onnx/ad/gpt_sovits_v3.pt")
gpt_sovits_v3 = gpt_sovits_v3.half().to(device) gpt_sovits_v3 = gpt_sovits_v3.half().to(device)
gpt_sovits_v3.eval() gpt_sovits_v3.eval()
print("save gpt_sovits_v3 ok") print("save gpt_sovits_v3 ok")
else:
gpt_sovits_v4_half = ExportGPTSovitsV4Half(sovits.hps, t2s_m, vq_model)
logger.info("gpt_sovits_v4 ok")
hifigan_model = torch.jit.load("onnx/ad/hifigan_model.pt")
hifigan_model = hifigan_model.half()
hifigan_model = hifigan_model.cuda()
hifigan_model.eval()
logger.info("hifigan ok")
gpt_sovits_v4 = GPTSoVITSV4(gpt_sovits_v4_half, cfm, hifigan_model)
gpt_sovits_v4 = torch.jit.script(gpt_sovits_v4)
gpt_sovits_v4.save("onnx/ad/gpt_sovits_v4.pt")
print("save gpt_sovits_v4 ok")
gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4
sr = 24000 if version == "v3" else 48000
time.sleep(5) time.sleep(5)
# print("thread:", torch.get_num_threads()) # print("thread:", torch.get_num_threads())
@ -1210,18 +989,16 @@ def export_2(version="v3"):
# torch.set_num_interop_threads(1) # torch.set_num_interop_threads(1)
# torch.set_num_threads(1) # torch.set_num_threads(1)
test_export( test_export1(
"汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....",
gpt_sovits_v3v4, gpt_sovits_v3,
"out.wav", "out.wav",
sr,
) )
test_export( test_export1(
"你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!", "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!",
gpt_sovits_v3v4, gpt_sovits_v3,
"out2.wav", "out2.wav",
sr,
) )
# test_export( # test_export(
@ -1245,7 +1022,7 @@ def test_export_gpt_sovits_v3():
# gpt_sovits_v3, # gpt_sovits_v3,
# "out4.wav", # "out4.wav",
# ) # )
test_export( test_export1(
"风萧萧兮易水寒,壮士一去兮不复还.", "风萧萧兮易水寒,壮士一去兮不复还.",
gpt_sovits_v3, gpt_sovits_v3,
"out5.wav", "out5.wav",
@ -1253,6 +1030,6 @@ def test_export_gpt_sovits_v3():
with torch.no_grad(): with torch.no_grad():
# export_1("onnx/ad/ref.wav","你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。","v4") # export()
export_2("v4") test_()
# test_export_gpt_sovits_v3() # test_export_gpt_sovits_v3()

View File

@ -143,9 +143,6 @@ class DiT(nn.Module):
drop_audio_cond=False, # cfg for cond audio drop_audio_cond=False, # cfg for cond audio
drop_text=False, # cfg for text drop_text=False, # cfg for text
# mask: bool["b n"] | None = None, # noqa: F722 # mask: bool["b n"] | None = None, # noqa: F722
infer=False, # bool
text_cache=None, # torch tensor as text_embed
dt_cache=None, # torch tensor as dt
): ):
x = x0.transpose(2, 1) x = x0.transpose(2, 1)
cond = cond0.transpose(2, 1) cond = cond0.transpose(2, 1)
@ -158,17 +155,9 @@ class DiT(nn.Module):
# t: conditioning time, c: context (text + masked cond audio), x: noised input audio # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
t = self.time_embed(time) t = self.time_embed(time)
if infer and dt_cache is not None:
dt = dt_cache
else:
dt = self.d_embed(dt_base_bootstrap) dt = self.d_embed(dt_base_bootstrap)
t += dt t += dt
if infer and text_cache is not None:
text_embed = text_cache
else:
text_embed = self.text_embed(text, seq_len, drop_text=drop_text) ###need to change text_embed = self.text_embed(text, seq_len, drop_text=drop_text) ###need to change
x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond) x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
rope = self.rotary_embed.forward_from_seq_len(seq_len) rope = self.rotary_embed.forward_from_seq_len(seq_len)
@ -188,7 +177,4 @@ class DiT(nn.Module):
x = self.norm_out(x, t) x = self.norm_out(x, t)
output = self.proj_out(x) output = self.proj_out(x)
if infer:
return output, text_embed, dt
else:
return output return output

View File

@ -6,31 +6,12 @@
全部按英文识别 全部按英文识别
全部按日文识别 全部按日文识别
""" """
import psutil
import os
def set_high_priority():
"""把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
if os.name != "nt":
return # only effective on Windows
p = psutil.Process(os.getpid())
try:
p.nice(psutil.HIGH_PRIORITY_CLASS)
print("已将进程优先级设为 High")
except psutil.AccessDenied:
print("权限不足,无法修改优先级(请用管理员运行)")
set_high_priority()
import json
import logging import logging
import os
import re
import sys
import traceback import traceback
import warnings import warnings
import torch
import torchaudio import torchaudio
from text.LangSegmenter import LangSegmenter
logging.getLogger("markdown_it").setLevel(logging.ERROR) logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.ERROR)
@ -42,17 +23,47 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
logging.getLogger("multipart.multipart").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
warnings.simplefilter(action="ignore", category=FutureWarning) warnings.simplefilter(action="ignore", category=FutureWarning)
import json
import os
import re
import sys
import torch
from text.LangSegmenter import LangSegmenter
try:
import gradio.analytics as analytics
analytics.version_check = lambda: None
except:
...
version = model_version = os.environ.get("version", "v2") version = model_version = os.environ.get("version", "v2")
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
SoVITS_names, GPT_names = get_weights_names()
from config import pretrained_sovits_name
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3) is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4) is_exist_s2gv4 = os.path.exists(path_sovits_v4)
pretrained_sovits_name = [
"GPT_SoVITS/pretrained_models/s2G488k.pth",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"GPT_SoVITS/pretrained_models/s2Gv3.pth",
"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
]
pretrained_gpt_name = [
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
]
_ = [[], []]
for i in range(4):
if os.path.exists(pretrained_gpt_name[i]):
_[0].append(pretrained_gpt_name[i])
if os.path.exists(pretrained_sovits_name[i]):
_[-1].append(pretrained_sovits_name[i])
pretrained_gpt_name, pretrained_sovits_name = _
if os.path.exists("./weight.json"): if os.path.exists("./weight.json"):
pass pass
@ -63,22 +74,17 @@ else:
with open("./weight.json", "r", encoding="utf-8") as file: with open("./weight.json", "r", encoding="utf-8") as file:
weight_data = file.read() weight_data = file.read()
weight_data = json.loads(weight_data) weight_data = json.loads(weight_data)
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1])) gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name))
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0])) sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name))
if isinstance(gpt_path, list): if isinstance(gpt_path, list):
gpt_path = gpt_path[0] gpt_path = gpt_path[0]
if isinstance(sovits_path, list): if isinstance(sovits_path, list):
sovits_path = sovits_path[0] sovits_path = sovits_path[0]
# print(2333333) # gpt_path = os.environ.get(
# print(os.environ["gpt_path"]) # "gpt_path", pretrained_gpt_name
# print(gpt_path) # )
# print(GPT_names) # sovits_path = os.environ.get("sovits_path", pretrained_sovits_name)
# print(weight_data)
# print(weight_data.get("GPT", {}))
# print(version)  ### the GPT version list has no v2pro entry for s2
# print(weight_data.get("GPT", {}).get(version, GPT_names[-1]))
cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base") cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base")
bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large")
infer_ttswebui = os.environ.get("infer_ttswebui", 9872) infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
@ -100,7 +106,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path
import random
from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
def set_seed(seed):
@ -123,7 +129,6 @@ from peft import LoraConfig, get_peft_model
from text import cleaned_text_to_sequence from text import cleaned_text_to_sequence
from text.cleaner import clean_text from text.cleaner import clean_text
from tools.assets import css, js, top_html
from tools.i18n.i18n import I18nAuto, scan_language_list from tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto") language = os.environ.get("language", "Auto")
@ -218,24 +223,29 @@ if is_half == True:
else: else:
ssl_model = ssl_model.to(device) ssl_model = ssl_model.to(device)
resample_transform_dict = {}
def resample(audio_tensor, sr0,sr1):
global resample_transform_dict
key="%s-%s"%(sr0,sr1)
if key not in resample_transform_dict:
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
return resample_transform_dict[key](audio_tensor)
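
The helper above builds one torchaudio Resample transform per (source rate, target rate) pair and reuses it from a dict instead of re-creating it on every call. A minimal, self-contained sketch of the same caching pattern (the function name and demo values are illustrative, not from the repository):

import torch
import torchaudio

_resamplers = {}  # (orig_sr, target_sr) -> cached Resample module

def cached_resample(audio: torch.Tensor, orig_sr: int, target_sr: int) -> torch.Tensor:
    """Resample audio, reusing one Resample module per rate pair."""
    if orig_sr == target_sr:
        return audio
    key = (orig_sr, target_sr)
    if key not in _resamplers:
        _resamplers[key] = torchaudio.transforms.Resample(orig_sr, target_sr)
    return _resamplers[key](audio)

if __name__ == "__main__":
    wav = torch.randn(1, 32000)          # one second of fake audio at 32 kHz
    out = cached_resample(wav, 32000, 16000)
    print(out.shape)                     # torch.Size([1, 16000])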
###todo:put them to process_ckpt and modify my_save func (save sovits weights), gpt save weights use my_save in process_ckpt
# symbol_version-model_version-if_lora_v3
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
v3v4set = {"v3", "v4"}
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
if "" in sovits_path or "!" in sovits_path:
sovits_path = name2sovits_path[sovits_path]
    global vq_model, hps, version, model_version, dict_language, if_lora_v3
    version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
    print(sovits_path, version, model_version, if_lora_v3)
    is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
    path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
    if if_lora_v3 == True and is_exist == False:
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重") info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version)
        gr.Warning(info)
        raise FileExistsError(info)
    dict_language = dict_language_v1 if version == "v1" else dict_language_v2
@ -266,15 +276,10 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
prompt_language_update, prompt_language_update,
text_update, text_update,
text_language_update, text_language_update,
{ {"__type__": "update", "visible": visible_sample_steps, "value": 32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]},
"__type__": "update",
"visible": visible_sample_steps,
"value": 32 if model_version == "v3" else 8,
"choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
},
{"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False},
{"__type__": "update", "visible": True if model_version == "v3" else False}, {"__type__": "update", "visible": True if model_version =="v3" else False},
{"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False},
) )
@ -291,18 +296,15 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
version = hps.model.version version = hps.model.version
# print("sovits版本:",hps.model.version) # print("sovits版本:",hps.model.version)
if model_version not in v3v4set: if model_version not in v3v4set:
if "Pro" not in model_version:
model_version = version
else:
hps.model.version = model_version
vq_model = SynthesizerTrn( vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1, hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length, hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers, n_speakers=hps.data.n_speakers,
**hps.model, **hps.model,
) )
model_version = version
else: else:
hps.model.version = model_version hps.model.version=model_version
vq_model = SynthesizerTrnV3( vq_model = SynthesizerTrnV3(
hps.data.filter_length // 2 + 1, hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length, hps.train.segment_size // hps.data.hop_length,
@ -324,7 +326,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
else: else:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
print( print(
"loading sovits_%spretrained_G" % model_version, "loading sovits_%spretrained_G"%model_version,
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False), vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
) )
lora_rank = dict_s2["lora_rank"] lora_rank = dict_s2["lora_rank"]
@ -335,7 +337,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
init_lora_weights=True, init_lora_weights=True,
) )
vq_model.cfm = get_peft_model(vq_model.cfm, lora_config) vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
print("loading sovits_%s_lora%s" % (model_version, lora_rank)) print("loading sovits_%s_lora%s" % (model_version,lora_rank))
vq_model.load_state_dict(dict_s2["weight"], strict=False) vq_model.load_state_dict(dict_s2["weight"], strict=False)
vq_model.cfm = vq_model.cfm.merge_and_unload() vq_model.cfm = vq_model.cfm.merge_and_unload()
# torch.save(vq_model.state_dict(),"merge_win.pth") # torch.save(vq_model.state_dict(),"merge_win.pth")
@ -348,15 +350,10 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
prompt_language_update, prompt_language_update,
text_update, text_update,
text_language_update, text_language_update,
{ {"__type__": "update", "visible": visible_sample_steps, "value":32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]},
"__type__": "update",
"visible": visible_sample_steps,
"value": 32 if model_version == "v3" else 8,
"choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
},
{"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "visible": visible_inp_refs},
{"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False},
{"__type__": "update", "visible": True if model_version == "v3" else False}, {"__type__": "update", "visible": True if model_version =="v3" else False},
{"__type__": "update", "value": i18n("合成语音"), "interactive": True}, {"__type__": "update", "value": i18n("合成语音"), "interactive": True},
) )
with open("./weight.json") as f: with open("./weight.json") as f:
@ -374,11 +371,9 @@ except:
def change_gpt_weights(gpt_path): def change_gpt_weights(gpt_path):
if "" in gpt_path or "!" in gpt_path:
gpt_path = name2gpt_path[gpt_path]
global hz, max_sec, t2s_model, config global hz, max_sec, t2s_model, config
hz = 50 hz = 50
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False) dict_s1 = torch.load(gpt_path, map_location="cpu")
config = dict_s1["config"] config = dict_s1["config"]
max_sec = config["data"]["max_sec"] max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
@ -404,41 +399,8 @@ import torch
now_dir = os.getcwd() now_dir = os.getcwd()
def clean_hifigan_model():
global hifigan_model
if hifigan_model:
hifigan_model = hifigan_model.cpu()
hifigan_model = None
try:
torch.cuda.empty_cache()
except:
pass
def clean_bigvgan_model():
global bigvgan_model
if bigvgan_model:
bigvgan_model = bigvgan_model.cpu()
bigvgan_model = None
try:
torch.cuda.empty_cache()
except:
pass
def clean_sv_cn_model():
global sv_cn_model
if sv_cn_model:
sv_cn_model.embedding_model = sv_cn_model.embedding_model.cpu()
sv_cn_model = None
try:
torch.cuda.empty_cache()
except:
pass
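
clean_hifigan_model, clean_bigvgan_model and clean_sv_cn_model all follow the same release pattern: move the module back to the CPU, drop the global reference, and empty the CUDA cache. A hedged, generic sketch of that pattern (the release helper is illustrative, not part of the repository):

import torch

def release(model):
    """Move a module off the GPU and free cached CUDA memory; returns None
    so the caller can overwrite its global reference in one assignment."""
    if model is not None:
        model = model.cpu()
        del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return None

# usage sketch:
# bigvgan_model = release(bigvgan_model)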
def init_bigvgan(): def init_bigvgan():
global bigvgan_model, hifigan_model, sv_cn_model global bigvgan_model,hifigan_model
from BigVGAN import bigvgan from BigVGAN import bigvgan
bigvgan_model = bigvgan.BigVGAN.from_pretrained( bigvgan_model = bigvgan.BigVGAN.from_pretrained(
@ -448,16 +410,18 @@ def init_bigvgan():
# remove weight norm in the model and set to eval mode # remove weight norm in the model and set to eval mode
bigvgan_model.remove_weight_norm() bigvgan_model.remove_weight_norm()
bigvgan_model = bigvgan_model.eval() bigvgan_model = bigvgan_model.eval()
clean_hifigan_model() if hifigan_model:
clean_sv_cn_model() hifigan_model=hifigan_model.cpu()
hifigan_model=None
try:torch.cuda.empty_cache()
except:pass
if is_half == True: if is_half == True:
bigvgan_model = bigvgan_model.half().to(device) bigvgan_model = bigvgan_model.half().to(device)
else: else:
bigvgan_model = bigvgan_model.to(device) bigvgan_model = bigvgan_model.to(device)
def init_hifigan(): def init_hifigan():
global hifigan_model, bigvgan_model, sv_cn_model global hifigan_model,bigvgan_model
hifigan_model = Generator( hifigan_model = Generator(
initial_channel=100, initial_channel=100,
resblock="1", resblock="1",
@ -466,87 +430,47 @@ def init_hifigan():
upsample_rates=[10, 6, 2, 2, 2], upsample_rates=[10, 6, 2, 2, 2],
upsample_initial_channel=512, upsample_initial_channel=512,
upsample_kernel_sizes=[20, 12, 4, 4, 4], upsample_kernel_sizes=[20, 12, 4, 4, 4],
gin_channels=0, gin_channels=0, is_bias=True
is_bias=True,
) )
hifigan_model.eval() hifigan_model.eval()
hifigan_model.remove_weight_norm() hifigan_model.remove_weight_norm()
state_dict_g = torch.load( state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu")
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), print("loading vocoder",hifigan_model.load_state_dict(state_dict_g))
map_location="cpu", if bigvgan_model:
weights_only=False, bigvgan_model=bigvgan_model.cpu()
) bigvgan_model=None
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g)) try:torch.cuda.empty_cache()
clean_bigvgan_model() except:pass
clean_sv_cn_model()
if is_half == True: if is_half == True:
hifigan_model = hifigan_model.half().to(device) hifigan_model = hifigan_model.half().to(device)
else: else:
hifigan_model = hifigan_model.to(device) hifigan_model = hifigan_model.to(device)
bigvgan_model=hifigan_model=None
from sv import SV if model_version=="v3":
def init_sv_cn():
global hifigan_model, bigvgan_model, sv_cn_model
sv_cn_model = SV(device, is_half)
clean_bigvgan_model()
clean_hifigan_model()
bigvgan_model = hifigan_model = sv_cn_model = None
if model_version == "v3":
init_bigvgan() init_bigvgan()
if model_version == "v4": if model_version=="v4":
init_hifigan() init_hifigan()
if model_version in {"v2Pro", "v2ProPlus"}:
init_sv_cn()
resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device): def get_spepc(hps, filename):
global resample_transform_dict
key = "%s-%s-%s" % (sr0, sr1, str(device))
if key not in resample_transform_dict:
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
return resample_transform_dict[key](audio_tensor)
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
# audio = load_audio(filename, int(hps.data.sampling_rate)) # audio = load_audio(filename, int(hps.data.sampling_rate))
audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate))
# audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate)) audio = torch.FloatTensor(audio)
# audio = torch.FloatTensor(audio)
sr1 = int(hps.data.sampling_rate)
audio, sr0 = torchaudio.load(filename)
if sr0 != sr1:
audio = audio.to(device)
if audio.shape[0] == 2:
audio = audio.mean(0).unsqueeze(0)
audio = resample(audio, sr0, sr1, device)
else:
audio = audio.to(device)
if audio.shape[0] == 2:
audio = audio.mean(0).unsqueeze(0)
maxx = audio.abs().max() maxx = audio.abs().max()
if maxx > 1: if maxx > 1:
audio /= min(2, maxx) audio /= min(2, maxx)
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch( spec = spectrogram_torch(
audio, audio_norm,
hps.data.filter_length, hps.data.filter_length,
hps.data.sampling_rate, hps.data.sampling_rate,
hps.data.hop_length, hps.data.hop_length,
hps.data.win_length, hps.data.win_length,
center=False, center=False,
) )
spec = spec.to(dtype) return spec
if is_v2pro == True:
audio = resample(audio, sr1, 16000, device).to(dtype)
return spec, audio
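
get_spepc now loads the reference with torchaudio, downmixes stereo to mono, resamples to the model rate, peak-normalizes, and for v2Pro also produces a 16 kHz copy for the speaker-verification model. A rough sketch of that preprocessing chain under the same assumptions (function name and rates below are placeholders; a synthetic tensor stands in for the file):

import torch
import torchaudio

def prepare_reference(audio: torch.Tensor, sr: int, target_sr: int) -> torch.Tensor:
    """Downmix to mono, resample to target_sr, and peak-normalize towards [-1, 1]."""
    if audio.dim() == 2 and audio.shape[0] == 2:          # stereo -> mono
        audio = audio.mean(0, keepdim=True)
    if sr != target_sr:
        audio = torchaudio.transforms.Resample(sr, target_sr)(audio)
    peak = audio.abs().max()
    if peak > 1:                                          # tame overly loud references
        audio = audio / min(2.0, peak.item())
    return audio

if __name__ == "__main__":
    fake_stereo = torch.randn(2, 48000) * 1.5             # 1 s of fake 48 kHz stereo
    ref = prepare_reference(fake_stereo, 48000, 32000)
    print(ref.shape, ref.abs().max().item())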
def clean_text_inf(text, language, version): def clean_text_inf(text, language, version):
@ -599,31 +523,32 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False): def get_phones_and_bert(text, language, version, final=False):
text = re.sub(r' {2,}', ' ', text) if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
formattext = text
while " " in formattext:
formattext = formattext.replace(" ", " ")
if language == "all_zh":
if re.search(r"[A-Za-z]", formattext):
formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext, "zh", version)
else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = get_bert_feature(norm_text, word2ph).to(device)
elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext, "yue", version)
else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
textlist = [] textlist = []
langlist = [] langlist = []
if language == "all_zh": if language == "auto":
for tmp in LangSegmenter.getTexts(text,"zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
langlist.append("en")
textlist.append(text)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text): for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) textlist.append(tmp["text"])
@ -635,10 +560,6 @@ def get_phones_and_bert(text, language, version, final=False):
textlist.append(tmp["text"]) textlist.append(tmp["text"])
else: else:
for tmp in LangSegmenter.getTexts(text): for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en": if tmp["lang"] == "en":
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
else: else:
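
The main-branch version of this loop also merges consecutive segments when both are English or both are non-English, so the downstream G2P/BERT front end sees longer runs of text. A standalone sketch of that merge rule, with LangSegmenter output mimicked by plain dicts (a simplification of the hunk above):

def merge_segments(segments):
    """Merge adjacent segments when both are 'en' or both are non-'en'."""
    langlist, textlist = [], []
    for seg in segments:
        if langlist:
            same_group = (seg["lang"] == "en") == (langlist[-1] == "en")
            if same_group:
                textlist[-1] += seg["text"]
                continue
        langlist.append(seg["lang"])
        textlist.append(seg["text"])
    return langlist, textlist

if __name__ == "__main__":
    demo = [
        {"lang": "zh", "text": "你好"},
        {"lang": "ja", "text": "こんにちは"},   # non-en after non-en -> merged
        {"lang": "en", "text": "hello "},
        {"lang": "en", "text": "world"},        # en after en -> merged
    ]
    print(merge_segments(demo))  # (['zh', 'en'], ['你好こんにちは', 'hello world'])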
@ -782,10 +703,6 @@ def get_tts_wav(
        ref_free = False  # s2v3 does not support ref_free yet
    else:
        if_sr = False
if model_version not in {"v3", "v4", "v2Pro", "v2ProPlus"}:
clean_bigvgan_model()
clean_hifigan_model()
clean_sv_cn_model()
t0 = ttime() t0 = ttime()
prompt_language = dict_language[prompt_language] prompt_language = dict_language[prompt_language]
text_language = dict_language[text_language] text_language = dict_language[text_language]
@ -889,57 +806,42 @@ def get_tts_wav(
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
cache[i_text] = pred_semantic cache[i_text] = pred_semantic
t3 = ttime() t3 = ttime()
is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
# print(23333,is_v2pro,model_version)
        ###the logic below and inp_refs do not exist for v3
if model_version not in v3v4set: if model_version not in v3v4set:
refers = [] refers = []
if is_v2pro:
sv_emb = []
if sv_cn_model == None:
init_sv_cn()
if inp_refs: if inp_refs:
for path in inp_refs: for path in inp_refs:
try: #####这里加上提取sv的逻辑要么一堆sv一堆refer要么单个sv单个refer try:
refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro) refer = get_spepc(hps, path.name).to(dtype).to(device)
refers.append(refer) refers.append(refer)
if is_v2pro:
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
except: except:
traceback.print_exc() traceback.print_exc()
if len(refers) == 0: if len(refers) == 0:
refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro) refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
refers = [refers]
if is_v2pro:
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
if is_v2pro:
audio = vq_model.decode(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb
)[0][0]
else:
audio = vq_model.decode( audio = vq_model.decode(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
)[0][0] )[0][0] # .cpu().detach().numpy()
else: else:
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device) refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0)
# print(11111111, phoneme_ids0, phoneme_ids1)
fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
ref_audio, sr = torchaudio.load(ref_wav_path) ref_audio, sr = torchaudio.load(ref_wav_path)
ref_audio = ref_audio.to(device).float() ref_audio = ref_audio.to(device).float()
if ref_audio.shape[0] == 2: if ref_audio.shape[0] == 2:
ref_audio = ref_audio.mean(0).unsqueeze(0) ref_audio = ref_audio.mean(0).unsqueeze(0)
tgt_sr = 24000 if model_version == "v3" else 32000 tgt_sr=24000 if model_version=="v3"else 32000
if sr != tgt_sr: if sr != tgt_sr:
ref_audio = resample(ref_audio, sr, tgt_sr, device) ref_audio = resample(ref_audio, sr,tgt_sr)
# print("ref_audio",ref_audio.abs().mean()) # print("ref_audio",ref_audio.abs().mean())
mel2 = mel_fn(ref_audio) if model_version == "v3" else mel_fn_v4(ref_audio) mel2 = mel_fn(ref_audio)if model_version=="v3"else mel_fn_v4(ref_audio)
mel2 = norm_spec(mel2) mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2]) T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min] mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min]
Tref = 468 if model_version == "v3" else 500 Tref=468 if model_version=="v3"else 500
Tchunk = 934 if model_version == "v3" else 1000 Tchunk=934 if model_version=="v3"else 1000
if T_min > Tref: if T_min > Tref:
mel2 = mel2[:, :, -Tref:] mel2 = mel2[:, :, -Tref:]
fea_ref = fea_ref[:, :, -Tref:] fea_ref = fea_ref[:, :, -Tref:]
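
Here the reference mel and the encoder features are first cut to a shared length T_min and then capped at Tref frames (468 for v3, 500 for v4), with Tchunk bounding how much is generated per CFM chunk. A small sketch of the trimming step using the same values (function name and demo shapes are illustrative):

import torch

def trim_reference(mel2: torch.Tensor, fea_ref: torch.Tensor, model_version: str = "v4"):
    """Align reference mel and features to a shared length, capped at Tref frames."""
    T_min = min(mel2.shape[2], fea_ref.shape[2])
    mel2, fea_ref = mel2[:, :, :T_min], fea_ref[:, :, :T_min]
    Tref = 468 if model_version == "v3" else 500
    if T_min > Tref:                        # keep only the most recent Tref frames
        mel2 = mel2[:, :, -Tref:]
        fea_ref = fea_ref[:, :, -Tref:]
        T_min = Tref
    return mel2, fea_ref, T_min

if __name__ == "__main__":
    mel2 = torch.randn(1, 100, 700)         # (batch, n_mels, frames)
    fea_ref = torch.randn(1, 512, 650)      # (batch, channels, frames)
    m, f, t = trim_reference(mel2, fea_ref, "v4")
    print(m.shape, f.shape, t)              # both trimmed to 500 frames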
@ -964,13 +866,13 @@ def get_tts_wav(
cfm_resss.append(cfm_res) cfm_resss.append(cfm_res)
cfm_res = torch.cat(cfm_resss, 2) cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res) cfm_res = denorm_spec(cfm_res)
if model_version == "v3": if model_version=="v3":
if bigvgan_model == None: if bigvgan_model == None:
init_bigvgan() init_bigvgan()
else: # v4 else:#v4
if hifigan_model == None: if hifigan_model == None:
init_hifigan() init_hifigan()
vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model vocoder_model=bigvgan_model if model_version=="v3"else hifigan_model
with torch.inference_mode(): with torch.inference_mode():
wav_gen = vocoder_model(cfm_res) wav_gen = vocoder_model(cfm_res)
audio = wav_gen[0][0] # .cpu().detach().numpy() audio = wav_gen[0][0] # .cpu().detach().numpy()
@ -984,12 +886,9 @@ def get_tts_wav(
t1 = ttime() t1 = ttime()
print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))) print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
audio_opt = torch.cat(audio_opt, 0) # np.concatenate audio_opt = torch.cat(audio_opt, 0) # np.concatenate
if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}: if model_version in {"v1","v2"}:opt_sr=32000
opt_sr = 32000 elif model_version=="v3":opt_sr=24000
elif model_version == "v3": else:opt_sr=48000#v4
opt_sr = 24000
else:
opt_sr = 48000 # v4
if if_sr == True and opt_sr == 24000: if if_sr == True and opt_sr == 24000:
print(i18n("音频超分中")) print(i18n("音频超分中"))
audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr) audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
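
The main-branch side maps model versions to output sample rates (32 kHz for v1/v2/v2Pro/v2ProPlus, 24 kHz for v3, 48 kHz for v4) and only runs the super-resolution pass when a 24 kHz v3 output was requested with if_sr. A tiny sketch of that mapping (helper name is illustrative):

def output_sample_rate(model_version: str) -> int:
    """Native output rate of each SoVITS generation, per the branch above."""
    if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
        return 32000
    if model_version == "v3":
        return 24000
    return 48000  # v4

assert output_sample_rate("v2ProPlus") == 32000
assert output_sample_rate("v3") == 24000
assert output_sample_rate("v4") == 48000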
@ -1119,6 +1018,37 @@ def process_text(texts):
return _text return _text
def change_choices():
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {
"choices": sorted(GPT_names, key=custom_sort_key),
"__type__": "update",
}
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
for path in SoVITS_weight_root + GPT_weight_root:
os.makedirs(path, exist_ok=True)
def get_weights_names(GPT_weight_root, SoVITS_weight_root):
SoVITS_names = [i for i in pretrained_sovits_name]
for path in SoVITS_weight_root:
for name in os.listdir(path):
if name.endswith(".pth"):
SoVITS_names.append("%s/%s" % (path, name))
GPT_names = [i for i in pretrained_gpt_name]
for path in GPT_weight_root:
for name in os.listdir(path):
if name.endswith(".ckpt"):
GPT_names.append("%s/%s" % (path, name))
return SoVITS_names, GPT_names
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
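
get_weights_names seeds the dropdowns with the pretrained checkpoints and then appends every .pth / .ckpt file found in the weight folders. A self-contained sketch of the directory scan (the helper name and the demo folder are made up for illustration):

import os

def list_weights(roots, suffix):
    """Collect 'root/name' entries for every file under roots ending with suffix."""
    found = []
    for root in roots:
        if not os.path.isdir(root):
            continue
        for name in sorted(os.listdir(root)):
            if name.endswith(suffix):
                found.append("%s/%s" % (root, name))
    return found

if __name__ == "__main__":
    import tempfile
    demo_root = tempfile.mkdtemp()
    open(os.path.join(demo_root, "my_model_e8.pth"), "w").close()
    print(list_weights([demo_root], ".pth"))   # ['<tmpdir>/my_model_e8.pth']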
def html_center(text, label="p"): def html_center(text, label="p"):
return f"""<div style="text-align: center; margin: 100; padding: 50;"> return f"""<div style="text-align: center; margin: 100; padding: 50;">
<{label} style="margin: 0; padding: 0;">{text}</{label}> <{label} style="margin: 0; padding: 0;">{text}</{label}>
@ -1131,13 +1061,11 @@ def html_left(text, label="p"):
</div>""" </div>"""
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app: with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.HTML( gr.Markdown(
top_html.format( value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "<br>"
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
),
elem_classes="markdown",
) )
with gr.Group(): with gr.Group():
gr.Markdown(html_center(i18n("模型切换"), "h3")) gr.Markdown(html_center(i18n("模型切换"), "h3"))
@ -1203,16 +1131,16 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
sample_steps = ( sample_steps = (
gr.Radio( gr.Radio(
label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
value=32 if model_version == "v3" else 8, value=32 if model_version=="v3"else 8,
choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32],
visible=True, visible=True,
) )
if model_version in v3v4set if model_version in v3v4set
else gr.Radio( else gr.Radio(
label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32],
visible=False, visible=False,
value=32 if model_version == "v3" else 8, value=32 if model_version=="v3"else 8,
) )
) )
if_sr_Checkbox = gr.Checkbox( if_sr_Checkbox = gr.Checkbox(
@ -1220,7 +1148,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
value=False, value=False,
interactive=True, interactive=True,
show_label=True, show_label=True,
visible=False if model_version != "v3" else True, visible=False if model_version !="v3" else True,
) )
gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
with gr.Row(): with gr.Row():
@ -1349,5 +1277,5 @@ if __name__ == "__main__":
inbrowser=True, inbrowser=True,
share=is_share, share=is_share,
server_port=infer_ttswebui, server_port=infer_ttswebui,
# quiet=True, quiet=True,
) )

View File

@ -6,20 +6,7 @@
全部按英文识别 全部按英文识别
全部按日文识别 全部按日文识别
""" """
import psutil
import os
def set_high_priority():
    """Set the current Python process to HIGH_PRIORITY_CLASS (Windows only)."""
    if os.name != "nt":
        return  # only effective on Windows
    p = psutil.Process(os.getpid())
    try:
        p.nice(psutil.HIGH_PRIORITY_CLASS)
        print("Process priority set to High")
    except psutil.AccessDenied:
        print("Insufficient permissions to change priority (run as administrator)")
set_high_priority()
import json import json
import logging import logging
import os import os
@ -27,8 +14,6 @@ import random
import re import re
import sys import sys
import torch
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir)) sys.path.append("%s/GPT_SoVITS" % (now_dir))
@ -40,6 +25,14 @@ logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR)
logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
import torch
try:
import gradio.analytics as analytics
analytics.version_check = lambda: None
except:
...
infer_ttswebui = os.environ.get("infer_ttswebui", 9872) infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
@ -60,7 +53,6 @@ import gradio as gr
from TTS_infer_pack.text_segmentation_method import get_method from TTS_infer_pack.text_segmentation_method import get_method
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
from tools.assets import css, js, top_html
from tools.i18n.i18n import I18nAuto, scan_language_list from tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto") language = os.environ.get("language", "Auto")
@ -112,28 +104,13 @@ cut_method = {
i18n("按标点符号切"): "cut5", i18n("按标点符号切"): "cut5",
} }
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
SoVITS_names, GPT_names = get_weights_names()
from config import pretrained_sovits_name
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml") tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")
tts_config.device = device tts_config.device = device
tts_config.is_half = is_half tts_config.is_half = is_half
# tts_config.version = version tts_config.version = version
tts_config.update_version(version)
if gpt_path is not None: if gpt_path is not None:
if "" in gpt_path or "!" in gpt_path:
gpt_path = name2gpt_path[gpt_path]
tts_config.t2s_weights_path = gpt_path tts_config.t2s_weights_path = gpt_path
if sovits_path is not None: if sovits_path is not None:
if "" in sovits_path or "!" in sovits_path:
sovits_path = name2sovits_path[sovits_path]
tts_config.vits_weights_path = sovits_path tts_config.vits_weights_path = sovits_path
if cnhubert_base_path is not None: if cnhubert_base_path is not None:
tts_config.cnhuhbert_base_path = cnhubert_base_path tts_config.cnhuhbert_base_path = cnhubert_base_path
@ -209,6 +186,40 @@ def custom_sort_key(s):
return parts return parts
def change_choices():
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {
"choices": sorted(GPT_names, key=custom_sort_key),
"__type__": "update",
}
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
pretrained_sovits_name = [
"GPT_SoVITS/pretrained_models/s2G488k.pth",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"GPT_SoVITS/pretrained_models/s2Gv3.pth",
"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
]
pretrained_gpt_name = [
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
]
_ = [[], []]
for i in range(4):
if os.path.exists(pretrained_gpt_name[i]):
_[0].append(pretrained_gpt_name[i])
if os.path.exists(pretrained_sovits_name[i]):
_[-1].append(pretrained_sovits_name[i])
pretrained_gpt_name, pretrained_sovits_name = _
if os.path.exists("./weight.json"): if os.path.exists("./weight.json"):
pass pass
else: else:
@ -218,28 +229,48 @@ else:
with open("./weight.json", "r", encoding="utf-8") as file: with open("./weight.json", "r", encoding="utf-8") as file:
weight_data = file.read() weight_data = file.read()
weight_data = json.loads(weight_data) weight_data = json.loads(weight_data)
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1])) gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name))
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0])) sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name))
if isinstance(gpt_path, list): if isinstance(gpt_path, list):
gpt_path = gpt_path[0] gpt_path = gpt_path[0]
if isinstance(sovits_path, list): if isinstance(sovits_path, list):
sovits_path = sovits_path[0] sovits_path = sovits_path[0]
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
for path in SoVITS_weight_root + GPT_weight_root:
os.makedirs(path, exist_ok=True)
def get_weights_names(GPT_weight_root, SoVITS_weight_root):
SoVITS_names = [i for i in pretrained_sovits_name]
for path in SoVITS_weight_root:
for name in os.listdir(path):
if name.endswith(".pth"):
SoVITS_names.append("%s/%s" % (path, name))
GPT_names = [i for i in pretrained_gpt_name]
for path in GPT_weight_root:
for name in os.listdir(path):
if name.endswith(".ckpt"):
GPT_names.append("%s/%s" % (path, name))
return SoVITS_names, GPT_names
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
from process_ckpt import get_sovits_version_from_path_fast from process_ckpt import get_sovits_version_from_path_fast
v3v4set = {"v3", "v4"} v3v4set={"v3","v4"}
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
if "" in sovits_path or "!" in sovits_path:
sovits_path = name2sovits_path[sovits_path]
global version, model_version, dict_language, if_lora_v3 global version, model_version, dict_language, if_lora_v3
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
# print(sovits_path,version, model_version, if_lora_v3) # print(sovits_path,version, model_version, if_lora_v3)
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
if if_lora_v3 == True and is_exist == False: if if_lora_v3 == True and is_exist == False:
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重") info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version)
gr.Warning(info) gr.Warning(info)
raise FileExistsError(info) raise FileExistsError(info)
dict_language = dict_language_v1 if version == "v1" else dict_language_v2 dict_language = dict_language_v1 if version == "v1" else dict_language_v2
@ -297,19 +328,11 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
f.write(json.dumps(data)) f.write(json.dumps(data))
def change_gpt_weights(gpt_path): with gr.Blocks(title="GPT-SoVITS WebUI") as app:
if "" in gpt_path or "!" in gpt_path: gr.Markdown(
gpt_path = name2gpt_path[gpt_path] value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
tts_pipeline.init_t2s_weights(gpt_path) + "<br>"
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
gr.HTML(
top_html.format(
i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
),
elem_classes="markdown",
) )
with gr.Column(): with gr.Column():
@ -477,7 +500,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
inference_button, inference_button,
], ],
) # ) #
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
with gr.Group(): with gr.Group():
gr.Markdown( gr.Markdown(
@ -519,5 +542,5 @@ if __name__ == "__main__":
inbrowser=True, inbrowser=True,
share=is_share, share=is_share,
server_port=infer_ttswebui, server_port=infer_ttswebui,
# quiet=True, quiet=True,
) )

View File

@ -21,7 +21,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
3) computes spectrograms from audio files. 3) computes spectrograms from audio files.
""" """
def __init__(self, hparams, version=None, val=False): def __init__(self, hparams, val=False):
exp_dir = hparams.exp_dir exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir self.path4 = "%s/4-cnhubert" % exp_dir
@ -29,14 +29,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
assert os.path.exists(self.path2) assert os.path.exists(self.path2)
assert os.path.exists(self.path4) assert os.path.exists(self.path4)
assert os.path.exists(self.path5) assert os.path.exists(self.path5)
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
if self.is_v2Pro:
self.path7 = "%s/7-sv_cn" % exp_dir
assert os.path.exists(self.path7)
        names4 = set([name[:-3] for name in list(os.listdir(self.path4))])  # strip the .pt suffix
        names5 = set(os.listdir(self.path5))
        if self.is_v2Pro:
            names6 = set([name[:-3] for name in list(os.listdir(self.path7))])  # strip the .pt suffix
self.phoneme_data = {} self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f: with open(self.path2, "r", encoding="utf8") as f:
lines = f.read().strip("\n").split("\n") lines = f.read().strip("\n").split("\n")
@ -46,9 +40,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
if len(tmp) != 4: if len(tmp) != 4:
continue continue
self.phoneme_data[tmp[0]] = [tmp[1]] self.phoneme_data[tmp[0]] = [tmp[1]]
if self.is_v2Pro:
self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5 & names6)
else:
self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5) self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
tmp = self.audiopaths_sid_text tmp = self.audiopaths_sid_text
leng = len(tmp) leng = len(tmp)
@ -117,20 +109,13 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
typee = ssl.dtype typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False ssl.requires_grad = False
if self.is_v2Pro:
sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
except: except:
traceback.print_exc() traceback.print_exc()
spec = torch.zeros(1025, 100) spec = torch.zeros(1025, 100)
wav = torch.zeros(1, 100 * self.hop_length) wav = torch.zeros(1, 100 * self.hop_length)
ssl = torch.zeros(1, 768, 100) ssl = torch.zeros(1, 768, 100)
text = text[-1:] text = text[-1:]
if self.is_v2Pro:
sv_emb = torch.zeros(1, 20480)
print("load audio or ssl error!!!!!!", audiopath) print("load audio or ssl error!!!!!!", audiopath)
if self.is_v2Pro:
return (ssl, spec, wav, text, sv_emb)
else:
return (ssl, spec, wav, text) return (ssl, spec, wav, text)
def get_audio(self, filename): def get_audio(self, filename):
@ -192,9 +177,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
class TextAudioSpeakerCollate: class TextAudioSpeakerCollate:
"""Zero-pads model inputs and targets""" """Zero-pads model inputs and targets"""
def __init__(self, return_ids=False, version=None): def __init__(self, return_ids=False):
self.return_ids = return_ids self.return_ids = return_ids
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
def __call__(self, batch): def __call__(self, batch):
"""Collate's training batch from normalized text, audio and speaker identities """Collate's training batch from normalized text, audio and speaker identities
@ -227,9 +211,6 @@ class TextAudioSpeakerCollate:
ssl_padded.zero_() ssl_padded.zero_()
text_padded.zero_() text_padded.zero_()
if self.is_v2Pro:
sv_embs = torch.FloatTensor(len(batch), 20480)
for i in range(len(ids_sorted_decreasing)): for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]] row = batch[ids_sorted_decreasing[i]]
@ -249,31 +230,7 @@ class TextAudioSpeakerCollate:
text_padded[i, : text.size(0)] = text text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0) text_lengths[i] = text.size(0)
if self.is_v2Pro: return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
sv_embs[i] = row[4]
if self.is_v2Pro:
return (
ssl_padded,
ssl_lengths,
spec_padded,
spec_lengths,
wav_padded,
wav_lengths,
text_padded,
text_lengths,
sv_embs,
)
else:
return (
ssl_padded,
ssl_lengths,
spec_padded,
spec_lengths,
wav_padded,
wav_lengths,
text_padded,
text_lengths,
)
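
The collate classes above zero-pad every field to the longest item in the batch and, for v2Pro, carry the speaker-verification embeddings along. A generic sketch of the padding step for one variable-length tensor field (names and shapes are illustrative):

import torch

def pad_batch(tensors):
    """Zero-pad a list of (channels, length) tensors to the batch's max length."""
    max_len = max(t.size(1) for t in tensors)
    channels = tensors[0].size(0)
    padded = torch.zeros(len(tensors), channels, max_len)
    lengths = torch.LongTensor(len(tensors))
    for i, t in enumerate(tensors):
        padded[i, :, : t.size(1)] = t
        lengths[i] = t.size(1)
    return padded, lengths

if __name__ == "__main__":
    batch = [torch.randn(80, 120), torch.randn(80, 95), torch.randn(80, 140)]
    padded, lengths = pad_batch(batch)
    print(padded.shape, lengths.tolist())   # torch.Size([3, 80, 140]) [120, 95, 140]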
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
@ -513,7 +470,6 @@ class TextAudioSpeakerCollateV3:
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset): class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
""" """
1) loads audio, speaker_id, text pairs 1) loads audio, speaker_id, text pairs
@ -640,7 +596,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
) )
spec = torch.squeeze(spec, 0) spec = torch.squeeze(spec, 0)
spec1 = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False) spec1 = spectrogram_torch(audio_norm, 1280,32000, 320, 1280,center=False)
mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None) mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None)
mel = self.norm_spec(torch.squeeze(mel, 0)) mel = self.norm_spec(torch.squeeze(mel, 0))
return spec, mel return spec, mel
@ -687,7 +643,7 @@ class TextAudioSpeakerCollateV4:
mel_lengths = torch.LongTensor(len(batch)) mel_lengths = torch.LongTensor(len(batch))
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len * 2) mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len*2)
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
text_padded = torch.LongTensor(len(batch), max_text_len) text_padded = torch.LongTensor(len(batch), max_text_len)
# wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)

View File

@ -39,36 +39,24 @@ hann_window = {}
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.2:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.2:
        print("max value is ", torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    # wnsize_dtype_device = str(win_size) + '_' + dtype_device
    key = "%s-%s-%s-%s-%s" % (dtype_device, n_fft, sampling_rate, hop_size, win_size)
    # if wnsize_dtype_device not in hann_window:
    if key not in hann_window:
        # hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
        hann_window[key] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
    )
    y = y.squeeze(1)

    # spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=False,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8)
    return spec
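
spectrogram_torch reflect-pads the waveform, runs torch.stft with a Hann window cached per (dtype, device, n_fft, rates), and returns the magnitude. A compact sketch of the same computation without the cache, using the modern complex-output API (parameter values below are just examples):

import torch

def magnitude_spectrogram(y, n_fft=2048, hop_size=640, win_size=2048):
    """Reflect-pad, run an STFT with a Hann window, and return the magnitude."""
    window = torch.hann_window(win_size, dtype=y.dtype, device=y.device)
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect").squeeze(1)
    spec = torch.stft(
        y, n_fft, hop_length=hop_size, win_length=win_size, window=window,
        center=False, normalized=False, onesided=True, return_complex=True,
    )
    return torch.sqrt(spec.abs().pow(2) + 1e-8)   # magnitude with a small floor

if __name__ == "__main__":
    wav = torch.randn(1, 32000)                   # (batch, samples)
    print(magnitude_spectrogram(wav).shape)       # (batch, n_fft // 2 + 1, frames)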
@ -76,9 +64,9 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
global mel_basis global mel_basis
dtype_device = str(spec.dtype) + "_" + str(spec.device) dtype_device = str(spec.dtype) + '_' + str(spec.device)
# fmax_dtype_device = str(fmax) + '_' + dtype_device # fmax_dtype_device = str(fmax) + '_' + dtype_device
key = "%s-%s-%s-%s-%s-%s" % (dtype_device, n_fft, num_mels, sampling_rate, fmin, fmax) key = "%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, fmin, fmax)
# if fmax_dtype_device not in mel_basis: # if fmax_dtype_device not in mel_basis:
if key not in mel_basis: if key not in mel_basis:
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
@ -90,25 +78,17 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
return spec return spec
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
if torch.min(y) < -1.2: if torch.min(y) < -1.2:
print("min value is ", torch.min(y)) print('min value is ', torch.min(y))
if torch.max(y) > 1.2: if torch.max(y) > 1.2:
print("max value is ", torch.max(y)) print('max value is ', torch.max(y))
global mel_basis, hann_window global mel_basis, hann_window
dtype_device = str(y.dtype) + "_" + str(y.device) dtype_device = str(y.dtype) + '_' + str(y.device)
# fmax_dtype_device = str(fmax) + '_' + dtype_device # fmax_dtype_device = str(fmax) + '_' + dtype_device
fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s" % ( fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax)
dtype_device,
n_fft,
num_mels,
sampling_rate,
hop_size,
win_size,
fmin,
fmax,
)
# wnsize_dtype_device = str(win_size) + '_' + dtype_device # wnsize_dtype_device = str(win_size) + '_' + dtype_device
wnsize_dtype_device = fmax_dtype_device wnsize_dtype_device = fmax_dtype_device
if fmax_dtype_device not in mel_basis: if fmax_dtype_device not in mel_basis:
@ -117,23 +97,11 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
if wnsize_dtype_device not in hann_window: if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
y = torch.nn.functional.pad( y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
)
y = y.squeeze(1) y = y.squeeze(1)
spec = torch.stft( spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
y, center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
n_fft,
hop_length=hop_size,
win_length=win_size,
window=hann_window[wnsize_dtype_device],
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=False,
)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8)

View File

@ -414,8 +414,7 @@ class Generator(torch.nn.Module):
upsample_rates, upsample_rates,
upsample_initial_channel, upsample_initial_channel,
upsample_kernel_sizes, upsample_kernel_sizes,
gin_channels=0, gin_channels=0,is_bias=False,
is_bias=False,
): ):
super(Generator, self).__init__() super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes) self.num_kernels = len(resblock_kernel_sizes)
@ -587,15 +586,9 @@ class DiscriminatorS(torch.nn.Module):
return x, fmap return x, fmap
v2pro_set = {"v2Pro", "v2ProPlus"}
class MultiPeriodDiscriminator(torch.nn.Module): class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False, version=None): def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__() super(MultiPeriodDiscriminator, self).__init__()
if version in v2pro_set:
periods = [2, 3, 5, 7, 11, 17, 23]
else:
periods = [2, 3, 5, 7, 11] periods = [2, 3, 5, 7, 11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
@ -892,23 +885,12 @@ class SynthesizerTrn(nn.Module):
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
self.freeze_quantizer = freeze_quantizer self.freeze_quantizer = freeze_quantizer
self.is_v2pro = self.version in v2pro_set def forward(self, ssl, y, y_lengths, text, text_lengths):
if self.is_v2pro:
self.sv_emb = nn.Linear(20480, gin_channels)
self.ge_to512 = nn.Linear(gin_channels, 512)
self.prelu = nn.PReLU(num_parameters=gin_channels)
def forward(self, ssl, y, y_lengths, text, text_lengths, sv_emb=None):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
if self.version == "v1": if self.version == "v1":
ge = self.ref_enc(y * y_mask, y_mask) ge = self.ref_enc(y * y_mask, y_mask)
else: else:
ge = self.ref_enc(y[:, :704] * y_mask, y_mask) ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
if self.is_v2pro:
sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
ge += sv_emb.unsqueeze(-1)
ge = self.prelu(ge)
ge512 = self.ge_to512(ge.transpose(2, 1)).transpose(2, 1)
with autocast(enabled=False): with autocast(enabled=False):
maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext() maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
with maybe_no_grad: with maybe_no_grad:
@ -921,7 +903,7 @@ class SynthesizerTrn(nn.Module):
if self.semantic_frame_rate == "25hz": if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge512 if self.is_v2pro else ge) x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
z_p = self.flow(z, y_mask, g=ge) z_p = self.flow(z, y_mask, g=ge)
@ -958,8 +940,8 @@ class SynthesizerTrn(nn.Module):
return o, y_mask, (z, z_p, m_p, logs_p) return o, y_mask, (z, z_p, m_p, logs_p)
@torch.no_grad() @torch.no_grad()
def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None): def decode(self, codes, text, refer, noise_scale=0.5, speed=1):
def get_ge(refer, sv_emb): def get_ge(refer):
ge = None ge = None
if refer is not None: if refer is not None:
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
@ -968,20 +950,16 @@ class SynthesizerTrn(nn.Module):
ge = self.ref_enc(refer * refer_mask, refer_mask) ge = self.ref_enc(refer * refer_mask, refer_mask)
else: else:
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
if self.is_v2pro:
sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
ge += sv_emb.unsqueeze(-1)
ge = self.prelu(ge)
return ge return ge
if type(refer) == list: if type(refer) == list:
ges = [] ges = []
for idx, _refer in enumerate(refer): for _refer in refer:
ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None) ge = get_ge(_refer)
ges.append(ge) ges.append(ge)
ge = torch.stack(ges, 0).mean(0) ge = torch.stack(ges, 0).mean(0)
else: else:
ge = get_ge(refer, sv_emb) ge = get_ge(refer)
y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
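
For v2Pro, the reference embedding ge is augmented with a speaker-verification vector: a Linear layer projects the 20480-dim sv embedding to the gin channels, the result is added to ge, and a PReLU is applied; with several reference clips the per-clip embeddings are simply averaged. A minimal sketch of that fusion with made-up dimensions (512 gin channels for the demo; the class name is not from the repository):

import torch
import torch.nn as nn

class RefFusion(nn.Module):
    """Add a projected speaker-verification vector to a reference embedding."""
    def __init__(self, sv_dim=20480, gin_channels=512):
        super().__init__()
        self.sv_proj = nn.Linear(sv_dim, gin_channels)
        self.prelu = nn.PReLU(num_parameters=gin_channels)

    def forward(self, ge, sv_emb):
        # ge: (B, gin_channels, 1), sv_emb: (B, sv_dim)
        ge = ge + self.sv_proj(sv_emb).unsqueeze(-1)
        return self.prelu(ge)

if __name__ == "__main__":
    fusion = RefFusion()
    refs = [torch.randn(1, 512, 1) for _ in range(3)]   # three reference clips
    sv = torch.randn(1, 20480)
    fused = [fusion(ge, sv) for ge in refs]
    ge = torch.stack(fused, 0).mean(0)                  # average over references
    print(ge.shape)                                      # torch.Size([1, 512, 1])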
@ -989,14 +967,7 @@ class SynthesizerTrn(nn.Module):
quantized = self.quantizer.decode(codes) quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == "25hz": if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
x, m_p, logs_p, y_mask = self.enc_p( x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
quantized,
y_lengths,
text,
text_lengths,
self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
speed,
)
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=ge, reverse=True) z = self.flow(z_p, y_mask, g=ge, reverse=True)
@ -1021,8 +992,6 @@ class CFM(torch.nn.Module):
self.criterion = torch.nn.MSELoss() self.criterion = torch.nn.MSELoss()
self.use_conditioner_cache = True
@torch.inference_mode() @torch.inference_mode()
def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0): def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0):
"""Forward diffusion""" """Forward diffusion"""
@ -1035,33 +1004,15 @@ class CFM(torch.nn.Module):
mu = mu.transpose(2, 1) mu = mu.transpose(2, 1)
t = 0 t = 0
d = 1 / n_timesteps d = 1 / n_timesteps
text_cache = None
text_cfg_cache = None
dt_cache = None
d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
for j in range(n_timesteps): for j in range(n_timesteps):
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
# v_pred = model(x, t_tensor, d_tensor, **extra_args) # v_pred = model(x, t_tensor, d_tensor, **extra_args)
v_pred, text_emb, dt = self.estimator( v_pred = self.estimator(
x, x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False
prompt_x, ).transpose(2, 1)
x_lens,
t_tensor,
d_tensor,
mu,
use_grad_ckpt=False,
drop_audio_cond=False,
drop_text=False,
infer=True,
text_cache=text_cache,
dt_cache=dt_cache,
)
v_pred = v_pred.transpose(2, 1)
if self.use_conditioner_cache:
text_cache = text_emb
dt_cache = dt
if inference_cfg_rate > 1e-5: if inference_cfg_rate > 1e-5:
neg, text_cfg_emb, _ = self.estimator( neg = self.estimator(
x, x,
prompt_x, prompt_x,
x_lens, x_lens,
@ -1071,13 +1022,7 @@ class CFM(torch.nn.Module):
use_grad_ckpt=False, use_grad_ckpt=False,
drop_audio_cond=True, drop_audio_cond=True,
drop_text=True, drop_text=True,
infer=True, ).transpose(2, 1)
text_cache=text_cfg_cache,
dt_cache=dt_cache,
)
neg = neg.transpose(2, 1)
if self.use_conditioner_cache:
text_cfg_cache = text_cfg_emb
v_pred = v_pred + (v_pred - neg) * inference_cfg_rate v_pred = v_pred + (v_pred - neg) * inference_cfg_rate
x = x + d * v_pred x = x + d * v_pred
t = t + d t = t + d
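
The CFM inference loop above is a fixed-step Euler integration: at each step the estimator predicts a velocity, an unconditional prediction is blended in when inference_cfg_rate > 0, and x advances by d * v. A toy sketch of that loop with a dummy velocity function standing in for the estimator (all names and values are illustrative):

import torch

def euler_cfg_sample(x, velocity_fn, n_timesteps=8, cfg_rate=0.5):
    """Fixed-step Euler sampler with a simple classifier-free-guidance blend."""
    d = 1.0 / n_timesteps
    t = 0.0
    for _ in range(n_timesteps):
        v_cond = velocity_fn(x, t, conditioned=True)
        if cfg_rate > 1e-5:
            v_uncond = velocity_fn(x, t, conditioned=False)
            v = v_cond + (v_cond - v_uncond) * cfg_rate   # push away from unconditional
        else:
            v = v_cond
        x = x + d * v
        t += d
    return x

if __name__ == "__main__":
    # dummy "estimator": pull x toward zero, slightly less so when unconditional
    def velocity_fn(x, t, conditioned=True):
        return -x if conditioned else -0.8 * x

    x0 = torch.randn(2, 100, 16)
    print(euler_cfg_sample(x0, velocity_fn).abs().mean())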
@ -1228,7 +1173,7 @@ class SynthesizerTrnV3(nn.Module):
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
fea = self.bridge(x) fea = self.bridge(x)
fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest") ##BCT
fea, y_mask_ = self.wns1( fea, y_mask_ = self.wns1(
fea, mel_lengths, ge fea, mel_lengths, ge
) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate. ) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
@ -1251,9 +1196,9 @@ class SynthesizerTrnV3(nn.Module):
            ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
        y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
        if speed == 1:
-            sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4))
+            sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4))
        else:
-            sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4) / speed) + 1
+            sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4) / speed) + 1
        y_lengths1 = torch.LongTensor([sizee]).to(codes.device)
        text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
@ -1262,7 +1207,7 @@ class SynthesizerTrnV3(nn.Module):
        quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")  ##BCT
        x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
        fea = self.bridge(x)
-        fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest")  ##BCT
+        fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest")  ##BCT
        ####more wn paramter to learn mel
        fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
        return fea, ge

View File

@ -391,7 +391,6 @@ class Generator(torch.nn.Module):
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
-        is_bias=False,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@ -419,7 +418,7 @@ class Generator(torch.nn.Module):
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

-        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=is_bias)
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
@ -763,9 +762,6 @@ class CodePredictor(nn.Module):
        return pred_codes.transpose(0, 1)

-v2pro_set = {"v2Pro", "v2ProPlus"}

class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
@ -870,32 +866,19 @@ class SynthesizerTrn(nn.Module):
        # self.enc_p.text_embedding.requires_grad_(False)
        # self.enc_p.encoder_text.requires_grad_(False)
        # self.enc_p.mrte.requires_grad_(False)
-        self.is_v2pro = self.version in v2pro_set
-        if self.is_v2pro:
-            self.sv_emb = nn.Linear(20480, gin_channels)
-            self.ge_to512 = nn.Linear(gin_channels, 512)
-            self.prelu = nn.PReLU(num_parameters=gin_channels)

-    def forward(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None):
+    def forward(self, codes, text, refer, noise_scale=0.5, speed=1):
        refer_mask = torch.ones_like(refer[:1, :1, :])
        if self.version == "v1":
            ge = self.ref_enc(refer * refer_mask, refer_mask)
        else:
            ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
-        if self.is_v2pro:
-            sv_emb = self.sv_emb(sv_emb)
-            ge += sv_emb.unsqueeze(-1)
-            ge = self.prelu(ge)
        quantized = self.quantizer.decode(codes)
        if self.semantic_frame_rate == "25hz":
            dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0)
            quantized = dquantized.contiguous().view(1, self.ssl_dim, -1)
-        if self.is_v2pro:
-            ge_ = self.ge_to512(ge.transpose(2, 1)).transpose(2, 1)
-            x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge_, speed)
-        else:
-            x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed)
+        x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed)
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
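As a rough illustration of what the v2Pro branch in this diff adds, the sketch below isolates the conditioning path: a 20480-dim speaker-verification embedding is projected to the reference-encoder channel count, added to the reference embedding, passed through a PReLU, and a 512-dim view is produced for the text/semantic encoder. It is a simplified stand-in, not the project's module; `gin_channels=512` and the tensor shapes are assumptions based on the lines above.

```python
import torch
from torch import nn

class V2ProConditioning(nn.Module):
    # Minimal sketch of the main-branch v2Pro conditioning path (shapes assumed).
    def __init__(self, gin_channels=512, sv_dim=20480):
        super().__init__()
        self.sv_emb = nn.Linear(sv_dim, gin_channels)
        self.prelu = nn.PReLU(num_parameters=gin_channels)
        self.ge_to512 = nn.Linear(gin_channels, 512)

    def forward(self, ge, sv_emb):
        # ge: (B, gin_channels, 1) reference embedding; sv_emb: (B, sv_dim) speaker-verification vector
        ge = ge + self.sv_emb(sv_emb).unsqueeze(-1)
        ge = self.prelu(ge)
        # 512-dim projection of the conditioning, as fed to the encoder in the v2Pro branch
        ge_512 = self.ge_to512(ge.transpose(2, 1)).transpose(2, 1)
        return ge, ge_512
```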

View File

@ -1,5 +1,4 @@
import math
import numpy as np
import torch
from torch import nn
@ -719,10 +718,8 @@ class MelStyleEncoder(nn.Module):
        else:
            len_ = (~mask).sum(dim=1).unsqueeze(1)
            x = x.masked_fill(mask.unsqueeze(-1), 0)
-            dtype = x.dtype
-            x = x.float()
-            x = torch.div(x, len_.unsqueeze(1))
-            out = x.sum(dim=1).to(dtype)
+            x = x.sum(dim=1)
+            out = torch.div(x, len_)
        return out

    def forward(self, x, mask=None):
@ -746,6 +743,7 @@ class MelStyleEncoder(nn.Module):
        x = self.fc(x)
        # temoral average pooling
        w = self.temporal_avg_pool(x, mask=mask)
        return w.unsqueeze(-1)
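The hunk above changes how `temporal_avg_pool` averages over unmasked frames (the main side computes in float32 and divides before summing). A standalone sketch of masked mean pooling in that style might look like the following; the tensor shapes are assumptions:

```python
import torch

def masked_temporal_avg_pool(x, mask=None):
    # x: (B, T, C); mask: (B, T) with True marking padded frames
    if mask is None:
        return x.mean(dim=1)
    lengths = (~mask).sum(dim=1).unsqueeze(1)            # (B, 1) count of valid frames
    x = x.masked_fill(mask.unsqueeze(-1), 0)
    dtype = x.dtype
    x = x.float()                                        # accumulate in float32 for stability
    out = torch.div(x, lengths.unsqueeze(1)).sum(dim=1)  # divide first, then sum over time
    return out.to(dtype)
```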

View File

@ -1,115 +0,0 @@
# -*- coding: utf-8 -*-
import sys
import os
inp_text = os.environ.get("inp_text")
inp_wav_dir = os.environ.get("inp_wav_dir")
exp_name = os.environ.get("exp_name")
i_part = os.environ.get("i_part")
all_parts = os.environ.get("all_parts")
if "_CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
opt_dir = os.environ.get("opt_dir")
sv_path = os.environ.get("sv_path")
import torch
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
import traceback
import torchaudio
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
from tools.my_utils import clean_path
from time import time as ttime
import shutil
from ERes2NetV2 import ERes2NetV2
import kaldi as Kaldi
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
dir = os.path.dirname(path)
name = os.path.basename(path)
# tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
tmp_path = "%s%s.pth" % (ttime(), i_part)
torch.save(fea, tmp_path)
shutil.move(tmp_path, "%s/%s" % (dir, name))
sv_cn_dir = "%s/7-sv_cn" % (opt_dir)
wav32dir = "%s/5-wav32k" % (opt_dir)
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(sv_cn_dir, exist_ok=True)
os.makedirs(wav32dir, exist_ok=True)
maxx = 0.95
alpha = 0.5
if torch.cuda.is_available():
device = "cuda:0"
# elif torch.backends.mps.is_available():
# device = "mps"
else:
device = "cpu"
class SV:
def __init__(self, device, is_half):
pretrained_state = torch.load(sv_path, map_location="cpu")
embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
embedding_model.load_state_dict(pretrained_state)
embedding_model.eval()
self.embedding_model = embedding_model
self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
if is_half == False:
self.embedding_model = self.embedding_model.to(device)
else:
self.embedding_model = self.embedding_model.half().to(device)
self.is_half = is_half
def compute_embedding3(self, wav): # (1,x)#-1~1
with torch.no_grad():
wav = self.res(wav)
if self.is_half == True:
wav = wav.half()
feat = torch.stack(
[Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
)
sv_emb = self.embedding_model.forward3(feat)
return sv_emb
sv = SV(device, is_half)
def name2go(wav_name, wav_path):
sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
if os.path.exists(sv_cn_path):
return
wav_path = "%s/%s" % (wav32dir, wav_name)
wav32k, sr0 = torchaudio.load(wav_path)
assert sr0 == 32000
wav32k = wav32k.to(device)
emb = sv.compute_embedding3(wav32k).cpu() # torch.Size([1, 20480])
my_save(emb, sv_cn_path)
with open(inp_text, "r", encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines[int(i_part) :: int(all_parts)]:
try:
wav_name, spk_name, language, text = line.split("|")
wav_name = clean_path(wav_name)
if inp_wav_dir != "" and inp_wav_dir != None:
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s" % (inp_wav_dir, wav_name)
else:
wav_path = wav_name
wav_name = os.path.basename(wav_name)
name2go(wav_name, wav_path)
except:
print(line, traceback.format_exc())

View File

@ -17,28 +17,29 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
    shutil.move(tmp_path, "%s/%s" % (dir, name))

+"""
+00:v1
+01:v2
+02:v3
+03:v3lora
+04:v4lora
+"""
from io import BytesIO

-model_version2byte = {
-    "v3": b"03",
-    "v4": b"04",
-    "v2Pro": b"05",
-    "v2ProPlus": b"06",
-}

-def my_save2(fea, path, model_version):
+def my_save2(fea, path,cfm_version):
    bio = BytesIO()
    torch.save(fea, bio)
    bio.seek(0)
    data = bio.getvalue()
-    byte = model_version2byte[model_version]
+    byte=b"03" if cfm_version=="v3"else b"04"
    data = byte + data[2:]
    with open(path, "wb") as f:
        f.write(data)
-def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
+def savee(ckpt, name, epoch, steps, hps, cfm_version=None,lora_rank=None):
    try:
        opt = OrderedDict()
        opt["weight"] = {}
@ -50,9 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
opt["info"] = "%sepoch_%siteration" % (epoch, steps) opt["info"] = "%sepoch_%siteration" % (epoch, steps)
if lora_rank: if lora_rank:
opt["lora_rank"] = lora_rank opt["lora_rank"] = lora_rank
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version) my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name),cfm_version)
elif model_version != None and "Pro" in model_version:
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
else: else:
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
return "Success." return "Success."
@ -60,31 +59,18 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
        return traceback.format_exc()

-"""
-00:v1
-01:v2
-02:v3
-03:v3lora
-04:v4lora
-05:v2Pro
-06:v2ProPlus
-"""

head2version = {
    b"00": ["v1", "v1", False],
    b"01": ["v2", "v2", False],
    b"02": ["v2", "v3", False],
    b"03": ["v2", "v3", True],
    b"04": ["v2", "v4", True],
-    b"05": ["v2", "v2Pro", False],
-    b"06": ["v2", "v2ProPlus", False],
}
hash_pretrained_dict = {
    "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False],  # s2G488k.pth#sovits_v1_pretrained
    "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False],  # s2Gv3.pth#sovits_v3_pretrained
    "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False],  # s2G2333K.pth#sovits_v2_pretrained
    "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False],  # s2Gv4.pth#sovits_v4_pretrained
-    "c7e9fce2223f3db685cdfa1e6368728a": ["v2", "v2Pro", False],  # s2Gv2Pro.pth#sovits_v2Pro_pretrained
-    "66b313e39455b57ab1b0bc0b239c9d0a": ["v2", "v2ProPlus", False],  # s2Gv2ProPlus.pth#sovits_v2ProPlus_pretrained
}
import hashlib
@ -129,7 +115,7 @@ def get_sovits_version_from_path_fast(sovits_path):
def load_sovits_new(sovits_path):
    f = open(sovits_path, "rb")
    meta = f.read(2)
-    if meta != b"PK":
+    if meta != "PK":
        data = b"PK" + f.read()
        bio = BytesIO()
        bio.write(data)
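To make the two-byte version header scheme above concrete, here is a minimal sketch of how a tagged checkpoint can be written and identified. The `head2version` table is the one from this diff; the helper names, the default tag, and the file paths are placeholders, not the project's API.

```python
from io import BytesIO
import torch

head2version = {
    b"00": ("v1", "v1", False),
    b"01": ("v2", "v2", False),
    b"02": ("v2", "v3", False),
    b"03": ("v2", "v3", True),
    b"04": ("v2", "v4", True),
    b"05": ("v2", "v2Pro", False),
    b"06": ("v2", "v2ProPlus", False),
}

def save_tagged(state, path, version_byte=b"03"):
    # torch.save writes a zip archive starting with b"PK"; replace those two bytes with the tag.
    bio = BytesIO()
    torch.save(state, bio)
    data = version_byte + bio.getvalue()[2:]
    with open(path, "wb") as f:
        f.write(data)

def load_tagged(path):
    with open(path, "rb") as f:
        meta = f.read(2)
        if meta == b"PK":
            # untagged, plain checkpoint
            return None, torch.load(path, map_location="cpu", weights_only=False)
        data = b"PK" + f.read()  # restore the zip magic before loading
    version = head2version.get(meta)
    return version, torch.load(BytesIO(data), map_location="cpu", weights_only=False)
```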

View File

@ -87,7 +87,7 @@ def run(rank, n_gpus, hps):
    if torch.cuda.is_available():
        torch.cuda.set_device(rank)

-    train_dataset = TextAudioSpeakerLoader(hps.data, version=hps.model.version)
+    train_dataset = TextAudioSpeakerLoader(hps.data)  ########
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size,
@ -115,10 +115,10 @@ def run(rank, n_gpus, hps):
        rank=rank,
        shuffle=True,
    )
-    collate_fn = TextAudioSpeakerCollate(version=hps.model.version)
+    collate_fn = TextAudioSpeakerCollate()
    train_loader = DataLoader(
        train_dataset,
-        num_workers=5,
+        num_workers=6,
        shuffle=False,
        pin_memory=True,
        collate_fn=collate_fn,
@ -149,9 +149,9 @@ def run(rank, n_gpus, hps):
    )
    net_d = (
-        MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).cuda(rank)
+        MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
        if torch.cuda.is_available()
-        else MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).to(device)
+        else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device)
    )
    for name, param in net_g.named_parameters():
        if not param.requires_grad:
@ -235,12 +235,12 @@ def run(rank, n_gpus, hps):
            print(
                "loaded pretrained %s" % hps.train.pretrained_s2G,
                net_g.module.load_state_dict(
-                    torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
+                    torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
                    strict=False,
                )
                if torch.cuda.is_available()
                else net_g.load_state_dict(
-                    torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
+                    torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
                    strict=False,
                ),
            )  ##测试不加载优化器
@ -254,11 +254,11 @@ def run(rank, n_gpus, hps):
            print(
                "loaded pretrained %s" % hps.train.pretrained_s2D,
                net_d.module.load_state_dict(
-                    torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"], strict=False
+                    torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"],
                )
                if torch.cuda.is_available()
                else net_d.load_state_dict(
-                    torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],
+                    torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"],
                ),
            )
@ -328,11 +328,16 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
    net_g.train()
    net_d.train()
-    for batch_idx, data in enumerate(tqdm(train_loader)):
-        if hps.model.version in {"v2Pro", "v2ProPlus"}:
-            ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths, sv_emb = data
-        else:
-            ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths = data
+    for batch_idx, (
+        ssl,
+        ssl_lengths,
+        spec,
+        spec_lengths,
+        y,
+        y_lengths,
+        text,
+        text_lengths,
+    ) in enumerate(tqdm(train_loader)):
        if torch.cuda.is_available():
            spec, spec_lengths = (
                spec.cuda(
@ -367,8 +372,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
                    non_blocking=True,
                ),
            )
-            if hps.model.version in {"v2Pro", "v2ProPlus"}:
-                sv_emb = sv_emb.cuda(rank, non_blocking=True)
        else:
            spec, spec_lengths = spec.to(device), spec_lengths.to(device)
            y, y_lengths = y.to(device), y_lengths.to(device)
@ -376,14 +379,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
        ssl.requires_grad = False
        # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
        text, text_lengths = text.to(device), text_lengths.to(device)
-        if hps.model.version in {"v2Pro", "v2ProPlus"}:
-            sv_emb = sv_emb.to(device)

        with autocast(enabled=hps.train.fp16_run):
-            if hps.model.version in {"v2Pro", "v2ProPlus"}:
-                (y_hat, kl_ssl, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), stats_ssl) = net_g(
-                    ssl, spec, spec_lengths, text, text_lengths, sv_emb
-                )
-            else:
            (
                y_hat,
                kl_ssl,
@ -570,7 +567,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
                        epoch,
                        global_step,
                        hps,
-                        model_version=None if hps.model.version not in {"v2Pro", "v2ProPlus"} else hps.model.version,
                    ),
                )
            )

View File

@ -204,12 +204,12 @@ def run(rank, n_gpus, hps):
            print(
                "loaded pretrained %s" % hps.train.pretrained_s2G,
                net_g.module.load_state_dict(
-                    torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
+                    torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
                    strict=False,
                )
                if torch.cuda.is_available()
                else net_g.load_state_dict(
-                    torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
+                    torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
                    strict=False,
                ),
            )  ##测试不加载优化器

View File

@ -31,6 +31,7 @@ from module.data_utils import (
    TextAudioSpeakerLoaderV3,
    TextAudioSpeakerCollateV4,
    TextAudioSpeakerLoaderV4,
)
from module.models import (
    SynthesizerTrnV3 as SynthesizerTrn,
@ -87,8 +88,8 @@ def run(rank, n_gpus, hps):
    if torch.cuda.is_available():
        torch.cuda.set_device(rank)
-    TextAudioSpeakerLoader = TextAudioSpeakerLoaderV3 if hps.model.version == "v3" else TextAudioSpeakerLoaderV4
-    TextAudioSpeakerCollate = TextAudioSpeakerCollateV3 if hps.model.version == "v3" else TextAudioSpeakerCollateV4
+    TextAudioSpeakerLoader=TextAudioSpeakerLoaderV3 if hps.model.version=="v3"else TextAudioSpeakerLoaderV4
+    TextAudioSpeakerCollate=TextAudioSpeakerCollateV3 if hps.model.version=="v3"else TextAudioSpeakerCollateV4
    train_dataset = TextAudioSpeakerLoader(hps.data)  ########
    train_sampler = DistributedBucketSampler(
        train_dataset,
@ -189,7 +190,7 @@ def run(rank, n_gpus, hps):
            print(
                "loaded pretrained %s" % hps.train.pretrained_s2G,
                net_g.load_state_dict(
-                    torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
+                    torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
                    strict=False,
                ),
            )
@ -364,8 +365,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank), hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank),
epoch, epoch,
global_step, global_step,
hps, hps,cfm_version=hps.model.version,
model_version=hps.model.version,
lora_rank=lora_rank, lora_rank=lora_rank,
), ),
) )

View File

@ -1,32 +0,0 @@
import sys
import os
import torch
sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net")
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
from ERes2NetV2 import ERes2NetV2
import kaldi as Kaldi
class SV:
def __init__(self, device, is_half):
pretrained_state = torch.load(sv_path, map_location="cpu", weights_only=False)
embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
embedding_model.load_state_dict(pretrained_state)
embedding_model.eval()
self.embedding_model = embedding_model
if is_half == False:
self.embedding_model = self.embedding_model.to(device)
else:
self.embedding_model = self.embedding_model.half().to(device)
self.is_half = is_half
def compute_embedding3(self, wav):
with torch.no_grad():
if self.is_half == True:
wav = wav.half()
feat = torch.stack(
[Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
)
sv_emb = self.embedding_model.forward3(feat)
return sv_emb

View File

@ -3,19 +3,25 @@ import re
# jieba静音
import jieba

jieba.setLogLevel(logging.CRITICAL)

# 更改fast_langdetect大模型位置
from pathlib import Path
import fast_langdetect

-fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
+fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
+    fast_langdetect.infer.LangDetectConfig(
+        cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
+    )
+)

from split_lang import LangSplitter


def full_en(text):
-    pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
+    pattern = r"^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
    return bool(re.match(pattern, text))
@ -34,7 +40,7 @@ def full_cjk(text):
        (0x2EBF0, 0x2EE5D),  # CJK Extension H
    ]
-    pattern = r'[0-9、-〜。!?.!?… /]+$'
+    pattern = r"[0-9、-〜。!?.!?… ]+$"
    cjk_text = ""
    for char in text:
@ -45,7 +51,7 @@ def full_cjk(text):
    return cjk_text


-def split_jako(tag_lang,item):
+def split_jako(tag_lang, item):
    if tag_lang == "ja":
        pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
    else:
@ -53,28 +59,28 @@ def split_jako(tag_lang,item):
    lang_list: list[dict] = []
    tag = 0
-    for match in re.finditer(pattern, item['text']):
+    for match in re.finditer(pattern, item["text"]):
        if match.start() > tag:
-            lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
+            lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
        tag = match.end()
-        lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
-    if tag < len(item['text']):
-        lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
+        lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
+    if tag < len(item["text"]):
+        lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
    return lang_list


def merge_lang(lang_list, item):
-    if lang_list and item['lang'] == lang_list[-1]['lang']:
-        lang_list[-1]['text'] += item['text']
+    if lang_list and item["lang"] == lang_list[-1]["lang"]:
+        lang_list[-1]["text"] += item["text"]
    else:
        lang_list.append(item)
    return lang_list


-class LangSegmenter():
+class LangSegmenter:
    # 默认过滤器, 基于gsv目前四种语言
    DEFAULT_LANG_MAP = {
        "zh": "zh",
@ -87,41 +93,25 @@ class LangSegmenter():
"en": "en", "en": "en",
} }
def getTexts(text,default_lang = ""): def getTexts(text):
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP) lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
lang_splitter.merge_across_digit = False
substr = lang_splitter.split_by_lang(text=text) substr = lang_splitter.split_by_lang(text=text)
lang_list: list[dict] = [] lang_list: list[dict] = []
have_num = False
for _, item in enumerate(substr): for _, item in enumerate(substr):
dict_item = {'lang':item.lang,'text':item.text} dict_item = {"lang": item.lang, "text": item.text}
if dict_item['lang'] == 'digit':
if default_lang != "":
dict_item['lang'] = default_lang
else:
have_num = True
lang_list = merge_lang(lang_list,dict_item)
continue
# 处理短英文被识别为其他语言的问题 # 处理短英文被识别为其他语言的问题
if full_en(dict_item['text']): if full_en(dict_item["text"]):
dict_item['lang'] = 'en' dict_item["lang"] = "en"
lang_list = merge_lang(lang_list,dict_item) lang_list = merge_lang(lang_list, dict_item)
continue continue
if default_lang != "":
dict_item['lang'] = default_lang
lang_list = merge_lang(lang_list,dict_item)
continue
else:
# 处理非日语夹日文的问题(不包含CJK) # 处理非日语夹日文的问题(不包含CJK)
ja_list: list[dict] = [] ja_list: list[dict] = []
if dict_item['lang'] != 'ja': if dict_item["lang"] != "ja":
ja_list = split_jako('ja',dict_item) ja_list = split_jako("ja", dict_item)
if not ja_list: if not ja_list:
ja_list.append(dict_item) ja_list.append(dict_item)
@ -130,8 +120,8 @@ class LangSegmenter():
ko_list: list[dict] = [] ko_list: list[dict] = []
temp_list: list[dict] = [] temp_list: list[dict] = []
for _, ko_item in enumerate(ja_list): for _, ko_item in enumerate(ja_list):
if ko_item["lang"] != 'ko': if ko_item["lang"] != "ko":
ko_list = split_jako('ko',ko_item) ko_list = split_jako("ko", ko_item)
if ko_list: if ko_list:
temp_list.extend(ko_list) temp_list.extend(ko_list)
@ -141,75 +131,26 @@ class LangSegmenter():
# 未存在非日韩文夹日韩文 # 未存在非日韩文夹日韩文
if len(temp_list) == 1: if len(temp_list) == 1:
# 未知语言检查是否为CJK # 未知语言检查是否为CJK
if dict_item['lang'] == 'x': if dict_item["lang"] == "x":
cjk_text = full_cjk(dict_item['text']) cjk_text = full_cjk(dict_item["text"])
if cjk_text: if cjk_text:
dict_item = {'lang':'zh','text':cjk_text} dict_item = {"lang": "zh", "text": cjk_text}
lang_list = merge_lang(lang_list,dict_item) lang_list = merge_lang(lang_list, dict_item)
else:
lang_list = merge_lang(lang_list,dict_item)
continue continue
else: else:
lang_list = merge_lang(lang_list,dict_item) lang_list = merge_lang(lang_list, dict_item)
continue continue
# 存在非日韩文夹日韩文 # 存在非日韩文夹日韩文
for _, temp_item in enumerate(temp_list): for _, temp_item in enumerate(temp_list):
# 未知语言检查是否为CJK # 未知语言检查是否为CJK
if temp_item['lang'] == 'x': if temp_item["lang"] == "x":
cjk_text = full_cjk(temp_item['text']) cjk_text = full_cjk(dict_item["text"])
if cjk_text: if cjk_text:
lang_list = merge_lang(lang_list,{'lang':'zh','text':cjk_text}) dict_item = {"lang": "zh", "text": cjk_text}
lang_list = merge_lang(lang_list, dict_item)
else: else:
lang_list = merge_lang(lang_list,temp_item) lang_list = merge_lang(lang_list, temp_item)
else:
lang_list = merge_lang(lang_list,temp_item)
# 有数字
if have_num:
temp_list = lang_list
lang_list = []
for i, temp_item in enumerate(temp_list):
if temp_item['lang'] == 'digit':
if default_lang:
temp_item['lang'] = default_lang
elif lang_list and i == len(temp_list) - 1:
temp_item['lang'] = lang_list[-1]['lang']
elif not lang_list and i < len(temp_list) - 1:
temp_item['lang'] = temp_list[1]['lang']
elif lang_list and i < len(temp_list) - 1:
if lang_list[-1]['lang'] == temp_list[i + 1]['lang']:
temp_item['lang'] = lang_list[-1]['lang']
elif lang_list[-1]['text'][-1] in [",",".","!","?","","","",""]:
temp_item['lang'] = temp_list[i + 1]['lang']
elif temp_list[i + 1]['text'][0] in [",",".","!","?","","","",""]:
temp_item['lang'] = lang_list[-1]['lang']
elif temp_item['text'][-1] in ["","."]:
temp_item['lang'] = lang_list[-1]['lang']
elif len(lang_list[-1]['text']) >= len(temp_list[i + 1]['text']):
temp_item['lang'] = lang_list[-1]['lang']
else:
temp_item['lang'] = temp_list[i + 1]['lang']
else:
temp_item['lang'] = 'zh'
lang_list = merge_lang(lang_list,temp_item)
# 筛X
temp_list = lang_list
lang_list = []
for _, temp_item in enumerate(temp_list):
if temp_item['lang'] == 'x':
if lang_list:
temp_item['lang'] = lang_list[-1]['lang']
elif len(temp_list) > 1:
temp_item['lang'] = temp_list[1]['lang']
else:
temp_item['lang'] = 'zh'
lang_list = merge_lang(lang_list,temp_item)
return lang_list return lang_list
@ -219,7 +160,3 @@ if __name__ == "__main__":
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。" text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
print(LangSegmenter.getTexts(text)) print(LangSegmenter.getTexts(text))
text = "当时ThinkPad T60刚刚发布一同推出的还有一款名为Advanced Dock的扩展坞配件。这款扩展坞通过连接T60底部的插槽扩展出包括PCIe在内的一大堆接口并且自带电源让T60可以安装桌面显卡来提升性能。"
print(LangSegmenter.getTexts(text,"zh"))
print(LangSegmenter.getTexts(text))

View File

@ -181,6 +181,20 @@ def text_normalize(text):
return dest_text return dest_text
# 不排除英文的文本格式化
def mix_text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation_with_en(sentence)
# 避免重复标点引起的参考泄露
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
if __name__ == "__main__": if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?" text = "呣呣呣~就是…大人的鼹鼠党吧?"

View File

@ -326,6 +326,20 @@ def text_normalize(text):
return dest_text return dest_text
# 不排除英文的文本格式化
def mix_text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation_with_en(sentence)
# 避免重复标点引起的参考泄露
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
if __name__ == "__main__": if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?" text = "呣呣呣~就是…大人的鼹鼠党吧?"

View File

@ -1,32 +1,27 @@
# This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw # This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw
# This code is modified from https://github.com/GitYCC/g2pW # This code is modified from https://github.com/GitYCC/g2pW
-import warnings
-
-warnings.filterwarnings("ignore")
-
import json
import os
+import warnings
import zipfile
from typing import Any, Dict, List, Tuple

import numpy as np
import onnxruntime
import requests
-import torch
-
-onnxruntime.set_default_logger_severity(3)
from opencc import OpenCC
from pypinyin import Style, pinyin
-from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers import AutoTokenizer

from ..zh_normalization.char_convert import tranditional_to_simplified
from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input
from .utils import load_config

-try:
-    onnxruntime.preload_dlls()
-except:
-    pass
-    # traceback.print_exc()
+onnxruntime.set_default_logger_severity(3)
+
+warnings.filterwarnings("ignore")

model_version = "1.1"
@ -92,14 +87,14 @@ class G2PWOnnxConverter:
        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
-        sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
-        if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
+        sess_options.intra_op_num_threads = 2
+        try:
            self.session_g2pW = onnxruntime.InferenceSession(
                os.path.join(uncompress_path, "g2pW.onnx"),
                sess_options=sess_options,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
-        else:
+        except:
            self.session_g2pW = onnxruntime.InferenceSession(
                os.path.join(uncompress_path, "g2pW.onnx"),
                sess_options=sess_options,

View File

@ -650,25 +650,32 @@ class ToneSandhi:
    # output seg: [['听一听', 'v']]
    def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        new_seg = []
-        i = 0
        # function 1
-        while i < len(seg):
-            word, pos = seg[i]
-            merged = False
-            if i - 1 >= 0 and word == "一" and i + 1 < len(seg):
-                last = new_seg[-1] if new_seg else seg[i - 1]
-                if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
-                    combined = last[0] + "一" + seg[i + 1][0]
-                    new_seg[-1] = [combined, last[1]]
-                    i += 2
-                    merged = True
-            if not merged:
-                new_seg.append([word, pos])
-                i += 1
+        for i, (word, pos) in enumerate(seg):
+            if (
+                i - 1 >= 0
+                and word == "一"
+                and i + 1 < len(seg)
+                and seg[i - 1][0] == seg[i + 1][0]
+                and seg[i - 1][1] == "v"
+                and seg[i + 1][1] == "v"
+            ):
+                new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
+            else:
+                if (
+                    i - 2 >= 0
+                    and seg[i - 1][0] == "一"
+                    and seg[i - 2][0] == word
+                    and pos == "v"
+                    and seg[i - 2][1] == "v"
+                ):
+                    continue
+                else:
+                    new_seg.append([word, pos])
        seg = new_seg
        new_seg = []
        # function 2
-        for word, pos in seg:
+        for i, (word, pos) in enumerate(seg):
            if new_seg and new_seg[-1][0] == "一":
                new_seg[-1][0] = new_seg[-1][0] + word
            else:

View File

@ -256,24 +256,6 @@ def replace_to_range(match) -> str:
    return result
RE_VERSION_NUM = re.compile(r"((\d+)(\.\d+)(\.\d+)?(\.\d+)+)")
def replace_vrsion_num(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
result = ""
for c in match.group(1):
if c == ".":
result += ""
else:
result += num2str(c)
return result
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    stripped = value_string.lstrip("0")
    if len(stripped) == 0:
@ -326,11 +308,7 @@ def num2str(value_string: str) -> str:
    result = verbalize_cardinal(integer)

-    if decimal.endswith("0"):
-        decimal = decimal.rstrip("0") + "0"
-    else:
-        decimal = decimal.rstrip("0")
+    decimal = decimal.rstrip("0")
    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二

View File

@ -25,7 +25,6 @@ from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
-from .num import RE_VERSION_NUM
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
@ -37,7 +36,6 @@ from .num import RE_RANGE
from .num import RE_TO_RANGE
from .num import RE_ASMD
from .num import RE_POWER
-from .num import replace_vrsion_num
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
@ -160,7 +158,6 @@ class TextNormalizer:
        sentence = RE_RANGE.sub(replace_range, sentence)
        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
-        sentence = RE_VERSION_NUM.sub(replace_vrsion_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)

View File

@ -13,16 +13,16 @@ import torch
logging.getLogger("numba").setLevel(logging.ERROR) logging.getLogger("numba").setLevel(logging.ERROR)
logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.getLogger("matplotlib").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
MATPLOTLIB_FLAG = False MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
logger = logging logger = logging
def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False): def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
assert os.path.isfile(checkpoint_path) assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=False) checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
iteration = checkpoint_dict["iteration"] iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"] learning_rate = checkpoint_dict["learning_rate"]
if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None: if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None:
@ -283,7 +283,7 @@ def get_hparams_from_file(config_path):
def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
-        logger.warning(
+        logger.warn(
            "{} is not a git repository, therefore hash value comparison will be ignored.".format(
                source_dir,
            )
@ -296,7 +296,7 @@ def check_git_hash(model_dir):
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
-            logger.warning(
+            logger.warn(
                "git hash values are different. {}(saved) != {}(current)".format(
                    saved_hash[:8],
                    cur_hash[:8],
@ -309,13 +309,13 @@ def check_git_hash(model_dir):
def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
-    logger.setLevel(logging.INFO)
+    logger.setLevel(logging.ERROR)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
-    h.setLevel(logging.INFO)
+    h.setLevel(logging.ERROR)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger

153
GPT_SoVITS_Inference.ipynb Normal file
View File

@ -0,0 +1,153 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "himHYZmra7ix"
},
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r extra-req.txt --no-deps\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n",
"!mkdir -p /content/GPT-SoVITS/tools/uvr5\n",
"%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
"%cd /content/GPT-SoVITS/tools/damo_asr/models\n",
"!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
"# @title UVR5 pretrains 安装uvr5模型\n",
"%cd /content/GPT-SoVITS/tools/uvr5\n",
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"outputs": [],
"source": [
"#@title Create folder models 创建文件夹模型\n",
"import os\n",
"base_directory = \"/content/GPT-SoVITS\"\n",
"folder_names = [\"SoVITS_weights\", \"GPT_weights\"]\n",
"\n",
"for folder_name in folder_names:\n",
" if os.path.exists(os.path.join(base_directory, folder_name)):\n",
" print(f\"The folder '{folder_name}' already exists. (文件夹'{folder_name}'已经存在。)\")\n",
" else:\n",
" os.makedirs(os.path.join(base_directory, folder_name))\n",
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
"\n",
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"outputs": [],
"source": [
"import requests\n",
"import zipfile\n",
"import shutil\n",
"import os\n",
"\n",
"#@title Import model 导入模型 (HuggingFace)\n",
"hf_link = 'https://huggingface.co/modelloosrvcc/Nagisa_Shingetsu_GPT-SoVITS/resolve/main/Nagisa.zip' #@param {type: \"string\"}\n",
"\n",
"output_path = '/content/'\n",
"\n",
"response = requests.get(hf_link)\n",
"with open(output_path + 'file.zip', 'wb') as file:\n",
" file.write(response.content)\n",
"\n",
"with zipfile.ZipFile(output_path + 'file.zip', 'r') as zip_ref:\n",
" zip_ref.extractall(output_path)\n",
"\n",
"os.remove(output_path + \"file.zip\")\n",
"\n",
"source_directory = output_path\n",
"SoVITS_destination_directory = '/content/GPT-SoVITS/SoVITS_weights'\n",
"GPT_destination_directory = '/content/GPT-SoVITS/GPT_weights'\n",
"\n",
"for filename in os.listdir(source_directory):\n",
" if filename.endswith(\".pth\"):\n",
" source_path = os.path.join(source_directory, filename)\n",
" destination_path = os.path.join(SoVITS_destination_directory, filename)\n",
" shutil.move(source_path, destination_path)\n",
"\n",
"for filename in os.listdir(source_directory):\n",
" if filename.endswith(\".ckpt\"):\n",
" source_path = os.path.join(source_directory, filename)\n",
" destination_path = os.path.join(GPT_destination_directory, filename)\n",
" shutil.move(source_path, destination_path)\n",
"\n",
"print(f'Model downloaded. (模型已下载。)')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

173
README.md
View File

@ -9,17 +9,10 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->

-[![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org)
-[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
-[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
-[![Huggingface](https://img.shields.io/badge/免费在线体验-free_online_demo-yellow.svg?style=for-the-badge&logo=huggingface)](https://lj1995-gpt-sovits-proplus.hf.space/)
-[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
-[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
-[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
-[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
-[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)

**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
@ -43,11 +36,6 @@ Unseen speakers few-shot fine-tuning demo:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb

-**RTF(inference speed) of GPT-SoVITS v2 ProPlus**:
-0.028 tested in 4060Ti, 0.014 tested in 4090 (1400words~=4min, inference time is 3.36s), 0.526 in M4 CPU. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference.
-请不要尬黑GPT-SoVITS推理速度慢谢谢

**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**

## Installation
@ -57,14 +45,14 @@ For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/
### Tested Environments

-| Python Version | PyTorch Version  | Device        |
-| -------------- | ---------------- | ------------- |
-| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
-| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
-| Python 3.9     | PyTorch 2.2.2    | CPU           |
+| Python Version | PyTorch Version  | Device                   |
+|----------------|------------------|--------------------------|
+| Python 3.9     | PyTorch 2.0.1    | CUDA 11.8                |
+| Python 3.10.13 | PyTorch 2.1.2    | CUDA 12.3                |
+| Python 3.10.17 | PyTorch 2.5.1    | CUDA 12.4                |
+| Python 3.9     | PyTorch 2.5.1    | Apple silicon            |
+| Python 3.11    | PyTorch 2.6.0    | Apple silicon            |
+| Python 3.9     | PyTorch 2.2.2    | CPU                      |
+| Python 3.9     | PyTorch 2.8.0dev | CUDA12.8(for Nvidia50x0) |

### Windows
@ -72,52 +60,34 @@ If you are a Windows user (tested with win>=10), you can [download the integrate
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**

-Install the program by running the following commands:
-
-```pwsh
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
-```
-
### Linux

```bash
-conda create -n GPTSoVits python=3.10
+conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
-bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
```

### macOS

**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.**

-Install the program by running the following commands:
+1. Install Xcode command-line tools by running `xcode-select --install`.
+2. Install the program by running the following commands:

```bash
-conda create -n GPTSoVits python=3.10
+conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
-bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
```

### Install Manually

-#### Install Dependences
-
-```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-pip install -r extra-req.txt --no-deps
-pip install -r requirements.txt
-```
-
#### Install FFmpeg

##### Conda Users

```bash
-conda activate GPTSoVits
conda install ffmpeg
```
@ -126,13 +96,14 @@ conda install ffmpeg
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
+conda install -c conda-forge 'ffmpeg<7'
```

##### Windows Users

-Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root
+Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root.

-Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe)
+Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)

##### MacOS Users
@ -140,54 +111,36 @@ Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe)
brew install ffmpeg
```

-### Running GPT-SoVITS with Docker
-
-#### Docker Image Selection
-
-Due to rapid development in the codebase and a slower Docker image release cycle, please:
-
-- Check [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) for the latest available image tags
-- Choose an appropriate image tag for your environment
-- `Lite` means the Docker image **does not include** ASR models and UVR5 models. You can manually download the UVR5 models, while the program will automatically download the ASR models as needed
-- The appropriate architecture image (amd64/arm64) will be automatically pulled during Docker Compose
-- Docker Compose will mount **all files** in the current directory. Please switch to the project root directory and **pull the latest code** before using the Docker image
-- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
-
-#### Environment Variables
-
-- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage.
-
-#### Shared Memory Configuration
-
-On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory.
-
-#### Choosing a Service
-
-The `docker-compose.yaml` defines two services:
-
-- `GPT-SoVITS-CU126` & `GPT-SoVITS-CU128`: Full version with all features.
-- `GPT-SoVITS-CU126-Lite` & `GPT-SoVITS-CU128-Lite`: Lightweight version with reduced dependencies and functionality.
-
-To run a specific service with Docker Compose, use:
-
-```bash
-docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
-```
-
-#### Building the Docker Image Locally
-
-If you want to build the image yourself, use:
-
-```bash
-bash docker_build.sh --cuda <12.6|12.8> [--lite]
-```
-
-#### Accessing the Running Container (Bash Shell)
-
-Once the container is running in the background, you can access it using:
-
-```bash
-docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
-```
+#### Install Dependences
+
+```bash
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
+```
+
+### Using Docker
+
+#### docker-compose.yaml configuration
+
+0. Regarding image tags: Due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(outdated) for the currently packaged latest images and select as per your situation, or alternatively, build locally using a Dockerfile according to your own needs.
+1. Environment Variables:
+   - is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation.
+2. Volumes Configuration, The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
+3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation.
+4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances.
+
+#### Running with docker compose
+
+```
+docker compose -f "docker-compose.yaml" up -d
+```
+
+#### Running with docker command
+
+As above, modify the corresponding parameters based on your actual situation, then run the following command:
+
+```
+docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
+```

## Pretrained Models
@ -215,9 +168,7 @@ docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GP
The TTS annotation .list file format: The TTS annotation .list file format:
``` ```
vocal_path|speaker_name|language|text vocal_path|speaker_name|language|text
``` ```
Language dictionary: Language dictionary:
@ -231,9 +182,7 @@ Language dictionary:
Example: Example:
``` ```
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
``` ```
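Because each line is `|`-separated, a quick sanity check of an annotation file (the filename below is just an example) might look like:
```bash
# Print the audio path and transcription of every entry in the .list file
awk -F'|' '{print $1 " -> " $4}' my_dataset.list
```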
## Finetune and inference ## Finetune and inference
@ -263,12 +212,12 @@ Or maunally switch version in WebUI
#### Path Auto-filling is now supported #### Path Auto-filling is now supported
1. Fill in the audio path 1. Fill in the audio path
2. Slice the audio into small chunks 2. Slice the audio into small chunks
3. Denoise (optional) 3. Denoise (optional)
4. ASR 4. ASR
5. Proofreading ASR transcriptions 5. Proofreading ASR transcriptions
6. Go to the next Tab, then finetune the model 6. Go to the next Tab, then finetune the model
### Open Inference WebUI ### Open Inference WebUI
@ -310,7 +259,7 @@ Use v2 from v1 environment:
2. Clone the latest codes from github. 2. Clone the latest codes from github.
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`. 3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.) Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
@ -330,7 +279,7 @@ Use v3 from v2 environment:
2. Clone the latest codes from github. 2. Clone the latest codes from github.
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`. 3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt) additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
@ -347,24 +296,7 @@ Use v4 from v1/v2/v3 environment:
2. Clone the latest codes from github. 2. Clone the latest codes from github.
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`. 3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
## V2Pro Release Notes
New Features:
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
Use v2Pro from v1/v2/v3/v4 environment:
1. `pip install -r requirements.txt` to update some packages
2. Clone the latest codes from github.
3. Download v2Pro pretrained models (v2Pro/s2Dv2Pro.pth, v2Pro/s2Gv2Pro.pth, v2Pro/s2Dv2ProPlus.pth, v2Pro/s2Gv2ProPlus.pth, and sv/pretrained_eres2netv2w24s4ep4.ckpt) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
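As a sketch of step 3 above (the URL is assumed to follow the usual HuggingFace `resolve/main` pattern for the repository linked there; verify the exact paths on the model page), e.g. for the v2Pro generator, keeping the `v2Pro/` subfolder:
```bash
mkdir -p GPT_SoVITS/pretrained_models/v2Pro
wget -P GPT_SoVITS/pretrained_models/v2Pro \
  https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/v2Pro/s2Gv2Pro.pth
```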
## Todo List ## Todo List
@ -390,7 +322,7 @@ Use v2Pro from v1/v2/v3/v4 environment:
Use the command line to open the WebUI for UVR5 Use the command line to open the WebUI for UVR5
```bash ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
@ -401,7 +333,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
This is how the audio segmentation of the dataset is done using the command line This is how the audio segmentation of the dataset is done using the command line
```bash ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \ --input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \ --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -413,7 +345,7 @@ python audio_slicer.py \
This is how dataset ASR processing is done using the command line(Only Chinese) This is how dataset ASR processing is done using the command line(Only Chinese)
```bash ```
python tools/asr/funasr_asr.py -i <input> -o <output> python tools/asr/funasr_asr.py -i <input> -o <output>
``` ```
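For example (the input and output folders below are placeholders; use your own paths):
```bash
python tools/asr/funasr_asr.py -i output/slicer_opt -o output/asr_opt
```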
@ -421,7 +353,7 @@ ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
(No progress bar is shown; processing time depends on GPU performance) (No progress bar is shown; processing time depends on GPU performance)
```bash ```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision> python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
``` ```
@ -448,7 +380,6 @@ Special thanks to the following projects and contributors:
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
- [eresnetv2](https://modelscope.cn/models/iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common)
### Text Frontend for Inference ### Text Frontend for Inference

321
api.py
View File

@ -163,7 +163,7 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np import numpy as np
from feature_extractor import cnhubert from feature_extractor import cnhubert
from io import BytesIO from io import BytesIO
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3 from module.models import SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model from peft import LoraConfig, get_peft_model
from AR.models.t2s_lightning_module import Text2SemanticLightningModule from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from text import cleaned_text_to_sequence from text import cleaned_text_to_sequence
@ -198,44 +198,8 @@ def is_full(*items): # 任意一项为空返回False
return True return True
bigvgan_model = hifigan_model = sv_cn_model = None
def clean_hifigan_model():
global hifigan_model
if hifigan_model:
hifigan_model = hifigan_model.cpu()
hifigan_model = None
try:
torch.cuda.empty_cache()
except:
pass
def clean_bigvgan_model():
global bigvgan_model
if bigvgan_model:
bigvgan_model = bigvgan_model.cpu()
bigvgan_model = None
try:
torch.cuda.empty_cache()
except:
pass
def clean_sv_cn_model():
global sv_cn_model
if sv_cn_model:
sv_cn_model.embedding_model = sv_cn_model.embedding_model.cpu()
sv_cn_model = None
try:
torch.cuda.empty_cache()
except:
pass
def init_bigvgan(): def init_bigvgan():
global bigvgan_model, hifigan_model, sv_cn_model global bigvgan_model
from BigVGAN import bigvgan from BigVGAN import bigvgan
bigvgan_model = bigvgan.BigVGAN.from_pretrained( bigvgan_model = bigvgan.BigVGAN.from_pretrained(
@ -245,57 +209,20 @@ def init_bigvgan():
# remove weight norm in the model and set to eval mode # remove weight norm in the model and set to eval mode
bigvgan_model.remove_weight_norm() bigvgan_model.remove_weight_norm()
bigvgan_model = bigvgan_model.eval() bigvgan_model = bigvgan_model.eval()
if is_half == True: if is_half == True:
bigvgan_model = bigvgan_model.half().to(device) bigvgan_model = bigvgan_model.half().to(device)
else: else:
bigvgan_model = bigvgan_model.to(device) bigvgan_model = bigvgan_model.to(device)
def init_hifigan():
global hifigan_model, bigvgan_model, sv_cn_model
hifigan_model = Generator(
initial_channel=100,
resblock="1",
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
upsample_rates=[10, 6, 2, 2, 2],
upsample_initial_channel=512,
upsample_kernel_sizes=[20, 12, 4, 4, 4],
gin_channels=0,
is_bias=True,
)
hifigan_model.eval()
hifigan_model.remove_weight_norm()
state_dict_g = torch.load(
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
map_location="cpu",
weights_only=False,
)
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
if is_half == True:
hifigan_model = hifigan_model.half().to(device)
else:
hifigan_model = hifigan_model.to(device)
from sv import SV
def init_sv_cn():
global hifigan_model, bigvgan_model, sv_cn_model
sv_cn_model = SV(device, is_half)
resample_transform_dict = {} resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device): def resample(audio_tensor, sr0):
global resample_transform_dict global resample_transform_dict
key = "%s-%s-%s" % (sr0, sr1, str(device)) if sr0 not in resample_transform_dict:
if key not in resample_transform_dict: resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) return resample_transform_dict[sr0](audio_tensor)
return resample_transform_dict[key](audio_tensor)
from module.mel_processing import mel_spectrogram_torch from module.mel_processing import mel_spectrogram_torch
@ -325,19 +252,6 @@ mel_fn = lambda x: mel_spectrogram_torch(
"center": False, "center": False,
}, },
) )
mel_fn_v4 = lambda x: mel_spectrogram_torch(
x,
**{
"n_fft": 1280,
"win_size": 1280,
"hop_size": 320,
"num_mels": 100,
"sampling_rate": 32000,
"fmin": 0,
"fmax": None,
"center": False,
},
)
sr_model = None sr_model = None
@ -379,19 +293,12 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
def get_sovits_weights(sovits_path): def get_sovits_weights(sovits_path):
from config import pretrained_sovits_name path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3) is_exist_s2gv3 = os.path.exists(path_sovits_v3)
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 if if_lora_v3 == True and is_exist_s2gv3 == False:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
if if_lora_v3 == True and is_exist == False:
logger.info("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
dict_s2 = load_sovits_new(sovits_path) dict_s2 = load_sovits_new(sovits_path)
hps = dict_s2["config"] hps = dict_s2["config"]
@ -404,13 +311,11 @@ def get_sovits_weights(sovits_path):
else: else:
hps.model.version = "v2" hps.model.version = "v2"
model_params_dict = vars(hps.model) if model_version == "v3":
if model_version not in {"v3", "v4"}: hps.model.version = "v3"
if "Pro" in model_version:
hps.model.version = model_version
if sv_cn_model == None:
init_sv_cn()
model_params_dict = vars(hps.model)
if model_version != "v3":
vq_model = SynthesizerTrn( vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1, hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length, hps.train.segment_size // hps.data.hop_length,
@ -418,18 +323,13 @@ def get_sovits_weights(sovits_path):
**model_params_dict, **model_params_dict,
) )
else: else:
hps.model.version = model_version
vq_model = SynthesizerTrnV3( vq_model = SynthesizerTrnV3(
hps.data.filter_length // 2 + 1, hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length, hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers, n_speakers=hps.data.n_speakers,
**model_params_dict, **model_params_dict,
) )
if model_version == "v3":
init_bigvgan() init_bigvgan()
if model_version == "v4":
init_hifigan()
model_version = hps.model.version model_version = hps.model.version
logger.info(f"模型版本: {model_version}") logger.info(f"模型版本: {model_version}")
if "pretrained" not in sovits_path: if "pretrained" not in sovits_path:
@ -445,8 +345,7 @@ def get_sovits_weights(sovits_path):
if if_lora_v3 == False: if if_lora_v3 == False:
vq_model.load_state_dict(dict_s2["weight"], strict=False) vq_model.load_state_dict(dict_s2["weight"], strict=False)
else: else:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False)
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False)
lora_rank = dict_s2["lora_rank"] lora_rank = dict_s2["lora_rank"]
lora_config = LoraConfig( lora_config = LoraConfig(
target_modules=["to_k", "to_q", "to_v", "to_out.0"], target_modules=["to_k", "to_q", "to_v", "to_out.0"],
@ -475,7 +374,7 @@ hz = 50
def get_gpt_weights(gpt_path): def get_gpt_weights(gpt_path):
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False) dict_s1 = torch.load(gpt_path, map_location="cpu")
config = dict_s1["config"] config = dict_s1["config"]
max_sec = config["data"]["max_sec"] max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
@ -543,31 +442,32 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False): def get_phones_and_bert(text, language, version, final=False):
text = re.sub(r' {2,}', ' ', text) if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
formattext = text
while " " in formattext:
formattext = formattext.replace(" ", " ")
if language == "all_zh":
if re.search(r"[A-Za-z]", formattext):
formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext, "zh", version)
else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = get_bert_feature(norm_text, word2ph).to(device)
elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext, "yue", version)
else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
textlist = [] textlist = []
langlist = [] langlist = []
if language == "all_zh": if language == "auto":
for tmp in LangSegmenter.getTexts(text,"zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
langlist.append("en")
textlist.append(text)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text): for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) textlist.append(tmp["text"])
@ -579,10 +479,6 @@ def get_phones_and_bert(text, language, version, final=False):
textlist.append(tmp["text"]) textlist.append(tmp["text"])
else: else:
for tmp in LangSegmenter.getTexts(text): for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en": if tmp["lang"] == "en":
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
else: else:
@ -637,34 +533,23 @@ class DictToAttrRecursive(dict):
raise AttributeError(f"Attribute {item} not found") raise AttributeError(f"Attribute {item} not found")
def get_spepc(hps, filename, dtype, device, is_v2pro=False): def get_spepc(hps, filename):
sr1 = int(hps.data.sampling_rate) audio, _ = librosa.load(filename, int(hps.data.sampling_rate))
audio, sr0 = torchaudio.load(filename) audio = torch.FloatTensor(audio)
if sr0 != sr1:
audio = audio.to(device)
if audio.shape[0] == 2:
audio = audio.mean(0).unsqueeze(0)
audio = resample(audio, sr0, sr1, device)
else:
audio = audio.to(device)
if audio.shape[0] == 2:
audio = audio.mean(0).unsqueeze(0)
maxx = audio.abs().max() maxx = audio.abs().max()
if maxx > 1: if maxx > 1:
audio /= min(2, maxx) audio /= min(2, maxx)
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch( spec = spectrogram_torch(
audio, audio_norm,
hps.data.filter_length, hps.data.filter_length,
hps.data.sampling_rate, hps.data.sampling_rate,
hps.data.hop_length, hps.data.hop_length,
hps.data.win_length, hps.data.win_length,
center=False, center=False,
) )
spec = spec.to(dtype) return spec
if is_v2pro == True:
audio = resample(audio, sr1, 16000, device).to(dtype)
return spec, audio
def pack_audio(audio_bytes, data, rate): def pack_audio(audio_bytes, data, rate):
@ -851,16 +736,6 @@ def get_tts_wav(
t2s_model = infer_gpt.t2s_model t2s_model = infer_gpt.t2s_model
max_sec = infer_gpt.max_sec max_sec = infer_gpt.max_sec
if version == "v3":
if sample_steps not in [4, 8, 16, 32, 64, 128]:
sample_steps = 32
elif version == "v4":
if sample_steps not in [4, 8, 16, 32]:
sample_steps = 8
if if_sr and version != "v3":
if_sr = False
t0 = ttime() t0 = ttime()
prompt_text = prompt_text.strip("\n") prompt_text = prompt_text.strip("\n")
if prompt_text[-1] not in splits: if prompt_text[-1] not in splits:
@ -884,29 +759,19 @@ def get_tts_wav(
prompt_semantic = codes[0, 0] prompt_semantic = codes[0, 0]
prompt = prompt_semantic.unsqueeze(0).to(device) prompt = prompt_semantic.unsqueeze(0).to(device)
is_v2pro = version in {"v2Pro", "v2ProPlus"} if version != "v3":
if version not in {"v3", "v4"}:
refers = [] refers = []
if is_v2pro:
sv_emb = []
if sv_cn_model == None:
init_sv_cn()
if inp_refs: if inp_refs:
for path in inp_refs: for path in inp_refs:
try: #####这里加上提取sv的逻辑要么一堆sv一堆refer要么单个sv单个refer try:
refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro) refer = get_spepc(hps, path).to(dtype).to(device)
refers.append(refer) refers.append(refer)
if is_v2pro:
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
if len(refers) == 0: if len(refers) == 0:
refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro) refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
refers = [refers]
if is_v2pro:
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
else: else:
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device) refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
t1 = ttime() t1 = ttime()
# os.environ['version'] = version # os.environ['version'] = version
@ -946,56 +811,41 @@ def get_tts_wav(
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
t3 = ttime() t3 = ttime()
if version not in {"v3", "v4"}: if version != "v3":
if is_v2pro:
audio = ( audio = (
vq_model.decode( vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)
pred_semantic,
torch.LongTensor(phones2).to(device).unsqueeze(0),
refers,
speed=speed,
sv_emb=sv_emb,
)
.detach() .detach()
.cpu() .cpu()
.numpy()[0, 0] .numpy()[0, 0]
) ) ###试试重建不带上prompt部分
else:
audio = (
vq_model.decode(
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
)
.detach()
.cpu()
.numpy()[0, 0]
)
else: else:
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0)
# print(11111111, phoneme_ids0, phoneme_ids1)
fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
ref_audio, sr = torchaudio.load(ref_wav_path) ref_audio, sr = torchaudio.load(ref_wav_path)
ref_audio = ref_audio.to(device).float() ref_audio = ref_audio.to(device).float()
if ref_audio.shape[0] == 2: if ref_audio.shape[0] == 2:
ref_audio = ref_audio.mean(0).unsqueeze(0) ref_audio = ref_audio.mean(0).unsqueeze(0)
if sr != 24000:
tgt_sr = 24000 if version == "v3" else 32000 ref_audio = resample(ref_audio, sr)
if sr != tgt_sr: # print("ref_audio",ref_audio.abs().mean())
ref_audio = resample(ref_audio, sr, tgt_sr, device) mel2 = mel_fn(ref_audio)
mel2 = mel_fn(ref_audio) if version == "v3" else mel_fn_v4(ref_audio)
mel2 = norm_spec(mel2) mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2]) T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min] mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min]
Tref = 468 if version == "v3" else 500 if T_min > 468:
Tchunk = 934 if version == "v3" else 1000 mel2 = mel2[:, :, -468:]
if T_min > Tref: fea_ref = fea_ref[:, :, -468:]
mel2 = mel2[:, :, -Tref:] T_min = 468
fea_ref = fea_ref[:, :, -Tref:] chunk_len = 934 - T_min
T_min = Tref # print("fea_ref",fea_ref,fea_ref.shape)
chunk_len = Tchunk - T_min # print("mel2",mel2)
mel2 = mel2.to(dtype) mel2 = mel2.to(dtype)
fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed)
# print("fea_todo",fea_todo)
# print("ge",ge.abs().mean())
cfm_resss = [] cfm_resss = []
idx = 0 idx = 0
while 1: while 1:
@ -1004,24 +854,22 @@ def get_tts_wav(
break break
idx += chunk_len idx += chunk_len
fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
# set_seed(123)
cfm_res = vq_model.cfm.inference( cfm_res = vq_model.cfm.inference(
fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
) )
cfm_res = cfm_res[:, :, mel2.shape[2] :] cfm_res = cfm_res[:, :, mel2.shape[2] :]
mel2 = cfm_res[:, :, -T_min:] mel2 = cfm_res[:, :, -T_min:]
# print("fea", fea)
# print("mel2in", mel2)
fea_ref = fea_todo_chunk[:, :, -T_min:] fea_ref = fea_todo_chunk[:, :, -T_min:]
cfm_resss.append(cfm_res) cfm_resss.append(cfm_res)
cfm_res = torch.cat(cfm_resss, 2) cmf_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res) cmf_res = denorm_spec(cmf_res)
if version == "v3":
if bigvgan_model == None: if bigvgan_model == None:
init_bigvgan() init_bigvgan()
else: # v4
if hifigan_model == None:
init_hifigan()
vocoder_model = bigvgan_model if version == "v3" else hifigan_model
with torch.inference_mode(): with torch.inference_mode():
wav_gen = vocoder_model(cfm_res) wav_gen = bigvgan_model(cmf_res)
audio = wav_gen[0][0].cpu().detach().numpy() audio = wav_gen[0][0].cpu().detach().numpy()
max_audio = np.abs(audio).max() max_audio = np.abs(audio).max()
@ -1032,13 +880,7 @@ def get_tts_wav(
audio_opt = np.concatenate(audio_opt, 0) audio_opt = np.concatenate(audio_opt, 0)
t4 = ttime() t4 = ttime()
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}: sr = hps.data.sampling_rate if version != "v3" else 24000
sr = 32000
elif version == "v3":
sr = 24000
else:
sr = 48000 # v4
if if_sr and sr == 24000: if if_sr and sr == 24000:
audio_opt = torch.from_numpy(audio_opt).float().to(device) audio_opt = torch.from_numpy(audio_opt).float().to(device)
audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr) audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr)
@ -1058,12 +900,8 @@ def get_tts_wav(
if not stream_mode == "normal": if not stream_mode == "normal":
if media_type == "wav": if media_type == "wav":
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
sr = 32000
elif version == "v3":
sr = 48000 if if_sr else 24000 sr = 48000 if if_sr else 24000
else: sr = hps.data.sampling_rate if version != "v3" else sr
sr = 48000 # v4
audio_bytes = pack_wav(audio_bytes, sr) audio_bytes = pack_wav(audio_bytes, sr)
yield audio_bytes.getvalue() yield audio_bytes.getvalue()
@ -1128,6 +966,9 @@ def handle(
if not default_refer.is_ready(): if not default_refer.is_ready():
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400) return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
if sample_steps not in [4, 8, 16, 32]:
sample_steps = 32
if cut_punc == None: if cut_punc == None:
text = cut_text(text, default_cut_punc) text = cut_text(text, default_cut_punc)
else: else:
@ -1230,10 +1071,10 @@ default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, a
# Model path check # Model path check
if sovits_path == "": if sovits_path == "":
sovits_path = g_config.pretrained_sovits_path sovits_path = g_config.pretrained_sovits_path
logger.warning(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}") logger.warn(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
if gpt_path == "": if gpt_path == "":
gpt_path = g_config.pretrained_gpt_path gpt_path = g_config.pretrained_gpt_path
logger.warning(f"未指定GPT模型路径, fallback后当前值: {gpt_path}") logger.warn(f"未指定GPT模型路径, fallback后当前值: {gpt_path}")
# Specify the default reference audio, used when the caller provides no or incomplete reference-audio parameters # Specify the default reference audio, used when the caller provides no or incomplete reference-audio parameters
if default_refer.path == "" or default_refer.text == "" or default_refer.language == "": if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":

View File

@ -33,14 +33,14 @@ POST:
"text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details. "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details.
"batch_size": 1, # int. batch size for inference "batch_size": 1, # int. batch size for inference
"batch_threshold": 0.75, # float. threshold for batch splitting. "batch_threshold": 0.75, # float. threshold for batch splitting.
"split_bucket": True, # bool. whether to split the batch into multiple buckets. "split_bucket: True, # bool. whether to split the batch into multiple buckets.
"speed_factor":1.0, # float. control the speed of the synthesized audio. "speed_factor":1.0, # float. control the speed of the synthesized audio.
"streaming_mode": False, # bool. whether to return a streaming response. "streaming_mode": False, # bool. whether to return a streaming response.
"seed": -1, # int. random seed for reproducibility. "seed": -1, # int. random seed for reproducibility.
"parallel_infer": True, # bool. whether to use parallel inference. "parallel_infer": True, # bool. whether to use parallel inference.
"repetition_penalty": 1.35, # float. repetition penalty for T2S model. "repetition_penalty": 1.35 # float. repetition penalty for T2S model.
"sample_steps": 32, # int. number of sampling steps for VITS model V3. "sample_steps": 32, # int. number of sampling steps for VITS model V3.
"super_sampling": False # bool. whether to use super-sampling for audio when using VITS model V3. "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
} }
``` ```

View File

@ -7,14 +7,7 @@
"id": "view-in-github" "id": "view-in-github"
}, },
"source": [ "source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# GPT-SoVITS WebUI"
] ]
}, },
{ {
@ -23,15 +16,15 @@
"id": "_o6a8GS2lWQM" "id": "_o6a8GS2lWQM"
}, },
"source": [ "source": [
"## Env Setup (Run Once Only)\n", "# Env Setup (Run Once Only)\n",
"## 环境配置, 只需运行一次" "# 环境配置, 只需运行一次"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### 1." "## 1."
] ]
}, },
{ {
@ -42,11 +35,9 @@
"source": [ "source": [
"%%writefile /content/setup.sh\n", "%%writefile /content/setup.sh\n",
"set -e\n", "set -e\n",
"\n",
"cd /content\n", "cd /content\n",
"\n", "rm -rf GPT-SoVITS\n",
"git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"\n",
"cd GPT-SoVITS\n", "cd GPT-SoVITS\n",
"\n", "\n",
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
@ -57,16 +48,14 @@
"\n", "\n",
"source activate GPTSoVITS\n", "source activate GPTSoVITS\n",
"\n", "\n",
"pip install ipykernel\n", "bash install.sh --source HF --download-uvr5"
"\n",
"bash install.sh --device CU126 --source HF --download-uvr5"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### 2." "## 2."
] ]
}, },
{ {
@ -85,8 +74,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Launch WebUI\n", "# Launch WebUI\n",
"## 启动 WebUI" "# 启动 WebUI"
] ]
}, },
{ {

185
config.py
View File

@ -1,126 +1,8 @@
import os
import re
import sys import sys
import os
import torch import torch
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
pretrained_sovits_name = {
"v1": "GPT_SoVITS/pretrained_models/s2G488k.pth",
"v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"v3": "GPT_SoVITS/pretrained_models/s2Gv3.pth", ###v3v4还要检查vocoder算了。。。
"v4": "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
"v2Pro": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
"v2ProPlus": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
pretrained_gpt_name = {
"v1": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"v3": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v4": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2Pro": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
"v2ProPlus": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
name2sovits_path = {
# i18n("不训练直接推v1底模"): "GPT_SoVITS/pretrained_models/s2G488k.pth",
i18n("不训练直接推v2底模"): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
# i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s2Gv3.pth",
# i18n("不训练直接推v4底模"): "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
i18n("不训练直接推v2Pro底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
i18n("不训练直接推v2ProPlus底模"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
name2gpt_path = {
# i18n("不训练直接推v1底模"):"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
i18n(
"不训练直接推v2底模"
): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
i18n("不训练直接推v3底模"): "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
SoVITS_weight_root = [
"SoVITS_weights",
"SoVITS_weights_v2",
"SoVITS_weights_v3",
"SoVITS_weights_v4",
"SoVITS_weights_v2Pro",
"SoVITS_weights_v2ProPlus",
]
GPT_weight_root = [
"GPT_weights",
"GPT_weights_v2",
"GPT_weights_v3",
"GPT_weights_v4",
"GPT_weights_v2Pro",
"GPT_weights_v2ProPlus",
]
SoVITS_weight_version2root = {
"v1": "SoVITS_weights",
"v2": "SoVITS_weights_v2",
"v3": "SoVITS_weights_v3",
"v4": "SoVITS_weights_v4",
"v2Pro": "SoVITS_weights_v2Pro",
"v2ProPlus": "SoVITS_weights_v2ProPlus",
}
GPT_weight_version2root = {
"v1": "GPT_weights",
"v2": "GPT_weights_v2",
"v3": "GPT_weights_v3",
"v4": "GPT_weights_v4",
"v2Pro": "GPT_weights_v2Pro",
"v2ProPlus": "GPT_weights_v2ProPlus",
}
def custom_sort_key(s):
# Use a regular expression to split the string into numeric and non-numeric parts
parts = re.split("(\d+)", s)
# Convert the numeric parts to integers; keep the non-numeric parts unchanged
parts = [int(part) if part.isdigit() else part for part in parts]
return parts
def get_weights_names():
SoVITS_names = []
for key in name2sovits_path:
if os.path.exists(name2sovits_path[key]):
SoVITS_names.append(key)
for path in SoVITS_weight_root:
if not os.path.exists(path):
continue
for name in os.listdir(path):
if name.endswith(".pth"):
SoVITS_names.append("%s/%s" % (path, name))
if not SoVITS_names:
SoVITS_names = [""]
GPT_names = []
for key in name2gpt_path:
if os.path.exists(name2gpt_path[key]):
GPT_names.append(key)
for path in GPT_weight_root:
if not os.path.exists(path):
continue
for name in os.listdir(path):
if name.endswith(".ckpt"):
GPT_names.append("%s/%s" % (path, name))
SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
GPT_names = sorted(GPT_names, key=custom_sort_key)
if not GPT_names:
GPT_names = [""]
return SoVITS_names, GPT_names
def change_choices():
SoVITS_names, GPT_names = get_weights_names()
return {"choices": SoVITS_names, "__type__": "update"}, {
"choices": GPT_names,
"__type__": "update",
}
# Models specified for inference # Models specified for inference
sovits_path = "" sovits_path = ""
gpt_path = "" gpt_path = ""
@ -136,6 +18,10 @@ pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=
exp_root = "logs" exp_root = "logs"
python_exec = sys.executable or "python" python_exec = sys.executable or "python"
if torch.cuda.is_available():
infer_device = "cuda"
else:
infer_device = "cpu"
webui_port_main = 9874 webui_port_main = 9874
webui_port_uvr5 = 9873 webui_port_uvr5 = 9873
@ -144,55 +30,20 @@ webui_port_subfix = 9871
api_port = 9880 api_port = 9880
if infer_device == "cuda":
gpu_name = torch.cuda.get_device_name(0)
if (
("16" in gpu_name and "V100" not in gpu_name.upper())
or "P40" in gpu_name.upper()
or "P10" in gpu_name.upper()
or "1060" in gpu_name
or "1070" in gpu_name
or "1080" in gpu_name
):
is_half = False
# Thanks to the contribution of @Karasukaigan and @XXXXRT666 if infer_device == "cpu":
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]: is_half = False
cpu = torch.device("cpu")
cuda = torch.device(f"cuda:{idx}")
if not torch.cuda.is_available():
return cpu, torch.float32, 0.0, 0.0
device_idx = idx
capability = torch.cuda.get_device_capability(device_idx)
name = torch.cuda.get_device_name(device_idx)
mem_bytes = torch.cuda.get_device_properties(device_idx).total_memory
mem_gb = mem_bytes / (1024**3) + 0.4
major, minor = capability
sm_version = major + minor / 10.0
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
if mem_gb < 4 or sm_version < 5.3:
return cpu, torch.float32, 0.0, 0.0
if sm_version == 6.1 or is_16_series == True:
return cuda, torch.float32, sm_version, mem_gb
if sm_version > 6.1:
return cuda, torch.float16, sm_version, mem_gb
return cpu, torch.float32, 0.0, 0.0
IS_GPU = True
GPU_INFOS: list[str] = []
GPU_INDEX: set[int] = set()
GPU_COUNT = torch.cuda.device_count()
CPU_INFO: str = "0\tCPU " + i18n("CPU训练,较慢")
tmp: list[tuple[torch.device, torch.dtype, float, float]] = []
memset: set[float] = set()
for i in range(max(GPU_COUNT, 1)):
tmp.append(get_device_dtype_sm(i))
for j in tmp:
device = j[0]
memset.add(j[3])
if device.type != "cpu":
GPU_INFOS.append(f"{device.index}\t{torch.cuda.get_device_name(device.index)}")
GPU_INDEX.add(device.index)
if not GPU_INFOS:
IS_GPU = False
GPU_INFOS.append(CPU_INFO)
GPU_INDEX.add(0)
infer_device = max(tmp, key=lambda x: (x[2], x[3]))[0]
is_half = any(dtype == torch.float16 for _, dtype, _, _ in tmp)
class Config: class Config:

View File

@ -1,79 +1,32 @@
version: "3.8" version: '3.8'
services: services:
GPT-SoVITS-CU126: gpt-sovits:
image: xxxxrt666/gpt-sovits:latest-cu126 image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container.
container_name: GPT-SoVITS-CU126 container_name: gpt-sovits-container
environment:
- is_half=False
- is_share=False
volumes:
- ./output:/workspace/output
- ./logs:/workspace/logs
- ./SoVITS_weights:/workspace/SoVITS_weights
- ./reference:/workspace/reference
working_dir: /workspace
ports: ports:
- "9880:9880"
- "9871:9871" - "9871:9871"
- "9872:9872" - "9872:9872"
- "9873:9873" - "9873:9873"
- "9874:9874" - "9874:9874"
- "9880:9880" shm_size: 16G
volumes: deploy:
- .:/workspace/GPT-SoVITS resources:
environment: reservations:
- is_half=true devices:
tty: true - driver: nvidia
count: "all"
capabilities: [gpu]
stdin_open: true stdin_open: true
shm_size: "16g"
restart: unless-stopped
runtime: nvidia
GPT-SoVITS-CU126-Lite:
image: xxxxrt666/gpt-sovits:latest-cu126-lite
container_name: GPT-SoVITS-CU126-Lite
ports:
- "9871:9871"
- "9872:9872"
- "9873:9873"
- "9874:9874"
- "9880:9880"
volumes:
- .:/workspace/GPT-SoVITS
- tools/asr/models:/workspace/models/asr_models
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
environment:
- is_half=true
tty: true tty: true
stdin_open: true
shm_size: "16g"
restart: unless-stopped restart: unless-stopped
runtime: nvidia
GPT-SoVITS-CU128:
image: xxxxrt666/gpt-sovits:latest-cu128
container_name: GPT-SoVITS-CU128
ports:
- "9871:9871"
- "9872:9872"
- "9873:9873"
- "9874:9874"
- "9880:9880"
volumes:
- .:/workspace/GPT-SoVITS
environment:
- is_half=true
tty: true
stdin_open: true
shm_size: "16g"
restart: unless-stopped
runtime: nvidia
GPT-SoVITS-CU128-Lite:
image: xxxxrt666/gpt-sovits:latest-cu128-lite
container_name: GPT-SoVITS-CU128-Lite
ports:
- "9871:9871"
- "9872:9872"
- "9873:9873"
- "9874:9874"
- "9880:9880"
volumes:
- .:/workspace/GPT-SoVITS
- tools/asr/models:/workspace/models/asr_models
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
environment:
- is_half=true
tty: true
stdin_open: true
shm_size: "16g"
restart: unless-stopped
runtime: nvidia

View File

@ -1,82 +0,0 @@
#!/bin/bash
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "$SCRIPT_DIR" || exit 1
set -e
if ! command -v docker &>/dev/null; then
echo "Docker Not Found"
exit 1
fi
trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
LITE=false
CUDA_VERSION=12.6
print_help() {
echo "Usage: bash docker_build.sh [OPTIONS]"
echo ""
echo "Options:"
echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)"
echo " --lite Build a Lite Image"
echo " -h, --help Show this help message and exit"
echo ""
echo "Examples:"
echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
}
# Show help if no arguments provided
if [[ $# -eq 0 ]]; then
print_help
exit 0
fi
# Parse arguments
while [[ $# -gt 0 ]]; do
case "$1" in
--cuda)
case "$2" in
12.6)
CUDA_VERSION=12.6
;;
12.8)
CUDA_VERSION=12.8
;;
*)
echo "Error: Invalid CUDA_VERSION: $2"
echo "Choose From: [12.6, 12.8]"
exit 1
;;
esac
shift 2
;;
--lite)
LITE=true
shift
;;
*)
echo "Unknown Argument: $1"
echo "Use -h or --help to see available options."
exit 1
;;
esac
done
TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64")
if [ $LITE = true ]; then
TORCH_BASE="lite"
else
TORCH_BASE="full"
fi
docker build \
--build-arg CUDA_VERSION=$CUDA_VERSION \
--build-arg LITE=$LITE \
--build-arg TARGETPLATFORM="$TARGETPLATFORM" \
--build-arg TORCH_BASE=$TORCH_BASE \
-t "${USER}/gpt-sovits:local" \
.

21
dockerbuild.sh Executable file
View File

@ -0,0 +1,21 @@
#!/bin/bash
# Get the current date in YYYYMMDD format
DATE=$(date +%Y%m%d)
# Get the first 7 characters of the latest Git commit hash
COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7)
# Build the "full" image
docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest .
# Tag the same image with the date
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE
# Tag the same image with the current commit hash
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH
# Build the "elite" image (no model download step; models must be downloaded and placed into the container manually)
docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite .
# Tag the same image with the date
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite
# Tag the same image with the current commit hash
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite

View File

@ -1,626 +1,302 @@
# 更新日志 ### 20240121更新
## 202401 1-config添加is_share, 诸如colab等场景可以将此改为True, 来使得webui映射到公网
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108) 2-WebUI添加英文系统英文翻译适配
- 内容: WebUI 增加英文系统英文翻译适配.
- 类型: 文档
- 提交: D3lik
- 2024.01.21 [Commit#7b89c9ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b89c9ed5669f63c4ed6ae791408969640bdcf3e)
- 内容: 尝试修复 SoVITS 训练报错 ZeroDivisionError 的问题.
- 类型: 修复
- 提交: RVC-Boss, Tybost
- 关联: [Issue#79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
- 2024.01.21 [Commit#ea62d6e0](https://github.com/RVC-Boss/GPT-SoVITS/commit/ea62d6e0cf1efd75287766ea2b55d1c3b69b4fd3)
- 内容: 大幅削弱合成音频包含参考音频结尾的问题.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.21 [Commit#a87ad522](https://github.com/RVC-Boss/GPT-SoVITS/commit/a87ad5228ed2d729da42019ae1b93171f6a745ef)
- 内容: `cmd-asr.py` 添加判断默认目录内是否存在模型, 如不存在则从 ModelScope 自动下载.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.01.21 [Commit#f6147116](https://github.com/RVC-Boss/GPT-SoVITS/commit/f61471166c107ba56ccb7a5137fa9d7c09b2830d)
- 内容: `Config.py` 添加 `is_share` 参数, 如 Colab 等场景可以将此改为 `True` 将 WebUI 映射到公网.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.01.21 [Commit#102d5081](https://github.com/RVC-Boss/GPT-SoVITS/commit/102d50819e5d24580d6e96085b636b25533ecc7f)
- 内容: 清理 TEMP 文件夹内缓存, 音频等文件.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.22 [Commit#872134c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/872134c846bcb8f1909a3f5aff68a6aa67643f68)
- 内容: 修复过短输出文件返回重复参考音频的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.22 经测试, 英文日文训练原生支持 (日文训练需要根目录不含非英文等特殊字符).
- 2024.01.22 [PR#124](https://github.com/RVC-Boss/GPT-SoVITS/pull/124)
- 内容: 音频路径检查. 如果尝试读取输入错的路径报错路径不存在, 而非 FFmpeg 错误.
- 类型: 优化
- 提交: xmimu
- 2024.01.23 [Commit#93c47cd9](https://github.com/RVC-Boss/GPT-SoVITS/commit/93c47cd9f0c53439536eada18879b4ec5a812ae1)
- 内容: 解决 HuBERT 提取 NaN 导致 SoVITS/GPT 训练报错 ZeroDivisionError 的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.23 [Commit#80fffb0a](https://github.com/RVC-Boss/GPT-SoVITS/commit/80fffb0ad46e4e7f27948d5a57c88cf342088d50)
- 内容: 中文分词使用 `jieba_fast` 代替 `jieba`.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.23 [Commit#63625758](https://github.com/RVC-Boss/GPT-SoVITS/commit/63625758a99e645f3218dd167924e01a0e3cf0dc)
- 内容: 优化模型文件排序逻辑.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.23 [Commit#0c691191](https://github.com/RVC-Boss/GPT-SoVITS/commit/0c691191e894c15686e88279745712b3c6dc232f)
- 内容: 支持推理界面快速切换模型.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.01.25 [Commit#249561e5](https://github.com/RVC-Boss/GPT-SoVITS/commit/249561e5a18576010df6587c274d38cbd9e18b4b)
- 内容: 去除推理界面大量冗余日志.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.25 [PR#183](https://github.com/RVC-Boss/GPT-SoVITS/pull/183), [PR#200](https://github.com/RVC-Boss/GPT-SoVITS/pull/200)
- 内容: 支持 MacOS MPS 训练推理.
- 类型: 新功能
- 提交: Lion-Wu
- 2024.01.26 [Commit#813cf96e](https://github.com/RVC-Boss/GPT-SoVITS/commit/813cf96e508ba1bb2c658f38c7cc77b797fb4082), [Commit#2d1ddeca](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d1ddeca42db90c3fe2d0cd79480fd544d87f02b)
- 内容: 修复 UVR5 读取到目录自动跳出的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.26 [PR#204](https://github.com/RVC-Boss/GPT-SoVITS/pull/204)
- 内容: 支持输出文本中英混合, 日英混合.
- 类型: 新功能
- 提交: Kakaru Hayate
- 2024.01.26 [Commit#f4148cf7](https://github.com/RVC-Boss/GPT-SoVITS/commit/f4148cf77fb899c22bcdd4e773d2f24ab34a73e7)
- 内容: 输出可选切分模式.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.01.26 [Commit#9fe955c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/9fe955c1bf5f94546c9f699141281f2661c8a180)
- 内容: 修复多个换行导致推理报错.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.26 [Commit#84ee4719](https://github.com/RVC-Boss/GPT-SoVITS/commit/84ee471936b332bc2ccee024d6dfdedab4f0dc7b)
- 内容: 自动识别不支持半精度的卡强制单精度, CPU 推理下强制单精度.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.28 [PR#238](https://github.com/RVC-Boss/GPT-SoVITS/pull/238)
- 内容: 完善 Dockerfile 下载模型流程.
- 类型: 修复
- 提交: breakstring
- 2024.01.28 [PR#257](https://github.com/RVC-Boss/GPT-SoVITS/pull/257)
- 内容: 修复数字转汉字念法问题.
- 类型: 修复
- 提交: duliangang
- 2024.01.28 [Commit#f0cfe397](https://github.com/RVC-Boss/GPT-SoVITS/commit/f0cfe397089a6fd507d678c71adeaab5e7ed0683)
- 内容: 修复 GPT 训练不保存权重文件的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.28 [Commit#b8ae5a27](https://github.com/RVC-Boss/GPT-SoVITS/commit/b8ae5a2761e2654fc0c905498009d3de9de745a8)
- 内容: 排除不合理的参考音频长度.
- 类型: 优化
- 提交: RVC-Boss
- 2024.01.28 [Commit#698e9655](https://github.com/RVC-Boss/GPT-SoVITS/commit/698e9655132d194b25b86fbbc99d53c8d2cea2a3)
- 内容: 修复句首少量字容易吞字的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.29 [Commit#ff977a5f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff977a5f5dc547e0ad82b9e0f1cd95fbc830b2b0)
- 内容: 对于 16 系等半精度训练存在问题的显卡把训练配置改为单精度训练.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.29 [Commit#172e139f](https://github.com/RVC-Boss/GPT-SoVITS/commit/172e139f45ac26723bc2cf7fac0112f69d6b46ec)
- 内容: 测试更新可用的 Colab 版本.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.01.29 [PR#135](https://github.com/RVC-Boss/GPT-SoVITS/pull/135)
- 内容: 更新 FunASR 为 1.0 版本并修复接口不对齐导致的报错问题.
- 类型: 修复
- 提交: LauraGPT
- 2024.01.30 [Commit#1c2fa98c](https://github.com/RVC-Boss/GPT-SoVITS/commit/1c2fa98ca8c325dcfb32797d22ff1c2a726d1cb4)
- 内容: 修复中文标点切割问题和句首句尾补标点的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.01.30 [Commit#74409f35](https://github.com/RVC-Boss/GPT-SoVITS/commit/74409f3570fa1c0ff28d4c65c288a6ce58ca00d2)
- 内容: 增加按标点符号切分.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.01.30 [Commit#c42eeccf](https://github.com/RVC-Boss/GPT-SoVITS/commit/c42eeccfdd2d0a0d714ecc8bfc22a12373aca6b7)
- 内容: 所有涉及路径的位置自动去除双引号, 解决复制路径带双引号时报错的问题.
- 类型: 修复
- 提交: RVC-Boss
## 202402 3-cmd-asr自动判断是否已自带damo模型, 如不在默认目录上将从modelscope自带下载
- 2024.02.01 [Commit#45f73519](https://github.com/RVC-Boss/GPT-SoVITS/commit/45f73519cc41cd17cf816d8b997a9dcb0bee04b6) 4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等)
- 内容: 修复 ASR 路径尾缀带有 `/` 时保存文件名报错的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.03 [Commit#dba1a74c](https://github.com/RVC-Boss/GPT-SoVITS/commit/dba1a74ccb0cf19a1b4eb93faf11d4ec2b1fc5d7)
- 内容: 修复 UVR5 读取格式错误导致分离失败的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.03 [Commit#3ebff70b](https://github.com/RVC-Boss/GPT-SoVITS/commit/3ebff70b71580ee1f97b3238c9442cbc5aef47c7)
- 内容: 支持中日英混合多种语言文本自动切分识别语种.
- 类型: 优化
- 提交: RVC-Boss
- 2024.02.03 [PR#377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)
- 内容: 引入 PaddleSpeech 的文本规范化, 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨, 下划线不再会报错.
- 类型: 优化
- 提交: KamioRinn
- 2024.02.05 [PR#395](https://github.com/RVC-Boss/GPT-SoVITS/pull/395)
- 内容: 优化英语文本前端.
- 类型: 优化
- 提交: KamioRinn
- 2024.02.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/65b463a787f31637b4768cc9a47cab59541d3927)
- 内容: 修正语种传参混乱导致中文推理效果下降.
- 类型: 修复
- 提交: RVC-Boss
- 关联: [Issue#391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)
- 2024.02.06 [PR#403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)
- 内容: UVR5 适配更高版本的 Librosa.
- 类型: 修复
- 提交: StaryLan
- 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
- 内容: 修复 UVR5 `inf everywhere` 报错的问题 (`is_half` 传参未转换布尔类型导致恒定半精度推理, 16系显卡会 `inf`).
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2)
- 内容: 修复 Gradio 依赖.
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.07 [PR#400](https://github.com/RVC-Boss/GPT-SoVITS/pull/400)
- 内容: 集成 Faster Whisper 实现对日语英语的语音识别.
- 类型: 新功能
- 提交: Shadow
- 2024.02.07 [Commit#6469048d](https://github.com/RVC-Boss/GPT-SoVITS/commit/6469048de12a8d6f0bd05d07f031309e61575a38)~[Commit#94ee71d9](https://github.com/RVC-Boss/GPT-SoVITS/commit/94ee71d9d562d10c9a1b96e745c6a6575aa66a10)
- 内容: 支持三连根目录留空自动读取 `.list` 全路径.
- 类型: 优化
- 提交: RVC-Boss
- 2024.02.08 [Commit#59f35ada](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)
- 内容: 修复 GPT 训练卡死 (win10 1909) 和系统语言繁体 GPT 训练报错.
- 类型: 修复
- 提交: RVC-Boss
- 关联: [Issue#232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)
- 2024.02.12 [PR#457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)
- 内容: 添加 DPO 损失实验性训练选项, 通过构造负样本训练缓解 GPT 重复漏字问题, 推理界面开放数个推理参数.
- 类型: 新功能
- 提交: liufenghua
- 2024.02.12 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/2fa74ecb941db27d9015583a9be6962898d66730), [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/d82f6bbb98ba725e6725dcee99b80ce71fb0bf28)
- 内容: 优化语音识别部分逻辑. Faster Whisper 转镜像站下载, 规避 HuggingFace 连接不上的问题.
- 类型: 优化
- 提交: RVC-Boss
- 2024.02.15 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2c4d6d7121bf82d29d0f0e4d788f3b231997c8)
- 内容: 训练支持中文实验名称.
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.15 [Commit#ccb9b08b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ccb9b08be3c58e102defcc94ff4fd609da9e27ee)~[Commit#895fde46](https://github.com/RVC-Boss/GPT-SoVITS/commit/895fde46e420040ed26aaf0c5b7e99359d9b199b)
- 内容: DPO 训练修改为可选项而非必选项, 若勾选则 Batch Size 自动减半, 修复推理界面新参数不传参的问题.
- 类型: 优化
- 提交: RVC-Boss
- 2024.02.15 [Commit#7b0c3c67](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b0c3c676495c64b2064aa472bff14b5c06206a5)
- 内容: 修复中文文本前端错误.
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.16 [PR#499](https://github.com/RVC-Boss/GPT-SoVITS/pull/499)
- 内容: 支持无参考文本输入.
- 类型: 新功能
- 提交: Watchtower-Liu
- 关联: [Issue#475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)
- 2024.02.17 [PR#509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509), [PR#507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR#532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR#556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR#559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559)
- 内容: 优化中文日文前端处理.
- 类型: 优化
- 提交: KamioRinn, v3cun
- 2024.02.17 [PR#510](https://github.com/RVC-Boss/GPT-SoVITS/pull/511), [PR#511](https://github.com/RVC-Boss/GPT-SoVITS/pull/511)
- 内容: 修复 Colab 不开启公网 URL 的问题.
- 类型: 修复
- 提交: ChanningWang2018, RVC-Boss
- 2024.02.21 [PR#557](https://github.com/RVC-Boss/GPT-SoVITS/pull/557)
- 内容: MacOS 推理设备从 MPS 改为 CPU (CPU 推理更快).
- 类型: 优化
- 提交: XXXXRT666
- 2024.02.21 [Commit#6da486c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/6da486c15d09e3d99fa42c5e560aaac56b6b4ce1), [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/5a17177342d2df1e11369f2f4f58d34a3feb1a35)
- 内容: 数据预处理添加语音降噪选项 (降噪为只剩 16K 采样率, 除非底噪很大否则不急使用).
- 类型: 新功能
- 提交: RVC-Boss
- 2024.02.28 [PR#573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)
- 内容: 修改 `is_half` 的判断让 MacOS 能正常 CPU 推理.
- 类型: 修复
- 提交: XXXXRT666
- 2024.02.28 [PR#610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)
- 内容: 修复 UVR5 MDXNet 参数顺序错误导致输出文件夹相反.
- 类型: 修复
- 提交: Yuze Wang
## 202403 5-清理TEMP文件夹缓存音频等文件
- 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) 6-大幅削弱合成音频包含参考音频结尾的问题
- 内容: Faster Whisper 在没有 CUDA 可用时自动使用 CPU 推理.
- 类型: 优化
- 提交: ShiroDoMain
- 2024.03.06 [Commit#616be20d](https://github.com/RVC-Boss/GPT-SoVITS/commit/616be20db3cf94f1cd663782fea61b2370704193) ### 20240122更新
- 内容: 使用 Faster Whisper 进行非中文语音识别时不再需要先下载 FunASR 模型.
- 类型: 优化
- 提交: RVC-Boss
- 2024.03.09 [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 内容: 加速推理 50% (RTX3090+PyTorch2.2.1+Cuda11.8+Win10+Py39 已测试).
- 类型: 优化
- 提交: GoHomeToMacDonal
- 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
- 内容: 新增 `fast_inference_` 快速推理分支.
- 类型: 新功能
- 提交: ChasonJiang
- 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
- 内容: 支持 CPU 训练, 在 MacOS 上使用 CPU 训练.
- 类型: 新功能
- 提交: Lion-Wu
- 2024.03.19 [PR#804](https://github.com/RVC-Boss/GPT-SoVITS/pull/804), [PR#812](https://github.com/RVC-Boss/GPT-SoVITS/pull/812), [PR#821](https://github.com/RVC-Boss/GPT-SoVITS/pull/821)
- 内容: 优化英文 G2P 文本前端.
- 类型: 优化
- 提交: KamioRinn
- 2024.03.30 [PR#894](https://github.com/RVC-Boss/GPT-SoVITS/pull/894)
- 内容: API 格式优化.
- 类型: 优化
- 提交: KamioRinn
## 202404 1-修复过短输出文件返回重复参考音频的问题.
- 2024.04.03 [PR#917](https://github.com/RVC-Boss/GPT-SoVITS/pull/917) 2-经测试, 英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符).
- 内容: 修复 UVR5 WebUI 调用 FFmpeg 时字符串格式.
- 类型: 修复
- 提交: StaryLan
## 202405 3-音频路径检查.如果尝试读取输入错的路径报错路径不存在, 而非ffmpeg错误.
- 2024.05.02 [PR#953](https://github.com/RVC-Boss/GPT-SoVITS/pull/953) ### 20240123更新
- 内容: 修复 SoVITS 训练未冻结 VQ 的问题 (可能造成效果下降).
- 类型: 修复
- 提交: hcwu1993
- 关联: [Issue#747](https://github.com/RVC-Boss/GPT-SoVITS/issues/747)
- 2024.05.19 [PR#1102](https://github.com/RVC-Boss/GPT-SoVITS/pull/1102)
- 内容: 添加训练数据预处理阶段不支持的语言提示.
- 类型: 优化
- 提交: StaryLan
- 2024.05.27 [PR#1132](https://github.com/RVC-Boss/GPT-SoVITS/pull/1132)
- 内容: 修复提取 HuBERT 特征 NaN 失败自动转 FP32 出现的错误.
- 类型: 修复
- 提交: XXXXRT666
## 202406 1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题
- 2024.06.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) 2-支持推理界面快速切换模型
- 内容: 修复 WebUI 进行 GPT 中文微调时未读取 BERT 特征导致和推理不一致, 大量训练可能导致效果变差的问题. 若已使用大量数据微调, 建议重新微调模型得到质量优化.
- 类型: 修复
- 提交: RVC-Boss
- 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)
- 内容: 修复 S2 训练进度条逻辑.
- 类型: 修复
- 提交: pengzhendong
- 2024.06.10 [Commit#501a74ae](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
- 内容: 修复 UVR5 MDXNet 调用 FFmpeg 时字符串格式, 兼容带空格路径.
- 类型: 修复
- 提交: RVC-Boss
- 2024.06.10 [PR#1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168), [PR#1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)
- 内容: 完善纯标点、多标点文本输入的判断逻辑.
- 类型: 修复
- 提交: XXXXRT666
- 关联: [Issue#1165](https://github.com/RVC-Boss/GPT-SoVITS/issues/1165)
- 2024.06.13 [Commit#db506705](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
- 内容: 修正 CPU 推理时默认 Batch Size 为小数的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2024.06.28 [PR#1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR#1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR#1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267)
- 内容: 修复降噪、识别时遇到异常跳出所有需处理的音频文件的问题.
- 类型: 修复
- 提交: XXXXRT666
- 2024.06.29 [Commit#a208698e](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
- 内容: 多卡训练多进程保存逻辑修复.
- 类型: 修复
- 提交: RVC-Boss
- 2024.06.29 [PR#1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251)
- 内容: 移除冗余 `my_utils.py`.
- 类型: 优化
- 提交: aoguai
- 关联: [Issue#1189](https://github.com/RVC-Boss/GPT-SoVITS/issues/1189)
## 202407 3-优化模型文件排序逻辑
- 2024.07.06 [PR#1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) 4-中文分词使用jieba_fast代替jieba
- 内容: 修复按标点符号切分时小数会被切分.
- 类型: 修复
- 提交: aoguai
- 2024.07.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/b0786f2998f1b2fce6678434524b4e0e8cc716f5)
- 内容: 验证倍速推理代码结果和原本一致, 合并到 `main` 分支, 支持无参考文本模式.
- 类型: 优化
- 提交: RVC-Boss, GoHomeToMacDonal
- 关联: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 后续逐渐验证快速推理分支的推理改动的一致性.
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) ### 20240126更新
- 内容: 重构 i18n 扫描并更新多语言配置文件.
- 类型: 文档
- 提交: StaryLan
- 2024.07.13 [PR#1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299)
- 内容: 修复用户输入的文件及路径结尾带 `/` 时导致命令行报错的问题.
- 类型: 修复
- 提交: XXXXRT666
- 2024.07.19 [PR#756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756)
- 内容: 修复训练 GPT 时采用自定义 bucket_sampler 导致步数不一致的问题.
- 类型: 修复
- 提交: huangxu1991
- 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340)
- 内容: 支持合成语速调节, 支持冻结随机性只调节语速, 并将其更新到`api.py` 上.
- 类型: 新功能
- 提交: RVC-Boss, 红血球AE3803
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356)
- 内容: 增加 BS-Roformer 人声伴奏分离模型支持.
- 类型: 新功能
- 提交: KamioRinn
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351)
- 内容: 更好的中文文本前端.
- 类型: 新功能
- 提交: KamioRinn
## 202408 (V2 版本)
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355)
- 内容: 添加自动填充下一步文件路径的功能.
- 类型: 杂项
- 提交: XXXXRT666
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
- 内容: 支持 BS-Roformer 的 FP16 推理.
- 类型: 性能优化
- 提交: RVC-Boss
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
- 内容: 增加用户友好逻辑, 对用户随意输入的显卡序号也能正常运行.
- 类型: 杂项
- 提交: RVC-Boss
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758)
- 内容: **新增 GPT-SoVITS V2 模型.**
- 类型: 新功能
- 提交: RVC-Boss
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
- 内容: 增加粤语 FunASR 支持.
- 类型: 新功能
- 提交: RVC-Boss
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388)
- 内容: 优化界面, 优化计时逻辑.
- 类型: 杂项
- 提交: XXXXRT666
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488)
- 内容: 优化多音字逻辑 (V2 版本特供).
- 类型: 修复, 新功能
- 提交: KamioRinn, RVC-Boss
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422)
- 内容: 修复参考音频混合只能上传一条的错误, 添加数据集检查, 缺失会弹出警告窗口.
- 类型: 修复, 杂项
- 提交: XXXXRT666
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508)
- 内容: 上游 LangSegment 库支持通过 SSML 标签优化数字、电话、时间日期等.
- 类型: 新功能
- 提交: juntaosun
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503)
- 内容: 修复并优化 API.
- 类型: 修复
- 提交: KamioRinn
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490)
- 内容: 合并 fast_inference 分支.
- 类型: 重构
- 提交: ChasonJiang
- 2024.08.21 **正式发布 GPT-SoVITS V2 版本.**
## 202502 (V3 版本)
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
- 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
- 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
- 提交: RVC-Boss
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
- 内容: 更新项目多语言文档.
- 类型: 文档
- 提交: StaryLan
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033)
- 内容: 更新日语文档.
- 类型: 文档
- 提交: Fyphen
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010)
- 内容: 优化注意力计算逻辑.
- 类型: 性能优化
- 提交: wzy3650
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040)
- 内容: 微调添加梯度检查点支持, 需要 12G 显存进行微调.
- 类型: 新功能
- 提交: Kakaru Hayate
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073)
- 内容: 切换新的语言分割工具, 优化多语种混合文本切分策略, 优化文本里的数字和英文处理逻辑.
- 类型: 新功能
- 提交: KamioRinn
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e)
- 内容: **GPT-SoVITS V3 模型支持 LoRA 训练, 需要 8G 显存进行微调.**
- 类型: 新功能
- 提交: RVC-Boss
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078)
- 内容: 人声背景音分离增加 Mel Band Roformer 模型支持.
- 类型: 新功能
- 提交: Sucial
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114)
- 内容: 修复中文路径下 Mecab 的报错 (具体表现为日文、韩文或混合语种文本切分时可能遇到的报错).
- 类型: 修复
- 提交: KamioRinn
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc)
- 内容: **支持使用 24KHz 转 48kHz 的音频超分模型**, 缓解 V3 模型生成音频感觉闷的问题.
- 类型: 新功能
- 提交: RVC-Boss
- 关联: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123)
- 内容: 更新项目多语言文档
- 类型: 文档
- 提交: StaryLan
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122)
- 内容: 对于模型无法判断的CJK短字符采用规则判断.
- 类型: 修复
- 提交: KamioRinn
- 关联: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5)
- 内容: 增加语速传参以支持调整合成语速.
- 类型: 修复
- 提交: RVC-Boss
- 2025.02.28 **正式发布 GPT-SoVITS V3**.
## 202503
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236)
- 内容: 修复一批由依赖的库版本不对导致的问题.
- 类型: 修复
- 提交: XXXXRT666
- 关联
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
- 内容: **为 SoVITS v3 适配并行推理**.
- 类型: 新功能
- 提交: ChasonJiang
- 修复其他若干错误.
- 整合包修复 onnxruntime GPU 推理的支持
- 类型: 修复
- 内容:
- G2PW 内的 ONNX 模型由 CPU 推理换为 GPU 推理, 显著降低推理的 CPU 瓶颈;
- foxjoy 去混响模型现在可使用 GPU 推理.
## 202504 (V4 版本)
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f)
- 内容: 解锁 SoVITS v3 并行推理, 修复模型加载异步逻辑.
- 类型: 修复
- 提交: RVC-Boss
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255)
- 内容: Ruff 格式化代码, 更新 G2PW 链接.
- 类型: 风格
- 提交: XXXXRT666
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290)
- 内容: 清理文档, 支持 Python 3.11, 更新安装文件.
- 类型: 杂项
- 提交: XXXXRT666
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300)
- 内容: 更新 Colab, 安装文件和模型下载.
- 类型: 杂项
- 提交: XXXXRT666
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf)
- 内容: **新增 GPT-SoVITS V4 模型**.
- 类型: 新功能
- 提交: RVC-Boss
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307)
- 内容: 适配 V4 并行推理.
- 类型: 新功能
- 提交: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309)
- 内容: 修复模型版本传参.
- 类型: 修复
- 提交: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310)
- 内容: 修复 Numpy 与 Numba 版本不匹配问题, 更新 librosa 版本.
- 类型: 修复
- 提交: RVC-Boss, XXXXRT666
- 关联: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
- **2025.04.22 正式发布 GPT-SoVITS V4**.
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311)
- 内容: 更新 Gradio 参数.
- 类型: 杂项
- 提交: XXXXRT666
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322)
- 内容: 完善 Colab/Kaggle Notebook 脚本.
- 类型: 杂项
- 提交: XXXXRT666
## 202505
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351)
- 内容: 完善 Docker, Windows 自动构建脚本, Pre-Commit 格式化.
- 类型: 杂项
- 提交: XXXXRT666
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408)
- 内容: 优化混合语种切分识别逻辑.
- 类型: 修复
- 提交: KamioRinn
- 关联: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377)
- 内容: 通过缓存策略使 SoVITS V3/V4 推理提速 10%.
- 类型: 性能优化
- 提交: Kakaru Hayate
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba)
- 内容: 更新标注界面, 增加友情提示, 即标注完每一页都要点击 `Submit Text` 否则修改无效.
- 类型: 修复
- 提交: RVC-Boss
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc)
- 内容: 修复 UVR5 和 ONNX 去混响模型使用 FFmpeg 编码 MP3 和 M4A 原路径带空格时的错误.
- 类型: 修复
- 提交: RVC-Boss
## 202506 (V2Pro 系列)
- 2025.06.03 [PR#2420](https://github.com/RVC-Boss/GPT-SoVITS/pull/2420)
- 内容: 更新项目多语言文档.
- 类型: 文档
- 提交: StaryLan
- 2025.06.04 [PR#2417](https://github.com/RVC-Boss/GPT-SoVITS/pull/2417)
- 内容: 支持 torchscript 导出 V4 模型.
- 类型: 新功能
- 提交: L-jasmine
- 2025.06.04 [Commit#b7c0c5ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/b7c0c5ca878bcdd419fd86bf80dba431a6653356)~[Commit#298ebb03](https://github.com/RVC-Boss/GPT-SoVITS/commit/298ebb03c5a719388527ae6a586c7ea960344e70)
- 内容: **新增 GPT-SoVITS V2Pro 系列模型**.
- 类型: 新功能
- 提交: RVC-Boss
- 2025.06.05 [PR#2426](https://github.com/RVC-Boss/GPT-SoVITS/pull/2426)
- 内容: `config/inference_webui` 初始化错误修复.
- 类型: 修复
- 提交: StaryLan
- 2025.06.05 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427), [Commit#7d70852a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7d70852a3f67c3b52e3a62857f8663d529efc8cd), [PR#2434](https://github.com/RVC-Boss/GPT-SoVITS/pull/2434)
- 内容: 优化精度自动检测逻辑, 给 WebUI 前端界面模块增加折叠功能.
- 类型: 新功能
- 提交: XXXXRT666, RVC-Boss
- 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
- 内容: 修复 "X一X" 型多音字的判断逻辑.
- 类型: 修复
- 提交: wzy3650
- 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
- 内容: 修复配置及 SoVITS 模型读取的问题.
- 类型: 修复
- 提交: wzy3650
- 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
- 内容: 修复 `ge.sum` 数值可能溢出导致推理无声的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
- 内容: 修复实验名结尾带空格时在 Windows 下路径不正确的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.10 [Commit#746cb536](https://github.com/RVC-Boss/GPT-SoVITS/commit/746cb536c68b1fe6ce3ca7e882235375b8a8dd89)
- 内容: 语种分割优化
- 类型: 优化
- 提交: KamioRinn
- 2025.06.11 [Commit#dd2b9253](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2b9253aabb09db32db7a3344570ed9df043351)
- 内容: 修复并行推理对 V2Pro 支持的 Bug.
- 类型: 修复
- 提交: YYuX-1145
- 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
- 内容: 修复 V2Pro 提取 `ge` 时出现数值溢出的问题.
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.11 [Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9), [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
- 内容: install.sh逻辑优化
- 类型: 优化
- 提交: XXXXRT666
- 2025.06.27 [Commit#90ebefa7](https://github.com/RVC-Boss/GPT-SoVITS/commit/90ebefa78fd544da36eebe0b2003620879c921b0)
- 内容: 优化 onnxruntime 加载逻辑中对 GPU/CPU 的判断.
- 类型: 优化
- 提交: KamioRinn
- 2025.06.27 [Commit#6df61f58](https://github.com/RVC-Boss/GPT-SoVITS/commit/6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23)
- 内容: 语言分割及格式化优化
- 类型: 优化
- 提交: KamioRinn
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 内容: 提升推理进程优先级, 修复 Win11 下 GPU 利用率可能受限的问题.
- 类型: 修复
- 提交: XianYue0125
### 20240130更新
1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错
2-修复中英文标点切割问题和句首句尾补标点的问题
3-增加按标点符号切分
### 20240201更新
1-修复uvr5读取格式错误导致分离失败的问题
2-支持中日英混合多种文本自动切分识别语种
### 20240202更新
1-修复asr路径尾缀带/保存文件名报错
2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨,下划线不再会报错
### 20240207更新
1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391
2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403
3-[修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理, 16系显卡会inf)](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
4-优化英文文本前端
5-修复gradio依赖
6-支持三连根目录留空自动读取.list全路径
7-集成faster whisper ASR日文英文
### 20240208更新
1-GPT训练卡死 (win10 1909) 和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体) GPT训练报错, [尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b).
### 20240212更新
1-faster whisper和funasr逻辑优化.faster whisper转镜像站下载, 规避huggingface连不上的问题.
2-DPO Loss实验性训练选项开启, 通过构造负样本训练缓解GPT重复漏字问题.推理界面公开几个推理参数. https://github.com/RVC-Boss/GPT-SoVITS/pull/457
### 20240214更新
1-训练支持中文实验名 (原来会报错)
2-DPO训练改为可勾选选项而非必须.如勾选batch size自动减半.修复推理界面新参数不传参的问题.
### 20240216更新
1-支持无参考文本输入
2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475
### 20240221更新
1-数据处理添加语音降噪选项 (降噪为只剩16k采样率, 除非底噪很大先不急着用哦).
2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
3-mac CPU推理更快因此把推理设备从mps改到CPU
4-colab修复不开启公网url
### 20240306更新
1-推理加速50% (RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested) https://github.com/RVC-Boss/GPT-SoVITS/pull/672
2-如果用faster whisper非中文ASR不再需要先下中文funasr模型
3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610
4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675
5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573
### 202403/202404/202405更新
2个重点
1-修复sovits训练未冻结vq的问题 (可能造成效果下降)
2-增加一个快速推理分支
以下都是小修补
1-修复无参考文本模式问题
2-优化中英文文本前端
3-api格式优化
4-cmd格式问题修复
5-训练数据处理阶段不支持的语言提示报错
6-nan自动转fp32阶段的hubert提取bug修复
### 20240610
小问题修复:
1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169
2-uvr5中的mdxnet去混响cmd格式修复, 兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159
大问题修复:
4-修复了webui的GPT中文微调没读到bert导致和推理不一致, 训练太多可能效果还会变差的问题.如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
### 20240706
小问题修复:
1-[修正CPU推理默认bs小数](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
2-修复降噪、asr中途遇到异常跳出所有需处理的音频文件的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1258 https://github.com/RVC-Boss/GPT-SoVITS/pull/1265 https://github.com/RVC-Boss/GPT-SoVITS/pull/1267
3-修复按标点符号切分时小数会被切分 https://github.com/RVC-Boss/GPT-SoVITS/pull/1253
4-[多卡训练多进程保存逻辑修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
5-移除冗余my_utils https://github.com/RVC-Boss/GPT-SoVITS/pull/1251
重点:
6-倍速推理代码经过验证后推理效果和base完全一致, 合并进main.使用的代码: https://github.com/RVC-Boss/GPT-SoVITS/pull/672 .支持无参考文本模式也倍速.
后面会逐渐验证快速推理分支的推理改动的一致性
### 20240727
1-清理冗余i18n代码 https://github.com/RVC-Boss/GPT-SoVITS/pull/1298
2-修复用户打文件及路径在结尾添加/会导致命令行报错的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1299
3-修复GPT训练的step计算逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/756
重点:
4-[支持合成语速调节, 支持冻结随机性只调节语速](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), 并将其更新到 api.py 上 https://github.com/RVC-Boss/GPT-SoVITS/pull/1340
### 20240806
1-增加bs-roformer人声伴奏分离模型支持. https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理.](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
2-更好的中文文本前端. https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑 (v2版本特供). https://github.com/RVC-Boss/GPT-SoVITS/pull/488
3-自动填充下一步的文件路径 https://github.com/RVC-Boss/GPT-SoVITS/pull/1355
4-增加喂饭逻辑, 用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
5-增加粤语ASR支持 [8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
6-GPT-SoVITS-v2支持
7-计时逻辑优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/1387
### 20240821
1-fast_inference分支合并进main: https://github.com/RVC-Boss/GPT-SoVITS/pull/1490
2-支持通过ssml标签优化数字、电话、时间日期等: https://github.com/RVC-Boss/GPT-SoVITS/issues/1508
3-api修复优化: https://github.com/RVC-Boss/GPT-SoVITS/pull/1503
4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
5-增加了各种数据集检查,若缺失会弹出warning:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
### 20250211
增加gpt-sovits-v3模型, 需要14G显存可以微调
### 20250212
sovits-v3微调支持开启梯度检查点, 需要12G显存可以微调https://github.com/RVC-Boss/GPT-SoVITS/pull/2040
### 20250214
优化多语种混合文本切分策略a https://github.com/RVC-Boss/GPT-SoVITS/pull/2047
### 20250217
优化文本里的数字和英文处理逻辑https://github.com/RVC-Boss/GPT-SoVITS/pull/2062
### 20250218
优化多语种混合文本切分策略b https://github.com/RVC-Boss/GPT-SoVITS/pull/2073
### 20250223
1-sovits-v3微调支持lora训练, 需要8G显存可以微调, 效果比全参微调更好
2-人声背景音分离增加mel band roformer模型支持https://github.com/RVC-Boss/GPT-SoVITS/pull/2078
### 20250226
https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT-SoVITS/pull/2114
修复中文路径下mecab的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错)
### 20250227
针对v3生成24k音频感觉闷的问题 (https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 , https://github.com/RVC-Boss/GPT-SoVITS/issues/2117), 支持使用24k转48k的音频超分模型缓解.
### 20250228
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
修复v3sovits未传参以支持调节语速
### 202503
修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
修复其他若干bug
重点更新:
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
2-整合包修复onnxruntime GPU推理的支持, 影响: (1) g2pw有个onnx模型原先是CPU推理现在用GPU, 显著降低推理的CPU瓶颈 (2) foxjoy去混响模型现在可使用GPU推理

View File

@ -7,18 +7,12 @@
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
[![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org)
[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
[![Huggingface](https://img.shields.io/badge/免费在线体验-free_online_demo-yellow.svg?style=for-the-badge&logo=huggingface)](https://lj1995-gpt-sovits-proplus.hf.space/)
[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
@ -26,21 +20,21 @@
---
## 功能
1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
3. **跨语言支持:** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文.
4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型.
**查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)**
未见过的说话者 few-shot 微调演示:
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
@ -51,13 +45,12 @@
### 测试通过的环境
| Python Version | PyTorch Version  | Device        |
| -------------- | ---------------- | ------------- |
| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
| Python 3.9     | PyTorch 2.2.2    | CPU           |
### Windows
@ -66,50 +59,34 @@
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
```
### Linux
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
```
### macOS
**注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.**
运行以下的命令来安装本项目:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
```
### 手动安装
#### 安装依赖
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
#### 安装 FFmpeg
##### Conda 用户
```bash
conda activate GPTSoVits
conda install ffmpeg
```
@ -118,13 +95,14 @@ conda install ffmpeg
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
conda install -c conda-forge 'ffmpeg<7'
```
##### Windows 用户
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下.
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境.
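以下是一个下载示意 (非官方步骤, 仅供参考): 假设系统自带 `curl` 且可直接访问 Hugging Face, 注意需把上面页面链接中的 `blob` 换成 `resolve` 才能获取原始文件:

```bash
# 示意: 在 GPT-SoVITS 仓库根目录下执行, 将 ffmpeg.exe / ffprobe.exe 下载到根目录
curl -L -o ffmpeg.exe  https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/ffmpeg.exe
curl -L -o ffprobe.exe https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/ffprobe.exe
```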
##### MacOS 用户
@ -132,54 +110,38 @@ sudo apt install libsox-dev
brew install ffmpeg
```
### 运行 GPT-SoVITS (使用 Docker)
#### Docker 镜像选择
由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意:
- 前往 [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 查看最新可用的镜像标签(tags)
- 根据你的运行环境选择合适的镜像标签
- `Lite` Docker 镜像**不包含** ASR 模型和 UVR5 模型. 你可以自行下载 UVR5 模型, ASR 模型则会在需要时由程序自动下载
- 在使用 Docker Compose 时, 会自动拉取适配的架构镜像 (amd64 或 arm64)
- Docker Compose 将会挂载当前目录的**所有文件**, 请在使用 Docker 镜像前先切换到项目根目录并**拉取代码更新**
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
#### 环境变量
- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用
#### 共享内存配置
在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`)
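下面给出一个 `docker run` 的最小示意 (镜像标签与挂载路径仅为占位, 请按实际情况替换; 若使用 Docker Compose, 对应的是 service 中的 `environment` 与 `shm_size` 字段), 演示如何同时设置 `is_half` 与共享内存大小:

```bash
# 最小示意: 镜像标签请先在 Docker Hub 确认后替换; 挂载路径与端口仅为示例
docker run --rm -it --gpus=all \
  -e is_half=true \
  --shm-size=16g \
  -v "$(pwd)":/workspace/GPT-SoVITS \
  -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 -p 9880:9880 \
  xxxxrt666/gpt-sovits:latest
```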
#### 选择服务
`docker-compose.yaml` 文件定义了两个主要服务类型:
- `GPT-SoVITS-CU126` 和 `GPT-SoVITS-CU128`:完整版, 包含所有功能
- `GPT-SoVITS-CU126-Lite` 和 `GPT-SoVITS-CU128-Lite`:轻量版, 依赖更少, 功能略有删减
如需使用 Docker Compose 运行指定服务, 请执行:
```bash
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
#### 本地构建 Docker 镜像
如果你希望自行构建镜像, 请使用以下命令: #### docker-compose.yaml 设置
```bash 0. image 的标签: 由于代码库更新很快, 镜像的打包和测试又很慢, 所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(旧版本) 查看当前打包好的最新的镜像并根据自己的情况选用, 或者在本地根据您自己的需求通过 Dockerfile 进行构建.
bash docker_build.sh --cuda <12.6|12.8> [--lite] 1. 环境变量:
- is_half: 半精度/双精度控制.在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时, 一般都是它引起的, 可以根据实际情况来调整为 True 或者 False.
2. Volume 设置, 容器内的应用根目录设置为 /workspace. 默认的 docker-compose.yaml 中列出了一些实际的例子, 便于上传/下载内容.
3. shm_size: Windows 下的 Docker Desktop 默认可用内存过小, 会导致运行异常, 根据自己情况酌情设置.
4. deploy 小节下的 gpu 相关内容, 请根据您的系统和实际情况酌情设置.
#### 通过 docker compose 运行
```
docker compose -f "docker-compose.yaml" up -d
``` ```
#### 访问运行中的容器 (Bash Shell) #### 通过 docker 命令运行
当容器在后台运行时, 你可以通过以下命令进入容器: 同上, 根据您自己的实际情况修改对应的参数, 然后运行如下命令:
```bash ```
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
``` ```
## 预训练模型 ## 预训练模型
@ -251,12 +213,12 @@ python webui.py v1 <language(optional)>
#### 现已支持自动填充路径
1. 填入训练音频路径
2. 切割音频
3. 进行降噪(可选)
4. 进行 ASR
5. 校对标注
6. 前往下一个窗口, 点击训练
### 打开推理 WebUI
@ -298,7 +260,7 @@ python webui.py
2. 需要克隆 github 上的最新代码
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下
中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
@ -318,45 +280,13 @@ python webui.py
2. 需要克隆 github 上的最新代码
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
## V4 更新说明
新特性:
1. **V4 版本修复了 V3 版本中由于非整数倍上采样导致的金属音问题, 并原生输出 48kHz 音频以避免声音闷糊 (而 V3 版本仅原生输出 24kHz 音频)**. 作者认为 V4 是对 V3 的直接替代, 但仍需进一步测试.
[更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从 V1/V2/V3 环境迁移至 V4
1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
2. 从 GitHub 克隆最新代码.
3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V4 预训练模型 (`gsv-v4-pretrained/s2v4.ckpt` 和 `gsv-v4-pretrained/vocoder.pth`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
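下面是一个下载命令的示意 (假设已通过 `pip install -U huggingface_hub` 获得 `huggingface-cli`; 国内网络可配合 HF 镜像使用, 具体文件名以官方仓库为准):

```bash
# 示意: 将 V4 新增权重下载到 GPT_SoVITS/pretrained_models (文件会保留 gsv-v4-pretrained/ 子目录结构)
huggingface-cli download lj1995/GPT-SoVITS \
  gsv-v4-pretrained/s2v4.ckpt gsv-v4-pretrained/vocoder.pth \
  --local-dir GPT_SoVITS/pretrained_models
```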
## V2Pro 更新说明
新特性:
1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
[更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.
从 V1/V2/V3/V4 环境迁移至 V2Pro
1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
2. 从 GitHub 克隆最新代码.
3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V2Pro 预训练模型 (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 和 `sv/pretrained_eres2netv2w24s4ep4.ckpt`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
## 待办事项清单
- [x] **高优先级:**
- [x] 日语和英语的本地化.
- [x] 用户指南.
@ -378,7 +308,7 @@ python webui.py
使用命令行打开 UVR5 的 WebUI
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
@ -389,7 +319,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
这是使用命令行完成数据集的音频切分的方式
```bash
python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -401,15 +331,15 @@ python audio_slicer.py \
这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
```bash
python tools/asr/funasr_asr.py -i <input> -o <output>
```
通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记)
(没有进度条, GPU 性能可能会导致时间延迟)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
@ -436,7 +366,6 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
- [eresnetv2](https://modelscope.cn/models/iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common)
### 推理用文本前端

View File

@ -1,580 +1,222 @@
# Changelog
## 202401
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
- Content: Added English system translation support to WebUI.
- Type: Documentation
- Contributor: D3lik
- 2024.01.21 [Commit#7b89c9ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b89c9ed5669f63c4ed6ae791408969640bdcf3e)
- Content: Attempted to fix SoVITS training ZeroDivisionError.
- Type: Fix
- Contributor: RVC-Boss, Tybost
- Related: [Issue#79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
- 2024.01.21 [Commit#ea62d6e0](https://github.com/RVC-Boss/GPT-SoVITS/commit/ea62d6e0cf1efd75287766ea2b55d1c3b69b4fd3)
- Content: Significantly reduced the issue of synthesized audio containing the end of the reference audio.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.01.21 [Commit#a87ad522](https://github.com/RVC-Boss/GPT-SoVITS/commit/a87ad5228ed2d729da42019ae1b93171f6a745ef)
- Content: `cmd-asr.py` now checks if the FunASR model is included in the default directory, and if not, it will download it from ModelScope.
- Type: Feature
- Contributor: RVC-Boss
- 2024.01.21 [Commit#f6147116](https://github.com/RVC-Boss/GPT-SoVITS/commit/f61471166c107ba56ccb7a5137fa9d7c09b2830d)
- Content: `Config.py` now has an `is_share` parameter, which can be set to `True` to map the WebUI to the public network.
- Type: Feature
- Contributor: RVC-Boss
- 2024.01.21 [Commit#102d5081](https://github.com/RVC-Boss/GPT-SoVITS/commit/102d50819e5d24580d6e96085b636b25533ecc7f)
- Content: Cleaned up cached audio files and other files in the `TEMP` folder.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.01.22 [Commit#872134c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/872134c846bcb8f1909a3f5aff68a6aa67643f68)
- Content: Fixed the issue where excessively short output files resulted in repeating the reference audio.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.22 Tested native support for English and Japanese training (Japanese training requires the root directory to be free of non-English special characters).
- 2024.01.22 [PR#124](https://github.com/RVC-Boss/GPT-SoVITS/pull/124)
- Content: Improved audio path checking. If an attempt is made to read from an incorrect input path, it will report that the path does not exist instead of an ffmpeg error.
- Type: Optimization
- Contributor: xmimu
- 2024.01.23 [Commit#93c47cd9](https://github.com/RVC-Boss/GPT-SoVITS/commit/93c47cd9f0c53439536eada18879b4ec5a812ae1)
- Content: Resolved the issue where Hubert extraction caused NaN errors, leading to SoVITS/GPT training ZeroDivisionError.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.23 [Commit#80fffb0a](https://github.com/RVC-Boss/GPT-SoVITS/commit/80fffb0ad46e4e7f27948d5a57c88cf342088d50)
- Content: Replaced `jieba` with `jieba_fast` for Chinese word segmentation.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.01.23 [Commit#63625758](https://github.com/RVC-Boss/GPT-SoVITS/commit/63625758a99e645f3218dd167924e01a0e3cf0dc)
- Content: Optimized model file sorting logic.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.01.23 [Commit#0c691191](https://github.com/RVC-Boss/GPT-SoVITS/commit/0c691191e894c15686e88279745712b3c6dc232f)
- Content: Added support for quick model switching in the inference WebUI.
- Type: Feature
- Contributor: RVC-Boss
- 2024.01.25 [Commit#249561e5](https://github.com/RVC-Boss/GPT-SoVITS/commit/249561e5a18576010df6587c274d38cbd9e18b4b)
- Content: Removed redundant logs in the inference WebUI.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.01.25 [PR#183](https://github.com/RVC-Boss/GPT-SoVITS/pull/183), [PR#200](https://github.com/RVC-Boss/GPT-SoVITS/pull/200)
- Content: Supported training and inference on Mac.
- Type: Feature
- Contributor: Lion-Wu
- 2024.01.26 [Commit#813cf96e](https://github.com/RVC-Boss/GPT-SoVITS/commit/813cf96e508ba1bb2c658f38c7cc77b797fb4082), [Commit#2d1ddeca](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d1ddeca42db90c3fe2d0cd79480fd544d87f02b)
- Content: Fixed the issue of UVR5 reading and automatically jumping out of directories.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.26 [PR#204](https://github.com/RVC-Boss/GPT-SoVITS/pull/204)
- Content: Added support for Chinese-English mixed and Japanese-English mixed output texts.
- Type: Feature
- Contributor: Kakaru Hayate
- 2024.01.26 [Commit#f4148cf7](https://github.com/RVC-Boss/GPT-SoVITS/commit/f4148cf77fb899c22bcdd4e773d2f24ab34a73e7)
- Content: Added optional segmentation mode for output.
- Type: Feature
- Contributor: RVC-Boss
- 2024.01.26 [Commit#9fe955c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/9fe955c1bf5f94546c9f699141281f2661c8a180)
- Content: Fixed multiple newline issues causing inference errors.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.26 [Commit#84ee4719](https://github.com/RVC-Boss/GPT-SoVITS/commit/84ee471936b332bc2ccee024d6dfdedab4f0dc7b)
- Content: Automatically forced single precision for GPU that do not support half precision; enforced single precision under CPU inference.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.01.28 [PR#238](https://github.com/RVC-Boss/GPT-SoVITS/pull/238)
- Content: Completed model downloading process in the Dockerfile.
- Type: Fix
- Contributor: breakstring
- 2024.01.28 [PR#257](https://github.com/RVC-Boss/GPT-SoVITS/pull/257)
- Content: Fixed the issue with the pronunciation of numbers converting to Chinese characters.
- Type: Fix
- Contributor: duliangang
- 2024.01.28 [Commit#f0cfe397](https://github.com/RVC-Boss/GPT-SoVITS/commit/f0cfe397089a6fd507d678c71adeaab5e7ed0683)
- Content: Fixed the issue where GPT training did not save checkpoints.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.28 [Commit#b8ae5a27](https://github.com/RVC-Boss/GPT-SoVITS/commit/b8ae5a2761e2654fc0c905498009d3de9de745a8)
- Content: Excluded unreasonable reference audio lengths by setting restrictions.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.28 [Commit#698e9655](https://github.com/RVC-Boss/GPT-SoVITS/commit/698e9655132d194b25b86fbbc99d53c8d2cea2a3)
- Content: Fixed the issue where swallowing a few characters at the beginning of sentences.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.29 [Commit#ff977a5f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff977a5f5dc547e0ad82b9e0f1cd95fbc830b2b0)
- Content: Changed training configurations to single precision for GPUs like the 16 series, which have issues with half precision training.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.29 [Commit#172e139f](https://github.com/RVC-Boss/GPT-SoVITS/commit/172e139f45ac26723bc2cf7fac0112f69d6b46ec)
- Content: Tested and updated the available Colab version.
- Type: Feature
- Contributor: RVC-Boss
- 2024.01.29 [PR#135](https://github.com/RVC-Boss/GPT-SoVITS/pull/135)
- Content: Updated FunASR to Version 1.0 and fixed errors caused by interface misalignment.
- Type: Fix
- Contributor: LauraGPT
- 2024.01.30 [Commit#1c2fa98c](https://github.com/RVC-Boss/GPT-SoVITS/commit/1c2fa98ca8c325dcfb32797d22ff1c2a726d1cb4)
- Content: Fixed issues with splitting Chinese and English punctuation and added punctuation at the beginning and end of sentences.
- Type: Fix
- Contributor: RVC-Boss
- 2024.01.30 [Commit#74409f35](https://github.com/RVC-Boss/GPT-SoVITS/commit/74409f3570fa1c0ff28d4c65c288a6ce58ca00d2)
- Content: Added splitting by punctuation.
- Type: Feature
- Contributor: RVC-Boss
- 2024.01.30 [Commit#c42eeccf](https://github.com/RVC-Boss/GPT-SoVITS/commit/c42eeccfdd2d0a0d714ecc8bfc22a12373aca6b7)
- Content: Automatically removed double quotes from all path-related entries to prevent errors from novice users copying paths with double quotes.
- Type: Fix
- Contributor: RVC-Boss
## 202402
- 2024.02.01 [Commit#45f73519](https://github.com/RVC-Boss/GPT-SoVITS/commit/45f73519cc41cd17cf816d8b997a9dcb0bee04b6)
- Content: Fixed the issue where an ASR path ending with `/` caused an error in saving the filename.
- Type: Fix
- Contributor: RVC-Boss
- 2024.02.03 [Commit#dba1a74c](https://github.com/RVC-Boss/GPT-SoVITS/commit/dba1a74ccb0cf19a1b4eb93faf11d4ec2b1fc5d7)
- Content: Fixed the UVR5 format reading error causing separation failures.
- Type: Fix
- Contributor: RVC-Boss
- 2024.02.03 [Commit#3ebff70b](https://github.com/RVC-Boss/GPT-SoVITS/commit/3ebff70b71580ee1f97b3238c9442cbc5aef47c7)
- Content: Supported automatic segmentation and language recognition for mixed Chinese-Japanese-English texts.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.02.03 [PR#377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)
- Content: introduced PaddleSpeech's Normalizer to fix issues like reading "xx.xx%" (percent symbols) and "元/吨" being read as "元吨" instead of "元每吨", and fixed underscore errors.
- Type: Optimization
- Contributor: KamioRinn
- 2024.02.05 [PR#395](https://github.com/RVC-Boss/GPT-SoVITS/pull/395)
- Content: Optimized English text frontend.
- Type: Optimization
- Contributor: KamioRinn
- 2024.02.06 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/65b463a787f31637b4768cc9a47cab59541d3927)
- Content: Corrected language parameter confusion causing decreased Chinese inference quality.
- Type: Fix
- Contributor: RVC-Boss
- Related: [Issue#391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)
- 2024.02.06 [PR#403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)
- Content: Adapted UVR5 to higher versions of librosa.
- Type: Fix
- Contributor: StaryLan
- 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
- Content: Fixed UVR5 inf everywhere error caused by `is_half` parameter not converting to boolean, resulting in constant half precision inference, which caused `inf` on 16 series GPUs.
- Type: Fix
- Contributor: RVC-Boss
- 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2)
- Content: Fixed Gradio dependencies.
- Type: Fix
- Contributor: RVC-Boss
- 2024.02.07 [PR#400](https://github.com/RVC-Boss/GPT-SoVITS/pull/400)
- Content: Integrated Faster Whisper ASR for Japanese and English.
- Type: Feature
- Contributor: Shadow
- 2024.02.07 [Commit#6469048d](https://github.com/RVC-Boss/GPT-SoVITS/commit/6469048de12a8d6f0bd05d07f031309e61575a38)~[Commit#94ee71d9](https://github.com/RVC-Boss/GPT-SoVITS/commit/94ee71d9d562d10c9a1b96e745c6a6575aa66a10)
- Content: Supported automatic reading of `.list` full paths if the root directory is left blank during dataset preparation.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.02.08 [Commit#59f35ada](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)
- Content: Attempted to fix GPT training hang on Windows 10 1909 and Traditional Chinese System Language.
- Type: Fix
- Contributor: RVC-Boss
- Related: [Issue#232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)
- 2024.02.12 [PR#457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)
- Content: Enabled experimental DPO Loss training option to mitigate GPT repetition and missing characters by constructing negative samples during training and made several inference parameters available in the inference WebUI.
- Type: Feature
- Contributor: liufenghua
- 2024.02.12 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/2fa74ecb941db27d9015583a9be6962898d66730), [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/d82f6bbb98ba725e6725dcee99b80ce71fb0bf28)
- Content: Optimized logic for Faster Whisper and FunASR, switching Faster Whisper to mirror downloads to avoid issues with Hugging Face connections.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.02.15 [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2c4d6d7121bf82d29d0f0e4d788f3b231997c8)
- Content: Supported Chinese experiment names in training (previously caused errors).
- Type: Fix
- Contributor: RVC-Boss
- 2024.02.15 [Commit#ccb9b08b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ccb9b08be3c58e102defcc94ff4fd609da9e27ee)~[Commit#895fde46](https://github.com/RVC-Boss/GPT-SoVITS/commit/895fde46e420040ed26aaf0c5b7e99359d9b199b)
- Content: Made DPO training an optional feature instead of mandatory. If selected, the batch size is automatically halved. Fixed issues with new parameters not being passed in the inference WebUI.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.02.15 [Commit#7b0c3c67](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b0c3c676495c64b2064aa472bff14b5c06206a5)
- Content: Fixed bugs in Chinese frontend.
- Type: Fix
- Contributor: RVC-Boss
- 2024.02.16 [PR#499](https://github.com/RVC-Boss/GPT-SoVITS/pull/499)
- Content: Supported input without reference text.
- Type: Feature
- Contributor: Watchtower-Liu
- Related: [Issue#475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)
- 2024.02.17 [PR#509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509), [PR#507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR#532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR#556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR#559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559)
- Content: Optimized Chinese and Japanese frontend processing.
- Type: Optimization
- Contributor: KamioRinn, v3cun
- 2024.02.17 [PR#510](https://github.com/RVC-Boss/GPT-SoVITS/pull/511), [PR#511](https://github.com/RVC-Boss/GPT-SoVITS/pull/511)
- Content: Fixed Colab public URL issue.
- Type: Fix
- Contributor: ChanningWang2018, RVC-Boss
- 2024.02.21 [PR#557](https://github.com/RVC-Boss/GPT-SoVITS/pull/557)
- Content: Switched Mac CPU inference to use CPU instead of MPS for faster performance.
- Type: Optimization
- Contributor: XXXXRT666
- 2024.02.21 [Commit#6da486c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/6da486c15d09e3d99fa42c5e560aaac56b6b4ce1), [Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/5a17177342d2df1e11369f2f4f58d34a3feb1a35)
- Content: Added a noise reduction option during data processing (noise reduction leaves only 16kHz sampling rate; use only if the background noise is significant).
- Type: Feature
- Contributor: RVC-Boss
- 2024.02.28 [PR#573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)
- Content: Modified `is_half` check to ensure proper CPU inference on Mac.
- Type: Fix
- Contributor: XXXXRT666
- 2024.02.28 [PR#610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)
- Content: Fixed UVR5 reverb removal model where the setting was reversed.
- Type: Fix
- Contributor: Yuze Wang
## 202403
- 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)
- Content: Enabled automatic CPU inference for Faster Whisper if no CUDA is available.
- Type: Optimization
- Contributor: ShiroDoMain
- 2024.03.06 [Commit#616be20d](https://github.com/RVC-Boss/GPT-SoVITS/commit/616be20db3cf94f1cd663782fea61b2370704193)
- Content: No longer requires downloading the Chinese FunASR model first when using Faster Whisper non-Chinese ASR.
- Type: Optimization
- Contributor: RVC-Boss
- 2024.03.09 [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- Content: Accelerated inference by 50% (tested on RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39).
- Type: Optimization
- Contributor: GoHomeToMacDonal
- 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
- Content: Added a quick inference branch `fast_inference_`.
- Type: Feature
- Contributor: ChasonJiang
- 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
- Content: Supported CPU training, using CPU for training on macOS.
- Type: Feature
- Contributor: Lion-Wu
- 2024.03.19 [PR#804](https://github.com/RVC-Boss/GPT-SoVITS/pull/804), [PR#812](https://github.com/RVC-Boss/GPT-SoVITS/pull/812), [PR#821](https://github.com/RVC-Boss/GPT-SoVITS/pull/821)
- Content: Optimized the English text frontend.
- Type: Optimization
- Contributor: KamioRinn
- 2024.03.30 [PR#894](https://github.com/RVC-Boss/GPT-SoVITS/pull/894)
- Content: Improved API format.
- Type: Optimization
- Contributor: KamioRinn
## 202404
- 2024.04.03 [PR#917](https://github.com/RVC-Boss/GPT-SoVITS/pull/917)
- Content: Corrected FFmpeg command string formatting in UVR5 WebUI.
- Type: Fix
- Contributor: StaryLan
## 202405
- 2024.05.02 [PR#953](https://github.com/RVC-Boss/GPT-SoVITS/pull/953)
- Content: Fixed the issue of SoVITS training without freezing VQ (which could cause quality degradation).
- Type: Fix
- Contributor: hcwu1993
- Related: [Issue#747](https://github.com/RVC-Boss/GPT-SoVITS/issues/747)
- 2024.05.19 [PR#1102](https://github.com/RVC-Boss/GPT-SoVITS/pull/1102)
- Content: Added error prompts for unsupported languages during training data processing.
- Type: Optimization
- Contributor: StaryLan
- 2024.05.27 [PR#1132](https://github.com/RVC-Boss/GPT-SoVITS/pull/1132)
- Content: Fixed the bug in Hubert extraction.
- Type: Fix
- Contributor: XXXXRT666
## 202406
- 2024.06.06 [Commit#99f09c8b](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
- Content: Fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation.
- Type: Fix
- Contributor: RVC-Boss
### 20240201 Update
1. Fixed the UVR5 format reading error causing separation failures.
2. Supported automatic segmentation and language recognition for mixed Chinese-Japanese-English texts.
### 20240202 Update
1. Fixed the issue where an ASR path ending with `/` caused an error in saving the filename.
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) introduced PaddleSpeech's Normalizer to fix issues like reading "xx.xx%" (percent symbols) and "元/吨" being read as "元吨" instead of "元每吨", and fixed underscore errors.
### 20240207 Update
1. Corrected language parameter confusion causing decreased Chinese inference quality reported in [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391).
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) adapted UVR5 to higher versions of librosa.
3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) fixed UVR5 inf everywhere error caused by `is_half` parameter not converting to boolean, resulting in constant half precision inference, which caused `inf` on 16 series GPUs.
4. Optimized English text frontend.
5. Fixed Gradio dependencies.
6. Supported automatic reading of `.list` full paths if the root directory is left blank during dataset preparation.
7. Integrated Faster Whisper ASR for Japanese and English.
### 20240208 Update
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) attempted to fix GPT training hang on Windows 10 1909 and [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (Traditional Chinese System Language).
### 20240212 Update
1. Optimized logic for Faster Whisper and FunASR, switching Faster Whisper to mirror downloads to avoid issues with Hugging Face connections.
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) enabled experimental DPO Loss training option to mitigate GPT repetition and missing characters by constructing negative samples during training and made several inference parameters available in the inference WebUI.
### 20240214 Update
1. Supported Chinese experiment names in training (previously caused errors).
2. Made DPO training an optional feature instead of mandatory. If selected, the batch size is automatically halved. Fixed issues with new parameters not being passed in the inference WebUI.
### 20240216 Update
1. Supported input without reference text.
2. Fixed bugs in Chinese frontend reported in [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475).
### 20240221 Update
1. Added a noise reduction option during data processing (noise reduction leaves only 16kHz sampling rate; use only if the background noise is significant).
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) optimized Chinese and Japanese frontend processing.
3. Switched Mac CPU inference to use CPU instead of MPS for faster performance.
4. Fixed Colab public URL issue.
### 20240306 Update
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) accelerated inference by 50% (tested on RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39) .
2. No longer requires downloading the Chinese FunASR model first when using Faster Whisper non-Chinese ASR.
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) fixed UVR5 reverb removal model where the setting was reversed.
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) enabled automatic CPU inference for Faster Whisper if no CUDA is available.
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) modified `is_half` check to ensure proper CPU inference on Mac.
### 202403/202404/202405 Update
#### Minor Fixes:
1. Fixed issues with the no-reference text mode.
2. Optimized the Chinese and English text frontend.
3. Improved API format.
4. Fixed CMD format issues.
5. Added error prompts for unsupported languages during training data processing.
6. Fixed the bug in Hubert extraction.
#### Major Fixes:
1. Fixed the issue of SoVITS training without freezing VQ (which could cause quality degradation).
2. Added a quick inference branch.
### 20240610 Update
#### Minor Fixes:
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) improved the logic for pure punctuation and multi-punctuation text input.
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) fixed CMD format for MDXNet de-reverb in UVR5, supporting paths with spaces.
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) fixed progress bar logic for SoVITS training in `s2_train.py`.
#### Major Fixes:
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation.
**Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.**
- Type: Fix
- Contributor: RVC-Boss
- 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)
- Content: Fixed progress bar logic for SoVITS training in `s2_train.py`.
- Type: Fix
- Contributor: pengzhendong
- 2024.06.10 [Commit#501a74ae](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
- Content: Fixed string formatting when UVR5 MDXNet calls FFmpeg, ensuring compatibility with paths containing spaces.
- Type: Fix
- Contributor: RVC-Boss
- 2024.06.10 [PR#1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168), [PR#1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)
- Content: Improved the logic for pure punctuation and multi-punctuation text input.
- Type: Fix
- Contributor: XXXXRT666
- Related: [Issue#1165](https://github.com/RVC-Boss/GPT-SoVITS/issues/1165)
- 2024.06.13 [Commit#db506705](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
- Content: Fixed default batch size decimal issue in CPU inference.
- Type: Fix
- Contributor: RVC-Boss
- 2024.06.28 [PR#1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR#1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR#1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267)
- Content: Fixed issues where an exception during denoising or ASR would abort processing of all remaining pending audio files.
- Type: Fix
- Contributor: XXXXRT666
- 2024.06.29 [Commit#a208698e](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
- Content: Fixed multi-process save logic for multi-GPU training.
- Type: Fix
- Contributor: RVC-Boss
- 2024.06.29 [PR#1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251)
- Content: Removed redundant `my_utils.py`.
- Type: Optimization
- Contributor: aoguai
- Related: [Issue#1189](https://github.com/RVC-Boss/GPT-SoVITS/issues/1189)
## 202407
- 2024.07.06 [PR#1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253)
- Content: Fixed the issue of splitting decimals when splitting by punctuation.
- Type: Fix
- Contributor: aoguai
- 2024.07.06 [Commit#b0786f29](https://github.com/RVC-Boss/GPT-SoVITS/commit/b0786f2998f1b2fce6678434524b4e0e8cc716f5)
- Content: The accelerated inference code has been validated and merged into the main branch, ensuring consistent inference effects with the base. It also supports accelerated inference in no-reference text mode.
- Type: Optimization
- Contributor: RVC-Boss, GoHomeToMacDonal
- Related: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- Future updates will continue to verify the consistency of changes in the `fast_inference` branch.
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
- Content: Refactored i18n scanning and updated the multi-language configuration files.
- Type: Documentation
- Contributor: StaryLan
- 2024.07.13 [PR#1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299)
- Content: Fixed issues where trailing slashes in user file paths caused command line errors.
- Type: Fix
- Contributor: XXXXRT666
- 2024.07.19 [PR#756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756)
- Content: Fixed the inconsistency in training steps when using a custom `bucket_sampler` during GPT training.
- Type: Fix
- Contributor: huangxu1991
- 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340)
- Content: Added support for adjusting speech speed during synthesis, including an option to freeze randomness and control only the speed. The feature is also available in `api.py`.
- Type: Feature
- Contributor: RVC-Boss, 红血球AE3803
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356)
- Content: Added support for the BS-RoFormer vocal accompaniment separation model.
- Type: Feature
- Contributor: KamioRinn
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351)
- Content: Improved Chinese text frontend.
- Type: Feature
- Contributor: KamioRinn
## 202408 (V2 Version)
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355)
- Content: Automatically fill in the paths when processing files in the WebUI.
- Type: Chore
- Contributor: XXXXRT666
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
- Content: Enabled FP16 inference support for BS-Roformer.
- Type: Performance Optimization
- Contributor: RVC-Boss
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
- Content: Optimized GPU recognition logic, added user-friendly logic to handle arbitrary GPU indices entered by users.
- Type: Chore
- Contributor: RVC-Boss
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758)
- Content: **Added GPT-SoVITS V2 model.**
- Type: Feature
- Contributor: RVC-Boss
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
- Content: Added support for Cantonese ASR by using FunASR.
- Type: Feature
- Contributor: RVC-Boss
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388)
- Content: Optimized UI and timing logic.
- Type: Chore
- Contributor: XXXXRT666
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488)
- Content: Optimized polyphonic character handling logic (V2 Only).
- Type: Fix, Feature
- Contributor: KamioRinn, RVC-Boss
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422)
- Content: Fixed bug where only one reference audio could be uploaded; added dataset validation with warning popups for missing files.
- Type: Fix, Chore
- Contributor: XXXXRT666
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508)
- Content: Upstream LangSegment library now supports optimizing numbers, phone numbers, dates, and times using SSML tags.
- Type: Feature
- Contributor: juntaosun
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503)
- Content: Fixed and optimized API.
- Type: Fix
- Contributor: KamioRinn
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490)
- Content: Merged `fast_inference` branch into the main branch.
- Type: Refactor
- Contributor: ChasonJiang
- 2024.08.21 **Officially released GPT-SoVITS V2 version.**
## 202502 (V3 Version)
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
- Content: **Added GPT-SoVITS V3 model, which requires 14GB VRAM for fine-tuning.**
- Type: Feature (Refer to [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
- Contributor: RVC-Boss
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
- Content: Updated multilingual project documentation.
- Type: Documentation
- Contributor: StaryLan
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033)
- Content: Updated Japanese documentation.
- Type: Documentation
- Contributor: Fyphen
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010)
- Content: Optimized attention calculation logic.
- Type: Performance Optimization
- Contributor: wzy3650
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040)
- Content: Added gradient checkpointing support for fine-tuning, requiring 12GB VRAM.
- Type: Feature
- Contributor: Kakaru Hayate
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073)
- Content: Switched to a new language segmentation tool, improved multilingual mixed-text splitting strategy, and optimized number and English processing logic.
- Type: Feature
- Contributor: KamioRinn
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e)
- Content: **GPT-SoVITS V3 model now supports LoRA training, requiring 8GB GPU Memory for fine-tuning.**
- Type: Feature
- Contributor: RVC-Boss
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078)
- Content: Added Mel Band Roformer model support for vocal and instrument separation.
- Type: Feature
- Contributor: Sucial
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114)
- Content: Fixed MeCab error under Chinese paths (specifically for Japanese/Korean or multilingual text splitting).
- Type: Fix
- Contributor: KamioRinn
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc)
- Content: **Added 24kHz to 48kHz audio super-resolution models** to alleviate the "muffled" audio issue when generating 24K audio with V3 model.
- Type: Feature
- Contributor: RVC-Boss
- Related: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123)
- Content: Updated multilingual project documentation.
- Type: Documentation
- Contributor: StaryLan
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122)
- Content: Applied rule-based detection for short CJK characters when the model cannot identify them.
- Type: Fix
- Contributor: KamioRinn
- Related: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5)
- Content: Added speech rate parameter to control synthesis speed.
- Type: Fix
- Contributor: RVC-Boss
- 2025.02.28 **Officially released GPT-SoVITS V3**.
## 202503
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236)
- Content: Fixed issues caused by incorrect versions of dependencies.
- Type: Fix
- Contributor: XXXXRT666
- Related:
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
- Content: **Enabled parallel inference for SoVITS v3.**
- Type: Feature
- Contributor: ChasonJiang
- Fixed other minor bugs.
- Integrated package fixes for ONNX runtime GPU inference support:
- Type: Fix
- Details:
- ONNX models within G2PW switched from CPU to GPU inference, significantly reducing CPU bottleneck;
- foxjoy dereverberation model now supports GPU inference.
## 202504 (V4 Version)
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f)
- Content: Unlocked SoVITS v3 parallel inference; fixed asynchronous model loading logic.
- Type: Fix
- Contributor: RVC-Boss
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255)
- Content: Code formatting using Ruff; updated G2PW link.
- Type: Style
- Contributor: XXXXRT666
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290)
- Content: Cleaned up documentation; added Python 3.11 support; updated installers.
- Type: Chore
- Contributor: XXXXRT666
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300)
- Content: Updated Colab, installation files, and model downloads.
- Type: Chore
- Contributor: XXXXRT666
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf)
- Content: **Added GPT-SoVITS V4 model.**
- Type: Feature
- Contributor: RVC-Boss
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307)
- Content: Enabled parallel inference for V4.
- Type: Feature
- Contributor: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309)
- Content: Fixed model version parameter passing.
- Type: Fix
- Contributor: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310)
- Content: Fixed Numpy and Numba version mismatch issue; updated librosa version.
- Type: Fix
- Contributor: RVC-Boss, XXXXRT666
- Related: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
- **2025.04.22 Officially released GPT-SoVITS V4.**
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311)
- Content: Updated Gradio parameters.
- Type: Chore
- Contributor: XXXXRT666
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322)
- Content: Improved Colab/Kaggle notebook scripts.
- Type: Chore
- Contributor: XXXXRT666
## 202505
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351)
- Content: Improved Docker and Windows auto-build scripts; added pre-commit formatting.
- Type: Chore
- Contributor: XXXXRT666
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408)
- Content: Optimized multilingual text splitting and recognition logic.
- Type: Fix
- Contributor: KamioRinn
- Related: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377)
- Content: Implemented caching strategies to improve SoVITS V3/V4 inference speed by 10%.
- Type: Performance Optimization
- Contributor: Kakaru Hayate
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba)
- Content: Updated the annotation interface with a reminder: click Submit Text after completing each page, or changes will not be saved.
- Type: Fix
- Contributor: RVC-Boss
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc)
- Content: Fixed UVR5 and ONNX dereverberation model errors when FFmpeg encodes MP3/M4A files with spaces in original paths.
- Type: Fix
- Contributor: RVC-Boss
## 202506 (V2Pro Series)
- 2025.06.03 [PR#2420](https://github.com/RVC-Boss/GPT-SoVITS/pull/2420)
- Content: Updated multilingual project documentation.
- Type: Documentation
- Contributor: StaryLan
- 2025.06.04 [PR#2417](https://github.com/RVC-Boss/GPT-SoVITS/pull/2417)
- Content: Support exporting V4 with TorchScript.
- Type: Feature
- Contributor: L-jasmine
- 2025.06.04 [Commit#b7c0c5ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/b7c0c5ca878bcdd419fd86bf80dba431a6653356)~[Commit#298ebb03](https://github.com/RVC-Boss/GPT-SoVITS/commit/298ebb03c5a719388527ae6a586c7ea960344e70)
- Content: **Added the GPT-SoVITS V2Pro series models (V2Pro, V2ProPlus).**
- Type: Feature
- Contributor: RVC-Boss
- 2025.06.05 [PR#2426](https://github.com/RVC-Boss/GPT-SoVITS/pull/2426)
- Content: Fixed initialization error in `config/inference_webui`.
- Type: Fix
- Contributor: StaryLan
- 2025.06.05 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427), [Commit#7d70852a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7d70852a3f67c3b52e3a62857f8663d529efc8cd), [PR#2434](https://github.com/RVC-Boss/GPT-SoVITS/pull/2434)
- Content: Optimized automatic precision detection logic; added collapsible functionality to WebUI frontend modules.
- Type: Feature
- Contributor: XXXXRT666, RVC-Boss

View File

@ -1,580 +1,221 @@
# 更新履歴
## 202401
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
- 内容: WebUIに英語システム翻訳サポートを追加。
- タイプ: ドキュメンテーション
- 貢献者: D3lik
- 2024.01.21 [Commit#7b89c9ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b89c9ed5669f63c4ed6ae791408969640bdcf3e)
- 内容: SoVITSトレーニングのZeroDivisionError修正を試みた。
- タイプ: 修正
- 貢献者: RVC-Boss, Tybost
- 関連: [Issue#79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
- 2024.01.21 [Commit#ea62d6e0](https://github.com/RVC-Boss/GPT-SoVITS/commit/ea62d6e0cf1efd75287766ea2b55d1c3b69b4fd3)
- 内容: 合成音声に参照音声の終端が含まれる問題を大幅に軽減。
- タイプ: 最適化
- 貢献者: RVC-Boss
- 2024.01.21 [Commit#a87ad522](https://github.com/RVC-Boss/GPT-SoVITS/commit/a87ad5228ed2d729da42019ae1b93171f6a745ef)
- 内容: `cmd-asr.py`がデフォルトディレクトリにFunASRモデルが含まれているか確認し、ない場合はModelScopeからダウンロードするようになった。
- タイプ: 機能
- 貢献者: RVC-Boss
- 2024.01.21 [Commit#f6147116](https://github.com/RVC-Boss/GPT-SoVITS/commit/f61471166c107ba56ccb7a5137fa9d7c09b2830d)
- 内容: `Config.py`に`is_share`パラメータを追加、`True`に設定するとWebUIを公開ネットワークにマッピング可能。
- タイプ: 機能
- 貢献者: RVC-Boss
- 2024.01.21 [Commit#102d5081](https://github.com/RVC-Boss/GPT-SoVITS/commit/102d50819e5d24580d6e96085b636b25533ecc7f)
- 内容: `TEMP`フォルダ内のキャッシュ音声ファイルやその他ファイルをクリーンアップ。
- タイプ: 最適化
- 貢献者: RVC-Boss
- 2024.01.22 [Commit#872134c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/872134c846bcb8f1909a3f5aff68a6aa67643f68)
- 内容: 極端に短い出力ファイルで参照音声が繰り返される問題を修正。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.22 英語と日本語トレーニングのネイティブサポートをテスト(日本語トレーニングはルートディレクトリに非英語特殊文字がないことが必要)。
- 2024.01.22 [PR#124](https://github.com/RVC-Boss/GPT-SoVITS/pull/124)
- 内容: 音声パスチェックを改善。不正な入力パスから読み取ろうとすると、ffmpegエラーではなくパスが存在しないと報告するようになった。
- タイプ: 最適化
- 貢献者: xmimu
- 2024.01.23 [Commit#93c47cd9](https://github.com/RVC-Boss/GPT-SoVITS/commit/93c47cd9f0c53439536eada18879b4ec5a812ae1)
- 内容: Hubert抽出がNaNエラーを引き起こし、SoVITS/GPTトレーニングでZeroDivisionErrorが発生する問題を解決。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.23 [Commit#80fffb0a](https://github.com/RVC-Boss/GPT-SoVITS/commit/80fffb0ad46e4e7f27948d5a57c88cf342088d50)
- 内容: 中国語分詞ツール`jieba`を`jieba_fast`に置き換え。
- タイプ: 最適化
- 貢献者: RVC-Boss
- 2024.01.23 [Commit#63625758](https://github.com/RVC-Boss/GPT-SoVITS/commit/63625758a99e645f3218dd167924e01a0e3cf0dc)
- 内容: モデルファイルのソートロジックを最適化。
- タイプ: 最適化
- 貢献者: RVC-Boss
- 2024.01.23 [Commit#0c691191](https://github.com/RVC-Boss/GPT-SoVITS/commit/0c691191e894c15686e88279745712b3c6dc232f)
- 内容: 推論WebUIでクイックモデル切り替えをサポート追加。
- タイプ: 機能
- 貢献者: RVC-Boss
- 2024.01.25 [Commit#249561e5](https://github.com/RVC-Boss/GPT-SoVITS/commit/249561e5a18576010df6587c274d38cbd9e18b4b)
- 内容: 推論WebUIの冗長なログを削除。
- タイプ: 最適化
- 貢献者: RVC-Boss
- 2024.01.25 [PR#183](https://github.com/RVC-Boss/GPT-SoVITS/pull/183), [PR#200](https://github.com/RVC-Boss/GPT-SoVITS/pull/200)
- 内容: Macでのトレーニングと推論をサポート。
- タイプ: 機能
- 貢献者: Lion-Wu
- 2024.01.26 [Commit#813cf96e](https://github.com/RVC-Boss/GPT-SoVITS/commit/813cf96e508ba1bb2c658f38c7cc77b797fb4082), [Commit#2d1ddeca](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d1ddeca42db90c3fe2d0cd79480fd544d87f02b)
- 内容: UVR5の読み取り時にディレクトリが自動的に飛び出す問題を修正。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.26 [PR#204](https://github.com/RVC-Boss/GPT-SoVITS/pull/204)
- 内容: 中日混合および日英混合出力テキストをサポート追加。
- タイプ: 機能
- 貢献者: Kakaru Hayate
- 2024.01.26 [Commit#f4148cf7](https://github.com/RVC-Boss/GPT-SoVITS/commit/f4148cf77fb899c22bcdd4e773d2f24ab34a73e7)
- 内容: 出力のセグメンテーションモードをオプションで追加。
- タイプ: 機能
- 貢献者: RVC-Boss
- 2024.01.26 [Commit#9fe955c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/9fe955c1bf5f94546c9f699141281f2661c8a180)
- 内容: 複数改行による推論エラーを修正。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.26 [Commit#84ee4719](https://github.com/RVC-Boss/GPT-SoVITS/commit/84ee471936b332bc2ccee024d6dfdedab4f0dc7b)
- 内容: 半精度をサポートしないGPU向けに自動的に単精度を強制。CPU推論時も単精度を強制。
- タイプ: 最適化
- 貢献者: RVC-Boss
- 2024.01.28 [PR#238](https://github.com/RVC-Boss/GPT-SoVITS/pull/238)
- 内容: Dockerfile内のモデルダウンロードプロセスを完了。
- タイプ: 修正
- 貢献者: breakstring
- 2024.01.28 [PR#257](https://github.com/RVC-Boss/GPT-SoVITS/pull/257)
- 内容: 数字の発音が中国語文字に変換される問題を修正。
- タイプ: 修正
- 貢献者: duliangang
- 2024.01.28 [Commit#f0cfe397](https://github.com/RVC-Boss/GPT-SoVITS/commit/f0cfe397089a6fd507d678c71adeaab5e7ed0683)
- 内容: GPTトレーニングがチェックポイントを保存しない問題を修正。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.28 [Commit#b8ae5a27](https://github.com/RVC-Boss/GPT-SoVITS/commit/b8ae5a2761e2654fc0c905498009d3de9de745a8)
- 内容: 制限を設定して不合理な参照音声長を除外。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.28 [Commit#698e9655](https://github.com/RVC-Boss/GPT-SoVITS/commit/698e9655132d194b25b86fbbc99d53c8d2cea2a3)
- 内容: 文頭の数文字が飲み込まれる問題を修正。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.29 [Commit#ff977a5f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff977a5f5dc547e0ad82b9e0f1cd95fbc830b2b0)
- 内容: 16シリーズのような半精度トレーニングに問題があるGPU向けに、トレーニング設定を単精度に変更。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.29 [Commit#172e139f](https://github.com/RVC-Boss/GPT-SoVITS/commit/172e139f45ac26723bc2cf7fac0112f69d6b46ec)
- 内容: 利用可能なColabバージョンをテストして更新。
- タイプ: 機能
- 貢献者: RVC-Boss
- 2024.01.29 [PR#135](https://github.com/RVC-Boss/GPT-SoVITS/pull/135)
- 内容: FunASRをバージョン1.0に更新し、インターフェース不一致によるエラーを修正。
- タイプ: 修正
- 貢献者: LauraGPT
- 2024.01.30 [Commit#1c2fa98c](https://github.com/RVC-Boss/GPT-SoVITS/commit/1c2fa98ca8c325dcfb32797d22ff1c2a726d1cb4)
- 内容: 中日英句読点の分割問題を修正し、文の始めと終わりに句読点を追加。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.01.30 [Commit#74409f35](https://github.com/RVC-Boss/GPT-SoVITS/commit/74409f3570fa1c0ff28d4c65c288a6ce58ca00d2)
- 内容: 句読点による分割を追加。
- タイプ: 機能
- 貢献者: RVC-Boss
- 2024.01.30 [Commit#c42eeccf](https://github.com/RVC-Boss/GPT-SoVITS/commit/c42eeccfdd2d0a0d714ecc8bfc22a12373aca6b7)
- 内容: パスと関連する文字列を解析して、二重引用符を自動的に削除します.また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません.
- タイプ: 修正
- 貢献者: RVC-Boss
## 202402
- 2024.02.01 [Commit#45f73519](https://github.com/RVC-Boss/GPT-SoVITS/commit/45f73519cc41cd17cf816d8b997a9dcb0bee04b6)
- 内容: ASRパス末尾のスラッシュによるファイル名保存エラーの修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.02.03 [Commit#dba1a74c](https://github.com/RVC-Boss/GPT-SoVITS/commit/dba1a74ccb0cf19a1b4eb93faf11d4ec2b1fc5d7)
- 内容: UVR5のフォーマット読み取りエラーによる音声分離失敗の修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.02.03 [Commit#3ebff70b](https://github.com/RVC-Boss/GPT-SoVITS/commit/3ebff70b71580ee1f97b3238c9442cbc5aef47c7)
- 内容: 中日英混合テキストの自動言語判別・分割機能のサポート
- タイプ: 機能改善
- 貢献者: RVC-Boss
- 2024.02.03 [PR#377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)
- 内容: PaddleSpeechのテキスト正規化を導入(例: xx.xx%表記、「元/吨」を「元每吨」と正確に読む、アンダースコア問題の解消)
- タイプ: 機能改善
- 貢献者: KamioRinn
- 2024.02.05 [PR#395](https://github.com/RVC-Boss/GPT-SoVITS/pull/395)
- 内容: 英語テキスト前処理の最適化
- タイプ: 機能改善
- 貢献者: KamioRinn
- 2024.02.06 [Commit#65b463a7](https://github.com/RVC-Boss/GPT-SoVITS/commit/65b463a787f31637b4768cc9a47cab59541d3927)
- 内容: 言語パラメータの混乱による中国語推論精度低下の修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 関連: [Issue#391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)
- 2024.02.06 [PR#403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)
- 内容: UVR5の高バージョンLibrosaへの対応
- タイプ: 修正
- 貢献者: StaryLan
- 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
- 内容: UVR5の「inf everywhere」エラーの修正(ブール型変換不足による半精度推論問題、16シリーズGPUで発生)
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2)
- 内容: Gradio依存関係の修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.02.07 [PR#400](https://github.com/RVC-Boss/GPT-SoVITS/pull/400)
- 内容: Faster Whisperの統合による日本語・英語音声認識機能の追加
- タイプ: 新機能
- 貢献者: Shadow
- 2024.02.07 [Commit#6469048d](https://github.com/RVC-Boss/GPT-SoVITS/commit/6469048de12a8d6f0bd05d07f031309e61575a38)[Commit#94ee71d9](https://github.com/RVC-Boss/GPT-SoVITS/commit/94ee71d9d562d10c9a1b96e745c6a6575aa66a10)
- 内容: 3連ルートディレクトリ空欄時の自動.listファイルパス読み込み機能
- タイプ: 機能改善
- 貢献者: RVC-Boss
- 2024.02.08 [Commit#59f35ada](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)
- 内容: GPTトレーニングのフリーズ問題(Windows10 1909)と繁体字システム言語時のエラー修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 関連: [Issue#232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)
- 2024.02.12 [PR#457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)
- 内容: DPO損失実験的トレーニングオプションの追加(ネガティブサンプル構築によるGPTの繰り返し・文字抜け問題改善)、推論インターフェースの複数パラメータ公開
- タイプ: 新機能
- 貢献者: liufenghua
- 2024.02.12 [Commit#2fa74ecb](https://github.com/RVC-Boss/GPT-SoVITS/commit/2fa74ecb941db27d9015583a9be6962898d66730), [Commit#d82f6bbb](https://github.com/RVC-Boss/GPT-SoVITS/commit/d82f6bbb98ba725e6725dcee99b80ce71fb0bf28)
- 内容: 音声認識ロジックの最適化、Faster Whisperのミラーサイトダウンロード対応(HuggingFace接続問題回避)
- タイプ: 機能改善
- 貢献者: RVC-Boss
- 2024.02.15 [Commit#dd2c4d6d](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2c4d6d7121bf82d29d0f0e4d788f3b231997c8)
- 内容: 中国語実験名のトレーニングサポート
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.02.15 [Commit#ccb9b08b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ccb9b08be3c58e102defcc94ff4fd609da9e27ee)[Commit#895fde46](https://github.com/RVC-Boss/GPT-SoVITS/commit/895fde46e420040ed26aaf0c5b7e99359d9b199b)
- 内容: DPOトレーニングを必須からオプションに変更(選択時は自動的にバッチサイズ半減)、推論インターフェースの新パラメータ未伝達問題の修正
- タイプ: 機能改善
- 貢献者: RVC-Boss
- 2024.02.15 [Commit#7b0c3c67](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b0c3c676495c64b2064aa472bff14b5c06206a5)
- 内容: 中国語テキスト前処理エラーの修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.02.16 [PR#499](https://github.com/RVC-Boss/GPT-SoVITS/pull/499)
- 内容: 参照テキストなし入力のサポート
- タイプ: 新機能
- 貢献者: Watchtower-Liu
- 関連: [Issue#475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)
- 2024.02.17 [PR#509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509), [PR#507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR#532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR#556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR#559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559)
- 内容: 中国語・日本語前処理の改善
- タイプ: 機能改善
- 貢献者: KamioRinn, v3cun
- 2024.02.17 [PR#510](https://github.com/RVC-Boss/GPT-SoVITS/pull/511), [PR#511](https://github.com/RVC-Boss/GPT-SoVITS/pull/511)
- 内容: Colabの公開URL未開始問題の修正
- タイプ: 修正
- 貢献者: ChanningWang2018, RVC-Boss
- 2024.02.21 [PR#557](https://github.com/RVC-Boss/GPT-SoVITS/pull/557)
- 内容: MacOS推論デバイスをMPSからCPUに変更(CPU推論の方が高速)
- タイプ: 機能改善
- 貢献者: XXXXRT666
- 2024.02.21 [Commit#6da486c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/6da486c15d09e3d99fa42c5e560aaac56b6b4ce1), [Commit#5a171773](https://github.com/RVC-Boss/GPT-SoVITS/commit/5a17177342d2df1e11369f2f4f58d34a3feb1a35)
- 内容: データ前処理に音声ノイズ除去オプション追加(16Kサンプルレートにダウンサンプリング、高ノイズ時以外は非推奨)
- タイプ: 新機能
- 貢献者: RVC-Boss
- 2024.02.28 [PR#573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)
- 内容: is_half判定の修正によるMacOSの正常なCPU推論対応
- タイプ: 修正
- 貢献者: XXXXRT666
- 2024.02.28 [PR#610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)
- 内容: UVR5 MDXNetパラメータ順序エラーによる出力フォルダ逆転問題の修正
- タイプ: 修正
- 貢献者: Yuze Wang
## 202403
- 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)
- 内容: CUDAが利用できない場合、Faster Whisperの自動CPU推論を有効化
- タイプ: 機能改善
- 貢献者: ShiroDoMain
- 2024.03.06 [Commit#616be20d](https://github.com/RVC-Boss/GPT-SoVITS/commit/616be20db3cf94f1cd663782fea61b2370704193)
- 内容: Faster Whisper非中国語ASR使用時、中国語FunASRモデルの事前ダウンロードが不要に
- タイプ: 機能改善
- 貢献者: RVC-Boss
- 2024.03.09 [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 内容: 推論速度を50%向上RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39環境で検証
- タイプ: 機能改善
- 貢献者: GoHomeToMacDonal
- 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
- 内容: 高速推論ブランチ`fast_inference_`を追加
- タイプ: 新機能
- 貢献者: ChasonJiang
- 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
- 内容: CPUトレーニングをサポートmacOSでCPUを使用したトレーニングが可能に
- タイプ: 新機能
- 貢献者: Lion-Wu
- 2024.03.19 [PR#804](https://github.com/RVC-Boss/GPT-SoVITS/pull/804), [PR#812](https://github.com/RVC-Boss/GPT-SoVITS/pull/812), [PR#821](https://github.com/RVC-Boss/GPT-SoVITS/pull/821)
- 内容: 英語テキストフロントエンドの最適化
- タイプ: 機能改善
- 貢献者: KamioRinn
- 2024.03.30 [PR#894](https://github.com/RVC-Boss/GPT-SoVITS/pull/894)
- 内容: APIフォーマットの改善
- タイプ: 機能改善
- 貢献者: KamioRinn
## 202404
- 2024.04.03 [PR#917](https://github.com/RVC-Boss/GPT-SoVITS/pull/917)
- 内容: UVR5 WebUIにおけるFFmpegコマンド文字列フォーマットの修正
- タイプ: 修正
- 貢献者: StaryLan
## 202405
- 2024.05.02 [PR#953](https://github.com/RVC-Boss/GPT-SoVITS/pull/953)
- 内容: SoVITSトレーニング時のVQ凍結漏れ問題を修正品質劣化の原因となる
- タイプ: 修正
- 貢献者: hcwu1993
- 関連: [Issue#747](https://github.com/RVC-Boss/GPT-SoVITS/issues/747)
- 2024.05.19 [PR#1102](https://github.com/RVC-Boss/GPT-SoVITS/pull/1102)
- 内容: トレーニングデータ処理時、未対応言語に対するエラープロンプトを追加
- タイプ: 機能改善
- 貢献者: StaryLan
- 2024.05.27 [PR#1132](https://github.com/RVC-Boss/GPT-SoVITS/pull/1132)
- 内容: Hubert抽出におけるバグ修正
- タイプ: 修正
- 貢献者: XXXXRT666
## 202406
- 2024.06.06 [Commit#99f09c8b](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
- 内容: WebUIのGPTファインチューニング時に中国語入力テキストのBERT特徴量を読み取れない問題を修正推論時との不一致や品質劣化の原因となる
**注意: 既に大量データでファインチューニング済みの場合は、品質向上のためモデルの再チューニングを推奨**
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)
- 内容: `s2_train.py`におけるSoVITSトレーニングの進捗バー処理を修正
- タイプ: 修正
- 貢献者: pengzhendong
- 2024.06.10 [Commit#501a74ae](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
- 内容: UVR5 MDXNetがFFmpegを呼び出す際の文字列フォーマットを修正スペースを含むパスに対応
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.06.10 [PR#1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168), [PR#1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)
- 内容: 純粋な句読点および複数句読点テキスト入力の処理ロジックを改善
- タイプ: 修正
- 貢献者: XXXXRT666
- 関連: [Issue#1165](https://github.com/RVC-Boss/GPT-SoVITS/issues/1165)
- 2024.06.13 [Commit#db506705](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
- 内容: CPU推論におけるデフォルトバッチサイズの小数点問題を修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.06.28 [PR#1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR#1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR#1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267)
- 内容: ノイズ除去やASRで例外が発生した場合に保留中の全オーディオファイル処理が終了してしまう問題を修正
- タイプ: 修正
- 貢献者: XXXXRT666
- 2024.06.29 [Commit#a208698e](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
- 内容: マルチGPUトレーニング時のマルチプロセス保存ロジックを修正
- タイプ: 修正
- 貢献者: RVC-Boss
- 2024.06.29 [PR#1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251)
- 内容: 冗長な`my_utils.py`を削除
- タイプ: 最適化
- 貢献者: aoguai
- 関連: [Issue#1189](https://github.com/RVC-Boss/GPT-SoVITS/issues/1189)
## 202407
- 2024.07.06 [PR#1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253)
- 内容: 句読点分割時の小数点分割問題を修正
- タイプ: 修正
- 貢献者: aoguai
- 2024.07.06 [Commit#b0786f29](https://github.com/RVC-Boss/GPT-SoVITS/commit/b0786f2998f1b2fce6678434524b4e0e8cc716f5)
- 内容: 高速化推論コードが検証済みでmainブランチにマージされ、ベースと同等の推論効果を保証。テキスト未参照モードでも高速推論をサポート
- タイプ: 最適化
- 貢献者: RVC-Boss, GoHomeToMacDonal
- 関連: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 今後も`fast_inference`ブランチでの変更整合性を継続検証
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
- 内容: i18nスキャンのリファクタリングと多言語設定ファイルの更新
- タイプ: ドキュメンテーション
- 貢献者: StaryLan
- 2024.07.13 [PR#1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299)
- 内容: ユーザーファイルパスの末尾スラッシュがコマンドラインエラーを引き起こす問題を修正
- タイプ: 修正
- 貢献者: XXXXRT666
- 2024.07.19 [PR#756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756)
- 内容: GPTトレーニング時、カスタム`bucket_sampler`使用時のステップ数不一致を修正
- タイプ: 修正
- 貢献者: huangxu1991
- 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340)
- 内容: 合成時の話速調整をサポート(ランダム性を固定して速度のみ制御するオプション含む)。`api.py`に更新済み
- タイプ: 新機能
- 貢献者: RVC-Boss, 红血球AE3803
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356)
- 内容: BS-RoFormerボーカル・伴奏分離モデルのサポートを追加。
- タイプ: 新機能
- 貢献者: KamioRinn
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351)
- 内容: 中国語テキストフロントエンドの改善。
- タイプ: 新機能
- 貢献者: KamioRinn
## 202408 (V2 バージョン)
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355)
- 内容: WebUIでファイル処理時にパスを自動入力するように変更。
- タイプ: 雑務
- 貢献者: XXXXRT666
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
- 内容: BS-RoformerのFP16推論サポートを有効化。
- タイプ: パフォーマンス最適化
- 貢献者: RVC-Boss
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
- 内容: GPU認識ロジックを最適化、ユーザーが入力した任意のGPUインデックスを処理するユーザーフレンドリーなロジックを追加。
- タイプ: 雑務
- 貢献者: RVC-Boss
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758)
- 内容: **GPT-SoVITS V2モデルを追加。**
- タイプ: 新機能
- 貢献者: RVC-Boss
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
- 内容: FunASRを使用して広東語ASRをサポート。
- タイプ: 新機能
- 貢献者: RVC-Boss
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388)
- 内容: UIとタイミングロジックを最適化。
- タイプ: 雑務
- 貢献者: XXXXRT666
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488)
- 内容: 多音字処理ロジックを最適化V2のみ
- タイプ: 修正、新機能
- 貢献者: KamioRinn、RVC-Boss
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422)
- 内容: 参照音声が1つしかアップロードできないバグを修正。欠損ファイルがある場合に警告ポップアップを表示するデータセット検証を追加。
- タイプ: 修正、雑務
- 貢献者: XXXXRT666
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508)
- 内容: 上流のLangSegmentライブラリがSSMLタグを使用した数字、電話番号、日付、時刻の最適化をサポート。
- タイプ: 新機能
- 貢献者: juntaosun
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503)
- 内容: APIを修正・最適化。
- タイプ: 修正
- 貢献者: KamioRinn
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490)
- 内容: `fast_inference`ブランチをメインブランチにマージ。
- タイプ: リファクタリング
- 貢献者: ChasonJiang
- 2024.08.21 **GPT-SoVITS V2バージョンを正式リリース。**
## 202502 (V3 バージョン)
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
- 内容: **GPT-SoVITS V3モデルを追加。ファインチューニングには14GBのVRAMが必要。**
- タイプ: 新機能([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))参照)
- 貢献者: RVC-Boss
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
- 内容: 多言語プロジェクトドキュメントを更新。
- タイプ: ドキュメント
- 貢献者: StaryLan
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033)
- 内容: 日本語ドキュメントを更新。
- タイプ: ドキュメント
- 貢献者: Fyphen
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010)
- 内容: アテンション計算ロジックを最適化。
- タイプ: パフォーマンス最適化
- 貢献者: wzy3650
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040)
- 内容: ファインチューニング用に勾配チェックポイントサポートを追加。12GB VRAMが必要。
- タイプ: 新機能
- 貢献者: Kakaru Hayate
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073)
- 内容: 新しい言語セグメンテーションツールに切り替え、多言語混合テキストの分割戦略を改善。数字と英語の処理ロジックを最適化。
- タイプ: 新機能
- 貢献者: KamioRinn
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e)
- 内容: **GPT-SoVITS V3モデルがLoRAトレーニングをサポート。ファインチューニングに8GB GPUメモリが必要。**
- タイプ: 新機能
- 貢献者: RVC-Boss
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078)
- 内容: ボーカルと楽器分離のためのMel Band Roformerモデルサポートを追加。
- タイプ: 新機能
- 貢献者: Sucial
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114)
- 内容: 中国語パス下でのMeCabエラーを修正日本語/韓国語または多言語テキスト分割用)。
- タイプ: 修正
- 貢献者: KamioRinn
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc)
- 内容: **24kHzから48kHzへのオーディオ超解像モデルを追加**。V3モデルで24Kオーディオを生成する際の「こもった」オーディオ問題を緩和。
- タイプ: 新機能
- 貢献者: RVC-Boss
- 関連: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123)
- 内容: 多言語プロジェクトドキュメントを更新。
- タイプ: ドキュメント
- 貢献者: StaryLan
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122)
- 内容: モデルが識別できない短いCJK文字に対してルールベースの検出を適用。
- タイプ: 修正
- 貢献者: KamioRinn
- 関連: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5)
- 内容: 合成速度を制御するための発話速度パラメータを追加。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.02.28 **GPT-SoVITS V3を正式リリース**
## 202503
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236)
- 内容: 依存関係の不正なバージョンによる問題を修正。
- タイプ: 修正
- 貢献者: XXXXRT666
- 関連:
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
- 内容: **SoVITS v3の並列推論を有効化。**
- タイプ: 新機能
- 貢献者: ChasonJiang
- その他の軽微なバグを修正。
- ONNXランタイムGPU推論サポートのための統合パッケージ修正:
- タイプ: 修正
- 詳細:
- G2PW内のONNXモデルをCPUからGPU推論に切り替え、CPUボトルネックを大幅に削減;
- foxjoy dereverberationモデルがGPU推論をサポート。
## 202504 (V4 バージョン)
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f)
- 内容: SoVITS v3並列推論のロックを解除。非同期モデル読み込みロジックを修正。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255)
- 内容: Ruffを使用したコードフォーマット。G2PWリンクを更新。
- タイプ: スタイル
- 貢献者: XXXXRT666
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290)
- 内容: ドキュメントを整理。Python 3.11サポートを追加。インストーラーを更新。
- タイプ: 雑務
- 貢献者: XXXXRT666
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300)
- 内容: Colab、インストールファイル、モデルダウンロードを更新。
- タイプ: 雑務
- 貢献者: XXXXRT666
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf)
- 内容: **GPT-SoVITS V4モデルを追加。**
- タイプ: 新機能
- 貢献者: RVC-Boss
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307)
- 内容: V4の並列推論を有効化。
- タイプ: 新機能
- 貢献者: RVC-Boss、ChasonJiang
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309)
- 内容: モデルバージョンパラメータの受け渡しを修正。
- タイプ: 修正
- 貢献者: RVC-Boss、ChasonJiang
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310)
- 内容: NumpyとNumbaのバージョン不一致問題を修正。librosaバージョンを更新。
- タイプ: 修正
- 貢献者: RVC-Boss、XXXXRT666
- 関連: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
- **2025.04.22 GPT-SoVITS V4を正式リリース**
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311)
- 内容: Gradioパラメータを更新。
- タイプ: 雑務
- 貢献者: XXXXRT666
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322)
- 内容: Colab/Kaggleノートブックスクリプトを改善。
- タイプ: 雑務
- 貢献者: XXXXRT666
## 202505
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351)
- 内容: DockerとWindows自動ビルドスクリプトを改善。pre-commitフォーマットを追加。
- タイプ: 雑務
- 貢献者: XXXXRT666
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408)
- 内容: 多言語テキスト分割と認識ロジックを最適化。
- タイプ: 修正
- 貢献者: KamioRinn
- 関連: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377)
- 内容: キャッシュ戦略を実装し、SoVITS V3/V4推論速度を10%向上。
- タイプ: パフォーマンス最適化
- 貢献者: Kakaru Hayate
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba)
- 内容: アノテーションインターフェースを更新し、以下の注意事項を追加しました:各ページの編集が終わったら必ず「Submit Text」をクリックしてください。さもなくば変更は保存されません。
- タイプ: 修正
- 貢献者: RVC-Boss
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc)
- 内容: UVR5およびONNX dereverberationモデルのエラーを修正。FFmpegが元のパスにスペースを含むMP3/M4Aファイルをエンコードする場合の問題を解決。
- タイプ: 修正
- 貢献者: RVC-Boss
## 202506(V2Pro シリーズ)
- 2025.06.03 [PR#2420](https://github.com/RVC-Boss/GPT-SoVITS/pull/2420)
- 内容: プロジェクトの多言語ドキュメントを更新
- タイプ: ドキュメント
- 貢献者: StaryLan
- 2025.06.04 [PR#2417](https://github.com/RVC-Boss/GPT-SoVITS/pull/2417)
- 内容: TorchScript を使用した V4 モデルのエクスポート機能を追加
- タイプ: 新機能
- 貢献者: L-jasmine
- 2025.06.04 [Commit#b7c0c5ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/b7c0c5ca878bcdd419fd86bf80dba431a6653356)〜[Commit#298ebb03](https://github.com/RVC-Boss/GPT-SoVITS/commit/298ebb03c5a719388527ae6a586c7ea960344e70)
- 内容: GPT-SoVITS V2Pro シリーズモデル (V2Pro, V2ProPlus) を正式に導入
- タイプ: 新機能
- 貢献者: RVC-Boss
- 2025.06.05 [PR#2426](https://github.com/RVC-Boss/GPT-SoVITS/pull/2426) ### 202403/202404/202405 更新
- 内容: `config/inference_webui` の初期化時のエラーを修正
- タイプ: 不具合修正Bug Fix #### マイナー修正:
- 貢献者: StaryLan
- 2025.06.05 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427), [Commit#7d70852a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7d70852a3f67c3b52e3a62857f8663d529efc8cd), [PR#2434](https://github.com/RVC-Boss/GPT-SoVITS/pull/2434) 1. 参照テキストなしモードの問題を修正しました.
- 内容: 自動精度検出ロジックを最適化し、WebUI フロントエンドモジュールに折り畳みCollapsible機能を追加 2. 中国語と英語のテキストフロントエンドを最適化しました.
- タイプ: 新機能 3. API フォーマットを改善しました.
- 貢献者: XXXXRT666, RVC-Boss 4. CMD フォーマットの問題を修正しました.
5. トレーニングデータ処理中のサポートされていない言語に対するエラープロンプトを追加しました.
6. Hubert 抽出のバグを修正しました.
#### メジャー修正:
1. SoVITS トレーニングで VQ を凍結せずに品質低下を引き起こす問題を修正しました.
2. クイック推論ブランチを追加しました.
### 20240610 更新
#### マイナー修正:
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)で、純粋な句読点および複数の句読点を含むテキスト入力のロジックを改善しました.
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)で、UVR5 の MDXNet デリバブをサポートする CMD フォーマットを修正し、スペースを含むパスをサポートしました.
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)で、`s2_train.py` の SoVITS トレーニングのプログレスバーロジックを修正しました.
#### メジャー修正:
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) で、WebUI の GPT ファインチューニングが中国語入力テキストの BERT 特徴を読み取らず、推論との不一致や品質低下の可能性を修正しました.
**注意: 以前に大量のデータでファインチューニングを行った場合、品質向上のためにモデルを再調整することをお勧めします.**
### 20240706 更新
#### マイナー修正:
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) で、CPU 推論のデフォルトバッチサイズの小数点問題を修正しました.
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) で、イズ除去またはASRが例外に遭遇した場合に、すべての保留中のオーディオファイルが終了する問題を修正しました.
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) で、句読点で分割する際の小数点分割の問題を修正しました.
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) で、マルチGPUトレーニングのマルチプロセス保存ロジックを修正しました.
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) で、不要な `my_utils` を削除しました.
#### メジャー修正:
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) の加速推論コードが検証され、メインブランチにマージされ、ベースとの推論効果の一貫性が確保されました.
また、参照テキストなしモードでの加速推論もサポートしています.
**今後の更新では、`fast_inference`ブランチの変更の一貫性を継続的に検証します**.
### 20240727 更新
#### マイナー修正:
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) で、不要な i18n コードをクリーンアップしました.
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) で、ユーザーファイルパスの末尾のスラッシュがコマンドラインエラーを引き起こす問題を修正しました.
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) で、GPT トレーニングのステップ計算ロジックを修正しました.
#### メジャー修正:
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) で、合成のスピーチレート調整をサポートしました.
スピーチレートのみを調整しながらランダム性を固定できるようになりました.
### 20240806 更新
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306)、[PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer ボーカルアコムパニ分離モデルのサポートを追加しました.[Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 推論を有効にしました.
2. 中国語テキストフロントエンドを改善しました.
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 多音字のサポートを追加 (v2 のみ);
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 量詞を追加;
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 四則演算と基本数式のサポート;
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 混合テキストエラーを修正.
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUIでオーディオ処理時にパスを自動入力しました.
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 認識ロジックを最適化しました.
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました.
6. GPT-SoVITS v2 のサポートを追加しました.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました.
### 20240821 更新
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました.
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました.
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました.
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました.
### 20250211 更新
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました.SoVITS v3のファインチューニングには14GBのGPUメモリが必要です.
### 20250212 更新
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3のファインチューニングにグラデーションチェックポイントを追加、12GBのGPUメモリが必要です.
### 20250214 更新
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 多言語混合テキスト分割戦略の最適化 **A**.
- `split-lang`を言語分割ツールとして追加し、多言語混合テキストの分割能力を向上させました.
### 20250217 更新
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) テキスト内の数字と英語の処理ロジックを最適化.
### 20250218 更新
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 多言語混合テキスト分割戦略の最適化 **B**.
### 20250223 更新
1. LoRAトレーニングがSoVITS V3のファインチューニングに対応しました.8GBのGPUメモリが必要で、結果はフルパラメータファインチューニングより優れています.
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) ボーカルと楽器分離のためにMel Band RoFormerモデルを追加しました.
### 20250226 更新
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windowsでの非英語ディレクトリによる問題を修正しました.
- `langsegmenter`を使用して韓国語の問題を修正.
2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windowsでの非英語ディレクトリによる問題を修正しました.
- `langsegmenter`を使用して韓国語/日本語の問題を修正.
### 20250227 更新
- V3モデルで24Kオーディオを生成する際に発生するこもった音の問題を緩和するために、24Kから48Kのオーディオ超解像モデルを追加しました.[Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085)、[Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)で報告されています.
View File
@ -5,20 +5,12 @@
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
[![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org)
[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
[![Huggingface](https://img.shields.io/badge/免费在线体验-free_online_demo-yellow.svg?style=for-the-badge&logo=huggingface)](https://lj1995-gpt-sovits-proplus.hf.space/)
[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
@ -49,13 +41,12 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
### テスト済みの環境

| Python Version | PyTorch Version  | Device        |
| -------------- | ---------------- | ------------- |
| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
| Python 3.9     | PyTorch 2.2.2    | CPU           |
### Windows
@ -65,41 +56,31 @@ Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケ
### Linux
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
```
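一例として、CUDA 12.8 環境で HuggingFace からダウンロードし、UVR5 モデルも併せて取得する場合は次のようになります (オプションの値は一例であり、環境に合わせて置き換えてください):

```bash
# CUDA 12.8 + HuggingFace ソース + UVR5 モデルの一括取得の例
bash install.sh --device CU128 --source HF --download-uvr5
```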
### macOS
**注: Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します.**
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします.
2. 以下のコマンドを実行してこのプロジェクトをインストールします:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
```
### 手動インストール
#### 依存関係をインストールします
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
#### FFmpeg をインストールします
##### Conda ユーザー
```bash
conda activate GPTSoVits
conda install ffmpeg
```
@ -108,13 +89,12 @@ conda install ffmpeg
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
conda install -c conda-forge 'ffmpeg<7'
```
##### Windows ユーザー
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます.
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください
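ダウンロードの一例を以下に示します (huggingface.co 上のファイルは `blob` ではなく `resolve` パスを指定すると直接取得できます。コマンドは一例で、ブラウザで保存しても構いません):

```bash
# GPT-SoVITS のルートフォルダで実行する例
curl -L -o ffmpeg.exe  https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/ffmpeg.exe
curl -L -o ffprobe.exe https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/ffprobe.exe
```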
##### MacOS ユーザー
@ -122,54 +102,38 @@ sudo apt install libsox-dev
brew install ffmpeg
```
### GPT-SoVITS の実行 (Docker 使用)
#### Docker イメージの選択
コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください:
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) で最新のイメージタグを確認してください
- 環境に合った適切なイメージタグを選択してください
- `Lite` とは、Docker イメージに ASR モデルおよび UVR5 モデルが**含まれていない**ことを意味します. UVR5 モデルは手動でダウンロードし、ASR モデルは必要に応じてプログラムが自動的にダウンロードします
- Docker Compose 実行時に、対応するアーキテクチャ (amd64 または arm64) のイメージが自動的に取得されます
- Docker Compose は現在のディレクトリ内の**すべてのファイル**をマウントします. Docker イメージを使用する前に、プロジェクトのルートディレクトリに移動し、**コードを最新の状態に更新**してください
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
#### 環境変数
- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます
#### 共有メモリの設定
Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします
#### サービスの選択
`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています:
- `GPT-SoVITS-CU126` および `GPT-SoVITS-CU128`:すべての機能を含むフルバージョン
- `GPT-SoVITS-CU126-Lite` および `GPT-SoVITS-CU128-Lite`:依存関係を削減した軽量バージョン
特定のサービスを Docker Compose で実行するには、以下のコマンドを使用します:
```bash
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
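一例として、CUDA 12.8 向けの Lite サービスを起動する場合は次のとおりです (サービス名は `docker-compose.yaml` に定義されているものから選択してください):

```bash
# CU128 の Lite イメージでサービスを起動する例
docker compose run --service-ports GPT-SoVITS-CU128-Lite
```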
#### Docker イメージのローカルビルド
自分でイメージをビルドするには、以下のコマンドを使ってください:
```bash
bash docker_build.sh --cuda <12.6|12.8> [--lite]
```
#### 実行中のコンテナへアクセス (Bash Shell)
コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます:
```bash
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
```
### Docker の使用
#### docker-compose.yaml の設定
0. イメージのタグについて: コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(古いバージョン) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください.
1. 環境変数:
- `is_half`: 半精度/倍精度の制御."SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です.実際の状況に応じて True または False に調整してください.
2. ボリューム設定: コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます.デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています.
3. `shm_size`: Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります.状況に応じて適宜設定してください.
4. `deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください.
#### docker compose で実行する
```bash
docker compose -f "docker-compose.yaml" up -d
```
#### docker コマンドで実行する
上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します:
```bash
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
```
## 事前訓練済みモデル
@ -237,12 +201,12 @@ python webui.py v1 <言語(オプション)>
#### パス自動補完のサポート
1. 音声パスを入力する
2. 音声を小さなチャンクに分割する
3. ノイズ除去 (オプション)
4. ASR
5. ASR 転写を校正する
6. 次のタブに移動し、モデルを微調整する
### 推論 WebUI を開く
@ -284,7 +248,7 @@ V1 環境から V2 を使用するには:
2. 最新のコードを github からクローン
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置
中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
@ -304,42 +268,10 @@ v2 環境から v3 を使用する方法:
2. GitHub から最新のコードをクローンします.
3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します.
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください.
## V4 リリースノート
新機能:
1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力しますV3 はネイティブに 24kHz 音声のみ出力)**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1/V2/V3 環境から V4 への移行方法:
1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
2. GitHub から最新のコードをクローンします.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V4 の事前学習済みモデル (`gsv-v4-pretrained/s2v4.ckpt` および `gsv-v4-pretrained/vocoder.pth`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
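ダウンロード方法の一例として、`huggingface_hub` 付属の `huggingface-cli` を使う場合は次のようになります (リポジトリ内のパス構成のままローカルに配置される想定の一例です。実際のファイル配置は上記の説明に従って確認してください):

```bash
# V4 の事前学習済みモデル 2 ファイルを GPT_SoVITS/pretrained_models 配下へ取得する例
huggingface-cli download lj1995/GPT-SoVITS \
  gsv-v4-pretrained/s2v4.ckpt gsv-v4-pretrained/vocoder.pth \
  --local-dir GPT_SoVITS/pretrained_models
```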
## V2Pro リリースノート
新機能:
1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します. **
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.
V1/V2/V3/V4 環境から V2Pro への移行方法:
1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
2. GitHub から最新のコードをクローンします.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V2Pro の事前学習済みモデル (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, および `sv/pretrained_eres2netv2w24s4ep4.ckpt`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
## Todo リスト
- [x] **優先度 高:**
@ -364,7 +296,7 @@ V1/V2/V3/V4 環境から V2Pro への移行方法:
コマンド ラインを使用して UVR5 の WebUI を開きます
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
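一例を以下に示します (引数の値は仮のもので、実際のデバイスとポートに合わせて変更してください):

```bash
# CUDA GPU で半精度を使い、ポート 9873 で UVR5 WebUI を開く例
python tools/uvr5/webui.py "cuda" True 9873
```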
@ -375,7 +307,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
```bash
python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -387,7 +319,7 @@ python audio_slicer.py \
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
```bash
python tools/asr/funasr_asr.py -i <input> -o <output>
```
@ -395,7 +327,7 @@ ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く A
(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
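一例として、日本語音声を float16 精度で処理する場合は次のようになります (入出力パスと各値は仮の例です):

```bash
# スライス済み音声を入力し、ASR 結果を別ディレクトリへ出力する例
python ./tools/asr/fasterwhisper_asr.py -i output/slicer_opt -o output/asr_opt -l ja -p float16
```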
@ -422,7 +354,6 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
- [eresnetv2](https://modelscope.cn/models/iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common)
### 推論用テキストフロントエンド

View File
@ -1,580 +1,222 @@
# 변경 내역
## 202401
### 20240121 업데이트
1. `config`에 `is_share`를 추가했습니다. Colab과 같은 시나리오에서는 이 값을 `True`로 설정하여 WebUI를 공개 네트워크에 매핑할 수 있습니다.
2. WebUI에 영어 시스템 번역 지원을 추가했습니다.
3. `cmd-asr`이 FunASR 모델이 포함되어 있는지 자동으로 감지합니다; 기본 디렉토리에서 찾을 수 없으면 ModelScope에서 다운로드됩니다.
4. [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)에서 보고된 SoVITS 훈련의 ZeroDivisionError를 필터링 샘플 등으로 해결하려고 시도했습니다.
5. `TEMP` 폴더의 캐시된 오디오 파일 및 기타 파일을 정리했습니다.
6. 참조 오디오의 끝이 포함된 합성 오디오 문제를 크게 줄였습니다.
### 20240122 업데이트
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
- 내용: WebUI에 영어 시스템 번역 지원 추가.
- 유형: 문서화
- 기여자: D3lik
- 2024.01.21 [Commit#7b89c9ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b89c9ed5669f63c4ed6ae791408969640bdcf3e)
- 내용: SoVITS 학습 시 ZeroDivisionError 수정 시도.
- 유형: 수정
- 기여자: RVC-Boss, Tybost
- 관련: [Issue#79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
- 2024.01.21 [Commit#ea62d6e0](https://github.com/RVC-Boss/GPT-SoVITS/commit/ea62d6e0cf1efd75287766ea2b55d1c3b69b4fd3)
- 내용: 합성된 오디오가 참조 오디오의 끝부분을 포함하는 문제 크게 감소.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.01.21 [Commit#a87ad522](https://github.com/RVC-Boss/GPT-SoVITS/commit/a87ad5228ed2d729da42019ae1b93171f6a745ef)
- 내용: `cmd-asr.py`가 이제 FunASR 모델이 기본 디렉토리에 포함되어 있는지 확인하고, 없으면 ModelScope에서 다운로드.
- 유형: 기능
- 기여자: RVC-Boss
- 2024.01.21 [Commit#f6147116](https://github.com/RVC-Boss/GPT-SoVITS/commit/f61471166c107ba56ccb7a5137fa9d7c09b2830d)
- 내용: `Config.py``is_share` 매개변수 추가, `True`로 설정하면 WebUI를 공용 네트워크에 매핑.
- 유형: 기능
- 기여자: RVC-Boss
- 2024.01.21 [Commit#102d5081](https://github.com/RVC-Boss/GPT-SoVITS/commit/102d50819e5d24580d6e96085b636b25533ecc7f)
- 내용: `TEMP` 폴더에서 캐시된 오디오 파일 및 기타 파일 정리.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.01.22 [Commit#872134c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/872134c846bcb8f1909a3f5aff68a6aa67643f68)
- 내용: 지나치게 짧은 출력 파일로 인해 참조 오디오가 반복되는 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.22 영어 및 일본어 학습의 기본 지원 테스트 (일본어 학습은 루트 디렉토리에 비영어 특수 문자가 없어야 함).
- 2024.01.22 [PR#124](https://github.com/RVC-Boss/GPT-SoVITS/pull/124)
- 내용: 오디오 경로 확인 개선. 잘못된 입력 경로에서 읽으려고 하면 ffmpeg 오류 대신 경로가 존재하지 않는다고 보고.
- 유형: 최적화
- 기여자: xmimu
- 2024.01.23 [Commit#93c47cd9](https://github.com/RVC-Boss/GPT-SoVITS/commit/93c47cd9f0c53439536eada18879b4ec5a812ae1)
- 내용: Hubert 추출이 NaN 오류를 일으켜 SoVITS/GPT 학습 시 ZeroDivisionError가 발생하는 문제 해결.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.23 [Commit#80fffb0a](https://github.com/RVC-Boss/GPT-SoVITS/commit/80fffb0ad46e4e7f27948d5a57c88cf342088d50)
- 내용: 중국어 단어 분리를 위해 `jieba``jieba_fast`로 교체.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.01.23 [Commit#63625758](https://github.com/RVC-Boss/GPT-SoVITS/commit/63625758a99e645f3218dd167924e01a0e3cf0dc)
- 내용: 모델 파일 정렬 로직 최적화.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.01.23 [Commit#0c691191](https://github.com/RVC-Boss/GPT-SoVITS/commit/0c691191e894c15686e88279745712b3c6dc232f)
- 내용: 추론 WebUI에서 빠른 모델 전환 지원 추가.
- 유형: 기능
- 기여자: RVC-Boss
- 2024.01.25 [Commit#249561e5](https://github.com/RVC-Boss/GPT-SoVITS/commit/249561e5a18576010df6587c274d38cbd9e18b4b)
- 내용: 추론 WebUI에서 불필요한 로그 제거.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.01.25 [PR#183](https://github.com/RVC-Boss/GPT-SoVITS/pull/183), [PR#200](https://github.com/RVC-Boss/GPT-SoVITS/pull/200)
- 내용: Mac에서의 학습 및 추론 지원.
- 유형: 기능
- 기여자: Lion-Wu
- 2024.01.26 [Commit#813cf96e](https://github.com/RVC-Boss/GPT-SoVITS/commit/813cf96e508ba1bb2c658f38c7cc77b797fb4082), [Commit#2d1ddeca](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d1ddeca42db90c3fe2d0cd79480fd544d87f02b)
- 내용: UVR5가 디렉토리를 읽고 자동으로 빠져나가는 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.26 [PR#204](https://github.com/RVC-Boss/GPT-SoVITS/pull/204)
- 내용: 중국어-영어 혼합 및 일본어-영어 혼합 출력 텍스트 지원 추가.
- 유형: 기능
- 기여자: Kakaru Hayate
- 2024.01.26 [Commit#f4148cf7](https://github.com/RVC-Boss/GPT-SoVITS/commit/f4148cf77fb899c22bcdd4e773d2f24ab34a73e7)
- 내용: 출력을 위한 선택적 분할 모드 추가.
- 유형: 기능
- 기여자: RVC-Boss
- 2024.01.26 [Commit#9fe955c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/9fe955c1bf5f94546c9f699141281f2661c8a180)
- 내용: 여러 줄바꿈 문제로 인한 추론 오류 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.26 [Commit#84ee4719](https://github.com/RVC-Boss/GPT-SoVITS/commit/84ee471936b332bc2ccee024d6dfdedab4f0dc7b)
- 내용: 반 정밀도를 지원하지 않는 GPU의 경우 자동으로 단 정밀도 강제; CPU 추론 시 단 정밀도 강제.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.01.28 [PR#238](https://github.com/RVC-Boss/GPT-SoVITS/pull/238)
- 내용: Dockerfile에서 모델 다운로드 프로세스 완료.
- 유형: 수정
- 기여자: breakstring
- 2024.01.28 [PR#257](https://github.com/RVC-Boss/GPT-SoVITS/pull/257)
- 내용: 숫자의 발음이 한자로 변환되는 문제 수정.
- 유형: 수정
- 기여자: duliangang
- 2024.01.28 [Commit#f0cfe397](https://github.com/RVC-Boss/GPT-SoVITS/commit/f0cfe397089a6fd507d678c71adeaab5e7ed0683)
- 내용: GPT 학습 시 체크포인트가 저장되지 않는 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.28 [Commit#b8ae5a27](https://github.com/RVC-Boss/GPT-SoVITS/commit/b8ae5a2761e2654fc0c905498009d3de9de745a8)
- 내용: 제한을 설정하여 불합리한 참조 오디오 길이 제외.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.28 [Commit#698e9655](https://github.com/RVC-Boss/GPT-SoVITS/commit/698e9655132d194b25b86fbbc99d53c8d2cea2a3)
- 내용: 문장 시작 부분에서 몇 글자를 누락시키는 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.29 [Commit#ff977a5f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff977a5f5dc547e0ad82b9e0f1cd95fbc830b2b0)
- 내용: 16 시리즈와 같은 반 정밀도 학습에 문제가 있는 GPU를 위해 학습 구성을 단 정밀도로 변경.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.29 [Commit#172e139f](https://github.com/RVC-Boss/GPT-SoVITS/commit/172e139f45ac26723bc2cf7fac0112f69d6b46ec)
- 내용: 사용 가능한 Colab 버전 테스트 및 업데이트.
- 유형: 기능
- 기여자: RVC-Boss
- 2024.01.29 [PR#135](https://github.com/RVC-Boss/GPT-SoVITS/pull/135)
- 내용: FunASR을 버전 1.0으로 업데이트 및 인터페이스 불일치로 인한 오류 수정.
- 유형: 수정
- 기여자: LauraGPT
- 2024.01.30 [Commit#1c2fa98c](https://github.com/RVC-Boss/GPT-SoVITS/commit/1c2fa98ca8c325dcfb32797d22ff1c2a726d1cb4)
- 내용: 중국어 및 영어 구두점 분할 문제 수정 및 문장 시작과 끝에 구두점 추가.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.01.30 [Commit#74409f35](https://github.com/RVC-Boss/GPT-SoVITS/commit/74409f3570fa1c0ff28d4c65c288a6ce58ca00d2)
- 내용: 구두점으로 분할 지원 추가.
- 유형: 기능
- 기여자: RVC-Boss
- 2024.01.30 [Commit#c42eeccf](https://github.com/RVC-Boss/GPT-SoVITS/commit/c42eeccfdd2d0a0d714ecc8bfc22a12373aca6b7)
- 내용: 초보 사용자가 경로를 복사할 때 큰따옴표를 포함하여 오류가 발생하는 것을 방지하기 위해 모든 경로 관련 항목에서 큰따옴표 자동 제거.
- 유형: 수정
- 기여자: RVC-Boss
## 202402 1. 지나치게 짧은 출력 파일로 인해 참조 오디오가 반복되는 문제를 수정했습니다.
2. 영어 및 일본어 훈련의 네이티브 지원을 테스트했습니다 (일본어 훈련 시 루트 디렉토리에 비영어 특수 문자가 없어야 합니다).
3. 오디오 경로 확인을 개선했습니다. 잘못된 입력 경로에서 읽으려는 시도가 있을 경우, ffmpeg 오류 대신 경로가 존재하지 않는다고 보고합니다.
- 2024.02.01 [Commit#45f73519](https://github.com/RVC-Boss/GPT-SoVITS/commit/45f73519cc41cd17cf816d8b997a9dcb0bee04b6) ### 20240123 업데이트
- 내용: ASR 경로가 `/`로 끝날 때 파일명 저장 오류 발생 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.02.03 [Commit#dba1a74c](https://github.com/RVC-Boss/GPT-SoVITS/commit/dba1a74ccb0cf19a1b4eb93faf11d4ec2b1fc5d7)
- 내용: UVR5 형식 읽기 오류로 인한 분리 실패 문제 해결.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.02.03 [Commit#3ebff70b](https://github.com/RVC-Boss/GPT-SoVITS/commit/3ebff70b71580ee1f97b3238c9442cbc5aef47c7)
- 내용: 중국어-일본어-영어 혼합 텍스트 자동 분할 및 언어 인식 지원.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.02.03 [PR#377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)
- 내용: PaddleSpeech Normalizer 도입으로 "xx.xx%"(퍼센트 기호) 및 "元/吨"이 "元吨" 대신 "元每吨"으로 읽히는 문제, 언더스코어 오류 수정.
- 유형: 최적화
- 기여자: KamioRinn
- 2024.02.05 [PR#395](https://github.com/RVC-Boss/GPT-SoVITS/pull/395)
- 내용: 영어 텍스트 프론트엔드 최적화.
- 유형: 최적화
- 기여자: KamioRinn
- 2024.02.06 [Commit#65b463a7](https://github.com/RVC-Boss/GPT-SoVITS/commit/65b463a787f31637b4768cc9a47cab59541d3927)
- 내용: 언어 매개변수 혼동으로 인한 중국어 추론 품질 저하 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 관련: [Issue#391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)
- 2024.02.06 [PR#403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)
- 내용: UVR5을 librosa 상위 버전에 적응시킴.
- 유형: 수정
- 기여자: StaryLan
- 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
- 내용: `is_half` 매개변수가 불리언으로 변환되지 않아 발생한 UVR5 inf 오류 수정 (16 시리즈 GPU에서 `inf` 문제 발생).
- 유형: 수정
- 기여자: RVC-Boss
- 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2)
- 내용: Gradio 의존성 문제 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.02.07 [PR#400](https://github.com/RVC-Boss/GPT-SoVITS/pull/400)
- 내용: 일본어 및 영어용 Faster Whisper ASR 통합.
- 유형: 기능
- 기여자: Shadow
- 2024.02.07 [Commit#6469048d](https://github.com/RVC-Boss/GPT-SoVITS/commit/6469048de12a8d6f0bd05d07f031309e61575a38)~[Commit#94ee71d9](https://github.com/RVC-Boss/GPT-SoVITS/commit/94ee71d9d562d10c9a1b96e745c6a6575aa66a10)
- 내용: 데이터셋 준비 시 루트 디렉토리를 비워둘 경우 `.list` 전체 경로 자동 읽기 지원.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.02.08 [Commit#59f35ada](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)
- 내용: Windows 10 1909 및 번체 중국어 시스템 언어에서 GPT 훈련 중단 문제 해결 시도.
- 유형: 수정
- 기여자: RVC-Boss
- 관련: [Issue#232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)
- 2024.02.12 [PR#457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)
- 내용: DPO Loss 훈련 옵션 추가 (GPT 반복 및 문자 누락 완화), 추론 WebUI에 여러 매개변수 노출.
- 유형: 기능
- 기여자: liufenghua
- 2024.02.12 [Commit#2fa74ecb](https://github.com/RVC-Boss/GPT-SoVITS/commit/2fa74ecb941db27d9015583a9be6962898d66730), [Commit#d82f6bbb](https://github.com/RVC-Boss/GPT-SoVITS/commit/d82f6bbb98ba725e6725dcee99b80ce71fb0bf28)
- 내용: Faster Whisper 및 FunASR 로직 최적화, Hugging Face 연결 문제 회피를 위해 미러 다운로드로 전환.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.02.15 [Commit#dd2c4d6d](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2c4d6d7121bf82d29d0f0e4d788f3b231997c8)
- 내용: 훈련 시 중국어 실험 이름 지원 (이전 버전에서는 오류 발생).
- 유형: 수정
- 기여자: RVC-Boss
- 2024.02.15 [Commit#ccb9b08b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ccb9b08be3c58e102defcc94ff4fd609da9e27ee)~[Commit#895fde46](https://github.com/RVC-Boss/GPT-SoVITS/commit/895fde46e420040ed26aaf0c5b7e99359d9b199b)
- 내용: DPO 훈련을 필수에서 선택 사항으로 변경. 선택 시 배치 크기 자동 절반 감소. 추론 WebUI에서 새 매개변수 전달 문제 수정.
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.02.15 [Commit#7b0c3c67](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b0c3c676495c64b2064aa472bff14b5c06206a5)
- 내용: 중국어 프론트엔드 버그 수정.
- 유형: 수정
- 기여자: RVC-Boss
- 2024.02.16 [PR#499](https://github.com/RVC-Boss/GPT-SoVITS/pull/499)
- 내용: 참조 텍스트 없이 입력 지원.
- 유형: 기능
- 기여자: Watchtower-Liu
- 관련: [Issue#475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)
- 2024.02.17 [PR#509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509), [PR#507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR#532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR#556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR#559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559)
- 내용: 중국어 및 일본어 프론트엔드 처리 최적화.
- 유형: 최적화
- 기여자: KamioRinn, v3cun
- 2024.02.17 [PR#510](https://github.com/RVC-Boss/GPT-SoVITS/pull/511), [PR#511](https://github.com/RVC-Boss/GPT-SoVITS/pull/511)
- 내용: Colab 공개 URL 문제 수정.
- 유형: 수정
- 기여자: ChanningWang2018, RVC-Boss
- 2024.02.21 [PR#557](https://github.com/RVC-Boss/GPT-SoVITS/pull/557)
- 내용: Mac CPU 추론 시 MPS 대신 CPU 사용으로 성능 향상.
- 유형: 최적화
- 기여자: XXXXRT666
- 2024.02.21 [Commit#6da486c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/6da486c15d09e3d99fa42c5e560aaac56b6b4ce1), [Commit#5a171773](https://github.com/RVC-Boss/GPT-SoVITS/commit/5a17177342d2df1e11369f2f4f58d34a3feb1a35)
- 내용: 데이터 처리 시 노이즈 감소 옵션 추가 (16kHz 샘플링 레이트만 유지, 배경 노이즈가 심한 경우에만 사용 권장).
- 유형: 기능
- 기여자: RVC-Boss
- 2024.02.28 [PR#573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)
- 내용: Mac에서 CPU 추론이 정상적으로 작동하도록 `is_half` 확인 수정.
- 유형: 수정
- 기여자: XXXXRT666
- 2024.02.28 [PR#610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)
- 내용: UVR5 리버브 제거 모델 설정이 반대로 되어 있던 문제 수정.
- 유형: 수정
- 기여자: Yuze Wang
## 202403 1. Hubert 추출로 인해 NaN 오류가 발생하여 SoVITS/GPT 훈련에서 ZeroDivisionError가 발생하는 문제를 해결했습니다.
2. 추론 WebUI에서 빠른 모델 전환 지원을 추가했습니다.
3. 모델 파일 정렬 로직을 최적화했습니다.
4. 중국어 단어 분할을 위해 `jieba``jieba_fast`로 교체했습니다.
- 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) ### 20240126 업데이트
- 내용: CUDA가 없는 경우 Faster Whisper의 자동 CPU 추론 기능 활성화
- 유형: 최적화
- 기여자: ShiroDoMain
- 2024.03.06 [Commit#616be20d](https://github.com/RVC-Boss/GPT-SoVITS/commit/616be20db3cf94f1cd663782fea61b2370704193)
- 내용: Faster Whisper 비중국어 ASR 사용 시 중국어 FunASR 모델을 먼저 다운로드할 필요 없음
- 유형: 최적화
- 기여자: RVC-Boss
- 2024.03.09 [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 내용: 추론 속도 50% 향상 (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 환경 테스트)
- 유형: 최적화
- 기여자: GoHomeToMacDonal
- 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
- 내용: 빠른 추론 브랜치 'fast_inference_' 추가
- 유형: 기능
- 기여자: ChasonJiang
- 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
- 내용: CPU 훈련 지원 추가, macOS에서 CPU를 사용한 훈련 가능
- 유형: 기능
- 기여자: Lion-Wu
- 2024.03.19 [PR#804](https://github.com/RVC-Boss/GPT-SoVITS/pull/804), [PR#812](https://github.com/RVC-Boss/GPT-SoVITS/pull/812), [PR#821](https://github.com/RVC-Boss/GPT-SoVITS/pull/821)
- 내용: 영어 텍스트 프론트엔드 최적화
- 유형: 최적화
- 기여자: KamioRinn
- 2024.03.30 [PR#894](https://github.com/RVC-Boss/GPT-SoVITS/pull/894)
- 내용: API 형식 개선
- 유형: 최적화
- 기여자: KamioRinn
## 202404 1. 중국어-영어 혼합 및 일본어-영어 혼합 출력 텍스트를 지원합니다.
2. 출력에 대한 선택적 분할 모드를 추가했습니다.
3. UVR5 읽기 문제 및 디렉토리 자동 탈출 문제를 수정했습니다.
4. 추론 오류를 일으키는 여러 줄 바꿈 문제를 수정했습니다.
5. 추론 WebUI 에서 중복 로그를 제거했습니다.
6. Mac에서 훈련 및 추론을 지원합니다.
7. 절반 정밀도를 지원하지 않는 GPU에 대해 자동으로 단정밀도를 강제하며, CPU 추론 시 단정밀도를 적용합니다.
- 2024.04.03 [PR#917](https://github.com/RVC-Boss/GPT-SoVITS/pull/917) ### 20240128 업데이트
- 내용: UVR5 WebUI에서 FFmpeg 명령어 문자열 형식 수정
- 유형: 수정
- 기여자: StaryLan
## 202405 1. 숫자의 발음이 중국어 문자로 변환되는 문제를 수정했습니다.
2. 문장 시작 부분에서 몇 개의 문자가 누락되는 문제를 수정했습니다.
3. 비합리적인 참조 오디오 길이를 설정하여 제외했습니다.
4. GPT 훈련 시 체크포인트가 저장되지 않는 문제를 수정했습니다.
5. Dockerfile 에서 모델 다운로드 프로세스를 완료했습니다.
- 2024.05.02 [PR#953](https://github.com/RVC-Boss/GPT-SoVITS/pull/953) ### 20240129 업데이트
- 내용: SoVITS 훈련 시 VQ를 고정하지 않아 발생하는 품질 저하 문제 해결
- 유형: 수정
- 기여자: hcwu1993
- 관련: [Issue#747](https://github.com/RVC-Boss/GPT-SoVITS/issues/747)
- 2024.05.19 [PR#1102](https://github.com/RVC-Boss/GPT-SoVITS/pull/1102)
- 내용: 훈련 데이터 처리 시 지원되지 않는 언어에 대한 오류 메시지 추가
- 유형: 최적화
- 기여자: StaryLan
- 2024.05.27 [PR#1132](https://github.com/RVC-Boss/GPT-SoVITS/pull/1132)
- 내용: Hubert 추출 버그 수정
- 유형: 수정
- 기여자: XXXXRT666
## 202406 1. 절반 정밀도 훈련에 문제가 있는 16 시리즈와 같은 GPU의 훈련 구성을 단정밀도로 변경했습니다.
2. 사용 가능한 Colab 버전을 테스트하고 업데이트했습니다.
3. 이전 버전의 FunASR 로 인해 인터페이스 정렬 오류가 발생하는 ModelScope FunASR 저장소의 git 클로닝 문제를 수정했습니다.
- 2024.06.06 [Commit#99f09c8b](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) ### 20240130 업데이트
- 내용: WebUI의 GPT 미세조정 시 중국어 입력 텍스트의 BERT 특징을 읽지 않아 추론과 불일치 및 품질 저하가 발생하는 문제 수정
**주의: 이전에 대량의 데이터로 미세조정을 한 경우 품질 향상을 위해 모델 재조정 권장**
- 유형: 수정
- 기여자: RVC-Boss
- 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)
- 내용: `s2_train.py`에서 SoVITS 훈련 진행률 표시 로직 수정
- 유형: 수정
- 기여자: pengzhendong
- 2024.06.10 [Commit#501a74ae](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
- 내용: UVR5 MDXNet이 FFmpeg 호출 시 공백 포함 경로와의 호환성을 보장하도록 문자열 형식 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2024.06.10 [PR#1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168), [PR#1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)
- 내용: 순수 구두점 및 다중 구두점 텍스트 입력 처리 로직 개선
- 유형: 수정
- 기여자: XXXXRT666
- 관련: [Issue#1165](https://github.com/RVC-Boss/GPT-SoVITS/issues/1165)
- 2024.06.13 [Commit#db506705](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
- 내용: CPU 추론 시 기본 배치 크기 소수점 문제 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2024.06.28 [PR#1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR#1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR#1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267)
- 내용: 잡음 제거 또는 ASR 처리 중 예외 발생 시 대기 중인 모든 오디오 파일이 종료되는 문제 수정
- 유형: 수정
- 기여자: XXXXRT666
- 2024.06.29 [Commit#a208698e](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
- 내용: 다중 GPU 훈련 시 다중 프로세스 저장 로직 수정
- 유형: 수정
- 기여자: RVC-Boss
- 2024.06.29 [PR#1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251)
- 내용: 중복된 `my_utils.py` 제거
- 유형: 최적화
- 기여자: aoguai
- 관련: [Issue#1189](https://github.com/RVC-Boss/GPT-SoVITS/issues/1189)
## 202407 1. 모든 경로 관련 항목에서 이중 따옴표를 자동으로 제거하여 초보자가 이중 따옴표가 포함된 경로를 복사하는 오류를 방지했습니다.
2. 중국어 및 영어 문장 부호 분할 문제를 수정하고 문장 시작과 끝에 부호를 추가했습니다.
3. 부호에 의한 분할을 추가했습니다.
- 2024.07.06 [PR#1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) ### 20240201 업데이트
- 내용: 구두점 분할 시 소수점이 분할되는 문제 수정
- 유형: 수정
- 기여자: aoguai
- 2024.07.06 [Commit#b0786f29](https://github.com/RVC-Boss/GPT-SoVITS/commit/b0786f2998f1b2fce6678434524b4e0e8cc716f5)
- 내용: 가속 추론 코드 검증 완료 및 메인 브랜치 병합. 기본 버전과 동일한 추론 효과 보장하며 참조 텍스트 없음 모드에서도 가속 추론 지원
- 유형: 최적화
- 기여자: RVC-Boss, GoHomeToMacDonal
- 관련: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 향후 업데이트에서는 `fast_inference` 브랜치의 변경 사항 일관성 검증을 지속할 예정입니다.
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) 1. 분리 실패를 일으킨 UVR5 형식 읽기 오류를 수정했습니다.
- 내용: i18n 스캐닝 리팩토링 및 다국어 구성 파일 업데이트 2. 혼합된 중국어-일본어-영어 텍스트에 대한 자동 분할 및 언어 인식을 지원합니다.
- 유형: 문서화
- 기여자: StaryLan
- 2024.07.13 [PR#1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299)
- 내용: 사용자 파일 경로의 끝 슬래시로 인한 명령줄 오류 문제 수정
- 유형: 수정
- 기여자: XXXXRT666
- 2024.07.19 [PR#756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756)
- 내용: GPT 훈련 시 사용자 정의 `bucket_sampler` 사용 시 훈련 단계 불일치 문제 수정
- 유형: 수정
- 기여자: huangxu1991
- 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340)
- 내용: 합성 중 음성 속도 조절 기능 추가(무작위성 고정 및 속도만 제어 옵션 포함). 이 기능은 `api.py`에 업데이트됨
- 유형: 기능
- 기여자: RVC-Boss, 红血球AE3803
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356)
- 내용: BS-RoFormer 보컬 분리 모델 지원 추가
- 유형: 기능
- 기여자: KamioRinn
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351)
- 내용: 중국어 텍스트 프론트엔드 개선
- 유형: 기능
- 기여자: KamioRinn
## 202408 (V2 버전) ### 20240202 업데이트
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) 1. `/` 로 끝나는 ASR 경로가 파일 이름 저장 시 오류를 발생시키는 문제를 수정했습니다.
- 내용: WebUI에서 파일 처리 시 경로 자동 입력 기능 추가. 2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) 에서는 PaddleSpeech 의 Normalizer 를 도입하여 "xx.xx%" (백분율 기호)와 "元/吨"이 "元吨"으로 읽히는 문제를 "元每吨"으로 수정하고, 밑줄 오류를 수정했습니다.
- 유형: 정리 작업
- 기여자: XXXXRT666
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
- 내용: BS-Roformer FP16 추론 지원 활성화.
- 유형: 성능 최적화
- 기여자: RVC-Boss
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
- 내용: GPU 인식 로직 최적화, 사용자 입력 GPU 인덱스 처리 로직 추가.
- 유형: 정리 작업
- 기여자: RVC-Boss
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758)
- 내용: **GPT-SoVITS V2 모델 추가.**
- 유형: 신규 기능
- 기여자: RVC-Boss
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
- 내용: FunASR을 이용한 광둥어 ASR 지원 추가.
- 유형: 신규 기능
- 기여자: RVC-Boss
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388)
- 내용: UI 및 타이밍 로직 최적화.
- 유형: 정리 작업
- 기여자: XXXXRT666
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488)
- 내용: 다중 발음 문자 처리 로직 최적화 (V2 전용).
- 유형: 수정, 신규 기능
- 기여자: KamioRinn, RVC-Boss
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422)
- 내용: 참조 오디오 1개만 업로드 가능한 버그 수정; 누락 파일 경고 팝업 추가.
- 유형: 수정, 정리 작업
- 기여자: XXXXRT666
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508)
- 내용: 상위 LangSegment 라이브러리에서 SSML 태그로 숫자, 전화번호, 날짜, 시간 최적화 지원.
- 유형: 신규 기능
- 기여자: juntaosun
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503)
- 내용: API 수정 및 최적화.
- 유형: 수정
- 기여자: KamioRinn
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490)
- 내용: `fast_inference` 브랜치를 메인 브랜치로 병합.
- 유형: 리팩토링
- 기여자: ChasonJiang
- 2024.08.21 **GPT-SoVITS V2 버전 정식 출시.**
## 202502 (V3 버전) ### 20240207 업데이트
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) 1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) 에서 보고된 중국어 추론 품질 저하를 일으킨 언어 매개변수 혼동을 수정했습니다.
- 내용: **GPT-SoVITS V3 모델 추가, 파인튜닝 시 14GB VRAM 필요.** 2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) 에서는 UVR5 를 높은 버전의 librosa에 맞게 조정했습니다.
- 유형: 신규 기능 ([위키 참조](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))) 3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)에서는 `is_half` 매개변수가 불리언으로 변환되지 않아 발생한 UVR5 `inf` 오류를 수정했습니다. 이로 인해 16 시리즈 GPU에서 `inf` 가 발생했습니다.
- 기여자: RVC-Boss 4. 영어 텍스트 프론트엔드를 최적화했습니다.
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) 5. Gradio 종속성 문제를 수정했습니다.
- 내용: 다국어 프로젝트 문서 업데이트. 6. 데이터셋 준비 시 루트 디렉토리를 비워두면 `.list` 전체 경로를 자동으로 읽도록 지원합니다.
- 유형: 문서화 7. 일본어와 영어에 대한 Faster Whisper ASR을 통합했습니다.
- 기여자: StaryLan
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033)
- 내용: 일본어 문서 업데이트.
- 유형: 문서화
- 기여자: Fyphen
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010)
- 내용: 어텐션 계산 로직 최적화.
- 유형: 성능 최적화
- 기여자: wzy3650
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040)
- 내용: 파인튜닝 시 그래디언트 체크포인팅 지원 추가, 12GB VRAM 필요.
- 유형: 신규 기능
- 기여자: Kakaru Hayate
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073)
- 내용: 새로운 언어 분할 도구 전환, 다국어 혼합 텍스트 분할 전략 개선, 숫자 및 영어 처리 로직 최적화.
- 유형: 신규 기능
- 기여자: KamioRinn
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e)
- 내용: **GPT-SoVITS V3 모델 LoRA 학습 지원 추가, 파인튜닝 시 8GB GPU 메모리 필요.**
- 유형: 신규 기능
- 기여자: RVC-Boss
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078)
- 내용: 보컬 및 악기 분리를 위한 Mel Band Roformer 모델 지원 추가.
- 유형: 신규 기능
- 기여자: Sucial
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114)
- 내용: 중국어 경로에서 MeCab 오류 수정 (일본어/한국어 또는 다국어 텍스트 분할 전용).
- 유형: 수정
- 기여자: KamioRinn
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc)
- 내용: **24kHz에서 48kHz 오디오 초해상도 모델 추가** (V3 모델로 24K 오디오 생성 시 "뭉개지는" 현상 완화).
- 유형: 신규 기능
- 기여자: RVC-Boss
- 관련: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123)
- 내용: 다국어 프로젝트 문서 업데이트.
- 유형: 문서화
- 기여자: StaryLan
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122)
- 내용: 모델이 인식하지 못하는 짧은 CJK 문자에 대해 규칙 기반 검출 적용.
- 유형: 수정
- 기여자: KamioRinn
- 관련: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5)
- 내용: 음성 속도 제어 매개변수 추가.
- 유형: 수정
- 기여자: RVC-Boss
- 2025.02.28 **GPT-SoVITS V3 정식 출시**.
## 202503 ### 20240208 업데이트
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236) 1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)에서는 Windows 10 1909와 [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (전통 중국어 시스템 언어)에서 GPT 훈련 멈춤 문제를 수정하려고 했습니다.
- 내용: 의존성 버전 오류로 인한 문제 수정.
- 유형: 수정
- 기여자: XXXXRT666
- 관련:
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
- 내용: **SoVITS v3 병렬 추론 지원 활성화.**
- 유형: 신규 기능
- 기여자: ChasonJiang
- 기타 사소한 버그 수정. ### 20240212 업데이트
- ONNX 런타임 GPU 추론 지원을 위한 패키지 통합 수정: 1. Faster Whisper와 FunASR의 로직을 최적화하고, Faster Whisper를 미러 다운로드로 전환하여 Hugging Face 연결 문제를 피했습니다.
- 유형: 수정 2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)은 DPO Loss 실험적 훈련 옵션을 활성화하여 GPT의 반복 및 문자 누락 문제를 완화하고, 훈련 중 부정 샘플을 구성하며 여러 추론 매개변수를 추론 WebUI에서 사용할 수 있게 했습니다.
- 상세:
- G2PW 내 ONNX 모델이 CPU에서 GPU 추론으로 전환, CPU 병목 현상 크게 감소;
- foxjoy dereverberation 모델이 GPU 추론 지원.
## 202504 (V4 버전) ### 20240214 업데이트
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f) 1. 훈련 시 중국어 실험 이름을 지원합니다 (이전에는 오류가 발생했습니다).
- 내용: SoVITS v3 병렬 추론 잠금 해제; 비동기 모델 로딩 로직 수정. 2. DPO 훈련을 필수 기능 대신 선택적 기능으로 변경했습니다. 선택 시, 배치 크기가 자동으로 절반으로 줄어듭니다. 추론 WebUI에서 새로운 매개변수가 전달되지 않는 문제를 수정했습니다.
- 유형: 수정
- 기여자: RVC-Boss
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255)
- 내용: Ruff를 이용한 코드 포맷팅; G2PW 링크 업데이트.
- 유형: 스타일
- 기여자: XXXXRT666
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290)
- 내용: 문서 정리; Python 3.11 지원 추가; 설치 프로그램 업데이트.
- 유형: 정리 작업
- 기여자: XXXXRT666
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300)
- 내용: Colab, 설치 파일 및 모델 다운로드 업데이트.
- 유형: 정리 작업
- 기여자: XXXXRT666
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf)
- 내용: **GPT-SoVITS V4 모델 추가.**
- 유형: 신규 기능
- 기여자: RVC-Boss
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307)
- 내용: V4 병렬 추론 지원 활성화.
- 유형: 신규 기능
- 기여자: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309)
- 내용: 모델 버전 매개변수 전달 오류 수정.
- 유형: 수정
- 기여자: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310)
- 내용: Numpy와 Numba 버전 불일치 문제 수정; librosa 버전 업데이트.
- 유형: 수정
- 기여자: RVC-Boss, XXXXRT666
- 관련: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
- **2025.04.22 GPT-SoVITS V4 정식 출시**.
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311)
- 내용: Gradio 매개변수 업데이트.
- 유형: 정리 작업
- 기여자: XXXXRT666
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322)
- 내용: Colab/Kaggle 노트북 스크립트 개선.
- 유형: 정리 작업
- 기여자: XXXXRT666
## 202505 ### 20240216 업데이트
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351) 1. 참조 텍스트 없이 입력을 지원합니다.
- 내용: Docker 및 Windows 자동 빌드 스크립트 개선; pre-commit 포맷팅 추가. 2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)에서 보고된 중국어 프론트엔드의 버그를 수정했습니다.
- 유형: 정리 작업
- 기여자: XXXXRT666
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408)
- 내용: 다국어 텍스트 분할 및 인식 로직 최적화.
- 유형: 수정
- 기여자: KamioRinn
- 관련: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377)
- 내용: 캐싱 전략 구현으로 SoVITS V3/V4 추론 속도 10% 향상.
- 유형: 성능 최적화
- 기여자: Kakaru Hayate
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba)
- 내용: 어노테이션 인터페이스를 업데이트하여 안내 문구를 추가했습니다: 각 페이지 편집 후 반드시 'Submit Text'를 클릭해 주세요. 그렇지 않으면 변경 사항이 저장되지 않습니다.
- 유형: 수정
- 기여자: RVC-Boss
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc)
- 내용: UVR5 및 ONNX dereverberation 모델에서 FFmpeg이 공백 포함 원본 경로로 MP3/M4A 파일 인코딩 시 오류 수정.
- 유형: 수정
- 기여자: RVC-Boss
## 202506 (V2Pro 시리즈)
- 2025.06.03 [PR#2420](https://github.com/RVC-Boss/GPT-SoVITS/pull/2420)
- 내용: 다국어 프로젝트 문서 업데이트
- 유형: 문서화
- 기여자: StaryLan
- 2025.06.04 [PR#2417](https://github.com/RVC-Boss/GPT-SoVITS/pull/2417)
- 내용: TorchScript를 이용한 V4 내보내기 기능 지원 추가
- 유형: 기능 추가
- 기여자: L-jasmine
- 2025.06.04 [Commit#b7c0c5ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/b7c0c5ca878bcdd419fd86bf80dba431a6653356)~[Commit#298ebb03](https://github.com/RVC-Boss/GPT-SoVITS/commit/298ebb03c5a719388527ae6a586c7ea960344e70)
- 내용: **GPT-SoVITS V2Pro 시리즈 모델 추가 (V2Pro, V2ProPlus)**
- 유형: 기능 추가
- 기여자: RVC-Boss
- 2025.06.05 [PR#2426](https://github.com/RVC-Boss/GPT-SoVITS/pull/2426)
- 내용: `config/inference_webui` 초기화 오류 수정
- 유형: 버그 수정
- 기여자: StaryLan
- 2025.06.05 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427), [Commit#7d70852a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7d70852a3f67c3b52e3a62857f8663d529efc8cd), [PR#2434](https://github.com/RVC-Boss/GPT-SoVITS/pull/2434)
- 내용: 자동 정밀도 감지 로직 최적화; WebUI 프론트엔드 모듈에 접기 기능 추가
- 유형: 신규 기능
- 기여자: XXXXRT666, RVC-Boss
### 20240221 업데이트
1. 데이터 처리 중 노이즈 감소 옵션을 추가했습니다 (노이즈 감소는 16kHz 샘플링 비율만 남깁니다; 배경 노이즈가 심한 경우에만 사용하십시오).
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) 중국어 및 일본어 프론트엔드 처리를 최적화했습니다.
3. Mac CPU 추론을 MPS 대신 CPU를 사용하도록 전환하여 성능을 향상시켰습니다.
4. Colab 공개 URL 문제를 수정했습니다.
### 20240306 업데이트
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)는 추론 속도를 50% 가속화했습니다 (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39에서 테스트됨).
2. Faster Whisper의 비중국어 ASR을 사용할 때 중국어 FunASR 모델을 먼저 다운로드할 필요가 없습니다.
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)은 UVR5 리버브 제거 모델에서 설정이 반대로 되어 있는 문제를 수정했습니다.
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)는 CUDA가 없는 경우 Faster Whisper의 자동 CPU 추론을 가능하게 했습니다.
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)은 Mac에서 올바른 CPU 추론을 보장하기 위해 `is_half` 체크를 수정했습니다.
### 202403/202404/202405 업데이트
#### 사소한 수정:
1. 참조 텍스트 없는 모드의 문제를 수정했습니다.
2. 중국어 및 영어 텍스트 프론트엔드를 최적화했습니다.
3. API 형식을 개선했습니다.
4. CMD 형식 문제를 수정했습니다.
5. 훈련 데이터 처리 중 지원되지 않는 언어에 대한 오류 프롬프트를 추가했습니다.
6. Hubert 추출의 버그를 수정했습니다.
#### 주요 수정:
1. VQ를 고정하지 않고 SoVITS 훈련의 문제를 수정했습니다(품질 저하를 일으킬 수 있음).
2. 빠른 추론 분기를 추가했습니다.
### 20240610 업데이트
#### 사소한 수정:
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) 순수 구두점 및 다중 구두점 텍스트 입력 로직을 개선했습니다.
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5에서 MDXNet 디러버브를 위한 CMD 형식을 수정하고 공백이 있는 경로를 지원했습니다.
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py`에서 SoVITS 훈련을 위한 진행률 표시줄 로직을 수정했습니다.
#### 주요 수정:
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI의 GPT 미세 조정이 중국어 입력 텍스트의 BERT 기능을 읽지 않아 추론과 불일치 및 잠재적 품질 저하를 일으키는 문제를 수정했습니다.
**주의: 이전에 많은 양의 데이터로 미세 조정한 경우 품질을 향상시키기 위해 모델을 다시 조정하는 것이 좋습니다.**
### 20240706 업데이트
#### 사소한 수정:
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU 추론에서 기본 배치 크기 소수점 문제를 수정했습니다.
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) 노이즈 제거 또는 ASR이 예외를 만나면 모든 보류 중인 오디오 파일이 종료되는 문제를 수정했습니다.
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) 구두점으로 분할할 때 소수점 분할 문제를 수정했습니다.
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) 다중 GPU 훈련을 위한 다중 프로세스 저장 로직을 수정했습니다.
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) 불필요한 `my_utils`를 제거했습니다.
#### 주요 수정:
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)의 가속 추론 코드가 검증되어 메인 브랜치에 병합되었으며, 기본과 일관된 추론 효과를 보장합니다.
또한 참조 텍스트 없는 모드에서 가속 추론을 지원합니다.
**향후 업데이트에서는 `fast_inference` 브랜치의 변경 사항의 일관성을 계속 검증할 것입니다**.
### 20240727 업데이트
#### 사소한 수정:
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) 불필요한 i18n 코드를 정리했습니다.
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) 사용자 파일 경로의 후행 슬래시가 명령줄 오류를 일으키는 문제를 수정했습니다.
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT 훈련의 단계 계산 로직을 수정했습니다.
#### 주요 수정:
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) 합성을 위한 음성 속도 조절을 지원했습니다.
음성 속도만 조절하면서 무작위성을 고정할 수 있습니다.
### 20240806 업데이트
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer 보컬 반주 분리 모델에 대한 지원을 추가했습니다. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 추론을 활성화했습니다.
2. 중국어 텍스트 프론트엔드를 개선했습니다.
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 중국어 다의자 지원 (v2 전용);
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 추가된 양자;
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 사칙연산 및 기본 수학 공식을 지원합니다;
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 혼합 텍스트 오류를 수정했습니다.
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUI 에서 오디오를 처리할 때 경로를 자동으로 채웠습니다.
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 인식 로직을 최적화했습니다.
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 광동어 ASR 지원을 추가했습니다.
6. GPT-SoVITS v2 지원을 추가했습니다.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) 타이밍 로직을 최적화했습니다.
### 20240821 업데이트
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` 브랜치를 메인 브랜치에 병합.
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML 태그를 사용하여 숫자, 전화번호, 날짜 및 시간 최적화 지원.
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API 수정 및 최적화.
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 믹싱을 위한 참조 오디오를 하나만 업로드할 수 있는 버그 수정, 다양한 데이터셋 검사 추가 및 파일이 누락된 경우 경고 팝업.
### 20250211 업데이트
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 모델 추가, SoVITS v3의 파인튜닝에는 14GB GPU 메모리가 필요합니다.
### 20250212 업데이트
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3의 파인튜닝에 그라디언트 체크포인트 추가, 12GB GPU 메모리가 필요합니다.
### 20250214 업데이트
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 다국어 혼합 텍스트 분할 전략 **A** 최적화.
- `split-lang`을 언어 분할 도구로 추가하여 다국어 혼합 텍스트의 분할 능력을 향상시켰습니다.
### 20250217 업데이트
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) 텍스트 내 숫자와 영어 처리 로직 최적화.
### 20250218 업데이트
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 다국어 혼합 텍스트 분할 전략 **B** 최적화.
### 20250223 업데이트
1. SoVITS V3의 파인튜닝에 LoRA 훈련이 지원됩니다. 8GB GPU 메모리가 필요하며, 전체 매개변수 파인튜닝보다 더 나은 결과를 제공합니다.
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) 보컬 및 악기 분리를 위해 Mel Band RoFormer 모델 추가.
### 20250226 업데이트
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows에서 비영어 디렉토리로 인한 문제 수정.
- 한국어에 대한 `langsegmenter` 사용 문제 수정.
2. [PR 2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows에서 비영어 디렉토리로 인한 문제 수정.
- 한국어/일본어에 대한 `langsegmenter` 사용 문제 수정.
### 20250227 업데이트
- V3 모델로 24K 오디오를 생성할 때 발생하는 음성 뭉침 문제를 완화하기 위해, 24K에서 48K로의 오디오 초해상도 모델을 추가했습니다. [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)에서 보고된 문제입니다.

View File

@ -5,20 +5,12 @@
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
[![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org)
[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
[![Huggingface](https://img.shields.io/badge/免费在线体验-free_online_demo-yellow.svg?style=for-the-badge&logo=huggingface)](https://lj1995-gpt-sovits-proplus.hf.space/)
[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md) [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
@ -49,63 +41,46 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
### 테스트 통과 환경 ### 테스트 통과 환경
| Python Version | PyTorch Version | Device | | Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- | |----------------|------------------|-----------------|
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | | Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | | Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | | Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon | | Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon | | Python 3.11 | PyTorch 2.6.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU | | Python 3.9 | PyTorch 2.2.2 | CPU |
### Windows ### Windows
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다. Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
```
### Linux ### Linux
```bash ```bash
conda create -n GPTSoVits python=3.10 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5] bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
``` ```
### macOS ### macOS
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.** **주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
다음 명령어를 실행하여 이 프로젝트를 설치하세요 1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요.
2. 다음 명령어를 실행하여 이 프로젝트를 설치하세요.
```bash ```bash
conda create -n GPTSoVits python=3.10 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5] bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
``` ```
### 수동 설치 ### 수동 설치
#### 의존성 설치
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
#### FFmpeg 설치 #### FFmpeg 설치
##### Conda 사용자 ##### Conda 사용자
```bash ```bash
conda activate GPTSoVits
conda install ffmpeg conda install ffmpeg
``` ```
@ -114,13 +89,14 @@ conda install ffmpeg
```bash ```bash
sudo apt install ffmpeg sudo apt install ffmpeg
sudo apt install libsox-dev sudo apt install libsox-dev
conda install -c conda-forge 'ffmpeg<7'
``` ```
##### Windows 사용자 ##### Windows 사용자
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다.
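A minimal sketch of one way to do this from a terminal (assumption: the `resolve/main` endpoints of the linked repository serve the same files as the pages above; downloading them manually works just as well):

```bash
# Sketch (assumption): fetch both binaries into the GPT-SoVITS root via the HF resolve endpoints.
curl -L -o ffmpeg.exe https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/ffmpeg.exe
curl -L -o ffprobe.exe https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/ffprobe.exe
```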
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
##### MacOS 사용자 ##### MacOS 사용자
@ -128,54 +104,41 @@ sudo apt install libsox-dev
brew install ffmpeg brew install ffmpeg
``` ```
### GPT-SoVITS 실행하기 (Docker 사용) #### 의존성 설치
#### Docker 이미지 선택
코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요:
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits)에서 최신 이미지 태그를 확인하세요
- 환경에 맞는 적절한 이미지 태그를 선택하세요
- `Lite` 는 Docker 이미지에 ASR 모델과 UVR5 모델이 **포함되어 있지 않음**을 의미합니다. UVR5 모델은 사용자가 직접 다운로드해야 하며, ASR 모델은 필요 시 프로그램이 자동으로 다운로드합니다
- Docker Compose 실행 시, 해당 아키텍처에 맞는 이미지(amd64 또는 arm64)가 자동으로 다운로드됩니다
- Docker Compose는 현재 디렉터리의 **모든 파일**을 마운트합니다. Docker 이미지를 사용하기 전에 프로젝트 루트 디렉터리로 이동하여 코드를 **최신 상태로 업데이트**하세요
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
#### 환경 변수
- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다
#### 공유 메모리 설정
Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다
#### 서비스 선택
`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다:
- `GPT-SoVITS-CU126``GPT-SoVITS-CU128`: 전체 기능을 포함한 풀 버전
- `GPT-SoVITS-CU126-Lite``GPT-SoVITS-CU128-Lite`: 의존성이 줄어든 경량 버전
특정 서비스를 Docker Compose로 실행하려면 다음 명령을 사용하세요:
```bash ```bash
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
``` ```
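If `is_half` needs to be overridden for a single run, `docker compose run` also accepts `-e`; a minimal sketch, assuming your GPU supports fp16 and using one of the service names listed above:

```bash
# Sketch: override the is_half environment variable for one run only (any of the four services works).
docker compose run --service-ports -e is_half=true GPT-SoVITS-CU126-Lite
```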
#### Docker 이미지 직접 빌드하기 ### Docker에서 사용
직접 이미지를 빌드하려면 다음 명령어를 사용하세요: #### docker-compose.yaml 설정
```bash 0. 이미지 태그: 코드 저장소가 빠르게 업데이트되고 패키지가 느리게 빌드되고 테스트되므로, 현재 빌드된 최신 도커 이미지를 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(오래된 버전) 에서 확인하고 필요에 따라 Dockerfile을 사용하여 로컬에서 빌드할 수 있습니다.
bash docker_build.sh --cuda <12.6|12.8> [--lite]
1. 환경 변수:
- is_half: 반정밀/배정밀 제어. "SSL 추출" 단계에서 4-cnhubert/5-wav32k 디렉토리의 내용을 올바르게 생성할 수 없는 경우, 일반적으로 이것 때문입니다. 실제 상황에 따라 True 또는 False로 조정할 수 있습니다.
2. 볼륨 설정, 컨테이너 내의 애플리케이션 루트 디렉토리를 /workspace로 설정합니다. 기본 docker-compose.yaml에는 실제 예제가 나열되어 있으므로 업로드/다운로드를 쉽게 할 수 있습니다.
3. shm_size: Windows의 Docker Desktop의 기본 사용 가능한 메모리가 너무 작아 오류가 발생할 수 있으므로 실제 상황에 따라 조정합니다.
4. deploy 섹션의 gpu 관련 내용은 시스템 및 실제 상황에 따라 조정합니다.
#### docker compose로 실행
```
docker compose -f "docker-compose.yaml" up -d
``` ```
#### 실행 중인 컨테이너 접속하기 (Bash Shell) #### docker 명령으로 실행
컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다: 위와 동일하게 실제 상황에 맞게 매개변수를 수정한 다음 다음 명령을 실행합니다:
```bash ```
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
``` ```
## 사전 학습된 모델 ## 사전 학습된 모델
@ -243,12 +206,12 @@ python webui.py v1 <언어(옵션)>
#### 경로 자동 채우기가 지원됩니다 #### 경로 자동 채우기가 지원됩니다
1. 오디오 경로를 입력하십시오. 1. 오디오 경로를 입력하십시오.
2. 오디오를 작은 청크로 분할하십시오. 2. 오디오를 작은 청크로 분할하십시오.
3. 노이즈 제거(옵션) 3. 노이즈 제거(옵션)
4. ASR 수행 4. ASR 수행
5. ASR 전사를 교정하십시오. 5. ASR 전사를 교정하십시오.
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오. 6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
### 추론 WebUI 열기 ### 추론 WebUI 열기
@ -290,7 +253,7 @@ V1 환경에서 V2를 사용하려면:
2. github에서 최신 코드를 클론하십시오. 2. github에서 최신 코드를 클론하십시오.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오. 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) 중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
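A minimal command-line sketch of the same steps (assumes `curl` and `unzip` are available; `<extracted_dir>` is a placeholder for whatever folder name the archive unpacks to):

```bash
# Sketch: download, unpack, then rename/move the folder to GPT_SoVITS/text/G2PWModel as described above.
curl -L -o G2PWModel.zip "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
unzip G2PWModel.zip
mv <extracted_dir> GPT_SoVITS/text/G2PWModel
```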
@ -310,42 +273,10 @@ v2 환경에서 v3 사용하기:
2. 최신 코드를 github 에서 클론합니다. 2. 최신 코드를 github 에서 클론합니다.
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다. 3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요. 추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
## V4 릴리스 노트
신규 기능:
1. **V4는 V3에서 발생하는 비정수 배율 업샘플링으로 인한 금속성 잡음 문제를 수정했으며, 소리가 먹먹해지는 것을 방지하기 위해 기본적으로 48kHz 오디오를 출력합니다 (V3는 기본적으로 24kHz만 지원)**. 개발자는 V4를 V3의 직접적인 대체 버전으로 보고 있지만 추가 테스트가 필요합니다.
[자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1/V2/V3 환경에서 V4로 전환 방법:
1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
2. GitHub에서 최신 코드를 클론하세요.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V4 사전 학습 모델(`gsv-v4-pretrained/s2v4.ckpt``gsv-v4-pretrained/vocoder.pth`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
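One possible way to fetch the two files listed above from the command line (a sketch assuming `huggingface_hub`'s CLI is installed; downloading them from the web page gives the same result):

```bash
# Sketch: download the V4 pretrained files into GPT_SoVITS/pretrained_models (keeps the gsv-v4-pretrained/ subfolder).
pip install -U "huggingface_hub[cli]"
huggingface-cli download lj1995/GPT-SoVITS \
  gsv-v4-pretrained/s2v4.ckpt gsv-v4-pretrained/vocoder.pth \
  --local-dir GPT_SoVITS/pretrained_models
```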
## V2Pro 릴리스 노트
신규 기능:
1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
[자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.
V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
2. GitHub에서 최신 코드를 클론하세요.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V2Pro 사전 학습 모델(`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 및 `sv/pretrained_eres2netv2w24s4ep4.ckpt`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
## 할 일 목록 ## 할 일 목록
- [x] **최우선순위:** - [x] **최우선순위:**
@ -371,7 +302,7 @@ V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
명령줄을 사용하여 UVR5용 WebUI 열기 명령줄을 사용하여 UVR5용 WebUI 열기
```bash ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
@ -382,7 +313,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다. 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
```bash ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \ --input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \ --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -394,7 +325,7 @@ python audio_slicer.py \
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당). 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
```bash ```
python tools/asr/funasr_asr.py -i <input> -o <output> python tools/asr/funasr_asr.py -i <input> -o <output>
``` ```
@ -402,7 +333,7 @@ ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음) (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
```bash ```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision> python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
``` ```
@ -429,7 +360,6 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
- [eresnetv2](https://modelscope.cn/models/iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common)
### 추론용 텍스트 프론트엔드 ### 추론용 텍스트 프론트엔드

View File

@ -1,580 +1,222 @@
# Güncelleme Günlüğü ### 20240121 Güncellemesi
## 202401 1. `config`e `is_share` eklendi. Colab gibi senaryolarda, WebUI'yi halka açık ağa yönlendirmek için bu değeri `True` olarak ayarlayabilirsiniz.
2. WebUI'ye İngilizce sistem çeviri desteği eklendi.
3. `cmd-asr`, FunASR modelinin dahil olup olmadığını otomatik olarak tespit eder; eğer varsayılan dizinde bulunamazsa, ModelScope'dan indirilecektir.
4. [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)de bildirilen SoVITS eğitimindeki ZeroDivisionError'u sıfır uzunlukta örnekleri filtreleyerek düzeltmeye çalıştık.
5. `TEMP` klasöründeki önbelleğe alınmış ses dosyaları ve diğer dosyaları temizledik.
6. Referans sesinin sonunu içeren sentezlenmiş ses sorununu önemli ölçüde azalttık.
## 202401 ### 20240122 Güncellemesi
- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108) 1. Aşırı kısa çıktı dosyalarının referans sesini tekrarlamasına neden olan sorun giderildi.
- İçerik: WebUI'ya İngilizce sistem çeviri desteği eklendi. 2. İngilizce ve Japonca eğitim için yerel destek test edildi (Japonca eğitim için kök dizinin İngilizce olmayan özel karakterlerden arındırılmış olması gerekir).
- Tür: Dokümantasyon 3. Ses yolu denetimi iyileştirildi. Yanlış bir giriş yolundan okumaya çalışıldığında, ffmpeg hatası yerine yolun mevcut olmadığını bildirir.
- Katkıda Bulunan: D3lik
- 2024.01.21 [Commit#7b89c9ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b89c9ed5669f63c4ed6ae791408969640bdcf3e)
- İçerik: SoVITS eğitiminde ZeroDivisionError düzeltme girişimi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss, Tybost
- İlgili: [Issue#79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
- 2024.01.21 [Commit#ea62d6e0](https://github.com/RVC-Boss/GPT-SoVITS/commit/ea62d6e0cf1efd75287766ea2b55d1c3b69b4fd3)
- İçerik: Sentezlenen sesin referans sesin sonunu içerme sorunu önemli ölçüde azaltıldı.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.01.21 [Commit#a87ad522](https://github.com/RVC-Boss/GPT-SoVITS/commit/a87ad5228ed2d729da42019ae1b93171f6a745ef)
- İçerik: `cmd-asr.py` artık FunASR modelinin varsayılan dizinde olup olmadığını kontrol ediyor ve değilse ModelScope'tan indiriyor.
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.01.21 [Commit#f6147116](https://github.com/RVC-Boss/GPT-SoVITS/commit/f61471166c107ba56ccb7a5137fa9d7c09b2830d)
- İçerik: `Config.py`'a `is_share` parametresi eklendi, `True` olarak ayarlanırsa WebUI genel ağa eşlenir.
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.01.21 [Commit#102d5081](https://github.com/RVC-Boss/GPT-SoVITS/commit/102d50819e5d24580d6e96085b636b25533ecc7f)
- İçerik: `TEMP` klasöründeki önbelleğe alınmış ses dosyaları ve diğer dosyalar temizlendi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.01.22 [Commit#872134c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/872134c846bcb8f1909a3f5aff68a6aa67643f68)
- İçerik: Aşırı kısa çıktı dosyalarının referans sesi tekrarlaması sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.22 İngilizce ve Japonca eğitim için yerel destek test edildi (Japonca eğitim için kök dizinde İngilizce olmayan özel karakterler olmamalı).
- 2024.01.22 [PR#124](https://github.com/RVC-Boss/GPT-SoVITS/pull/124)
- İçerik: Ses yolu kontrolü iyileştirildi. Yanlış giriş yolundan okuma girişiminde ffmpeg hatası yerine yolun mevcut olmadığı bildiriliyor.
- Tür: Optimizasyon
- Katkıda Bulunan: xmimu
- 2024.01.23 [Commit#93c47cd9](https://github.com/RVC-Boss/GPT-SoVITS/commit/93c47cd9f0c53439536eada18879b4ec5a812ae1)
- İçerik: Hubert çıkarımının NaN hatalarına neden olarak SoVITS/GPT eğitiminde ZeroDivisionError'a yol açması sorunu çözüldü.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.23 [Commit#80fffb0a](https://github.com/RVC-Boss/GPT-SoVITS/commit/80fffb0ad46e4e7f27948d5a57c88cf342088d50)
- İçerik: Çince kelime bölme için `jieba`, `jieba_fast` ile değiştirildi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.01.23 [Commit#63625758](https://github.com/RVC-Boss/GPT-SoVITS/commit/63625758a99e645f3218dd167924e01a0e3cf0dc)
- İçerik: Model dosyası sıralama mantığı optimize edildi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.01.23 [Commit#0c691191](https://github.com/RVC-Boss/GPT-SoVITS/commit/0c691191e894c15686e88279745712b3c6dc232f)
- İçerik: Çıkarım WebUI'ında hızlı model değiştirme desteği eklendi.
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.01.25 [Commit#249561e5](https://github.com/RVC-Boss/GPT-SoVITS/commit/249561e5a18576010df6587c274d38cbd9e18b4b)
- İçerik: Çıkarım WebUI'ında gereksiz günlükler kaldırıldı.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.01.25 [PR#183](https://github.com/RVC-Boss/GPT-SoVITS/pull/183), [PR#200](https://github.com/RVC-Boss/GPT-SoVITS/pull/200)
- İçerik: Mac'te eğitim ve çıkarım desteği eklendi.
- Tür: Özellik
- Katkıda Bulunan: Lion-Wu
- 2024.01.26 [Commit#813cf96e](https://github.com/RVC-Boss/GPT-SoVITS/commit/813cf96e508ba1bb2c658f38c7cc77b797fb4082), [Commit#2d1ddeca](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d1ddeca42db90c3fe2d0cd79480fd544d87f02b)
- İçerik: UVR5'in dizinleri okuyup otomatik olarak çıkması sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.26 [PR#204](https://github.com/RVC-Boss/GPT-SoVITS/pull/204)
- İçerik: Çince-İngilizce ve Japonca-İngilizce karışık çıktı metinleri için destek eklendi.
- Tür: Özellik
- Katkıda Bulunan: Kakaru Hayate
- 2024.01.26 [Commit#f4148cf7](https://github.com/RVC-Boss/GPT-SoVITS/commit/f4148cf77fb899c22bcdd4e773d2f24ab34a73e7)
- İçerik: Çıktı için isteğe bağlı bölümleme modu eklendi.
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.01.26 [Commit#9fe955c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/9fe955c1bf5f94546c9f699141281f2661c8a180)
- İçerik: Birden fazla satır sonunun çıkarım hatasına neden olması sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.26 [Commit#84ee4719](https://github.com/RVC-Boss/GPT-SoVITS/commit/84ee471936b332bc2ccee024d6dfdedab4f0dc7b)
- İçerik: Yarım hassasiyeti desteklemeyen GPU'lar için otomatik olarak tek hassasiyet zorlandı; CPU çıkarımında tek hassasiyet zorunlu kılındı.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.01.28 [PR#238](https://github.com/RVC-Boss/GPT-SoVITS/pull/238)
- İçerik: Dockerfile'da model indirme süreci tamamlandı.
- Tür: Düzeltme
- Katkıda Bulunan: breakstring
- 2024.01.28 [PR#257](https://github.com/RVC-Boss/GPT-SoVITS/pull/257)
- İçerik: Sayıların telaffuzunun Çince karakterlere dönüşmesi sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: duliangang
- 2024.01.28 [Commit#f0cfe397](https://github.com/RVC-Boss/GPT-SoVITS/commit/f0cfe397089a6fd507d678c71adeaab5e7ed0683)
- İçerik: GPT eğitiminde kontrol noktalarının kaydedilmemesi sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.28 [Commit#b8ae5a27](https://github.com/RVC-Boss/GPT-SoVITS/commit/b8ae5a2761e2654fc0c905498009d3de9de745a8)
- İçerik: Kısıtlamalar ayarlanarak mantıksız referans ses uzunlukları hariç tutuldu.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.28 [Commit#698e9655](https://github.com/RVC-Boss/GPT-SoVITS/commit/698e9655132d194b25b86fbbc99d53c8d2cea2a3)
- İçerik: Cümlelerin başında birkaç karakterin yutulması sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.29 [Commit#ff977a5f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff977a5f5dc547e0ad82b9e0f1cd95fbc830b2b0)
- İçerik: 16 serisi gibi yarım hassasiyet eğitiminde sorun yaşayan GPU'lar için eğitim yapılandırmaları tek hassasiyete değiştirildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.29 [Commit#172e139f](https://github.com/RVC-Boss/GPT-SoVITS/commit/172e139f45ac26723bc2cf7fac0112f69d6b46ec)
- İçerik: Kullanılabilir Colab sürümü test edildi ve güncellendi.
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.01.29 [PR#135](https://github.com/RVC-Boss/GPT-SoVITS/pull/135)
- İçerik: FunASR Sürüm 1.0'a güncellendi ve arayüz uyumsuzluğundan kaynaklanan hatalar düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: LauraGPT
- 2024.01.30 [Commit#1c2fa98c](https://github.com/RVC-Boss/GPT-SoVITS/commit/1c2fa98ca8c325dcfb32797d22ff1c2a726d1cb4)
- İçerik: Çince ve İngilizce noktalama işaretlerinin bölünmesi sorunları düzeltildi ve cümle başlarına ve sonlarına noktalama işaretleri eklendi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.01.30 [Commit#74409f35](https://github.com/RVC-Boss/GPT-SoVITS/commit/74409f3570fa1c0ff28d4c65c288a6ce58ca00d2)
- İçerik: Noktalama işaretlerine göre bölme desteği eklendi.
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.01.30 [Commit#c42eeccf](https://github.com/RVC-Boss/GPT-SoVITS/commit/c42eeccfdd2d0a0d714ecc8bfc22a12373aca6b7)
- İçerik: Yeni kullanıcıların yolları çift tırnak içinde kopyalayarak hata yapmasını önlemek için tüm yol ile ilgili girdilerden çift tırnaklar otomatik olarak kaldırıldı.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
## 202402 ### 20240123 Güncellemesi
- 2024.02.01 [Commit#45f73519](https://github.com/RVC-Boss/GPT-SoVITS/commit/45f73519cc41cd17cf816d8b997a9dcb0bee04b6) 1. Hubert çıkarımının NaN hatalarına neden olup SoVITS/GPT eğitiminde ZeroDivisionError'a yol açtığı sorun çözüldü.
- İçerik: ASR yolunun `/` ile bitmesi durumunda dosya adı kaydetme hatası düzeltildi. 2. İnferans WebUI'de hızlı model değiştirme desteği eklendi.
- Tür: Düzeltme 3. Model dosyası sıralama mantığı optimize edildi.
- Katkıda Bulunan: RVC-Boss 4. Çince kelime ayrımı için `jieba` `jieba_fast` ile değiştirildi.
- 2024.02.03 [Commit#dba1a74c](https://github.com/RVC-Boss/GPT-SoVITS/commit/dba1a74ccb0cf19a1b4eb93faf11d4ec2b1fc5d7)
- İçerik: UVR5 format okuma hatası nedeniyle ayrıştırma başarısızlığı sorunu çözüldü.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.02.03 [Commit#3ebff70b](https://github.com/RVC-Boss/GPT-SoVITS/commit/3ebff70b71580ee1f97b3238c9442cbc5aef47c7)
- İçerik: Çince-Japonca-İngilizce karışık metinler için otomatik bölümleme ve dil tanıma desteği eklendi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.02.03 [PR#377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)
- İçerik: PaddleSpeech Normalizer entegre edildi - "xx.xx%" (yüzde sembolü) ve "元/吨" ifadelerinin "元吨" yerine "元每吨" olarak okunması sorunu ile alt çizgi hataları düzeltildi.
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2024.02.05 [PR#395](https://github.com/RVC-Boss/GPT-SoVITS/pull/395)
- İçerik: İngilizce metin ön uç işleme optimizasyonu yapıldı.
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2024.02.06 [Commit#65b463a7](https://github.com/RVC-Boss/GPT-SoVITS/commit/65b463a787f31637b4768cc9a47cab59541d3927)
- İçerik: Dil parametrelerinin karışması nedeniyle Çince çıkarım kalitesinin düşmesi sorunu giderildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- İlgili: [Issue#391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)
- 2024.02.06 [PR#403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)
- İçerik: UVR5, librosa'nın daha yeni sürümlerine uyumlu hale getirildi.
- Tür: Düzeltme
- Katkıda Bulunan: StaryLan
- 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
- İçerik: `is_half` parametresinin boolean'a dönüştürülmemesi nedeniyle oluşan UVR5 inf hatası düzeltildi (16 serisi GPU'larda `inf` sorununa neden oluyordu).
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2)
- İçerik: Gradio bağımlılık sorunları giderildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.02.07 [PR#400](https://github.com/RVC-Boss/GPT-SoVITS/pull/400)
- İçerik: Japonca ve İngilizce için Faster Whisper ASR entegrasyonu yapıldı.
- Tür: Özellik
- Katkıda Bulunan: Shadow
- 2024.02.07 [Commit#6469048d](https://github.com/RVC-Boss/GPT-SoVITS/commit/6469048de12a8d6f0bd05d07f031309e61575a38)~[Commit#94ee71d9](https://github.com/RVC-Boss/GPT-SoVITS/commit/94ee71d9d562d10c9a1b96e745c6a6575aa66a10)
- İçerik: Veri seti hazırlarken kök dizin boş bırakılırsa `.list` dosya yollarının otomatik okunması desteği eklendi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.02.08 [Commit#59f35ada](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)
- İçerik: Windows 10 1909 ve Geleneksel Çince sistem dilinde GPT eğitiminin donma sorunu çözülmeye çalışıldı.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- İlgili: [Issue#232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)
- 2024.02.12 [PR#457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)
- İçerik: DPO Loss eğitim seçeneği eklendi (GPT tekrarlarını ve karakter atlamalarını azaltmak için), çıkarım WebUI'sına yeni parametreler eklendi.
- Tür: Özellik
- Katkıda Bulunan: liufenghua
- 2024.02.12 [Commit#2fa74ecb](https://github.com/RVC-Boss/GPT-SoVITS/commit/2fa74ecb941db27d9015583a9be6962898d66730), [Commit#d82f6bbb](https://github.com/RVC-Boss/GPT-SoVITS/commit/d82f6bbb98ba725e6725dcee99b80ce71fb0bf28)
- İçerik: Faster Whisper ve FunASR mantığı optimize edildi, Hugging Face bağlantı sorunlarını önlemek için yansı indirmelere geçildi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.02.15 [Commit#dd2c4d6d](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2c4d6d7121bf82d29d0f0e4d788f3b231997c8)
- İçerik: Eğitimde Çince deney adları desteklendi (önceki sürümlerde hata veriyordu).
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.02.15 [Commit#ccb9b08b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ccb9b08be3c58e102defcc94ff4fd609da9e27ee)~[Commit#895fde46](https://github.com/RVC-Boss/GPT-SoVITS/commit/895fde46e420040ed26aaf0c5b7e99359d9b199b)
- İçerik: DPO eğitimi zorunlu olmaktan çıkarılıp seçmeli hale getirildi. Seçildiğinde batch boyutu otomatik yarıya indiriliyor. Çıkarım WebUI'sında yeni parametrelerin iletilmemesi sorunu düzeltildi.
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss
- 2024.02.15 [Commit#7b0c3c67](https://github.com/RVC-Boss/GPT-SoVITS/commit/7b0c3c676495c64b2064aa472bff14b5c06206a5)
- İçerik: Çince ön uç hataları düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.02.16 [PR#499](https://github.com/RVC-Boss/GPT-SoVITS/pull/499)
- İçerik: Referans metin olmadan giriş yapma desteği eklendi.
- Tür: Özellik
- Katkıda Bulunan: Watchtower-Liu
- İlgili: [Issue#475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)
- 2024.02.17 [PR#509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509), [PR#507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR#532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR#556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR#559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559)
- İçerik: Çince ve Japonca ön uç işleme optimizasyonları yapıldı.
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn, v3cun
- 2024.02.17 [PR#510](https://github.com/RVC-Boss/GPT-SoVITS/pull/510), [PR#511](https://github.com/RVC-Boss/GPT-SoVITS/pull/511)
- İçerik: Colab genel URL sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: ChanningWang2018, RVC-Boss
- 2024.02.21 [PR#557](https://github.com/RVC-Boss/GPT-SoVITS/pull/557)
- İçerik: Mac CPU çıkarımında MPS yerine CPU kullanılarak performans iyileştirildi.
- Tür: Optimizasyon
- Katkıda Bulunan: XXXXRT666
- 2024.02.21 [Commit#6da486c1](https://github.com/RVC-Boss/GPT-SoVITS/commit/6da486c15d09e3d99fa42c5e560aaac56b6b4ce1), [Commit#5a171773](https://github.com/RVC-Boss/GPT-SoVITS/commit/5a17177342d2df1e11369f2f4f58d34a3feb1a35)
- İçerik: Veri işleme sırasında gürültü azaltma seçeneği eklendi (sadece 16kHz örnekleme hızını korur, sadece yüksek arka plan gürültüsü varsa kullanılması önerilir).
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.02.28 [PR#573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)
- İçerik: Mac'te CPU çıkarımının düzgün çalışması için `is_half` kontrolü düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
- 2024.02.28 [PR#610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)
- İçerik: UVR5 reverb kaldırma modelinde ayarların ters olması sorunu düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: Yuze Wang
## 202403 ### 20240126 Güncellemesi
- 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) 1. Çince-İngilizce ve Japonca-İngilizce karışık çıktı metinleri için destek eklendi.
- İçerik: CUDA yokken Faster Whisper için otomatik CPU çıkarımı etkinleştirildi 2. Çıktı için isteğe bağlı bir bölme modu eklendi.
- Tür: Optimizasyon 3. UVR5'in dizinlerden otomatik olarak çıkmasına neden olan okuma sorununu düzelttik.
- Katkıda Bulunan: ShiroDoMain 4. Çeşitli yeni satır sorunlarını düzelterek çıkarım hatalarını giderdik.
- 2024.03.06 [Commit#616be20d](https://github.com/RVC-Boss/GPT-SoVITS/commit/616be20db3cf94f1cd663782fea61b2370704193) 5. Çıkarım WebUI'deki gereksiz günlükleri kaldırdık.
- İçerik: Faster Whisper Çince olmayan ASR kullanırken artık Çince FunASR modelini önceden indirmeye gerek yok 6. Mac'te eğitim ve çıkarım desteği eklendi.
- Tür: Optimizasyon 7. Yarım hassasiyeti desteklemeyen GPU'lar için otomatik olarak tek hassasiyet zorlandı; CPU çıkarımında tek hassasiyet uygulandı.
- Katkıda Bulunan: RVC-Boss
- 2024.03.09 [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- İçerik: Çıkarım hızı %50 iyileştirildi (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 ortamında test edildi)
- Tür: Optimizasyon
- Katkıda Bulunan: GoHomeToMacDonal
- 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
- İçerik: Hızlı çıkarım dalı 'fast_inference_' eklendi
- Tür: Özellik
- Katkıda Bulunan: ChasonJiang
- 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
- İçerik: CPU ile eğitim desteği eklendi, macOS'ta CPU kullanarak eğitim yapılabilir
- Tür: Özellik
- Katkıda Bulunan: Lion-Wu
- 2024.03.19 [PR#804](https://github.com/RVC-Boss/GPT-SoVITS/pull/804), [PR#812](https://github.com/RVC-Boss/GPT-SoVITS/pull/812), [PR#821](https://github.com/RVC-Boss/GPT-SoVITS/pull/821)
- İçerik: İngilizce metin ön uç iyileştirmeleri
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
- 2024.03.30 [PR#894](https://github.com/RVC-Boss/GPT-SoVITS/pull/894)
- İçerik: API formatı geliştirildi
- Tür: Optimizasyon
- Katkıda Bulunan: KamioRinn
## 202404 ### 20240128 Güncellemesi
- 2024.04.03 [PR#917](https://github.com/RVC-Boss/GPT-SoVITS/pull/917) 1. Sayıların Çince karakterlere dönüştürülmesiyle ilgili sorunu düzelttik.
- İçerik: UVR5 WebUI'da FFmpeg komut dizgisi biçimlendirmesi düzeltildi 2. Cümlelerin başındaki birkaç karakterin yutulması sorununu düzelttik.
- Tür: Düzeltme 3. Mantıksız referans ses uzunluklarını sınırlamalar koyarak hariç tuttuk.
- Katkıda Bulunan: StaryLan 4. GPT eğitiminin kontrol noktalarını kaydetmemesi sorununu düzelttik.
5. Dockerfile'da model indirme sürecini tamamladık.
## 202405 ### 20240129 Güncellemesi
- 2024.05.02 [PR#953](https://github.com/RVC-Boss/GPT-SoVITS/pull/953) 1. Yarım hassasiyet eğitimi ile ilgili sorun yaşayan 16 serisi gibi GPU'lar için eğitim yapılandırmalarını tek hassasiyete değiştirdik.
- İçerik: SoVITS eğitiminde VQ'nun dondurulmamasından kaynaklanan kalite düşüşü sorunu çözüldü 2. Mevcut Colab sürümünü test ettik ve güncelledik.
- Tür: Düzeltme 3. Eski sürüm FunASR ile ModelScope FunASR deposunun git klonlanmasıyla oluşan arayüz hizalama hatalarını düzelttik.
- Katkıda Bulunan: hcwu1993
- İlgili: [Issue#747](https://github.com/RVC-Boss/GPT-SoVITS/issues/747)
- 2024.05.19 [PR#1102](https://github.com/RVC-Boss/GPT-SoVITS/pull/1102)
- İçerik: Eğitim verisi işleme sırasında desteklenmeyen diller için hata mesajı eklendi
- Tür: Optimizasyon
- Katkıda Bulunan: StaryLan
- 2024.05.27 [PR#1132](https://github.com/RVC-Boss/GPT-SoVITS/pull/1132)
- İçerik: Hubert çıkarım hatası düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
## 202406 ### 20240130 Güncellemesi
- 2024.06.06 [Commit#99f09c8b](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) 1. Çift tırnaklarla yol kopyalama hatalarını önlemek için tüm yol ile ilgili girdilerden otomatik olarak çift tırnakları kaldırdık.
- İçerik: WebUI'da GPT ince ayarında Çince metinlerin BERT özelliklerinin okunmaması nedeniyle çıkarım tutarsızlığı ve kalite düşüşü sorunu düzeltildi 2. Çince ve İngilizce noktalama işaretlerini ayırma sorunlarını düzelttik ve cümlelerin başına ve sonuna noktalama işaretleri ekledik.
**Uyarı: Daha önce büyük miktarda veriyle ince ayar yaptıysanız, kaliteyi artırmak için modeli yeniden ayarlamanız önerilir** 3. Noktalama işaretlerine göre ayırma özelliğini ekledik.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.06.07 [PR#1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)
- İçerik: `s2_train.py` dosyasında SoVITS eğitim ilerleme çubuğu mantığı düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: pengzhendong
- 2024.06.10 [Commit#501a74ae](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
- İçerik: UVR5 MDXNet'in FFmpeg çağrılarında boşluk içeren yollarla uyumlu olması için dize biçimlendirme düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.06.10 [PR#1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168), [PR#1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)
- İçerik: Saf noktalama işaretleri ve çoklu noktalama işaretli metin girişi işleme mantığı iyileştirildi
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
- İlgili: [Issue#1165](https://github.com/RVC-Boss/GPT-SoVITS/issues/1165)
- 2024.06.13 [Commit#db506705](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
- İçerik: CPU çıkarımında varsayılan batch boyutu ondalık sorunu düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.06.28 [PR#1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR#1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR#1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267)
- İçerik: Gürültü azaltma veya ASR işlemi sırasında istisna oluştuğunda bekleyen tüm ses dosyalarının kapanması sorunu düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
- 2024.06.29 [Commit#a208698e](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
- İçerik: Çoklu GPU eğitiminde çoklu işlem kayıt mantığı düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2024.06.29 [PR#1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251)
- İçerik: Yinelenen `my_utils.py` dosyası kaldırıldı
- Tür: Optimizasyon
- Katkıda Bulunan: aoguai
- İlgili: [Issue#1189](https://github.com/RVC-Boss/GPT-SoVITS/issues/1189)
## 202407 ### 20240201 Güncellemesi
- 2024.07.06 [PR#1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) 1. Ayrılma hatalarına neden olan UVR5 format okuma hatasını düzelttik.
- İçerik: Noktalama işaretlerine göre bölme işlemi sırasında ondalık sayıların bölünmesi sorunu düzeltildi 2. Karışık Çince-Japonca-İngilizce metinler için otomatik segmentasyon ve dil tanıma desteği sağladık.
- Tür: Düzeltme
- Katkıda Bulunan: aoguai
- 2024.07.06 [Commit#b0786f29](https://github.com/RVC-Boss/GPT-SoVITS/commit/b0786f2998f1b2fce6678434524b4e0e8cc716f5)
- İçerik: Hızlandırılmış çıkarım kodu doğrulandı ve ana dal ile birleştirildi. Temel sürümle aynı çıkarım etkisi garanti edilirken referans metni olmayan modda da hızlandırılmış çıkarım destekleniyor
- Tür: Optimizasyon
- Katkıda Bulunan: RVC-Boss, GoHomeToMacDonal
- İlgili: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
- 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
- İçerik: i18n taraması yeniden düzenlendi ve çok dilli yapılandırma dosyaları güncellendi
- Tür: Dokümantasyon
- Katkıda Bulunan: StaryLan
- 2024.07.13 [PR#1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299)
- İçerik: Kullanıcı dosya yollarındaki son eğik çizgilerin neden olduğu komut satırı hataları düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
- 2024.07.19 [PR#756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756)
- İçerik: GPT eğitiminde özel `bucket_sampler` kullanılırken eğitim adımlarında tutarsızlık sorunu düzeltildi
- Tür: Düzeltme
- Katkıda Bulunan: huangxu1991
- 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340)
- İçerik: Sentez sırasında konuşma hızı ayarlama özelliği eklendi (rastgeleliği sabitleme ve sadece hızı kontrol etme seçeneği dahil). Bu özellik `api.py` dosyasına eklendi
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss, 红血球AE3803
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356)
- İçerik: BS-RoFormer vokal eşlik ayırma modeli desteği eklendi.
- Tür: Yeni Özellik
- Katkıda Bulunan: KamioRinn
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351)
- İçerik: Çince metin ön işleme iyileştirildi.
- Tür: Yeni Özellik
- Katkıda Bulunan: KamioRinn
## 202408 (V2 Sürümü) ### 20240202 Güncellemesi
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) 1. `/` ile biten bir ASR yolunun dosya adını kaydetme hatasına neden olma sorununu düzelttik.
- İçerik: WebUI'de dosya işlerken yolların otomatik doldurulması. 2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) PaddleSpeech'in Normalizer'ını tanıtarak "xx.xx%" (yüzde sembolleri) ve "元/吨" ifadesinin "元吨" yerine "元每吨" olarak okunması gibi sorunları düzelttik ve alt çizgi hatalarını giderdik.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
- İçerik: BS-Roformer için FP16 çıkarım desteği etkinleştirildi.
- Tür: Performans Optimizasyonu
- Katkıda Bulunan: RVC-Boss
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
- İçerik: GPU tanıma mantığı optimize edildi, kullanıcıların girdiği rastgele GPU indekslerini işlemek için kullanıcı dostu mantık eklendi.
- Tür: Chore
- Katkıda Bulunan: RVC-Boss
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758)
- İçerik: **GPT-SoVITS V2 modeli eklendi.**
- Tür: Yeni Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
- İçerik: FunASR kullanarak Kantonca ASR desteği eklendi.
- Tür: Yeni Özellik
- Katkıda Bulunan: RVC-Boss
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388)
- İçerik: UI ve zamanlama mantığı optimize edildi.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488)
- İçerik: Çok sesli karakter işleme mantığı optimize edildi (Yalnızca V2).
- Tür: Düzeltme, Yeni Özellik
- Katkıda Bulunan: KamioRinn, RVC-Boss
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422)
- İçerik: Yalnızca bir referans sesin yüklenebilmesine neden olan hata düzeltildi; eksik dosyalar için uyarı pencereleriyle veri seti doğrulama eklendi.
- Tür: Düzeltme, Chore
- Katkıda Bulunan: XXXXRT666
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508)
- İçerik: Yukarı akış LangSegment kütüphanesi artık SSML etiketleri kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize ediyor.
- Tür: Yeni Özellik
- Katkıda Bulunan: juntaosun
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503)
- İçerik: API düzeltildi ve optimize edildi.
- Tür: Düzeltme
- Katkıda Bulunan: KamioRinn
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490)
- İçerik: `fast_inference` dalı ana dala birleştirildi.
- Tür: Yeniden Yapılandırma
- Katkıda Bulunan: ChasonJiang
- 2024.08.21 **GPT-SoVITS V2 sürümü resmi olarak yayınlandı.**
## 202502 (V3 Sürümü) ### 20240207 Güncellemesi
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4) 1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)de bildirilen dil parametresi karışıklığının Çinçe çıkarım kalitesini düşürme sorununu düzelttik.
- İçerik: **İnce ayar için 14GB VRAM gerektiren GPT-SoVITS V3 modeli eklendi.** 2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) ile UVR5'i daha yüksek versiyonlarda librosa'ya uyarladık.
- Tür: Yeni Özellik ([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) referans) 3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) `is_half` parametresinin booleana dönüştürülmemesi nedeniyle sürekli yarım hassasiyet çıkarımı yaparak 16 serisi GPU'larda `inf` hatasına neden olan UVR5 inf hatasını düzelttik.
- Katkıda Bulunan: RVC-Boss 4. İngilizce metin önyüzünü optimize ettik.
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032) 5. Gradio bağımlılıklarını düzelttik.
- İçerik: Çok dilli proje dokümantasyonu güncellendi. 6. Veri seti hazırlığı sırasında kök dizini boş bırakıldığında `.list` tam yollarının otomatik olarak okunmasını destekledik.
- Tür: Dokümantasyon 7. Japonca ve İngilizce için Faster Whisper ASR'yi entegre ettik.
- Katkıda Bulunan: StaryLan
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033)
- İçerik: Japonca dokümantasyon güncellendi.
- Tür: Dokümantasyon
- Katkıda Bulunan: Fyphen
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010)
- İçerik: Dikkat hesaplama mantığı optimize edildi.
- Tür: Performans Optimizasyonu
- Katkıda Bulunan: wzy3650
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040)
- İçerik: İnce ayar için gradyan kontrol noktası desteği eklendi (12GB VRAM gerektirir).
- Tür: Yeni Özellik
- Katkıda Bulunan: Kakaru Hayate
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073)
- İçerik: Yeni dil bölümleme aracına geçildi, çok dilli karışık metin bölme stratejisi iyileştirildi, sayı ve İngilizce işleme mantığı optimize edildi.
- Tür: Yeni Özellik
- Katkıda Bulunan: KamioRinn
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e)
- İçerik: **GPT-SoVITS V3 modeli artık LoRA eğitimini destekliyor (ince ayar için 8GB GPU Belleği gerektirir).**
- Tür: Yeni Özellik
- Katkıda Bulunan: RVC-Boss
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078)
- İçerik: Vokal ve enstrüman ayırma için Mel Band Roformer model desteği eklendi.
- Tür: Yeni Özellik
- Katkıda Bulunan: Sucial
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114)
- İçerik: Çince yollarda MeCab hatası düzeltildi (özel olarak Japonca/Korece veya çok dilli metin bölme için).
- Tür: Düzeltme
- Katkıda Bulunan: KamioRinn
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc)
- İçerik: V3 modeliyle 24K ses üretirken "boğuk" ses sorununu hafifletmek için **24kHz'den 48kHz'e ses süper çözünürlük modelleri eklendi**.
- Tür: Yeni Özellik
- Katkıda Bulunan: RVC-Boss
- İlgili: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123)
- İçerik: Çok dilli proje dokümantasyonu güncellendi.
- Tür: Dokümantasyon
- Katkıda Bulunan: StaryLan
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122)
- İçerik: Model tanımlayamadığında kısa CJK karakterleri için kural tabanlı tespit uygulandı.
- Tür: Düzeltme
- Katkıda Bulunan: KamioRinn
- İlgili: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5)
- İçerik: Sentez hızını kontrol etmek için konuşma hızı parametresi eklendi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.02.28 **GPT-SoVITS V3 resmi olarak yayınlandı**.
## 202503 ### 20240208 Güncellemesi
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236) 1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) ile Windows 10 1909'da ve [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)de (Geleneksel Çince Sistem Dili) bildirilen GPT eğitim durma sorununu düzeltmeye çalıştık.
- İçerik: Bağımlılıkların yanlış sürümlerinden kaynaklanan sorunlar düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: XXXXRT666
- İlgili:
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
- İçerik: **SoVITS v3 için paralel çıkarım etkinleştirildi.**
- Tür: Yeni Özellik
- Katkıda Bulunan: ChasonJiang
- Diğer küçük hatalar düzeltildi. ### 20240212 Güncellemesi
- ONNX çalışma zamanı GPU çıkarım desteği için entegre paket düzeltmeleri: 1. Faster Whisper ve FunASR için mantığı optimize ettik, Hugging Face bağlantı sorunlarını önlemek için Faster Whisper'ı ayna indirmelere yönlendirdik.
- Tür: Düzeltme 2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) GPT tekrarı ve eksik karakterleri azaltmak için eğitim sırasında negatif örnekler oluşturarak deneysel DPO Loss eğitim seçeneğini etkinleştirdi ve çıkarım WebUI'de çeşitli çıkarım parametrelerini kullanılabilir hale getirdi.
- Detaylar:
- G2PW içindeki ONNX modelleri CPU'dan GPU çıkarımına geçirildi, CPU darboğazı önemli ölçüde azaltıldı;
- foxjoy yankı giderme modeli artık GPU çıkarımını destekliyor.
## 202504 (V4 Sürümü) ### 20240214 Güncellemesi
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f) 1. Eğitimde Çince deney adlarını destekledik (önceden hatalara neden oluyordu).
- İçerik: SoVITS v3 paralel çıkarımı kilit açıldı; asenkron model yükleme mantığı düzeltildi. 2. DPO eğitimini zorunlu yerine isteğe bağlı bir özellik yaptık. Seçilirse, parti boyutu otomatik olarak yarıya indirilir. Çıkarım WebUI'de yeni parametrelerin iletilmemesi sorunlarını düzelttik.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255)
- İçerik: Ruff ile kod biçimlendirme; G2PW bağlantısı güncellendi.
- Tür: Stil
- Katkıda Bulunan: XXXXRT666
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290)
- İçerik: Dokümantasyon temizlendi; Python 3.11 desteği eklendi; yükleyiciler güncellendi.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300)
- İçerik: Colab, kurulum dosyaları ve model indirmeleri güncellendi.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf)
- İçerik: **GPT-SoVITS V4 modeli eklendi.**
- Tür: Yeni Özellik
- Katkıda Bulunan: RVC-Boss
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307)
- İçerik: V4 için paralel çıkarım etkinleştirildi.
- Tür: Yeni Özellik
- Katkıda Bulunan: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309)
- İçerik: Model sürümü parametre aktarımı düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss, ChasonJiang
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310)
- İçerik: Numpy ve Numba sürüm uyumsuzluğu sorunu düzeltildi; librosa sürümü güncellendi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss, XXXXRT666
- İlgili: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
- **2025.04.22 GPT-SoVITS V4 resmi olarak yayınlandı**.
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311)
- İçerik: Gradio parametreleri güncellendi.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322)
- İçerik: Colab/Kaggle notebook betikleri iyileştirildi.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
## 202505
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351)
- İçerik: Docker ve Windows otomatik derleme betikleri iyileştirildi; ön işleme biçimlendirme eklendi.
- Tür: Chore
- Katkıda Bulunan: XXXXRT666
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408)
- İçerik: Çok dilli metin bölme ve tanıma mantığı optimize edildi.
- Tür: Düzeltme
- Katkıda Bulunan: KamioRinn
- İlgili: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377)
- İçerik: SoVITS V3/V4 çıkarım hızını %10 artırmak için önbellekleme stratejileri uygulandı.
- Tür: Performans Optimizasyonu
- Katkıda Bulunan: Kakaru Hayate
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba)
- İçerik: Açıklama arayüzü uyarı ile güncellendi: her sayfa tamamlandıktan sonra "Metni Gönder"e tıklayın, aksi takdirde değişiklikler kaydedilmez.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc)
- İçerik: UVR5 ve ONNX yankı giderme modellerinde, FFmpeg'in orijinal yollarında boşluk bulunan MP3/M4A dosyalarını kodlarken oluşan hatalar düzeltildi.
- Tür: Düzeltme
- Katkıda Bulunan: RVC-Boss
## 202506 (V2Pro Serisi)
- 2025.06.03 [PR#2420](https://github.com/RVC-Boss/GPT-SoVITS/pull/2420)
- İçerik: Çok dilli proje dokümantasyonu güncellendi.
- Tür: Dokümantasyon
- Katkıda Bulunan: StaryLan
- 2025.06.04 [PR#2417](https://github.com/RVC-Boss/GPT-SoVITS/pull/2417)
- İçerik: TorchScript ile V4 dışa aktarma desteği eklendi.
- Tür: Özellik
- Katkıda Bulunan: L-jasmine
- 2025.06.04 [Commit#b7c0c5ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/b7c0c5ca878bcdd419fd86bf80dba431a6653356)~[Commit#298ebb03](https://github.com/RVC-Boss/GPT-SoVITS/commit/298ebb03c5a719388527ae6a586c7ea960344e70)
- İçerik: **GPT-SoVITS V2Pro Serisi model eklendi (V2Pro, V2ProPlus)**
- Tür: Özellik
- Katkıda Bulunan: RVC-Boss
- 2025.06.05 [PR#2426](https://github.com/RVC-Boss/GPT-SoVITS/pull/2426)
- İçerik: `config/inference_webui` başlatma hatası düzeltildi.
- Tür: Hata Düzeltme
- Katkıda Bulunan: StaryLan
- 2025.06.05 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427), [Commit#7d70852a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7d70852a3f67c3b52e3a62857f8663d529efc8cd), [PR#2434](https://github.com/RVC-Boss/GPT-SoVITS/pull/2434)
- İçerik: Otomatik hassasiyet algılama mantığı optimize edildi; WebUI önyüz modüllerine katlanabilir özellik eklendi.
- Tür: Yeni Özellik
- Katkıda Bulunanlar: XXXXRT666, RVC-Boss
### 20240216 Güncellemesi
1. Referans metin olmadan girişi destekledik.
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) de bildirilen Çince önyüz hatalarını düzelttik.
### 20240221 Güncellemesi
1. Veri işleme sırasında bir gürültü azaltma seçeneği ekledik (gürültü azaltma sadece 16kHz örnekleme hızını bırakır; yalnızca arka plan gürültüsü önemliyse kullanın).
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) ile Çince ve Japonca önyüz işlemesini optimize ettik.
3. Mac CPU çıkarımını daha hızlı performans için MPS yerine CPU kullanacak şekilde değiştirdik.
4. Colab genel URL sorununu düzelttik.
### 20240306 Güncellemesi
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) çıkarımı %50 hızlandırdı (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 üzerinde test edildi).
2. Faster Whisper'ın Çince olmayan ASR'sini kullanırken artık önce Çin FunASR modelini indirmeyi gerektirmiyor.
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) UVR5 yankı giderme modelindeki ayarın tersine çevrildiği sorunu düzeltti.
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) CUDA mevcut olmadığında Faster Whisper için otomatik CPU çıkarımını etkinleştirdi.
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) Mac'te doğru CPU çıkarımı sağlamak için `is_half` kontrolünü değiştirdi.
### 202403/202404/202405 Güncellemeleri
#### Küçük Düzeltmeler:
1. Referans metin olmayan mod ile ilgili sorunlar düzeltildi.
2. Çince ve İngilizce metin önyüzü optimize edildi.
3. API formatı iyileştirildi.
4. CMD format sorunları düzeltildi.
5. Eğitim verisi işleme sırasında desteklenmeyen diller için hata uyarıları eklendi.
6. Hubert çıkarımındaki hata düzeltildi.
#### Büyük Düzeltmeler:
1. VQ'yu dondurmadan yapılan SoVITS eğitimi sorunu (bu kalite düşüşüne neden olabilir) düzeltildi.
2. Hızlı çıkarım dalı eklendi.
### 20240610 Güncellemesi
#### Küçük Düzeltmeler:
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) saf noktalama işareti ve çoklu noktalama işareti metin girdisi için mantığı geliştirdi.
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5'teki MDXNet yankı giderme için CMD formatını düzeltti, boşluk içeren yolları destekledi.
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py` içindeki SoVITS eğitimi için ilerleme çubuğu mantığını düzeltti.
#### Büyük Düzeltmeler:
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI'nin GPT ince ayarının, Çince giriş metinlerinin BERT özelliğini okumaması sorununu düzeltti, bu da çıkarım ile tutarsızlığa ve potansiyel kalite düşüşüne neden oluyordu.
**Dikkat: Daha önce büyük miktarda veri ile ince ayar yaptıysanız, modelin kalitesini artırmak için yeniden ayar yapmanız önerilir.**
### 20240706 Güncellemesi
#### Küçük Düzeltmeler:
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU çıkarımında varsayılan yığın boyutu ondalık sorununu düzeltti.
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) gürültü giderme veya ASR ile ilgili istisnalarla karşılaşıldığında bekleyen tüm ses dosyalarının çıkış yapmasına neden olan sorunları düzeltti.
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) noktalama işaretlerine göre ayrılırken ondalıkların bölünmesi sorununu düzeltti.
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) çoklu GPU eğitimi için çoklu işlem kaydetme mantığını düzeltti.
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) gereksiz `my_utils`'ı kaldırdı.
#### Büyük Düzeltmeler:
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) hızlandırılmış çıkarım kodu doğrulandı ve ana dala birleştirildi, taban ile tutarlı çıkarım etkileri sağlandı.
Ayrıca referans metni olmayan modda hızlandırılmış çıkarımı destekler.
**Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.**
### 20240727 Güncellemesi
#### Küçük Düzeltmeler:
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) gereksiz i18n kodlarını temizledi.
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) kullanıcı dosya yollarındaki sonlandırma eğik çizgilerinin komut satırı hatalarına neden olduğu sorunları düzeltti.
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT eğitimindeki adım hesaplama mantığını düzeltti.
#### Büyük Düzeltmeler:
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) sentez için konuşma hızı ayarlamasını destekledi.
Konuşma hızını ayarlarken rastgeleliği dondurmayı etkinleştirdi.
### 20240806 Güncellemesi
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer vokal eşlik ayırma modelini desteklemeye başladı. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 çıkarımı etkinleştirdi.
2. Çince metin ön yüzünü geliştirdi.
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) çoklu heceli karakterler için destek ekledi (yalnızca v2);
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) sayı belirleyici ekledi;
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) aritmetik ve temel matematik formüllerini destekler;
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) karışık metin hatalarını düzeltti.
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUI'de ses işlenirken yolları otomatik olarak doldurdu.
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU tanıma mantığını optimize etti.
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) Kantonca ASR desteği ekledi.
6. GPT-SoVITS v2 desteği eklendi.
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) zamanlama mantığını optimize etti.
### 20240821 Güncellemesi
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` dalını ana dala birleştirdi.
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML etiketlerini kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize etme desteği.
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API düzeltildi ve optimize edildi.
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Karıştırma için yalnızca bir referans sesin yüklenebilmesi hatası düzeltildi; çeşitli veri seti kontrolleri eklendi ve eksik dosyalar için uyarı verilmesi sağlandı.
### 20250211 Güncellemesi
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 Modeli Eklendi, SoVITS v3'ü ince ayar yapmak için 14GB GPU belleği gereklidir.
### 20250212 Güncellemesi
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3'ü ince ayar yapmak için gradyan kontrol noktası ekledi, 12GB GPU belleği gereklidir.
### 20250214 Güncellemesi
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Çok dilli karışık metin segmentasyon stratejisi **A**'yı optimize etti.
- `split-lang` bir dil segmentasyon aracı olarak eklendi ve çok dilli karışık metinlerin segmentasyon yeteneklerini iyileştirdi.
### 20250217 Güncellemesi
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Metindeki sayılar ve İngilizceyi işleme mantığını optimize etti.
### 20250218 Güncellemesi
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Çok dilli karışık metin segmentasyon stratejisi **B**'yi optimize etti.
### 20250223 Güncellemesi
1. SoVITS V3 için LoRA eğitimi, ince ayar yapmayı destekler. 8GB GPU belleği gereklidir ve sonuçlar tam parametreli ince ayar yapmaktan daha iyidir.
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Mel Band RoFormer modelini vokal ve enstrüman ayrımı için ekledi.
### 20250226 Güncellemesi
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti.
- Korece için `langsegmenter` kullanımı ile ilgili sorun düzeltildi.
2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti.
- Korece/Japonca için `langsegmenter` kullanımı ile ilgili sorun düzeltildi.
### 20250227 Güncellemesi
- V3 modeliyle 24K ses oluştururken ortaya çıkan boğukluk sorununu ([Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085) ve [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117) sayfalarında bildirilmiştir) hafifletmek için 24K'dan 48K'ya ses süper çözünürlük modelleri eklendi.

View File

@ -7,17 +7,12 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
[![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org) <!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb) [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
[![Huggingface](https://img.shields.io/badge/免费在线体验-free_online_demo-yellow.svg?style=for-the-badge&logo=huggingface)](https://lj1995-gpt-sovits-proplus.hf.space/) [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits) [![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
[![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
[![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe** [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
@ -48,63 +43,46 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
### Test Edilmiş Ortamlar
| Python Version | PyTorch Version | Device | | Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- | |----------------|------------------|-----------------|
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | | Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | | Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | | Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon | | Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon | | Python 3.11 | PyTorch 2.6.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU | | Python 3.9 | PyTorch 2.2.2 | CPU |
### Windows ### Windows
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
```
### Linux ### Linux
```bash ```bash
conda create -n GPTSoVits python=3.10 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5] bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
``` ```
### macOS ### macOS
**Not: Mac'lerde GPU'larla eğitilen modeller, diğer cihazlarda eğitilenlere göre önemli ölçüde daha düşük kalitede sonuç verir, bu nedenle geçici olarak CPU'lar kullanıyoruz.**
Aşağıdaki komutları çalıştırarak programı yükleyin: 1. `xcode-select --install` komutunu çalıştırarak Xcode komut satırı araçlarını yükleyin.
2. Aşağıdaki komutları çalıştırarak programı yükleyin:
```bash ```bash
conda create -n GPTSoVits python=3.10 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5] bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
``` ```
### El ile Yükleme ### El ile Yükleme
#### Bağımlılıkları Yükleme
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
#### FFmpeg'i Yükleme #### FFmpeg'i Yükleme
##### Conda Kullanıcıları ##### Conda Kullanıcıları
```bash ```bash
conda activate GPTSoVits
conda install ffmpeg conda install ffmpeg
``` ```
@ -113,13 +91,12 @@ conda install ffmpeg
```bash ```bash
sudo apt install ffmpeg sudo apt install ffmpeg
sudo apt install libsox-dev sudo apt install libsox-dev
conda install -c conda-forge 'ffmpeg<7'
``` ```
##### Windows Kullanıcıları ##### Windows Kullanıcıları
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin
##### MacOS Kullanıcıları ##### MacOS Kullanıcıları
@ -127,54 +104,36 @@ sudo apt install libsox-dev
brew install ffmpeg brew install ffmpeg
``` ```
### GPT-SoVITS Çalıştırma (Docker Kullanarak) #### Bağımlılıkları Yükleme
#### Docker İmajı Seçimi
Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin:
- En güncel kullanılabilir imaj etiketlerini görmek için [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) adresini kontrol edin
- Ortamınıza uygun bir imaj etiketi seçin
- `Lite`, Docker imajında ASR modelleri ve UVR5 modellerinin **bulunmadığı** anlamına gelir. UVR5 modellerini manuel olarak indirebilirsiniz; ASR modelleri ise gerektiğinde program tarafından otomatik olarak indirilir
- Docker Compose sırasında, uygun mimariye (amd64 veya arm64) ait imaj otomatik olarak indirilir
- Docker Compose, mevcut dizindeki **tüm dosyaları** bağlayacaktır. Docker imajını kullanmadan önce lütfen proje kök dizinine geçin ve **en son kodu çekin**
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
#### Ortam Değişkenleri
- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPUnuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın.
#### Paylaşılan Bellek Yapılandırması
Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir.
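Aşağıdaki taslak, compose dosyasını düzenlemek yerine imajı doğrudan `docker run` ile artırılmış paylaşılan bellek ve `is_half` ortam değişkeniyle başlatmanın olası bir yolunu gösterir; imaj etiketi, bağlama hedefi ve port listesi burada yalnızca varsayımsal örneklerdir:

```bash
# Varsayımsal örnek: shm boyutu ve is_half, docker run üzerinden ayarlanır
# (imaj etiketi "latest" ve /workspace/GPT-SoVITS bağlama yolu birer varsayımdır)
docker run --rm -it --gpus=all \
  -e is_half=true \
  --shm-size=16g \
  -v "$(pwd)":/workspace/GPT-SoVITS \
  -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 -p 9880:9880 \
  xxxxrt666/gpt-sovits:latest
```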
#### Servis Seçimi
`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır:
- `GPT-SoVITS-CU126` ve `GPT-SoVITS-CU128`: Tüm özellikleri içeren tam sürüm.
- `GPT-SoVITS-CU126-Lite` ve `GPT-SoVITS-CU128-Lite`: Daha az bağımlılığa ve sınırlı işlevselliğe sahip hafif sürüm.
Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanın:
```bash ```bash
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
``` ```
#### Docker İmajını Yerel Olarak Oluşturma ### Docker Kullanarak
Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın: #### docker-compose.yaml yapılandırması
```bash 0. Görüntü etiketleri hakkında: Kod tabanındaki hızlı güncellemeler ve görüntüleri paketleme ve test etme işleminin yavaş olması nedeniyle, lütfen şu anda paketlenmiş en son görüntüleri kontrol etmek için [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(eski sürüm) adresini kontrol edin ve durumunuza göre seçim yapın veya alternatif olarak, kendi ihtiyaçlarınıza göre bir Dockerfile kullanarak yerel olarak oluşturun.
bash docker_build.sh --cuda <12.6|12.8> [--lite] 1. Ortam Değişkenleri:
- is_half: Yarım hassasiyet/çift hassasiyeti kontrol eder. Bu genellikle "SSL çıkarma" adımı sırasında 4-cnhubert/5-wav32k dizinleri altındaki içeriğin doğru şekilde oluşturulmamasının nedenidir. Gerçek durumunuza göre True veya False olarak ayarlayın.
2. Birim Yapılandırması, Kapsayıcı içindeki uygulamanın kök dizini /workspace olarak ayarlanmıştır. Varsayılan docker-compose.yaml, içerik yükleme/indirme için bazı pratik örnekler listeler.
3. shm_size: Windows üzerinde Docker Desktop için varsayılan kullanılabilir bellek çok küçüktür, bu da anormal işlemlere neden olabilir. Kendi durumunuza göre ayarlayın.
4. Dağıtım bölümü altında, GPU ile ilgili ayarlar sisteminize ve gerçek koşullara göre dikkatlice ayarlanmalıdır.
#### docker compose ile çalıştırma
```
docker compose -f "docker-compose.yaml" up -d
``` ```
#### Çalışan Konteynere Erişim (Bash Shell) #### docker komutu ile çalıştırma
Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz: Yukarıdaki gibi, ilgili parametreleri gerçek durumunuza göre değiştirin, ardından aşağıdaki komutu çalıştırın:
```bash ```
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
``` ```
## Önceden Eğitilmiş Modeller
@ -244,12 +203,12 @@ veya WebUI'de manuel olarak sürüm değiştirin.
#### Yol Otomatik Doldurma artık destekleniyor
1. Ses yolunu doldurun
2. Sesi küçük parçalara ayırın
3. Gürültü azaltma (isteğe bağlı)
4. ASR
5. ASR transkripsiyonlarını düzeltin
6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın
### Çıkarım WebUI'sini Açın
@ -291,13 +250,13 @@ V1 ortamından V2'yi kullanmak için:
2. github'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin. 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
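Bu adımın kabaca nasıl görünebileceğine dair küçük bir taslak (bağlantı yukarıdaki HF bağlantısıyla aynıdır; arşivden çıkan klasörün adı `G2PWModel` değilse yeniden adlandırmanız gerekebilir):

```bash
# Varsayımsal taslak: G2PW modelini indirip GPT_SoVITS/text altına yerleştirme (depo kök dizininden çalıştırın)
wget -c "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
unzip -o G2PWModel.zip -d GPT_SoVITS/text
rm G2PWModel.zip
# Çıkan klasörün adı farklıysa:
# mv "GPT_SoVITS/text/<çıkan_klasör>" GPT_SoVITS/text/G2PWModel
```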
## V3 Sürüm Notları
Yeni Özellikler: ### Yeni Özellikler:
1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir).
@ -305,48 +264,16 @@ Yeni Özellikler:
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>) [daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V2 ortamında V3 kullanımı: ### v2 ortamında v3 kullanımı:
1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin.
2. GitHub'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin. 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
## V4 Sürüm Notları
Yeni Özellikler:
1. **V4, V3'te görülen non-integer upsample işleminden kaynaklanan metalik ses sorununu düzeltti ve sesin boğuklaşmasını önlemek için doğrudan 48kHz ses çıktısı sunar (V3 sadece 24kHz destekler)**. Yazar, V4'ün V3'ün yerine geçebileceğini belirtmiştir ancak daha fazla test yapılması gerekmektedir.
[Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1/V2/V3 ortamından V4'e geçiş:
1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
2. GitHub'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V4 ön eğitilmiş modelleri indirin (`gsv-v4-pretrained/s2v4.ckpt` ve `gsv-v4-pretrained/vocoder.pth`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
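Üçüncü adım için, dosyaların HF deposunda `gsv-v4-pretrained/` yolu altında durduğu varsayımıyla olası bir indirme taslağı (bağlantı düzeni bir varsayımdır; yolları kendi ortamınızda doğrulayın):

```bash
# Varsayımsal taslak: V4 ön eğitilmiş ağırlıklarını indirme (depo kök dizininden çalıştırın)
BASE="https://huggingface.co/lj1995/GPT-SoVITS/resolve/main"
wget -c "$BASE/gsv-v4-pretrained/s2v4.ckpt" -P GPT_SoVITS/pretrained_models/gsv-v4-pretrained
wget -c "$BASE/gsv-v4-pretrained/vocoder.pth" -P GPT_SoVITS/pretrained_models/gsv-v4-pretrained
```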
## V2Pro Sürüm Notları
Yeni Özellikler:
1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
[Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
2. GitHub'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V2Pro ön eğitilmiş modelleri indirin (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, ve `sv/pretrained_eres2netv2w24s4ep4.ckpt`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
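Benzer şekilde, üçüncü adımdaki V2Pro dosyaları için olası bir toplu indirme taslağı (yol düzeni yine bir varsayımdır):

```bash
# Varsayımsal taslak: V2Pro ön eğitilmiş dosyalarını alt dizin yapısını koruyarak indirme
BASE="https://huggingface.co/lj1995/GPT-SoVITS/resolve/main"
for f in v2Pro/s2Dv2Pro.pth v2Pro/s2Gv2Pro.pth v2Pro/s2Dv2ProPlus.pth v2Pro/s2Gv2ProPlus.pth sv/pretrained_eres2netv2w24s4ep4.ckpt; do
  wget -c "$BASE/$f" -P "GPT_SoVITS/pretrained_models/$(dirname "$f")"
done
```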
## Yapılacaklar Listesi
- [x] **Yüksek Öncelikli:**
@ -371,7 +298,7 @@ V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
UVR5 için Web Arayüzünü açmak için komut satırını kullanın UVR5 için Web Arayüzünü açmak için komut satırını kullanın
```bash ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
@ -382,7 +309,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
```bash ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \ --input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
--output_root "<alt_bölümlere_ayrılmış_ses_kliplerinin_kaydedileceği_dizin>" \ --output_root "<alt_bölümlere_ayrılmış_ses_kliplerinin_kaydedileceği_dizin>" \
@ -394,7 +321,7 @@ python audio_slicer.py \
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince) Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
```bash ```
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı> python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
``` ```
@ -402,7 +329,7 @@ ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışınd
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir) (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
```bash ```
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil> python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
``` ```
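Somut bir çağrı örneği (giriş/çıkış klasörleri yalnızca örnektir; diğer bayraklar için betiğin argparse tanımına bakın):

```bash
# Örnek: dilimlenmiş sesleri Japonca olarak yazıya dökme (yollar varsayımsaldır)
python ./tools/asr/fasterwhisper_asr.py -i output/slicer_opt -o output/asr_opt -l ja
```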
@ -429,7 +356,6 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
- [eresnetv2](https://modelscope.cn/models/iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common)
### Tahmin İçin Metin Ön Ucu ### Tahmin İçin Metin Ön Ucu

View File

@ -1,6 +1,2 @@
set "SCRIPT_DIR=%~dp0"
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
cd /d "%SCRIPT_DIR%"
set "PATH=%SCRIPT_DIR%\runtime;%PATH%"
runtime\python.exe -I webui.py zh_CN runtime\python.exe -I webui.py zh_CN
pause pause

View File

@ -1,7 +1,4 @@
$ErrorActionPreference = "SilentlyContinue" $ErrorActionPreference = "SilentlyContinue"
chcp 65001 chcp 65001
Set-Location $PSScriptRoot & "$PSScriptRoot\runtime\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
$runtimePath = Join-Path $PSScriptRoot "runtime"
$env:PATH = "$runtimePath;$env:PATH"
& "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
pause pause

View File

@ -1,13 +1,5 @@
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"id": "9fd922fb",
"metadata": {},
"source": [
"# Deprecated"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

View File

@ -1,241 +0,0 @@
Param (
[Parameter(Mandatory=$true)][ValidateSet("CU126", "CU128", "CPU")][string]$Device,
[Parameter(Mandatory=$true)][ValidateSet("HF", "HF-Mirror", "ModelScope")][string]$Source,
[switch]$DownloadUVR5
)
$global:ErrorActionPreference = 'Stop'
trap {
Write-ErrorLog $_
}
function Write-ErrorLog {
param (
[System.Management.Automation.ErrorRecord]$ErrorRecord
)
Write-Host "`n[ERROR] Command failed:" -ForegroundColor Red
if (-not $ErrorRecord.Exception.Message){
} else {
Write-Host "Message:" -ForegroundColor Red
$ErrorRecord.Exception.Message -split "`n" | ForEach-Object {
Write-Host " $_"
}
}
Write-Host "Command:" -ForegroundColor Red -NoNewline
Write-Host " $($ErrorRecord.InvocationInfo.Line)".Replace("`r", "").Replace("`n", "")
Write-Host "Location:" -ForegroundColor Red -NoNewline
Write-Host " $($ErrorRecord.InvocationInfo.ScriptName):$($ErrorRecord.InvocationInfo.ScriptLineNumber)"
Write-Host "Call Stack:" -ForegroundColor DarkRed
$ErrorRecord.ScriptStackTrace -split "`n" | ForEach-Object {
Write-Host " $_" -ForegroundColor DarkRed
}
exit 1
}
function Write-Info($msg) {
Write-Host "[INFO]:" -ForegroundColor Green -NoNewline
Write-Host " $msg"
}
function Write-Success($msg) {
Write-Host "[SUCCESS]:" -ForegroundColor Blue -NoNewline
Write-Host " $msg"
}
function Invoke-Conda {
param (
[Parameter(ValueFromRemainingArguments = $true)]
[string[]]$Args
)
$output = & conda install -y -q -c conda-forge @Args 2>&1
$exitCode = $LASTEXITCODE
if ($exitCode -ne 0) {
Write-Host "Conda Install $Args Failed" -ForegroundColor Red
$errorMessages = @()
foreach ($item in $output) {
if ($item -is [System.Management.Automation.ErrorRecord]) {
$msg = $item.Exception.Message
Write-Host "$msg" -ForegroundColor Red
$errorMessages += $msg
}
else {
Write-Host $item
$errorMessages += $item
}
}
throw [System.Exception]::new(($errorMessages -join "`n"))
}
}
function Invoke-Pip {
param (
[Parameter(ValueFromRemainingArguments = $true)]
[string[]]$Args
)
$output = & pip install @Args 2>&1
$exitCode = $LASTEXITCODE
if ($exitCode -ne 0) {
$errorMessages = @()
Write-Host "Pip Install $Args Failed" -ForegroundColor Red
foreach ($item in $output) {
if ($item -is [System.Management.Automation.ErrorRecord]) {
$msg = $item.Exception.Message
Write-Host "$msg" -ForegroundColor Red
$errorMessages += $msg
}
else {
Write-Host $item
$errorMessages += $item
}
}
throw [System.Exception]::new(($errorMessages -join "`n"))
}
}
function Invoke-Download {
param (
[Parameter(Mandatory = $true)]
[string]$Uri,
[Parameter()]
[string]$OutFile
)
try {
$params = @{
Uri = $Uri
}
if ($OutFile) {
$params["OutFile"] = $OutFile
}
$null = Invoke-WebRequest @params -ErrorAction Stop
} catch {
Write-Host "Failed to download:" -ForegroundColor Red
Write-Host " $Uri"
throw
}
}
function Invoke-Unzip {
param($ZipPath, $DestPath)
Expand-Archive -Path $ZipPath -DestinationPath $DestPath -Force
Remove-Item $ZipPath -Force
}
chcp 65001
Set-Location $PSScriptRoot
Write-Info "Installing FFmpeg & CMake..."
Invoke-Conda ffmpeg cmake
Write-Success "FFmpeg & CMake Installed"
$PretrainedURL = ""
$G2PWURL = ""
$UVR5URL = ""
$NLTKURL = ""
$OpenJTalkURL = ""
switch ($Source) {
"HF" {
Write-Info "Download Model From HuggingFace"
$PretrainedURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
$G2PWURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
$UVR5URL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
$NLTKURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
$OpenJTalkURL = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
}
"HF-Mirror" {
Write-Info "Download Model From HuggingFace-Mirror"
$PretrainedURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
$G2PWURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
$UVR5URL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
$NLTKURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
$OpenJTalkURL = "https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
}
"ModelScope" {
Write-Info "Download Model From ModelScope"
$PretrainedURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip"
$G2PWURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip"
$UVR5URL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip"
$NLTKURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip"
$OpenJTalkURL = "https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
}
}
if (-not (Test-Path "GPT_SoVITS/pretrained_models/sv")) {
Write-Info "Downloading Pretrained Models..."
Invoke-Download -Uri $PretrainedURL -OutFile "pretrained_models.zip"
Invoke-Unzip "pretrained_models.zip" "GPT_SoVITS"
Write-Success "Pretrained Models Downloaded"
} else {
Write-Info "Pretrained Model Exists"
Write-Info "Skip Downloading Pretrained Models"
}
if (-not (Test-Path "GPT_SoVITS/text/G2PWModel")) {
Write-Info "Downloading G2PWModel..."
Invoke-Download -Uri $G2PWURL -OutFile "G2PWModel.zip"
Invoke-Unzip "G2PWModel.zip" "GPT_SoVITS/text"
Write-Success "G2PWModel Downloaded"
} else {
Write-Info "G2PWModel Exists"
Write-Info "Skip Downloading G2PWModel"
}
if ($DownloadUVR5) {
if (-not (Test-Path "tools/uvr5/uvr5_weights")) {
Write-Info "Downloading UVR5 Models..."
Invoke-Download -Uri $UVR5URL -OutFile "uvr5_weights.zip"
Invoke-Unzip "uvr5_weights.zip" "tools/uvr5"
Write-Success "UVR5 Models Downloaded"
} else {
Write-Info "UVR5 Models Exists"
Write-Info "Skip Downloading UVR5 Models"
}
}
switch ($Device) {
"CU128" {
Write-Info "Installing PyTorch For CUDA 12.8..."
Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cu128"
}
"CU126" {
Write-Info "Installing PyTorch For CUDA 12.6..."
Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cu126"
}
"CPU" {
Write-Info "Installing PyTorch For CPU..."
Invoke-Pip torch torchaudio --index-url "https://download.pytorch.org/whl/cpu"
}
}
Write-Success "PyTorch Installed"
Write-Info "Installing Python Dependencies From requirements.txt..."
Invoke-Pip -r extra-req.txt --no-deps
Invoke-Pip -r requirements.txt
Write-Success "Python Dependencies Installed"
Write-Info "Downloading NLTK Data..."
Invoke-Download -Uri $NLTKURL -OutFile "nltk_data.zip"
Invoke-Unzip "nltk_data.zip" (python -c "import sys; print(sys.prefix)").Trim()
Write-Info "Downloading Open JTalk Dict..."
Invoke-Download -Uri $OpenJTalkURL -OutFile "open_jtalk_dic_utf_8-1.11.tar.gz"
$target = (python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))").Trim()
tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C $target
Remove-Item "open_jtalk_dic_utf_8-1.11.tar.gz" -Force
Write-Success "Open JTalk Dic Downloaded"
Write-Success "Installation Completed"

View File

@ -5,84 +5,31 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "$SCRIPT_DIR" || exit 1 cd "$SCRIPT_DIR" || exit 1
RESET="\033[0m" set -e
BOLD="\033[1m"
ERROR="\033[1;31m[ERROR]: $RESET"
WARNING="\033[1;33m[WARNING]: $RESET"
INFO="\033[1;32m[INFO]: $RESET"
SUCCESS="\033[1;34m[SUCCESS]: $RESET"
set -eE
set -o errtrace
trap 'on_error $LINENO "$BASH_COMMAND" $?' ERR
# shellcheck disable=SC2317
on_error() {
local lineno="$1"
local cmd="$2"
local code="$3"
echo -e "${ERROR}${BOLD}Command \"${cmd}\" Failed${RESET} at ${BOLD}Line ${lineno}${RESET} with Exit Code ${BOLD}${code}${RESET}"
echo -e "${ERROR}${BOLD}Call Stack:${RESET}"
for ((i = ${#FUNCNAME[@]} - 1; i >= 1; i--)); do
echo -e " in ${BOLD}${FUNCNAME[i]}()${RESET} at ${BASH_SOURCE[i]}:${BOLD}${BASH_LINENO[i - 1]}${RESET}"
done
exit "$code"
}
run_conda_quiet() {
local output
output=$(conda install --yes --quiet -c conda-forge "$@" 2>&1) || {
echo -e "${ERROR} Conda install failed:\n$output"
exit 1
}
}
run_pip_quiet() {
local output
output=$(pip install "$@" 2>&1) || {
echo -e "${ERROR} Pip install failed:\n$output"
exit 1
}
}
run_wget_quiet() {
if wget --tries=25 --wait=5 --read-timeout=40 -q --show-progress "$@" 2>&1; then
tput cuu1 && tput el
else
echo -e "${ERROR} Wget failed"
exit 1
fi
}
if ! command -v conda &>/dev/null; then if ! command -v conda &>/dev/null; then
echo -e "${ERROR}Conda Not Found" echo "Conda Not Found"
exit 1 exit 1
fi fi
USE_CUDA=false trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
USE_ROCM=false
USE_CPU=false
WORKFLOW=${WORKFLOW:-"false"}
USE_HF=false is_HF=false
USE_HF_MIRROR=false is_HF_MIRROR=false
USE_MODELSCOPE=false is_MODELSCOPE=false
DOWNLOAD_UVR5=false DOWNLOAD_UVR5=false
print_help() { print_help() {
echo "Usage: bash install.sh [OPTIONS]" echo "Usage: bash install.sh [OPTIONS]"
echo "" echo ""
echo "Options:" echo "Options:"
echo " --device CU126|CU128|ROCM|MPS|CPU Specify the Device (REQUIRED)"
echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)" echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)"
echo " --download-uvr5 Enable downloading the UVR5 model" echo " --download-uvr5 Enable downloading the UVR5 model"
echo " -h, --help Show this help message and exit" echo " -h, --help Show this help message and exit"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " bash install.sh --device CU128 --source HF --download-uvr5" echo " bash install.sh --source HF --download-uvr5"
echo " bash install.sh --device MPS --source ModelScope" echo " bash install.sh --source ModelScope"
} }
# Show help if no arguments provided # Show help if no arguments provided
@ -97,44 +44,17 @@ while [[ $# -gt 0 ]]; do
--source) --source)
case "$2" in case "$2" in
HF) HF)
USE_HF=true is_HF=true
;; ;;
HF-Mirror) HF-Mirror)
USE_HF_MIRROR=true is_HF_MIRROR=true
;; ;;
ModelScope) ModelScope)
USE_MODELSCOPE=true is_MODELSCOPE=true
;; ;;
*) *)
echo -e "${ERROR}Error: Invalid Download Source: $2" echo "Error: Invalid Download Source: $2"
echo -e "${ERROR}Choose From: [HF, HF-Mirror, ModelScope]" echo "Choose From: [HF, HF-Mirror, ModelScope]"
exit 1
;;
esac
shift 2
;;
--device)
case "$2" in
CU126)
CUDA=126
USE_CUDA=true
;;
CU128)
CUDA=128
USE_CUDA=true
;;
ROCM)
USE_ROCM=true
;;
MPS)
USE_CPU=true
;;
CPU)
USE_CPU=true
;;
*)
echo -e "${ERROR}Error: Invalid Device: $2"
echo -e "${ERROR}Choose From: [CU126, CU128, ROCM, MPS, CPU]"
exit 1 exit 1
;; ;;
esac esac
@ -144,237 +64,150 @@ while [[ $# -gt 0 ]]; do
DOWNLOAD_UVR5=true DOWNLOAD_UVR5=true
shift shift
;; ;;
-h | --help) -h|--help)
print_help print_help
exit 0 exit 0
;; ;;
*) *)
echo -e "${ERROR}Unknown Argument: $1" echo "Unknown Argument: $1"
echo "" echo "Use -h or --help to see available options."
print_help
exit 1 exit 1
;; ;;
esac esac
done done
if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then if ! $is_HF && ! $is_HF_MIRROR && ! $is_MODELSCOPE; then
echo -e "${ERROR}Error: Device is REQUIRED" echo "Error: Download Source is REQUIRED"
echo "" echo ""
print_help print_help
exit 1 exit 1
fi fi
if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then if [ "$is_HF" = "true" ]; then
echo -e "${ERROR}Error: Download Source is REQUIRED" echo "Download Model From HuggingFace"
echo ""
print_help
exit 1
fi
case "$(uname -m)" in
x86_64 | amd64) SYSROOT_PKG="sysroot_linux-64>=2.28" ;;
aarch64 | arm64) SYSROOT_PKG="sysroot_linux-aarch64>=2.28" ;;
ppc64le) SYSROOT_PKG="sysroot_linux-ppc64le>=2.28" ;;
*)
echo "Unsupported architecture: $(uname -m)"
exit 1
;;
esac
# Install build tools
echo -e "${INFO}Detected system: $(uname -s) $(uname -r) $(uname -m)"
if [ "$(uname)" != "Darwin" ]; then
gcc_major_version=$(command -v gcc >/dev/null 2>&1 && gcc -dumpversion | cut -d. -f1 || echo 0)
if [ "$gcc_major_version" -lt 11 ]; then
echo -e "${INFO}Installing GCC & G++..."
run_conda_quiet gcc=11 gxx=11
run_conda_quiet "$SYSROOT_PKG"
echo -e "${SUCCESS}GCC & G++ Installed..."
else
echo -e "${INFO}Detected GCC Version: $gcc_major_version"
echo -e "${INFO}Skip Installing GCC & G++ From Conda-Forge"
echo -e "${INFO}Installing libstdcxx-ng From Conda-Forge"
run_conda_quiet "libstdcxx-ng>=$gcc_major_version"
echo -e "${SUCCESS}libstdcxx-ng=$gcc_major_version Installed..."
fi
else
if ! xcode-select -p &>/dev/null; then
echo -e "${INFO}Installing Xcode Command Line Tools..."
xcode-select --install
echo -e "${INFO}Waiting For Xcode Command Line Tools Installation Complete..."
while true; do
sleep 20
if xcode-select -p &>/dev/null; then
echo -e "${SUCCESS}Xcode Command Line Tools Installed"
break
else
echo -e "${INFO}InstallingPlease Wait..."
fi
done
else
XCODE_PATH=$(xcode-select -p)
if [[ "$XCODE_PATH" == *"Xcode.app"* ]]; then
echo -e "${WARNING} Detected Xcode path: $XCODE_PATH"
echo -e "${WARNING} If your Xcode version does not match your macOS version, it may cause unexpected issues during compilation or package builds."
fi
fi
fi
echo -e "${INFO}Installing FFmpeg & CMake..."
run_conda_quiet ffmpeg cmake make
echo -e "${SUCCESS}FFmpeg & CMake Installed"
echo -e "${INFO}Installing unzip..."
run_conda_quiet unzip
echo -e "${SUCCESS}unzip Installed"
if [ "$USE_HF" = "true" ]; then
echo -e "${INFO}Download Model From HuggingFace"
PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
NLTK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" elif [ "$is_HF_MIRROR" = "true" ]; then
PYOPENJTALK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" echo "Download Model From HuggingFace-Mirror"
elif [ "$USE_HF_MIRROR" = "true" ]; then
echo -e "${INFO}Download Model From HuggingFace-Mirror"
PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
NLTK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" elif [ "$is_MODELSCOPE" = "true" ]; then
PYOPENJTALK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" echo "Download Model From ModelScope"
elif [ "$USE_MODELSCOPE" = "true" ]; then
echo -e "${INFO}Download Model From ModelScope"
PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip" PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip"
G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip" G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip"
UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip" UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip"
NLTK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip"
PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
fi fi
if [ ! -d "GPT_SoVITS/pretrained_models/sv" ]; then if find "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then
echo -e "${INFO}Downloading Pretrained Models..." echo "Pretrained Model Exists"
rm -rf pretrained_models.zip
run_wget_quiet "$PRETRINED_URL"
unzip -q -o pretrained_models.zip -d GPT_SoVITS
rm -rf pretrained_models.zip
echo -e "${SUCCESS}Pretrained Models Downloaded"
else else
echo -e "${INFO}Pretrained Model Exists" echo "Download Pretrained Models"
echo -e "${INFO}Skip Downloading Pretrained Models" wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$PRETRINED_URL"
unzip pretrained_models.zip
rm -rf pretrained_models.zip
mv pretrained_models/* GPT_SoVITS/pretrained_models
rm -rf pretrained_models
fi fi
if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then
echo -e "${INFO}Downloading G2PWModel.." echo "Download G2PWModel"
rm -rf G2PWModel.zip wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$G2PW_URL"
run_wget_quiet "$G2PW_URL"
unzip -q -o G2PWModel.zip -d GPT_SoVITS/text unzip G2PWModel.zip
rm -rf G2PWModel.zip rm -rf G2PWModel.zip
echo -e "${SUCCESS}G2PWModel Downloaded" mv G2PWModel GPT_SoVITS/text/G2PWModel
else else
echo -e "${INFO}G2PWModel Exists" echo "G2PWModel Exists"
echo -e "${INFO}Skip Downloading G2PWModel"
fi fi
if [ "$DOWNLOAD_UVR5" = "true" ]; then if [ "$DOWNLOAD_UVR5" = "true" ];then
if find -L "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then if find "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then
echo -e"${INFO}UVR5 Models Exists" echo "UVR5 Model Exists"
echo -e "${INFO}Skip Downloading UVR5 Models"
else else
echo -e "${INFO}Downloading UVR5 Models..." echo "Download UVR5 Model"
rm -rf uvr5_weights.zip wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$UVR5_URL"
run_wget_quiet "$UVR5_URL"
unzip -q -o uvr5_weights.zip -d tools/uvr5 unzip uvr5_weights.zip
rm -rf uvr5_weights.zip rm -rf uvr5_weights.zip
echo -e "${SUCCESS}UVR5 Models Downloaded" mv uvr5_weights/* tools/uvr5/uvr5_weights
rm -rf uvr5_weights
fi fi
fi fi
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then # 安装构建工具
echo -e "${INFO}Checking For Nvidia Driver Installation..." # Install build tools
if command -v nvidia-smi &>/dev/null; then echo "Installing GCC..."
echo "${INFO}Nvidia Driver Founded" conda install -c conda-forge gcc=14 -y
else
echo -e "${WARNING}Nvidia Driver Not Found, Fallback to CPU" echo "Installing G++..."
conda install -c conda-forge gxx -y
echo "Installing ffmpeg and cmake..."
conda install ffmpeg cmake -y
echo "Installing git-lfs and zip..."
conda install git-lfs -y
conda install zip -y
git-lfs install
echo "Checking for CUDA installation..."
if command -v nvidia-smi &>/dev/null; then
USE_CUDA=true
echo "CUDA found."
else
echo "CUDA not found."
USE_CUDA=false USE_CUDA=false
USE_CPU=true
fi
fi fi
if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then if [ "$USE_CUDA" = false ]; then
echo -e "${INFO}Checking For ROCm Installation..." echo "Checking for ROCm installation..."
if [ -d "/opt/rocm" ]; then if [ -d "/opt/rocm" ]; then
echo -e "${INFO}ROCm Founded" USE_ROCM=true
echo "ROCm found."
if grep -qi "microsoft" /proc/version; then if grep -qi "microsoft" /proc/version; then
echo -e "${INFO}WSL2 Founded" echo "You are running WSL."
IS_WSL=true IS_WSL=true
else else
echo "You are NOT running WSL."
IS_WSL=false IS_WSL=false
fi fi
else else
echo -e "${WARNING}ROCm Not Found, Fallback to CPU" echo "ROCm not found."
USE_ROCM=false USE_ROCM=false
USE_CPU=true
fi fi
fi fi
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then if [ "$USE_CUDA" = true ]; then
if [ "$CUDA" = 128 ]; then echo "Installing PyTorch with CUDA support..."
echo -e "${INFO}Installing PyTorch For CUDA 12.8..." pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cu128" elif [ "$USE_ROCM" = true ]; then
elif [ "$CUDA" = 126 ]; then echo "Installing PyTorch with ROCm support..."
echo -e "${INFO}Installing PyTorch For CUDA 12.6..." pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cu126" else
fi echo "Installing PyTorch for CPU..."
elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu
echo -e "${INFO}Installing PyTorch For ROCm 6.2..."
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/rocm6.2"
elif [ "$USE_CPU" = true ] && [ "$WORKFLOW" = false ]; then
echo -e "${INFO}Installing PyTorch For CPU..."
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cpu"
elif [ "$WORKFLOW" = false ]; then
echo -e "${ERROR}Unknown Err"
exit 1
fi fi
echo -e "${SUCCESS}PyTorch Installed"
echo -e "${INFO}Installing Python Dependencies From requirements.txt..." echo "Installing Python dependencies from requirements.txt..."
# 刷新环境
# Refresh environment
hash -r hash -r
run_pip_quiet -r extra-req.txt --no-deps pip install -r extra-req.txt --no-deps
run_pip_quiet -r requirements.txt pip install -r requirements.txt
echo -e "${SUCCESS}Python Dependencies Installed"
PY_PREFIX=$(python -c "import sys; print(sys.prefix)")
PYOPENJTALK_PREFIX=$(python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))")
echo -e "${INFO}Downloading NLTK Data..."
rm -rf nltk_data.zip
run_wget_quiet "$NLTK_URL" -O nltk_data.zip
unzip -q -o nltk_data -d "$PY_PREFIX"
rm -rf nltk_data.zip
echo -e "${SUCCESS}NLTK Data Downloaded"
echo -e "${INFO}Downloading Open JTalk Dict..."
rm -rf open_jtalk_dic_utf_8-1.11.tar.gz
run_wget_quiet "$PYOPENJTALK_URL" -O open_jtalk_dic_utf_8-1.11.tar.gz
tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C "$PYOPENJTALK_PREFIX"
rm -rf open_jtalk_dic_utf_8-1.11.tar.gz
echo -e "${SUCCESS}Open JTalk Dic Downloaded"
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
echo -e "${INFO}Updating WSL Compatible Runtime Lib For ROCm..." echo "Update to WSL compatible runtime lib..."
location=$(pip show torch | grep Location | awk -F ": " '{print $2}') location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
cd "${location}"/torch/lib/ || exit cd "${location}"/torch/lib/ || exit
rm libhsa-runtime64.so* rm libhsa-runtime64.so*
cp "$(readlink -f /opt/rocm/lib/libhsa-runtime64.so)" libhsa-runtime64.so cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
echo -e "${SUCCESS}ROCm Runtime Lib Updated..."
fi fi
echo -e "${SUCCESS}Installation Completed" echo "Installation completed successfully!"

View File

@ -1,14 +1,13 @@
--no-binary=opencc
numpy<2.0 numpy<2.0
scipy scipy
tensorboard tensorboard
librosa==0.10.2 librosa==0.10.2
numba numba
pytorch-lightning>=2.4 pytorch-lightning>=2.4
gradio<5 gradio>=4.0,<=4.24.0
ffmpeg-python ffmpeg-python
onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64" onnxruntime; sys_platform == 'darwin'
onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" onnxruntime-gpu; sys_platform != 'darwin'
tqdm tqdm
funasr==1.0.27 funasr==1.0.27
cn2an cn2an
@ -18,7 +17,7 @@ g2p_en
torchaudio torchaudio
modelscope==1.10.0 modelscope==1.10.0
sentencepiece sentencepiece
transformers>=4.43,<=4.50 transformers>=4.43
peft peft
chardet chardet
PyYAML PyYAML
@ -32,9 +31,10 @@ rotary_embedding_torch
ToJyutping ToJyutping
g2pk2 g2pk2
ko_pron ko_pron
opencc opencc; sys_platform != 'linux'
opencc==1.1.1; sys_platform == 'linux'
python_mecab_ko; sys_platform != 'win32' python_mecab_ko; sys_platform != 'win32'
fastapi[standard]>=0.115.2 fastapi<0.112.2
x_transformers x_transformers
torchmetrics<=1.5 torchmetrics<=1.5
pydantic<=2.10.6 pydantic<=2.10.6

View File

@ -6,10 +6,15 @@ def check_fw_local_models():
启动时检查本地是否有 Faster Whisper 模型. 启动时检查本地是否有 Faster Whisper 模型.
""" """
model_size_list = [ model_size_list = [
"tiny",
"tiny.en",
"base",
"base.en",
"small",
"small.en",
"medium", "medium",
"medium.en", "medium.en",
"distil-large-v2", "large",
"distil-large-v3",
"large-v1", "large-v1",
"large-v2", "large-v2",
"large-v3", "large-v3",
@ -20,24 +25,11 @@ def check_fw_local_models():
return model_size_list return model_size_list
def get_models():
model_size_list = [
"medium",
"medium.en",
"distil-large-v2",
"distil-large-v3",
"large-v1",
"large-v2",
"large-v3",
]
return model_size_list
asr_dict = { asr_dict = {
"达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
"Faster Whisper (多语种)": { "Faster Whisper (多语种)": {
"lang": ["auto", "zh", "en", "ja", "ko", "yue"], "lang": ["auto", "zh", "en", "ja", "ko", "yue"],
"size": get_models(), "size": check_fw_local_models(),
"path": "fasterwhisper_asr.py", "path": "fasterwhisper_asr.py",
"precision": ["float32", "float16", "int8"], "precision": ["float32", "float16", "int8"],
}, },

View File

@ -1,17 +1,15 @@
import argparse import argparse
import os import os
import time
import traceback import traceback
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import torch import torch
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
from huggingface_hub import snapshot_download
from huggingface_hub.errors import LocalEntryNotFoundError
from tqdm import tqdm from tqdm import tqdm
from tools.asr.config import get_models from tools.asr.config import check_fw_local_models
from tools.asr.funasr_asr import only_asr
from tools.my_utils import load_cudnn
# fmt: off # fmt: off
language_code_list = [ language_code_list = [
@ -39,54 +37,20 @@ language_code_list = [
# fmt: on # fmt: on
def download_model(model_size: str): def execute_asr(input_folder, output_folder, model_size, language, precision):
if "distil" in model_size: if "-local" in model_size:
repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1)) model_size = model_size[:-6]
model_path = f"tools/asr/models/faster-whisper-{model_size}"
else: else:
repo_id = f"Systran/faster-whisper-{model_size}" model_path = model_size
model_path = f"tools/asr/models/{repo_id.strip('Systran/')}"
files: list[str] = [
"config.json",
"model.bin",
"tokenizer.json",
"vocabulary.txt",
]
if model_size == "large-v3" or "distil" in model_size:
files.append("preprocessor_config.json")
files.append("vocabulary.json")
files.remove("vocabulary.txt")
for attempt in range(2):
try:
snapshot_download(
repo_id=repo_id,
allow_patterns=files,
local_dir=model_path,
)
break
except LocalEntryNotFoundError:
if attempt < 1:
time.sleep(2)
else:
print("[ERROR] LocalEntryNotFoundError and no fallback.")
traceback.print_exc()
exit(1)
except Exception as e:
print(f"[ERROR] Unexpected error on attempt {attempt + 1}: {e}")
traceback.print_exc()
exit(1)
return model_path
def execute_asr(input_folder, output_folder, model_path, language, precision):
if language == "auto": if language == "auto":
language = None # 不设置语种由模型自动输出概率最高的语种 language = None # 不设置语种由模型自动输出概率最高的语种
print("loading faster whisper model:", model_path, model_path) print("loading faster whisper model:", model_size, model_path)
device = "cuda" if torch.cuda.is_available() else "cpu" device = "cuda" if torch.cuda.is_available() else "cpu"
try:
model = WhisperModel(model_path, device=device, compute_type=precision) model = WhisperModel(model_path, device=device, compute_type=precision)
except:
return print(traceback.format_exc())
input_file_names = os.listdir(input_folder) input_file_names = os.listdir(input_folder)
input_file_names.sort() input_file_names.sort()
@ -108,15 +72,16 @@ def execute_asr(input_folder, output_folder, model_path, language, precision):
if info.language == "zh": if info.language == "zh":
print("检测为中文文本, 转 FunASR 处理") print("检测为中文文本, 转 FunASR 处理")
if "only_asr" not in globals():
from tools.asr.funasr_asr import only_asr # 如果用英文就不需要导入下载模型
text = only_asr(file_path, language=info.language.lower()) text = only_asr(file_path, language=info.language.lower())
if text == "": if text == "":
for segment in segments: for segment in segments:
text += segment.text text += segment.text
output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}") output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
except Exception as e: except:
print(e) print(traceback.format_exc())
traceback.print_exc()
output_folder = output_folder or "output/asr_opt" output_folder = output_folder or "output/asr_opt"
os.makedirs(output_folder, exist_ok=True) os.makedirs(output_folder, exist_ok=True)
@ -128,8 +93,6 @@ def execute_asr(input_folder, output_folder, model_path, language, precision):
return output_file_path return output_file_path
load_cudnn()
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
@ -141,7 +104,7 @@ if __name__ == "__main__":
"--model_size", "--model_size",
type=str, type=str,
default="large-v3", default="large-v3",
choices=get_models(), choices=check_fw_local_models(),
help="Model Size of Faster Whisper", help="Model Size of Faster Whisper",
) )
parser.add_argument( parser.add_argument(
@ -157,14 +120,10 @@ if __name__ == "__main__":
) )
cmd = parser.parse_args() cmd = parser.parse_args()
model_size = cmd.model_size
if model_size == "large":
model_size = "large-v3"
model_path = download_model(model_size)
output_file_path = execute_asr( output_file_path = execute_asr(
input_folder=cmd.input_folder, input_folder=cmd.input_folder,
output_folder=cmd.output_folder, output_folder=cmd.output_folder,
model_path=model_path, model_size=cmd.model_size,
language=cmd.language, language=cmd.language,
precision=cmd.precision, precision=cmd.precision,
) )
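
The download_model change above pulls only the CTranslate2 files Faster Whisper actually needs via huggingface_hub. A minimal standalone sketch under the same assumptions (repo id, file list, and target directory mirror the diff; large-v3 ships vocabulary.json and preprocessor_config.json instead of vocabulary.txt):

from huggingface_hub import snapshot_download

model_dir = snapshot_download(
    repo_id="Systran/faster-whisper-large-v3",
    allow_patterns=[
        "config.json",
        "model.bin",
        "tokenizer.json",
        "preprocessor_config.json",
        "vocabulary.json",
    ],
    local_dir="tools/asr/models/faster-whisper-large-v3",
)
print("Faster Whisper model downloaded to", model_dir)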

View File

@ -1,73 +0,0 @@
js = """
function deleteTheme() {
const params = new URLSearchParams(window.location.search);
if (params.has('__theme')) {
params.delete('__theme');
const newUrl = `${window.location.pathname}?${params.toString()}`;
window.location.replace(newUrl);
}
}
"""
css = """
/* CSSStyleRule */
.markdown {
padding: 6px 10px;
}
@media (prefers-color-scheme: light) {
.markdown {
background-color: lightblue;
color: #000;
}
}
@media (prefers-color-scheme: dark) {
.markdown {
background-color: #4b4b4b;
color: rgb(244, 244, 245);
}
}
::selection {
background: #ffc078 !important;
}
footer {
height: 50px !important; /* 设置页脚高度 */
background-color: transparent !important; /* 背景透明 */
display: flex;
justify-content: center; /* 居中对齐 */
align-items: center; /* 垂直居中 */
}
footer * {
display: none !important; /* 隐藏所有子元素 */
}
"""
top_html = """
<div align="center">
<div style="margin-bottom: 5px; font-size: 15px;">{}</div>
<div style="display: flex; gap: 60px; justify-content: center;">
<a href="https://github.com/RVC-Boss/GPT-SoVITS" target="_blank">
<img src="https://img.shields.io/badge/GitHub-GPT--SoVITS-blue.svg?style=for-the-badge&logo=github" style="width: auto; height: 30px;">
</a>
<a href="https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e" target="_blank">
<img src="https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white" style="width: auto; height: 30px;">
</a>
<a href="https://lj1995-gpt-sovits-proplus.hf.space/" target="_blank">
<img src="https://img.shields.io/badge/免费在线体验-free_online_demo-yellow.svg?style=for-the-badge&logo=huggingface" style="width: auto; height: 30px;">
</a>
<a href="https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e" target="_blank">
<img src="https://img.shields.io/badge/English-READ%20DOCS-blue?style=for-the-badge&logo=googledocs&logoColor=white" style="width: auto; height: 30px;">
</a>
<a href="https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE" target="_blank">
<img src="https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative" style="width: auto; height: 30px;">
</a>
</div>
</div>
"""

View File

@ -18,7 +18,6 @@
"ASR 模型": "ASR model", "ASR 模型": "ASR model",
"ASR 模型尺寸": "ASR model size", "ASR 模型尺寸": "ASR model size",
"ASR 语言设置": "ASR language", "ASR 语言设置": "ASR language",
"CPU训练,较慢": "Training on CPU (slower)",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT Training: Model Weights saved in GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT Training: Model Weights saved in GPT_weights/",
"GPT模型列表": "GPT weight list", "GPT模型列表": "GPT weight list",
"GPT训练": "GPT Training", "GPT训练": "GPT Training",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU number, can only input ONE integer", "GPU卡号,只能填1个整数": "GPU number, can only input ONE integer",
"GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ", "GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ",
"LoRA秩": "LoRA Rank", "LoRA秩": "LoRA Rank",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained SoVITS V3 Model, Cannot Load LoRA Weights",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Training: Model Weights saved in SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Training: Model Weights saved in SoVITS_weights/",
"SoVITS模型列表": "SoVITS weight list", "SoVITS模型列表": "SoVITS weight list",
"SoVITS训练": "SoVITS Training", "SoVITS训练": "SoVITS Training",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Submit Text: Manually save all text box contents on the current page to memory and file (If you don't click this button before switching pages or exiting the labeling page, the data will be rolled back when you return, which would be a waste of work.)",
"TTS推理WebUI": "TTS Inference WebUI", "TTS推理WebUI": "TTS Inference WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 WebUI (Vocal Separation/Deecho/Dereverb)", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5 WebUI (Vocal Separation/Deecho/Dereverb)",
"V3不支持无参考文本模式请填写参考文本": "V3 does not support the no-reference-text mode. Please provide reference text!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
"batch_size": "Batch Size", "batch_size": "Batch Size",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3 does not support this mode currently, using it will cause an error.", "v3暂不支持该模式使用了会报错。": "v3 does not support this mode currently, using it will cause an error.",
"v3输出如果觉得闷可以试试开超分": "For V3 model, if generated audio sounds somewhat muffled, try enable audio super-resolution.", "v3输出如果觉得闷可以试试开超分": "For V3 model, if generated audio sounds somewhat muffled, try enable audio super-resolution.",
"不切": "No slice", "不切": "No slice",
"不训练直接推v2ProPlus底模": "Use v2ProPlus base model directly without training!",
"不训练直接推v2Pro底模": "Use v2Pro base model directly without training!",
"不训练直接推v2底模": "Use v2 base model directly without training!",
"不训练直接推v3底模": "Use v3 base model directly without training!",
"中文": "Chinese", "中文": "Chinese",
"中文教程文档": "Chinese Tutorial",
"中英混合": "Chinese-English Mixed", "中英混合": "Chinese-English Mixed",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Primary Reference Audio (Please upload reference audio within 3-10 seconds, exceeding this limit will cause an error!)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Primary Reference Audio (Please upload reference audio within 3-10 seconds, exceeding this limit will cause an error!)",
"主参考音频的文本": "Text of Primary Reference Audio", "主参考音频的文本": "Text of Primary Reference Audio",
@ -92,7 +87,6 @@
"句间停顿秒数": "Pause Duration between Sentences (Seconds)", "句间停顿秒数": "Pause Duration between Sentences (Seconds)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.",
"合成语音": "Start inference", "合成语音": "Start inference",
"合成音频": "Synthesize Audio",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.",
@ -110,15 +104,11 @@
"已关闭": " is Closed", "已关闭": " is Closed",
"已完成": " Finished", "已完成": " Finished",
"已开启": " is Opened", "已开启": " is Opened",
"并行合成中": "Parallel Synthesis in Progress",
"并行推理": "Parallel Inference", "并行推理": "Parallel Inference",
"并行推理模式已关闭": "Parallel Inference Mode Disabled", "并行推理模式已关闭": "Parallel Inference Mode Disabled",
"并行推理模式已开启": "Parallel Inference Mode Enabled", "并行推理模式已开启": "Parallel Inference Mode Enabled",
"底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained Model, Cannot Load LoRA Weights",
"开启": "Open ", "开启": "Open ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "When parallel inference mode is enabled, SoVITS V3/4 models do not support bucket processing; bucket processing has been automatically disabled.",
"微调模型信息": "Fine-tuned Model Information",
"微调训练": "Fine-Tuning", "微调训练": "Fine-Tuning",
"怎么切": "How to slice the sentence", "怎么切": "How to slice the sentence",
"总训练轮数total_epoch": "Total training epochs (total_epoch):", "总训练轮数total_epoch": "Total training epochs (total_epoch):",
@ -150,8 +140,8 @@
"模型": "Model", "模型": "Model",
"模型分为三类:": "Models are categorized into three types:", "模型分为三类:": "Models are categorized into three types:",
"模型切换": "Model switch", "模型切换": "Model switch",
"模型加载中,请等待": "Model is loading, please wait...",
"每张显卡的batch_size": "Batch size per GPU:", "每张显卡的batch_size": "Batch size per GPU:",
"版本": "Version",
"粤英混合": "Yue-English Mixed", "粤英混合": "Yue-English Mixed",
"粤语": "Yue", "粤语": "Yue",
"终止合成": "Terminate Synthesis", "终止合成": "Terminate Synthesis",
@ -160,7 +150,6 @@
"缺少音素数据集": "Missing Phoneme Dataset", "缺少音素数据集": "Missing Phoneme Dataset",
"缺少音频数据集": "Missing Audio Dataset", "缺少音频数据集": "Missing Audio Dataset",
"英文": "English", "英文": "English",
"训练模型的版本": "Version of the trained model",
"训练集格式化一键三连": "Training Set One-Click Formatting", "训练集格式化一键三连": "Training Set One-Click Formatting",
"训练集格式化工具": "Dataset Formatting Tool", "训练集格式化工具": "Dataset Formatting Tool",
"语义Token提取": "Semantics Token Extraction", "语义Token提取": "Semantics Token Extraction",
@ -174,6 +163,7 @@
"语音识别": "Speech Recognition", "语音识别": "Speech Recognition",
"语音识别工具": "Speech Recognition Tool", "语音识别工具": "Speech Recognition Tool",
"语音降噪": "Speech Denoising", "语音降噪": "Speech Denoising",
"语音降噪工具": "Speech Denoising Tool",
"请上传3~10秒内参考音频超过会报错": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.", "请上传3~10秒内参考音频超过会报错": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
"请上传参考音频": "Please Upload the Reference Audio", "请上传参考音频": "Please Upload the Reference Audio",
"请填入推理文本": "Please Fill in the Terget Text", "请填入推理文本": "Please Fill in the Terget Text",
@ -197,8 +187,7 @@
"进度": "Progress", "进度": "Progress",
"进程已终止": " Process Terminated", "进程已终止": " Process Terminated",
"进程输出信息": " Process Output Information", "进程输出信息": " Process Output Information",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Select the model from SoVITS_weights and GPT_weights. The default models are pretrained models for experiencing 5-second Zero-Shot TTS without training.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. The default one is a pretrain, so you can experience zero shot TTS.",
"采样步数(仅对V3/4生效)": "Sampling Steps (V3/V4 Only)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing",
"重复惩罚": "Repetition Penalty", "重复惩罚": "Repetition Penalty",
"随机种子": "Random Seed", "随机种子": "Random Seed",
@ -214,13 +203,9 @@
"音频标注WebUI": "Audio Labeling WebUI", "音频标注WebUI": "Audio Labeling WebUI",
"音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)", "音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)",
"音频超分中": "Running Audio Super-Resolution", "音频超分中": "Running Audio Super-Resolution",
"音频超采样": "Audio Upsampling",
"音频超采样(仅对V3生效))": "Audio Upsampling (V3 Only)",
"预测语义Token": "Predict Semantic Token",
"预训练GPT模型路径": "Pretrained GPT Model Path", "预训练GPT模型路径": "Pretrained GPT Model Path",
"预训练SSL模型路径": "Pretrained SSL Model Path", "预训练SSL模型路径": "Pretrained SSL Model Path",
"预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path", "预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path",
"预训练SoVITS-G模型路径": "Pretrained SoVITS-G Model Path", "预训练SoVITS-G模型路径": "Pretrained SoVITS-G Model Path",
"预训练中文BERT模型路径": "Pretrained Chinese BERT Model Path", "预训练中文BERT模型路径": "Pretrained Chinese BERT Model Path"
"预训练模型路径": "Pretrained Model Path"
} }
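
Since the two sides of this comparison add and remove different locale keys, a lookup that falls back to the original Chinese key keeps the WebUI usable with either file. A minimal, hypothetical sketch; the locale path is an assumption and not taken from the diff:

import json

def load_locale(path: str) -> dict:
    with open(path, encoding="utf-8") as f:
        return json.load(f)

translations = load_locale("tools/i18n/locale/en_US.json")  # assumed location

def tr(key: str) -> str:
    # Keys missing from the loaded file simply render as the untranslated source text.
    return translations.get(key, key)

print(tr("GPT训练"))         # "GPT Training"
print(tr("某个未翻译的键"))   # falls back to the key itself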

View File

@ -18,7 +18,6 @@
"ASR 模型": "Modelo ASR", "ASR 模型": "Modelo ASR",
"ASR 模型尺寸": "Tamaño del modelo ASR", "ASR 模型尺寸": "Tamaño del modelo ASR",
"ASR 语言设置": "Configuración del idioma ASR", "ASR 语言设置": "Configuración del idioma ASR",
"CPU训练,较慢": "Entrenamiento en CPU, más lento",
"GPT 训练: 模型权重文件在 GPT_weights/": "Entrenamiento de GPT: los archivos de pesos del modelo están en GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "Entrenamiento de GPT: los archivos de pesos del modelo están en GPT_weights/",
"GPT模型列表": "Lista de modelos GPT", "GPT模型列表": "Lista de modelos GPT",
"GPT训练": "Entrenamiento de GPT", "GPT训练": "Entrenamiento de GPT",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "Número de tarjeta GPU, solo se puede ingresar un número entero", "GPU卡号,只能填1个整数": "Número de tarjeta GPU, solo se puede ingresar un número entero",
"GPU卡号以-分割,每个卡号一个进程": "Número de tarjeta GPU separado por '-', cada número de tarjeta es un proceso", "GPU卡号以-分割,每个卡号一个进程": "Número de tarjeta GPU separado por '-', cada número de tarjeta es un proceso",
"LoRA秩": "Rango de LoRA", "LoRA秩": "Rango de LoRA",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Falta el modelo base de SoVITS V3, no se pueden cargar los pesos de LoRA correspondientes",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entrenamiento de SoVITS: los archivos de pesos del modelo están en SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entrenamiento de SoVITS: los archivos de pesos del modelo están en SoVITS_weights/",
"SoVITS模型列表": "Lista de modelos SoVITS", "SoVITS模型列表": "Lista de modelos SoVITS",
"SoVITS训练": "Entrenamiento de SoVITS", "SoVITS训练": "Entrenamiento de SoVITS",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Enviar texto: Guarda manualmente el contenido de todos los campos de texto en la página actual en memoria y archivo (si no haces clic en este botón antes o después de cambiar de página o salir de la página de etiquetado, al regresar se desharán los cambios, ¡todo ese trabajo se perderá!).",
"TTS推理WebUI": "WebUI de inferencia TTS", "TTS推理WebUI": "WebUI de inferencia TTS",
"UVR5人声伴奏分离&去混响去延迟工具": "Herramienta de separación de voz y acompañamiento UVR5 y eliminación de reverberación y retardo", "UVR5人声伴奏分离&去混响去延迟工具": "Herramienta de separación de voz y acompañamiento UVR5 y eliminación de reverberación y retardo",
"V3不支持无参考文本模式请填写参考文本": "¡V3 no admite el modo sin texto de referencia! Por favor, introduce el texto de referencia.",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra",
"batch_size": "Tamaño de lote", "batch_size": "Tamaño de lote",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3 no es compatible con este modo actualmente y su uso generará un error.", "v3暂不支持该模式使用了会报错。": "v3 no es compatible con este modo actualmente y su uso generará un error.",
"v3输出如果觉得闷可以试试开超分": "Si la salida de V3 parece aburrida, puedes intentar activar la superresolución", "v3输出如果觉得闷可以试试开超分": "Si la salida de V3 parece aburrida, puedes intentar activar la superresolución",
"不切": "No cortar", "不切": "No cortar",
"不训练直接推v2ProPlus底模": "¡Usar directamente el modelo base v2ProPlus sin entrenar!",
"不训练直接推v2Pro底模": "¡Usar directamente el modelo base v2Pro sin entrenar!",
"不训练直接推v2底模": "¡Usar directamente el modelo base v2 sin entrenar!",
"不训练直接推v3底模": "¡Usar directamente el modelo base v3 sin entrenar!",
"中文": "Chino", "中文": "Chino",
"中文教程文档": "Documentación del tutorial en chino",
"中英混合": "Chino e inglés mezclados", "中英混合": "Chino e inglés mezclados",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio de referencia principal (Por favor, suba un audio de referencia de entre 3 y 10 segundos, si supera este límite se producirá un error)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio de referencia principal (Por favor, suba un audio de referencia de entre 3 y 10 segundos, si supera este límite se producirá un error)",
"主参考音频的文本": "Texto del audio de referencia principal", "主参考音频的文本": "Texto del audio de referencia principal",
@ -92,7 +87,6 @@
"句间停顿秒数": "Segundos de pausa entre frases", "句间停顿秒数": "Segundos de pausa entre frases",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Sube varios archivos de audio de referencia arrastrándolos y soltándolos (se recomienda que sean del mismo género) y promedia sus tonos. Si esta opción se deja en blanco, el tono será controlado por el único audio de referencia a la izquierda. Si se está afinando el modelo, se recomienda que todos los archivos de audio de referencia tengan tonos dentro del conjunto de entrenamiento de ajuste fino; se puede ignorar el modelo preentrenado.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Sube varios archivos de audio de referencia arrastrándolos y soltándolos (se recomienda que sean del mismo género) y promedia sus tonos. Si esta opción se deja en blanco, el tono será controlado por el único audio de referencia a la izquierda. Si se está afinando el modelo, se recomienda que todos los archivos de audio de referencia tengan tonos dentro del conjunto de entrenamiento de ajuste fino; se puede ignorar el modelo preentrenado.",
"合成语音": "Síntesis de voz", "合成语音": "Síntesis de voz",
"合成音频": "Sintetizar Audio",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Ejemplo de formato de ruta de carpeta válida: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simplemente copie desde la barra de direcciones del administrador de archivos).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Ejemplo de formato de ruta de carpeta válida: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simplemente copie desde la barra de direcciones del administrador de archivos).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Se añadirá soporte para conversión de fonemas, edición manual de fonemas y síntesis de voz por pasos en el futuro.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "Se añadirá soporte para conversión de fonemas, edición manual de fonemas y síntesis de voz por pasos en el futuro.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si el audio de referencia no es claro o no sabe qué escribir, habilite esta opción para ignorar el texto de referencia.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si el audio de referencia no es claro o no sabe qué escribir, habilite esta opción para ignorar el texto de referencia.",
@ -110,15 +104,11 @@
"已关闭": " Desactivado", "已关闭": " Desactivado",
"已完成": " Completado", "已完成": " Completado",
"已开启": " Activado", "已开启": " Activado",
"并行合成中": "Síntesis en paralelo en curso",
"并行推理": "Inferencia paralela", "并行推理": "Inferencia paralela",
"并行推理模式已关闭": "Modo de inferencia paralela deshabilitado", "并行推理模式已关闭": "Modo de inferencia paralela deshabilitado",
"并行推理模式已开启": "Modo de inferencia paralela habilitado", "并行推理模式已开启": "Modo de inferencia paralela habilitado",
"底模缺失,无法加载相应 LoRA 权重": "Falta el modelo base, no se pueden cargar los pesos de LoRA correspondientes",
"开启": "Activar ", "开启": "Activar ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Habilitar el modo sin texto de referencia. No llenar el texto de referencia también lo habilita.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Habilitar el modo sin texto de referencia. No llenar el texto de referencia también lo habilita.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "Al activar el modo de inferencia paralela, los modelos SoVITS V3/4 no admiten el procesamiento por lotes, por lo que este ha sido desactivado automáticamente.",
"微调模型信息": "Información del modelo fine-tuned",
"微调训练": "Entrenamiento de ajuste fino", "微调训练": "Entrenamiento de ajuste fino",
"怎么切": "Cómo cortar", "怎么切": "Cómo cortar",
"总训练轮数total_epoch": "Número total de épocas de entrenamiento", "总训练轮数total_epoch": "Número total de épocas de entrenamiento",
@ -150,8 +140,8 @@
"模型": "Modelo", "模型": "Modelo",
"模型分为三类:": "Los modelos se dividen en tres categorías:", "模型分为三类:": "Los modelos se dividen en tres categorías:",
"模型切换": "Cambio de modelo", "模型切换": "Cambio de modelo",
"模型加载中,请等待": "El modelo se está cargando, por favor espera...",
"每张显卡的batch_size": "Tamaño de lote por tarjeta gráfica", "每张显卡的batch_size": "Tamaño de lote por tarjeta gráfica",
"版本": "Versión",
"粤英混合": "Mezcla Cantonés-Inglés", "粤英混合": "Mezcla Cantonés-Inglés",
"粤语": "Cantonés", "粤语": "Cantonés",
"终止合成": "Terminar síntesis", "终止合成": "Terminar síntesis",
@ -160,7 +150,6 @@
"缺少音素数据集": "Falta el Conjunto de Datos de Fonemas", "缺少音素数据集": "Falta el Conjunto de Datos de Fonemas",
"缺少音频数据集": "Falta el Conjunto de Datos de Audio", "缺少音频数据集": "Falta el Conjunto de Datos de Audio",
"英文": "Inglés", "英文": "Inglés",
"训练模型的版本": "Versión del modelo entrenado",
"训练集格式化一键三连": "Formato del conjunto de entrenamiento en un solo paso", "训练集格式化一键三连": "Formato del conjunto de entrenamiento en un solo paso",
"训练集格式化工具": "Herramienta de formateo del conjunto de datos de entrenamiento", "训练集格式化工具": "Herramienta de formateo del conjunto de datos de entrenamiento",
"语义Token提取": "Extracción de tokens semánticos", "语义Token提取": "Extracción de tokens semánticos",
@ -174,6 +163,7 @@
"语音识别": "Reconocimiento de voz", "语音识别": "Reconocimiento de voz",
"语音识别工具": "Herramienta de reconocimiento de voz", "语音识别工具": "Herramienta de reconocimiento de voz",
"语音降噪": "Reducción de ruido en la voz", "语音降噪": "Reducción de ruido en la voz",
"语音降噪工具": "Herramienta de reducción de ruido de voz",
"请上传3~10秒内参考音频超过会报错": "Por favor, suba un audio de referencia de entre 3 y 10 segundos, ¡más de eso causará un error!", "请上传3~10秒内参考音频超过会报错": "Por favor, suba un audio de referencia de entre 3 y 10 segundos, ¡más de eso causará un error!",
"请上传参考音频": "Por Favor, Suba el Audio de Referencia", "请上传参考音频": "Por Favor, Suba el Audio de Referencia",
"请填入推理文本": "Por Favor, Ingrese el Texto Objetivo", "请填入推理文本": "Por Favor, Ingrese el Texto Objetivo",
@ -197,8 +187,7 @@
"进度": "Progreso", "进度": "Progreso",
"进程已终止": " Proceso terminado", "进程已终止": " Proceso terminado",
"进程输出信息": " Información de salida del proceso", "进程输出信息": " Información de salida del proceso",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Seleccione los modelos guardados en SoVITS_weights y GPT_weights después del entrenamiento. Los predeterminados son modelos base para experimentar con inferencia Zero Shot TTS de 5 segundos sin entrenamiento.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Seleccione el modelo almacenado en SoVITS_weights y GPT_weights después del entrenamiento. Uno de ellos es el modelo base, útil para experimentar con TTS de 5 segundos sin entrenamiento.",
"采样步数(仅对V3/4生效)": "Pasos de muestreo (solo efectivo para V3/4)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Pasos de muestreo: si se siente ruidoso, intente aumentarlo; si es lento, intente reducirlo", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Pasos de muestreo: si se siente ruidoso, intente aumentarlo; si es lento, intente reducirlo",
"重复惩罚": "Penalización por repetición", "重复惩罚": "Penalización por repetición",
"随机种子": "Semilla aleatoria", "随机种子": "Semilla aleatoria",
@ -214,13 +203,9 @@
"音频标注WebUI": "WebUI de etiquetado de audio", "音频标注WebUI": "WebUI de etiquetado de audio",
"音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta", "音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
"音频超分中": "Superresolución de audio en proceso", "音频超分中": "Superresolución de audio en proceso",
"音频超采样": "Muestreo superior del audio",
"音频超采样(仅对V3生效))": "Muestreo superior del audio (solo efectivo para V3)",
"预测语义Token": "Predecir token semántico",
"预训练GPT模型路径": "Ruta del modelo GPT preentrenado", "预训练GPT模型路径": "Ruta del modelo GPT preentrenado",
"预训练SSL模型路径": "Ruta del modelo SSL preentrenado", "预训练SSL模型路径": "Ruta del modelo SSL preentrenado",
"预训练SoVITS-D模型路径": "Ruta del modelo SoVITS-D preentrenado", "预训练SoVITS-D模型路径": "Ruta del modelo SoVITS-D preentrenado",
"预训练SoVITS-G模型路径": "Ruta del modelo SoVITS-G preentrenado", "预训练SoVITS-G模型路径": "Ruta del modelo SoVITS-G preentrenado",
"预训练中文BERT模型路径": "Ruta del modelo BERT Chino Preentrenado", "预训练中文BERT模型路径": "Ruta del modelo BERT en Chino preentrenado"
"预训练模型路径": "Ruta del modelo preentrenado"
} }

View File

@ -18,7 +18,6 @@
"ASR 模型": "Modèle ASR", "ASR 模型": "Modèle ASR",
"ASR 模型尺寸": "Taille du modèle ASR", "ASR 模型尺寸": "Taille du modèle ASR",
"ASR 语言设置": "Paramètres de langue ASR", "ASR 语言设置": "Paramètres de langue ASR",
"CPU训练,较慢": "Entraînement sur CPU, plus lent",
"GPT 训练: 模型权重文件在 GPT_weights/": "Entraînement GPT : les poids du modèle sont dans GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "Entraînement GPT : les poids du modèle sont dans GPT_weights/",
"GPT模型列表": "Liste des modèles GPT", "GPT模型列表": "Liste des modèles GPT",
"GPT训练": "Entraînement GPT", "GPT训练": "Entraînement GPT",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "Numéro de carte GPU, ne peut contenir qu'un seul entier", "GPU卡号,只能填1个整数": "Numéro de carte GPU, ne peut contenir qu'un seul entier",
"GPU卡号以-分割,每个卡号一个进程": "Numéro de carte GPU séparé par des tirets, un processus par numéro de carte", "GPU卡号以-分割,每个卡号一个进程": "Numéro de carte GPU séparé par des tirets, un processus par numéro de carte",
"LoRA秩": "Rang LoRA", "LoRA秩": "Rang LoRA",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Modèle de base SoVITS V3 manquant, impossible de charger les poids LoRA correspondants",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entraînement SoVITS : les poids du modèle sont dans SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entraînement SoVITS : les poids du modèle sont dans SoVITS_weights/",
"SoVITS模型列表": "Liste des modèles SoVITS", "SoVITS模型列表": "Liste des modèles SoVITS",
"SoVITS训练": "Entraînement SoVITS", "SoVITS训练": "Entraînement SoVITS",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Soumettre le texte : Enregistrez manuellement le contenu de tous les champs textuels de la page actuelle en mémoire et dans un fichier (si vous ne cliquez pas sur ce bouton avant ou après avoir changé de page, ou avant de quitter la page d'étiquetage, vos modifications seront annulées lorsque vous reviendrez, tout votre travail sera perdu).",
"TTS推理WebUI": "Interface Web d'inférence TTS", "TTS推理WebUI": "Interface Web d'inférence TTS",
"UVR5人声伴奏分离&去混响去延迟工具": "Outil UVR5 de séparation voix/accompagnement & suppression de réverbération et de latence", "UVR5人声伴奏分离&去混响去延迟工具": "Outil UVR5 de séparation voix/accompagnement & suppression de réverbération et de latence",
"V3不支持无参考文本模式请填写参考文本": "La version V3 ne prend pas en charge le mode sans texte de référence. Veuillez fournir un texte de référence !",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion d'audio normalisé mélangé", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion d'audio normalisé mélangé",
"batch_size": "Taille de lot", "batch_size": "Taille de lot",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: comment calculer la courbe de volume, plus petit pour une précision plus élevée mais une charge de calcul plus élevée (ce n'est pas une meilleure précision)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: comment calculer la courbe de volume, plus petit pour une précision plus élevée mais une charge de calcul plus élevée (ce n'est pas une meilleure précision)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "Le mode n'est pas encore supporté par la v3. Une erreur se produira en cas d'utilisation.", "v3暂不支持该模式使用了会报错。": "Le mode n'est pas encore supporté par la v3. Une erreur se produira en cas d'utilisation.",
"v3输出如果觉得闷可以试试开超分": "Si la sortie v3 semble étouffée, essayez l'upscaling", "v3输出如果觉得闷可以试试开超分": "Si la sortie v3 semble étouffée, essayez l'upscaling",
"不切": "Pas de découpe", "不切": "Pas de découpe",
"不训练直接推v2ProPlus底模": "Utiliser directement le modèle de base v2ProPlus sans entraînement !",
"不训练直接推v2Pro底模": "Utiliser directement le modèle de base v2Pro sans entraînement !",
"不训练直接推v2底模": "Utiliser directement le modèle de base v2 sans entraînement !",
"不训练直接推v3底模": "Utiliser directement le modèle de base v3 sans entraînement !",
"中文": "Chinois", "中文": "Chinois",
"中文教程文档": "Documentation du tutoriel en chinois",
"中英混合": "Mélange de chinois et d'anglais", "中英混合": "Mélange de chinois et d'anglais",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio de référence principal (Veuillez télécharger un audio de référence entre 3 et 10 secondes, dépasser cette limite entraînera une erreur !)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio de référence principal (Veuillez télécharger un audio de référence entre 3 et 10 secondes, dépasser cette limite entraînera une erreur !)",
"主参考音频的文本": "Texte de laudio de référence principal", "主参考音频的文本": "Texte de laudio de référence principal",
@ -92,7 +87,6 @@
"句间停顿秒数": "Temps de pause entre les phrases (secondes)", "句间停顿秒数": "Temps de pause entre les phrases (secondes)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optionnel : Téléchargez plusieurs fichiers audio de référence en les faisant glisser (recommandé d'être du même genre) et fusionnez leur tonalité. Si cette option est laissée vide, la tonalité sera contrôlée par l'unique fichier audio de référence à gauche. Si vous ajustez le modèle, il est recommandé que tous les fichiers audio de référence aient des tonalités dans l'ensemble d'entraînement d'ajustement ; le modèle pré-entrainé peut être ignoré.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optionnel : Téléchargez plusieurs fichiers audio de référence en les faisant glisser (recommandé d'être du même genre) et fusionnez leur tonalité. Si cette option est laissée vide, la tonalité sera contrôlée par l'unique fichier audio de référence à gauche. Si vous ajustez le modèle, il est recommandé que tous les fichiers audio de référence aient des tonalités dans l'ensemble d'entraînement d'ajustement ; le modèle pré-entrainé peut être ignoré.",
"合成语音": "Synthèse vocale", "合成语音": "Synthèse vocale",
"合成音频": "Synthétiser l'audio",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemple de format de chemin de dossier valide : E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copiez-le depuis la barre d'adresse de l'explorateur de fichiers).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemple de format de chemin de dossier valide : E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copiez-le depuis la barre d'adresse de l'explorateur de fichiers).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Le support pour la conversion phonémique, lédition manuelle des phonèmes et la synthèse vocale par étapes sera ajouté ultérieurement.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "Le support pour la conversion phonémique, lédition manuelle des phonèmes et la synthèse vocale par étapes sera ajouté ultérieurement.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si vous ne comprenez pas bien l'audio de référence (vous ne savez pas quoi écrire), vous pouvez activer cette option. Une fois activée, le texte de référence sera ignoré.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si vous ne comprenez pas bien l'audio de référence (vous ne savez pas quoi écrire), vous pouvez activer cette option. Une fois activée, le texte de référence sera ignoré.",
@ -110,15 +104,11 @@
"已关闭": " Fermé", "已关闭": " Fermé",
"已完成": " Terminé", "已完成": " Terminé",
"已开启": " Activé", "已开启": " Activé",
"并行合成中": "Synthèse parallèle en cours",
"并行推理": "Inférence parallèle", "并行推理": "Inférence parallèle",
"并行推理模式已关闭": "Mode dinférence parallèle désactivé", "并行推理模式已关闭": "Mode dinférence parallèle désactivé",
"并行推理模式已开启": "Mode dinférence parallèle activé", "并行推理模式已开启": "Mode dinférence parallèle activé",
"底模缺失,无法加载相应 LoRA 权重": "Modèle de base manquant, impossible de charger les poids LoRA correspondants",
"开启": "Activer ", "开启": "Activer ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Activer le mode sans texte de référence. Laisser le texte de référence vide équivaut également à activer le mode.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Activer le mode sans texte de référence. Laisser le texte de référence vide équivaut également à activer le mode.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "Lorsque le mode d'inférence parallèle est activé, les modèles SoVITS V3/4 ne prennent pas en charge le traitement par lots, qui a donc été désactivé automatiquement.",
"微调模型信息": "Informations sur le modèle affiné",
"微调训练": "Entraînement de fine-tuning", "微调训练": "Entraînement de fine-tuning",
"怎么切": "Comment découper", "怎么切": "Comment découper",
"总训练轮数total_epoch": "Nombre total d'époques d'entraînement", "总训练轮数total_epoch": "Nombre total d'époques d'entraînement",
@ -150,8 +140,8 @@
"模型": "Modèle", "模型": "Modèle",
"模型分为三类:": "Les modèles sont classés en trois catégories:", "模型分为三类:": "Les modèles sont classés en trois catégories:",
"模型切换": "Changement de modèle", "模型切换": "Changement de modèle",
"模型加载中,请等待": "Le modèle est en cours de chargement, veuillez patienter...",
"每张显卡的batch_size": "Taille de lot par carte graphique", "每张显卡的batch_size": "Taille de lot par carte graphique",
"版本": "Version",
"粤英混合": "Mélange Cantonais-Anglais", "粤英混合": "Mélange Cantonais-Anglais",
"粤语": "Cantonais", "粤语": "Cantonais",
"终止合成": "Terminer la synthèse", "终止合成": "Terminer la synthèse",
@ -160,7 +150,6 @@
"缺少音素数据集": "Jeu de Données de Phonèmes Manquant", "缺少音素数据集": "Jeu de Données de Phonèmes Manquant",
"缺少音频数据集": "Jeu de Données Audio Manquant", "缺少音频数据集": "Jeu de Données Audio Manquant",
"英文": "Anglais", "英文": "Anglais",
"训练模型的版本": "Version du modèle entraîné",
"训练集格式化一键三连": "Formatage de l'ensemble d'entraînement en un clic", "训练集格式化一键三连": "Formatage de l'ensemble d'entraînement en un clic",
"训练集格式化工具": "Outil de formatage des ensembles d'entraînement", "训练集格式化工具": "Outil de formatage des ensembles d'entraînement",
"语义Token提取": "Extraction de tokens sémantiques", "语义Token提取": "Extraction de tokens sémantiques",
@ -174,6 +163,7 @@
"语音识别": "Reconnaissance vocale", "语音识别": "Reconnaissance vocale",
"语音识别工具": "Outil de reconnaissance vocale", "语音识别工具": "Outil de reconnaissance vocale",
"语音降噪": "Réduction du bruit audio", "语音降噪": "Réduction du bruit audio",
"语音降噪工具": "Outil de réduction du bruit audio",
"请上传3~10秒内参考音频超过会报错": "Veuillez télécharger une référence audio de 3 à 10 secondes ; les fichiers plus longs généreront une erreur!", "请上传3~10秒内参考音频超过会报错": "Veuillez télécharger une référence audio de 3 à 10 secondes ; les fichiers plus longs généreront une erreur!",
"请上传参考音频": "Veuillez télécharger l'audio de référence", "请上传参考音频": "Veuillez télécharger l'audio de référence",
"请填入推理文本": "Veuillez remplir le texte cible", "请填入推理文本": "Veuillez remplir le texte cible",
@ -197,8 +187,7 @@
"进度": "Progression", "进度": "Progression",
"进程已终止": " Processus terminé", "进程已终止": " Processus terminé",
"进程输出信息": " Sortie du processus", "进程输出信息": " Sortie du processus",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Sélectionnez les modèles entraînés stockés dans SoVITS_weights et GPT_weights. Par défaut, certains sont des modèles de base pour l'inférence Zero Shot TTS en 5 secondes sans entraînement.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Choisissez le modèle entraîné stocké sous SoVITS_weights et GPT_weights. Par défaut, l'un d'eux est un modèle de base pour l'expérience de TTS Zero Shot de 5 secondes.",
"采样步数(仅对V3/4生效)": "Nombre d'étapes d'échantillonnage (uniquement effectif pour V3/4)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Nombre détapes déchantillonnage : si le son est bruité, essayez daugmenter, si cest lent, essayez de réduire", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Nombre détapes déchantillonnage : si le son est bruité, essayez daugmenter, si cest lent, essayez de réduire",
"重复惩罚": "Pénalité de répétition", "重复惩罚": "Pénalité de répétition",
"随机种子": "Graine aléatoire", "随机种子": "Graine aléatoire",
@ -213,14 +202,10 @@
"音频文件不存在,跳过:": "Fichier audio introuvable, passage : ", "音频文件不存在,跳过:": "Fichier audio introuvable, passage : ",
"音频标注WebUI": "Interface Web d'annotation audio", "音频标注WebUI": "Interface Web d'annotation audio",
"音频自动切分输入路径,可文件可文件夹": "Chemin d'entrée automatique de découpage audio, peut être un fichier ou un dossier", "音频自动切分输入路径,可文件可文件夹": "Chemin d'entrée automatique de découpage audio, peut être un fichier ou un dossier",
"音频超分中": "Super-résolution audio en cours", "音频超分中": "Upscaling audio en cours",
"音频超采样": "Suréchantillonnage audio",
"音频超采样(仅对V3生效))": "Suréchantillonnage audio (uniquement effectif pour V3)",
"预测语义Token": "Prédire le jeton sémantique",
"预训练GPT模型路径": "Chemin du modèle GPT pré-entraîné", "预训练GPT模型路径": "Chemin du modèle GPT pré-entraîné",
"预训练SSL模型路径": "Chemin du modèle SSL pré-entraîné", "预训练SSL模型路径": "Chemin du modèle SSL pré-entraîné",
"预训练SoVITS-D模型路径": "Chemin du modèle SoVITS-D pré-entraîné", "预训练SoVITS-D模型路径": "Chemin du modèle SoVITS-D pré-entraîné",
"预训练SoVITS-G模型路径": "Chemin du modèle SoVITS-G pré-entraîné", "预训练SoVITS-G模型路径": "Chemin du modèle SoVITS-G pré-entraîné",
"预训练中文BERT模型路径": "Chemin du modèle BERT chinois pré-entraîné", "预训练中文BERT模型路径": "Chemin du modèle BERT chinois pré-entraîné"
"预训练模型路径": "Chemin du modèle pré-entraîné"
} }

View File

@ -18,7 +18,6 @@
"ASR 模型": "Modello ASR", "ASR 模型": "Modello ASR",
"ASR 模型尺寸": "Dimensioni del modello ASR", "ASR 模型尺寸": "Dimensioni del modello ASR",
"ASR 语言设置": "Impostazioni linguistiche ASR", "ASR 语言设置": "Impostazioni linguistiche ASR",
"CPU训练,较慢": "Addestramento su CPU, più lento",
"GPT 训练: 模型权重文件在 GPT_weights/": "Addestramento GPT: i pesi del modello sono in GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "Addestramento GPT: i pesi del modello sono in GPT_weights/",
"GPT模型列表": "Elenco dei modelli GPT", "GPT模型列表": "Elenco dei modelli GPT",
"GPT训练": "Addestramento GPT", "GPT训练": "Addestramento GPT",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "Numero della scheda grafica, può essere inserito solo un numero intero", "GPU卡号,只能填1个整数": "Numero della scheda grafica, può essere inserito solo un numero intero",
"GPU卡号以-分割,每个卡号一个进程": "Numero di GPU separati da '-'; ogni numero corrisponde a un processo", "GPU卡号以-分割,每个卡号一个进程": "Numero di GPU separati da '-'; ogni numero corrisponde a un processo",
"LoRA秩": "Rango LoRA", "LoRA秩": "Rango LoRA",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Modello base SoVITS V3 mancante, impossibile caricare i pesi LoRA corrispondenti",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Addestramento SoVITS: i pesi del modello sono in SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Addestramento SoVITS: i pesi del modello sono in SoVITS_weights/",
"SoVITS模型列表": "Elenco dei modelli SoVITS", "SoVITS模型列表": "Elenco dei modelli SoVITS",
"SoVITS训练": "Addestramento SoVITS", "SoVITS训练": "Addestramento SoVITS",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Invia testo: Salva manualmente i contenuti di tutti i campi testuali della pagina corrente in memoria e su file (se non premi questo pulsante prima o dopo aver girato pagina oppure prima di uscire dalla pagina di annotazione, tornando indietro i dati saranno ripristinati e avrai lavorato invano).",
"TTS推理WebUI": "Interfaccia Web per inferenza TTS", "TTS推理WebUI": "Interfaccia Web per inferenza TTS",
"UVR5人声伴奏分离&去混响去延迟工具": "Strumento UVR5 per separazione voce/accompagnamento & rimozione riverbero e latenza", "UVR5人声伴奏分离&去混响去延迟工具": "Strumento UVR5 per separazione voce/accompagnamento & rimozione riverbero e latenza",
"V3不支持无参考文本模式请填写参考文本": "V3 non supporta la modalità senza testo di riferimento! Inserisci il testo di riferimento!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Quanta proporzione dell'audio normalizzato deve essere miscelata", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Quanta proporzione dell'audio normalizzato deve essere miscelata",
"batch_size": "Dimensione del batch", "batch_size": "Dimensione del batch",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: Come calcolare la curva del volume. Più piccolo è, maggiore è la precisione ma aumenta la complessità computazionale (non significa che una maggiore precisione dà risultati migliori)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: Come calcolare la curva del volume. Più piccolo è, maggiore è la precisione ma aumenta la complessità computazionale (non significa che una maggiore precisione dà risultati migliori)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "V3 non supporta questa modalità, utilizzarla genererà un errore.", "v3暂不支持该模式使用了会报错。": "V3 non supporta questa modalità, utilizzarla genererà un errore.",
"v3输出如果觉得闷可以试试开超分": "Se l'output v3 sembra ovattato, prova ad attivare il super-risoluzione", "v3输出如果觉得闷可以试试开超分": "Se l'output v3 sembra ovattato, prova ad attivare il super-risoluzione",
"不切": "Nessuna suddivisione", "不切": "Nessuna suddivisione",
"不训练直接推v2ProPlus底模": "Usa direttamente il modello base v2ProPlus senza addestramento!",
"不训练直接推v2Pro底模": "Usa direttamente il modello base v2Pro senza addestramento!",
"不训练直接推v2底模": "Usa direttamente il modello base v2 senza addestramento!",
"不训练直接推v3底模": "Usa direttamente il modello base v3 senza addestramento!",
"中文": "Cinese", "中文": "Cinese",
"中文教程文档": "Documentazione del tutorial in cinese",
"中英混合": "Cinese e inglese misti", "中英混合": "Cinese e inglese misti",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio di riferimento principale (Si prega di caricare un audio di riferimento tra 3 e 10 secondi, superato questo limite si verificherà un errore!)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Audio di riferimento principale (Si prega di caricare un audio di riferimento tra 3 e 10 secondi, superato questo limite si verificherà un errore!)",
"主参考音频的文本": "Testo dell'audio di riferimento principale", "主参考音频的文本": "Testo dell'audio di riferimento principale",
@ -92,7 +87,6 @@
"句间停顿秒数": "Durata pausa tra le frasi (secondi)", "句间停顿秒数": "Durata pausa tra le frasi (secondi)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opzionale: Carica più file audio di riferimento trascinandoli (si consiglia dello stesso genere) e media il loro tono. Se questa opzione è lasciata vuota, il tono sarà controllato dal singolo file audio di riferimento a sinistra. Se si sta perfezionando il modello, è consigliato che tutti i file audio di riferimento abbiano toni presenti nel set di addestramento per il perfezionamento; il modello pre-addestrato può essere ignorato.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opzionale: Carica più file audio di riferimento trascinandoli (si consiglia dello stesso genere) e media il loro tono. Se questa opzione è lasciata vuota, il tono sarà controllato dal singolo file audio di riferimento a sinistra. Se si sta perfezionando il modello, è consigliato che tutti i file audio di riferimento abbiano toni presenti nel set di addestramento per il perfezionamento; il modello pre-addestrato può essere ignorato.",
"合成语音": "Sintesi vocale", "合成语音": "Sintesi vocale",
"合成音频": "Sintesi audio",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Formato di percorso della cartella valido: E:\\codes\\py39\\vits_vc_gpu\\Esempio di test di BaiLuShuangHua (copiare direttamente dalla barra degli indirizzi del gestore file).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Formato di percorso della cartella valido: E:\\codes\\py39\\vits_vc_gpu\\Esempio di test di BaiLuShuangHua (copiare direttamente dalla barra degli indirizzi del gestore file).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "In futuro verrà aggiunto il supporto per la conversione dei fonemi, la modifica manuale dei fonemi e la sintesi vocale passo dopo passo.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "In futuro verrà aggiunto il supporto per la conversione dei fonemi, la modifica manuale dei fonemi e la sintesi vocale passo dopo passo.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Se l'audio di riferimento non è chiaro o non sai cosa scrivere, abilita questa opzione per ignorare il testo di riferimento.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Se l'audio di riferimento non è chiaro o non sai cosa scrivere, abilita questa opzione per ignorare il testo di riferimento.",
@ -110,15 +104,11 @@
"已关闭": " Chiuso", "已关闭": " Chiuso",
"已完成": " Completato", "已完成": " Completato",
"已开启": " Attivato", "已开启": " Attivato",
"并行合成中": "Sintesi parallela in corso",
"并行推理": "Inferenza parallela", "并行推理": "Inferenza parallela",
"并行推理模式已关闭": "Modalità di inferenza parallela disabilitata", "并行推理模式已关闭": "Modalità di inferenza parallela disabilitata",
"并行推理模式已开启": "Modalità di inferenza parallela abilitata", "并行推理模式已开启": "Modalità di inferenza parallela abilitata",
"底模缺失,无法加载相应 LoRA 权重": "Mancano il modello base, non è possibile caricare i pesi LoRA corrispondenti",
"开启": "Attiva ", "开启": "Attiva ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Attivare la modalità senza testo di riferimento. Anche se non inserisci un testo di riferimento, la modalità verrà attivata.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Attivare la modalità senza testo di riferimento. Anche se non inserisci un testo di riferimento, la modalità verrà attivata.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "Quando la modalità di inferenza parallela è attiva, i modelli SoVITS V3/4 non supportano l'elaborazione suddivisa in gruppi e questa è stata automaticamente disattivata.",
"微调模型信息": "Informazioni sul modello raffinato",
"微调训练": "Addestramento fine-tuning", "微调训练": "Addestramento fine-tuning",
"怎么切": "Come tagliare", "怎么切": "Come tagliare",
"总训练轮数total_epoch": "Numero totale di epoche di addestramento", "总训练轮数total_epoch": "Numero totale di epoche di addestramento",
@ -150,8 +140,8 @@
"模型": "Modello", "模型": "Modello",
"模型分为三类:": "I modelli sono divisi in tre categorie:", "模型分为三类:": "I modelli sono divisi in tre categorie:",
"模型切换": "Cambio del modello", "模型切换": "Cambio del modello",
"模型加载中,请等待": "Il modello si sta caricando, attendere prego...",
"每张显卡的batch_size": "Batch size per ogni scheda grafica", "每张显卡的batch_size": "Batch size per ogni scheda grafica",
"版本": "Versione",
"粤英混合": "Misto Cantonese-Inglese", "粤英混合": "Misto Cantonese-Inglese",
"粤语": "Cantonese", "粤语": "Cantonese",
"终止合成": "Termina sintesi", "终止合成": "Termina sintesi",
@ -160,7 +150,6 @@
"缺少音素数据集": "Dataset di Fonemi Mancante", "缺少音素数据集": "Dataset di Fonemi Mancante",
"缺少音频数据集": "Dataset Audio Mancante", "缺少音频数据集": "Dataset Audio Mancante",
"英文": "Inglese", "英文": "Inglese",
"训练模型的版本": "Versione del modello addestrato",
"训练集格式化一键三连": "Formattazione del dataset di addestramento in un clic", "训练集格式化一键三连": "Formattazione del dataset di addestramento in un clic",
"训练集格式化工具": "Strumento di formattazione dataset", "训练集格式化工具": "Strumento di formattazione dataset",
"语义Token提取": "Estrazione token semantici", "语义Token提取": "Estrazione token semantici",
@ -174,6 +163,7 @@
"语音识别": "Riconoscimento vocale", "语音识别": "Riconoscimento vocale",
"语音识别工具": "Strumento di riconoscimento vocale", "语音识别工具": "Strumento di riconoscimento vocale",
"语音降噪": "Riduzione del rumore audio", "语音降噪": "Riduzione del rumore audio",
"语音降噪工具": "Strumento di riduzione del rumore audio",
"请上传3~10秒内参考音频超过会报错": "Carica un audio di riferimento della durata compresa tra 3 e 10 secondi. Superiore a questo, verrà generato un errore!", "请上传3~10秒内参考音频超过会报错": "Carica un audio di riferimento della durata compresa tra 3 e 10 secondi. Superiore a questo, verrà generato un errore!",
"请上传参考音频": "Si prega di caricare l'audio di riferimento", "请上传参考音频": "Si prega di caricare l'audio di riferimento",
"请填入推理文本": "Si prega di inserire il testo di destinazione", "请填入推理文本": "Si prega di inserire il testo di destinazione",
@ -197,8 +187,7 @@
"进度": "Avanzamento", "进度": "Avanzamento",
"进程已终止": " Processo terminato", "进程已终止": " Processo terminato",
"进程输出信息": " Output del processo", "进程输出信息": " Output del processo",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Seleziona i modelli addestrati salvati in SoVITS_weights e GPT_weights. Quelli predefiniti sono modelli base per sperimentare l'inferenza Zero Shot TTS in 5 secondi senza addestramento.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Scegli il modello salvato in SoVITS_weights e GPT_weights dopo l'addestramento. Uno di default è il modello di base, utilizzato per l'esperienza di Zero Shot TTS in 5 secondi.",
"采样步数(仅对V3/4生效)": "Numero di passaggi di campionamento (valido solo per V3/4)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passi di campionamento: se sembra rumoroso, prova a aumentarlo, se è lento, prova a diminuirlo", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passi di campionamento: se sembra rumoroso, prova a aumentarlo, se è lento, prova a diminuirlo",
"重复惩罚": "Penalità di ripetizione", "重复惩罚": "Penalità di ripetizione",
"随机种子": "Seme casuale", "随机种子": "Seme casuale",
@ -214,13 +203,9 @@
"音频标注WebUI": "Interfaccia Web per annotazione audio", "音频标注WebUI": "Interfaccia Web per annotazione audio",
"音频自动切分输入路径,可文件可文件夹": "Percorso di input per la segmentazione automatica dell'audio, può essere un file o una cartella", "音频自动切分输入路径,可文件可文件夹": "Percorso di input per la segmentazione automatica dell'audio, può essere un file o una cartella",
"音频超分中": "Super-risoluzione audio in corso", "音频超分中": "Super-risoluzione audio in corso",
"音频超采样": "Upsampling audio",
"音频超采样(仅对V3生效))": "Upsampling audio (valido solo per V3)",
"预测语义Token": "Predici token semantico",
"预训练GPT模型路径": "Percorso del modello GPT pre-addestrato", "预训练GPT模型路径": "Percorso del modello GPT pre-addestrato",
"预训练SSL模型路径": "Percorso del modello SSL pre-addestrato", "预训练SSL模型路径": "Percorso del modello SSL pre-addestrato",
"预训练SoVITS-D模型路径": "Percorso del modello SoVITS-D pre-addestrato", "预训练SoVITS-D模型路径": "Percorso del modello SoVITS-D pre-addestrato",
"预训练SoVITS-G模型路径": "Percorso del modello SoVITS-G pre-addestrato", "预训练SoVITS-G模型路径": "Percorso del modello SoVITS-G pre-addestrato",
"预训练中文BERT模型路径": "Percorso del modello BERT cinese pre-addestrato", "预训练中文BERT模型路径": "Percorso del modello BERT cinese pre-addestrato"
"预训练模型路径": "Percorso del modello pre-addestrato"
} }

View File

@ -18,7 +18,6 @@
"ASR 模型": "ASR モデル", "ASR 模型": "ASR モデル",
"ASR 模型尺寸": "ASRモデルサイズ", "ASR 模型尺寸": "ASRモデルサイズ",
"ASR 语言设置": "ASR 言語設定", "ASR 语言设置": "ASR 言語設定",
"CPU训练,较慢": "CPUトレーニング、速度低下",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT トレーニング: モデルの重みファイルは GPT_weights/ にあります", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT トレーニング: モデルの重みファイルは GPT_weights/ にあります",
"GPT模型列表": "GPTモデルリスト", "GPT模型列表": "GPTモデルリスト",
"GPT训练": "GPTトレーニング", "GPT训练": "GPTトレーニング",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU番号、1つの整数しか入力できません", "GPU卡号,只能填1个整数": "GPU番号、1つの整数しか入力できません",
"GPU卡号以-分割,每个卡号一个进程": "GPUカード番号はハイフンで区切り、各カード番号ごとに1つのプロセスが実行されます", "GPU卡号以-分割,每个卡号一个进程": "GPUカード番号はハイフンで区切り、各カード番号ごとに1つのプロセスが実行されます",
"LoRA秩": "LoRAランク", "LoRA秩": "LoRAランク",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 のベースモデルが不足しているため、対応する LoRA の重みをロードできません",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS トレーニング: モデルの重みファイルは SoVITS_weights/ にあります", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS トレーニング: モデルの重みファイルは SoVITS_weights/ にあります",
"SoVITS模型列表": "SoVITSモデルリスト", "SoVITS模型列表": "SoVITSモデルリスト",
"SoVITS训练": "SoVITSトレーニング", "SoVITS训练": "SoVITSトレーニング",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "現在のページにあるすべてのテキストフィールドの内容を手動で保存します(メモリとファイルに反映)。ページ切り替えやアノテーション画面の終了前にこのボタンを押さないと、再度戻った際に変更が破棄され、作業が無駄になります。",
"TTS推理WebUI": "TTS推論WebUI", "TTS推理WebUI": "TTS推論WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5ボーカルアカンパニメント分離リバーブおよびディレイ除去ツール", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5ボーカルアカンパニメント分離リバーブおよびディレイ除去ツール",
"V3不支持无参考文本模式请填写参考文本": "V3は参照テキストなしのモードをサポートしていません。必ず参照テキストを入力してください",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix正規化後のオーディオが入る割合", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix正規化後のオーディオが入る割合",
"batch_size": "バッチサイズ", "batch_size": "バッチサイズ",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: 音量曲線の計算方法、小さいほど精度が高くなりますが、計算量が増加します(精度が高いほど必ずしも効果が良いわけではありません)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: 音量曲線の計算方法、小さいほど精度が高くなりますが、計算量が増加します(精度が高いほど必ずしも効果が良いわけではありません)",
@@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3では現在このモードをサポートしておらず、使用するとエラーが発生します。", "v3暂不支持该模式使用了会报错。": "v3では現在このモードをサポートしておらず、使用するとエラーが発生します。",
"v3输出如果觉得闷可以试试开超分": "v3の出力がこもっていると感じた場合、超解像を試してください", "v3输出如果觉得闷可以试试开超分": "v3の出力がこもっていると感じた場合、超解像を試してください",
"不切": "切らない", "不切": "切らない",
"不训练直接推v2ProPlus底模": "学習せずに直接v2ProPlusベースモデルを使用",
"不训练直接推v2Pro底模": "学習せずに直接v2Proベースモデルを使用",
"不训练直接推v2底模": "学習せずに直接v2ベースモデルを使用",
"不训练直接推v3底模": "学習せずに直接v3ベースモデルを使用",
"中文": "中国語", "中文": "中国語",
"中文教程文档": "中国語チュートリアルドキュメント",
"中英混合": "中英混合", "中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "プライマリーFERエンス音声3〜10秒以内の参考音声をアップロードしてください。これを超えるとエラーが発生します", "主参考音频(请上传3~10秒内参考音频超过会报错)": "プライマリーFERエンス音声3〜10秒以内の参考音声をアップロードしてください。これを超えるとエラーが発生します",
"主参考音频的文本": "プライマリーFERエンス音声のテキスト", "主参考音频的文本": "プライマリーFERエンス音声のテキスト",
@@ -92,7 +87,6 @@
"句间停顿秒数": "文間のポーズ秒数", "句间停顿秒数": "文間のポーズ秒数",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "オプション:複数の参照オーディオファイルをドラッグ&ドロップしてアップロードし、それらのトーンを平均化します(同性推奨)。このオプションを空白のままにした場合、トーンは左側の単一の参照オーディオによって制御されます。モデルを微調整する場合、すべての参照オーディオファイルが微調整のトレーニングセット内のトーンを持つことをお勧めします。プリトレーニングモデルは無視しても構いません。", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "オプション:複数の参照オーディオファイルをドラッグ&ドロップしてアップロードし、それらのトーンを平均化します(同性推奨)。このオプションを空白のままにした場合、トーンは左側の単一の参照オーディオによって制御されます。モデルを微調整する場合、すべての参照オーディオファイルが微調整のトレーニングセット内のトーンを持つことをお勧めします。プリトレーニングモデルは無視しても構いません。",
"合成语音": "推論を開始", "合成语音": "推論を開始",
"合成音频": "音声を合成する",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "適切なフォルダパスの例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华テストサンプル(ファイルマネージャのアドレスバーからコピーしてください)。", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "適切なフォルダパスの例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华テストサンプル(ファイルマネージャのアドレスバーからコピーしてください)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "今後、フォンメ转换、手動フォンメ編集、音声合成のステップバイステップ実行をサポートします。", "后续将支持转音素、手工修改音素、语音合成分步执行。": "今後、フォンメ转换、手動フォンメ編集、音声合成のステップバイステップ実行をサポートします。",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "参照音声がはっきり、または何を書くかわからない場合は、このオプションを有効にして参照テキストを無視します。", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "参照音声がはっきり、または何を書くかわからない場合は、このオプションを有効にして参照テキストを無視します。",
@@ -110,15 +104,11 @@
"已关闭": "閉じました", "已关闭": "閉じました",
"已完成": "完了しました", "已完成": "完了しました",
"已开启": "有効化しました", "已开启": "有効化しました",
"并行合成中": "並列合成処理中",
"并行推理": "並列推論", "并行推理": "並列推論",
"并行推理模式已关闭": "並列推論モードを無効化", "并行推理模式已关闭": "並列推論モードを無効化",
"并行推理模式已开启": "並列推論モードを有効化", "并行推理模式已开启": "並列推論モードを有効化",
"底模缺失,无法加载相应 LoRA 权重": "ベースモデルが不足しているため、対応する LoRA の重みをロードできません",
"开启": "有効化", "开启": "有効化",
"开启无参考文本模式。不填参考文本亦相当于开启。": "参照テキストなしモードを有効にします。参照テキストを入力しない場合も同様に有効になります。", "开启无参考文本模式。不填参考文本亦相当于开启。": "参照テキストなしモードを有効にします。参照テキストを入力しない場合も同様に有効になります。",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "並列推論モードが有効な場合、SoVITS V3/4モデルはバケット処理をサポートしないため、自動的に無効になっています。",
"微调模型信息": "ファインチューニングモデル情報",
"微调训练": "ファインチューニング", "微调训练": "ファインチューニング",
"怎么切": "どうやって切るか", "怎么切": "どうやって切るか",
"总训练轮数total_epoch": "総トレーニングエポック数total_epoch", "总训练轮数total_epoch": "総トレーニングエポック数total_epoch",
@@ -150,8 +140,8 @@
"模型": "モデル", "模型": "モデル",
"模型分为三类:": "モデルは3種類に分かれています", "模型分为三类:": "モデルは3種類に分かれています",
"模型切换": "モデル切り替え", "模型切换": "モデル切り替え",
"模型加载中,请等待": "モデルを読み込み中です。しばらくお待ちください...",
"每张显卡的batch_size": "各グラフィックカードのバッチサイズ", "每张显卡的batch_size": "各グラフィックカードのバッチサイズ",
"版本": "バージョン",
"粤英混合": "粤英混合", "粤英混合": "粤英混合",
"粤语": "粤語", "粤语": "粤語",
"终止合成": "合成を終了", "终止合成": "合成を終了",
@@ -160,7 +150,6 @@
"缺少音素数据集": "音素データセットが欠落しています", "缺少音素数据集": "音素データセットが欠落しています",
"缺少音频数据集": "オーディオデータセットが欠落しています", "缺少音频数据集": "オーディオデータセットが欠落しています",
"英文": "英語", "英文": "英語",
"训练模型的版本": "学習済みモデルのバージョン",
"训练集格式化一键三连": "トレーニングデータのフォーマットを一括処理", "训练集格式化一键三连": "トレーニングデータのフォーマットを一括処理",
"训练集格式化工具": "トレーニングデータフォーマットツール", "训练集格式化工具": "トレーニングデータフォーマットツール",
"语义Token提取": "セマンティックトークン抽出", "语义Token提取": "セマンティックトークン抽出",
@@ -174,6 +163,7 @@
"语音识别": "音声認識", "语音识别": "音声認識",
"语音识别工具": "音声認識ツール", "语音识别工具": "音声認識ツール",
"语音降噪": "音声ノイズ除去", "语音降噪": "音声ノイズ除去",
"语音降噪工具": "音声ノイズ除去ツール",
"请上传3~10秒内参考音频超过会报错": "310秒以内の参照音声をアップロードしてください。それを超えるとエラーが発生します", "请上传3~10秒内参考音频超过会报错": "310秒以内の参照音声をアップロードしてください。それを超えるとエラーが発生します",
"请上传参考音频": "リファレンスオーディオをアップロードしてください", "请上传参考音频": "リファレンスオーディオをアップロードしてください",
"请填入推理文本": "ターゲットテキストを入力してください", "请填入推理文本": "ターゲットテキストを入力してください",
@@ -197,8 +187,7 @@
"进度": "進捗", "进度": "進捗",
"进程已终止": "プロセスが終了しました", "进程已终止": "プロセスが終了しました",
"进程输出信息": "プロセスの出力情報", "进程输出信息": "プロセスの出力情報",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "SoVITS_weightsとGPT_weightsに保存された学習済みモデルを選択してください。デフォルトのモデルはベースモデルで、5秒Zero Shot TTS推論を学習なしで体験できます。", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "SoVITS_weightsおよびGPT_weightsに保存されたモデルを選択します。デフォルトのものはプレトレインであり、ゼロショットTTSを体験できます。",
"采样步数(仅对V3/4生效)": "サンプリングステップ数V3/V4のみ有効",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "サンプリングステップ数:ノイズが強い場合は増やし、遅い場合は減らしてみてください", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "サンプリングステップ数:ノイズが強い場合は増やし、遅い場合は減らしてみてください",
"重复惩罚": "繰り返しペナルティ", "重复惩罚": "繰り返しペナルティ",
"随机种子": "ランダムシード", "随机种子": "ランダムシード",
@@ -210,17 +199,13 @@
"韩文": "韓国語", "韩文": "韓国語",
"韩英混合": "韓英混合", "韩英混合": "韓英混合",
"音频加载失败": "音声の読み込みに失敗しました", "音频加载失败": "音声の読み込みに失敗しました",
"音频文件不存在,跳过:": "音声ファイルが見つかりません。スキップします:", "音频文件不存在,跳过:": "オーディオファイルが見つからない。スキップ:",
"音频标注WebUI": "音声アテーション用WebUI", "音频标注WebUI": "音声ラベリングWebUI",
"音频自动切分输入路径,可文件可文件夹": "音声自動分割の入力パス(ファイルまたはフォルダ指定可)", "音频自动切分输入路径,可文件可文件夹": "オーディオの自動分割入力パス、ファイルまたはフォルダを指定できます",
"音频超分中": "音声スーパーレゾリューション処理中", "音频超分中": "音声超解像中",
"音频超采样": "音声アップサンプリング", "预训练GPT模型路径": "事前にトレーニングされたGPTモデルのパス",
"音频超采样(仅对V3生效))": "音声アップサンプリングV3のみ有効", "预训练SSL模型路径": "事前にトレーニングされたSSLモデルのパス",
"预测语义Token": "意味的トークンを予測する", "预训练SoVITS-D模型路径": "事前にトレーニングされたSoVITS-Dモデルのパス",
"预训练GPT模型路径": "事前学習済みGPTモデルのパス", "预训练SoVITS-G模型路径": "事前にトレーニングされたSoVITS-Gモデルのパス",
"预训练SSL模型路径": "事前学習済みSSLモデルのパス", "预训练中文BERT模型路径": "事前にトレーニングされた中国語BERTモデルのパス"
"预训练SoVITS-D模型路径": "事前学習済みSoVITS-Dモデルのパス",
"预训练SoVITS-G模型路径": "事前学習済みSoVITS-Gモデルのパス",
"预训练中文BERT模型路径": "事前学習済み中国語BERTモデルのパス",
"预训练模型路径": "事前学習済みモデルのパス"
} }
View File
@@ -18,7 +18,6 @@
"ASR 模型": "ASR 모델", "ASR 模型": "ASR 모델",
"ASR 模型尺寸": "ASR 모델 크기", "ASR 模型尺寸": "ASR 모델 크기",
"ASR 语言设置": "ASR 언어 설정", "ASR 语言设置": "ASR 언어 설정",
"CPU训练,较慢": "CPU 학습, 속도 느림",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT 훈련: 모델 가중치 파일은 GPT_weights/에 있습니다", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT 훈련: 모델 가중치 파일은 GPT_weights/에 있습니다",
"GPT模型列表": "GPT 모델 목록", "GPT模型列表": "GPT 모델 목록",
"GPT训练": "GPT훈련", "GPT训练": "GPT훈련",
@@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU 카드 번호, 1개의 정수만 입력 가능", "GPU卡号,只能填1个整数": "GPU 카드 번호, 1개의 정수만 입력 가능",
"GPU卡号以-分割,每个卡号一个进程": "GPU 카드 번호는 -로 구분되며 각 카드 번호에 하나의 프로세스가 있어야 함", "GPU卡号以-分割,每个卡号一个进程": "GPU 카드 번호는 -로 구분되며 각 카드 번호에 하나의 프로세스가 있어야 함",
"LoRA秩": "LoRA 랭크", "LoRA秩": "LoRA 랭크",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 기본 모델이 없어서 해당 LoRA 가중치를 로드할 수 없습니다",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 훈련: 모델 가중치 파일은 SoVITS_weights/에 있습니다", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 훈련: 모델 가중치 파일은 SoVITS_weights/에 있습니다",
"SoVITS模型列表": "SoVITS 모델 목록", "SoVITS模型列表": "SoVITS 모델 목록",
"SoVITS训练": "SoVITS훈련", "SoVITS训练": "SoVITS훈련",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "현재 페이지의 모든 텍스트 상자 내용을 수동으로 메모리와 파일에 저장합니다. (페이지 전환 전후 또는 주석 작업을 종료하기 전에 이 버튼을 누르지 않으면, 다시 돌아왔을 때 변경 사항이 롤백되어 작업이 무효가 됩니다.)",
"TTS推理WebUI": "TTS 추론 WebUI", "TTS推理WebUI": "TTS 추론 WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 보컬 및 반주 분리 & 리버브 제거 및 딜레이 제거 도구", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5 보컬 및 반주 분리 & 리버브 제거 및 딜레이 제거 도구",
"V3不支持无参考文本模式请填写参考文本": "V3는 참조 텍스트 없이 작동할 수 없습니다. 반드시 참조 텍스트를 입력해주세요!",
"alpha_mix:混多少比例归一化后音频进来": "알파 믹스: 정규화된 오디오가 들어오는 비율", "alpha_mix:混多少比例归一化后音频进来": "알파 믹스: 정규화된 오디오가 들어오는 비율",
"batch_size": "배치 크기", "batch_size": "배치 크기",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop 크기: 볼륨 곡선을 계산하는 방법. 작을수록 정확도가 높아지지만 계산량이 높아집니다 (정확도가 높다고 효과가 좋아지지 않음)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop 크기: 볼륨 곡선을 계산하는 방법. 작을수록 정확도가 높아지지만 계산량이 높아집니다 (정확도가 높다고 효과가 좋아지지 않음)",
@@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3에서는 이 모드를 지원하지 않으며, 사용 시 오류가 발생합니다.", "v3暂不支持该模式使用了会报错。": "v3에서는 이 모드를 지원하지 않으며, 사용 시 오류가 발생합니다.",
"v3输出如果觉得闷可以试试开超分": "v3 출력이 답답하게 들리면 슈퍼 레졸루션을 켜보세요", "v3输出如果觉得闷可以试试开超分": "v3 출력이 답답하게 들리면 슈퍼 레졸루션을 켜보세요",
"不切": "자르지 않음", "不切": "자르지 않음",
"不训练直接推v2ProPlus底模": "학습 없이 바로 v2ProPlus 베이스 모델 사용!",
"不训练直接推v2Pro底模": "학습 없이 바로 v2Pro 베이스 모델 사용!",
"不训练直接推v2底模": "학습 없이 바로 v2 베이스 모델 사용!",
"不训练直接推v3底模": "학습 없이 바로 v3 베이스 모델 사용!",
"中文": "중국어", "中文": "중국어",
"中文教程文档": "중국어 튜토리얼 문서",
"中英混合": "중영 혼합", "中英混合": "중영 혼합",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "주 참조 오디오 (3~10초 이내의 참조 오디오를 업로드하세요. 초과 시 오류 발생!)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "주 참조 오디오 (3~10초 이내의 참조 오디오를 업로드하세요. 초과 시 오류 발생!)",
"主参考音频的文本": "주 참조 오디오의 텍스트", "主参考音频的文本": "주 참조 오디오의 텍스트",
@@ -92,7 +87,6 @@
"句间停顿秒数": "문장 간 정지 시간 (초)", "句间停顿秒数": "문장 간 정지 시간 (초)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "선택 사항: 여러 참조 오디오 파일을 드래그하여 업로드하고 (동일한 성별을 권장), 그들의 톤을 평균화합니다. 이 옵션을 비워두면 톤은 왼쪽의 단일 참조 오디오로 제어됩니다. 모델을 미세 조정하는 경우 모든 참조 오디오 파일이 미세 조정 훈련 세트 내의 톤을 가지고 있는 것이 좋으며, 사전 훈련된 모델은 무시할 수 있습니다.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "선택 사항: 여러 참조 오디오 파일을 드래그하여 업로드하고 (동일한 성별을 권장), 그들의 톤을 평균화합니다. 이 옵션을 비워두면 톤은 왼쪽의 단일 참조 오디오로 제어됩니다. 모델을 미세 조정하는 경우 모든 참조 오디오 파일이 미세 조정 훈련 세트 내의 톤을 가지고 있는 것이 좋으며, 사전 훈련된 모델은 무시할 수 있습니다.",
"合成语音": "합성 음성", "合成语音": "합성 음성",
"合成音频": "오디오 생성",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "적절한 폴더 경로 형식 예: E:\\codes\\py39\\vits_vc_gpu\\백로서리 테스트 샘플 (파일 관리자 주소 표시줄에서 복사하면 됩니다).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "적절한 폴더 경로 형식 예: E:\\codes\\py39\\vits_vc_gpu\\백로서리 테스트 샘플 (파일 관리자 주소 표시줄에서 복사하면 됩니다).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "향후 음소 변환, 수동 음소 편집, 단계별 음성 합성 지원이 추가될 예정입니다.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "향후 음소 변환, 수동 음소 편집, 단계별 음성 합성 지원이 추가될 예정입니다.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "참고 오디오의 내용을 정확히 알아들을 수 없을 경우 이 옵션을 활성화하세요. 활성화하면 입력한 참고 텍스트를 무시합니다.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "참고 오디오의 내용을 정확히 알아들을 수 없을 경우 이 옵션을 활성화하세요. 활성화하면 입력한 참고 텍스트를 무시합니다.",
@@ -110,15 +104,11 @@
"已关闭": "닫힘", "已关闭": "닫힘",
"已完成": "완료됨", "已完成": "완료됨",
"已开启": "켜짐", "已开启": "켜짐",
"并行合成中": "병렬 오디오 생성 중",
"并行推理": "병렬 추론", "并行推理": "병렬 추론",
"并行推理模式已关闭": "병렬 추론 모드 비활성화됨", "并行推理模式已关闭": "병렬 추론 모드 비활성화됨",
"并行推理模式已开启": "병렬 추론 모드 활성화됨", "并行推理模式已开启": "병렬 추론 모드 활성화됨",
"底模缺失,无法加载相应 LoRA 权重": "기본 모델이 없어서 해당 LoRA 가중치를 로드할 수 없습니다",
"开启": "켜기", "开启": "켜기",
"开启无参考文本模式。不填参考文本亦相当于开启。": "참고 텍스트 없이 모드를 활성화합니다. 참고 텍스트를 입력하지 않으면 자동으로 활성화됩니다.", "开启无参考文本模式。不填参考文本亦相当于开启。": "참고 텍스트 없이 모드를 활성화합니다. 참고 텍스트를 입력하지 않으면 자동으로 활성화됩니다.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "병렬 추론 모드가 활성화된 경우, SoVITS V3/4 모델은 버킷 처리를 지원하지 않으며, 자동으로 비활성화됩니다.",
"微调模型信息": "미세 조정(Fine-tuning) 모델 정보",
"微调训练": "미세 조정 훈련", "微调训练": "미세 조정 훈련",
"怎么切": "자르기 옵션", "怎么切": "자르기 옵션",
"总训练轮数total_epoch": "총 훈련 라운드 수 (total_epoch)", "总训练轮数total_epoch": "총 훈련 라운드 수 (total_epoch)",
@@ -150,8 +140,8 @@
"模型": "모델", "模型": "모델",
"模型分为三类:": "모델은 3가지로 나뉩니다:", "模型分为三类:": "모델은 3가지로 나뉩니다:",
"模型切换": "모델 전환", "模型切换": "모델 전환",
"模型加载中,请等待": "모델을 불러오는 중입니다. 잠시 기다려주세요...",
"每张显卡的batch_size": "각 그래픽 카드의 배치 크기", "每张显卡的batch_size": "각 그래픽 카드의 배치 크기",
"版本": "버전",
"粤英混合": "粤영 혼합", "粤英混合": "粤영 혼합",
"粤语": "粤語", "粤语": "粤語",
"终止合成": "합성 종료", "终止合成": "합성 종료",
@@ -160,7 +150,6 @@
"缺少音素数据集": "음소 데이터셋이 없습니다", "缺少音素数据集": "음소 데이터셋이 없습니다",
"缺少音频数据集": "오디오 데이터셋이 없습니다", "缺少音频数据集": "오디오 데이터셋이 없습니다",
"英文": "영어", "英文": "영어",
"训练模型的版本": "학습된 모델 버전",
"训练集格式化一键三连": "훈련 데이터셋 포맷팅 원클릭 실행", "训练集格式化一键三连": "훈련 데이터셋 포맷팅 원클릭 실행",
"训练集格式化工具": "훈련 데이터셋 포맷팅 도구", "训练集格式化工具": "훈련 데이터셋 포맷팅 도구",
"语义Token提取": "의미론적 토큰 추출", "语义Token提取": "의미론적 토큰 추출",
@@ -174,6 +163,7 @@
"语音识别": "음성 인식", "语音识别": "음성 인식",
"语音识别工具": "음성 인식 도구", "语音识别工具": "음성 인식 도구",
"语音降噪": "음성 잡음 제거", "语音降噪": "음성 잡음 제거",
"语音降噪工具": "음성 잡음 제거 도구",
"请上传3~10秒内参考音频超过会报错": "3~10초 이내의 참고 오디오를 업로드하십시오. 초과하면 오류가 발생합니다!", "请上传3~10秒内参考音频超过会报错": "3~10초 이내의 참고 오디오를 업로드하십시오. 초과하면 오류가 발생합니다!",
"请上传参考音频": "참고 오디오를 업로드하세요", "请上传参考音频": "참고 오디오를 업로드하세요",
"请填入推理文本": "목표 텍스트를 입력하세요", "请填入推理文本": "목표 텍스트를 입력하세요",
@@ -197,8 +187,7 @@
"进度": "진행 상태", "进度": "진행 상태",
"进程已终止": "프로세스 종료됨", "进程已终止": "프로세스 종료됨",
"进程输出信息": "프로세스 출력 정보", "进程输出信息": "프로세스 출력 정보",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "SoVITS_weights와 GPT_weights에 저장된 학습 완료 모델을 선택하세요. 기본 제공되는 모델은 베이스 모델로, 5초 Zero Shot TTS 추론을 학습 없이 체험할 수 있습니다.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "SoVITS_weights 및 GPT_weights에 저장된 훈련 완료된 모델 중 선택. 기본적으로 하나는 기본 모델이며 5초 Zero Shot TTS를 체험할 수 있습니다.",
"采样步数(仅对V3/4生效)": "샘플링 단계 수 (V3/V4에만 적용됨)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "샘플링 스텝: 노이즈가 느껴지면 증가, 느리다면 감소 시도", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "샘플링 스텝: 노이즈가 느껴지면 증가, 느리다면 감소 시도",
"重复惩罚": "반복 패널티", "重复惩罚": "반복 패널티",
"随机种子": "랜덤 시드", "随机种子": "랜덤 시드",
@@ -213,14 +202,10 @@
"音频文件不存在,跳过:": "오디오 파일이 존재하지 않음, 건너뜀: ", "音频文件不存在,跳过:": "오디오 파일이 존재하지 않음, 건너뜀: ",
"音频标注WebUI": "오디오 주석 WebUI", "音频标注WebUI": "오디오 주석 WebUI",
"音频自动切分输入路径,可文件可文件夹": "오디오 자동 분리 입력 경로, 파일 또는 폴더 가능", "音频自动切分输入路径,可文件可文件夹": "오디오 자동 분리 입력 경로, 파일 또는 폴더 가능",
"音频超分中": "오디오 슈퍼 레졸루션 처리 중", "音频超分中": "오디오 슈퍼 레졸루션 중",
"音频超采样": "오디오 업샘플링", "预训练GPT模型路径": "사전 훈련된 GPT 모델 경로",
"音频超采样(仅对V3生效))": "오디오 업샘플링 (V3에만 적용됨)", "预训练SSL模型路径": "사전 훈련된 SSL 모델 경로",
"预测语义Token": "의미 기반 토큰 예측", "预训练SoVITS-D模型路径": "사전 훈련된 SoVITS-D 모델 경로",
"预训练GPT模型路径": "사전 학습된 GPT 모델 경로", "预训练SoVITS-G模型路径": "사전 훈련된 SoVITS-G 모델 경로",
"预训练SSL模型路径": "사전 학습된 SSL 모델 경로", "预训练中文BERT模型路径": "사전 훈련된 중국어 BERT 모델 경로"
"预训练SoVITS-D模型路径": "사전 학습된 SoVITS-D 모델 경로",
"预训练SoVITS-G模型路径": "사전 학습된 SoVITS-G 모델 경로",
"预训练中文BERT模型路径": "사전 학습된 중국어 BERT 모델 경로",
"预训练模型路径": "사전 학습 모델 경로"
} }
View File
@@ -18,7 +18,6 @@
"ASR 模型": "Modelo ASR", "ASR 模型": "Modelo ASR",
"ASR 模型尺寸": "Tamanho do modelo ASR", "ASR 模型尺寸": "Tamanho do modelo ASR",
"ASR 语言设置": "Configurações de idioma do ASR", "ASR 语言设置": "Configurações de idioma do ASR",
"CPU训练,较慢": "Treinamento em CPU, mais lento",
"GPT 训练: 模型权重文件在 GPT_weights/": "Treinamento GPT: O arquivo de pesos do modelo está em GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "Treinamento GPT: O arquivo de pesos do modelo está em GPT_weights/",
"GPT模型列表": "Lista de modelos GPT", "GPT模型列表": "Lista de modelos GPT",
"GPT训练": "Treinamento GPT", "GPT训练": "Treinamento GPT",
@@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "Número da placa de vídeo, só é possível preencher com um número inteiro", "GPU卡号,只能填1个整数": "Número da placa de vídeo, só é possível preencher com um número inteiro",
"GPU卡号以-分割,每个卡号一个进程": "Número da placa de vídeo dividido por-, cada número de placa é um processo", "GPU卡号以-分割,每个卡号一个进程": "Número da placa de vídeo dividido por-, cada número de placa é um processo",
"LoRA秩": "Classificação LoRA", "LoRA秩": "Classificação LoRA",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "O modelo base do SoVITS V3 está ausente, impossibilitando o carregamento dos pesos do LoRA",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Treinamento SoVITS: O arquivo de pesos do modelo está em SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Treinamento SoVITS: O arquivo de pesos do modelo está em SoVITS_weights/",
"SoVITS模型列表": "Lista de modelos SoVITS", "SoVITS模型列表": "Lista de modelos SoVITS",
"SoVITS训练": "Treinamento SoVITS", "SoVITS训练": "Treinamento SoVITS",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Enviar texto: Salve manualmente o conteúdo de todos os campos de texto da página atual na memória e no arquivo. Se você não clicar neste botão antes ou depois de mudar de página ou sair da tela de anotação, ao retornar as alterações serão desfeitas e todo o trabalho será perdido.",
"TTS推理WebUI": "Inferência TTS WebUI", "TTS推理WebUI": "Inferência TTS WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 Separação de voz e acompanhamento & remoção de reverberação e atraso", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5 Separação de voz e acompanhamento & remoção de reverberação e atraso",
"V3不支持无参考文本模式请填写参考文本": "O modo sem texto de referência não é suportado pelo V3. Por favor, forneça um texto de referência!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Em que proporção o áudio normalizado é misturado de volta", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Em que proporção o áudio normalizado é misturado de volta",
"batch_size": "Tamanho do Lote", "batch_size": "Tamanho do Lote",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "HOP_SIZE: Como calcular a curva de volume, quanto menor a precisão, maior a quantidade de cálculos (não significa que quanto maior a precisão, melhor o efeito)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "HOP_SIZE: Como calcular a curva de volume, quanto menor a precisão, maior a quantidade de cálculos (não significa que quanto maior a precisão, melhor o efeito)",
@@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "O v3 não suporta este modo no momento. O uso resultará em erro.", "v3暂不支持该模式使用了会报错。": "O v3 não suporta este modo no momento. O uso resultará em erro.",
"v3输出如果觉得闷可以试试开超分": "Se a saída do v3 parecer abafada, tente ativar a super-resolução", "v3输出如果觉得闷可以试试开超分": "Se a saída do v3 parecer abafada, tente ativar a super-resolução",
"不切": "Não dividir", "不切": "Não dividir",
"不训练直接推v2ProPlus底模": "Usar diretamente o modelo base v2ProPlus sem treinamento!",
"不训练直接推v2Pro底模": "Usar diretamente o modelo base v2Pro sem treinamento!",
"不训练直接推v2底模": "Usar diretamente o modelo base v2 sem treinamento!",
"不训练直接推v3底模": "Usar diretamente o modelo base v3 sem treinamento!",
"中文": "Chinês", "中文": "Chinês",
"中文教程文档": "Documentação do tutorial em chinês",
"中英混合": "Mistura de Chinês e Inglês", "中英混合": "Mistura de Chinês e Inglês",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Áudio de Referência Principal (Por favor, carregue um áudio de referência entre 3-10 segundos, exceder esse limite causará um erro!)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Áudio de Referência Principal (Por favor, carregue um áudio de referência entre 3-10 segundos, exceder esse limite causará um erro!)",
"主参考音频的文本": "Texto do Áudio de Referência Principal", "主参考音频的文本": "Texto do Áudio de Referência Principal",
@@ -92,7 +87,6 @@
"句间停顿秒数": "Tempo de pausa entre frases (segundos)", "句间停顿秒数": "Tempo de pausa entre frases (segundos)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Faça upload de vários arquivos de áudio de referência arrastando e soltando-os (recomendado que sejam do mesmo gênero) e faça uma média dos seus tons. Se essa opção for deixada em branco, o tom será controlado pelo único áudio de referência à esquerda. Se estiver ajustando o modelo, é recomendado que todos os arquivos de áudio de referência tenham tons dentro do conjunto de treinamento de ajuste; o modelo pré-treinado pode ser ignorado.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Faça upload de vários arquivos de áudio de referência arrastando e soltando-os (recomendado que sejam do mesmo gênero) e faça uma média dos seus tons. Se essa opção for deixada em branco, o tom será controlado pelo único áudio de referência à esquerda. Se estiver ajustando o modelo, é recomendado que todos os arquivos de áudio de referência tenham tons dentro do conjunto de treinamento de ajuste; o modelo pré-treinado pode ser ignorado.",
"合成语音": "Voz sintetizada", "合成语音": "Voz sintetizada",
"合成音频": "Sintetizar áudio",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemplo de formato de caminho de pasta válido: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copie do endereço da barra do gerenciador de arquivos).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemplo de formato de caminho de pasta válido: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copie do endereço da barra do gerenciador de arquivos).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Suporte para Conversão de Fonemas, Edição Manual de Fonemas e Síntese de Fase por Fase será adicionado no futuro.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "Suporte para Conversão de Fonemas, Edição Manual de Fonemas e Síntese de Fase por Fase será adicionado no futuro.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Se não conseguir entender claramente o áudio de referência, ative esta opção. Quando ativada, o texto de referência inserido será ignorado.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Se não conseguir entender claramente o áudio de referência, ative esta opção. Quando ativada, o texto de referência inserido será ignorado.",
@@ -110,15 +104,11 @@
"已关闭": " Fechado", "已关闭": " Fechado",
"已完成": " Concluído", "已完成": " Concluído",
"已开启": " Ativado", "已开启": " Ativado",
"并行合成中": "Síntese em paralelo em andamento",
"并行推理": "Inferência Paralela", "并行推理": "Inferência Paralela",
"并行推理模式已关闭": "Modo de Inferência Paralela Desativado", "并行推理模式已关闭": "Modo de Inferência Paralela Desativado",
"并行推理模式已开启": "Modo de Inferência Paralela Ativado", "并行推理模式已开启": "Modo de Inferência Paralela Ativado",
"底模缺失,无法加载相应 LoRA 权重": "Falta o modelo base, não foi possível carregar os pesos LoRA correspondentes",
"开启": "Ativar ", "开启": "Ativar ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativar.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativar.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "Quando o modo de inferência paralela está ativado, os modelos SoVITS V3/4 não suportam processamento por lotes e esta funcionalidade foi automaticamente desativada.",
"微调模型信息": "Informações do modelo ajustado",
"微调训练": "Treinamento de ajuste fino", "微调训练": "Treinamento de ajuste fino",
"怎么切": "Como cortar", "怎么切": "Como cortar",
"总训练轮数total_epoch": "Total de epoch de treinamento", "总训练轮数total_epoch": "Total de epoch de treinamento",
@@ -150,8 +140,8 @@
"模型": "Modelo", "模型": "Modelo",
"模型分为三类:": "Modelos dividem-se em três categorias:", "模型分为三类:": "Modelos dividem-se em três categorias:",
"模型切换": "Troca de modelo", "模型切换": "Troca de modelo",
"模型加载中,请等待": "Carregando o modelo, por favor aguarde...",
"每张显卡的batch_size": "Tamanho do lote de cada placa de vídeo", "每张显卡的batch_size": "Tamanho do lote de cada placa de vídeo",
"版本": "Versão",
"粤英混合": "Mistura Yue-Inglês", "粤英混合": "Mistura Yue-Inglês",
"粤语": "Yue", "粤语": "Yue",
"终止合成": "Terminar Síntese", "终止合成": "Terminar Síntese",
@@ -160,7 +150,6 @@
"缺少音素数据集": "Conjunto de Dados de Fonemas Ausente", "缺少音素数据集": "Conjunto de Dados de Fonemas Ausente",
"缺少音频数据集": "Conjunto de Dados de Áudio Ausente", "缺少音频数据集": "Conjunto de Dados de Áudio Ausente",
"英文": "Inglês", "英文": "Inglês",
"训练模型的版本": "Versão do modelo treinado",
"训练集格式化一键三连": "Formatação do conjunto de treinamento em um clique", "训练集格式化一键三连": "Formatação do conjunto de treinamento em um clique",
"训练集格式化工具": "Ferramenta de formatação do conjunto de treinamento", "训练集格式化工具": "Ferramenta de formatação do conjunto de treinamento",
"语义Token提取": "Extração de tokens semânticos", "语义Token提取": "Extração de tokens semânticos",
@@ -174,6 +163,7 @@
"语音识别": "Reconhecimento de voz", "语音识别": "Reconhecimento de voz",
"语音识别工具": "Ferramenta de reconhecimento de voz", "语音识别工具": "Ferramenta de reconhecimento de voz",
"语音降噪": "Redução de ruído de áudio", "语音降噪": "Redução de ruído de áudio",
"语音降噪工具": "Ferramenta de redução de ruído de áudio",
"请上传3~10秒内参考音频超过会报错": "Por favor, faça upload de um áudio de referência com duração entre 3 e 10 segundos. Áudios fora dessa faixa causarão erro!", "请上传3~10秒内参考音频超过会报错": "Por favor, faça upload de um áudio de referência com duração entre 3 e 10 segundos. Áudios fora dessa faixa causarão erro!",
"请上传参考音频": "Por Favor, Carregue o Áudio de Referência", "请上传参考音频": "Por Favor, Carregue o Áudio de Referência",
"请填入推理文本": "Por Favor, Preencha o Texto de Inferência", "请填入推理文本": "Por Favor, Preencha o Texto de Inferência",
@@ -197,8 +187,7 @@
"进度": "Progresso", "进度": "Progresso",
"进程已终止": " Processo encerrado", "进程已终止": " Processo encerrado",
"进程输出信息": " Informações de saída do processo", "进程输出信息": " Informações de saída do processo",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Selecione os modelos treinados armazenados em SoVITS_weights e GPT_weights. Os padrões são modelos base para experimentar inferência Zero Shot TTS de 5 segundos sem treinamento.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Selecione os modelos armazenados em Sovits_weights e GPT_WEIGHTS. O padrão é o modelo inferior, experiência para 5 segundos de Zero Shot TTS",
"采样步数(仅对V3/4生效)": "Número de passos de amostragem (apenas válido para V3/4)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passos de Amostragem: Se parecer ruidoso, tente aumentar; se parecer lento, tente diminuir", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passos de Amostragem: Se parecer ruidoso, tente aumentar; se parecer lento, tente diminuir",
"重复惩罚": "Penalidade de Repetição", "重复惩罚": "Penalidade de Repetição",
"随机种子": "Semente Aleatória", "随机种子": "Semente Aleatória",
@@ -210,17 +199,13 @@
"韩文": "Coreano", "韩文": "Coreano",
"韩英混合": "Mistura Coreano-Inglês", "韩英混合": "Mistura Coreano-Inglês",
"音频加载失败": "Falha ao Carregar o Áudio", "音频加载失败": "Falha ao Carregar o Áudio",
"音频文件不存在,跳过:": "Arquivo de áudio não encontrado, pulando: ", "音频文件不存在,跳过:": "Arquivo de Áudio Não Encontrado, Pulando: ",
"音频标注WebUI": "WebUI de Anotação de Áudio", "音频标注WebUI": "WebUI de anotação de áudio",
"音频自动切分输入路径,可文件可文件夹": "Caminho de entrada para divisão automática de áudio (arquivo ou pasta)", "音频自动切分输入路径,可文件可文件夹": "Caminho de entrada automático de corte de áudio, pode ser um arquivo ou uma pasta",
"音频超分中": "Executando Super-Resolução de Áudio", "音频超分中": "Super-resolução de áudio em andamento",
"音频超采样": "Superamostragem de áudio",
"音频超采样(仅对V3生效))": "Superamostragem de áudio (apenas válida para V3)",
"预测语义Token": "Prever token semântico",
"预训练GPT模型路径": "Caminho do modelo GPT pré-treinado", "预训练GPT模型路径": "Caminho do modelo GPT pré-treinado",
"预训练SSL模型路径": "Caminho do modelo SSL pré-treinado", "预训练SSL模型路径": "Caminho do modelo SSL pré-treinado",
"预训练SoVITS-D模型路径": "Caminho do modelo SoVITS-D pré-treinado", "预训练SoVITS-D模型路径": "Caminho do modelo SoVITS-D pré-treinado",
"预训练SoVITS-G模型路径": "Caminho do modelo SoVITS-G pré-treinado", "预训练SoVITS-G模型路径": "Caminho do modelo SoVITS-G pré-treinado",
"预训练中文BERT模型路径": "Caminho do modelo BERT chinês pré-treinado", "预训练中文BERT模型路径": "Caminho do modelo BERT chinês pré-treinado"
"预训练模型路径": "Caminho do modelo pré-treinado"
} }
View File
@@ -18,7 +18,6 @@
"ASR 模型": "Модель ASR", "ASR 模型": "Модель ASR",
"ASR 模型尺寸": "Размер модели ASR", "ASR 模型尺寸": "Размер модели ASR",
"ASR 语言设置": "Настройки языка ASR", "ASR 语言设置": "Настройки языка ASR",
"CPU训练,较慢": "Обучение на CPU (медленнее)",
"GPT 训练: 模型权重文件在 GPT_weights/": "Обучение GPT: файлы весов модели находятся в GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "Обучение GPT: файлы весов модели находятся в GPT_weights/",
"GPT模型列表": "Список моделей GPT", "GPT模型列表": "Список моделей GPT",
"GPT训练": "Обучение GPT", "GPT训练": "Обучение GPT",
@@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "Номер GPU, можно указать только одно целое число", "GPU卡号,只能填1个整数": "Номер GPU, можно указать только одно целое число",
"GPU卡号以-分割,每个卡号一个进程": "Номера GPU разделяются дефисом, на каждый номер отдельный процесс", "GPU卡号以-分割,每个卡号一个进程": "Номера GPU разделяются дефисом, на каждый номер отдельный процесс",
"LoRA秩": "Ранг LoRA", "LoRA秩": "Ранг LoRA",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Отсутствует базовая модель SoVITS V3, невозможно загрузить соответствующие веса LoRA",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Обучение SoVITS: файлы весов модели находятся в SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Обучение SoVITS: файлы весов модели находятся в SoVITS_weights/",
"SoVITS模型列表": "Список моделей SoVITS", "SoVITS模型列表": "Список моделей SoVITS",
"SoVITS训练": "Обучение SoVITS", "SoVITS训练": "Обучение SoVITS",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Отправить текст: Вручную сохраните содержимое всех текстовых полей текущей страницы в память и файл. Если вы не нажмете эту кнопку до или после смены страницы или перед выходом из интерфейса разметки, при возврате все изменения будут отменены — работа пропадет зря.",
"TTS推理WebUI": "TTS WebUI для инференса", "TTS推理WebUI": "TTS WebUI для инференса",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 Инструмент разделения вокала и сопровождения & удаления реверберации и задержки", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5 Инструмент разделения вокала и сопровождения & удаления реверберации и задержки",
"V3不支持无参考文本模式请填写参考文本": "V3 не поддерживает режим без опорного текста. Пожалуйста, укажите опорный текст!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Какая доля нормализованного аудио смешивается", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Какая доля нормализованного аудио смешивается",
"batch_size": "размер пакета", "batch_size": "размер пакета",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Как рассчитывается кривая громкости, чем меньше, тем выше точность и больше вычислительная нагрузка (большая точность не всегда означает лучший результат)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Как рассчитывается кривая громкости, чем меньше, тем выше точность и больше вычислительная нагрузка (большая точность не всегда означает лучший результат)",
@@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3 пока не поддерживает этот режим, при использовании возникнет ошибка.", "v3暂不支持该模式使用了会报错。": "v3 пока не поддерживает этот режим, при использовании возникнет ошибка.",
"v3输出如果觉得闷可以试试开超分": "Если вывод v3 кажется глухим, попробуйте включить супер-разрешение", "v3输出如果觉得闷可以试试开超分": "Если вывод v3 кажется глухим, попробуйте включить супер-разрешение",
"不切": "Не разрезать", "不切": "Не разрезать",
"不训练直接推v2ProPlus底模": "Использовать базовую модель v2ProPlus без обучения!",
"不训练直接推v2Pro底模": "Использовать базовую модель v2Pro без обучения!",
"不训练直接推v2底模": "Использовать базовую модель v2 без обучения!",
"不训练直接推v3底模": "Использовать базовую модель v3 без обучения!",
"中文": "Китайский", "中文": "Китайский",
"中文教程文档": "Документация по китайскому руководству",
"中英混合": "Китайский и английский", "中英混合": "Китайский и английский",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Основной опорный аудиофайл (пожалуйста, загрузите опорный аудиофайл длительностью от 3 до 10 секунд, превышение этого лимита вызовет ошибку!)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Основной опорный аудиофайл (пожалуйста, загрузите опорный аудиофайл длительностью от 3 до 10 секунд, превышение этого лимита вызовет ошибку!)",
"主参考音频的文本": "Текст основного опорного аудиофайла", "主参考音频的文本": "Текст основного опорного аудиофайла",
@@ -92,7 +87,6 @@
"句间停顿秒数": "Время паузы между предложениями (в секундах)", "句间停顿秒数": "Время паузы между предложениями (в секундах)",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Необязательно: загрузите несколько файлов с эталонными аудиозаписями, перетащив их (рекомендуется одного пола), и усредните их тон. Если этот параметр не заполнен, тон будет контролироваться одной эталонной аудиозаписью слева. При тонкой настройке модели рекомендуется, чтобы все эталонные аудиозаписи имели тон в пределах обучающего набора для тонкой настройки; предварительно обученную модель можно игнорировать.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Необязательно: загрузите несколько файлов с эталонными аудиозаписями, перетащив их (рекомендуется одного пола), и усредните их тон. Если этот параметр не заполнен, тон будет контролироваться одной эталонной аудиозаписью слева. При тонкой настройке модели рекомендуется, чтобы все эталонные аудиозаписи имели тон в пределах обучающего набора для тонкой настройки; предварительно обученную модель можно игнорировать.",
"合成语音": "Синтезированный голос", "合成语音": "Синтезированный голос",
"合成音频": "Синтезировать аудио",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Пример допустимого формата пути к папке: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (просто скопируйте из адресной строки файлового менеджера).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Пример допустимого формата пути к папке: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (просто скопируйте из адресной строки файлового менеджера).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "В будущем будет добавлена поддержка преобразования в фонемы, ручного редактирования фонемов и пошагового выполнения синтеза речи.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "В будущем будет добавлена поддержка преобразования в фонемы, ручного редактирования фонемов и пошагового выполнения синтеза речи.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Если невозможно разобрать речь в эталонном аудио (и непонятно, что писать), можно включить эту опцию. При включении вводимый эталонный текст будет игнорироваться.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Если невозможно разобрать речь в эталонном аудио (и непонятно, что писать), можно включить эту опцию. При включении вводимый эталонный текст будет игнорироваться.",
@@ -110,15 +104,11 @@
"已关闭": " Закрыто", "已关闭": " Закрыто",
"已完成": " Завершено", "已完成": " Завершено",
"已开启": " Включено", "已开启": " Включено",
"并行合成中": "Синтез в параллельном режиме",
"并行推理": "Параллельный вывод", "并行推理": "Параллельный вывод",
"并行推理模式已关闭": "Режим параллельного вывода отключен", "并行推理模式已关闭": "Режим параллельного вывода отключен",
"并行推理模式已开启": "Режим параллельного вывода включен", "并行推理模式已开启": "Режим параллельного вывода включен",
"底模缺失,无法加载相应 LoRA 权重": "Отсутствует базовая модель, не удалось загрузить соответствующие веса LoRA.",
"开启": "Включить ", "开启": "Включить ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Включить режим без референтного текста. Не заполняя референтный текст, вы также включаете этот режим.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Включить режим без референтного текста. Не заполняя референтный текст, вы также включаете этот режим.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "При включенном режиме параллельного вывода модель SoVits V3/4 не поддерживает обработку по бакетам, эта функция была автоматически отключена.",
"微调模型信息": "Информация о дообученной модели",
"微调训练": "Обучение с тонкой настройкой", "微调训练": "Обучение с тонкой настройкой",
"怎么切": "Как разрезать", "怎么切": "Как разрезать",
"总训练轮数total_epoch": "Общее количество эпох обучения total_epoch", "总训练轮数total_epoch": "Общее количество эпох обучения total_epoch",
@@ -150,8 +140,8 @@
"模型": "Модели", "模型": "Модели",
"模型分为三类:": "Модели делятся на три типа:", "模型分为三类:": "Модели делятся на три типа:",
"模型切换": "Переключение модели", "模型切换": "Переключение модели",
"模型加载中,请等待": "Модель загружается, пожалуйста, подождите...",
"每张显卡的batch_size": "Размер пакета для каждой видеокарты", "每张显卡的batch_size": "Размер пакета для каждой видеокарты",
"版本": "Версия",
"粤英混合": "Кантоно-английская смесь", "粤英混合": "Кантоно-английская смесь",
"粤语": "Кантонийский", "粤语": "Кантонийский",
"终止合成": "Завершить синтез", "终止合成": "Завершить синтез",
@@ -160,7 +150,6 @@
"缺少音素数据集": "Отсутствует набор данных фонем", "缺少音素数据集": "Отсутствует набор данных фонем",
"缺少音频数据集": "Отсутствует набор данных аудио", "缺少音频数据集": "Отсутствует набор данных аудио",
"英文": "Английский", "英文": "Английский",
"训练模型的版本": "Версия обученной модели",
"训练集格式化一键三连": "Форматирование обучающего набора одним кликом", "训练集格式化一键三连": "Форматирование обучающего набора одним кликом",
"训练集格式化工具": "Инструмент форматирования обучающего набора", "训练集格式化工具": "Инструмент форматирования обучающего набора",
"语义Token提取": "Извлечение семантических токенов", "语义Token提取": "Извлечение семантических токенов",
@@ -174,6 +163,7 @@
"语音识别": "Распознавание речи", "语音识别": "Распознавание речи",
"语音识别工具": "Инструмент распознавания речи", "语音识别工具": "Инструмент распознавания речи",
"语音降噪": "Шумоподавление аудиофайлов", "语音降噪": "Шумоподавление аудиофайлов",
"语音降噪工具": "Инструмент шумоподавления аудиофайлов",
"请上传3~10秒内参考音频超过会报错": "Пожалуйста, загрузите референтное аудио длительностью от 3 до 10 секунд, иначе будет ошибка!", "请上传3~10秒内参考音频超过会报错": "Пожалуйста, загрузите референтное аудио длительностью от 3 до 10 секунд, иначе будет ошибка!",
"请上传参考音频": "Пожалуйста, загрузите эталонное аудио", "请上传参考音频": "Пожалуйста, загрузите эталонное аудио",
"请填入推理文本": "Пожалуйста, введите целевой текст", "请填入推理文本": "Пожалуйста, введите целевой текст",
@@ -197,8 +187,7 @@
"进度": "Прогресс", "进度": "Прогресс",
"进程已终止": " Процесс завершён", "进程已终止": " Процесс завершён",
"进程输出信息": " Выходные данные процесса", "进程输出信息": " Выходные данные процесса",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "Выберите обученные модели в папках SoVITS_weights и GPT_weights. По умолчанию доступны базовые модели для Zero Shot TTS-синтеза за 5 секунд без обучения.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Выберите модель, сохраненную в SoVITS_weights и GPT_weights после обучения. По умолчанию используется базовая модель для 5-секундного Zero Shot TTS.",
"采样步数(仅对V3/4生效)": "Число шагов выборки (действительно только для V3/4)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Количество шагов выборки: если кажется, что шумно, попробуйте увеличить, если кажется, что медленно, попробуйте уменьшить", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Количество шагов выборки: если кажется, что шумно, попробуйте увеличить, если кажется, что медленно, попробуйте уменьшить",
"重复惩罚": "Штраф за повторение", "重复惩罚": "Штраф за повторение",
"随机种子": "Случайное начальное значение", "随机种子": "Случайное начальное значение",
@@ -210,17 +199,13 @@
"韩文": "Корейский", "韩文": "Корейский",
"韩英混合": "Корейско-английская смесь", "韩英混合": "Корейско-английская смесь",
"音频加载失败": "Не удалось загрузить аудио", "音频加载失败": "Не удалось загрузить аудио",
"音频文件不存在,跳过:": "Файл аудио не найден, пропускается: ", "音频文件不存在,跳过:": "Аудиофайл не найден, пропускается: ",
"音频标注WebUI": "Веб-интерфейс разметки аудио", "音频标注WebUI": "WebUI для аннотирования аудиофайлов",
"音频自动切分输入路径,可文件可文件夹": "Путь ввода для автоматического разделения аудио, может быть файлом или папкой", "音频自动切分输入路径,可文件可文件夹": "Путь ввода для автоматического разделения аудио, может быть файлом или папкой",
"音频超分中": "Супер-разрешение аудио в процессе", "音频超分中": "Супер-разрешение аудио в процессе",
"音频超采样": "Апсэмплирование аудио",
"音频超采样(仅对V3生效))": "Апсэмплирование аудио (действительно только для V3)",
"预测语义Token": "Предсказать семантический токен",
"预训练GPT模型路径": "Путь к предобученной модели GPT", "预训练GPT模型路径": "Путь к предобученной модели GPT",
"预训练SSL模型路径": "Путь к предобученной модели SSL", "预训练SSL模型路径": "Путь к предобученной модели SSL",
"预训练SoVITS-D模型路径": "Путь к предобученной модели SoVITS-D", "预训练SoVITS-D模型路径": "Путь к предобученной модели SoVITS-D",
"预训练SoVITS-G模型路径": "Путь к предобученной модели SoVITS-G", "预训练SoVITS-G模型路径": "Путь к предобученной модели SoVITS-G",
"预训练中文BERT模型路径": "Путь к предобученной китайской модели BERT", "预训练中文BERT模型路径": "Путь к предобученной китайской модели BERT"
"预训练模型路径": "Путь к предобученной модели"
} }
View File
@@ -18,7 +18,6 @@
"ASR 模型": "ASR modeli", "ASR 模型": "ASR modeli",
"ASR 模型尺寸": "ASR model boyutu", "ASR 模型尺寸": "ASR model boyutu",
"ASR 语言设置": "ASR dil ayarları", "ASR 语言设置": "ASR dil ayarları",
"CPU训练,较慢": "CPU ile Eğitim, daha yavaş",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT Eğitimi: Model ağırlık dosyaları GPT_weights/ içinde", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT Eğitimi: Model ağırlık dosyaları GPT_weights/ içinde",
"GPT模型列表": "GPT model listesi", "GPT模型列表": "GPT model listesi",
"GPT训练": "GPT Eğitimi", "GPT训练": "GPT Eğitimi",
@@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU kart numarası, sadece bir tamsayı girilebilir", "GPU卡号,只能填1个整数": "GPU kart numarası, sadece bir tamsayı girilebilir",
"GPU卡号以-分割,每个卡号一个进程": "GPU kart numaraları - ile ayrılır, her kart numarası için bir işlem", "GPU卡号以-分割,每个卡号一个进程": "GPU kart numaraları - ile ayrılır, her kart numarası için bir işlem",
"LoRA秩": "LoRA Derecesi", "LoRA秩": "LoRA Derecesi",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 temel modeli eksik, ilgili LoRA ağırlıkları yüklenemiyor",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Eğitimi: Model ağırlık dosyaları SoVITS_weights/ içinde", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Eğitimi: Model ağırlık dosyaları SoVITS_weights/ içinde",
"SoVITS模型列表": "SoVITS model listesi", "SoVITS模型列表": "SoVITS model listesi",
"SoVITS训练": "SoVITS Eğitimi", "SoVITS训练": "SoVITS Eğitimi",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Geçerli sayfadaki tüm metin kutusu içeriklerini manuel olarak belleğe ve dosyaya kaydedin. Bu butona sayfa değiştirmeden önce ya da etiketleme sayfasından çıkmadan önce tıklamazsanız, geri döndüğünüzde değişiklikler geri alınıp tüm işlemler boşa gidecektir.",
"TTS推理WebUI": "TTS Çıkarım WebUI", "TTS推理WebUI": "TTS Çıkarım WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 İnsan Sesli ve Enstrümantal Ayrım & Reverb ve Gecikme Giderme Aracı", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5 İnsan Sesli ve Enstrümantal Ayrım & Reverb ve Gecikme Giderme Aracı",
"V3不支持无参考文本模式请填写参考文本": "V3, referans metin olmadan çalışmayı desteklememektedir! Lütfen bir referans metin giriniz!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Normalizasyondan sonraki sesin ne kadarlık bir oranı karıştırılsın", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Normalizasyondan sonraki sesin ne kadarlık bir oranı karıştırılsın",
"batch_size": "Toplu Boyut", "batch_size": "Toplu Boyut",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Ses seviyesi eğrisi nasıl hesaplanır, ne kadar küçükse hassasiyet o kadar yüksek ve hesaplama yükü o kadar artar (hassasiyet arttıkça etki mutlaka daha iyi olmaz)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Ses seviyesi eğrisi nasıl hesaplanır, ne kadar küçükse hassasiyet o kadar yüksek ve hesaplama yükü o kadar artar (hassasiyet arttıkça etki mutlaka daha iyi olmaz)",
@@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3 bu modu desteklemiyor, kullanıldığında hata verecektir.", "v3暂不支持该模式使用了会报错。": "v3 bu modu desteklemiyor, kullanıldığında hata verecektir.",
"v3输出如果觉得闷可以试试开超分": "v3 çıkışı eğer sıkıcıysa, süper çözünürlük açmayı deneyebilirsiniz", "v3输出如果觉得闷可以试试开超分": "v3 çıkışı eğer sıkıcıysa, süper çözünürlük açmayı deneyebilirsiniz",
"不切": "Kesme", "不切": "Kesme",
"不训练直接推v2ProPlus底模": "Eğitim yapmadan doğrudan v2ProPlus temel modelini kullan!",
"不训练直接推v2Pro底模": "Eğitim yapmadan doğrudan v2Pro temel modelini kullan!",
"不训练直接推v2底模": "Eğitim yapmadan doğrudan v2 temel modelini kullan!",
"不训练直接推v3底模": "Eğitim yapmadan doğrudan v3 temel modelini kullan!",
"中文": "Çince", "中文": "Çince",
"中文教程文档": "Çince Eğitim Belgesi",
"中英混合": "Çince ve İngilizce karışık", "中英混合": "Çince ve İngilizce karışık",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "Ana Referans Sesi (Lütfen 3-10 saniye arasında referans sesi yükleyin, bu sınırı aşarsanız hata alırsınız!)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "Ana Referans Sesi (Lütfen 3-10 saniye arasında referans sesi yükleyin, bu sınırı aşarsanız hata alırsınız!)",
"主参考音频的文本": "Ana Referans Sesinin Metni", "主参考音频的文本": "Ana Referans Sesinin Metni",
@@ -92,7 +87,6 @@
"句间停顿秒数": "Cümleler Arası Duraklama Süresi", "句间停顿秒数": "Cümleler Arası Duraklama Süresi",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "İsteğe bağlı: Birden fazla referans ses dosyasını sürükleyip bırakarak yükleyin (aynı cinsiyetten olmaları önerilir) ve tonlarını ortalayın. Bu seçenek boş bırakılırsa, ton soldaki tek referans ses dosyası tarafından kontrol edilir. Modeli ince ayar yapıyorsanız, tüm referans ses dosyalarının ince ayar eğitim seti içindeki tonlara sahip olması önerilir; önceden eğitilmiş model dikkate alınmayabilir.", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "İsteğe bağlı: Birden fazla referans ses dosyasını sürükleyip bırakarak yükleyin (aynı cinsiyetten olmaları önerilir) ve tonlarını ortalayın. Bu seçenek boş bırakılırsa, ton soldaki tek referans ses dosyası tarafından kontrol edilir. Modeli ince ayar yapıyorsanız, tüm referans ses dosyalarının ince ayar eğitim seti içindeki tonlara sahip olması önerilir; önceden eğitilmiş model dikkate alınmayabilir.",
"合成语音": "Ses sentezi", "合成语音": "Ses sentezi",
"合成音频": "Ses Sentezleme",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Geçerli klasör yolu formatı örneği: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (dosya yöneticisi adres çubuğundan kopyalayabilirsiniz).", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Geçerli klasör yolu formatı örneği: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (dosya yöneticisi adres çubuğundan kopyalayabilirsiniz).",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Sonraki sürümlerde fonem dönüşümü, el ile fonem düzenleme ve adım adım konuşma sentezi desteği eklenecek.", "后续将支持转音素、手工修改音素、语音合成分步执行。": "Sonraki sürümlerde fonem dönüşümü, el ile fonem düzenleme ve adım adım konuşma sentezi desteği eklenecek.",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Referans ses kaydını anlamıyorsanız (ne yazacağınızı bilmiyorsanız) açabilirsiniz. Açıldığında yazılmış olan referans metni göz ardı edilir.", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Referans ses kaydını anlamıyorsanız (ne yazacağınızı bilmiyorsanız) açabilirsiniz. Açıldığında yazılmış olan referans metni göz ardı edilir.",
@@ -110,15 +104,11 @@
"已关闭": " Kapalı", "已关闭": " Kapalı",
"已完成": " Tamamlandı", "已完成": " Tamamlandı",
"已开启": " Açık", "已开启": " Açık",
"并行合成中": "Paralel Sentezleme Yapılıyor",
"并行推理": "Paralel Çıkarım", "并行推理": "Paralel Çıkarım",
"并行推理模式已关闭": "Paralel Çıkarım Modu Kapalı", "并行推理模式已关闭": "Paralel Çıkarım Modu Kapalı",
"并行推理模式已开启": "Paralel Çıkarım Modu Etkin", "并行推理模式已开启": "Paralel Çıkarım Modu Etkin",
"底模缺失,无法加载相应 LoRA 权重": "Temel model eksik, ilgili LoRA ağırlıkları yüklenemedi.",
"开启": "Aç ", "开启": "Aç ",
"开启无参考文本模式。不填参考文本亦相当于开启。": "Referans metni olmayan mod açık. Referans metni doldurulmazsa bu mod otomatik olarak açılır.", "开启无参考文本模式。不填参考文本亦相当于开启。": "Referans metni olmayan mod açık. Referans metni doldurulmazsa bu mod otomatik olarak açılır.",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "Paralel çıkarım modu etkinleştirildiğinde, SoVITS V3/4 modelleri bölme işlemeyi desteklemez ve bu işlem otomatik olarak devre dışı bırakılır.",
"微调模型信息": "İnce Ayar Model Bilgisi",
"微调训练": "İnce Ayar Eğitimi", "微调训练": "İnce Ayar Eğitimi",
"怎么切": "Nasıl kesilir", "怎么切": "Nasıl kesilir",
"总训练轮数total_epoch": "Toplam eğitim turu sayısı total_epoch", "总训练轮数total_epoch": "Toplam eğitim turu sayısı total_epoch",
@@ -150,8 +140,8 @@
"模型": "Model", "模型": "Model",
"模型分为三类:": "Modeller üç türdedir:", "模型分为三类:": "Modeller üç türdedir:",
"模型切换": "Model değiştirme", "模型切换": "Model değiştirme",
"模型加载中,请等待": "Model yükleniyor, lütfen bekleyin...",
"每张显卡的batch_size": "Her bir ekran kartı için batch_size", "每张显卡的batch_size": "Her bir ekran kartı için batch_size",
"版本": "Versiyon",
"粤英混合": "Yue-İngilizce Karışık", "粤英混合": "Yue-İngilizce Karışık",
"粤语": "Yue", "粤语": "Yue",
"终止合成": "Sentezi Sonlandır", "终止合成": "Sentezi Sonlandır",
@@ -160,7 +150,6 @@
"缺少音素数据集": "Fonem Veri Seti Eksik", "缺少音素数据集": "Fonem Veri Seti Eksik",
"缺少音频数据集": "Ses Veri Seti Eksik", "缺少音频数据集": "Ses Veri Seti Eksik",
"英文": "İngilizce", "英文": "İngilizce",
"训练模型的版本": "Eğitilmiş Model Versiyonu",
"训练集格式化一键三连": "Eğitim Seti Formatlama Tek Tıkla Üçlü", "训练集格式化一键三连": "Eğitim Seti Formatlama Tek Tıkla Üçlü",
"训练集格式化工具": "Eğitim Seti Formatlama Aracı", "训练集格式化工具": "Eğitim Seti Formatlama Aracı",
"语义Token提取": "Anlamlı Token Çıkartma", "语义Token提取": "Anlamlı Token Çıkartma",
@@ -174,6 +163,7 @@
"语音识别": "Ses Tanıma", "语音识别": "Ses Tanıma",
"语音识别工具": "Ses Tanıma Aracı", "语音识别工具": "Ses Tanıma Aracı",
"语音降噪": "Ses Gürültü Azaltma", "语音降噪": "Ses Gürültü Azaltma",
"语音降噪工具": "Ses Gürültü Azaltma Aracı",
"请上传3~10秒内参考音频超过会报错": "Lütfen 3~10 saniye arasında bir referans ses dosyası yükleyin, aşım durumunda hata verilecektir!", "请上传3~10秒内参考音频超过会报错": "Lütfen 3~10 saniye arasında bir referans ses dosyası yükleyin, aşım durumunda hata verilecektir!",
"请上传参考音频": "Lütfen Referans Sesi Yükleyin", "请上传参考音频": "Lütfen Referans Sesi Yükleyin",
"请填入推理文本": "Lütfen Hedef Metni Girin", "请填入推理文本": "Lütfen Hedef Metni Girin",
@@ -197,8 +187,7 @@
"进度": "İlerleme", "进度": "İlerleme",
"进程已终止": " İşlem Sonlandırıldı", "进程已终止": " İşlem Sonlandırıldı",
"进程输出信息": " İşlem Çıktı Bilgisi", "进程输出信息": " İşlem Çıktı Bilgisi",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "SoVITS_weights ve GPT_weights altında kayıtlı eğitilmiş modelleri seçin. Varsayılanlar temel modellerdir, 5 saniyede Zero Shot TTS çıkarımını eğitimsiz deneyimleyin.", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "Eğitimi tamamlanmış ve SoVITS_weights ile GPT_weights altına kaydedilmiş modeli seçin. Varsayılan bir temel modeldir, 5 saniyelik Zero Shot TTS deneyimi için kullanılır.",
"采样步数(仅对V3/4生效)": "Örnekleme Adım Sayısı (Sadece V3/4 için geçerli)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Örnekleme Adımları: Eğer gürültülü görünüyorsa, adımları artırın; eğer yavaş görünüyorsa, adımları azaltın", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Örnekleme Adımları: Eğer gürültülü görünüyorsa, adımları artırın; eğer yavaş görünüyorsa, adımları azaltın",
"重复惩罚": "Tekrarlama Cezası", "重复惩罚": "Tekrarlama Cezası",
"随机种子": "Rastgele Tohum", "随机种子": "Rastgele Tohum",
@@ -210,17 +199,13 @@
"韩文": "Korece", "韩文": "Korece",
"韩英混合": "Korece-İngilizce Karışık", "韩英混合": "Korece-İngilizce Karışık",
"音频加载失败": "Ses Yüklenemedi", "音频加载失败": "Ses Yüklenemedi",
"音频文件不存在,跳过:": "Ses dosyası bulunamadı, atlanıyor: ", "音频文件不存在,跳过:": "Ses Dosyası Bulunamadı, Atlanıyor: ",
"音频标注WebUI": "Ses Etiketleme WebUI", "音频标注WebUI": "Ses Etiketleme WebUI",
"音频自动切分输入路径,可文件可文件夹": "Ses otomatik bölme giriş yolu, dosya veya klasör olabilir", "音频自动切分输入路径,可文件可文件夹": "Ses otomatik bölme giriş yolu, dosya veya klasör olabilir",
"音频超分中": "Ses Süper Çözünürlük İşlemi Devam Ediyor", "音频超分中": "Ses Süper Çözünürlük Yapılıyor",
"音频超采样": "Ses Üst-örnekleme",
"音频超采样(仅对V3生效))": "Ses Üst-örnekleme (Sadece V3 için geçerli)",
"预测语义Token": "Anlamsal Token Tahmini",
"预训练GPT模型路径": "Önceden Eğitilmiş GPT Modeli Yolu", "预训练GPT模型路径": "Önceden Eğitilmiş GPT Modeli Yolu",
"预训练SSL模型路径": "Önceden Eğitilmiş SSL Modeli Yolu", "预训练SSL模型路径": "Önceden Eğitilmiş SSL Modeli Yolu",
"预训练SoVITS-D模型路径": "Önceden Eğitilmiş SoVITS-D Modeli Yolu", "预训练SoVITS-D模型路径": "Önceden Eğitilmiş SoVITS-D Modeli Yolu",
"预训练SoVITS-G模型路径": "Önceden Eğitilmiş SoVITS-G Modeli Yolu", "预训练SoVITS-G模型路径": "Önceden Eğitilmiş SoVITS-G Modeli Yolu",
"预训练中文BERT模型路径": "Önceden Eğitilmiş Çince BERT Modeli Yolu", "预训练中文BERT模型路径": "Önceden Eğitilmiş Çince BERT Modeli Yolu"
"预训练模型路径": "Önceden Eğitilmiş Model Yolu"
} }
View File
@@ -18,7 +18,6 @@
"ASR 模型": "ASR 模型", "ASR 模型": "ASR 模型",
"ASR 模型尺寸": "ASR 模型尺寸", "ASR 模型尺寸": "ASR 模型尺寸",
"ASR 语言设置": "ASR 语言设置", "ASR 语言设置": "ASR 语言设置",
"CPU训练,较慢": "CPU训练,较慢",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT 训练: 模型权重文件在 GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT 训练: 模型权重文件在 GPT_weights/",
"GPT模型列表": "GPT模型列表", "GPT模型列表": "GPT模型列表",
"GPT训练": "GPT训练", "GPT训练": "GPT训练",
@@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU卡号,只能填1个整数", "GPU卡号,只能填1个整数": "GPU卡号,只能填1个整数",
"GPU卡号以-分割,每个卡号一个进程": "GPU卡号以-分割,每个卡号一个进程", "GPU卡号以-分割,每个卡号一个进程": "GPU卡号以-分割,每个卡号一个进程",
"LoRA秩": "LoRA秩", "LoRA秩": "LoRA秩",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,无法加载相应 LoRA 权重",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 训练: 模型权重文件在 SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 训练: 模型权重文件在 SoVITS_weights/",
"SoVITS模型列表": "SoVITS模型列表", "SoVITS模型列表": "SoVITS模型列表",
"SoVITS训练": "SoVITS训练", "SoVITS训练": "SoVITS训练",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)",
"TTS推理WebUI": "TTS推理WebUI", "TTS推理WebUI": "TTS推理WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人声伴奏分离&去混响去延迟工具", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5人声伴奏分离&去混响去延迟工具",
"V3不支持无参考文本模式请填写参考文本": "V3不支持无参考文本模式请填写参考文本",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例归一化后音频进来", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例归一化后音频进来",
"batch_size": "batch_size", "batch_size": "batch_size",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",
@@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3暂不支持该模式使用了会报错。", "v3暂不支持该模式使用了会报错。": "v3暂不支持该模式使用了会报错。",
"v3输出如果觉得闷可以试试开超分": "v3输出如果觉得闷可以试试开超分", "v3输出如果觉得闷可以试试开超分": "v3输出如果觉得闷可以试试开超分",
"不切": "不切", "不切": "不切",
"不训练直接推v2ProPlus底模": "不训练直接推v2ProPlus底模",
"不训练直接推v2Pro底模": "不训练直接推v2Pro底模",
"不训练直接推v2底模": "不训练直接推v2底模",
"不训练直接推v3底模": "不训练直接推v3底模",
"中文": "中文", "中文": "中文",
"中文教程文档": "中文教程文档",
"中英混合": "中英混合", "中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主参考音频(请上传3~10秒内参考音频超过会报错)", "主参考音频(请上传3~10秒内参考音频超过会报错)": "主参考音频(请上传3~10秒内参考音频超过会报错)",
"主参考音频的文本": "主参考音频的文本", "主参考音频的文本": "主参考音频的文本",
@@ -92,7 +87,6 @@
"句间停顿秒数": "句间停顿秒数", "句间停顿秒数": "句间停顿秒数",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。",
"合成语音": "合成语音", "合成语音": "合成语音",
"合成音频": "合成音频",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "后续将支持转音素、手工修改音素、语音合成分步执行。", "后续将支持转音素、手工修改音素、语音合成分步执行。": "后续将支持转音素、手工修改音素、语音合成分步执行。",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。",
@@ -110,15 +104,11 @@
"已关闭": "已关闭", "已关闭": "已关闭",
"已完成": "已完成", "已完成": "已完成",
"已开启": "已开启", "已开启": "已开启",
"并行合成中": "并行合成中",
"并行推理": "并行推理", "并行推理": "并行推理",
"并行推理模式已关闭": "并行推理模式已关闭", "并行推理模式已关闭": "并行推理模式已关闭",
"并行推理模式已开启": "并行推理模式已开启", "并行推理模式已开启": "并行推理模式已开启",
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,无法加载相应 LoRA 权重",
"开启": "开启", "开启": "开启",
"开启无参考文本模式。不填参考文本亦相当于开启。": "开启无参考文本模式。不填参考文本亦相当于开启。", "开启无参考文本模式。不填参考文本亦相当于开启。": "开启无参考文本模式。不填参考文本亦相当于开启。",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理",
"微调模型信息": "微调模型信息",
"微调训练": "微调训练", "微调训练": "微调训练",
"怎么切": "怎么切", "怎么切": "怎么切",
"总训练轮数total_epoch": "总训练轮数total_epoch", "总训练轮数total_epoch": "总训练轮数total_epoch",
@@ -150,8 +140,8 @@
"模型": "模型", "模型": "模型",
"模型分为三类:": "模型分为三类:", "模型分为三类:": "模型分为三类:",
"模型切换": "模型切换", "模型切换": "模型切换",
"模型加载中,请等待": "模型加载中,请等待",
"每张显卡的batch_size": "每张显卡的batch_size", "每张显卡的batch_size": "每张显卡的batch_size",
"版本": "版本",
"粤英混合": "粤英混合", "粤英混合": "粤英混合",
"粤语": "粤语", "粤语": "粤语",
"终止合成": "终止合成", "终止合成": "终止合成",
@@ -160,7 +150,6 @@
"缺少音素数据集": "缺少音素数据集", "缺少音素数据集": "缺少音素数据集",
"缺少音频数据集": "缺少音频数据集", "缺少音频数据集": "缺少音频数据集",
"英文": "英文", "英文": "英文",
"训练模型的版本": "训练模型的版本",
"训练集格式化一键三连": "训练集格式化一键三连", "训练集格式化一键三连": "训练集格式化一键三连",
"训练集格式化工具": "训练集格式化工具", "训练集格式化工具": "训练集格式化工具",
"语义Token提取": "语义Token提取", "语义Token提取": "语义Token提取",
@@ -174,6 +163,7 @@
"语音识别": "语音识别", "语音识别": "语音识别",
"语音识别工具": "语音识别工具", "语音识别工具": "语音识别工具",
"语音降噪": "语音降噪", "语音降噪": "语音降噪",
"语音降噪工具": "语音降噪工具",
"请上传3~10秒内参考音频超过会报错": "请上传3~10秒内参考音频超过会报错", "请上传3~10秒内参考音频超过会报错": "请上传3~10秒内参考音频超过会报错",
"请上传参考音频": "请上传参考音频", "请上传参考音频": "请上传参考音频",
"请填入推理文本": "请填入推理文本", "请填入推理文本": "请填入推理文本",
@@ -197,8 +187,7 @@
"进度": "进度", "进度": "进度",
"进程已终止": "进程已终止", "进程已终止": "进程已终止",
"进程输出信息": "进程输出信息", "进程输出信息": "进程输出信息",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。",
"采样步数(仅对V3/4生效)": "采样步数(仅对V3/4生效)",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试",
"重复惩罚": "重复惩罚", "重复惩罚": "重复惩罚",
"随机种子": "随机种子", "随机种子": "随机种子",
@@ -214,13 +203,9 @@
"音频标注WebUI": "音频标注WebUI", "音频标注WebUI": "音频标注WebUI",
"音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹", "音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹",
"音频超分中": "音频超分中", "音频超分中": "音频超分中",
"音频超采样": "音频超采样",
"音频超采样(仅对V3生效))": "音频超采样(仅对V3生效))",
"预测语义Token": "预测语义Token",
"预训练GPT模型路径": "预训练GPT模型路径", "预训练GPT模型路径": "预训练GPT模型路径",
"预训练SSL模型路径": "预训练SSL模型路径", "预训练SSL模型路径": "预训练SSL模型路径",
"预训练SoVITS-D模型路径": "预训练SoVITS-D模型路径", "预训练SoVITS-D模型路径": "预训练SoVITS-D模型路径",
"预训练SoVITS-G模型路径": "预训练SoVITS-G模型路径", "预训练SoVITS-G模型路径": "预训练SoVITS-G模型路径",
"预训练中文BERT模型路径": "预训练中文BERT模型路径", "预训练中文BERT模型路径": "预训练中文BERT模型路径"
"预训练模型路径": "预训练模型路径"
} }

View File

@ -18,7 +18,6 @@
"ASR 模型": "ASR 模型", "ASR 模型": "ASR 模型",
"ASR 模型尺寸": "ASR 模型尺寸", "ASR 模型尺寸": "ASR 模型尺寸",
"ASR 语言设置": "ASR 語言設置", "ASR 语言设置": "ASR 語言設置",
"CPU训练,较慢": "CPU訓練較慢",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT 訓練: 模型權重檔案在 GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT 訓練: 模型權重檔案在 GPT_weights/",
"GPT模型列表": "GPT模型列表", "GPT模型列表": "GPT模型列表",
"GPT训练": "GPT訓練", "GPT训练": "GPT訓練",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數", "GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
"GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程", "GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
"LoRA秩": "LoRA秩", "LoRA秩": "LoRA秩",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,無法加載相應 LoRA 權重",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重檔案在 SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重檔案在 SoVITS_weights/",
"SoVITS模型列表": "SoVITS模型列表", "SoVITS模型列表": "SoVITS模型列表",
"SoVITS训练": "SoVITS訓練", "SoVITS训练": "SoVITS訓練",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "提交文字:手動儲存目前頁面所有文字方塊內容至記憶體及檔案。若在換頁前後或離開標註頁面前未按下此按鈕,當你返回時變更將會還原,工作便會白費。",
"TTS推理WebUI": "TTS推理WebUI", "TTS推理WebUI": "TTS推理WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具",
"V3不支持无参考文本模式请填写参考文本": "V3 不支援無參考文字模式,請填寫參考文字!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
"batch_size": "批次大小", "batch_size": "批次大小",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3暫不支持該模式使用了會報錯。", "v3暂不支持该模式使用了会报错。": "v3暫不支持該模式使用了會報錯。",
"v3输出如果觉得闷可以试试开超分": "v3輸出如果覺得悶可以試試開超分", "v3输出如果觉得闷可以试试开超分": "v3輸出如果覺得悶可以試試開超分",
"不切": "不切", "不切": "不切",
"不训练直接推v2ProPlus底模": "唔訓練直接推v2ProPlus底模",
"不训练直接推v2Pro底模": "唔訓練直接推v2Pro底模",
"不训练直接推v2底模": "唔訓練直接推v2底模",
"不训练直接推v3底模": "唔訓練直接推v3底模",
"中文": "中文", "中文": "中文",
"中文教程文档": "中文教程文檔",
"中英混合": "中英混合", "中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音頻請上傳3~10秒內參考音頻超過會報錯", "主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音頻請上傳3~10秒內參考音頻超過會報錯",
"主参考音频的文本": "主參考音頻的文本", "主参考音频的文本": "主參考音頻的文本",
@ -92,7 +87,6 @@
"句间停顿秒数": "句間停頓秒數", "句间停顿秒数": "句間停頓秒數",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
"合成语音": "合成語音", "合成语音": "合成語音",
"合成音频": "合成音訊",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試樣例(去文件管理器地址欄拷就行了)。", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試樣例(去文件管理器地址欄拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。", "后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開。開啟後無視填寫的參考文本。", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開。開啟後無視填寫的參考文本。",
@ -110,15 +104,11 @@
"已关闭": "已關閉", "已关闭": "已關閉",
"已完成": "已完成", "已完成": "已完成",
"已开启": "已開啟", "已开启": "已開啟",
"并行合成中": "平行合成中",
"并行推理": "並行推理", "并行推理": "並行推理",
"并行推理模式已关闭": "並行推理模式已關閉", "并行推理模式已关闭": "並行推理模式已關閉",
"并行推理模式已开启": "並行推理模式已開啟", "并行推理模式已开启": "並行推理模式已開啟",
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,無法加載相應 LoRA 權重",
"开启": "開啟", "开启": "開啟",
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。", "开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "當啟用平行推論模式時SoVITS V3/4 模型不支援分倉處理,已自動關閉分倉處理。",
"微调模型信息": "微調模型信息",
"微调训练": "微調訓練", "微调训练": "微調訓練",
"怎么切": "怎麼切", "怎么切": "怎麼切",
"总训练轮数total_epoch": "總訓練輪數total_epoch", "总训练轮数total_epoch": "總訓練輪數total_epoch",
@ -150,8 +140,8 @@
"模型": "模型", "模型": "模型",
"模型分为三类:": "模型分為三類:", "模型分为三类:": "模型分為三類:",
"模型切换": "模型切換", "模型切换": "模型切換",
"模型加载中,请等待": "模型載入中,請稍等",
"每张显卡的batch_size": "每張顯卡的batch_size", "每张显卡的batch_size": "每張顯卡的batch_size",
"版本": "版本",
"粤英混合": "粵英混合", "粤英混合": "粵英混合",
"粤语": "粵語", "粤语": "粵語",
"终止合成": "終止合成", "终止合成": "終止合成",
@ -160,7 +150,6 @@
"缺少音素数据集": "缺少音素數據集", "缺少音素数据集": "缺少音素數據集",
"缺少音频数据集": "缺少音頻數據集", "缺少音频数据集": "缺少音頻數據集",
"英文": "英文", "英文": "英文",
"训练模型的版本": "訓練模型嘅版本",
"训练集格式化一键三连": "訓練集格式化一鍵三連", "训练集格式化一键三连": "訓練集格式化一鍵三連",
"训练集格式化工具": "訓練集格式化工具", "训练集格式化工具": "訓練集格式化工具",
"语义Token提取": "語義Token提取", "语义Token提取": "語義Token提取",
@ -174,6 +163,7 @@
"语音识别": "語音識別", "语音识别": "語音識別",
"语音识别工具": "語音識別工具", "语音识别工具": "語音識別工具",
"语音降噪": "語音降噪", "语音降噪": "語音降噪",
"语音降噪工具": "語音降噪工具",
"请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯", "请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯",
"请上传参考音频": "請上傳參考音頻", "请上传参考音频": "請上傳參考音頻",
"请填入推理文本": "請填入推理文本", "请填入推理文本": "請填入推理文本",
@ -197,8 +187,7 @@
"进度": "進度", "进度": "進度",
"进程已终止": "進程已終止", "进程已终止": "進程已終止",
"进程输出信息": "進程輸出信息", "进程输出信息": "進程輸出信息",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "選擇訓練完存放在SoVITS_weights同GPT_weights下嘅模型。默認嘅幾個係底模體驗5秒Zero Shot TTS唔訓練推理用。", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模體驗5秒Zero Shot TTS用。",
"采样步数(仅对V3/4生效)": "取樣步數(僅適用於 V3/4",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
"重复惩罚": "重複懲罰", "重复惩罚": "重複懲罰",
"随机种子": "隨機種子", "随机种子": "隨機種子",
@ -211,16 +200,12 @@
"韩英混合": "韓英混合", "韩英混合": "韓英混合",
"音频加载失败": "無法加載音頻", "音频加载失败": "無法加載音頻",
"音频文件不存在,跳过:": "音頻檔案不存在,跳過:", "音频文件不存在,跳过:": "音頻檔案不存在,跳過:",
"音频标注WebUI": "音頻標註 WebUI", "音频标注WebUI": "音頻標註WebUI",
"音频自动切分输入路径,可文件可文件夹": "音頻自動分割輸入路徑,可為檔案或資料夾", "音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
"音频超分中": "音頻超高解像度處理中", "音频超分中": "音頻超分中",
"音频超采样": "音頻超取樣", "预训练GPT模型路径": "預訓練GPT模型路徑",
"音频超采样(仅对V3生效))": "音頻超取樣(僅適用於 V3", "预训练SSL模型路径": "預訓練SSL模型路徑",
"预测语义Token": "預測語意 Token", "预训练SoVITS-D模型路径": "預訓練SoVITS-D模型路徑",
"预训练GPT模型路径": "預訓練 GPT 模型路徑", "预训练SoVITS-G模型路径": "預訓練SoVITS-G模型路徑",
"预训练SSL模型路径": "預訓練 SSL 模型路徑", "预训练中文BERT模型路径": "預訓練中文BERT模型路徑"
"预训练SoVITS-D模型路径": "預訓練 SoVITS-D 模型路徑",
"预训练SoVITS-G模型路径": "預訓練 SoVITS-G 模型路徑",
"预训练中文BERT模型路径": "預訓練中文BERT模型路徑",
"预训练模型路径": "預訓練模型路徑"
} }

View File

@ -18,7 +18,6 @@
"ASR 模型": "ASR 模型", "ASR 模型": "ASR 模型",
"ASR 模型尺寸": "ASR 模型尺寸", "ASR 模型尺寸": "ASR 模型尺寸",
"ASR 语言设置": "ASR 語言設定", "ASR 语言设置": "ASR 語言設定",
"CPU训练,较慢": "CPU训练速度较慢",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT 训练: 模型权重文件在 GPT_weights/ 目錄下", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT 训练: 模型权重文件在 GPT_weights/ 目錄下",
"GPT模型列表": "GPT模型列表", "GPT模型列表": "GPT模型列表",
"GPT训练": "GPT訓練", "GPT训练": "GPT訓練",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數", "GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
"GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程", "GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
"LoRA秩": "LoRA秩", "LoRA秩": "LoRA秩",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,無法加載相應 LoRA 權重",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重文件在 SoVITS_weights/ 目錄下", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重文件在 SoVITS_weights/ 目錄下",
"SoVITS模型列表": "SoVITS模型列表", "SoVITS模型列表": "SoVITS模型列表",
"SoVITS训练": "SoVITS訓練", "SoVITS训练": "SoVITS訓練",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "提交文字:手动保存当前页面所有文本框内容至内存和文件。若您在换页前后或退出标注页面前没有点击此按钮,再次返回时更改将会被撤销,您的工作便会白费。",
"TTS推理WebUI": "TTS推理WebUI", "TTS推理WebUI": "TTS推理WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具",
"V3不支持无参考文本模式请填写参考文本": "V3 不支持无参考文本模式,请填写参考文本!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
"batch_size": "批次大小", "batch_size": "批次大小",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3暫不支援該模式使用了會報錯。", "v3暂不支持该模式使用了会报错。": "v3暫不支援該模式使用了會報錯。",
"v3输出如果觉得闷可以试试开超分": "v3輸出如果覺得悶可以試試開超分", "v3输出如果觉得闷可以试试开超分": "v3輸出如果覺得悶可以試試開超分",
"不切": "不切", "不切": "不切",
"不训练直接推v2ProPlus底模": "不训练直接使用v2ProPlus基础模型",
"不训练直接推v2Pro底模": "不训练直接使用v2Pro基础模型",
"不训练直接推v2底模": "不训练直接使用v2基础模型",
"不训练直接推v3底模": "不训练直接使用v3基础模型",
"中文": "中文", "中文": "中文",
"中文教程文档": "中文教程文檔",
"中英混合": "中英混合", "中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音檔請上傳3~10秒內參考音檔超過會報錯", "主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音檔請上傳3~10秒內參考音檔超過會報錯",
"主参考音频的文本": "主參考音檔的文本", "主参考音频的文本": "主參考音檔的文本",
@ -92,7 +87,6 @@
"句间停顿秒数": "句間停頓秒數", "句间停顿秒数": "句間停頓秒數",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
"合成语音": "合成語音", "合成语音": "合成語音",
"合成音频": "合成音频",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。", "后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開。開啟後無視填寫的參考文本。", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開。開啟後無視填寫的參考文本。",
@ -110,15 +104,11 @@
"已关闭": "已關閉", "已关闭": "已關閉",
"已完成": "已完成", "已完成": "已完成",
"已开启": "已開啟", "已开启": "已開啟",
"并行合成中": "并行合成中",
"并行推理": "並行推理", "并行推理": "並行推理",
"并行推理模式已关闭": "並行推理模式已關閉", "并行推理模式已关闭": "並行推理模式已關閉",
"并行推理模式已开启": "並行推理模式已開啟", "并行推理模式已开启": "並行推理模式已開啟",
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,无法加载相应 LoRA 权重",
"开启": "開啟", "开启": "開啟",
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。", "开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "当启用并行推理模式时SoVits V3/4 模型不支持分桶处理,已自动关闭分桶处理。",
"微调模型信息": "微调模型信息",
"微调训练": "微調訓練", "微调训练": "微調訓練",
"怎么切": "怎麼切", "怎么切": "怎麼切",
"总训练轮数total_epoch": "總訓練輪數total_epoch", "总训练轮数total_epoch": "總訓練輪數total_epoch",
@ -150,8 +140,8 @@
"模型": "模型", "模型": "模型",
"模型分为三类:": "模型分為三類:", "模型分为三类:": "模型分為三類:",
"模型切换": "模型切換", "模型切换": "模型切換",
"模型加载中,请等待": "模型加载中,请等待",
"每张显卡的batch_size": "每張顯卡的batch_size", "每张显卡的batch_size": "每張顯卡的batch_size",
"版本": "版本",
"粤英混合": "粵英混合", "粤英混合": "粵英混合",
"粤语": "粵語", "粤语": "粵語",
"终止合成": "終止合成", "终止合成": "終止合成",
@ -160,7 +150,6 @@
"缺少音素数据集": "缺少音素數據集", "缺少音素数据集": "缺少音素數據集",
"缺少音频数据集": "缺少音頻數據集", "缺少音频数据集": "缺少音頻數據集",
"英文": "英文", "英文": "英文",
"训练模型的版本": "训练模型的版本",
"训练集格式化一键三连": "訓練集格式化一鍵三連", "训练集格式化一键三连": "訓練集格式化一鍵三連",
"训练集格式化工具": "訓練集格式化工具", "训练集格式化工具": "訓練集格式化工具",
"语义Token提取": "語義Token提取", "语义Token提取": "語義Token提取",
@ -174,6 +163,7 @@
"语音识别": "語音識別", "语音识别": "語音識別",
"语音识别工具": "語音識別工具", "语音识别工具": "語音識別工具",
"语音降噪": "語音降噪", "语音降噪": "語音降噪",
"语音降噪工具": "語音降噪工具",
"请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯", "请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯",
"请上传参考音频": "請上傳參考音頻", "请上传参考音频": "請上傳參考音頻",
"请填入推理文本": "請填入推理文本", "请填入推理文本": "請填入推理文本",
@ -197,8 +187,7 @@
"进度": "進度", "进度": "進度",
"进程已终止": "進程已終止", "进程已终止": "進程已終止",
"进程输出信息": "進程輸出信息", "进程输出信息": "進程輸出信息",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "选择训练完成后存储在SoVITS_weights和GPT_weights下的模型。默认的几个是基础模型用于体验5秒Zero Shot TTS不训练推理。", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模體驗5秒Zero Shot TTS用。",
"采样步数(仅对V3/4生效)": "采样步数(仅适用于 V3/4",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
"重复惩罚": "重複懲罰", "重复惩罚": "重複懲罰",
"随机种子": "隨機種子", "随机种子": "隨機種子",
@ -210,17 +199,13 @@
"韩文": "韓文", "韩文": "韓文",
"韩英混合": "韓英混合", "韩英混合": "韓英混合",
"音频加载失败": "無法加載音頻", "音频加载失败": "無法加載音頻",
"音频文件不存在,跳过:": "音频文件不存在,跳过:", "音频文件不存在,跳过:": "音檔不存在,跳過:",
"音频标注WebUI": "音频标注 WebUI", "音频标注WebUI": "音頻標註WebUI",
"音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹", "音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
"音频超分中": "音频超分辨率处理中", "音频超分中": "音頻超分中",
"音频超采样": "音频超采样", "预训练GPT模型路径": "預訓練GPT模型路徑",
"音频超采样(仅对V3生效))": "音频超采样(仅适用于 V3", "预训练SSL模型路径": "預訓練SSL模型路徑",
"预测语义Token": "预测语义 Token", "预训练SoVITS-D模型路径": "預訓練SoVITS-D模型路徑",
"预训练GPT模型路径": "预训练 GPT 模型路径", "预训练SoVITS-G模型路径": "預訓練SoVITS-G模型路徑",
"预训练SSL模型路径": "预训练 SSL 模型路径", "预训练中文BERT模型路径": "預訓練中文BERT模型路徑"
"预训练SoVITS-D模型路径": "预训练 SoVITS-D 模型路径",
"预训练SoVITS-G模型路径": "预训练 SoVITS-G 模型路径",
"预训练中文BERT模型路径": "预训练中文BERT模型路径",
"预训练模型路径": "预训练模型路径"
} }

View File

@ -18,7 +18,6 @@
"ASR 模型": "ASR 模型", "ASR 模型": "ASR 模型",
"ASR 模型尺寸": "ASR 模型尺寸", "ASR 模型尺寸": "ASR 模型尺寸",
"ASR 语言设置": "ASR 語言設置", "ASR 语言设置": "ASR 語言設置",
"CPU训练,较慢": "CPU訓練較慢",
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT 訓練: 模型權重文件在 GPT_weights/", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT 訓練: 模型權重文件在 GPT_weights/",
"GPT模型列表": "GPT模型列表", "GPT模型列表": "GPT模型列表",
"GPT训练": "GPT訓練", "GPT训练": "GPT訓練",
@ -26,13 +25,12 @@
"GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數", "GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
"GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程", "GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
"LoRA秩": "LoRA階", "LoRA秩": "LoRA階",
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,無法加載相應 LoRA 權重",
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重文件在 SoVITS_weights/", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重文件在 SoVITS_weights/",
"SoVITS模型列表": "SoVITS模型列表", "SoVITS模型列表": "SoVITS模型列表",
"SoVITS训练": "SoVITS訓練", "SoVITS训练": "SoVITS訓練",
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "送出文字:手動儲存目前頁面所有文字欄位內容至記憶體與檔案。若您在換頁前後或離開標註頁面前未按下此按鈕,當您返回時異動將會復原,辛苦的工作便會白費。",
"TTS推理WebUI": "TTS推理WebUI", "TTS推理WebUI": "TTS推理WebUI",
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具",
"V3不支持无参考文本模式请填写参考文本": "V3 不支援無參考文字模式,請填寫參考文字!",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來", "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
"batch_size": "批次大小", "batch_size": "批次大小",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)", "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
@ -47,11 +45,8 @@
"v3暂不支持该模式使用了会报错。": "v3暫不支援該模式使用了會報錯。", "v3暂不支持该模式使用了会报错。": "v3暫不支援該模式使用了會報錯。",
"v3输出如果觉得闷可以试试开超分": "v3輸出如果覺得悶可以試試開超分", "v3输出如果觉得闷可以试试开超分": "v3輸出如果覺得悶可以試試開超分",
"不切": "不切", "不切": "不切",
"不训练直接推v2ProPlus底模": "不訓練直接使用v2ProPlus底模",
"不训练直接推v2Pro底模": "不訓練直接使用v2Pro底模",
"不训练直接推v2底模": "不訓練直接使用v2底模",
"不训练直接推v3底模": "不訓練直接使用v3底模",
"中文": "中文", "中文": "中文",
"中文教程文档": "中文教程文檔",
"中英混合": "中英混合", "中英混合": "中英混合",
"主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音檔請上傳3~10秒內參考音檔超過會報錯", "主参考音频(请上传3~10秒内参考音频超过会报错)": "主參考音檔請上傳3~10秒內參考音檔超過會報錯",
"主参考音频的文本": "主參考音檔的文字", "主参考音频的文本": "主參考音檔的文字",
@ -92,7 +87,6 @@
"句间停顿秒数": "句間停頓秒數", "句间停顿秒数": "句間停頓秒數",
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。", "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
"合成语音": "合成語音", "合成语音": "合成語音",
"合成音频": "合成音訊",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。", "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。", "后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。",
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。", "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。",
@ -110,15 +104,11 @@
"已关闭": "已關閉", "已关闭": "已關閉",
"已完成": "已完成", "已完成": "已完成",
"已开启": "已開啟", "已开启": "已開啟",
"并行合成中": "平行合成中",
"并行推理": "並行推理", "并行推理": "並行推理",
"并行推理模式已关闭": "並行推理模式已關閉", "并行推理模式已关闭": "並行推理模式已關閉",
"并行推理模式已开启": "並行推理模式已開啟", "并行推理模式已开启": "並行推理模式已開啟",
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,無法載入相應 LoRA 權重",
"开启": "開啟", "开启": "開啟",
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。", "开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
"当开启并行推理模式时SoVits V3/4模型不支持分桶处理已自动关闭分桶处理": "啟用平行推論模式時SoVITS V3/4 模型不支援分倉處理,已自動關閉該功能。",
"微调模型信息": "微調模型資訊",
"微调训练": "微調訓練", "微调训练": "微調訓練",
"怎么切": "怎麼切", "怎么切": "怎麼切",
"总训练轮数total_epoch": "總訓練輪數total_epoch", "总训练轮数total_epoch": "總訓練輪數total_epoch",
@ -150,8 +140,8 @@
"模型": "模型", "模型": "模型",
"模型分为三类:": "模型分為三類:", "模型分为三类:": "模型分為三類:",
"模型切换": "模型切換", "模型切换": "模型切換",
"模型加载中,请等待": "模型載入中,請稍候",
"每张显卡的batch_size": "每張顯卡的batch_size", "每张显卡的batch_size": "每張顯卡的batch_size",
"版本": "版本",
"粤英混合": "粵英混合", "粤英混合": "粵英混合",
"粤语": "粵語", "粤语": "粵語",
"终止合成": "終止合成", "终止合成": "終止合成",
@ -160,7 +150,6 @@
"缺少音素数据集": "缺少音素數據集", "缺少音素数据集": "缺少音素數據集",
"缺少音频数据集": "缺少音頻數據集", "缺少音频数据集": "缺少音頻數據集",
"英文": "英文", "英文": "英文",
"训练模型的版本": "訓練模型的版本",
"训练集格式化一键三连": "訓練集格式化一鍵三連", "训练集格式化一键三连": "訓練集格式化一鍵三連",
"训练集格式化工具": "訓練集格式化工具", "训练集格式化工具": "訓練集格式化工具",
"语义Token提取": "語義Token提取", "语义Token提取": "語義Token提取",
@ -174,6 +163,7 @@
"语音识别": "語音識別", "语音识别": "語音識別",
"语音识别工具": "語音識別工具", "语音识别工具": "語音識別工具",
"语音降噪": "語音降噪", "语音降噪": "語音降噪",
"语音降噪工具": "語音降噪工具",
"请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯", "请上传3~10秒内参考音频超过会报错": "請上傳3~10秒內參考音頻超過會報錯",
"请上传参考音频": "請上傳參考音頻", "请上传参考音频": "請上傳參考音頻",
"请填入推理文本": "請填入推理文本", "请填入推理文本": "請填入推理文本",
@ -197,8 +187,7 @@
"进度": "進度", "进度": "進度",
"进程已终止": "進程已終止", "进程已终止": "進程已終止",
"进程输出信息": "進程輸出資訊", "进程输出信息": "進程輸出資訊",
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模体验5秒Zero Shot TTS不训练推理用。": "選擇訓練完成存放在SoVITS_weights和GPT_weights下的模型。預設的幾個是底模體驗5秒Zero Shot TTS不訓練推理使用。", "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模體驗5秒Zero Shot TTS用。",
"采样步数(仅对V3/4生效)": "取樣步數(僅適用於 V3/4",
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
"重复惩罚": "重複懲罰", "重复惩罚": "重複懲罰",
"随机种子": "隨機種子", "随机种子": "隨機種子",
@ -210,17 +199,13 @@
"韩文": "韓文", "韩文": "韓文",
"韩英混合": "韓英混合", "韩英混合": "韓英混合",
"音频加载失败": "無法加載音頻", "音频加载失败": "無法加載音頻",
"音频文件不存在,跳过:": "音訊檔案不存在,跳過:", "音频文件不存在,跳过:": "音檔不存在,跳過:",
"音频标注WebUI": "音訊標註 WebUI", "音频标注WebUI": "音頻標註WebUI",
"音频自动切分输入路径,可文件可文件夹": "音訊自動切割輸入路徑,可為檔案或資料夾", "音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
"音频超分中": "音訊超高解析度處理中", "音频超分中": "音頻超分中",
"音频超采样": "音訊超取樣", "预训练GPT模型路径": "預訓練GPT模型路徑",
"音频超采样(仅对V3生效))": "音訊超取樣(僅適用於 V3", "预训练SSL模型路径": "預訓練SSL模型路徑",
"预测语义Token": "預測語意 Token", "预训练SoVITS-D模型路径": "預訓練SoVITS-D模型路徑",
"预训练GPT模型路径": "預訓練 GPT 模型路徑", "预训练SoVITS-G模型路径": "預訓練SoVITS-G模型路徑",
"预训练SSL模型路径": "預訓練 SSL 模型路徑", "预训练中文BERT模型路径": "預訓練中文BERT模型路徑"
"预训练SoVITS-D模型路径": "預訓練 SoVITS-D 模型路徑",
"预训练SoVITS-G模型路径": "預訓練 SoVITS-G 模型路徑",
"预训练中文BERT模型路径": "預訓練中文BERT模型路徑",
"预训练模型路径": "預訓練模型路徑"
} }

View File

@ -1,14 +1,10 @@
import ctypes
import os import os
import sys import traceback
from pathlib import Path
import ffmpeg import ffmpeg
import gradio as gr
import numpy as np import numpy as np
import pandas as pd import gradio as gr
from tools.i18n.i18n import I18nAuto from tools.i18n.i18n import I18nAuto
import pandas as pd
i18n = I18nAuto(language=os.environ.get("language", "Auto")) i18n = I18nAuto(language=os.environ.get("language", "Auto"))
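The locale tables earlier in this diff are keyed by their Simplified Chinese source strings, and I18nAuto is the lookup layer the tools call into, as in the line above. A minimal usage sketch, assuming the usual behavior that a missing key falls back to the source string; the locale code is assumed, and the example mapping is taken from the Traditional Chinese tables shown above:

from tools.i18n.i18n import I18nAuto

# "Auto" (the default above) picks a locale from the environment; a concrete
# code such as "zh_TW" selects one of the translation tables directly.
i18n = I18nAuto(language="zh_TW")
print(i18n("音频加载失败"))  # the Traditional Chinese tables above map this to "無法加載音頻"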
@ -19,7 +15,7 @@ def load_audio(file, sr):
# This launches a subprocess to decode audio while down-mixing and resampling as necessary. # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
if os.path.exists(file) is False: if os.path.exists(file) == False:
raise RuntimeError("You input a wrong audio path that does not exists, please fix it!") raise RuntimeError("You input a wrong audio path that does not exists, please fix it!")
out, _ = ( out, _ = (
ffmpeg.input(file, threads=0) ffmpeg.input(file, threads=0)
@ -27,11 +23,7 @@ def load_audio(file, sr):
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
) )
except Exception: except Exception:
out, _ = ( traceback.print_exc()
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True)
) # Expose the Error
raise RuntimeError(i18n("音频加载失败")) raise RuntimeError(i18n("音频加载失败"))
return np.frombuffer(out, np.float32).flatten() return np.frombuffer(out, np.float32).flatten()
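Both sides of this hunk decode audio the same way: ffmpeg-python spawns the ffmpeg CLI, down-mixes to mono, resamples to the requested rate, and streams raw 32-bit float PCM back over stdout. A standalone sketch of that pattern, assuming the ffmpeg CLI is on PATH (the function name below is illustrative, not part of the repo):

import ffmpeg
import numpy as np

def decode_to_float32(path: str, sr: int = 32000) -> np.ndarray:
    # Ask ffmpeg for mono (ac=1) raw float32 PCM (f32le) at the target rate,
    # written to stdout ("-"), without touching stdin.
    out, _ = (
        ffmpeg.input(path, threads=0)
        .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
    )
    return np.frombuffer(out, np.float32).flatten()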
@ -109,7 +101,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
if os.path.exists(wav_path): if os.path.exists(wav_path):
... ...
else: else:
gr.Warning(wav_path + i18n("路径错误")) gr.Warning(i18n("路径错误"))
return return
if is_train: if is_train:
path_list.append(os.path.join(path_list[0], "2-name2text.txt")) path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
@ -135,97 +127,3 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
... ...
else: else:
gr.Warning(i18n("缺少语义数据集")) gr.Warning(i18n("缺少语义数据集"))
def load_cudnn():
import torch
if not torch.cuda.is_available():
print("[INFO] CUDA is not available, skipping cuDNN setup.")
return
if sys.platform == "win32":
torch_lib_dir = Path(torch.__file__).parent / "lib"
if torch_lib_dir.exists():
os.add_dll_directory(str(torch_lib_dir))
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
matching_files = sorted(torch_lib_dir.glob("cudnn_cnn*.dll"))
if not matching_files:
print(f"[ERROR] No cudnn_cnn*.dll found in {torch_lib_dir}")
return
for dll_path in matching_files:
dll_name = os.path.basename(dll_path)
try:
ctypes.CDLL(dll_name)
print(f"[INFO] Loaded: {dll_name}")
except OSError as e:
print(f"[WARNING] Failed to load {dll_name}: {e}")
else:
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
elif sys.platform == "linux":
site_packages = Path(torch.__file__).resolve().parents[1]
cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib"
if not cudnn_dir.exists():
print(f"[ERROR] cudnn dir not found: {cudnn_dir}")
return
matching_files = sorted(cudnn_dir.glob("libcudnn_cnn*.so*"))
if not matching_files:
print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}")
return
for so_path in matching_files:
try:
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
print(f"[INFO] Loaded: {so_path}")
except OSError as e:
print(f"[WARNING] Failed to load {so_path}: {e}")
def load_nvrtc():
import torch
if not torch.cuda.is_available():
print("[INFO] CUDA is not available, skipping nvrtc setup.")
return
if sys.platform == "win32":
torch_lib_dir = Path(torch.__file__).parent / "lib"
if torch_lib_dir.exists():
os.add_dll_directory(str(torch_lib_dir))
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
matching_files = sorted(torch_lib_dir.glob("nvrtc*.dll"))
if not matching_files:
print(f"[ERROR] No nvrtc*.dll found in {torch_lib_dir}")
return
for dll_path in matching_files:
dll_name = os.path.basename(dll_path)
try:
ctypes.CDLL(dll_name)
print(f"[INFO] Loaded: {dll_name}")
except OSError as e:
print(f"[WARNING] Failed to load {dll_name}: {e}")
else:
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
elif sys.platform == "linux":
site_packages = Path(torch.__file__).resolve().parents[1]
nvrtc_dir = site_packages / "nvidia" / "cuda_nvrtc" / "lib"
if not nvrtc_dir.exists():
print(f"[ERROR] nvrtc dir not found: {nvrtc_dir}")
return
matching_files = sorted(nvrtc_dir.glob("libnvrtc*.so*"))
if not matching_files:
print(f"[ERROR] No libnvrtc*.so* found in {nvrtc_dir}")
return
for so_path in matching_files:
try:
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
print(f"[INFO] Loaded: {so_path}")
except OSError as e:
print(f"[WARNING] Failed to load {so_path}: {e}")

View File

@ -1,12 +1,7 @@
import sys
from tools.i18n.i18n import I18nAuto, scan_language_list
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
i18n = I18nAuto(language=language)
import argparse import argparse
import os
import copy import copy
import json import json
import os
import uuid import uuid
try: try:
@ -16,8 +11,8 @@ try:
except: except:
... ...
import gradio as gr
import librosa import librosa
import gradio as gr
import numpy as np import numpy as np
import soundfile import soundfile
@ -308,12 +303,7 @@ if __name__ == "__main__":
set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch) set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch)
with gr.Blocks(analytics_enabled=False) as demo: with gr.Blocks() as demo:
gr.Markdown(
value=i18n(
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)"
)
)
with gr.Row(): with gr.Row():
btn_change_index = gr.Button("Change Index") btn_change_index = gr.Button("Change Index")
btn_submit_change = gr.Button("Submit Text") btn_submit_change = gr.Button("Submit Text")
@ -419,7 +409,7 @@ if __name__ == "__main__":
demo.launch( demo.launch(
server_name="0.0.0.0", server_name="0.0.0.0",
inbrowser=True, inbrowser=True,
# quiet=True, quiet=True,
share=eval(args.is_share), share=eval(args.is_share),
server_port=int(args.webui_port_subfix), server_port=int(args.webui_port_subfix),
) )

View File

@ -32,10 +32,18 @@ def make_pair(mix_dir, inst_dir):
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted( X_list = sorted(
[os.path.join(mix_dir, fname) for fname in os.listdir(mix_dir) if os.path.splitext(fname)[1] in input_exts] [
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts
]
) )
y_list = sorted( y_list = sorted(
[os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts] [
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
) )
filelist = list(zip(X_list, y_list)) filelist = list(zip(X_list, y_list))
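Only the line wrapping changes in this hunk; in both versions make_pair() globs each directory for the supported extensions, sorts the two lists, and zips them, so mixtures and instrumentals are matched purely by sort order. An illustrative call with hypothetical paths:

# Returns [(mix_path, inst_path), ...]; correct pairing depends on both folders
# holding the same filenames so the sorted lists line up one-to-one.
pairs = make_pair("dataset/training/mixtures", "dataset/training/instruments")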
@ -57,10 +65,14 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
train_filelist = filelist[:-val_size] train_filelist = filelist[:-val_size]
val_filelist = filelist[-val_size:] val_filelist = filelist[-val_size:]
else: else:
train_filelist = [pair for pair in filelist if list(pair) not in val_filelist] train_filelist = [
pair for pair in filelist if list(pair) not in val_filelist
]
elif split_mode == "subdirs": elif split_mode == "subdirs":
if len(val_filelist) != 0: if len(val_filelist) != 0:
raise ValueError("The `val_filelist` option is not available in `subdirs` mode") raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
train_filelist = make_pair( train_filelist = make_pair(
os.path.join(dataset_dir, "training/mixtures"), os.path.join(dataset_dir, "training/mixtures"),
@ -79,7 +91,9 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X)) perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)): for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate: if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask) y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
if np.random.uniform() < 0.5: if np.random.uniform() < 0.5:
# swap channel # swap channel
@ -138,7 +152,9 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = [] patch_list = []
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset) patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
os.makedirs(patch_dir, exist_ok=True) os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)): for i, (X_path, y_path) in enumerate(tqdm(filelist)):
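The validation-patch cache directory name is just the hyperparameters concatenated, so changing any of them causes the patches to be regenerated under a new directory. A tiny illustration with hypothetical values:

# cropsize=256, sr=44100, hop_length=1024, n_fft=2048, offset=64 (example values only)
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(256, 44100, 1024, 2048, 64)
# -> "cs256_sr44100_hl1024_nf2048_of64"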

View File

@ -63,7 +63,9 @@ class Encoder(nn.Module):
class Decoder(nn.Module): class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -89,14 +91,24 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
) )
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) self.conv3 = SeperableConv2DBNActiv(
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) )
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x): def forward(self, x):
_, _, h, w = x.size() _, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x) feat2 = self.conv2(x)
feat3 = self.conv3(x) feat3 = self.conv3(x)
feat4 = self.conv4(x) feat4 = self.conv4(x)
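The reformatting above makes the ASPP structure easier to read: one globally pooled 1x1 branch, one plain 1x1 branch, and three dilated separable-conv branches, all producing nin channels, which is why the bottleneck expects nin * 5 inputs (the variant further below adds conv6/conv7 and widens this to nin * 7). The tail of forward() is cut off by the hunk, but given that channel count it plausibly continues as sketched here; this is an inferred continuation, not the file's verbatim code (torch and F are already imported by these layer modules):

        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)  # nin * 5 channels
        out = self.bottleneck(out)                                   # project back down to nout
        return out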

View File

@ -63,7 +63,9 @@ class Encoder(nn.Module):
class Decoder(nn.Module): class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -89,14 +91,24 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
) )
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) self.conv3 = SeperableConv2DBNActiv(
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) )
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x): def forward(self, x):
_, _, h, w = x.size() _, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x) feat2 = self.conv2(x)
feat3 = self.conv3(x) feat3 = self.conv3(x)
feat4 = self.conv4(x) feat4 = self.conv4(x)

View File

@ -63,7 +63,9 @@ class Encoder(nn.Module):
class Decoder(nn.Module): class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -89,14 +91,24 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
) )
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) self.conv3 = SeperableConv2DBNActiv(
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) )
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x): def forward(self, x):
_, _, h, w = x.size() _, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x) feat2 = self.conv2(x)
feat3 = self.conv3(x) feat3 = self.conv3(x)
feat4 = self.conv4(x) feat4 = self.conv4(x)

View File

@ -63,7 +63,9 @@ class Encoder(nn.Module):
class Decoder(nn.Module): class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -89,16 +91,30 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
) )
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) self.conv3 = SeperableConv2DBNActiv(
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) )
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) self.conv4 = SeperableConv2DBNActiv(
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) )
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x): def forward(self, x):
_, _, h, w = x.size() _, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x) feat2 = self.conv2(x)
feat3 = self.conv3(x) feat3 = self.conv3(x)
feat4 = self.conv4(x) feat4 = self.conv4(x)

Some files were not shown because too many files have changed in this diff.