Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-08-18 07:42:04 +08:00)

Merge branch 'RVC-Boss:main' into feat/frontend-usability-enhancements

commit b8de0ec0ac
.dockerignore (204 lines changed)
@@ -1,8 +1,198 @@
docs
logs
output
reference
SoVITS_weights
GPT_weights
TEMP
GPT_SoVITS/pretrained_models/*
tools/asr/models/*
tools/uvr5/uvr5_weights/*

.git
.DS_Store
.vscode
*.pyc
env
runtime
.idea
output
logs
SoVITS_weights*/
GPT_weights*/
TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios

# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
.github/build_windows_packages.ps1 (new file, 194 lines)
@@ -0,0 +1,194 @@
$ErrorActionPreference = "Stop"

Write-Host "Current location: $(Get-Location)"

$cuda = $env:TORCH_CUDA
if (-not $cuda) {
    Write-Error "Missing TORCH_CUDA env (cu124 or cu128)"
    exit 1
}

$date = $env:DATE_SUFFIX
if ([string]::IsNullOrWhiteSpace($date)) {
    $date = Get-Date -Format "MMdd"
}

$pkgName = "GPT-SoVITS-$date"
$tmpDir = "tmp"
$srcDir = $PWD

$suffix = $env:PKG_SUFFIX
if (-not [string]::IsNullOrWhiteSpace($suffix)) {
    $pkgName = "$pkgName$suffix"
}

$pkgName = "$pkgName-$cuda"

$baseHF = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main"
$PRETRAINED_URL = "$baseHF/pretrained_models.zip"
$G2PW_URL = "$baseHF/G2PWModel.zip"
$UVR5_URL = "$baseHF/uvr5_weights.zip"
$NLTK_URL = "$baseHF/nltk_data.zip"
$JTALK_URL = "$baseHF/open_jtalk_dic_utf_8-1.11.tar.gz"

$PYTHON_VERSION = "3.11.12"
$PY_RELEASE_VERSION = "20250409"

Write-Host "[INFO] Cleaning .git..."
Remove-Item "$srcDir\.git" -Recurse -Force -ErrorAction SilentlyContinue

Write-Host "[INFO] Creating tmp dir..."
New-Item -ItemType Directory -Force -Path $tmpDir

Write-Host "[INFO] System Python version:"
python --version
python -m site

Write-Host "[INFO] Downloading Python $PYTHON_VERSION..."
$zst = "$tmpDir\python.tar.zst"
Invoke-WebRequest "https://github.com/astral-sh/python-build-standalone/releases/download/$PY_RELEASE_VERSION/cpython-$PYTHON_VERSION+$PY_RELEASE_VERSION-x86_64-pc-windows-msvc-pgo-full.tar.zst" -OutFile $zst
& "C:\Program Files\7-Zip\7z.exe" e $zst -o"$tmpDir" -aoa
$tar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1
& "C:\Program Files\7-Zip\7z.exe" x $tar.FullName -o"$tmpDir\extracted" -aoa
Move-Item "$tmpDir\extracted\python\install" "$srcDir\runtime"

Write-Host "[INFO] Copying Redistributing Visual C++ Runtime..."
$vswhere = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe"
$vsPath = & $vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
$redistRoot = Join-Path $vsPath "VC\Redist\MSVC"
$targetVer = Get-ChildItem -Path $redistRoot -Directory |
    Where-Object { $_.Name -match "^14\." } |
    Sort-Object Name -Descending |
    Select-Object -First 1
$x64Path = Join-Path $targetVer.FullName "x64"
Get-ChildItem -Path $x64Path -Directory | Where-Object {
    $_.Name -match '^Microsoft\..*\.(CRT|OpenMP)$'
} | ForEach-Object {
    Get-ChildItem -Path $_.FullName -Filter "*.dll" | ForEach-Object {
        Copy-Item -Path $_.FullName -Destination "$srcDir\runtime" -Force
    }
}

function DownloadAndUnzip($url, $targetRelPath) {
    $filename = Split-Path $url -Leaf
    $tmpZip = "$tmpDir\$filename"
    Invoke-WebRequest $url -OutFile $tmpZip
    Expand-Archive -Path $tmpZip -DestinationPath $tmpDir -Force
    $subdirName = $filename -replace '\.zip$', ''
    $sourcePath = Join-Path $tmpDir $subdirName
    $destRoot = Join-Path $srcDir $targetRelPath
    $destPath = Join-Path $destRoot $subdirName
    if (Test-Path $destPath) {
        Remove-Item $destPath -Recurse -Force
    }
    Move-Item $sourcePath $destRoot
    Remove-Item $tmpZip
}

Write-Host "[INFO] Download pretrained_models..."
DownloadAndUnzip $PRETRAINED_URL "GPT_SoVITS"

Write-Host "[INFO] Download G2PWModel..."
DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"

Write-Host "[INFO] Download UVR5 model..."
DownloadAndUnzip $UVR5_URL "tools\uvr5"

Write-Host "[INFO] Downloading funasr..."
$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
$funasrZip = "$tmpDir\funasr.zip"
Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force
Remove-Item $funasrZip

Write-Host "[INFO] Download ffmpeg..."
$ffUrl = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
$ffZip = "$tmpDir\ffmpeg.zip"
Invoke-WebRequest -Uri $ffUrl -OutFile $ffZip
Expand-Archive $ffZip -DestinationPath $tmpDir -Force
$ffDir = Get-ChildItem -Directory "$tmpDir" | Where-Object { $_.Name -like "ffmpeg*" } | Select-Object -First 1
Move-Item "$($ffDir.FullName)\bin\ffmpeg.exe" "$srcDir\runtime"
Move-Item "$($ffDir.FullName)\bin\ffprobe.exe" "$srcDir\runtime"
Remove-Item $ffZip
Remove-Item $ffDir.FullName -Recurse -Force

Write-Host "[INFO] Installing PyTorch..."
& ".\runtime\python.exe" -m ensurepip
& ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location
switch ($cuda) {
    "cu124" {
        & ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
    }
    "cu128" {
        & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location
    }
    default {
        Write-Error "Unsupported CUDA version: $cuda"
        exit 1
    }
}

Write-Host "[INFO] Installing dependencies..."
& ".\runtime\python.exe" -m pip install -r extra-req.txt --no-deps --no-warn-script-location
& ".\runtime\python.exe" -m pip install -r requirements.txt --no-warn-script-location

Write-Host "[INFO] Downloading NLTK and pyopenjtalk dictionary..."
$PYTHON = ".\runtime\python.exe"
$prefix = & $PYTHON -c "import sys; print(sys.prefix)"
$jtalkPath = & $PYTHON -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))"
$nltkZip = "$tmpDir\nltk_data.zip"
$jtalkTar = "$tmpDir\open_jtalk_dic_utf_8-1.11.tar.gz"

Invoke-WebRequest -Uri $NLTK_URL -OutFile $nltkZip
Expand-Archive -Path $nltkZip -DestinationPath $prefix -Force
Remove-Item $nltkZip

Invoke-WebRequest -Uri $JTALK_URL -OutFile $jtalkTar
& "C:\Program Files\7-Zip\7z.exe" e $jtalkTar -o"$tmpDir" -aoa
$innerTar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1
& "C:\Program Files\7-Zip\7z.exe" x $innerTar.FullName -o"$jtalkPath" -aoa
Remove-Item $jtalkTar
Remove-Item $innerTar.FullName

Write-Host "[INFO] Preparing final directory $pkgName ..."
$items = @(Get-ChildItem -Filter "*.sh") +
    @(Get-ChildItem -Filter "*.ipynb") +
    @("$tmpDir", ".github", "Docker", "docs", ".gitignore", ".dockerignore", "README.md")
Remove-Item $items -Force -Recurse -ErrorAction SilentlyContinue
$curr = Get-Location
Set-Location ../
Get-ChildItem .
Copy-Item -Path $curr -Destination $pkgName -Recurse
$7zPath = "$pkgName.7z"
$start = Get-Date
Write-Host "Compress Starting at $start"
& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1
$end = Get-Date
Write-Host "Elapsed time: $($end - $start)"
Get-ChildItem .

python -m pip install --upgrade pip
python -m pip install "modelscope" "huggingface_hub[hf_transfer]" --no-warn-script-location

Write-Host "[INFO] Uploading to ModelScope..."
$msUser = $env:MODELSCOPE_USERNAME
$msToken = $env:MODELSCOPE_TOKEN
if (-not $msUser -or -not $msToken) {
    Write-Error "Missing MODELSCOPE_USERNAME or MODELSCOPE_TOKEN"
    exit 1
}
modelscope upload "$msUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $msToken

Write-Host "[SUCCESS] Uploaded: $7zPath to ModelScope"

Write-Host "[INFO] Uploading to HuggingFace..."
$hfUser = $env:HUGGINGFACE_USERNAME
$hfToken = $env:HUGGINGFACE_TOKEN
if (-not $hfUser -or -not $hfToken) {
    Write-Error "Missing HUGGINGFACE_USERNAME or HUGGINGFACE_TOKEN"
    exit 1
}
$env:HF_HUB_ENABLE_HF_TRANSFER = "1"
huggingface-cli upload "$hfUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $hfToken

Write-Host "[SUCCESS] Uploaded: $7zPath to HuggingFace"
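The DownloadAndUnzip helper above drives most of the model fetching in this script. For readers following along outside PowerShell, a rough Python sketch of the same fetch-and-extract behaviour is shown below; it is not part of the commit, and it assumes (as the script does) that each zip archive contains a single top-level folder named after the file. The commented-out call mirrors the pretrained_models case.

import shutil
import urllib.request
import zipfile
from pathlib import Path

def download_and_unzip(url: str, target_rel_path: str, tmp_dir: str = "tmp") -> None:
    # Download <url> into tmp/, unpack it, and move the archive's top-level
    # folder under <target_rel_path>, replacing any previous copy.
    tmp = Path(tmp_dir)
    tmp.mkdir(parents=True, exist_ok=True)
    archive = tmp / Path(url).name
    urllib.request.urlretrieve(url, archive)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(tmp)
    subdir = tmp / archive.stem                 # e.g. tmp/pretrained_models
    dest_root = Path(target_rel_path)
    dest_root.mkdir(parents=True, exist_ok=True)
    dest = dest_root / subdir.name
    if dest.exists():
        shutil.rmtree(dest)                     # replace any previous copy
    shutil.move(str(subdir), str(dest_root))
    archive.unlink()                            # drop the downloaded zip

# download_and_unzip("https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip", "GPT_SoVITS")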
.github/workflows/build_windows_packages.yaml (new file, 38 lines)
@@ -0,0 +1,38 @@
name: Build and Upload Windows Package

on:
  workflow_dispatch:
    inputs:
      date:
        description: "Date suffix (optional)"
        required: false
        default: ""
      suffix:
        description: "Package name suffix (optional)"
        required: false
        default: ""

jobs:
  build:
    runs-on: windows-latest
    strategy:
      matrix:
        torch_cuda: [cu124, cu128]
    env:
      TORCH_CUDA: ${{ matrix.torch_cuda }}
      MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }}
      MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }}
      HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }}
      HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
      DATE_SUFFIX: ${{ github.event.inputs.date }}
      PKG_SUFFIX: ${{ github.event.inputs.suffix }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run Build and Upload Script
        shell: pwsh
        run: |
          Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1
          ../build_windows_packages.ps1
.github/workflows/docker-publish.yaml (new file, 276 lines)
@@ -0,0 +1,276 @@
name: Build and Publish Docker Image

on:
  workflow_dispatch:

jobs:
  generate-meta:
    runs-on: ubuntu-22.04
    outputs:
      tag: ${{ steps.meta.outputs.tag }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Generate Tag
        id: meta
        run: |
          DATE=$(date +'%Y%m%d')
          COMMIT=$(git rev-parse --short=6 HEAD)
          echo "tag=${DATE}-${COMMIT}" >> $GITHUB_OUTPUT

  build-amd64:
    needs: generate-meta
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        include:
          - cuda_version: 12.6
            lite: true
            torch_base: lite
            tag_prefix: cu126-lite
          - cuda_version: 12.6
            lite: false
            torch_base: full
            tag_prefix: cu126
          - cuda_version: 12.8
            lite: true
            torch_base: lite
            tag_prefix: cu128-lite
          - cuda_version: 12.8
            lite: false
            torch_base: full
            tag_prefix: cu128

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Free up disk space
        run: |
          echo "Before cleanup:"
          df -h

          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /opt/hostedtoolcache/PyPy
          sudo rm -rf /opt/hostedtoolcache/go
          sudo rm -rf /opt/hostedtoolcache/node
          sudo rm -rf /opt/hostedtoolcache/Ruby
          sudo rm -rf /opt/microsoft
          sudo rm -rf /opt/pipx
          sudo rm -rf /opt/az
          sudo rm -rf /opt/google

          sudo rm -rf /usr/lib/jvm
          sudo rm -rf /usr/lib/google-cloud-sdk
          sudo rm -rf /usr/lib/dotnet

          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /usr/local/julia1.11.5
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/local/share/chromium

          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/share/miniconda
          sudo rm -rf /usr/share/az_12.1.0
          sudo rm -rf /usr/share/dotnet

          echo "After cleanup:"
          df -h

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Build and Push Docker Image (amd64)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile
          push: true
          platforms: linux/amd64
          build-args: |
            LITE=${{ matrix.lite }}
            TORCH_BASE=${{ matrix.torch_base }}
            CUDA_VERSION=${{ matrix.cuda_version }}
            WORKFLOW=true
          tags: |
            xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-amd64
            xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-amd64

  build-arm64:
    needs: generate-meta
    runs-on: ubuntu-22.04-arm
    strategy:
      matrix:
        include:
          - cuda_version: 12.6
            lite: true
            torch_base: lite
            tag_prefix: cu126-lite
          - cuda_version: 12.6
            lite: false
            torch_base: full
            tag_prefix: cu126
          - cuda_version: 12.8
            lite: true
            torch_base: lite
            tag_prefix: cu128-lite
          - cuda_version: 12.8
            lite: false
            torch_base: full
            tag_prefix: cu128

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Free up disk space
        run: |
          echo "Before cleanup:"
          df -h

          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /opt/hostedtoolcache/PyPy
          sudo rm -rf /opt/hostedtoolcache/go
          sudo rm -rf /opt/hostedtoolcache/node
          sudo rm -rf /opt/hostedtoolcache/Ruby
          sudo rm -rf /opt/microsoft
          sudo rm -rf /opt/pipx
          sudo rm -rf /opt/az
          sudo rm -rf /opt/google

          sudo rm -rf /usr/lib/jvm
          sudo rm -rf /usr/lib/google-cloud-sdk
          sudo rm -rf /usr/lib/dotnet

          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /usr/local/julia1.11.5
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/local/share/chromium

          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/share/miniconda
          sudo rm -rf /usr/share/az_12.1.0
          sudo rm -rf /usr/share/dotnet

          echo "After cleanup:"
          df -h

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Build and Push Docker Image (arm64)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile
          push: true
          platforms: linux/arm64
          build-args: |
            LITE=${{ matrix.lite }}
            TORCH_BASE=${{ matrix.torch_base }}
            CUDA_VERSION=${{ matrix.cuda_version }}
            WORKFLOW=true
          tags: |
            xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-arm64
            xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-arm64

  merge-and-clean:
    needs:
      - build-amd64
      - build-arm64
      - generate-meta
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - tag_prefix: cu126-lite
          - tag_prefix: cu126
          - tag_prefix: cu128-lite
          - tag_prefix: cu128

    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Merge amd64 and arm64 into multi-arch image
        run: |
          DATE_TAG=${{ needs.generate-meta.outputs.tag }}
          TAG_PREFIX=${{ matrix.tag_prefix }}

          docker buildx imagetools create \
            --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG} \
            ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-amd64 \
            ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-arm64

          docker buildx imagetools create \
            --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX} \
            ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-amd64 \
            ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-arm64

      - name: Delete old platform-specific tags via Docker Hub API
        env:
          DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }}
          DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_PASSWORD }}
          TAG_PREFIX: ${{ matrix.tag_prefix }}
          DATE_TAG: ${{ needs.generate-meta.outputs.tag }}
        run: |
          sudo apt-get update && sudo apt-get install -y jq

          TOKEN=$(curl -s -u $DOCKER_HUB_USERNAME:$DOCKER_HUB_TOKEN \
            "https://auth.docker.io/token?service=registry.docker.io&scope=repository:$DOCKER_HUB_USERNAME/gpt-sovits:pull,push,delete" \
            | jq -r .token)

          for PLATFORM in amd64 arm64; do
            SAFE_PLATFORM=$(echo $PLATFORM | sed 's/\//-/g')
            TAG="${TAG_PREFIX}-${DATE_TAG}-${SAFE_PLATFORM}"
            LATEST_TAG="latest-${TAG_PREFIX}-${SAFE_PLATFORM}"

            for DEL_TAG in "$TAG" "$LATEST_TAG"; do
              echo "Deleting tag: $DEL_TAG"
              curl -X DELETE -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/$DOCKER_HUB_USERNAME/gpt-sovits/manifests/$DEL_TAG
            done
          done

  create-default:
    runs-on: ubuntu-latest
    needs:
      - merge-and-clean
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Create Default Tag
        run: |
          docker buildx imagetools create \
            --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest \
            ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-cu126-lite
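The generate-meta job above derives the image tag from the build date plus a six-character commit hash, and every later job reuses it. A small Python sketch of the same computation, assuming it is run from inside a git checkout, for readers who want to reproduce the tag locally:

import subprocess
from datetime import datetime

def image_tag() -> str:
    # <YYYYMMDD>-<6-char commit hash>, as produced by the "Generate Tag" step.
    commit = subprocess.run(
        ["git", "rev-parse", "--short=6", "HEAD"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    return f"{datetime.now().strftime('%Y%m%d')}-{commit}"

# Used as, e.g., xxxxrt666/gpt-sovits:cu126-lite-<tag>-amd64 before the
# per-platform manifests are merged into one multi-arch tag.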
.gitignore (9 lines changed)
@@ -7,13 +7,8 @@ runtime
.idea
output
logs
reference
GPT_weights
SoVITS_weights
GPT_weights_v2
SoVITS_weights_v2
GPT_weights_v3
SoVITS_weights_v3
SoVITS_weights*/
GPT_weights*/
TEMP
weight.json
ffmpeg*
.pre-commit-config.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
ci:
  autoupdate_schedule: monthly

repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.7
    hooks:
      # Run the linter.
      - id: ruff
        types_or: [ python, pyi ]
        args: [ --fix ]
      # Run the formatter.
      - id: ruff-format
        types_or: [ python, pyi ]
        args: [ --line-length, "120", --target-version, "py310" ]
@@ -1,5 +1,12 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -39,9 +46,9 @@
    "\n",
    "cd GPT-SoVITS\n",
    "\n",
    "mkdir GPT_weights\n",
    "mkdir -p GPT_weights\n",
    "\n",
    "mkdir SoVITS_weights\n",
    "mkdir -p SoVITS_weights\n",
    "\n",
    "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
    " :\n",
@@ -53,7 +60,7 @@
    "\n",
    "pip install ipykernel\n",
    "\n",
    "bash install.sh --source HF"
    "bash install.sh --device CU126 --source HF"
   ]
  },
  {

@@ -7,7 +7,7 @@
   "id": "view-in-github"
  },
  "source": [
   "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
  ]
 },
 {
@@ -59,7 +59,7 @@
  "\n",
  "pip install ipykernel\n",
  "\n",
  "bash install.sh --source HF --download-uvr5"
  "bash install.sh --device CU126 --source HF --download-uvr5"
  ]
 },
 {
@@ -1,3 +0,0 @@
5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb

@@ -1,8 +0,0 @@
# Download moda ASR related models
from modelscope import snapshot_download

model_dir = snapshot_download(
    "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", revision="v2.0.4"
)
model_dir = snapshot_download("damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", revision="v2.0.4")
model_dir = snapshot_download("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", revision="v2.0.4")

@@ -1,11 +0,0 @@
#!/usr/bin/env bash

set -Eeuo pipefail

echo "Downloading models..."

aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue

echo "Checking SHA256..."

parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
Docker/install_wrapper.sh (new file, 33 lines)
@@ -0,0 +1,33 @@
#!/bin/bash

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

cd "$SCRIPT_DIR" || exit 1

cd .. || exit 1

set -e

source "$HOME/miniconda3/etc/profile.d/conda.sh"

mkdir -p GPT_SoVITS

mkdir -p GPT_SoVITS/text

ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models

ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel

bash install.sh --device "CU${CUDA_VERSION//./}" --source HF

pip cache purge

pip show torch

rm -rf /tmp/* /var/tmp/*

rm -rf "$HOME/miniconda3/pkgs"

mkdir -p "$HOME/miniconda3/pkgs"

rm -rf /root/.conda /root/.cache
@@ -1,12 +0,0 @@
b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx

@@ -1,34 +0,0 @@
# GPT-SoVITS models
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
out=GPT_SoVITS/pretrained_models/s2D488k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
out=GPT_SoVITS/pretrained_models/s2G488k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
# UVR5
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
Docker/miniconda_install.sh (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/bin/bash

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

cd "$SCRIPT_DIR" || exit 1

cd .. || exit 1

if [ -d "$HOME/miniconda3" ]; then
    exit 0
fi

WORKFLOW=${WORKFLOW:-"false"}
TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"}

if [ "$WORKFLOW" = "true" ]; then
    WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
else
    WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
fi

if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
    "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh
elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then
    "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh
else
    exit 1
fi

LOG_PATH="/tmp/miniconda-install.log"

bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1

if [ $? -eq 0 ]; then
    echo "== Miniconda Installed =="
else
    echo "Failed to Install miniconda"
    tail -n 50 "$LOG_PATH"
    exit 1
fi

rm miniconda.sh

source "$HOME/miniconda3/etc/profile.d/conda.sh"

"$HOME/miniconda3/bin/conda" config --add channels conda-forge

"$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null

"$HOME/miniconda3/bin/conda" install python=3.11 -q -y

"$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y

if [ "$CUDA_VERSION" = "12.8" ]; then
    "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
elif [ "$CUDA_VERSION" = "12.6" ]; then
    "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
fi

"$HOME/miniconda3/bin/pip" cache purge

rm $LOG_PATH

rm -rf "$HOME/miniconda3/pkgs"

mkdir -p "$HOME/miniconda3/pkgs"

rm -rf "$HOME/.conda" "$HOME/.cache"
Dockerfile (80 lines changed)
@@ -1,42 +1,62 @@
# Base CUDA image
FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
ARG CUDA_VERSION=12.6
ARG TORCH_BASE=full

LABEL maintainer="breakstring@hotmail.com"
LABEL version="dev-20240209"
FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}

LABEL maintainer="XXXXRT"
LABEL version="V4"
LABEL description="Docker image for GPT-SoVITS"

ARG CUDA_VERSION=12.6

# Install 3rd party apps
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get update && \
    apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
    git lfs install && \
    rm -rf /var/lib/apt/lists/*
ENV CUDA_VERSION=${CUDA_VERSION}

# Copy only requirements.txt initially to leverage Docker cache
WORKDIR /workspace
COPY requirements.txt /workspace/
RUN pip install --no-cache-dir -r requirements.txt
SHELL ["/bin/bash", "-c"]

# Define a build-time argument for image type
ARG IMAGE_TYPE=full
WORKDIR /workspace/GPT-SoVITS

# Conditional logic based on the IMAGE_TYPE argument
# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite"
COPY ./Docker /workspace/Docker
# The "elite" image type does not bundle the extra models
RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
    chmod +x /workspace/Docker/download.sh && \
    /workspace/Docker/download.sh && \
    python /workspace/Docker/download.py && \
    python -m nltk.downloader averaged_perceptron_tagger cmudict; \
    fi
COPY Docker /workspace/GPT-SoVITS/Docker/

ARG LITE=false
ENV LITE=${LITE}

# Copy the rest of the application
COPY . /workspace
ARG WORKFLOW=false
ENV WORKFLOW=${WORKFLOW}

ARG TARGETPLATFORM
ENV TARGETPLATFORM=${TARGETPLATFORM}

RUN bash Docker/miniconda_install.sh

COPY extra-req.txt /workspace/GPT-SoVITS/

COPY requirements.txt /workspace/GPT-SoVITS/

COPY install.sh /workspace/GPT-SoVITS/

RUN bash Docker/install_wrapper.sh

EXPOSE 9871 9872 9873 9874 9880

CMD ["python", "webui.py"]
ENV PYTHONPATH="/workspace/GPT-SoVITS"

RUN conda init bash && echo "conda activate base" >> ~/.bashrc

WORKDIR /workspace

RUN rm -rf /workspace/GPT-SoVITS

WORKDIR /workspace/GPT-SoVITS

COPY . /workspace/GPT-SoVITS

CMD ["/bin/bash", "-c", "\
    rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
    rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
    rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
    rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
    ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
    ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
    ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
    ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
    exec bash"]
@@ -108,7 +108,7 @@ resample_transform_dict = {}

def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
    key="%s-%s"%(sr0,sr1)
    key = "%s-%s" % (sr0, sr1)
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
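The hunk above only reformats the cache key, but the underlying pattern is worth noting: one torchaudio Resample transform is created per (source rate, target rate) pair and then reused. A standalone sketch of that pattern, assuming torch and torchaudio are installed (the module-level dict name here is illustrative):

import torch
import torchaudio

_resamplers = {}

def resample(audio_tensor, sr0, sr1, device):
    # Resample audio_tensor from sr0 to sr1, caching one transform per rate
    # pair so repeated calls avoid rebuilding the resampling kernel.
    key = "%s-%s" % (sr0, sr1)
    if key not in _resamplers:
        _resamplers[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return _resamplers[key](audio_tensor)

# ref_audio = resample(ref_audio, 44100, 32000, torch.device("cuda"))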
@ -252,7 +252,6 @@ class TTS_Config:
|
||||
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
|
||||
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
|
||||
},
|
||||
|
||||
}
|
||||
configs: dict = None
|
||||
v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
|
||||
@ -432,7 +431,6 @@ class TTS:
|
||||
"aux_ref_audio_paths": [],
|
||||
}
|
||||
|
||||
|
||||
self.stop_flag: bool = False
|
||||
self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32
|
||||
|
||||
@ -468,7 +466,7 @@ class TTS:
|
||||
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
|
||||
|
||||
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
|
||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version)
|
||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
||||
raise FileExistsError(info)
|
||||
|
||||
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
||||
@ -507,7 +505,7 @@ class TTS:
|
||||
)
|
||||
self.configs.use_vocoder = False
|
||||
else:
|
||||
kwargs["version"]=model_version
|
||||
kwargs["version"] = model_version
|
||||
vits_model = SynthesizerTrnV3(
|
||||
self.configs.filter_length // 2 + 1,
|
||||
self.configs.segment_size // self.configs.hop_length,
|
||||
@ -572,7 +570,7 @@ class TTS:
|
||||
self.vocoder.cpu()
|
||||
del self.vocoder
|
||||
self.empty_cache()
|
||||
|
||||
|
||||
self.vocoder = BigVGAN.from_pretrained(
|
||||
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
|
||||
use_cuda_kernel=False,
|
||||
@ -595,18 +593,21 @@ class TTS:
|
||||
self.empty_cache()
|
||||
|
||||
self.vocoder = Generator(
|
||||
initial_channel=100,
|
||||
resblock="1",
|
||||
resblock_kernel_sizes=[3, 7, 11],
|
||||
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
||||
upsample_rates=[10, 6, 2, 2, 2],
|
||||
upsample_initial_channel=512,
|
||||
upsample_kernel_sizes=[20, 12, 4, 4, 4],
|
||||
gin_channels=0, is_bias=True
|
||||
)
|
||||
initial_channel=100,
|
||||
resblock="1",
|
||||
resblock_kernel_sizes=[3, 7, 11],
|
||||
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
||||
upsample_rates=[10, 6, 2, 2, 2],
|
||||
upsample_initial_channel=512,
|
||||
upsample_kernel_sizes=[20, 12, 4, 4, 4],
|
||||
gin_channels=0,
|
||||
is_bias=True,
|
||||
)
|
||||
self.vocoder.remove_weight_norm()
|
||||
state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu")
|
||||
print("loading vocoder",self.vocoder.load_state_dict(state_dict_g))
|
||||
state_dict_g = torch.load(
|
||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu"
|
||||
)
|
||||
print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
|
||||
|
||||
self.vocoder_configs["sr"] = 48000
|
||||
self.vocoder_configs["T_ref"] = 500
|
||||
@ -614,9 +615,6 @@ class TTS:
|
||||
self.vocoder_configs["upsample_rate"] = 480
|
||||
self.vocoder_configs["overlapped_len"] = 12
|
||||
|
||||
|
||||
|
||||
|
||||
self.vocoder = self.vocoder.eval()
|
||||
if self.configs.is_half == True:
|
||||
self.vocoder = self.vocoder.half().to(self.configs.device)
|
||||
@ -1439,7 +1437,7 @@ class TTS:
|
||||
ref_audio = ref_audio.to(self.configs.device).float()
|
||||
if ref_audio.shape[0] == 2:
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
|
||||
|
||||
# tgt_sr = self.vocoder_configs["sr"]
|
||||
tgt_sr = 24000 if self.configs.version == "v3" else 32000
|
||||
if ref_sr != tgt_sr:
|
||||
|
@@ -143,6 +143,9 @@ class DiT(nn.Module):
        drop_audio_cond=False,  # cfg for cond audio
        drop_text=False,  # cfg for text
        # mask: bool["b n"] | None = None,  # noqa: F722
        infer=False,  # bool
        text_cache=None,  # torch tensor as text_embed
        dt_cache=None,  # torch tensor as dt
    ):
        x = x0.transpose(2, 1)
        cond = cond0.transpose(2, 1)
@@ -155,9 +158,17 @@

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        dt = self.d_embed(dt_base_bootstrap)
        if infer and dt_cache is not None:
            dt = dt_cache
        else:
            dt = self.d_embed(dt_base_bootstrap)
        t += dt
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)  ###need to change

        if infer and text_cache is not None:
            text_embed = text_cache
        else:
            text_embed = self.text_embed(text, seq_len, drop_text=drop_text)  ###need to change

        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        rope = self.rotary_embed.forward_from_seq_len(seq_len)
@@ -177,4 +188,7 @@
        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
        if infer:
            return output, text_embed, dt
        else:
            return output
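The change above lets DiT.forward skip recomputing the text embedding and the dt embedding during inference: with infer=True it returns them alongside the output, and a caller can feed them back via text_cache/dt_cache on later calls. A minimal sketch of that calling pattern follows; only the keyword names come from this hunk, while the positional argument order and the sampling loop are assumptions for illustration.

text_cache, dt_cache = None, None
for step_time in sampling_times:  # assumed iterable of solver time steps
    output, text_cache, dt_cache = dit_model(
        x0, cond0, text, step_time, dt_base_bootstrap,
        drop_audio_cond=False,
        drop_text=False,
        infer=True,             # infer=True -> returns (output, text_embed, dt)
        text_cache=text_cache,  # None on the first step, reused afterwards
        dt_cache=dt_cache,
    )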
@ -7,11 +7,17 @@
|
||||
全部按日文识别
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torchaudio
|
||||
from text.LangSegmenter import LangSegmenter
|
||||
|
||||
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
||||
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
||||
@ -23,20 +29,6 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
|
||||
logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
|
||||
warnings.simplefilter(action="ignore", category=FutureWarning)
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
import torch
|
||||
from text.LangSegmenter import LangSegmenter
|
||||
|
||||
try:
|
||||
import gradio.analytics as analytics
|
||||
|
||||
analytics.version_check = lambda: None
|
||||
except:
|
||||
...
|
||||
version = model_version = os.environ.get("version", "v2")
|
||||
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||
path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
|
||||
@ -106,7 +98,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||
|
||||
import random
|
||||
|
||||
from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3,Generator
|
||||
from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
|
||||
|
||||
|
||||
def set_seed(seed):
|
||||
@ -226,9 +218,9 @@ else:
|
||||
resample_transform_dict = {}
|
||||
|
||||
|
||||
def resample(audio_tensor, sr0,sr1):
|
||||
def resample(audio_tensor, sr0, sr1):
|
||||
global resample_transform_dict
|
||||
key="%s-%s"%(sr0,sr1)
|
||||
key = "%s-%s" % (sr0, sr1)
|
||||
if key not in resample_transform_dict:
|
||||
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||
return resample_transform_dict[key](audio_tensor)
|
||||
@ -238,14 +230,18 @@ def resample(audio_tensor, sr0,sr1):
|
||||
# symbol_version-model_version-if_lora_v3
|
||||
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
||||
|
||||
v3v4set={"v3","v4"}
|
||||
v3v4set = {"v3", "v4"}
|
||||
|
||||
|
||||
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
|
||||
global vq_model, hps, version, model_version, dict_language, if_lora_v3
|
||||
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
|
||||
print(sovits_path,version, model_version, if_lora_v3)
|
||||
is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4
|
||||
print(sovits_path, version, model_version, if_lora_v3)
|
||||
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
||||
if if_lora_v3 == True and is_exist == False:
|
||||
info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version)
|
||||
info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n(
|
||||
"SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version
|
||||
)
|
||||
gr.Warning(info)
|
||||
raise FileExistsError(info)
|
||||
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
||||
@ -276,10 +272,15 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
prompt_language_update,
|
||||
text_update,
|
||||
text_language_update,
|
||||
{"__type__": "update", "visible": visible_sample_steps, "value": 32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]},
|
||||
{
|
||||
"__type__": "update",
|
||||
"visible": visible_sample_steps,
|
||||
"value": 32 if model_version == "v3" else 8,
|
||||
"choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
|
||||
},
|
||||
{"__type__": "update", "visible": visible_inp_refs},
|
||||
{"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False},
|
||||
{"__type__": "update", "visible": True if model_version =="v3" else False},
|
||||
{"__type__": "update", "visible": True if model_version == "v3" else False},
|
||||
{"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False},
|
||||
)
|
||||
|
||||
@ -304,7 +305,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
)
|
||||
model_version = version
|
||||
else:
|
||||
hps.model.version=model_version
|
||||
hps.model.version = model_version
|
||||
vq_model = SynthesizerTrnV3(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
@ -326,7 +327,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
else:
|
||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||
print(
|
||||
"loading sovits_%spretrained_G"%model_version,
|
||||
"loading sovits_%spretrained_G" % model_version,
|
||||
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
|
||||
)
|
||||
lora_rank = dict_s2["lora_rank"]
|
||||
@ -337,7 +338,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
init_lora_weights=True,
|
||||
)
|
||||
vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
|
||||
print("loading sovits_%s_lora%s" % (model_version,lora_rank))
|
||||
print("loading sovits_%s_lora%s" % (model_version, lora_rank))
|
||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
vq_model.cfm = vq_model.cfm.merge_and_unload()
|
||||
# torch.save(vq_model.state_dict(),"merge_win.pth")
|
||||
@ -350,10 +351,15 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
prompt_language_update,
|
||||
text_update,
|
||||
text_language_update,
|
||||
{"__type__": "update", "visible": visible_sample_steps, "value":32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]},
|
||||
{
|
||||
"__type__": "update",
|
||||
"visible": visible_sample_steps,
|
||||
"value": 32 if model_version == "v3" else 8,
|
||||
"choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
|
||||
},
|
||||
{"__type__": "update", "visible": visible_inp_refs},
|
||||
{"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False},
|
||||
{"__type__": "update", "visible": True if model_version =="v3" else False},
|
||||
{"__type__": "update", "visible": True if model_version == "v3" else False},
|
||||
{"__type__": "update", "value": i18n("合成语音"), "interactive": True},
|
||||
)
|
||||
with open("./weight.json") as f:
|
||||
@ -400,7 +406,7 @@ now_dir = os.getcwd()
|
||||
|
||||
|
||||
def init_bigvgan():
|
||||
global bigvgan_model,hifigan_model
|
||||
global bigvgan_model, hifigan_model
|
||||
from BigVGAN import bigvgan
|
||||
|
||||
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
||||
@ -411,17 +417,20 @@ def init_bigvgan():
|
||||
bigvgan_model.remove_weight_norm()
|
||||
bigvgan_model = bigvgan_model.eval()
|
||||
if hifigan_model:
|
||||
hifigan_model=hifigan_model.cpu()
|
||||
hifigan_model=None
|
||||
try:torch.cuda.empty_cache()
|
||||
except:pass
|
||||
hifigan_model = hifigan_model.cpu()
|
||||
hifigan_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
if is_half == True:
|
||||
bigvgan_model = bigvgan_model.half().to(device)
|
||||
else:
|
||||
bigvgan_model = bigvgan_model.to(device)
|
||||
|
||||
|
||||
def init_hifigan():
|
||||
global hifigan_model,bigvgan_model
|
||||
global hifigan_model, bigvgan_model
|
||||
hifigan_model = Generator(
|
||||
initial_channel=100,
|
||||
resblock="1",
|
||||
@ -430,26 +439,32 @@ def init_hifigan():
|
||||
upsample_rates=[10, 6, 2, 2, 2],
|
||||
upsample_initial_channel=512,
|
||||
upsample_kernel_sizes=[20, 12, 4, 4, 4],
|
||||
gin_channels=0, is_bias=True
|
||||
gin_channels=0,
|
||||
is_bias=True,
|
||||
)
|
||||
hifigan_model.eval()
|
||||
hifigan_model.remove_weight_norm()
|
||||
state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu")
|
||||
print("loading vocoder",hifigan_model.load_state_dict(state_dict_g))
|
||||
state_dict_g = torch.load(
|
||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu"
|
||||
)
|
||||
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
||||
if bigvgan_model:
|
||||
bigvgan_model=bigvgan_model.cpu()
|
||||
bigvgan_model=None
|
||||
try:torch.cuda.empty_cache()
|
||||
except:pass
|
||||
bigvgan_model = bigvgan_model.cpu()
|
||||
bigvgan_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
if is_half == True:
|
||||
hifigan_model = hifigan_model.half().to(device)
|
||||
else:
|
||||
hifigan_model = hifigan_model.to(device)
|
||||
|
||||
bigvgan_model=hifigan_model=None
|
||||
if model_version=="v3":
|
||||
|
||||
bigvgan_model = hifigan_model = None
|
||||
if model_version == "v3":
|
||||
init_bigvgan()
|
||||
if model_version=="v4":
|
||||
if model_version == "v4":
|
||||
init_hifigan()
|
||||
|
||||
|
||||
@ -837,17 +852,17 @@ def get_tts_wav(
|
||||
ref_audio = ref_audio.to(device).float()
|
||||
if ref_audio.shape[0] == 2:
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
tgt_sr=24000 if model_version=="v3"else 32000
|
||||
tgt_sr = 24000 if model_version == "v3" else 32000
|
||||
if sr != tgt_sr:
|
||||
ref_audio = resample(ref_audio, sr,tgt_sr)
|
||||
ref_audio = resample(ref_audio, sr, tgt_sr)
|
||||
# print("ref_audio",ref_audio.abs().mean())
|
||||
mel2 = mel_fn(ref_audio)if model_version=="v3"else mel_fn_v4(ref_audio)
|
||||
mel2 = mel_fn(ref_audio) if model_version == "v3" else mel_fn_v4(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
mel2 = mel2[:, :, :T_min]
|
||||
fea_ref = fea_ref[:, :, :T_min]
|
||||
Tref=468 if model_version=="v3"else 500
|
||||
Tchunk=934 if model_version=="v3"else 1000
|
||||
Tref = 468 if model_version == "v3" else 500
|
||||
Tchunk = 934 if model_version == "v3" else 1000
|
||||
if T_min > Tref:
|
||||
mel2 = mel2[:, :, -Tref:]
|
||||
fea_ref = fea_ref[:, :, -Tref:]
|
||||
@ -872,13 +887,13 @@ def get_tts_wav(
|
||||
cfm_resss.append(cfm_res)
|
||||
cfm_res = torch.cat(cfm_resss, 2)
|
||||
cfm_res = denorm_spec(cfm_res)
|
||||
if model_version=="v3":
|
||||
if model_version == "v3":
|
||||
if bigvgan_model == None:
|
||||
init_bigvgan()
|
||||
else:#v4
|
||||
else: # v4
|
||||
if hifigan_model == None:
|
||||
init_hifigan()
|
||||
vocoder_model=bigvgan_model if model_version=="v3"else hifigan_model
|
||||
vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model
|
||||
with torch.inference_mode():
|
||||
wav_gen = vocoder_model(cfm_res)
|
||||
audio = wav_gen[0][0] # .cpu().detach().numpy()
|
||||
@@ -892,9 +907,12 @@ def get_tts_wav(
    t1 = ttime()
    print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
    audio_opt = torch.cat(audio_opt, 0)  # np.concatenate
    if model_version in {"v1","v2"}:opt_sr=32000
    elif model_version=="v3":opt_sr=24000
    else:opt_sr=48000#v4
    if model_version in {"v1", "v2"}:
        opt_sr = 32000
    elif model_version == "v3":
        opt_sr = 24000
    else:
        opt_sr = 48000  # v4
    if if_sr == True and opt_sr == 24000:
        print(i18n("音频超分中"))
        audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
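Written out, the branch above selects the output sample rate from the model version, 32 kHz for v1/v2, 24 kHz for v3, and 48 kHz for v4, and the optional super-resolution step only applies to the 24 kHz (v3) output. The same mapping as a tiny helper, for reference:

def output_sample_rate(model_version: str) -> int:
    # Output sample rate used by get_tts_wav for each model version.
    if model_version in {"v1", "v2"}:
        return 32000
    if model_version == "v3":
        return 24000
    return 48000  # v4

assert output_sample_rate("v3") == 24000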
@ -1067,7 +1085,7 @@ def html_left(text, label="p"):
|
||||
</div>"""
|
||||
|
||||
|
||||
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
|
||||
gr.Markdown(
|
||||
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
|
||||
+ "<br>"
|
||||
@ -1137,16 +1155,16 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
sample_steps = (
|
||||
gr.Radio(
|
||||
label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
|
||||
value=32 if model_version=="v3"else 8,
|
||||
choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32],
|
||||
value=32 if model_version == "v3" else 8,
|
||||
choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
|
||||
visible=True,
|
||||
)
|
||||
if model_version in v3v4set
|
||||
else gr.Radio(
|
||||
label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
|
||||
choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32],
|
||||
choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
|
||||
visible=False,
|
||||
value=32 if model_version=="v3"else 8,
|
||||
value=32 if model_version == "v3" else 8,
|
||||
)
|
||||
)
|
||||
if_sr_Checkbox = gr.Checkbox(
|
||||
@ -1154,7 +1172,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
value=False,
|
||||
interactive=True,
|
||||
show_label=True,
|
||||
visible=False if model_version !="v3" else True,
|
||||
visible=False if model_version != "v3" else True,
|
||||
)
|
||||
gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
|
||||
with gr.Row():
|
||||
|
@ -14,6 +14,8 @@ import random
|
||||
import re
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
||||
@ -25,14 +27,6 @@ logging.getLogger("httpx").setLevel(logging.ERROR)
|
||||
logging.getLogger("asyncio").setLevel(logging.ERROR)
|
||||
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
|
||||
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
|
||||
import torch
|
||||
|
||||
try:
|
||||
import gradio.analytics as analytics
|
||||
|
||||
analytics.version_check = lambda: None
|
||||
except:
|
||||
...
|
||||
|
||||
|
||||
infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
|
||||
@ -262,15 +256,17 @@ SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
|
||||
from process_ckpt import get_sovits_version_from_path_fast
|
||||
|
||||
v3v4set={"v3","v4"}
|
||||
v3v4set = {"v3", "v4"}
|
||||
|
||||
|
||||
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
|
||||
global version, model_version, dict_language, if_lora_v3
|
||||
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
|
||||
# print(sovits_path,version, model_version, if_lora_v3)
|
||||
is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4
|
||||
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||
if if_lora_v3 == True and is_exist == False:
|
||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version)
|
||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
||||
gr.Warning(info)
|
||||
raise FileExistsError(info)
|
||||
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
||||
@ -328,7 +324,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
f.write(json.dumps(data))
|
||||
|
||||
|
||||
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
|
||||
gr.Markdown(
|
||||
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
|
||||
+ "<br>"
|
||||
|
@ -470,6 +470,7 @@ class TextAudioSpeakerCollateV3:
|
||||
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
|
||||
return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
|
||||
|
||||
|
||||
class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
|
||||
"""
|
||||
1) loads audio, speaker_id, text pairs
|
||||
@ -596,7 +597,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
|
||||
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
|
||||
)
|
||||
spec = torch.squeeze(spec, 0)
|
||||
spec1 = spectrogram_torch(audio_norm, 1280,32000, 320, 1280,center=False)
|
||||
spec1 = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False)
|
||||
mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None)
|
||||
mel = self.norm_spec(torch.squeeze(mel, 0))
|
||||
return spec, mel
|
||||
@ -643,7 +644,7 @@ class TextAudioSpeakerCollateV4:
|
||||
mel_lengths = torch.LongTensor(len(batch))
|
||||
|
||||
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
|
||||
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len*2)
|
||||
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len * 2)
|
||||
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
# wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
|
||||
|
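The V4 collate above allocates mel buffers at twice the spectrogram length. A hedged sketch of that sizing (shapes assumed from the batch layout shown in the diff, where item[1] is the spectrogram and item[2] the mel):

```python
import torch

# The mel axis is padded to max_spec_len * 2, matching the 2x mel-to-spec frame ratio
# used by the v4 data pipeline.
def alloc_padded(batch, max_spec_len: int, max_text_len: int):
    spec_padded = torch.zeros(len(batch), batch[0][1].size(0), max_spec_len)
    mel_padded = torch.zeros(len(batch), batch[0][2].size(0), max_spec_len * 2)
    text_padded = torch.zeros(len(batch), max_text_len, dtype=torch.long)
    return spec_padded, mel_padded, text_padded
```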
@ -39,24 +39,36 @@ hann_window = {}
|
||||
|
||||
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
|
||||
if torch.min(y) < -1.2:
|
||||
print('min value is ', torch.min(y))
|
||||
print("min value is ", torch.min(y))
|
||||
if torch.max(y) > 1.2:
|
||||
print('max value is ', torch.max(y))
|
||||
print("max value is ", torch.max(y))
|
||||
|
||||
global hann_window
|
||||
dtype_device = str(y.dtype) + '_' + str(y.device)
|
||||
dtype_device = str(y.dtype) + "_" + str(y.device)
|
||||
# wnsize_dtype_device = str(win_size) + '_' + dtype_device
|
||||
key = "%s-%s-%s-%s-%s" %(dtype_device,n_fft, sampling_rate, hop_size, win_size)
|
||||
key = "%s-%s-%s-%s-%s" % (dtype_device, n_fft, sampling_rate, hop_size, win_size)
|
||||
# if wnsize_dtype_device not in hann_window:
|
||||
if key not in hann_window:
|
||||
# hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
|
||||
hann_window[key] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
|
||||
|
||||
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
|
||||
y = torch.nn.functional.pad(
|
||||
y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
|
||||
)
|
||||
y = y.squeeze(1)
|
||||
# spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
|
||||
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[key],
|
||||
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
|
||||
spec = torch.stft(
|
||||
y,
|
||||
n_fft,
|
||||
hop_length=hop_size,
|
||||
win_length=win_size,
|
||||
window=hann_window[key],
|
||||
center=center,
|
||||
pad_mode="reflect",
|
||||
normalized=False,
|
||||
onesided=True,
|
||||
return_complex=False,
|
||||
)
|
||||
|
||||
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8)
|
||||
return spec
|
||||
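The reworked `spectrogram_torch` above keys its Hann-window cache on dtype, device, and all STFT parameters instead of window size alone. A minimal standalone sketch of that caching scheme:

```python
import torch

# One Hann window per unique (dtype, device, n_fft, sampling_rate, hop_size, win_size)
# combination, created on first use and reused afterwards.
_hann_cache = {}

def get_hann_window(y: torch.Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int) -> torch.Tensor:
    key = "%s_%s-%s-%s-%s-%s" % (y.dtype, y.device, n_fft, sampling_rate, hop_size, win_size)
    if key not in _hann_cache:
        _hann_cache[key] = torch.hann_window(win_size, dtype=y.dtype, device=y.device)
    return _hann_cache[key]
```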
@ -64,9 +76,9 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
|
||||
|
||||
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
|
||||
global mel_basis
|
||||
dtype_device = str(spec.dtype) + '_' + str(spec.device)
|
||||
dtype_device = str(spec.dtype) + "_" + str(spec.device)
|
||||
# fmax_dtype_device = str(fmax) + '_' + dtype_device
|
||||
key = "%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, fmin, fmax)
|
||||
key = "%s-%s-%s-%s-%s-%s" % (dtype_device, n_fft, num_mels, sampling_rate, fmin, fmax)
|
||||
# if fmax_dtype_device not in mel_basis:
|
||||
if key not in mel_basis:
|
||||
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
|
||||
@ -78,17 +90,25 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
|
||||
return spec
|
||||
|
||||
|
||||
|
||||
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
|
||||
if torch.min(y) < -1.2:
|
||||
print('min value is ', torch.min(y))
|
||||
print("min value is ", torch.min(y))
|
||||
if torch.max(y) > 1.2:
|
||||
print('max value is ', torch.max(y))
|
||||
print("max value is ", torch.max(y))
|
||||
|
||||
global mel_basis, hann_window
|
||||
dtype_device = str(y.dtype) + '_' + str(y.device)
|
||||
dtype_device = str(y.dtype) + "_" + str(y.device)
|
||||
# fmax_dtype_device = str(fmax) + '_' + dtype_device
|
||||
fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax)
|
||||
fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s" % (
|
||||
dtype_device,
|
||||
n_fft,
|
||||
num_mels,
|
||||
sampling_rate,
|
||||
hop_size,
|
||||
win_size,
|
||||
fmin,
|
||||
fmax,
|
||||
)
|
||||
# wnsize_dtype_device = str(win_size) + '_' + dtype_device
|
||||
wnsize_dtype_device = fmax_dtype_device
|
||||
if fmax_dtype_device not in mel_basis:
|
||||
@ -97,11 +117,23 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
|
||||
if wnsize_dtype_device not in hann_window:
|
||||
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
|
||||
|
||||
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
|
||||
y = torch.nn.functional.pad(
|
||||
y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
|
||||
)
|
||||
y = y.squeeze(1)
|
||||
|
||||
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
|
||||
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
|
||||
spec = torch.stft(
|
||||
y,
|
||||
n_fft,
|
||||
hop_length=hop_size,
|
||||
win_length=win_size,
|
||||
window=hann_window[wnsize_dtype_device],
|
||||
center=center,
|
||||
pad_mode="reflect",
|
||||
normalized=False,
|
||||
onesided=True,
|
||||
return_complex=False,
|
||||
)
|
||||
|
||||
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8)
|
||||
|
||||
|
@ -414,7 +414,8 @@ class Generator(torch.nn.Module):
|
||||
upsample_rates,
|
||||
upsample_initial_channel,
|
||||
upsample_kernel_sizes,
|
||||
gin_channels=0,is_bias=False,
|
||||
gin_channels=0,
|
||||
is_bias=False,
|
||||
):
|
||||
super(Generator, self).__init__()
|
||||
self.num_kernels = len(resblock_kernel_sizes)
|
||||
@ -980,7 +981,6 @@ class SynthesizerTrn(nn.Module):
|
||||
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
|
||||
return codes.transpose(0, 1)
|
||||
|
||||
|
||||
class CFM(torch.nn.Module):
|
||||
def __init__(self, in_channels, dit):
|
||||
super().__init__()
|
||||
@ -992,6 +992,8 @@ class CFM(torch.nn.Module):
|
||||
|
||||
self.criterion = torch.nn.MSELoss()
|
||||
|
||||
self.use_conditioner_cache = True
|
||||
|
||||
@torch.inference_mode()
|
||||
def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0):
|
||||
"""Forward diffusion"""
|
||||
@ -1004,25 +1006,38 @@ class CFM(torch.nn.Module):
|
||||
mu = mu.transpose(2, 1)
|
||||
t = 0
|
||||
d = 1 / n_timesteps
|
||||
text_cache = None
|
||||
text_cfg_cache = None
|
||||
dt_cache = None
|
||||
d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
|
||||
for j in range(n_timesteps):
|
||||
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
|
||||
d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
|
||||
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
|
||||
v_pred = self.estimator(
|
||||
x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False
|
||||
).transpose(2, 1)
|
||||
v_pred, text_emb, dt = self.estimator(
|
||||
x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False, infer=True, text_cache=text_cache, dt_cache=dt_cache
|
||||
)
|
||||
v_pred = v_pred.transpose(2, 1)
|
||||
if self.use_conditioner_cache:
|
||||
text_cache = text_emb
|
||||
dt_cache = dt
|
||||
if inference_cfg_rate > 1e-5:
|
||||
neg = self.estimator(
|
||||
x,
|
||||
prompt_x,
|
||||
x_lens,
|
||||
t_tensor,
|
||||
d_tensor,
|
||||
mu,
|
||||
use_grad_ckpt=False,
|
||||
drop_audio_cond=True,
|
||||
drop_text=True,
|
||||
).transpose(2, 1)
|
||||
neg, text_cfg_emb, _ = self.estimator(
|
||||
x,
|
||||
prompt_x,
|
||||
x_lens,
|
||||
t_tensor,
|
||||
d_tensor,
|
||||
mu,
|
||||
use_grad_ckpt=False,
|
||||
drop_audio_cond=True,
|
||||
drop_text=True,
|
||||
infer=True,
|
||||
text_cache=text_cfg_cache,
|
||||
dt_cache=dt_cache
|
||||
)
|
||||
neg = neg.transpose(2, 1)
|
||||
if self.use_conditioner_cache:
|
||||
text_cfg_cache = text_cfg_emb
|
||||
v_pred = v_pred + (v_pred - neg) * inference_cfg_rate
|
||||
x = x + d * v_pred
|
||||
t = t + d
|
||||
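The CFM inference change above caches the conditioner outputs (text embedding and timestep embedding) from the first Euler step and reuses them on later steps, for both the conditional and the CFG (unconditional) branches. A condensed sketch of that loop; the `estimator` call here is a stand-in with a hypothetical signature, not the DiT's real API:

```python
import torch

def euler_with_conditioner_cache(estimator, x, mu, n_timesteps, cfg_rate=0.0, use_cache=True):
    # estimator(x, t, dt, mu, drop_text, text_cache, dt_cache) -> (velocity, text_emb, dt_emb)
    d = 1.0 / n_timesteps
    t = torch.zeros(x.shape[0], device=x.device, dtype=x.dtype)
    d_tensor = torch.full_like(t, d)
    cond_cache = uncond_cache = dt_cache = None
    for _ in range(n_timesteps):
        v, text_emb, dt = estimator(x, t, d_tensor, mu, drop_text=False,
                                    text_cache=cond_cache, dt_cache=dt_cache)
        if cfg_rate > 1e-5:
            neg, text_cfg_emb, _ = estimator(x, t, d_tensor, mu, drop_text=True,
                                             text_cache=uncond_cache, dt_cache=dt_cache)
            v = v + (v - neg) * cfg_rate
            if use_cache:
                uncond_cache = text_cfg_emb
        if use_cache:
            cond_cache, dt_cache = text_emb, dt
        x = x + d * v
        t = t + d
    return x
```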
@ -1173,7 +1188,7 @@ class SynthesizerTrnV3(nn.Module):
|
||||
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
|
||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
|
||||
fea = self.bridge(x)
|
||||
fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest") ##BCT
|
||||
fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT
|
||||
fea, y_mask_ = self.wns1(
|
||||
fea, mel_lengths, ge
|
||||
) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
|
||||
@ -1196,9 +1211,9 @@ class SynthesizerTrnV3(nn.Module):
|
||||
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
|
||||
y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
|
||||
if speed == 1:
|
||||
sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4))
|
||||
sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4))
|
||||
else:
|
||||
sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4) / speed) + 1
|
||||
sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4) / speed) + 1
|
||||
y_lengths1 = torch.LongTensor([sizee]).to(codes.device)
|
||||
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
|
||||
|
||||
@ -1207,7 +1222,7 @@ class SynthesizerTrnV3(nn.Module):
|
||||
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
|
||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
|
||||
fea = self.bridge(x)
|
||||
fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest") ##BCT
|
||||
fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT
|
||||
####more wn paramter to learn mel
|
||||
fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
|
||||
return fea, ge
|
||||
|
@ -28,18 +28,18 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
def my_save2(fea, path,cfm_version):
|
||||
def my_save2(fea, path, cfm_version):
|
||||
bio = BytesIO()
|
||||
torch.save(fea, bio)
|
||||
bio.seek(0)
|
||||
data = bio.getvalue()
|
||||
byte=b"03" if cfm_version=="v3"else b"04"
|
||||
byte = b"03" if cfm_version == "v3" else b"04"
|
||||
data = byte + data[2:]
|
||||
with open(path, "wb") as f:
|
||||
f.write(data)
|
||||
|
||||
|
||||
def savee(ckpt, name, epoch, steps, hps, cfm_version=None,lora_rank=None):
|
||||
def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None):
|
||||
try:
|
||||
opt = OrderedDict()
|
||||
opt["weight"] = {}
|
||||
@ -51,7 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, cfm_version=None,lora_rank=None):
|
||||
opt["info"] = "%sepoch_%siteration" % (epoch, steps)
|
||||
if lora_rank:
|
||||
opt["lora_rank"] = lora_rank
|
||||
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name),cfm_version)
|
||||
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version)
|
||||
else:
|
||||
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
|
||||
return "Success."
|
||||
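The `my_save2` reformat above keeps the version-tagging trick intact: the checkpoint is serialized to memory and its first two bytes are overwritten with a CFM version marker. A small self-contained sketch of that idea:

```python
import torch
from io import BytesIO

# Serialize to memory, then replace the first two bytes with b"03" (v3) or b"04" (v4)
# so the loader can tell which CFM version produced the checkpoint.
def save_with_cfm_tag(obj, path, cfm_version):
    bio = BytesIO()
    torch.save(obj, bio)
    data = bio.getvalue()
    tag = b"03" if cfm_version == "v3" else b"04"
    with open(path, "wb") as f:
        f.write(tag + data[2:])
```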
|
@ -31,7 +31,6 @@ from module.data_utils import (
|
||||
TextAudioSpeakerLoaderV3,
|
||||
TextAudioSpeakerCollateV4,
|
||||
TextAudioSpeakerLoaderV4,
|
||||
|
||||
)
|
||||
from module.models import (
|
||||
SynthesizerTrnV3 as SynthesizerTrn,
|
||||
@ -88,8 +87,8 @@ def run(rank, n_gpus, hps):
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.set_device(rank)
|
||||
|
||||
TextAudioSpeakerLoader=TextAudioSpeakerLoaderV3 if hps.model.version=="v3"else TextAudioSpeakerLoaderV4
|
||||
TextAudioSpeakerCollate=TextAudioSpeakerCollateV3 if hps.model.version=="v3"else TextAudioSpeakerCollateV4
|
||||
TextAudioSpeakerLoader = TextAudioSpeakerLoaderV3 if hps.model.version == "v3" else TextAudioSpeakerLoaderV4
|
||||
TextAudioSpeakerCollate = TextAudioSpeakerCollateV3 if hps.model.version == "v3" else TextAudioSpeakerCollateV4
|
||||
train_dataset = TextAudioSpeakerLoader(hps.data) ########
|
||||
train_sampler = DistributedBucketSampler(
|
||||
train_dataset,
|
||||
@ -365,7 +364,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
||||
hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank),
|
||||
epoch,
|
||||
global_step,
|
||||
hps,cfm_version=hps.model.version,
|
||||
hps,
|
||||
cfm_version=hps.model.version,
|
||||
lora_rank=lora_rank,
|
||||
),
|
||||
)
|
||||
|
@ -3,44 +3,38 @@ import re
|
||||
|
||||
# jieba静音
|
||||
import jieba
|
||||
|
||||
jieba.setLogLevel(logging.CRITICAL)
|
||||
|
||||
# 更改fast_langdetect大模型位置
|
||||
from pathlib import Path
|
||||
import fast_langdetect
|
||||
|
||||
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
|
||||
fast_langdetect.infer.LangDetectConfig(
|
||||
cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
|
||||
)
|
||||
)
|
||||
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
|
||||
|
||||
|
||||
from split_lang import LangSplitter
|
||||
|
||||
|
||||
def full_en(text):
|
||||
pattern = r"^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
|
||||
pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
|
||||
return bool(re.match(pattern, text))
|
||||
|
||||
|
||||
def full_cjk(text):
|
||||
# 来自wiki
|
||||
cjk_ranges = [
|
||||
(0x4E00, 0x9FFF), # CJK Unified Ideographs
|
||||
(0x3400, 0x4DB5), # CJK Extension A
|
||||
(0x20000, 0x2A6DD), # CJK Extension B
|
||||
(0x2A700, 0x2B73F), # CJK Extension C
|
||||
(0x2B740, 0x2B81F), # CJK Extension D
|
||||
(0x2B820, 0x2CEAF), # CJK Extension E
|
||||
(0x2CEB0, 0x2EBEF), # CJK Extension F
|
||||
(0x30000, 0x3134A), # CJK Extension G
|
||||
(0x31350, 0x323AF), # CJK Extension H
|
||||
(0x2EBF0, 0x2EE5D), # CJK Extension H
|
||||
(0x4E00, 0x9FFF), # CJK Unified Ideographs
|
||||
(0x3400, 0x4DB5), # CJK Extension A
|
||||
(0x20000, 0x2A6DD), # CJK Extension B
|
||||
(0x2A700, 0x2B73F), # CJK Extension C
|
||||
(0x2B740, 0x2B81F), # CJK Extension D
|
||||
(0x2B820, 0x2CEAF), # CJK Extension E
|
||||
(0x2CEB0, 0x2EBEF), # CJK Extension F
|
||||
(0x30000, 0x3134A), # CJK Extension G
|
||||
(0x31350, 0x323AF), # CJK Extension H
|
||||
(0x2EBF0, 0x2EE5D), # CJK Extension H
|
||||
]
|
||||
|
||||
pattern = r"[0-9、-〜。!?.!?… ]+$"
|
||||
pattern = r'[0-9、-〜。!?.!?… /]+$'
|
||||
|
||||
cjk_text = ""
|
||||
for char in text:
|
||||
@ -51,7 +45,7 @@ def full_cjk(text):
|
||||
return cjk_text
|
||||
|
||||
|
||||
def split_jako(tag_lang, item):
|
||||
def split_jako(tag_lang,item):
|
||||
if tag_lang == "ja":
|
||||
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
|
||||
else:
|
||||
@ -59,40 +53,41 @@ def split_jako(tag_lang, item):
|
||||
|
||||
lang_list: list[dict] = []
|
||||
tag = 0
|
||||
for match in re.finditer(pattern, item["text"]):
|
||||
for match in re.finditer(pattern, item['text']):
|
||||
if match.start() > tag:
|
||||
lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
|
||||
lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
|
||||
|
||||
tag = match.end()
|
||||
lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
|
||||
lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
|
||||
|
||||
if tag < len(item["text"]):
|
||||
lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
|
||||
if tag < len(item['text']):
|
||||
lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
|
||||
|
||||
return lang_list
|
||||
|
||||
|
||||
def merge_lang(lang_list, item):
|
||||
if lang_list and item["lang"] == lang_list[-1]["lang"]:
|
||||
lang_list[-1]["text"] += item["text"]
|
||||
if lang_list and item['lang'] == lang_list[-1]['lang']:
|
||||
lang_list[-1]['text'] += item['text']
|
||||
else:
|
||||
lang_list.append(item)
|
||||
return lang_list
|
||||
|
||||
|
||||
class LangSegmenter:
|
||||
class LangSegmenter():
|
||||
# 默认过滤器, 基于gsv目前四种语言
|
||||
DEFAULT_LANG_MAP = {
|
||||
"zh": "zh",
|
||||
"yue": "zh", # 粤语
|
||||
"wuu": "zh", # 吴语
|
||||
"zh-cn": "zh",
|
||||
"zh-tw": "x", # 繁体设置为x
|
||||
"zh-tw": "x", # 繁体设置为x
|
||||
"ko": "ko",
|
||||
"ja": "ja",
|
||||
"en": "en",
|
||||
}
|
||||
|
||||
|
||||
def getTexts(text):
|
||||
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
|
||||
substr = lang_splitter.split_by_lang(text=text)
|
||||
@ -100,18 +95,18 @@ class LangSegmenter:
|
||||
lang_list: list[dict] = []
|
||||
|
||||
for _, item in enumerate(substr):
|
||||
dict_item = {"lang": item.lang, "text": item.text}
|
||||
dict_item = {'lang':item.lang,'text':item.text}
|
||||
|
||||
# 处理短英文被识别为其他语言的问题
|
||||
if full_en(dict_item["text"]):
|
||||
dict_item["lang"] = "en"
|
||||
lang_list = merge_lang(lang_list, dict_item)
|
||||
if full_en(dict_item['text']):
|
||||
dict_item['lang'] = 'en'
|
||||
lang_list = merge_lang(lang_list,dict_item)
|
||||
continue
|
||||
|
||||
# 处理非日语夹日文的问题(不包含CJK)
|
||||
ja_list: list[dict] = []
|
||||
if dict_item["lang"] != "ja":
|
||||
ja_list = split_jako("ja", dict_item)
|
||||
if dict_item['lang'] != 'ja':
|
||||
ja_list = split_jako('ja',dict_item)
|
||||
|
||||
if not ja_list:
|
||||
ja_list.append(dict_item)
|
||||
@ -120,8 +115,8 @@ class LangSegmenter:
|
||||
ko_list: list[dict] = []
|
||||
temp_list: list[dict] = []
|
||||
for _, ko_item in enumerate(ja_list):
|
||||
if ko_item["lang"] != "ko":
|
||||
ko_list = split_jako("ko", ko_item)
|
||||
if ko_item["lang"] != 'ko':
|
||||
ko_list = split_jako('ko',ko_item)
|
||||
|
||||
if ko_list:
|
||||
temp_list.extend(ko_list)
|
||||
@ -131,32 +126,50 @@ class LangSegmenter:
|
||||
# 未存在非日韩文夹日韩文
|
||||
if len(temp_list) == 1:
|
||||
# 未知语言检查是否为CJK
|
||||
if dict_item["lang"] == "x":
|
||||
cjk_text = full_cjk(dict_item["text"])
|
||||
if dict_item['lang'] == 'x':
|
||||
cjk_text = full_cjk(dict_item['text'])
|
||||
if cjk_text:
|
||||
dict_item = {"lang": "zh", "text": cjk_text}
|
||||
lang_list = merge_lang(lang_list, dict_item)
|
||||
dict_item = {'lang':'zh','text':cjk_text}
|
||||
lang_list = merge_lang(lang_list,dict_item)
|
||||
else:
|
||||
lang_list = merge_lang(lang_list,dict_item)
|
||||
continue
|
||||
else:
|
||||
lang_list = merge_lang(lang_list, dict_item)
|
||||
lang_list = merge_lang(lang_list,dict_item)
|
||||
continue
|
||||
|
||||
# 存在非日韩文夹日韩文
|
||||
for _, temp_item in enumerate(temp_list):
|
||||
# 未知语言检查是否为CJK
|
||||
if temp_item["lang"] == "x":
|
||||
cjk_text = full_cjk(dict_item["text"])
|
||||
if temp_item['lang'] == 'x':
|
||||
cjk_text = full_cjk(dict_item['text'])
|
||||
if cjk_text:
|
||||
dict_item = {"lang": "zh", "text": cjk_text}
|
||||
lang_list = merge_lang(lang_list, dict_item)
|
||||
dict_item = {'lang':'zh','text':cjk_text}
|
||||
lang_list = merge_lang(lang_list,dict_item)
|
||||
else:
|
||||
lang_list = merge_lang(lang_list,dict_item)
|
||||
else:
|
||||
lang_list = merge_lang(lang_list, temp_item)
|
||||
return lang_list
|
||||
lang_list = merge_lang(lang_list,temp_item)
|
||||
|
||||
temp_list = lang_list
|
||||
lang_list = []
|
||||
for _, temp_item in enumerate(temp_list):
|
||||
if temp_item['lang'] == 'x':
|
||||
if lang_list:
|
||||
temp_item['lang'] = lang_list[-1]['lang']
|
||||
elif len(temp_list) > 1:
|
||||
temp_item['lang'] = temp_list[1]['lang']
|
||||
else:
|
||||
temp_item['lang'] = 'zh'
|
||||
|
||||
lang_list = merge_lang(lang_list,temp_item)
|
||||
|
||||
return lang_list
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
text = "MyGO?,你也喜欢まいご吗?"
|
||||
print(LangSegmenter.getTexts(text))
|
||||
|
||||
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
|
||||
print(LangSegmenter.getTexts(text))
|
||||
print(LangSegmenter.getTexts(text))
|
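The `merge_lang` helper used throughout `LangSegmenter.getTexts` above concatenates adjacent segments that share a language tag. A short usage example of that behavior:

```python
# Adjacent segments with the same language are merged instead of kept as separate entries.
segments = []
for piece in [{"lang": "zh", "text": "你也喜欢"}, {"lang": "zh", "text": "吗"}, {"lang": "ja", "text": "まいご"}]:
    if segments and piece["lang"] == segments[-1]["lang"]:
        segments[-1]["text"] += piece["text"]
    else:
        segments.append(piece)
# segments -> [{"lang": "zh", "text": "你也喜欢吗"}, {"lang": "ja", "text": "まいご"}]
```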
@ -1,27 +1,32 @@
|
||||
# This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw
|
||||
# This code is modified from https://github.com/GitYCC/g2pW
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
import json
|
||||
import os
|
||||
import traceback
|
||||
import warnings
|
||||
import zipfile
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
import requests
|
||||
|
||||
onnxruntime.set_default_logger_severity(3)
|
||||
import torch
|
||||
from opencc import OpenCC
|
||||
from pypinyin import Style, pinyin
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
||||
|
||||
from ..zh_normalization.char_convert import tranditional_to_simplified
|
||||
from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input
|
||||
from .utils import load_config
|
||||
|
||||
onnxruntime.set_default_logger_severity(3)
|
||||
try:
|
||||
onnxruntime.preload_dlls()
|
||||
except:
|
||||
traceback.print_exc()
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
model_version = "1.1"
|
||||
|
||||
|
||||
@ -87,7 +92,7 @@ class G2PWOnnxConverter:
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
||||
sess_options.intra_op_num_threads = 2
|
||||
sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
|
||||
try:
|
||||
self.session_g2pW = onnxruntime.InferenceSession(
|
||||
os.path.join(uncompress_path, "g2pW.onnx"),
|
||||
|
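The G2PW change above makes the intra-op thread count depend on whether CUDA is available. A hedged sketch of the session options involved (standard onnxruntime API; the thread-count choice mirrors the diff):

```python
import onnxruntime

sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
# The diff sets 2 threads when CUDA is available; 0 lets onnxruntime pick its default on CPU-only hosts.
sess_options.intra_op_num_threads = 0
```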
@ -13,10 +13,10 @@ import torch
|
||||
|
||||
logging.getLogger("numba").setLevel(logging.ERROR)
|
||||
logging.getLogger("matplotlib").setLevel(logging.ERROR)
|
||||
|
||||
logging.getLogger("httpx").setLevel(logging.ERROR)
|
||||
MATPLOTLIB_FLAG = False
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
logger = logging
|
||||
|
||||
|
||||
@ -309,13 +309,13 @@ def check_git_hash(model_dir):
|
||||
def get_logger(model_dir, filename="train.log"):
|
||||
global logger
|
||||
logger = logging.getLogger(os.path.basename(model_dir))
|
||||
logger.setLevel(logging.ERROR)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
|
||||
if not os.path.exists(model_dir):
|
||||
os.makedirs(model_dir)
|
||||
h = logging.FileHandler(os.path.join(model_dir, filename))
|
||||
h.setLevel(logging.ERROR)
|
||||
h.setLevel(logging.INFO)
|
||||
h.setFormatter(formatter)
|
||||
logger.addHandler(h)
|
||||
return logger
|
||||
|
132
README.md
@ -44,15 +44,15 @@ For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/
|
||||
|
||||
### Tested Environments
|
||||
|
||||
| Python Version | PyTorch Version | Device |
|
||||
|----------------|------------------|-----------------|
|
||||
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
||||
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
||||
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA12.8(for Nvidia50x0) |
|
||||
| Python Version | PyTorch Version | Device |
|
||||
| -------------- | ---------------- | ------------- |
|
||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
|
||||
### Windows
|
||||
|
||||
@ -63,31 +63,41 @@ If you are a Windows user (tested with win>=10), you can [download the integrate
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### macOS
|
||||
|
||||
**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.**
|
||||
|
||||
1. Install Xcode command-line tools by running `xcode-select --install`.
|
||||
2. Install the program by running the following commands:
|
||||
Install the program by running the following commands:
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### Install Manually
|
||||
|
||||
#### Install Dependencies
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### Install FFmpeg
|
||||
|
||||
##### Conda Users
|
||||
|
||||
```bash
|
||||
conda activate GPTSoVits
|
||||
conda install ffmpeg
|
||||
```
|
||||
|
||||
@ -96,14 +106,13 @@ conda install ffmpeg
|
||||
```bash
|
||||
sudo apt install ffmpeg
|
||||
sudo apt install libsox-dev
|
||||
conda install -c conda-forge 'ffmpeg<7'
|
||||
```
|
||||
|
||||
##### Windows Users
|
||||
|
||||
Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root.
|
||||
Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root
|
||||
|
||||
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
|
||||
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe)
|
||||
|
||||
##### MacOS Users
|
||||
|
||||
@ -111,36 +120,53 @@ Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Ko
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
#### Install Dependencies
|
||||
### Running GPT-SoVITS with Docker
|
||||
|
||||
#### Docker Image Selection
|
||||
|
||||
Due to rapid development in the codebase and a slower Docker image release cycle, please:
|
||||
|
||||
- Check [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) for the latest available image tags
|
||||
- Choose an appropriate image tag for your environment
|
||||
- `Lite` means the Docker image does not include ASR models and UVR5 models. You can manually download the UVR5 models, while the program will automatically download the ASR models as needed
|
||||
- The appropriate architecture image (amd64/arm64) will be automatically pulled during Docker Compose
|
||||
- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
|
||||
|
||||
#### Environment Variables
|
||||
|
||||
- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage.
|
||||
|
||||
#### Shared Memory Configuration
|
||||
|
||||
On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory.
|
||||
|
||||
#### Choosing a Service
|
||||
|
||||
The `docker-compose.yaml` defines two services:
|
||||
|
||||
- `GPT-SoVITS-CU126` & `GPT-SoVITS-CU128`: Full version with all features.
|
||||
- `GPT-SoVITS-CU126-Lite` & `GPT-SoVITS-CU128-Lite`: Lightweight version with reduced dependencies and functionality.
|
||||
|
||||
To run a specific service with Docker Compose, use:
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||
```
|
||||
|
||||
### Using Docker
|
||||
#### Building the Docker Image Locally
|
||||
|
||||
#### docker-compose.yaml configuration
|
||||
If you want to build the image yourself, use:
|
||||
|
||||
0. Regarding image tags: Due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(outdated) for the currently packaged latest images and select as per your situation, or alternatively, build locally using a Dockerfile according to your own needs.
|
||||
1. Environment Variables:
|
||||
- is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation.
|
||||
2. Volumes Configuration, The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
|
||||
3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation.
|
||||
4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances.
|
||||
|
||||
#### Running with docker compose
|
||||
|
||||
```
|
||||
docker compose -f "docker-compose.yaml" up -d
|
||||
```bash
|
||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||
```
|
||||
|
||||
#### Running with docker command
|
||||
#### Accessing the Running Container (Bash Shell)
|
||||
|
||||
As above, modify the corresponding parameters based on your actual situation, then run the following command:
|
||||
Once the container is running in the background, you can access it using:
|
||||
|
||||
```
|
||||
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
||||
```bash
|
||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||
```
|
||||
|
||||
## Pretrained Models
|
||||
@ -168,7 +194,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
The TTS annotation .list file format:
|
||||
|
||||
```
|
||||
|
||||
vocal_path|speaker_name|language|text
|
||||
|
||||
```
|
||||
|
||||
Language dictionary:
|
||||
@ -182,7 +210,9 @@ Language dictionary:
|
||||
Example:
|
||||
|
||||
```
|
||||
|
||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||
|
||||
```
|
||||
|
||||
## Finetune and inference
|
||||
@ -212,12 +242,12 @@ Or maunally switch version in WebUI
|
||||
|
||||
#### Path Auto-filling is now supported
|
||||
|
||||
1. Fill in the audio path
|
||||
2. Slice the audio into small chunks
|
||||
3. Denoise (optional)
|
||||
4. ASR
|
||||
5. Proofreading ASR transcriptions
|
||||
6. Go to the next Tab, then finetune the model
|
||||
1. Fill in the audio path
|
||||
2. Slice the audio into small chunks
|
||||
3. Denoise (optional)
|
||||
4. ASR
|
||||
5. Proofreading ASR transcriptions
|
||||
6. Go to the next Tab, then finetune the model
|
||||
|
||||
### Open Inference WebUI
|
||||
|
||||
@ -259,7 +289,7 @@ Use v2 from v1 environment:
|
||||
|
||||
2. Clone the latest codes from github.
|
||||
|
||||
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
|
||||
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`.
|
||||
|
||||
Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
|
||||
|
||||
@ -279,7 +309,7 @@ Use v3 from v2 environment:
|
||||
|
||||
2. Clone the latest codes from github.
|
||||
|
||||
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
|
||||
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
||||
|
||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
@ -296,7 +326,7 @@ Use v4 from v1/v2/v3 environment:
|
||||
|
||||
2. Clone the latest codes from github.
|
||||
|
||||
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
|
||||
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
||||
|
||||
## Todo List
|
||||
|
||||
@ -322,7 +352,7 @@ Use v4 from v1/v2/v3 environment:
|
||||
|
||||
Use the command line to open the WebUI for UVR5
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
@ -333,7 +363,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
|
||||
|
||||
This is how the audio segmentation of the dataset is done using the command line
|
||||
|
||||
```
|
||||
```bash
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||
@ -345,7 +375,7 @@ python audio_slicer.py \
|
||||
|
||||
This is how dataset ASR processing is done using the command line (Chinese only)
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
@ -353,7 +383,7 @@ ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
|
||||
|
||||
(No progress bars, GPU performance may cause time delays)
|
||||
|
||||
```
|
||||
```bash
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
|
@ -1,32 +1,95 @@
|
||||
version: '3.8'
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
gpt-sovits:
|
||||
image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container.
|
||||
container_name: gpt-sovits-container
|
||||
environment:
|
||||
- is_half=False
|
||||
- is_share=False
|
||||
volumes:
|
||||
- ./output:/workspace/output
|
||||
- ./logs:/workspace/logs
|
||||
- ./SoVITS_weights:/workspace/SoVITS_weights
|
||||
- ./reference:/workspace/reference
|
||||
working_dir: /workspace
|
||||
GPT-SoVITS-CU126:
|
||||
image: xxxxrt666/gpt-sovits:latest-cu126
|
||||
container_name: GPT-SoVITS-CU126
|
||||
ports:
|
||||
- "9880:9880"
|
||||
- "9871:9871"
|
||||
- "9872:9872"
|
||||
- "9873:9873"
|
||||
- "9874:9874"
|
||||
shm_size: 16G
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: "all"
|
||||
capabilities: [gpu]
|
||||
stdin_open: true
|
||||
- "9880:9880"
|
||||
volumes:
|
||||
- .:/workspace/GPT-SoVITS
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
||||
environment:
|
||||
- is_half=true
|
||||
tty: true
|
||||
stdin_open: true
|
||||
shm_size: "16g"
|
||||
restart: unless-stopped
|
||||
runtime: nvidia
|
||||
GPT-SoVITS-CU126-Lite:
|
||||
image: xxxxrt666/gpt-sovits:latest-cu126-lite
|
||||
container_name: GPT-SoVITS-CU126-Lite
|
||||
ports:
|
||||
- "9871:9871"
|
||||
- "9872:9872"
|
||||
- "9873:9873"
|
||||
- "9874:9874"
|
||||
- "9880:9880"
|
||||
volumes:
|
||||
- .:/workspace/GPT-SoVITS
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
||||
- tools/asr/models:/workspace/models/asr_models
|
||||
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
||||
environment:
|
||||
- is_half=true
|
||||
tty: true
|
||||
stdin_open: true
|
||||
shm_size: "16g"
|
||||
restart: unless-stopped
|
||||
runtime: nvidia
|
||||
GPT-SoVITS-CU128:
|
||||
image: xxxxrt666/gpt-sovits:latest-cu128
|
||||
container_name: GPT-SoVITS-CU128
|
||||
ports:
|
||||
- "9871:9871"
|
||||
- "9872:9872"
|
||||
- "9873:9873"
|
||||
- "9874:9874"
|
||||
- "9880:9880"
|
||||
volumes:
|
||||
- .:/workspace/GPT-SoVITS
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
||||
environment:
|
||||
- is_half=true
|
||||
tty: true
|
||||
stdin_open: true
|
||||
shm_size: "16g"
|
||||
restart: unless-stopped
|
||||
runtime: nvidia
|
||||
GPT-SoVITS-CU128-Lite:
|
||||
image: xxxxrt666/gpt-sovits:latest-cu128-lite
|
||||
container_name: GPT-SoVITS-CU128-Lite
|
||||
ports:
|
||||
- "9871:9871"
|
||||
- "9872:9872"
|
||||
- "9873:9873"
|
||||
- "9874:9874"
|
||||
- "9880:9880"
|
||||
volumes:
|
||||
- .:/workspace/GPT-SoVITS
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
||||
- tools/asr/models:/workspace/models/asr_models
|
||||
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
||||
environment:
|
||||
- is_half=true
|
||||
tty: true
|
||||
stdin_open: true
|
||||
shm_size: "16g"
|
||||
restart: unless-stopped
|
||||
runtime: nvidia
|
82
docker_build.sh
Normal file
@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||||
|
||||
cd "$SCRIPT_DIR" || exit 1
|
||||
|
||||
set -e
|
||||
|
||||
if ! command -v docker &>/dev/null; then
|
||||
echo "Docker Not Found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
|
||||
|
||||
LITE=false
|
||||
CUDA_VERSION=12.6
|
||||
|
||||
print_help() {
|
||||
echo "Usage: bash docker_build.sh [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)"
|
||||
echo " --lite Build a Lite Image"
|
||||
echo " -h, --help Show this help message and exit"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
|
||||
}
|
||||
|
||||
# Show help if no arguments provided
|
||||
if [[ $# -eq 0 ]]; then
|
||||
print_help
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--cuda)
|
||||
case "$2" in
|
||||
12.6)
|
||||
CUDA_VERSION=12.6
|
||||
;;
|
||||
12.8)
|
||||
CUDA_VERSION=12.8
|
||||
;;
|
||||
*)
|
||||
echo "Error: Invalid CUDA_VERSION: $2"
|
||||
echo "Choose From: [12.6, 12.8]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
shift 2
|
||||
;;
|
||||
--lite)
|
||||
LITE=true
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
echo "Unknown Argument: $1"
|
||||
echo "Use -h or --help to see available options."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64")
|
||||
|
||||
if [ $LITE = true ]; then
|
||||
TORCH_BASE="lite"
|
||||
else
|
||||
TORCH_BASE="full"
|
||||
fi
|
||||
|
||||
docker build \
|
||||
--build-arg CUDA_VERSION=$CUDA_VERSION \
|
||||
--build-arg LITE=$LITE \
|
||||
--build-arg TARGETPLATFORM="$TARGETPLATFORM" \
|
||||
--build-arg TORCH_BASE=$TORCH_BASE \
|
||||
-t "${USER}/gpt-sovits:local" \
|
||||
.
|
@ -1,21 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 获取当前日期,格式为 YYYYMMDD
|
||||
DATE=$(date +%Y%m%d)
|
||||
# 获取最新的 Git commit 哈希值的前 7 位
|
||||
COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7)
|
||||
|
||||
# 构建 full 版本的镜像
|
||||
docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest .
|
||||
# 为同一个镜像添加带日期的标签
|
||||
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE
|
||||
# 为同一个镜像添加带当前代码库Commit哈希值的标签
|
||||
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH
|
||||
|
||||
|
||||
# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器)
|
||||
docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite .
|
||||
# 为同一个镜像添加带日期的标签
|
||||
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite
|
||||
# 为同一个镜像添加带当前代码库Commit哈希值的标签
|
||||
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite
|
@ -20,21 +20,21 @@
|
||||
|
||||
---
|
||||
|
||||
## 功能:
|
||||
## 功能
|
||||
|
||||
1. **零样本文本到语音 (TTS): ** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
|
||||
1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
|
||||
|
||||
2. **少样本 TTS: ** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
|
||||
2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
|
||||
|
||||
3. **跨语言支持: ** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文.
|
||||
3. **跨语言支持:** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文.
|
||||
|
||||
4. **WebUI 工具: ** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型.
|
||||
4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型.
|
||||
|
||||
**查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)**
|
||||
|
||||
未见过的说话者 few-shot 微调演示:
|
||||
未见过的说话者 few-shot 微调演示:
|
||||
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
||||
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
|
||||
|
||||
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||
|
||||
@ -44,14 +44,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
### 测试通过的环境
|
||||
|
||||
| Python Version | PyTorch Version | Device |
|
||||
|----------------|------------------|-----------------|
|
||||
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
||||
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
||||
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
| Python Version | PyTorch Version | Device |
|
||||
| -------------- | ---------------- | ------------- |
|
||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
|
||||
### Windows
|
||||
|
||||
@ -62,31 +63,41 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### macOS
|
||||
|
||||
**注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.**
|
||||
|
||||
1. 运行 `xcode-select --install` 安装 Xcode command-line tools.
|
||||
2. 运行以下的命令来安装本项目:
|
||||
运行以下的命令来安装本项目:
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### 手动安装
|
||||
|
||||
#### 安装依赖
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### 安装 FFmpeg
|
||||
|
||||
##### Conda 用户
|
||||
|
||||
```bash
|
||||
conda activate GPTSoVits
|
||||
conda install ffmpeg
|
||||
```
|
||||
|
||||
@ -95,14 +106,13 @@ conda install ffmpeg
|
||||
```bash
|
||||
sudo apt install ffmpeg
|
||||
sudo apt install libsox-dev
|
||||
conda install -c conda-forge 'ffmpeg<7'
|
||||
```
|
||||
|
||||
##### Windows 用户
|
||||
|
||||
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下.
|
||||
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下
|
||||
|
||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
|
||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境
|
||||
|
||||
##### MacOS 用户
|
||||
|
||||
@ -110,38 +120,53 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
#### 安装依赖
|
||||
### 运行 GPT-SoVITS (使用 Docker)
|
||||
|
||||
#### Docker 镜像选择
|
||||
|
||||
由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意:
|
||||
|
||||
- 前往 [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 查看最新可用的镜像标签(tags)
|
||||
- 根据你的运行环境选择合适的镜像标签
|
||||
- `Lite` Docker 镜像不包含 ASR 模型和 UVR5 模型. 你可以自行下载 UVR5 模型, ASR 模型则会在需要时由程序自动下载
|
||||
- 在使用 Docker Compose 时, 会自动拉取适配的架构镜像 (amd64 或 arm64)
|
||||
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
|
||||
|
||||
#### 环境变量
|
||||
|
||||
- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用
|
||||
|
||||
#### 共享内存配置
|
||||
|
||||
在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`)
|
||||
|
||||
#### 选择服务
|
||||
|
||||
`docker-compose.yaml` 文件定义了两个主要服务类型:
|
||||
|
||||
- `GPT-SoVITS-CU126` 与 `GPT-SoVITS-CU128`:完整版, 包含所有功能
|
||||
- `GPT-SoVITS-CU126-Lite` 与 `GPT-SoVITS-CU128-Lite`:轻量版, 依赖更少, 功能略有删减
|
||||
|
||||
如需使用 Docker Compose 运行指定服务, 请执行:
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||
```
|
||||
|
||||
### 在 Docker 中使用
|
||||
#### 本地构建 Docker 镜像
|
||||
|
||||
#### docker-compose.yaml 设置
|
||||
如果你希望自行构建镜像, 请使用以下命令:
|
||||
|
||||
0. image 的标签: 由于代码库更新很快, 镜像的打包和测试又很慢, 所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(旧版本) 查看当前打包好的最新的镜像并根据自己的情况选用, 或者在本地根据您自己的需求通过 Dockerfile 进行构建.
|
||||
1. 环境变量:
|
||||
|
||||
- is_half: 半精度/双精度控制.在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时, 一般都是它引起的, 可以根据实际情况来调整为 True 或者 False.
|
||||
|
||||
2. Volume 设置, 容器内的应用根目录设置为 /workspace. 默认的 docker-compose.yaml 中列出了一些实际的例子, 便于上传/下载内容.
|
||||
3. shm_size: Windows 下的 Docker Desktop 默认可用内存过小, 会导致运行异常, 根据自己情况酌情设置.
|
||||
4. deploy 小节下的 gpu 相关内容, 请根据您的系统和实际情况酌情设置.
|
||||
|
||||
#### 通过 docker compose 运行
|
||||
|
||||
```
|
||||
docker compose -f "docker-compose.yaml" up -d
|
||||
```bash
|
||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||
```
|
||||
|
||||
#### 通过 docker 命令运行
|
||||
#### 访问运行中的容器 (Bash Shell)
|
||||
|
||||
同上, 根据您自己的实际情况修改对应的参数, 然后运行如下命令:
|
||||
当容器在后台运行时, 你可以通过以下命令进入容器:
|
||||
|
||||
```
|
||||
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
||||
```bash
|
||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||
```
|
||||
|
||||
## 预训练模型
|
||||
@ -166,13 +191,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
## 数据集格式
|
||||
|
||||
文本到语音 (TTS) 注释 .list 文件格式:
|
||||
文本到语音 (TTS) 注释 .list 文件格式:
|
||||
|
||||
```
|
||||
vocal_path|speaker_name|language|text
|
||||
```
|
||||
|
||||
语言字典:
|
||||
语言字典:
|
||||
|
||||
- 'zh': 中文
|
||||
- 'ja': 日语
|
||||
@ -180,7 +205,7 @@ vocal_path|speaker_name|language|text
|
||||
- 'ko': 韩语
|
||||
- 'yue': 粤语
|
||||
|
||||
示例:
|
||||
示例:
|
||||
|
||||
```
|
||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
|
||||
@ -213,12 +238,12 @@ python webui.py v1 <language(optional)>
|
||||
|
||||
#### 现已支持自动填充路径
|
||||
|
||||
1. 填入训练音频路径
|
||||
2. 切割音频
|
||||
3. 进行降噪(可选)
|
||||
4. 进行ASR
|
||||
5. 校对标注
|
||||
6. 前往下一个窗口,点击训练
|
||||
1. 填入训练音频路径
|
||||
2. 切割音频
|
||||
3. 进行降噪(可选)
|
||||
4. 进行 ASR
|
||||
5. 校对标注
|
||||
6. 前往下一个窗口,点击训练
|
||||
|
||||
### 打开推理 WebUI
|
||||
|
||||
@ -260,7 +285,7 @@ python webui.py
|
||||
|
||||
2. 需要克隆 github 上的最新代码
|
||||
|
||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
|
||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下
|
||||
|
||||
中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
||||
|
||||
@ -280,13 +305,13 @@ python webui.py
|
||||
|
||||
2. 需要克隆 github 上的最新代码
|
||||
|
||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
|
||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下
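下面是手动下载的示意 (以 wget 为例, 路径按上文 huggingface 仓库推断, 仅演示两个权重文件; `models--nvidia--bigvgan_v2_24khz_100band_256x` 为整个目录, 请按仓库页面结构获取, 或直接使用 install.sh 下载打包好的模型):

```bash
# 将 v3 新增的两个预训练权重下载到 GPT_SoVITS/pretrained_models
wget -P GPT_SoVITS/pretrained_models "https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1v3.ckpt"
wget -P GPT_SoVITS/pretrained_models "https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2Gv3.pth"
```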
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
## 待办事项清单
|
||||
|
||||
- [x] **高优先级: **
|
||||
- [x] **高优先级:**
|
||||
|
||||
- [x] 日语和英语的本地化.
|
||||
- [x] 用户指南.
|
||||
@ -304,11 +329,11 @@ python webui.py
|
||||
- [x] 更好的 sovits 基础模型 (增强的音频质量).
|
||||
- [ ] 模型混合.
|
||||
|
||||
## (附加) 命令行运行方式
|
||||
## (附加) 命令行运行方式
|
||||
|
||||
使用命令行打开 UVR5 的 WebUI
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
@ -319,7 +344,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
|
||||
|
||||
这是使用命令行完成数据集的音频切分的方式
|
||||
|
||||
```
|
||||
```bash
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||
@ -331,15 +356,15 @@ python audio_slicer.py \
|
||||
|
||||
这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记)
|
||||
|
||||
(没有进度条, GPU 性能可能会导致时间延迟)
|
||||
(没有进度条, GPU 性能可能会导致时间延迟)
|
||||
|
||||
```
|
||||
```bash
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
@ -347,7 +372,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
|
||||
## 致谢
|
||||
|
||||
特别感谢以下项目和贡献者:
|
||||
特别感谢以下项目和贡献者:
|
||||
|
||||
### 理论研究
|
||||
|
||||
|
@ -40,14 +40,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
### テスト済みの環境
|
||||
|
||||
| Python Version | PyTorch Version | Device |
|
||||
|----------------|------------------|-----------------|
|
||||
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
||||
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
||||
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
| Python Version | PyTorch Version | Device |
|
||||
| -------------- | ---------------- | ------------- |
|
||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
|
||||
### Windows
|
||||
|
||||
@ -56,31 +57,41 @@ Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケ
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### macOS
|
||||
|
||||
**注: Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します.**
|
||||
|
||||
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします.
|
||||
2. 以下のコマンドを実行してこのプロジェクトをインストールします.
|
||||
以下のコマンドを実行してこのプロジェクトをインストールします:
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### 手動インストール
|
||||
|
||||
#### FFmpeg をインストールします.
|
||||
#### 依存関係をインストールします
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### FFmpeg をインストールします
|
||||
|
||||
##### Conda ユーザー
|
||||
|
||||
```bash
|
||||
conda activate GPTSoVits
|
||||
conda install ffmpeg
|
||||
```
|
||||
|
||||
@ -89,12 +100,13 @@ conda install ffmpeg
|
||||
```bash
|
||||
sudo apt install ffmpeg
|
||||
sudo apt install libsox-dev
|
||||
conda install -c conda-forge 'ffmpeg<7'
|
||||
```
|
||||
|
||||
##### Windows ユーザー
|
||||
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます.
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます
|
||||
|
||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください
|
||||
|
||||
##### MacOS ユーザー
|
||||
|
||||
@ -102,38 +114,53 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
#### 依存関係をインストールします
|
||||
### GPT-SoVITS の実行 (Docker 使用)
|
||||
|
||||
#### Docker イメージの選択
|
||||
|
||||
コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください:
|
||||
|
||||
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) で最新のイメージタグを確認してください
|
||||
- 環境に合った適切なイメージタグを選択してください
|
||||
- `Lite` とは、Docker イメージに ASR モデルおよび UVR5 モデルが含まれていないことを意味します. UVR5 モデルは手動でダウンロードし、ASR モデルは必要に応じてプログラムが自動的にダウンロードします
|
||||
- Docker Compose 実行時に、対応するアーキテクチャ (amd64 または arm64) のイメージが自動的に取得されます
|
||||
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
|
||||
|
||||
#### 環境変数
|
||||
|
||||
- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます
|
||||
|
||||
#### 共有メモリの設定
|
||||
|
||||
Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします
|
||||
|
||||
#### サービスの選択
|
||||
|
||||
`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています:
|
||||
|
||||
- `GPT-SoVITS-CU126` および `GPT-SoVITS-CU128`:すべての機能を含むフルバージョン
|
||||
- `GPT-SoVITS-CU126-Lite` および `GPT-SoVITS-CU128-Lite`:依存関係を削減した軽量バージョン
|
||||
|
||||
特定のサービスを Docker Compose で実行するには、以下のコマンドを使用します:
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||
```
|
||||
|
||||
### Docker の使用
|
||||
#### Docker イメージのローカルビルド
|
||||
|
||||
#### docker-compose.yaml の設定
|
||||
自分でイメージをビルドするには、以下のコマンドを使ってください:
|
||||
|
||||
0. イメージのタグについて: コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(古いバージョン) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください.
|
||||
1. 環境変数:
|
||||
|
||||
- `is_half`: 半精度/倍精度の制御."SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です.実際の状況に応じて True または False に調整してください.
|
||||
|
||||
2. ボリューム設定: コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます.デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています.
|
||||
3. `shm_size`: Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります.状況に応じて適宜設定してください.
|
||||
4. `deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください.
|
||||
|
||||
#### docker compose で実行する
|
||||
|
||||
```bash
|
||||
docker compose -f "docker-compose.yaml" up -d
|
||||
```bash
|
||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||
```
|
||||
|
||||
#### docker コマンドで実行する
|
||||
#### 実行中のコンテナへアクセス (Bash Shell)
|
||||
|
||||
上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します:
|
||||
コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます:
|
||||
|
||||
```bash
|
||||
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
||||
```bash
|
||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||
```
|
||||
|
||||
## 事前訓練済みモデル
|
||||
@ -201,12 +228,12 @@ python webui.py v1 <言語(オプション)>
|
||||
|
||||
#### パス自動補完のサポート
|
||||
|
||||
1. 音声パスを入力する
|
||||
2. 音声を小さなチャンクに分割する
|
||||
3. ノイズ除去 (オプション)
|
||||
4. ASR
|
||||
5. ASR転写を校正する
|
||||
6. 次のタブに移動し、モデルを微調整する
|
||||
1. 音声パスを入力する
|
||||
2. 音声を小さなチャンクに分割する
|
||||
3. ノイズ除去 (オプション)
|
||||
4. ASR
|
||||
5. ASR 転写を校正する
|
||||
6. 次のタブに移動し、モデルを微調整する
|
||||
|
||||
### 推論 WebUI を開く
|
||||
|
||||
@ -248,7 +275,7 @@ V1 環境から V2 を使用するには:
|
||||
|
||||
2. 最新のコードを github からクローン
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置
|
||||
|
||||
中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
||||
|
||||
@ -268,7 +295,7 @@ v2 環境から v3 を使用する方法:
|
||||
|
||||
2. GitHub から最新のコードをクローンします.
|
||||
|
||||
3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します.
|
||||
3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します.
|
||||
|
||||
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください.
|
||||
|
||||
@ -296,7 +323,7 @@ v2 環境から v3 を使用する方法:
|
||||
|
||||
コマンド ラインを使用して UVR5 の WebUI を開きます
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
@ -307,7 +334,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
|
||||
|
||||
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
|
||||
|
||||
```
|
||||
```bash
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||
@ -319,7 +346,7 @@ python audio_slicer.py \
|
||||
|
||||
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
@ -327,7 +354,7 @@ ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く A
|
||||
|
||||
(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
||||
|
||||
```
|
||||
```bash
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
@ -335,7 +362,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
|
||||
## クレジット
|
||||
|
||||
特に以下のプロジェクトと貢献者に感謝します:
|
||||
特に以下のプロジェクトと貢献者に感謝します:
|
||||
|
||||
### 理論研究
|
||||
|
||||
|
@ -40,14 +40,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
### 테스트 통과 환경
|
||||
|
||||
| Python Version | PyTorch Version | Device |
|
||||
|----------------|------------------|-----------------|
|
||||
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
||||
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
||||
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
| Python Version | PyTorch Version | Device |
|
||||
| -------------- | ---------------- | ------------- |
|
||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
|
||||
### Windows
|
||||
|
||||
@ -56,31 +57,41 @@ Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### macOS
|
||||
|
||||
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
|
||||
|
||||
1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요.
|
||||
2. 다음 명령어를 실행하여 이 프로젝트를 설치하세요.
|
||||
다음 명령어를 실행하여 이 프로젝트를 설치하세요
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### 수동 설치
|
||||
|
||||
#### 의존성 설치
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### FFmpeg 설치
|
||||
|
||||
##### Conda 사용자
|
||||
|
||||
```bash
|
||||
conda activate GPTSoVits
|
||||
conda install ffmpeg
|
||||
```
|
||||
|
||||
@ -89,14 +100,13 @@ conda install ffmpeg
|
||||
```bash
|
||||
sudo apt install ffmpeg
|
||||
sudo apt install libsox-dev
|
||||
conda install -c conda-forge 'ffmpeg<7'
|
||||
```
|
||||
|
||||
##### Windows 사용자
|
||||
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다.
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다
|
||||
|
||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
|
||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치
|
||||
|
||||
##### MacOS 사용자
|
||||
|
||||
@ -104,41 +114,53 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
#### 의존성 설치
|
||||
### GPT-SoVITS 실행하기 (Docker 사용)
|
||||
|
||||
#### Docker 이미지 선택
|
||||
|
||||
코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요:
|
||||
|
||||
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits)에서 최신 이미지 태그를 확인하세요
|
||||
- 환경에 맞는 적절한 이미지 태그를 선택하세요
|
||||
- `Lite` 는 Docker 이미지에 ASR 모델과 UVR5 모델이 포함되어 있지 않음을 의미합니다. UVR5 모델은 사용자가 직접 다운로드해야 하며, ASR 모델은 필요 시 프로그램이 자동으로 다운로드합니다
|
||||
- Docker Compose 실행 시, 해당 아키텍처에 맞는 이미지(amd64 또는 arm64)가 자동으로 다운로드됩니다
|
||||
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
|
||||
|
||||
#### 환경 변수
|
||||
|
||||
- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다
|
||||
|
||||
#### 공유 메모리 설정
|
||||
|
||||
Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다
|
||||
|
||||
#### 서비스 선택
|
||||
|
||||
`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다:
|
||||
|
||||
- `GPT-SoVITS-CU126` 및 `GPT-SoVITS-CU128`: 전체 기능을 포함한 풀 버전
|
||||
- `GPT-SoVITS-CU126-Lite` 및 `GPT-SoVITS-CU128-Lite`: 의존성이 줄어든 경량 버전
|
||||
|
||||
특정 서비스를 Docker Compose로 실행하려면 다음 명령을 사용하세요:
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||
```
|
||||
|
||||
### Docker에서 사용
|
||||
#### Docker 이미지 직접 빌드하기
|
||||
|
||||
#### docker-compose.yaml 설정
|
||||
직접 이미지를 빌드하려면 다음 명령어를 사용하세요:
|
||||
|
||||
0. 이미지 태그: 코드 저장소가 빠르게 업데이트되고 패키지가 느리게 빌드되고 테스트되므로, 현재 빌드된 최신 도커 이미지를 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(오래된 버전) 에서 확인하고 필요에 따라 Dockerfile을 사용하여 로컬에서 빌드할 수 있습니다.
|
||||
|
||||
1. 환경 변수:
|
||||
|
||||
- is_half: 반정밀/배정밀 제어. "SSL 추출" 단계에서 4-cnhubert/5-wav32k 디렉토리의 내용을 올바르게 생성할 수 없는 경우, 일반적으로 이것 때문입니다. 실제 상황에 따라 True 또는 False로 조정할 수 있습니다.
|
||||
|
||||
2. 볼륨 설정, 컨테이너 내의 애플리케이션 루트 디렉토리를 /workspace로 설정합니다. 기본 docker-compose.yaml에는 실제 예제가 나열되어 있으므로 업로드/다운로드를 쉽게 할 수 있습니다.
|
||||
|
||||
3. shm_size: Windows의 Docker Desktop의 기본 사용 가능한 메모리가 너무 작아 오류가 발생할 수 있으므로 실제 상황에 따라 조정합니다.
|
||||
|
||||
4. deploy 섹션의 gpu 관련 내용은 시스템 및 실제 상황에 따라 조정합니다.
|
||||
|
||||
#### docker compose로 실행
|
||||
|
||||
```
|
||||
docker compose -f "docker-compose.yaml" up -d
|
||||
```bash
|
||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||
```
|
||||
|
||||
#### docker 명령으로 실행
|
||||
#### 실행 중인 컨테이너 접속하기 (Bash Shell)
|
||||
|
||||
위와 동일하게 실제 상황에 맞게 매개변수를 수정한 다음 다음 명령을 실행합니다:
|
||||
컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다:
|
||||
|
||||
```
|
||||
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
||||
```bash
|
||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||
```
|
||||
|
||||
## 사전 학습된 모델
|
||||
@ -206,12 +228,12 @@ python webui.py v1 <언어(옵션)>
|
||||
|
||||
#### 경로 자동 채우기가 지원됩니다
|
||||
|
||||
1. 오디오 경로를 입력하십시오.
|
||||
2. 오디오를 작은 청크로 분할하십시오.
|
||||
3. 노이즈 제거(옵션)
|
||||
4. ASR 수행
|
||||
5. ASR 전사를 교정하십시오.
|
||||
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
|
||||
1. 오디오 경로를 입력하십시오.
|
||||
2. 오디오를 작은 청크로 분할하십시오.
|
||||
3. 노이즈 제거(옵션)
|
||||
4. ASR 수행
|
||||
5. ASR 전사를 교정하십시오.
|
||||
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
|
||||
|
||||
### 추론 WebUI 열기
|
||||
|
||||
@ -253,7 +275,7 @@ V1 환경에서 V2를 사용하려면:
|
||||
|
||||
2. github에서 최신 코드를 클론하십시오.
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오.
|
||||
|
||||
중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
||||
|
||||
@ -273,7 +295,7 @@ v2 환경에서 v3 사용하기:
|
||||
|
||||
2. 최신 코드를 github 에서 클론합니다.
|
||||
|
||||
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
|
||||
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다.
|
||||
|
||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
||||
|
||||
@ -302,7 +324,7 @@ v2 환경에서 v3 사용하기:
|
||||
|
||||
명령줄을 사용하여 UVR5용 WebUI 열기
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
@ -313,7 +335,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
|
||||
|
||||
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
||||
|
||||
```
|
||||
```bash
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||
@ -325,7 +347,7 @@ python audio_slicer.py \
|
||||
|
||||
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
@ -333,7 +355,7 @@ ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행
|
||||
|
||||
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
||||
|
||||
```
|
||||
```bash
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
|
@ -42,14 +42,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
### Test Edilmiş Ortamlar
|
||||
|
||||
| Python Version | PyTorch Version | Device |
|
||||
|----------------|------------------|-----------------|
|
||||
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
||||
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
||||
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
| Python Version | PyTorch Version | Device |
|
||||
| -------------- | ---------------- | ------------- |
|
||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
||||
|
||||
### Windows
|
||||
|
||||
@ -58,31 +59,41 @@ Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### macOS
|
||||
|
||||
**Not: Mac'lerde GPU'larla eğitilen modeller, diğer cihazlarda eğitilenlere göre önemli ölçüde daha düşük kalitede sonuç verir, bu nedenle geçici olarak CPU'lar kullanıyoruz.**
|
||||
|
||||
1. `xcode-select --install` komutunu çalıştırarak Xcode komut satırı araçlarını yükleyin.
|
||||
2. Aşağıdaki komutları çalıştırarak programı yükleyin:
|
||||
Aşağıdaki komutları çalıştırarak programı yükleyin:
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
||||
```
|
||||
|
||||
### El ile Yükleme
|
||||
|
||||
#### Bağımlılıkları Yükleme
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.10
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### FFmpeg'i Yükleme
|
||||
|
||||
##### Conda Kullanıcıları
|
||||
|
||||
```bash
|
||||
conda activate GPTSoVits
|
||||
conda install ffmpeg
|
||||
```
|
||||
|
||||
@ -91,12 +102,13 @@ conda install ffmpeg
|
||||
```bash
|
||||
sudo apt install ffmpeg
|
||||
sudo apt install libsox-dev
|
||||
conda install -c conda-forge 'ffmpeg<7'
|
||||
```
|
||||
|
||||
##### Windows Kullanıcıları
|
||||
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin
|
||||
|
||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin
|
||||
|
||||
##### MacOS Kullanıcıları
|
||||
|
||||
@ -104,36 +116,53 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
brew install ffmpeg
|
||||
```
|
||||
|
||||
#### Bağımlılıkları Yükleme
|
||||
### GPT-SoVITS Çalıştırma (Docker Kullanarak)
|
||||
|
||||
#### Docker İmajı Seçimi
|
||||
|
||||
Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin:
|
||||
|
||||
- En güncel kullanılabilir imaj etiketlerini görmek için [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) adresini kontrol edin
|
||||
- Ortamınıza uygun bir imaj etiketi seçin
|
||||
- `Lite`, Docker imajında ASR modelleri ve UVR5 modellerinin bulunmadığı anlamına gelir. UVR5 modellerini manuel olarak indirebilirsiniz; ASR modelleri ise gerektiğinde program tarafından otomatik olarak indirilir
|
||||
- Docker Compose sırasında, uygun mimariye (amd64 veya arm64) ait imaj otomatik olarak indirilir
|
||||
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
|
||||
|
||||
#### Ortam Değişkenleri
|
||||
|
||||
- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU’nuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın.
|
||||
|
||||
#### Paylaşılan Bellek Yapılandırması
|
||||
|
||||
Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir.
|
||||
|
||||
#### Servis Seçimi
|
||||
|
||||
`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır:
|
||||
|
||||
- `GPT-SoVITS-CU126` ve `GPT-SoVITS-CU128`: Tüm özellikleri içeren tam sürüm.
|
||||
- `GPT-SoVITS-CU126-Lite` ve `GPT-SoVITS-CU128-Lite`: Daha az bağımlılığa ve sınırlı işlevselliğe sahip hafif sürüm.
|
||||
|
||||
Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanın:
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||
```
|
||||
|
||||
### Docker Kullanarak
|
||||
#### Docker İmajını Yerel Olarak Oluşturma
|
||||
|
||||
#### docker-compose.yaml yapılandırması
|
||||
Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
|
||||
|
||||
0. Görüntü etiketleri hakkında: Kod tabanındaki hızlı güncellemeler ve görüntüleri paketleme ve test etme işleminin yavaş olması nedeniyle, lütfen şu anda paketlenmiş en son görüntüleri kontrol etmek için [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(eski sürüm) adresini kontrol edin ve durumunuza göre seçim yapın veya alternatif olarak, kendi ihtiyaçlarınıza göre bir Dockerfile kullanarak yerel olarak oluşturun.
|
||||
1. Ortam Değişkenleri:
|
||||
- is_half: Yarım hassasiyet/çift hassasiyeti kontrol eder. Bu genellikle "SSL çıkarma" adımı sırasında 4-cnhubert/5-wav32k dizinleri altındaki içeriğin doğru şekilde oluşturulmamasının nedenidir. Gerçek durumunuza göre True veya False olarak ayarlayın.
|
||||
2. Birim Yapılandırması, Kapsayıcı içindeki uygulamanın kök dizini /workspace olarak ayarlanmıştır. Varsayılan docker-compose.yaml, içerik yükleme/indirme için bazı pratik örnekler listeler.
|
||||
3. shm_size: Windows üzerinde Docker Desktop için varsayılan kullanılabilir bellek çok küçüktür, bu da anormal işlemlere neden olabilir. Kendi durumunuza göre ayarlayın.
|
||||
4. Dağıtım bölümü altında, GPU ile ilgili ayarlar sisteminize ve gerçek koşullara göre dikkatlice ayarlanmalıdır.
|
||||
|
||||
#### docker compose ile çalıştırma
|
||||
|
||||
```
|
||||
docker compose -f "docker-compose.yaml" up -d
|
||||
```bash
|
||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||
```
|
||||
|
||||
#### docker komutu ile çalıştırma
|
||||
#### Çalışan Konteynere Erişim (Bash Shell)
|
||||
|
||||
Yukarıdaki gibi, ilgili parametreleri gerçek durumunuza göre değiştirin, ardından aşağıdaki komutu çalıştırın:
|
||||
Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz:
|
||||
|
||||
```
|
||||
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
||||
```bash
|
||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||
```
|
||||
|
||||
## Önceden Eğitilmiş Modeller
|
||||
@ -203,12 +232,12 @@ veya WebUI'de manuel olarak sürüm değiştirin.
|
||||
|
||||
#### Yol Otomatik Doldurma artık destekleniyor
|
||||
|
||||
1. Ses yolunu doldurun
|
||||
2. Sesi küçük parçalara ayırın
|
||||
3. Gürültü azaltma (isteğe bağlı)
|
||||
4. ASR
|
||||
5. ASR transkripsiyonlarını düzeltin
|
||||
6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın
|
||||
1. Ses yolunu doldurun
|
||||
2. Sesi küçük parçalara ayırın
|
||||
3. Gürültü azaltma (isteğe bağlı)
|
||||
4. ASR
|
||||
5. ASR transkripsiyonlarını düzeltin
|
||||
6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın
|
||||
|
||||
### Çıkarım WebUI'sini Açın
|
||||
|
||||
@ -250,7 +279,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
2. github'dan en son kodları klonlayın.
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin.
|
||||
|
||||
Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
|
||||
|
||||
@ -270,7 +299,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
2. GitHub'dan en son kodları klonlayın.
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
|
||||
|
||||
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
|
||||
|
||||
@ -298,7 +327,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
@ -309,7 +338,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --
|
||||
|
||||
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
|
||||
|
||||
```
|
||||
```bash
|
||||
python audio_slicer.py \
|
||||
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
|
||||
--output_root "<alt_bölümlere_ayrılmış_ses_kliplerinin_kaydedileceği_dizin>" \
|
||||
@ -321,7 +350,7 @@ python audio_slicer.py \
|
||||
|
||||
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
|
||||
|
||||
```
|
||||
```bash
|
||||
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
|
||||
```
|
||||
|
||||
@ -329,7 +358,7 @@ ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışınd
|
||||
|
||||
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
|
||||
|
||||
```
|
||||
```bash
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
```
|
||||
|
||||
|
@ -1,2 +1,6 @@
|
||||
set "SCRIPT_DIR=%~dp0"
|
||||
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
|
||||
cd /d "%SCRIPT_DIR%"
|
||||
set "PATH=%SCRIPT_DIR%\runtime;%PATH%"
|
||||
runtime\python.exe -I webui.py zh_CN
|
||||
pause
|
||||
|
@ -1,4 +1,7 @@
|
||||
$ErrorActionPreference = "SilentlyContinue"
|
||||
chcp 65001
|
||||
& "$PSScriptRoot\runtime\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
|
||||
Set-Location $PSScriptRoot
|
||||
$runtimePath = Join-Path $PSScriptRoot "runtime"
|
||||
$env:PATH = "$runtimePath;$env:PATH"
|
||||
& "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
|
||||
pause
|
||||
|
255
install.sh
@ -14,18 +14,24 @@ fi
|
||||
|
||||
trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
|
||||
|
||||
is_HF=false
|
||||
is_HF_MIRROR=false
|
||||
is_MODELSCOPE=false
|
||||
USE_CUDA=false
|
||||
USE_ROCM=false
|
||||
USE_CPU=false
|
||||
WORKFLOW=${WORKFLOW:-"false"}
|
||||
|
||||
USE_HF=false
|
||||
USE_HF_MIRROR=false
|
||||
USE_MODELSCOPE=false
|
||||
DOWNLOAD_UVR5=false
|
||||
|
||||
print_help() {
|
||||
echo "Usage: bash install.sh [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)"
|
||||
echo " --download-uvr5 Enable downloading the UVR5 model"
|
||||
echo " -h, --help Show this help message and exit"
|
||||
echo " --device CU126|CU128|ROCM|MPS|CPU Specify the Device (REQUIRED)"
|
||||
echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)"
|
||||
echo " --download-uvr5 Enable downloading the UVR5 model"
|
||||
echo " -h, --help Show this help message and exit"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " bash install.sh --source HF --download-uvr5"
|
||||
@ -41,132 +47,192 @@ fi
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--source)
|
||||
case "$2" in
|
||||
HF)
|
||||
is_HF=true
|
||||
;;
|
||||
HF-Mirror)
|
||||
is_HF_MIRROR=true
|
||||
;;
|
||||
ModelScope)
|
||||
is_MODELSCOPE=true
|
||||
;;
|
||||
*)
|
||||
echo "Error: Invalid Download Source: $2"
|
||||
echo "Choose From: [HF, HF-Mirror, ModelScope]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
shift 2
|
||||
--source)
|
||||
case "$2" in
|
||||
HF)
|
||||
USE_HF=true
|
||||
;;
|
||||
--download-uvr5)
|
||||
DOWNLOAD_UVR5=true
|
||||
shift
|
||||
HF-Mirror)
|
||||
USE_HF_MIRROR=true
|
||||
;;
|
||||
-h|--help)
|
||||
print_help
|
||||
exit 0
|
||||
ModelScope)
|
||||
USE_MODELSCOPE=true
|
||||
;;
|
||||
*)
|
||||
echo "Unknown Argument: $1"
|
||||
echo "Use -h or --help to see available options."
|
||||
echo "Error: Invalid Download Source: $2"
|
||||
echo "Choose From: [HF, HF-Mirror, ModelScope]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
shift 2
|
||||
;;
|
||||
--device)
|
||||
case "$2" in
|
||||
CU126)
|
||||
CUDA=126
|
||||
USE_CUDA=true
|
||||
;;
|
||||
CU128)
|
||||
CUDA=128
|
||||
USE_CUDA=true
|
||||
;;
|
||||
ROCM)
|
||||
USE_ROCM=true
|
||||
;;
|
||||
MPS)
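# MPS is handled the same as CPU here; see the PyTorch install branch below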
|
||||
USE_CPU=true
|
||||
;;
|
||||
CPU)
|
||||
USE_CPU=true
|
||||
;;
|
||||
*)
|
||||
echo "Error: Invalid Device: $2"
|
||||
echo "Choose From: [CU126, CU128, ROCM, MPS, CPU]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
shift 2
|
||||
;;
|
||||
--download-uvr5)
|
||||
DOWNLOAD_UVR5=true
|
||||
shift
|
||||
;;
|
||||
-h | --help)
|
||||
print_help
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown Argument: $1"
|
||||
echo "Use -h or --help to see available options."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if ! $is_HF && ! $is_HF_MIRROR && ! $is_MODELSCOPE; then
|
||||
if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then
|
||||
echo "Error: Device is REQUIRED"
|
||||
echo ""
|
||||
print_help
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then
|
||||
echo "Error: Download Source is REQUIRED"
|
||||
echo ""
|
||||
print_help
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "$is_HF" = "true" ]; then
|
||||
# 安装构建工具
|
||||
# Install build tools
|
||||
if [ "$(uname)" != "Darwin" ]; then
|
||||
gcc_major_version=$(command -v gcc >/dev/null 2>&1 && gcc -dumpversion | cut -d. -f1 || echo 0)
|
||||
if [ "$gcc_major_version" -lt 11 ]; then
|
||||
echo "Installing GCC & G++..."
|
||||
conda install -c conda-forge gcc=11 gxx=11 -q -y
|
||||
else
|
||||
echo "GCC >=11"
|
||||
fi
|
||||
else
|
||||
if ! xcode-select -p &>/dev/null; then
|
||||
echo "Installing Xcode Command Line Tools..."
|
||||
xcode-select --install
|
||||
fi
|
||||
echo "Waiting For Xcode Command Line Tools Installation Complete..."
|
||||
while true; do
|
||||
sleep 20
|
||||
|
||||
if xcode-select -p &>/dev/null; then
|
||||
echo "Xcode Command Line Tools Installed"
|
||||
break
|
||||
else
|
||||
echo "Installing,Please Wait..."
|
||||
fi
|
||||
done
|
||||
conda install -c conda-forge -q -y
|
||||
fi
|
||||
|
||||
echo "Installing ffmpeg and cmake..."
|
||||
conda install ffmpeg cmake make -q -y
|
||||
|
||||
echo "Installing unzip..."
|
||||
conda install unzip -y --quiet
|
||||
|
||||
if [ "$USE_HF" = "true" ]; then
|
||||
echo "Download Model From HuggingFace"
|
||||
PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
|
||||
G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
|
||||
UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
|
||||
elif [ "$is_HF_MIRROR" = "true" ]; then
|
||||
NLTK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
|
||||
PYOPENJTALK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
|
||||
elif [ "$USE_HF_MIRROR" = "true" ]; then
|
||||
echo "Download Model From HuggingFace-Mirror"
|
||||
PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
|
||||
G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
|
||||
UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
|
||||
elif [ "$is_MODELSCOPE" = "true" ]; then
|
||||
NLTK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
|
||||
PYOPENJTALK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
|
||||
elif [ "$USE_MODELSCOPE" = "true" ]; then
|
||||
echo "Download Model From ModelScope"
|
||||
PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip"
|
||||
G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip"
|
||||
UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip"
|
||||
NLTK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip"
|
||||
PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
|
||||
fi
|
||||
|
||||
if find "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
||||
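# Use quieter wget output (-nv) when running in an automated workflow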
if [ "$WORKFLOW" = "true" ]; then
|
||||
WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
|
||||
else
|
||||
WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
|
||||
fi
|
||||
|
||||
if find -L "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
||||
echo "Pretrained Model Exists"
|
||||
else
|
||||
echo "Download Pretrained Models"
|
||||
wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$PRETRINED_URL"
|
||||
"${WGET_CMD[@]}" "$PRETRINED_URL"
|
||||
|
||||
unzip pretrained_models.zip
|
||||
unzip -q -o pretrained_models.zip -d GPT_SoVITS
|
||||
rm -rf pretrained_models.zip
|
||||
mv pretrained_models/* GPT_SoVITS/pretrained_models
|
||||
rm -rf pretrained_models
|
||||
fi
|
||||
|
||||
if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then
|
||||
echo "Download G2PWModel"
|
||||
wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$G2PW_URL"
|
||||
"${WGET_CMD[@]}" "$G2PW_URL"
|
||||
|
||||
unzip G2PWModel.zip
|
||||
unzip -q -o G2PWModel.zip -d GPT_SoVITS/text
|
||||
rm -rf G2PWModel.zip
|
||||
mv G2PWModel GPT_SoVITS/text/G2PWModel
|
||||
else
|
||||
echo "G2PWModel Exists"
|
||||
fi
|
||||
|
||||
if [ "$DOWNLOAD_UVR5" = "true" ];then
|
||||
if find "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
||||
if [ "$DOWNLOAD_UVR5" = "true" ]; then
|
||||
if find -L "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
||||
echo "UVR5 Model Exists"
|
||||
else
|
||||
echo "Download UVR5 Model"
|
||||
wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$UVR5_URL"
|
||||
"${WGET_CMD[@]}" "$UVR5_URL"
|
||||
|
||||
unzip uvr5_weights.zip
|
||||
unzip -q -o uvr5_weights.zip -d tools/uvr5
|
||||
rm -rf uvr5_weights.zip
|
||||
mv uvr5_weights/* tools/uvr5/uvr5_weights
|
||||
rm -rf uvr5_weights
|
||||
fi
|
||||
fi
|
||||
|
||||
# 安装构建工具
|
||||
# Install build tools
|
||||
echo "Installing GCC..."
|
||||
conda install -c conda-forge gcc=14 -y
|
||||
|
||||
echo "Installing G++..."
|
||||
conda install -c conda-forge gxx -y
|
||||
|
||||
echo "Installing ffmpeg and cmake..."
|
||||
conda install ffmpeg cmake -y
|
||||
|
||||
echo "Installing git-lfs and zip..."
|
||||
conda install git-lfs -y
|
||||
conda install zip -y
|
||||
|
||||
git-lfs install
|
||||
|
||||
echo "Checking for CUDA installation..."
|
||||
if command -v nvidia-smi &>/dev/null; then
|
||||
USE_CUDA=true
|
||||
echo "CUDA found."
|
||||
else
|
||||
echo "CUDA not found."
|
||||
USE_CUDA=false
|
||||
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
|
||||
echo "Checking for CUDA installation..."
|
||||
if command -v nvidia-smi &>/dev/null; then
|
||||
echo "CUDA found."
|
||||
else
|
||||
USE_CUDA=false
|
||||
USE_CPU=true
|
||||
echo "CUDA not found."
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$USE_CUDA" = false ]; then
|
||||
if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
|
||||
echo "Checking for ROCm installation..."
|
||||
if [ -d "/opt/rocm" ]; then
|
||||
USE_ROCM=true
|
||||
echo "ROCm found."
|
||||
if grep -qi "microsoft" /proc/version; then
|
||||
echo "You are running WSL."
|
||||
@ -176,20 +242,28 @@ if [ "$USE_CUDA" = false ]; then
|
||||
IS_WSL=false
|
||||
fi
|
||||
else
|
||||
echo "ROCm not found."
|
||||
USE_ROCM=false
|
||||
USE_CPU=true
|
||||
echo "ROCm not found."
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$USE_CUDA" = true ]; then
|
||||
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
|
||||
echo "Installing PyTorch with CUDA support..."
|
||||
pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
|
||||
elif [ "$USE_ROCM" = true ]; then
|
||||
if [ "$CUDA" = 128 ]; then
|
||||
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128
|
||||
elif [ "$CUDA" = 126 ]; then
|
||||
pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu126
|
||||
fi
|
||||
elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
|
||||
echo "Installing PyTorch with ROCm support..."
|
||||
pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
|
||||
else
|
||||
pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
|
||||
elif [ "$USE_CPU" = true ] && [ "$WORKFLOW" = false ]; then
|
||||
echo "Installing PyTorch for CPU..."
|
||||
pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu
|
||||
pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cpu
|
||||
elif [ "$WORKFLOW" = false ]; then
|
||||
echo "Unknown Err"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Installing Python dependencies from requirements.txt..."
|
||||
@ -198,11 +272,20 @@ echo "Installing Python dependencies from requirements.txt..."
|
||||
# Refresh environment
|
||||
hash -r
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r extra-req.txt --no-deps --quiet
|
||||
|
||||
pip install -r requirements.txt
|
||||
pip install -r requirements.txt --quiet
|
||||
|
||||
python -c "import nltk; nltk.download(['averaged_perceptron_tagger','averaged_perceptron_tagger_eng','cmudict'])"
|
||||
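# Resolve the Python env prefix and the pyopenjtalk package dir; the NLTK data zip and Open JTalk dictionary are unpacked into them below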
PY_PREFIX=$(python -c "import sys; print(sys.prefix)")
|
||||
PYOPENJTALK_PREFIX=$(python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))")
|
||||
|
||||
"${WGET_CMD[@]}" "$NLTK_URL" -O nltk_data.zip
|
||||
unzip -q -o nltk_data -d "$PY_PREFIX"
|
||||
rm -rf nltk_data.zip
|
||||
|
||||
"${WGET_CMD[@]}" "$PYOPENJTALK_URL" -O open_jtalk_dic_utf_8-1.11.tar.gz
|
||||
tar -xvzf open_jtalk_dic_utf_8-1.11.tar.gz -C "$PYOPENJTALK_PREFIX"
|
||||
rm -rf open_jtalk_dic_utf_8-1.11.tar.gz
|
||||
|
||||
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
||||
echo "Update to WSL compatible runtime lib..."
|
||||
|
@ -1,3 +1,4 @@
|
||||
--no-binary=opencc
|
||||
numpy<2.0
|
||||
scipy
|
||||
tensorboard
|
||||
@ -6,8 +7,8 @@ numba
|
||||
pytorch-lightning>=2.4
|
||||
gradio<5
|
||||
ffmpeg-python
|
||||
onnxruntime; sys_platform == 'darwin'
|
||||
onnxruntime-gpu; sys_platform != 'darwin'
|
||||
onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64"
|
||||
tqdm
|
||||
funasr==1.0.27
|
||||
cn2an
|
||||
@ -17,7 +18,7 @@ g2p_en
|
||||
torchaudio
|
||||
modelscope==1.10.0
|
||||
sentencepiece
|
||||
transformers>=4.43
|
||||
transformers>=4.43,<=4.50
|
||||
peft
|
||||
chardet
|
||||
PyYAML
|
||||
@ -31,8 +32,7 @@ rotary_embedding_torch
|
||||
ToJyutping
|
||||
g2pk2
|
||||
ko_pron
|
||||
opencc; sys_platform != 'linux'
|
||||
opencc==1.1.1; sys_platform == 'linux'
|
||||
opencc
|
||||
python_mecab_ko; sys_platform != 'win32'
|
||||
fastapi[standard]>=0.115.2
|
||||
x_transformers
|
||||
|
@ -10,6 +10,7 @@ from faster_whisper import WhisperModel
|
||||
from tqdm import tqdm
|
||||
|
||||
from tools.asr.config import check_fw_local_models
|
||||
from tools.my_utils import load_cudnn
|
||||
|
||||
# fmt: off
|
||||
language_code_list = [
|
||||
@ -93,6 +94,8 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
|
||||
return output_file_path
|
||||
|
||||
|
||||
load_cudnn()
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
|
@ -1,11 +1,15 @@
|
||||
import ctypes
|
||||
import os
|
||||
import traceback
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import ffmpeg
|
||||
import numpy as np
|
||||
import gradio as gr
|
||||
from tools.i18n.i18n import I18nAuto
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from tools.i18n.i18n import I18nAuto
|
||||
|
||||
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
|
||||
|
||||
|
||||
@ -15,7 +19,7 @@ def load_audio(file, sr):
|
||||
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
|
||||
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
|
||||
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
|
||||
if os.path.exists(file) == False:
|
||||
if os.path.exists(file) is False:
|
||||
raise RuntimeError("You input a wrong audio path that does not exists, please fix it!")
|
||||
out, _ = (
|
||||
ffmpeg.input(file, threads=0)
|
||||
@ -23,7 +27,11 @@ def load_audio(file, sr):
|
||||
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
|
||||
)
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
out, _ = (
|
||||
ffmpeg.input(file, threads=0)
|
||||
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
|
||||
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True)
|
||||
) # Expose the Error
|
||||
raise RuntimeError(i18n("音频加载失败"))
|
||||
|
||||
return np.frombuffer(out, np.float32).flatten()
|
||||
@ -127,3 +135,97 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
|
||||
...
|
||||
else:
|
||||
gr.Warning(i18n("缺少语义数据集"))
|
||||
|
||||
|
||||
def load_cudnn():
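# Preload the cuDNN libraries bundled with the PyTorch wheel via ctypes so that dependents such as faster-whisper can locate them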
|
||||
import torch
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
print("[INFO] CUDA is not available, skipping cuDNN setup.")
|
||||
return
|
||||
|
||||
if sys.platform == "win32":
|
||||
torch_lib_dir = Path(torch.__file__).parent / "lib"
|
||||
if torch_lib_dir.exists():
|
||||
os.add_dll_directory(str(torch_lib_dir))
|
||||
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
|
||||
matching_files = sorted(torch_lib_dir.glob("cudnn_cnn*.dll"))
|
||||
if not matching_files:
|
||||
print(f"[ERROR] No cudnn_cnn*.dll found in {torch_lib_dir}")
|
||||
return
|
||||
for dll_path in matching_files:
|
||||
dll_name = os.path.basename(dll_path)
|
||||
try:
|
||||
ctypes.CDLL(dll_name)
|
||||
print(f"[INFO] Loaded: {dll_name}")
|
||||
except OSError as e:
|
||||
print(f"[WARNING] Failed to load {dll_name}: {e}")
|
||||
else:
|
||||
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
|
||||
|
||||
elif sys.platform == "linux":
|
||||
site_packages = Path(torch.__file__).resolve().parents[1]
|
||||
cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib"
|
||||
|
||||
if not cudnn_dir.exists():
|
||||
print(f"[ERROR] cudnn dir not found: {cudnn_dir}")
|
||||
return
|
||||
|
||||
matching_files = sorted(cudnn_dir.glob("libcudnn_cnn*.so*"))
|
||||
if not matching_files:
|
||||
print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}")
|
||||
return
|
||||
|
||||
for so_path in matching_files:
|
||||
try:
|
||||
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
|
||||
print(f"[INFO] Loaded: {so_path}")
|
||||
except OSError as e:
|
||||
print(f"[WARNING] Failed to load {so_path}: {e}")
|
||||
|
||||
|
||||
def load_nvrtc():
|
||||
import torch
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
print("[INFO] CUDA is not available, skipping nvrtc setup.")
|
||||
return
|
||||
|
||||
if sys.platform == "win32":
|
||||
torch_lib_dir = Path(torch.__file__).parent / "lib"
|
||||
if torch_lib_dir.exists():
|
||||
os.add_dll_directory(str(torch_lib_dir))
|
||||
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
|
||||
matching_files = sorted(torch_lib_dir.glob("nvrtc*.dll"))
|
||||
if not matching_files:
|
||||
print(f"[ERROR] No nvrtc*.dll found in {torch_lib_dir}")
|
||||
return
|
||||
for dll_path in matching_files:
|
||||
dll_name = os.path.basename(dll_path)
|
||||
try:
|
||||
ctypes.CDLL(dll_name)
|
||||
print(f"[INFO] Loaded: {dll_name}")
|
||||
except OSError as e:
|
||||
print(f"[WARNING] Failed to load {dll_name}: {e}")
|
||||
else:
|
||||
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
|
||||
|
||||
elif sys.platform == "linux":
|
||||
site_packages = Path(torch.__file__).resolve().parents[1]
|
||||
nvrtc_dir = site_packages / "nvidia" / "cuda_nvrtc" / "lib"
|
||||
|
||||
if not nvrtc_dir.exists():
|
||||
print(f"[ERROR] nvrtc dir not found: {nvrtc_dir}")
|
||||
return
|
||||
|
||||
matching_files = sorted(nvrtc_dir.glob("libnvrtc*.so*"))
|
||||
if not matching_files:
|
||||
print(f"[ERROR] No libnvrtc*.so* found in {nvrtc_dir}")
|
||||
return
|
||||
|
||||
for so_path in matching_files:
|
||||
try:
|
||||
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
|
||||
print(f"[INFO] Loaded: {so_path}")
|
||||
except OSError as e:
|
||||
print(f"[WARNING] Failed to load {so_path}: {e}")
|
||||
|
@ -1,7 +1,12 @@
|
||||
# -*- coding: gbk -*-
|
||||
import sys
|
||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
|
||||
i18n = I18nAuto(language=language)
|
||||
import argparse
|
||||
import os
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
|
||||
try:
|
||||
@ -11,8 +16,8 @@ try:
|
||||
except:
|
||||
...
|
||||
|
||||
import librosa
|
||||
import gradio as gr
|
||||
import librosa
|
||||
import numpy as np
|
||||
import soundfile
|
||||
|
||||
@ -303,7 +308,10 @@ if __name__ == "__main__":
|
||||
|
||||
set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch)
|
||||
|
||||
with gr.Blocks() as demo:
|
||||
with gr.Blocks(analytics_enabled=False) as demo:
|
||||
gr.Markdown(
|
||||
value=i18n("Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)")
|
||||
)
|
||||
with gr.Row():
|
||||
btn_change_index = gr.Button("Change Index")
|
||||
btn_submit_change = gr.Button("Submit Text")
|
||||
|
@ -32,18 +32,10 @@ def make_pair(mix_dir, inst_dir):
|
||||
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
|
||||
|
||||
X_list = sorted(
|
||||
[
|
||||
os.path.join(mix_dir, fname)
|
||||
for fname in os.listdir(mix_dir)
|
||||
if os.path.splitext(fname)[1] in input_exts
|
||||
]
|
||||
[os.path.join(mix_dir, fname) for fname in os.listdir(mix_dir) if os.path.splitext(fname)[1] in input_exts]
|
||||
)
|
||||
y_list = sorted(
|
||||
[
|
||||
os.path.join(inst_dir, fname)
|
||||
for fname in os.listdir(inst_dir)
|
||||
if os.path.splitext(fname)[1] in input_exts
|
||||
]
|
||||
[os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts]
|
||||
)
|
||||
|
||||
filelist = list(zip(X_list, y_list))
|
||||
@ -65,14 +57,10 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
|
||||
train_filelist = filelist[:-val_size]
|
||||
val_filelist = filelist[-val_size:]
|
||||
else:
|
||||
train_filelist = [
|
||||
pair for pair in filelist if list(pair) not in val_filelist
|
||||
]
|
||||
train_filelist = [pair for pair in filelist if list(pair) not in val_filelist]
|
||||
elif split_mode == "subdirs":
|
||||
if len(val_filelist) != 0:
|
||||
raise ValueError(
|
||||
"The `val_filelist` option is not available in `subdirs` mode"
|
||||
)
|
||||
raise ValueError("The `val_filelist` option is not available in `subdirs` mode")
|
||||
|
||||
train_filelist = make_pair(
|
||||
os.path.join(dataset_dir, "training/mixtures"),
|
||||
@ -91,9 +79,7 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
|
||||
perm = np.random.permutation(len(X))
|
||||
for i, idx in enumerate(tqdm(perm)):
|
||||
if np.random.uniform() < reduction_rate:
|
||||
y[idx] = spec_utils.reduce_vocal_aggressively(
|
||||
X[idx], y[idx], reduction_mask
|
||||
)
|
||||
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
|
||||
|
||||
if np.random.uniform() < 0.5:
|
||||
# swap channel
|
||||
@ -152,9 +138,7 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
|
||||
|
||||
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
|
||||
patch_list = []
|
||||
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
|
||||
cropsize, sr, hop_length, n_fft, offset
|
||||
)
|
||||
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset)
|
||||
os.makedirs(patch_dir, exist_ok=True)
|
||||
|
||||
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
|
||||
|

@ -63,9 +63,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -91,24 +89,14 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -63,9 +63,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -91,24 +89,14 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -63,9 +63,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -91,24 +89,14 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -63,9 +63,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -91,30 +89,16 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -63,9 +63,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -91,30 +89,16 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -63,9 +63,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -91,30 +89,16 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -40,9 +40,7 @@ class Encoder(nn.Module):

class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

@ -72,23 +70,15 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
self.conv3 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None

def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

@ -106,12 +96,8 @@ class LSTMModule(nn.Module):
def __init__(self, nin_conv, nin_lstm, nout_lstm):
super(LSTMModule, self).__init__()
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
self.lstm = nn.LSTM(
input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
)
self.dense = nn.Sequential(
nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
)
self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())

def forward(self, x):
N, _, nbins, nframes = x.size()

@ -1,5 +1,4 @@
import json
import os
import pathlib

default_param = {}

@ -48,9 +47,7 @@ class ModelParameters(object):
import zipfile

with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(
zip.read("param.json"), object_pairs_hook=int_keys
)
self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)

@ -65,5 +62,5 @@ class ModelParameters(object):
"stereo_n",
"reverse",
]:
if not k in self.param:
if k not in self.param:
self.param[k] = False

@ -3,8 +3,6 @@ import torch
import torch.nn.functional as F
from torch import nn

from . import spec_utils

class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):

@ -1,4 +1,3 @@
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

@ -1,4 +1,3 @@
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

@ -6,9 +6,7 @@ from . import layers_new

class BaseNet(nn.Module):
def __init__(
self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
):
def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
super(BaseNet, self).__init__()
self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)

@ -56,21 +54,15 @@ class CascadedNet(nn.Module):
layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
)

self.stg1_high_band_net = BaseNet(
2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
)
self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2)

self.stg2_low_band_net = nn.Sequential(
BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
)
self.stg2_high_band_net = BaseNet(
nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
)
self.stg2_high_band_net = BaseNet(nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2)

self.stg3_full_band_net = BaseNet(
3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
)
self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm)

self.out = nn.Conv2d(nout, 2, 1, bias=False)
self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

@ -27,9 +27,7 @@ def crop_center(h1, h2):
return h1

def wave_to_spectrogram(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))

@ -43,7 +41,7 @@ def wave_to_spectrogram(
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])

spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

spec = np.asfortranarray([spec_left, spec_right])

@ -51,9 +49,7 @@ def wave_to_spectrogram(
return spec

def wave_to_spectrogram_mt(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
import threading

if reverse:

@ -103,21 +99,13 @@ def combine_spectrograms(specs, mp):
raise ValueError("Too much bins")

# lowpass fiter
if (
mp.param["pre_filter_start"] > 0
): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if mp.param["pre_filter_start"] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(
spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
)
spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"])
else:
gp = 1
for b in range(
mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
):
g = math.pow(
10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
)
for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]):
g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0)
gp = g
spec_c[:, b, :] *= g

@ -189,9 +177,7 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
else:
e += fade_size

mag[:, :, s + fade_size : e - fade_size] += ref[
:, :, s + fade_size : e - fade_size
]
mag[:, :, s + fade_size : e - fade_size] += ref[:, :, s + fade_size : e - fade_size]
old_e = e

return mag

@ -207,9 +193,7 @@ def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

cache_dir = "mph{}".format(
hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
)
cache_dir = "mph{}".format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest())
mix_cache_dir = os.path.join("cache", cache_dir)
inst_cache_dir = os.path.join("cache", cache_dir)

@ -230,31 +214,27 @@ def cache_or_load(mix_path, inst_path, mp):

if d == len(mp.param["band"]): # high-end band
X_wave[d], _ = librosa.load(
mix_path,
sr = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"]
mix_path, sr=bp["sr"], mono=False, dtype=np.float32, res_type=bp["res_type"]
)
y_wave[d], _ = librosa.load(
inst_path,
sr = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"],
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
orig_sr = mp.param["band"][d + 1]["sr"],
target_sr = bp["sr"],
res_type = bp["res_type"],
orig_sr=mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
y_wave[d] = librosa.resample(
y_wave[d + 1],
orig_sr = mp.param["band"][d + 1]["sr"],
target_sr = bp["sr"],
res_type = bp["res_type"],
orig_sr=mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)

X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

@ -302,9 +282,7 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray(
[

@ -326,9 +304,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
global wave_left
wave_left = librosa.istft(**kwargs)

thread = threading.Thread(
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
)
thread = threading.Thread(target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length})
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()

@ -336,9 +312,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray(
[

@ -357,21 +331,15 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):

for d in range(1, bands_n + 1):
bp = mp.param["band"][d]
spec_s = np.ndarray(
shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
)
spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex)
h = bp["crop_stop"] - bp["crop_start"]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
:, offset : offset + h, :
]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :]

offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp["n_fft"] // 2
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
:, :extra_bins_h, :
]
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :]
if bp["hpf_start"] > 0:
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
if bands_n == 1:

@ -405,9 +373,9 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
mp.param["mid_side_b2"],
mp.param["reverse"],
),
orig_sr = bp["sr"],
target_sr = sr,
res_type = "sinc_fastest",
orig_sr=bp["sr"],
target_sr=sr,
res_type="sinc_fastest",
)
else: # mid
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)

@ -456,10 +424,7 @@ def mirroring(a, spec_m, input_high_end, mp):
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10,
:,
]
),

@ -467,19 +432,14 @@ def mirroring(a, spec_m, input_high_end, mp):
)
mirror = mirror * np.exp(1.0j * np.angle(input_high_end))

return np.where(
np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
)
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)

if "mirroring2" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10,
:,
]
),

@ -528,7 +488,6 @@ def istft(spec, hl):

if __name__ == "__main__":
import argparse
import sys
import time

import cv2

@ -573,10 +532,10 @@ if __name__ == "__main__":
if d == len(mp.param["band"]): # high-end band
wave[d], _ = librosa.load(
args.input[i],
sr = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"],
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)

if len(wave[d].shape) == 1: # mono to stereo

@ -584,9 +543,9 @@ if __name__ == "__main__":
else: # lower bands
wave[d] = librosa.resample(
wave[d + 1],
orig_sr = mp.param["band"][d + 1]["sr"],
target_sr = bp["sr"],
res_type = bp["res_type"],
orig_sr=mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)

spec[d] = wave_to_spectrogram(

@ -27,9 +27,7 @@ def inference(X_spec, device, model, aggressiveness, data):
data : dic configs
"""

def _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
):
def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True):
model.eval()
with torch.no_grad():
preds = []

@ -39,9 +37,7 @@ def inference(X_spec, device, model, aggressiveness, data):
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
start = i * roi_size
X_mag_window = X_mag_pad[
None, :, :, start : start + data["window_size"]
]
X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]]
X_mag_window = torch.from_numpy(X_mag_window)
if is_half:
X_mag_window = X_mag_window.half()

@ -76,9 +72,7 @@ def inference(X_spec, device, model, aggressiveness, data):
is_half = True
else:
is_half = False
pred = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half)
pred = pred[:, :, :n_frame]

if data["tta"]:

@ -88,9 +82,7 @@ def inference(X_spec, device, model, aggressiveness, data):

X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

pred_tta = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half)
pred_tta = pred_tta[:, :, roi_size // 2 :]
pred_tta = pred_tta[:, :, :n_frame]

@ -1,26 +1,22 @@
import logging
import os
import traceback

import gradio as gr
import logging

from tools.i18n.i18n import I18nAuto
from tools.my_utils import clean_path

i18n = I18nAuto()

logger = logging.getLogger(__name__)
import sys

import ffmpeg
import torch
import sys
from bsroformer import Roformer_Loader
from mdxnet import MDXNetDereverb
from vr import AudioPre, AudioPreDeEcho
from bsroformer import Roformer_Loader

try:
import gradio.analytics as analytics

analytics.version_check = lambda: None
except:
...

weight_uvr5_root = "tools/uvr5/uvr5_weights"
uvr5_names = []

@ -129,7 +125,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
yield "\n".join(infos)

with gr.Blocks(title="UVR5 WebUI") as app:
with gr.Blocks(title="UVR5 WebUI", analytics_enabled=False) as app:
gr.Markdown(
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ "<br>"

webui.py

@ -147,7 +147,9 @@ if torch.cuda.is_available() or ngpu != 0:
# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存

v3v4set={"v3","v4"}
v3v4set = {"v3", "v4"}

def set_default():
global \
default_batch_size, \

@ -382,7 +384,7 @@ def change_label(path_list):
if p_label is None:
check_for_existance([path_list])
path_list = my_utils.clean_path(path_list)
cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % (
cmd = '"%s" -s tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % (
python_exec,
path_list,
webui_port_subfix,

@ -411,7 +413,13 @@ process_name_uvr5 = i18n("人声分离WebUI")
def change_uvr5():
global p_uvr5
if p_uvr5 is None:
cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s' % (python_exec, infer_device, is_half, webui_port_uvr5, is_share)
cmd = '"%s" -s tools/uvr5/webui.py "%s" %s %s %s' % (
python_exec,
infer_device,
is_half,
webui_port_uvr5,
is_share,
)
yield (
process_info(process_name_uvr5, "opened"),
{"__type__": "update", "visible": False},

@ -435,9 +443,9 @@ process_name_tts = i18n("TTS推理WebUI")
def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled):
global p_tts_inference
if batched_infer_enabled:
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language)
cmd = '"%s" -s GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language)
else:
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language)
cmd = '"%s" -s GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language)
# #####v3暂不支持加速推理
# if version=="v3":
# cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)

@ -478,7 +486,7 @@ def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_
asr_inp_dir = my_utils.clean_path(asr_inp_dir)
asr_opt_dir = my_utils.clean_path(asr_opt_dir)
check_for_existance([asr_inp_dir])
cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
cmd = f'"{python_exec}" -s tools/asr/{asr_dict[asr_model]["path"]}'
cmd += f' -i "{asr_inp_dir}"'
cmd += f' -o "{asr_opt_dir}"'
cmd += f" -s {asr_model_size}"

@ -539,7 +547,7 @@ def open_denoise(denoise_inp_dir, denoise_opt_dir):
denoise_inp_dir = my_utils.clean_path(denoise_inp_dir)
denoise_opt_dir = my_utils.clean_path(denoise_opt_dir)
check_for_existance([denoise_inp_dir])
cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % (
cmd = '"%s" -s tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % (
python_exec,
denoise_inp_dir,
denoise_opt_dir,

@ -589,6 +597,7 @@ def close_denoise():
p_train_SoVITS = None
process_name_sovits = i18n("SoVITS训练")

def open1Ba(
batch_size,
total_epoch,

@ -635,13 +644,15 @@ def open1Ba(
with open(tmp_config_path, "w") as f:
f.write(json.dumps(data))
if version in ["v1", "v2"]:
cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path)
cmd = '"%s" -s GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path)
else:
cmd = '"%s" GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path)
cmd = '"%s" -s GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path)
yield (
process_info(process_name_sovits, "opened"),
{"__type__": "update", "visible": False},
{"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"}
{"__type__": "update", "visible": True},
{"__type__": "update"},
{"__type__": "update"},
)
print(cmd)
p_train_SoVITS = Popen(cmd, shell=True)

@ -651,13 +662,17 @@ def open1Ba(
yield (
process_info(process_name_sovits, "finish"),
{"__type__": "update", "visible": True},
{"__type__": "update", "visible": False},SoVITS_dropdown_update,GPT_dropdown_update
{"__type__": "update", "visible": False},
SoVITS_dropdown_update,
GPT_dropdown_update,
)
else:
yield (
process_info(process_name_sovits, "occupy"),
{"__type__": "update", "visible": False},
{"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"}
{"__type__": "update", "visible": True},
{"__type__": "update"},
{"__type__": "update"},
)

@ -722,11 +737,13 @@ def open1Bb(
with open(tmp_config_path, "w") as f:
f.write(yaml.dump(data, default_flow_style=False))
# cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path)
cmd = '"%s" -s GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path)
yield (
process_info(process_name_gpt, "opened"),
{"__type__": "update", "visible": False},
{"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"}
{"__type__": "update", "visible": True},
{"__type__": "update"},
{"__type__": "update"},
)
print(cmd)
p_train_GPT = Popen(cmd, shell=True)

@ -736,13 +753,17 @@ def open1Bb(
yield (
process_info(process_name_gpt, "finish"),
{"__type__": "update", "visible": True},
{"__type__": "update", "visible": False},SoVITS_dropdown_update,GPT_dropdown_update
{"__type__": "update", "visible": False},
SoVITS_dropdown_update,
GPT_dropdown_update,
)
else:
yield (
process_info(process_name_gpt, "occupy"),
{"__type__": "update", "visible": False},
{"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"}
{"__type__": "update", "visible": True},
{"__type__": "update"},
{"__type__": "update"},
)

@ -793,7 +814,7 @@ def open_slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max
return
if ps_slice == []:
for i_part in range(n_parts):
cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % (
cmd = '"%s" -s tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % (
python_exec,
inp,
opt_root,

@ -887,7 +908,7 @@ def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir):
}
)
os.environ.update(config)
cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1a.append(p)

@ -974,7 +995,7 @@ def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir):
}
)
os.environ.update(config)
cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1b.append(p)

@ -1045,7 +1066,7 @@ def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path):
}
)
os.environ.update(config)
cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1c.append(p)

@ -1143,7 +1164,7 @@ def open1abc(
}
)
os.environ.update(config)
cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1abc.append(p)

@ -1189,7 +1210,7 @@ def open1abc(
}
)
os.environ.update(config)
cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1abc.append(p)

@ -1229,7 +1250,7 @@ def open1abc(
}
)
os.environ.update(config)
cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec
print(cmd)
p = Popen(cmd, shell=True)
ps1abc.append(p)

@ -1291,6 +1312,7 @@ def close1abc():
{"__type__": "update", "visible": False},
)

def switch_version(version_):
os.environ["version"] = version_
global version

@ -1323,7 +1345,7 @@ def switch_version(version_):
if os.path.exists("GPT_SoVITS/text/G2PWModel"):
...
else:
cmd = '"%s" GPT_SoVITS/download.py' % python_exec
cmd = '"%s" -s GPT_SoVITS/download.py' % python_exec
p = Popen(cmd, shell=True)
p.wait()

@ -1332,7 +1354,7 @@ def sync(text):
return {"__type__": "update", "value": text}

with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
gr.Markdown(
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ "<br>"

@ -1397,7 +1419,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
value=process_info(process_name_slice, "close"), variant="primary", visible=False
)

gr.Markdown(value="0bb-" + i18n("语音降噪工具"))
gr.Markdown(value="0bb-" + i18n("语音降噪工具")+i18n("(非必需)"))
with gr.Row():
with gr.Column(scale=3):
with gr.Row():

@ -1492,7 +1514,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"])#, "v3"
version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"]) # , "v3"
with gr.Row():
pretrained_s2G = gr.Textbox(
label=i18n("预训练SoVITS-G模型路径"),

@ -1915,7 +1937,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
if_grad_ckpt,
lora_rank,
],
[info1Ba, button1Ba_open, button1Ba_close,SoVITS_dropdown,GPT_dropdown],
[info1Ba, button1Ba_open, button1Ba_close, SoVITS_dropdown, GPT_dropdown],
)
button1Bb_open.click(
open1Bb,

@ -1930,7 +1952,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gpu_numbers1Bb,
pretrained_s1,
],
[info1Bb, button1Bb_open, button1Bb_close,SoVITS_dropdown,GPT_dropdown],
[info1Bb, button1Bb_open, button1Bb_close, SoVITS_dropdown, GPT_dropdown],
)
version_checkbox.change(
switch_version,