diff --git a/.dockerignore b/.dockerignore index 4eca27be..bf36b884 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,8 +1,198 @@ -docs -logs -output -reference -SoVITS_weights -GPT_weights -TEMP +GPT_SoVITS/pretrained_models/* +tools/asr/models/* +tools/uvr5/uvr5_weights/* + .git +.DS_Store +.vscode +*.pyc +env +runtime +.idea +output +logs +SoVITS_weights*/ +GPT_weights*/ +TEMP +weight.json +ffmpeg* +ffprobe* +cfg.json +speakers.json +ref_audios + +# Byte-compiled / optimized / DLL files +__pycache__/ +**/__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc diff --git a/.github/build_windows_packages.ps1 b/.github/build_windows_packages.ps1 new file mode 100644 index 00000000..2e4acb2a --- /dev/null +++ b/.github/build_windows_packages.ps1 @@ -0,0 +1,194 @@ +$ErrorActionPreference = "Stop" + +Write-Host "Current location: $(Get-Location)" + +$cuda = $env:TORCH_CUDA +if (-not $cuda) { + Write-Error "Missing TORCH_CUDA env (cu124 or cu128)" + exit 1 +} + +$date = $env:DATE_SUFFIX +if ([string]::IsNullOrWhiteSpace($date)) { + $date = Get-Date -Format "MMdd" +} + +$pkgName = "GPT-SoVITS-$date" +$tmpDir = "tmp" +$srcDir = $PWD + +$suffix = $env:PKG_SUFFIX +if (-not [string]::IsNullOrWhiteSpace($suffix)) { + $pkgName = "$pkgName$suffix" +} + +$pkgName = "$pkgName-$cuda" + +$baseHF = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main" +$PRETRAINED_URL = "$baseHF/pretrained_models.zip" +$G2PW_URL = "$baseHF/G2PWModel.zip" +$UVR5_URL = "$baseHF/uvr5_weights.zip" +$NLTK_URL = "$baseHF/nltk_data.zip" +$JTALK_URL = "$baseHF/open_jtalk_dic_utf_8-1.11.tar.gz" + +$PYTHON_VERSION = "3.11.12" +$PY_RELEASE_VERSION = "20250409" + +Write-Host "[INFO] Cleaning .git..." +Remove-Item "$srcDir\.git" -Recurse -Force -ErrorAction SilentlyContinue + +Write-Host "[INFO] Creating tmp dir..." +New-Item -ItemType Directory -Force -Path $tmpDir + +Write-Host "[INFO] System Python version:" +python --version +python -m site + +Write-Host "[INFO] Downloading Python $PYTHON_VERSION..." +$zst = "$tmpDir\python.tar.zst" +Invoke-WebRequest "https://github.com/astral-sh/python-build-standalone/releases/download/$PY_RELEASE_VERSION/cpython-$PYTHON_VERSION+$PY_RELEASE_VERSION-x86_64-pc-windows-msvc-pgo-full.tar.zst" -OutFile $zst +& "C:\Program Files\7-Zip\7z.exe" e $zst -o"$tmpDir" -aoa +$tar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1 +& "C:\Program Files\7-Zip\7z.exe" x $tar.FullName -o"$tmpDir\extracted" -aoa +Move-Item "$tmpDir\extracted\python\install" "$srcDir\runtime" + +Write-Host "[INFO] Copying Redistributing Visual C++ Runtime..." +$vswhere = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" +$vsPath = & $vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath +$redistRoot = Join-Path $vsPath "VC\Redist\MSVC" +$targetVer = Get-ChildItem -Path $redistRoot -Directory | + Where-Object { $_.Name -match "^14\." 
} | + Sort-Object Name -Descending | + Select-Object -First 1 +$x64Path = Join-Path $targetVer.FullName "x64" +Get-ChildItem -Path $x64Path -Directory | Where-Object { + $_.Name -match '^Microsoft\..*\.(CRT|OpenMP)$' +} | ForEach-Object { + Get-ChildItem -Path $_.FullName -Filter "*.dll" | ForEach-Object { + Copy-Item -Path $_.FullName -Destination "$srcDir\runtime" -Force + } +} + +function DownloadAndUnzip($url, $targetRelPath) { + $filename = Split-Path $url -Leaf + $tmpZip = "$tmpDir\$filename" + Invoke-WebRequest $url -OutFile $tmpZip + Expand-Archive -Path $tmpZip -DestinationPath $tmpDir -Force + $subdirName = $filename -replace '\.zip$', '' + $sourcePath = Join-Path $tmpDir $subdirName + $destRoot = Join-Path $srcDir $targetRelPath + $destPath = Join-Path $destRoot $subdirName + if (Test-Path $destPath) { + Remove-Item $destPath -Recurse -Force + } + Move-Item $sourcePath $destRoot + Remove-Item $tmpZip +} + +Write-Host "[INFO] Download pretrained_models..." +DownloadAndUnzip $PRETRAINED_URL "GPT_SoVITS" + +Write-Host "[INFO] Download G2PWModel..." +DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text" + +Write-Host "[INFO] Download UVR5 model..." +DownloadAndUnzip $UVR5_URL "tools\uvr5" + +Write-Host "[INFO] Downloading funasr..." +$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip" +$funasrZip = "$tmpDir\funasr.zip" +Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip +Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force +Remove-Item $funasrZip + +Write-Host "[INFO] Download ffmpeg..." +$ffUrl = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip" +$ffZip = "$tmpDir\ffmpeg.zip" +Invoke-WebRequest -Uri $ffUrl -OutFile $ffZip +Expand-Archive $ffZip -DestinationPath $tmpDir -Force +$ffDir = Get-ChildItem -Directory "$tmpDir" | Where-Object { $_.Name -like "ffmpeg*" } | Select-Object -First 1 +Move-Item "$($ffDir.FullName)\bin\ffmpeg.exe" "$srcDir\runtime" +Move-Item "$($ffDir.FullName)\bin\ffprobe.exe" "$srcDir\runtime" +Remove-Item $ffZip +Remove-Item $ffDir.FullName -Recurse -Force + +Write-Host "[INFO] Installing PyTorch..." +& ".\runtime\python.exe" -m ensurepip +& ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location +switch ($cuda) { + "cu124" { + & ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location + } + "cu128" { + & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location + } + default { + Write-Error "Unsupported CUDA version: $cuda" + exit 1 + } +} + +Write-Host "[INFO] Installing dependencies..." +& ".\runtime\python.exe" -m pip install -r extra-req.txt --no-deps --no-warn-script-location +& ".\runtime\python.exe" -m pip install -r requirements.txt --no-warn-script-location + +Write-Host "[INFO] Downloading NLTK and pyopenjtalk dictionary..." 
+$PYTHON = ".\runtime\python.exe" +$prefix = & $PYTHON -c "import sys; print(sys.prefix)" +$jtalkPath = & $PYTHON -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))" +$nltkZip = "$tmpDir\nltk_data.zip" +$jtalkTar = "$tmpDir\open_jtalk_dic_utf_8-1.11.tar.gz" + +Invoke-WebRequest -Uri $NLTK_URL -OutFile $nltkZip +Expand-Archive -Path $nltkZip -DestinationPath $prefix -Force +Remove-Item $nltkZip + +Invoke-WebRequest -Uri $JTALK_URL -OutFile $jtalkTar +& "C:\Program Files\7-Zip\7z.exe" e $jtalkTar -o"$tmpDir" -aoa +$innerTar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1 +& "C:\Program Files\7-Zip\7z.exe" x $innerTar.FullName -o"$jtalkPath" -aoa +Remove-Item $jtalkTar +Remove-Item $innerTar.FullName + +Write-Host "[INFO] Preparing final directory $pkgName ..." +$items = @(Get-ChildItem -Filter "*.sh") + + @(Get-ChildItem -Filter "*.ipynb") + + @("$tmpDir", ".github", "Docker", "docs", ".gitignore", ".dockerignore", "README.md") +Remove-Item $items -Force -Recurse -ErrorAction SilentlyContinue +$curr = Get-Location +Set-Location ../ +Get-ChildItem . +Copy-Item -Path $curr -Destination $pkgName -Recurse +$7zPath = "$pkgName.7z" +$start = Get-Date +Write-Host "Compress Starting at $start" +& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1 +$end = Get-Date +Write-Host "Elapsed time: $($end - $start)" +Get-ChildItem . + +python -m pip install --upgrade pip +python -m pip install "modelscope" "huggingface_hub[hf_transfer]" --no-warn-script-location + +Write-Host "[INFO] Uploading to ModelScope..." +$msUser = $env:MODELSCOPE_USERNAME +$msToken = $env:MODELSCOPE_TOKEN +if (-not $msUser -or -not $msToken) { + Write-Error "Missing MODELSCOPE_USERNAME or MODELSCOPE_TOKEN" + exit 1 +} +modelscope upload "$msUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $msToken + +Write-Host "[SUCCESS] Uploaded: $7zPath to ModelScope" + +Write-Host "[INFO] Uploading to HuggingFace..." 
+$hfUser = $env:HUGGINGFACE_USERNAME +$hfToken = $env:HUGGINGFACE_TOKEN +if (-not $hfUser -or -not $hfToken) { + Write-Error "Missing HUGGINGFACE_USERNAME or HUGGINGFACE_TOKEN" + exit 1 +} +$env:HF_HUB_ENABLE_HF_TRANSFER = "1" +huggingface-cli upload "$hfUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $hfToken + +Write-Host "[SUCCESS] Uploaded: $7zPath to HuggingFace" diff --git a/.github/workflows/build_windows_packages.yaml b/.github/workflows/build_windows_packages.yaml new file mode 100644 index 00000000..32861463 --- /dev/null +++ b/.github/workflows/build_windows_packages.yaml @@ -0,0 +1,38 @@ +name: Build and Upload Windows Package + +on: + workflow_dispatch: + inputs: + date: + description: "Date suffix (optional)" + required: false + default: "" + suffix: + description: "Package name suffix (optional)" + required: false + default: "" + +jobs: + build: + runs-on: windows-latest + strategy: + matrix: + torch_cuda: [cu124, cu128] + env: + TORCH_CUDA: ${{ matrix.torch_cuda }} + MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }} + MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }} + HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }} + HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} + DATE_SUFFIX: ${{ github.event.inputs.date }} + PKG_SUFFIX: ${{ github.event.inputs.suffix }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Build and Upload Script + shell: pwsh + run: | + Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1 + ../build_windows_packages.ps1 \ No newline at end of file diff --git a/.github/workflows/docker-publish.yaml b/.github/workflows/docker-publish.yaml new file mode 100644 index 00000000..a00a0a77 --- /dev/null +++ b/.github/workflows/docker-publish.yaml @@ -0,0 +1,276 @@ +name: Build and Publish Docker Image + +on: + workflow_dispatch: + +jobs: + generate-meta: + runs-on: ubuntu-22.04 + outputs: + tag: ${{ steps.meta.outputs.tag }} + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Generate Tag + id: meta + run: | + DATE=$(date +'%Y%m%d') + COMMIT=$(git rev-parse --short=6 HEAD) + echo "tag=${DATE}-${COMMIT}" >> $GITHUB_OUTPUT + build-amd64: + needs: generate-meta + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - cuda_version: 12.6 + lite: true + torch_base: lite + tag_prefix: cu126-lite + - cuda_version: 12.6 + lite: false + torch_base: full + tag_prefix: cu126 + - cuda_version: 12.8 + lite: true + torch_base: lite + tag_prefix: cu128-lite + - cuda_version: 12.8 + lite: false + torch_base: full + tag_prefix: cu128 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Free up disk space + run: | + echo "Before cleanup:" + df -h + + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + sudo rm -rf /opt/hostedtoolcache/PyPy + sudo rm -rf /opt/hostedtoolcache/go + sudo rm -rf /opt/hostedtoolcache/node + sudo rm -rf /opt/hostedtoolcache/Ruby + sudo rm -rf /opt/microsoft + sudo rm -rf /opt/pipx + sudo rm -rf /opt/az + sudo rm -rf /opt/google + + + sudo rm -rf /usr/lib/jvm + sudo rm -rf /usr/lib/google-cloud-sdk + sudo rm -rf /usr/lib/dotnet + + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/.ghcup + sudo rm -rf /usr/local/julia1.11.5 + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/local/share/chromium + + sudo rm -rf /usr/share/swift + sudo rm -rf /usr/share/miniconda + sudo rm -rf /usr/share/az_12.1.0 + sudo rm -rf /usr/share/dotnet + + echo "After cleanup:" + df -h + + - name: Set up Docker 
Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Build and Push Docker Image (amd64) + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile + push: true + platforms: linux/amd64 + build-args: | + LITE=${{ matrix.lite }} + TORCH_BASE=${{ matrix.torch_base }} + CUDA_VERSION=${{ matrix.cuda_version }} + WORKFLOW=true + tags: | + xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-amd64 + xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-amd64 + + build-arm64: + needs: generate-meta + runs-on: ubuntu-22.04-arm + strategy: + matrix: + include: + - cuda_version: 12.6 + lite: true + torch_base: lite + tag_prefix: cu126-lite + - cuda_version: 12.6 + lite: false + torch_base: full + tag_prefix: cu126 + - cuda_version: 12.8 + lite: true + torch_base: lite + tag_prefix: cu128-lite + - cuda_version: 12.8 + lite: false + torch_base: full + tag_prefix: cu128 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Free up disk space + run: | + echo "Before cleanup:" + df -h + + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + sudo rm -rf /opt/hostedtoolcache/PyPy + sudo rm -rf /opt/hostedtoolcache/go + sudo rm -rf /opt/hostedtoolcache/node + sudo rm -rf /opt/hostedtoolcache/Ruby + sudo rm -rf /opt/microsoft + sudo rm -rf /opt/pipx + sudo rm -rf /opt/az + sudo rm -rf /opt/google + + + sudo rm -rf /usr/lib/jvm + sudo rm -rf /usr/lib/google-cloud-sdk + sudo rm -rf /usr/lib/dotnet + + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/.ghcup + sudo rm -rf /usr/local/julia1.11.5 + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/local/share/chromium + + sudo rm -rf /usr/share/swift + sudo rm -rf /usr/share/miniconda + sudo rm -rf /usr/share/az_12.1.0 + sudo rm -rf /usr/share/dotnet + + echo "After cleanup:" + df -h + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Build and Push Docker Image (arm64) + uses: docker/build-push-action@v5 + with: + context: . 
+ file: ./Dockerfile + push: true + platforms: linux/arm64 + build-args: | + LITE=${{ matrix.lite }} + TORCH_BASE=${{ matrix.torch_base }} + CUDA_VERSION=${{ matrix.cuda_version }} + WORKFLOW=true + tags: | + xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-arm64 + xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-arm64 + + + merge-and-clean: + needs: + - build-amd64 + - build-arm64 + - generate-meta + runs-on: ubuntu-latest + strategy: + matrix: + include: + - tag_prefix: cu126-lite + - tag_prefix: cu126 + - tag_prefix: cu128-lite + - tag_prefix: cu128 + + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Merge amd64 and arm64 into multi-arch image + run: | + DATE_TAG=${{ needs.generate-meta.outputs.tag }} + TAG_PREFIX=${{ matrix.tag_prefix }} + + docker buildx imagetools create \ + --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG} \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-amd64 \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-arm64 + + docker buildx imagetools create \ + --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX} \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-amd64 \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-arm64 + - name: Delete old platform-specific tags via Docker Hub API + env: + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_PASSWORD }} + TAG_PREFIX: ${{ matrix.tag_prefix }} + DATE_TAG: ${{ needs.generate-meta.outputs.tag }} + run: | + sudo apt-get update && sudo apt-get install -y jq + + TOKEN=$(curl -s -u $DOCKER_HUB_USERNAME:$DOCKER_HUB_TOKEN \ + "https://auth.docker.io/token?service=registry.docker.io&scope=repository:$DOCKER_HUB_USERNAME/gpt-sovits:pull,push,delete" \ + | jq -r .token) + + for PLATFORM in amd64 arm64; do + SAFE_PLATFORM=$(echo $PLATFORM | sed 's/\//-/g') + TAG="${TAG_PREFIX}-${DATE_TAG}-${SAFE_PLATFORM}" + LATEST_TAG="latest-${TAG_PREFIX}-${SAFE_PLATFORM}" + + for DEL_TAG in "$TAG" "$LATEST_TAG"; do + echo "Deleting tag: $DEL_TAG" + curl -X DELETE -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/$DOCKER_HUB_USERNAME/gpt-sovits/manifests/$DEL_TAG + done + done + create-default: + runs-on: ubuntu-latest + needs: + - merge-and-clean + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Create Default Tag + run: | + docker buildx imagetools create \ + --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-cu126-lite + \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0bb4e0bf..d280e459 100644 --- a/.gitignore +++ b/.gitignore @@ -7,13 +7,8 @@ runtime .idea output logs -reference -GPT_weights -SoVITS_weights -GPT_weights_v2 -SoVITS_weights_v2 -GPT_weights_v3 -SoVITS_weights_v3 +SoVITS_weights*/ +GPT_weights*/ TEMP weight.json ffmpeg* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..2434e74e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +ci: + 
autoupdate_schedule: monthly + +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.7 + hooks: + # Run the linter. + - id: ruff + types_or: [ python, pyi ] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + types_or: [ python, pyi ] + args: [ --line-length, "120", --target-version, "py310" ] diff --git a/Colab-Inference.ipynb b/Colab-Inference.ipynb new file mode 100644 index 00000000..b962c9b3 --- /dev/null +++ b/Colab-Inference.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPT-SoVITS Infer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Env Setup (Run Once Only)\n", + "## 环境配置, 只需运行一次" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e9b7iFV3dm1f" + }, + "outputs": [], + "source": [ + "%%writefile /content/setup.sh\n", + "set -e\n", + "\n", + "cd /content\n", + "\n", + "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", + "\n", + "cd GPT-SoVITS\n", + "\n", + "mkdir -p GPT_weights\n", + "\n", + "mkdir -p SoVITS_weights\n", + "\n", + "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", + " :\n", + "else\n", + " conda create -n GPTSoVITS python=3.10 -y\n", + "fi\n", + "\n", + "source activate GPTSoVITS\n", + "\n", + "pip install ipykernel\n", + "\n", + "bash install.sh --device CU126 --source HF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "0NgxXg5sjv7z" + }, + "outputs": [], + "source": [ + "%pip install -q condacolab\n", + "import condacolab\n", + "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n", + "!cd /content && bash setup.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download From HuggingFace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "vbZY-LnM0tzq" + }, + "outputs": [], + "source": [ + "# Modify These\n", + "USER_ID = \"AkitoP\"\n", + "REPO_NAME = \"GPT-SoVITS-v2-aegi\"\n", + "BRANCH = \"main\"\n", + "GPT_PATH = \"new_aegigoe-e100.ckpt\"\n", + "SOVITS_PATH = \"new_aegigoe_e60_s32220.pth\"\n", + "\n", + "# Do Not Modify\n", + "HF_BASE = \"https://huggingface.co\"\n", + "REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n", + "GPT_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{GPT_PATH}\"\n", + "SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{SOVITS_PATH}\"\n", + "\n", + "!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n", + "!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download From ModelScope" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Modify These\n", + "USER_ID = \"aihobbyist\"\n", + "REPO_NAME = \"GPT-SoVits-V2-models\"\n", + "BRANCH = \"master\"\n", + "GPT_PATH = \"Genshin_Impact/EN/GPT_GenshinImpact_EN_5.1.ckpt\"\n", + "SOVITS_PATH = \"Wuthering_Waves/CN/SV_WutheringWaves_CN_1.3.pth\"\n", + "\n", + "# Do Not Modify\n", + "HF_BASE = 
\"https://www.modelscope.cn/models\"\n", + "REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n", + "GPT_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{GPT_PATH}\"\n", + "SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{SOVITS_PATH}\"\n", + "\n", + "!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n", + "!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Launch WebUI\n", + "# 启动 WebUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "4oRGUzkrk8C7" + }, + "outputs": [], + "source": [ + "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Colab-WebUI.ipynb b/Colab-WebUI.ipynb new file mode 100644 index 00000000..b1403f33 --- /dev/null +++ b/Colab-WebUI.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPT-SoVITS WebUI" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_o6a8GS2lWQM" + }, + "source": [ + "## Env Setup (Run Once Only)\n", + "## 环境配置, 只需运行一次" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile /content/setup.sh\n", + "set -e\n", + "\n", + "cd /content\n", + "\n", + "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", + "\n", + "cd GPT-SoVITS\n", + "\n", + "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", + " :\n", + "else\n", + " conda create -n GPTSoVITS python=3.10 -y\n", + "fi\n", + "\n", + "source activate GPTSoVITS\n", + "\n", + "pip install ipykernel\n", + "\n", + "bash install.sh --device CU126 --source HF --download-uvr5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q condacolab\n", + "import condacolab\n", + "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n", + "!cd /content && bash setup.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch WebUI\n", + "## 启动 WebUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4oRGUzkrk8C7" + }, + "outputs": [], + "source": [ + "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Docker/damo.sha256 b/Docker/damo.sha256 deleted file mode 100644 index 6e9804da..00000000 --- a/Docker/damo.sha256 +++ /dev/null @@ -1,3 +0,0 @@ -5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb -b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb -a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb \ No newline at end of file diff --git a/Docker/download.py b/Docker/download.py deleted file mode 100644 index 234fd069..00000000 --- a/Docker/download.py +++ /dev/null @@ -1,5 +0,0 @@ -# Download moda ASR related models -from modelscope import snapshot_download -model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4") -model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4") -model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4") diff --git a/Docker/download.sh b/Docker/download.sh deleted file mode 100644 index 447e018e..00000000 --- a/Docker/download.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -set -Eeuo pipefail - -echo "Downloading models..." - -aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue - -echo "Checking SHA256..." - -parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c" diff --git a/Docker/install_wrapper.sh b/Docker/install_wrapper.sh new file mode 100644 index 00000000..6dd93e5a --- /dev/null +++ b/Docker/install_wrapper.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +cd .. 
|| exit 1 + +set -e + +source "$HOME/miniconda3/etc/profile.d/conda.sh" + +mkdir -p GPT_SoVITS + +mkdir -p GPT_SoVITS/text + +ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + +ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + +bash install.sh --device "CU${CUDA_VERSION//./}" --source HF + +pip cache purge + +pip show torch + +rm -rf /tmp/* /var/tmp/* + +rm -rf "$HOME/miniconda3/pkgs" + +mkdir -p "$HOME/miniconda3/pkgs" + +rm -rf /root/.conda /root/.cache diff --git a/Docker/links.sha256 b/Docker/links.sha256 deleted file mode 100644 index cda6dc15..00000000 --- a/Docker/links.sha256 +++ /dev/null @@ -1,12 +0,0 @@ -b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt -fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth -020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth -24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin -e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin -39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth -45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth -5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth -8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth -01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth -56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth -233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx \ No newline at end of file diff --git a/Docker/links.txt b/Docker/links.txt deleted file mode 100644 index e6603db0..00000000 --- a/Docker/links.txt +++ /dev/null @@ -1,34 +0,0 @@ -# GPT-SoVITS models -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt - out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth - out=GPT_SoVITS/pretrained_models/s2D488k.pth -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth - out=GPT_SoVITS/pretrained_models/s2G488k.pth -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json - out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json - out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin - out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json - out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json 
-https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin - out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json - out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json -# UVR5 -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth - out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth - out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth - out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth - out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth - out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth - out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx - out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx \ No newline at end of file diff --git a/Docker/miniconda_install.sh b/Docker/miniconda_install.sh new file mode 100644 index 00000000..001a2a46 --- /dev/null +++ b/Docker/miniconda_install.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +cd .. || exit 1 + +if [ -d "$HOME/miniconda3" ]; then + exit 0 +fi + +WORKFLOW=${WORKFLOW:-"false"} +TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"} + +if [ "$WORKFLOW" = "true" ]; then + WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +else + WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +fi + +if [ "$TARGETPLATFORM" = "linux/amd64" ]; then + "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh +elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then + "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh +else + exit 1 +fi + +LOG_PATH="/tmp/miniconda-install.log" + +bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1 + +if [ $? 
-eq 0 ]; then + echo "== Miniconda Installed ==" +else + echo "Failed to Install miniconda" + tail -n 50 "$LOG_PATH" + exit 1 +fi + +rm miniconda.sh + +source "$HOME/miniconda3/etc/profile.d/conda.sh" + +"$HOME/miniconda3/bin/conda" config --add channels conda-forge + +"$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null + +"$HOME/miniconda3/bin/conda" install python=3.11 -q -y + +"$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y + +if [ "$CUDA_VERSION" = "12.8" ]; then + "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128 +elif [ "$CUDA_VERSION" = "12.6" ]; then + "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126 +fi + +"$HOME/miniconda3/bin/pip" cache purge + +rm $LOG_PATH + +rm -rf "$HOME/miniconda3/pkgs" + +mkdir -p "$HOME/miniconda3/pkgs" + +rm -rf "$HOME/.conda" "$HOME/.cache" diff --git a/Dockerfile b/Dockerfile index 80cd9f3a..71bf6fa1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,42 +1,62 @@ -# Base CUDA image -FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 +ARG CUDA_VERSION=12.6 +ARG TORCH_BASE=full -LABEL maintainer="breakstring@hotmail.com" -LABEL version="dev-20240209" +FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE} + +LABEL maintainer="XXXXRT" +LABEL version="V4" LABEL description="Docker image for GPT-SoVITS" +ARG CUDA_VERSION=12.6 -# Install 3rd party apps -ENV DEBIAN_FRONTEND=noninteractive -ENV TZ=Etc/UTC -RUN apt-get update && \ - apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ - git lfs install && \ - rm -rf /var/lib/apt/lists/* +ENV CUDA_VERSION=${CUDA_VERSION} -# Copy only requirements.txt initially to leverage Docker cache -WORKDIR /workspace -COPY requirements.txt /workspace/ -RUN pip install --no-cache-dir -r requirements.txt +SHELL ["/bin/bash", "-c"] -# Define a build-time argument for image type -ARG IMAGE_TYPE=full +WORKDIR /workspace/GPT-SoVITS -# Conditional logic based on the IMAGE_TYPE argument -# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite" -COPY ./Docker /workspace/Docker -# elite 类型的镜像里面不包含额外的模型 -RUN if [ "$IMAGE_TYPE" != "elite" ]; then \ - chmod +x /workspace/Docker/download.sh && \ - /workspace/Docker/download.sh && \ - python /workspace/Docker/download.py && \ - python -m nltk.downloader averaged_perceptron_tagger cmudict; \ - fi +COPY Docker /workspace/GPT-SoVITS/Docker/ +ARG LITE=false +ENV LITE=${LITE} -# Copy the rest of the application -COPY . /workspace +ARG WORKFLOW=false +ENV WORKFLOW=${WORKFLOW} + +ARG TARGETPLATFORM +ENV TARGETPLATFORM=${TARGETPLATFORM} + +RUN bash Docker/miniconda_install.sh + +COPY extra-req.txt /workspace/GPT-SoVITS/ + +COPY requirements.txt /workspace/GPT-SoVITS/ + +COPY install.sh /workspace/GPT-SoVITS/ + +RUN bash Docker/install_wrapper.sh EXPOSE 9871 9872 9873 9874 9880 -CMD ["python", "webui.py"] +ENV PYTHONPATH="/workspace/GPT-SoVITS" + +RUN conda init bash && echo "conda activate base" >> ~/.bashrc + +WORKDIR /workspace + +RUN rm -rf /workspace/GPT-SoVITS + +WORKDIR /workspace/GPT-SoVITS + +COPY . 
/workspace/GPT-SoVITS + +CMD ["/bin/bash", "-c", "\ + rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ + rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ + rm -rf /workspace/GPT-SoVITS/tools/asr/models && \ + rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ + ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ + ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ + ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \ + ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ + exec bash"] \ No newline at end of file diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py index 45f91d8e..d8457334 100644 --- a/GPT_SoVITS/AR/data/bucket_sampler.py +++ b/GPT_SoVITS/AR/data/bucket_sampler.py @@ -4,14 +4,11 @@ import itertools import math import random from random import shuffle -from typing import Iterator -from typing import Optional -from typing import TypeVar +from typing import Iterator, Optional, TypeVar import torch import torch.distributed as dist -from torch.utils.data import Dataset -from torch.utils.data import Sampler +from torch.utils.data import Dataset, Sampler __all__ = [ "DistributedBucketSampler", @@ -50,10 +47,7 @@ class DistributedBucketSampler(Sampler[T_co]): if torch.cuda.is_available(): torch.cuda.set_device(rank) if rank >= num_replicas or rank < 0: - raise ValueError( - "Invalid rank {}, rank should be in the interval" - " [0, {}]".format(rank, num_replicas - 1) - ) + raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1)) self.dataset = dataset self.num_replicas = num_replicas self.rank = rank @@ -61,19 +55,16 @@ class DistributedBucketSampler(Sampler[T_co]): self.drop_last = drop_last # If the dataset length is evenly divisible by # of replicas, then there # is no need to drop any data, since the dataset will be split equally. - if ( - self.drop_last and len(self.dataset) % self.num_replicas != 0 - ): # type: ignore[arg-type] + if self.drop_last and len(self.dataset) % self.num_replicas != 0: # type: ignore[arg-type] # Split to nearest available length that is evenly divisible. # This is to ensure each rank receives the same amount of data when # using this Sampler. 
self.num_samples = math.ceil( - (len(self.dataset) - self.num_replicas) - / self.num_replicas # type: ignore[arg-type] + (len(self.dataset) - self.num_replicas) / self.num_replicas, # type: ignore[arg-type] ) else: self.num_samples = math.ceil( - len(self.dataset) / self.num_replicas + len(self.dataset) / self.num_replicas, ) # type: ignore[arg-type] self.total_size = self.num_samples * self.num_replicas self.shuffle = shuffle @@ -118,10 +109,7 @@ class DistributedBucketSampler(Sampler[T_co]): grouped_batch_size = self.batch_size * self.num_replicas shuffled_bucket = list(itertools.chain(*shuffled_bucket)) n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) - batches = [ - shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] - for b in range(n_batch) - ] + batches = [shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] for b in range(n_batch)] shuffle(batches) indices = list(itertools.chain(*batches)) else: @@ -134,9 +122,7 @@ class DistributedBucketSampler(Sampler[T_co]): if padding_size <= len(indices): indices += indices[:padding_size] else: - indices += (indices * math.ceil(padding_size / len(indices)))[ - :padding_size - ] + indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size] else: # remove tail of data to make it evenly divisible. indices = indices[: self.total_size] diff --git a/GPT_SoVITS/AR/data/data_module.py b/GPT_SoVITS/AR/data/data_module.py index cb947959..f360503b 100644 --- a/GPT_SoVITS/AR/data/data_module.py +++ b/GPT_SoVITS/AR/data/data_module.py @@ -1,9 +1,10 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py # reference: https://github.com/lifeiteng/vall-e from pytorch_lightning import LightningDataModule +from torch.utils.data import DataLoader + from AR.data.bucket_sampler import DistributedBucketSampler from AR.data.dataset import Text2SemanticDataset -from torch.utils.data import DataLoader class Text2SemanticDataModule(LightningDataModule): @@ -42,8 +43,12 @@ class Text2SemanticDataModule(LightningDataModule): # pad_val=self.config['data']['pad_val']) def train_dataloader(self): - batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] - batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 + batch_size = ( + self.config["train"]["batch_size"] // 2 + if self.config["train"].get("if_dpo", False) is True + else self.config["train"]["batch_size"] + ) + batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # 防止不保存 sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) return DataLoader( self._train_dataset, diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py index 9d2dfe8a..402483d9 100644 --- a/GPT_SoVITS/AR/data/dataset.py +++ b/GPT_SoVITS/AR/data/dataset.py @@ -1,21 +1,17 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py # reference: https://github.com/lifeiteng/vall-e -import pdb -import sys # sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert") -import traceback, os -from typing import Dict -from typing import List +import os +import traceback +from typing import Dict, List import numpy as np import pandas as pd -import torch, json -from torch.utils.data import DataLoader -from torch.utils.data import Dataset -from transformers import AutoTokenizer +import torch +from torch.utils.data import 
DataLoader, Dataset -version = os.environ.get('version',None) +version = os.environ.get("version", None) from text import cleaned_text_to_sequence @@ -34,9 +30,7 @@ def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0 padded_sequences = [] for seq, length in zip(sequences, seq_lengths): - padding = ( - [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) - ) + padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value) padded_sequences.append(padded_seq) batch = np.stack(padded_sequences) @@ -61,12 +55,16 @@ class Text2SemanticDataset(Dataset): super().__init__() self.semantic_data = pd.read_csv( - semantic_path, delimiter="\t", encoding="utf-8" + semantic_path, + delimiter="\t", + encoding="utf-8", ) # get dict self.path2 = phoneme_path # "%s/2-name2text.txt"%exp_dir#phoneme_path self.path3 = "%s/3-bert" % ( - os.path.dirname(phoneme_path) + os.path.dirname( + phoneme_path, + ) ) # "%s/3-bert"%exp_dir#bert_dir self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path assert os.path.exists(self.path2) @@ -127,7 +125,7 @@ class Text2SemanticDataset(Dataset): for i in range(semantic_data_len): # 先依次遍历 # get str - item_name = self.semantic_data.iloc[i,0] + item_name = self.semantic_data.iloc[i, 0] # print(self.phoneme_data) try: phoneme, word2ph, text = self.phoneme_data[item_name] @@ -137,7 +135,7 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue - semantic_str = self.semantic_data.iloc[i,1] + semantic_str = self.semantic_data.iloc[i, 1] # get token list semantic_ids = [int(idx) for idx in semantic_str.split(" ")] # (T), 是否需要变成 (1, T) -> 不需要,因为需要求 len @@ -158,9 +156,7 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue # if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行 - if ( - len(phoneme_ids) > self.max_sec * self.hz / 2.5 - ): ###########2:改为恒定限制为semantic/2.5就行 + if len(phoneme_ids) > self.max_sec * self.hz / 2.5: ###########2:改为恒定限制为semantic/2.5就行 num_deleted_ps += 1 continue # if len(semantic_ids) > 1000:###########3 @@ -169,9 +165,7 @@ class Text2SemanticDataset(Dataset): ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz) - if ( - ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio - ): ##########4#3~25#每秒多少个phone + if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio: ##########4#3~25#每秒多少个phone num_deleted_ps += 1 # print(item_name) continue @@ -194,12 +188,12 @@ class Text2SemanticDataset(Dataset): print(f"there are {num_not_in} semantic datas not in phoneme datas") if num_deleted_bigger > 0: print( - f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds" + f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds", ) if num_deleted_ps > 0: # 4702 for LibriTTS, LirbriTTS 是标注数据, 是否需要筛?=> 需要,有值为 100 的极端值 print( - f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}" + f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}", ) """ there are 31 semantic datas not in phoneme datas @@ -306,7 +300,10 @@ if __name__ == "__main__": batch_size = 12 dataloader = DataLoader( - dataset, batch_size=batch_size, collate_fn=dataset.collate, shuffle=False + dataset, + batch_size=batch_size, + collate_fn=dataset.collate, + shuffle=False, ) for i, batch in 
enumerate(dataloader): if i % 1000 == 0: diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 2dd3f392..0696c35f 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -1,6 +1,7 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py # reference: https://github.com/lifeiteng/vall-e -import os, sys +import os +import sys now_dir = os.getcwd() sys.path.append(now_dir) @@ -8,10 +9,12 @@ from typing import Dict import torch from pytorch_lightning import LightningModule + from AR.models.t2s_model import Text2SemanticDecoder from AR.modules.lr_schedulers import WarmupCosineLRSchedule from AR.modules.optim import ScaledAdam + class Text2SemanticLightningModule(LightningModule): def __init__(self, config, output_dir, is_train=True): super().__init__() @@ -23,7 +26,10 @@ class Text2SemanticLightningModule(LightningModule): # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) print( self.load_state_dict( - torch.load(pretrained_s1, map_location="cpu")["weight"] + torch.load( + pretrained_s1, + map_location="cpu", + )["weight"], ) ) if is_train: @@ -35,7 +41,7 @@ class Text2SemanticLightningModule(LightningModule): def training_step(self, batch: Dict, batch_idx: int): opt = self.optimizers() scheduler = self.lr_schedulers() - forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old + forward = self.model.forward if self.config["train"].get("if_dpo", False) == True else self.model.forward_old loss, acc = forward( batch["phoneme_ids"], batch["phoneme_ids_len"], @@ -113,9 +119,7 @@ class Text2SemanticLightningModule(LightningModule): def configure_optimizers(self): model_parameters = self.model.parameters() parameters_names = [] - parameters_names.append( - [name_param_pair[0] for name_param_pair in self.model.named_parameters()] - ) + parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) lm_opt = ScaledAdam( model_parameters, lr=0.01, diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py index 487edb01..b0ab59c4 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py @@ -1,6 +1,7 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py # reference: https://github.com/lifeiteng/vall-e -import os, sys +import os +import sys now_dir = os.getcwd() sys.path.append(now_dir) @@ -8,6 +9,7 @@ from typing import Dict import torch from pytorch_lightning import LightningModule + from AR.models.t2s_model_onnx import Text2SemanticDecoder from AR.modules.lr_schedulers import WarmupCosineLRSchedule from AR.modules.optim import ScaledAdam @@ -24,8 +26,11 @@ class Text2SemanticLightningModule(LightningModule): # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) print( self.load_state_dict( - torch.load(pretrained_s1, map_location="cpu")["weight"] - ) + torch.load( + pretrained_s1, + map_location="cpu", + )["weight"], + ), ) if is_train: self.automatic_optimization = False @@ -79,9 +84,7 @@ class Text2SemanticLightningModule(LightningModule): def configure_optimizers(self): model_parameters = self.model.parameters() parameters_names = [] - parameters_names.append( - [name_param_pair[0] for 
name_param_pair in self.model.named_parameters()] - ) + parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) lm_opt = ScaledAdam( model_parameters, lr=0.01, diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index 8a32d0dd..4725b7a3 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -2,27 +2,24 @@ # reference: https://github.com/lifeiteng/vall-e import math from typing import List, Optional -import torch -from tqdm import tqdm -from AR.models.utils import make_pad_mask, make_pad_mask_left -from AR.models.utils import ( - topk_sampling, - sample, - logits_to_probs, - multinomial_sample_one_no_sync, - dpo_loss, - make_reject_y, - get_batch_logps -) -from AR.modules.embedding import SinePositionalEmbedding -from AR.modules.embedding import TokenEmbedding -from AR.modules.transformer import LayerNorm -from AR.modules.transformer import TransformerEncoder -from AR.modules.transformer import TransformerEncoderLayer +import torch from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy +from tqdm import tqdm + +from AR.models.utils import ( + dpo_loss, + get_batch_logps, + make_pad_mask, + make_pad_mask_left, + make_reject_y, + sample, + topk_sampling, +) +from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding +from AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer default_config = { "embedding_dim": 512, @@ -36,10 +33,17 @@ default_config = { "EOS": 1024, } + # @torch.jit.script ## 使用的话首次推理会非常慢,而且推理速度不稳定 # Efficient implementation equivalent to the following: -def scaled_dot_product_attention(query:torch.Tensor, key:torch.Tensor, value:torch.Tensor, attn_mask:Optional[torch.Tensor]=None, scale:Optional[torch.Tensor]=None) -> torch.Tensor: - B, H, L, S =query.size(0), query.size(1), query.size(-2), key.size(-2) +def scaled_dot_product_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + B, H, L, S = query.size(0), query.size(1), query.size(-2), key.size(-2) if scale is None: scale_factor = torch.tensor(1 / math.sqrt(query.size(-1))) else: @@ -59,12 +63,13 @@ def scaled_dot_product_attention(query:torch.Tensor, key:torch.Tensor, value:tor if attn_mask.dtype == torch.bool: attn_weight.masked_fill_(attn_mask, 0) else: - attn_mask[attn_mask!=float("-inf")] =0 - attn_mask[attn_mask==float("-inf")] =1 + attn_mask[attn_mask != float("-inf")] = 0 + attn_mask[attn_mask == float("-inf")] = 1 attn_weight.masked_fill_(attn_mask, 0) return attn_weight @ value + @torch.jit.script class T2SMLP: def __init__(self, w1, b1, w2, b2): @@ -82,20 +87,20 @@ class T2SMLP: @torch.jit.script class T2SBlock: def __init__( - self, - num_heads, - hidden_dim: int, - mlp: T2SMLP, - qkv_w, - qkv_b, - out_w, - out_b, - norm_w1, - norm_b1, - norm_eps1, - norm_w2, - norm_b2, - norm_eps2, + self, + num_heads, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1, + norm_w2, + norm_b2, + norm_eps2, ): self.num_heads = num_heads self.mlp = mlp @@ -114,24 +119,32 @@ class T2SBlock: self.false = torch.tensor(False, dtype=torch.bool) @torch.jit.ignore - def to_mask(self, x:torch.Tensor, padding_mask:Optional[torch.Tensor]): + def to_mask( + self, + x: torch.Tensor, + padding_mask: Optional[torch.Tensor], + ): if padding_mask is None: 
return x - + if padding_mask.dtype == torch.bool: return x.masked_fill(padding_mask, 0) else: return x * padding_mask - - def process_prompt(self, x:torch.Tensor, attn_mask : torch.Tensor, padding_mask:Optional[torch.Tensor]=None, torch_sdpa:bool=True): - + def process_prompt( + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + torch_sdpa: bool = True, + ): q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1) batch_size = q.shape[0] q_len = q.shape[1] kv_len = k.shape[1] - + q = self.to_mask(q, padding_mask) k_cache = self.to_mask(k, padding_mask) v_cache = self.to_mask(v, padding_mask) @@ -149,9 +162,7 @@ class T2SBlock: attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b) x = x + attn - x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x = x + self.mlp.forward(x) x = F.layer_norm( x, @@ -161,13 +172,20 @@ class T2SBlock: self.norm_eps2, ) return x, k_cache, v_cache - - def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor, attn_mask:torch.Tensor=None, torch_sdpa:bool=True): + + def decode_next_token( + self, + x: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_mask: torch.Tensor = None, + torch_sdpa: bool = True, + ): q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) k_cache = torch.cat([k_cache, k], dim=1) v_cache = torch.cat([v_cache, v], dim=1) - + batch_size = q.shape[0] q_len = q.shape[1] kv_len = k_cache.shape[1] @@ -176,7 +194,6 @@ class T2SBlock: k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) - if torch_sdpa: attn = F.scaled_dot_product_attention(q, k, v, (~attn_mask) if attn_mask is not None else None) else: @@ -187,7 +204,11 @@ class T2SBlock: x = x + attn x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 + x, + [self.hidden_dim], + self.norm_w1, + self.norm_b1, + self.norm_eps1, ) x = x + self.mlp.forward(x) x = F.layer_norm( @@ -202,17 +223,19 @@ class T2SBlock: @torch.jit.script class T2STransformer: - def __init__(self, num_blocks : int, blocks: List[T2SBlock]): - self.num_blocks : int = num_blocks + def __init__(self, num_blocks: int, blocks: List[T2SBlock]): + self.num_blocks: int = num_blocks self.blocks = blocks def process_prompt( - self, x:torch.Tensor, attn_mask : torch.Tensor, - padding_mask : Optional[torch.Tensor]=None, - torch_sdpa:bool=True - ): - k_cache : List[torch.Tensor] = [] - v_cache : List[torch.Tensor] = [] + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + torch_sdpa: bool = True, + ): + k_cache: List[torch.Tensor] = [] + v_cache: List[torch.Tensor] = [] for i in range(self.num_blocks): x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask, torch_sdpa) k_cache.append(k_cache_) @@ -220,14 +243,17 @@ class T2STransformer: return x, k_cache, v_cache def decode_next_token( - self, x:torch.Tensor, - k_cache: List[torch.Tensor], - v_cache: List[torch.Tensor], - attn_mask : torch.Tensor=None, - torch_sdpa:bool=True + self, + x: torch.Tensor, + k_cache: List[torch.Tensor], + v_cache: List[torch.Tensor], + attn_mask: torch.Tensor = None, + torch_sdpa: bool = True, ): for i in range(self.num_blocks): - x, k_cache[i], v_cache[i] = 
self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i], attn_mask, torch_sdpa) + x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token( + x, k_cache[i], v_cache[i], attn_mask, torch_sdpa + ) return x, k_cache, v_cache @@ -249,16 +275,26 @@ class Text2SemanticDecoder(nn.Module): # assert self.EOS == 1024 self.bert_proj = nn.Linear(1024, self.embedding_dim) self.ar_text_embedding = TokenEmbedding( - self.embedding_dim, self.phoneme_vocab_size, self.p_dropout + self.embedding_dim, + self.phoneme_vocab_size, + self.p_dropout, ) self.ar_text_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True + self.embedding_dim, + dropout=0.1, + scale=False, + alpha=True, ) self.ar_audio_embedding = TokenEmbedding( - self.embedding_dim, self.vocab_size, self.p_dropout + self.embedding_dim, + self.vocab_size, + self.p_dropout, ) self.ar_audio_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True + self.embedding_dim, + dropout=0.1, + scale=False, + alpha=True, ) self.h = TransformerEncoder( @@ -293,7 +329,7 @@ class Text2SemanticDecoder(nn.Module): layer.linear1.weight, layer.linear1.bias, layer.linear2.weight, - layer.linear2.bias + layer.linear2.bias, ) block = T2SBlock( @@ -309,11 +345,11 @@ class Text2SemanticDecoder(nn.Module): layer.norm1.eps, layer.norm2.weight, layer.norm2.bias, - layer.norm2.eps + layer.norm2.eps, ) blocks.append(block) - + self.t2s_transformer = T2STransformer(self.num_layers, blocks) def make_input_data(self, x, x_lens, y, y_lens, bert_feature): @@ -387,7 +423,9 @@ class Text2SemanticDecoder(nn.Module): logits = self.ar_predict_layer(xy_dec[:, x_len:]) ###### DPO ############# - reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(x, x_lens, reject_y, reject_y_lens, bert_feature) + reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data( + x, x_lens, reject_y, reject_y_lens, bert_feature + ) reject_xy_dec, _ = self.h( (reject_xy_pos, None), @@ -404,7 +442,7 @@ class Text2SemanticDecoder(nn.Module): A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets) loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True) - + loss = loss_1 + loss_2 return loss, acc @@ -473,14 +511,14 @@ class Text2SemanticDecoder(nn.Module): # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么 def infer( - self, - x, - x_lens, - prompts, - bert_feature, - top_k: int = -100, - early_stop_num: int = -1, - temperature: float = 1.0, + self, + x, + x_lens, + prompts, + bert_feature, + top_k: int = -100, + early_stop_num: int = -1, + temperature: float = 1.0, ): x = self.ar_text_embedding(x) x = x + self.bert_proj(bert_feature.transpose(1, 2)) @@ -508,18 +546,14 @@ class Text2SemanticDecoder(nn.Module): (x_len, 0), value=False, ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( - y.device - ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(y.device) xy_dec, _ = self.h( (xy_pos, None), mask=xy_attn_mask, ) logits = self.ar_predict_layer(xy_dec[:, -1]) - samples = topk_sampling( - logits, top_k=top_k, top_p=1.0, temperature=temperature - ) + samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature) if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: print("use early stop num:", early_stop_num) @@ -542,18 +576,16 @@ class Text2SemanticDecoder(nn.Module): return y def pad_y_eos(self, y, y_mask_int, eos_id): - targets = F.pad(y, (0, 1), value=0) + eos_id 
* F.pad( - y_mask_int, (0, 1), value=1 - ) + targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(y_mask_int, (0, 1), value=1) # 错位 return targets[:, :-1], targets[:, 1:] def infer_panel_batch_infer( self, - x:List[torch.LongTensor], #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:List[torch.LongTensor], + x: List[torch.LongTensor], #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: List[torch.LongTensor], top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, @@ -563,10 +595,19 @@ class Text2SemanticDecoder(nn.Module): ): if prompts is None: print("Warning: Prompt free is not supported batch_infer! switch to naive_infer") - return self.infer_panel_naive_batched(x, x_lens, prompts, bert_feature, top_k=top_k, top_p=top_p, early_stop_num=early_stop_num, temperature=temperature, **kwargs) + return self.infer_panel_naive_batched( + x, + x_lens, + prompts, + bert_feature, + top_k=top_k, + top_p=top_p, + early_stop_num=early_stop_num, + temperature=temperature, + **kwargs, + ) - - max_len = kwargs.get("max_len",x_lens.max()) + max_len = kwargs.get("max_len", x_lens.max()) x_list = [] for x_item, bert_item in zip(x, bert_feature): # max_len = max(max_len, x_item.shape[0], bert_item.shape[1]) @@ -574,14 +615,15 @@ class Text2SemanticDecoder(nn.Module): x_item = x_item + self.bert_proj(bert_item.transpose(0, 1).unsqueeze(0)) x_item = self.ar_text_position(x_item).squeeze(0) # x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0] early_stop_num) or idx==1499: + + if (early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num) or idx == 1499: print("use early stop num:", early_stop_num) stop = True for i, batch_index in enumerate(batch_idx_map): batch_index = batch_idx_map[i] idx_list[batch_index] = idx y_list[batch_index] = y[i, :-1] - - if not (None in idx_list): + + if None not in idx_list: stop = True - + if stop: - if y.shape[1]==0: + if y.shape[1] == 0: y = torch.concat([y, torch.zeros_like(samples)], dim=1) print("bad zero prediction") print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") @@ -730,60 +764,65 @@ class Text2SemanticDecoder(nn.Module): ####################### update next step ################################### y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) - if (None in idx_list): + if None in idx_list: for i in range(x.shape[0]): if idx_list[i] is None: - idx_list[i] = 1500-1 ###如果没有生成到EOS,就用最大长度代替 - + idx_list[i] = 1500 - 1 ###如果没有生成到EOS,就用最大长度代替 + if ref_free: - return y_list, [0]*x.shape[0] + return y_list, [0] * x.shape[0] # print(idx_list) return y_list, idx_list - - def infer_panel_naive_batched(self, - x:List[torch.LongTensor], #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:List[torch.LongTensor], + + def infer_panel_naive_batched( + self, + x: List[torch.LongTensor], #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: List[torch.LongTensor], top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, temperature: float = 1.0, repetition_penalty: float = 1.35, - **kwargs - ): + **kwargs, 
+ ): y_list = [] idx_list = [] for i in range(len(x)): - y, idx = self.infer_panel_naive(x[i].unsqueeze(0), - x_lens[i], - prompts[i].unsqueeze(0) if prompts is not None else None, - bert_feature[i].unsqueeze(0), - top_k, - top_p, - early_stop_num, - temperature, - repetition_penalty, - **kwargs) + y, idx = self.infer_panel_naive( + x[i].unsqueeze(0), + x_lens[i], + prompts[i].unsqueeze(0) if prompts is not None else None, + bert_feature[i].unsqueeze(0), + top_k, + top_p, + early_stop_num, + temperature, + repetition_penalty, + **kwargs, + ) y_list.append(y[0]) idx_list.append(idx) - + return y_list, idx_list - + def infer_panel_naive( self, - x:torch.LongTensor, #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:torch.LongTensor, + x: torch.LongTensor, #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: torch.LongTensor, top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, temperature: float = 1.0, repetition_penalty: float = 1.35, - **kwargs + **kwargs, ): x = self.ar_text_embedding(x) x = x + self.bert_proj(bert_feature.transpose(1, 2)) @@ -828,11 +867,13 @@ class Text2SemanticDecoder(nn.Module): (x_len, 0), value=False, ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)\ - .unsqueeze(0)\ - .expand(bsz*self.num_head, -1, -1)\ - .view(bsz, self.num_head, src_len, src_len)\ - .to(device=x.device, dtype=torch.bool) + xy_attn_mask = ( + torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + .unsqueeze(0) + .expand(bsz * self.num_head, -1, -1) + .view(bsz, self.num_head, src_len, src_len) + .to(device=x.device, dtype=torch.bool) + ) for idx in tqdm(range(1500)): if xy_attn_mask is not None: @@ -840,13 +881,11 @@ class Text2SemanticDecoder(nn.Module): else: xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) - logits = self.ar_predict_layer( - xy_dec[:, -1] - ) + logits = self.ar_predict_layer(xy_dec[:, -1]) if idx == 0: xy_attn_mask = None - if(idx<11):###至少预测出10个token不然不给停止(0.4s) + if idx < 11: ###至少预测出10个token不然不给停止(0.4s) logits = logits[:, :-1] samples = sample( @@ -870,24 +909,27 @@ class Text2SemanticDecoder(nn.Module): ####################### update next step ################################### y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) if ref_free: return y[:, :-1], 0 return y[:, :-1], idx - - + def infer_panel( self, - x:torch.LongTensor, #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:torch.LongTensor, + x: torch.LongTensor, #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: torch.LongTensor, top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, temperature: float = 1.0, repetition_penalty: float = 1.35, - **kwargs + **kwargs, ): - return self.infer_panel_naive(x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs) + return self.infer_panel_naive( + x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs + ) diff --git a/GPT_SoVITS/AR/models/t2s_model_onnx.py 
b/GPT_SoVITS/AR/models/t2s_model_onnx.py index 7834297d..4f7b50a3 100644 --- a/GPT_SoVITS/AR/models/t2s_model_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_model_onnx.py @@ -1,17 +1,13 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py # reference: https://github.com/lifeiteng/vall-e import torch -from tqdm import tqdm - -from AR.modules.embedding_onnx import SinePositionalEmbedding -from AR.modules.embedding_onnx import TokenEmbedding -from AR.modules.transformer_onnx import LayerNorm -from AR.modules.transformer_onnx import TransformerEncoder -from AR.modules.transformer_onnx import TransformerEncoderLayer from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy +from AR.modules.embedding_onnx import SinePositionalEmbedding, TokenEmbedding +from AR.modules.transformer_onnx import LayerNorm, TransformerEncoder, TransformerEncoderLayer + default_config = { "embedding_dim": 512, "hidden_dim": 512, @@ -26,12 +22,13 @@ default_config = { inf_tensor_value = torch.FloatTensor([-float("Inf")]).float() + def logits_to_probs( logits, - previous_tokens = None, + previous_tokens=None, temperature: float = 1.0, - top_k = None, - top_p = None, + top_k=None, + top_p=None, repetition_penalty: float = 1.0, ): previous_tokens = previous_tokens.squeeze() @@ -39,19 +36,27 @@ def logits_to_probs( previous_tokens = previous_tokens.long() score = torch.gather(logits, dim=0, index=previous_tokens) score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty + score < 0, + score * repetition_penalty, + score / repetition_penalty, ) logits.scatter_(dim=0, index=previous_tokens, src=score) if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 + torch.nn.functional.softmax( + sorted_logits, + dim=-1, + ), + dim=-1, ) sorted_indices_to_remove = cum_probs > top_p sorted_indices_to_remove[0] = False # keep at least one option indices_to_remove = sorted_indices_to_remove.scatter( - dim=0, index=sorted_indices, src=sorted_indices_to_remove + dim=0, + index=sorted_indices, + src=sorted_indices_to_remove, ) logits = logits.masked_fill(indices_to_remove, -float("Inf")) @@ -67,7 +72,7 @@ def logits_to_probs( def multinomial_sample_one_no_sync( - probs_sort + probs_sort, ): # Does multinomial sampling without a cuda synchronization q = torch.randn_like(probs_sort) return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) @@ -79,7 +84,9 @@ def sample( **sampling_kwargs, ): probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, **sampling_kwargs + logits=logits, + previous_tokens=previous_tokens, + **sampling_kwargs, ) idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs @@ -91,7 +98,7 @@ class OnnxEncoder(nn.Module): self.ar_text_embedding = ar_text_embedding self.bert_proj = bert_proj self.ar_text_position = ar_text_position - + def forward(self, x, bert_feature): x = self.ar_text_embedding(x) x = x + self.bert_proj(bert_feature.transpose(1, 2)) @@ -99,8 +106,18 @@ class OnnxEncoder(nn.Module): class T2SFirstStageDecoder(nn.Module): - def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, - top_k, early_stop_num, num_layers): + def __init__( + self, + ar_audio_embedding, + ar_audio_position, + h, + ar_predict_layer, + loss_fct, + 
ar_accuracy_metric, + top_k, + early_stop_num, + num_layers, + ): super().__init__() self.ar_audio_embedding = ar_audio_embedding self.ar_audio_position = ar_audio_position @@ -111,11 +128,11 @@ class T2SFirstStageDecoder(nn.Module): self.top_k = top_k self.early_stop_num = early_stop_num self.num_layers = num_layers - + def forward(self, x, prompt): y = prompt - x_example = x[:,:,0] * 0.0 - #N, 1, 512 + x_example = x[:, :, 0] * 0.0 + # N, 1, 512 cache = { "all_stage": self.num_layers, "k": None, @@ -132,11 +149,15 @@ class T2SFirstStageDecoder(nn.Module): xy_pos = torch.concat([x, y_pos], dim=1) - y_example = y_pos[:,:,0] * 0.0 - x_attn_mask = torch.matmul(x_example.transpose(0, 1) , x_example).bool() + y_example = y_pos[:, :, 0] * 0.0 + x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example).bool() y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64) y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum( - torch.ones_like(y_example.transpose(0, 1), dtype=torch.int64), dim=0 + torch.ones_like( + y_example.transpose(0, 1), + dtype=torch.int64, + ), + dim=0, ) y_attn_mask = y_attn_mask > 0 @@ -145,10 +166,16 @@ class T2SFirstStageDecoder(nn.Module): x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1) y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1) xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) - cache["k"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ - .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) - cache["v"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ - .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) + cache["k"] = ( + torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512))) + .unsqueeze(1) + .repeat(self.num_layers, 1, 1, 1) + ) + cache["v"] = ( + torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512))) + .unsqueeze(1) + .repeat(self.num_layers, 1, 1, 1) + ) xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) logits = self.ar_predict_layer(xy_dec[:, -1]) @@ -160,8 +187,18 @@ class T2SFirstStageDecoder(nn.Module): class T2SStageDecoder(nn.Module): - def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, - top_k, early_stop_num, num_layers): + def __init__( + self, + ar_audio_embedding, + ar_audio_position, + h, + ar_predict_layer, + loss_fct, + ar_accuracy_metric, + top_k, + early_stop_num, + num_layers, + ): super().__init__() self.ar_audio_embedding = ar_audio_embedding self.ar_audio_position = ar_audio_position @@ -184,14 +221,18 @@ class T2SStageDecoder(nn.Module): } y_emb = torch.cat( - [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 + [ + cache["y_emb"], + self.ar_audio_embedding(y[:, -1:]), + ], + 1, ) cache["y_emb"] = y_emb y_pos = self.ar_audio_position(y_emb) xy_pos = y_pos[:, -1:] - - y_example = y_pos[:,:,0] * 0.0 + + y_example = y_pos[:, :, 0] * 0.0 xy_attn_mask = torch.cat([x_example, y_example], dim=1) xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool) @@ -250,12 +291,28 @@ class Text2SemanticDecoder(nn.Module): def init_onnx(self): self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position) - self.first_stage_decoder = T2SFirstStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, - self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, - self.num_layers) - self.stage_decoder = 
T2SStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, - self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, - self.num_layers) + self.first_stage_decoder = T2SFirstStageDecoder( + self.ar_audio_embedding, + self.ar_audio_position, + self.h, + self.ar_predict_layer, + self.loss_fct, + self.ar_accuracy_metric, + self.top_k, + self.early_stop_num, + self.num_layers, + ) + self.stage_decoder = T2SStageDecoder( + self.ar_audio_embedding, + self.ar_audio_position, + self.h, + self.ar_predict_layer, + self.loss_fct, + self.ar_accuracy_metric, + self.top_k, + self.early_stop_num, + self.num_layers, + ) def forward(self, x, prompts, bert_feature): early_stop_num = self.early_stop_num @@ -286,7 +343,7 @@ class Text2SemanticDecoder(nn.Module): y = prompts prefix_len = y.shape[1] x_len = x.shape[1] - x_example = x[:,:,0] * 0.0 + x_example = x[:, :, 0] * 0.0 x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example) x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool) @@ -303,9 +360,7 @@ class Text2SemanticDecoder(nn.Module): if cache["first_infer"] == 1: y_emb = self.ar_audio_embedding(y) else: - y_emb = torch.cat( - [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 - ) + y_emb = torch.cat([cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1) cache["y_emb"] = y_emb y_pos = self.ar_audio_position(y_emb) if cache["first_infer"] == 1: @@ -317,7 +372,8 @@ class Text2SemanticDecoder(nn.Module): x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True) y_attn_mask = F.pad( torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), - (x_len, 0), value=False + (x_len, 0), + value=False, ) xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) else: @@ -335,4 +391,4 @@ class Text2SemanticDecoder(nn.Module): break y = torch.concat([y, samples], dim=1) cache["first_infer"] = 0 - return y, idx \ No newline at end of file + return y, idx diff --git a/GPT_SoVITS/AR/models/utils.py b/GPT_SoVITS/AR/models/utils.py index d2ae26d9..cc4f24d8 100644 --- a/GPT_SoVITS/AR/models/utils.py +++ b/GPT_SoVITS/AR/models/utils.py @@ -1,8 +1,10 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py # reference: https://github.com/lifeiteng/vall-e +from typing import Tuple + import torch import torch.nn.functional as F -from typing import Tuple + def sequence_mask(length, max_length=None): if max_length is None: @@ -67,14 +69,18 @@ def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: n = lengths.size(0) seq_range = torch.arange(0, max_len, device=lengths.device) expaned_lengths = seq_range.unsqueeze(0).repeat(n, 1) - expaned_lengths -= (max_len-lengths).unsqueeze(-1) + expaned_lengths -= (max_len - lengths).unsqueeze(-1) - return expaned_lengths<0 + return expaned_lengths < 0 # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py def top_k_top_p_filtering( - logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 + logits, + top_k=0, + top_p=1.0, + filter_value=-float("Inf"), + min_tokens_to_keep=1, ): """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: @@ -105,9 +111,7 @@ def top_k_top_p_filtering( sorted_indices_to_remove[..., 0] = 0 # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove - ) + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, 
sorted_indices_to_remove) logits[indices_to_remove] = filter_value return logits @@ -130,7 +134,7 @@ def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): return token -from typing import Optional, Tuple +from typing import Optional def multinomial_sample_one_no_sync( @@ -156,19 +160,21 @@ def logits_to_probs( previous_tokens = previous_tokens.long() score = torch.gather(logits, dim=1, index=previous_tokens) score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty + score < 0, + score * repetition_penalty, + score / repetition_penalty, ) logits.scatter_(dim=1, index=previous_tokens, src=score) if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 - ) + cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1) sorted_indices_to_remove = cum_probs > top_p sorted_indices_to_remove[:, 0] = False # keep at least one option indices_to_remove = sorted_indices_to_remove.scatter( - dim=1, index=sorted_indices, src=sorted_indices_to_remove + dim=1, + index=sorted_indices, + src=sorted_indices_to_remove, ) logits = logits.masked_fill(indices_to_remove, -float("Inf")) @@ -176,7 +182,7 @@ def logits_to_probs( if top_k is not None: v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - pivot = v[: , -1].unsqueeze(-1) + pivot = v[:, -1].unsqueeze(-1) logits = torch.where(logits < pivot, -float("Inf"), logits) probs = torch.nn.functional.softmax(logits, dim=-1) @@ -188,18 +194,19 @@ def sample( previous_tokens: Optional[torch.Tensor] = None, **sampling_kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: - probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, **sampling_kwargs - ) + probs = logits_to_probs(logits=logits, previous_tokens=previous_tokens, **sampling_kwargs) idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs -def dpo_loss(policy_chosen_logps: torch.FloatTensor, - policy_rejected_logps: torch.FloatTensor, - reference_chosen_logps: torch.FloatTensor, - reference_rejected_logps: torch.FloatTensor, - beta: float, - reference_free: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + +def dpo_loss( + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + beta: float, + reference_free: bool = False, +) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: pi_logratios = policy_chosen_logps - policy_rejected_logps ref_logratios = reference_chosen_logps - reference_rejected_logps @@ -214,40 +221,53 @@ def dpo_loss(policy_chosen_logps: torch.FloatTensor, return losses.mean(), chosen_rewards, rejected_rewards -def get_batch_logps(logits_target: torch.FloatTensor, logits_reject: torch.FloatTensor, labels_target: torch.LongTensor, labels_reject: torch.LongTensor, average_log_prob: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor]: +def get_batch_logps( + logits_target: torch.FloatTensor, + logits_reject: torch.FloatTensor, + labels_target: torch.LongTensor, + labels_reject: torch.LongTensor, + average_log_prob: bool = False, +) -> Tuple[torch.FloatTensor, torch.FloatTensor]: # dummy token; we'll ignore the losses on these tokens later - per_token_logps_target = torch.gather(logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2)).squeeze(2) - per_token_logps_reject = 
torch.gather(logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2)).squeeze(2) + per_token_logps_target = torch.gather( + logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2) + ).squeeze(2) + per_token_logps_reject = torch.gather( + logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2) + ).squeeze(2) return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1) + def make_reject_y(y_o, y_lens): def repeat_P(y): range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() - pre = y[:range_idx[0]] - shf = y[range_idx[1]:] - range_text = y[range_idx[0]:range_idx[1]] + pre = y[: range_idx[0]] + shf = y[range_idx[1] :] + range_text = y[range_idx[0] : range_idx[1]] new_y = torch.cat([pre, range_text, range_text, shf]) return new_y + def lost_P(y): range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() - pre = y[:range_idx[0]] - shf = y[range_idx[1]:] - range_text = y[range_idx[0]:range_idx[1]] + pre = y[: range_idx[0]] + shf = y[range_idx[1] :] + range_text = y[range_idx[0] : range_idx[1]] new_y = torch.cat([pre, shf]) return new_y + bs = len(y_lens) reject_y = [] reject_y_lens = [] for b in range(bs): - process_item_idx = torch.randint(0, 1, size=(1, ))[0] + process_item_idx = torch.randint(0, 1, size=(1,))[0] if process_item_idx == 0: new_y = repeat_P(y_o[b]) reject_y.append(new_y) reject_y_lens.append(len(new_y)) - elif process_item_idx==1: + elif process_item_idx == 1: new_y = lost_P(y_o[b]) reject_y.append(new_y) reject_y_lens.append(len(new_y)) @@ -256,7 +276,7 @@ def make_reject_y(y_o, y_lens): pad_length = max_length - reject_y_lens[b] reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0) - reject_y = torch.stack(reject_y, dim = 0) + reject_y = torch.stack(reject_y, dim=0) reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device) return reject_y, reject_y_lens diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py index 5ca888b5..936f9c3f 100644 --- a/GPT_SoVITS/AR/modules/activation.py +++ b/GPT_SoVITS/AR/modules/activation.py @@ -1,17 +1,14 @@ # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py -from typing import Optional -from typing import Tuple +from typing import Optional, Tuple + import torch from torch import Tensor -from torch.nn import Linear -from torch.nn import Module -from torch.nn.init import constant_ -from torch.nn.init import xavier_normal_ -from torch.nn.init import xavier_uniform_ +from torch.nn import Linear, Module +from torch.nn import functional as F +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from torch.nn import functional as F from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched F.multi_head_attention_forward = multi_head_attention_forward_patched @@ -73,6 +70,7 @@ class MultiheadAttention(Module): >>> attn_output, attn_output_weights = multihead_attn(query, key, value) """ + __constants__ = ["batch_first"] bias_k: Optional[torch.Tensor] bias_v: Optional[torch.Tensor] @@ -104,9 +102,7 @@ class MultiheadAttention(Module): self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" 
if add_bias_kv: self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) @@ -117,31 +113,32 @@ class MultiheadAttention(Module): if linear1_cls == Linear: if not self._qkv_same_embed_dim: self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs) + torch.empty((embed_dim, embed_dim), **factory_kwargs), ) self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs) + torch.empty((embed_dim, self.kdim), **factory_kwargs), ) self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs) + torch.empty((embed_dim, self.vdim), **factory_kwargs), ) self.register_parameter("in_proj_weight", None) else: self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs), ) self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs) - ) + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) else: self.register_parameter("in_proj_bias", None) self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, ) self._reset_parameters() @@ -150,7 +147,10 @@ class MultiheadAttention(Module): raise NotImplementedError else: self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + embed_dim, + 3 * embed_dim, + bias=bias, + **factory_kwargs, ) self.in_proj_weight = self.in_proj_linear.weight @@ -164,7 +164,10 @@ class MultiheadAttention(Module): self.register_parameter("in_proj_bias", None) self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, ) if self.bias_k is not None: @@ -261,28 +264,26 @@ class MultiheadAttention(Module): if key_padding_mask is not None: _kpm_dtype = key_padding_mask.dtype if _kpm_dtype != torch.bool and not torch.is_floating_point( - key_padding_mask + key_padding_mask, ): - raise AssertionError( - "only bool and floating types of key_padding_mask are supported" - ) + raise AssertionError("only bool and floating types of key_padding_mask are supported") why_not_fast_path = "" if not is_batched: - why_not_fast_path = ( - f"input not batched; expected query.dim() of 3 but got {query.dim()}" - ) + why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}" elif query is not key or key is not value: # When lifting this restriction, don't forget to either # enforce that the dtypes all match or test cases where # they don't! why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: - why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" - elif ( - self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype - ): + why_not_fast_path = ( + f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + ) + elif self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype: # this case will fail anyway, but at least they'll get a useful error message. 
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + why_not_fast_path = ( + f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + ) elif self.training: why_not_fast_path = "training is enabled" elif not self.batch_first: @@ -300,9 +301,7 @@ class MultiheadAttention(Module): elif attn_mask is not None: why_not_fast_path = "attn_mask was not None" elif query.is_nested and key_padding_mask is not None: - why_not_fast_path = ( - "key_padding_mask is not supported with NestedTensor input" - ) + why_not_fast_path = "key_padding_mask is not supported with NestedTensor input" elif self.num_heads % 2 == 1: why_not_fast_path = "num_heads is odd" elif torch.is_autocast_enabled(): @@ -322,20 +321,10 @@ class MultiheadAttention(Module): # generator expressions. if torch.overrides.has_torch_function(tensor_args): why_not_fast_path = "some Tensor argument has_torch_function" - elif not all( - [ - (x is None or x.is_cuda or "cpu" in str(x.device)) - for x in tensor_args - ] - ): + elif not all([(x is None or x.is_cuda or "cpu" in str(x.device)) for x in tensor_args]): why_not_fast_path = "some Tensor argument is neither CUDA nor CPU" - elif torch.is_grad_enabled() and any( - [x is not None and x.requires_grad for x in tensor_args] - ): - why_not_fast_path = ( - "grad is enabled and at least one of query or the " - "input/output projection weights or biases requires_grad" - ) + elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]): + why_not_fast_path = "grad is enabled and at least one of query or the input/output projection weights or biases requires_grad" if not why_not_fast_path: return torch._native_multi_head_attention( query, @@ -350,11 +339,7 @@ class MultiheadAttention(Module): key_padding_mask if key_padding_mask is not None else attn_mask, need_weights, average_attn_weights, - 1 - if key_padding_mask is not None - else 0 - if attn_mask is not None - else None, + 1 if key_padding_mask is not None else 0 if attn_mask is not None else None, ) any_nested = query.is_nested or key.is_nested or value.is_nested diff --git a/GPT_SoVITS/AR/modules/activation_onnx.py b/GPT_SoVITS/AR/modules/activation_onnx.py index b54acd99..c14ce40c 100644 --- a/GPT_SoVITS/AR/modules/activation_onnx.py +++ b/GPT_SoVITS/AR/modules/activation_onnx.py @@ -1,17 +1,13 @@ # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py -from typing import Optional -from typing import Tuple +from typing import Optional, Tuple + import torch from torch import Tensor -from torch.nn import Linear -from torch.nn import Module -from torch.nn.init import constant_ -from torch.nn.init import xavier_normal_ -from torch.nn.init import xavier_uniform_ +from torch.nn import Linear, Module +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from torch.nn import functional as F from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched @@ -47,9 +43,7 @@ class MultiheadAttention(Module): self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if add_bias_kv: self.bias_k 
= Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) @@ -60,18 +54,30 @@ class MultiheadAttention(Module): if linear1_cls == Linear: if not self._qkv_same_embed_dim: self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs) + torch.empty( + (embed_dim, embed_dim), + **factory_kwargs, + ) ) self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs) + torch.empty( + (embed_dim, self.kdim), + **factory_kwargs, + ) ) self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs) + torch.empty( + (embed_dim, self.vdim), + **factory_kwargs, + ) ) self.register_parameter("in_proj_weight", None) else: self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + torch.empty( + (3 * embed_dim, embed_dim), + **factory_kwargs, + ) ) self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) @@ -79,13 +85,11 @@ class MultiheadAttention(Module): if bias: self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs) + torch.empty(3 * embed_dim, **factory_kwargs), ) else: self.register_parameter("in_proj_bias", None) - self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs - ) + self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs) self._reset_parameters() else: @@ -93,7 +97,10 @@ class MultiheadAttention(Module): raise NotImplementedError else: self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + embed_dim, + 3 * embed_dim, + bias=bias, + **factory_kwargs, ) self.in_proj_weight = self.in_proj_linear.weight @@ -107,7 +114,10 @@ class MultiheadAttention(Module): self.register_parameter("in_proj_bias", None) self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, ) if self.bias_k is not None: diff --git a/GPT_SoVITS/AR/modules/embedding.py b/GPT_SoVITS/AR/modules/embedding.py index 3a382f93..39da5603 100644 --- a/GPT_SoVITS/AR/modules/embedding.py +++ b/GPT_SoVITS/AR/modules/embedding.py @@ -60,14 +60,11 @@ class SinePositionalEmbedding(nn.Module): return pe = torch.zeros(x.size(1), self.embedding_dim) if self.reverse: - position = torch.arange( - x.size(1) - 1, -1, -1.0, dtype=torch.float32 - ).unsqueeze(1) + position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) else: position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) div_term = torch.exp( - torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) - * -(math.log(10000.0) / self.embedding_dim) + torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) ) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) diff --git a/GPT_SoVITS/AR/modules/embedding_onnx.py b/GPT_SoVITS/AR/modules/embedding_onnx.py index b93405b4..c870013f 100644 --- a/GPT_SoVITS/AR/modules/embedding_onnx.py +++ b/GPT_SoVITS/AR/modules/embedding_onnx.py @@ -50,7 +50,7 @@ class SinePositionalEmbedding(nn.Module): self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) def extend_pe(self, x): - position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) + position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) scpe = (position * self.div_term).unsqueeze(0) pe = 
torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) pe = pe.contiguous().view(1, -1, self.embedding_dim) diff --git a/GPT_SoVITS/AR/modules/lr_schedulers.py b/GPT_SoVITS/AR/modules/lr_schedulers.py index b8867467..707a911f 100644 --- a/GPT_SoVITS/AR/modules/lr_schedulers.py +++ b/GPT_SoVITS/AR/modules/lr_schedulers.py @@ -49,13 +49,9 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): lr = self.end_lr else: - decay_ratio = (self._current_step - self.warmup_steps) / ( - self.total_steps - self.warmup_steps - ) + decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) if decay_ratio < 0.0 or decay_ratio > 1.0: - raise RuntimeError( - "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." - ) + raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) @@ -70,7 +66,13 @@ if __name__ == "__main__": m = nn.Linear(10, 10) opt = Adam(m.parameters(), lr=1e-4) s = WarmupCosineLRSchedule( - opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 + opt, + 1e-6, + 2e-4, + 1e-6, + warmup_steps=2000, + total_steps=20000, + current_step=0, ) lrs = [] for i in range(25000): diff --git a/GPT_SoVITS/AR/modules/optim.py b/GPT_SoVITS/AR/modules/optim.py index 98785f05..aeebbee3 100644 --- a/GPT_SoVITS/AR/modules/optim.py +++ b/GPT_SoVITS/AR/modules/optim.py @@ -16,8 +16,7 @@ import contextlib import logging from collections import defaultdict -from typing import List -from typing import Tuple +from typing import List, Tuple import torch from torch import Tensor @@ -71,12 +70,8 @@ class BatchedOptimizer(Optimizer): group_params_names: name for each parameter in group, which is List[str]. """ - batches = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter - batches_names = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str + batches = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter + batches_names = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of str assert len(param_group) == len(group_params_names) for p, named_p in zip(param_group, group_params_names): @@ -85,11 +80,8 @@ class BatchedOptimizer(Optimizer): batches_names[key].append(named_p) batches_names_keys = list(batches_names.keys()) - sorted_idx = sorted( - range(len(batches_names)), key=lambda i: batches_names_keys[i]) - batches_names = [ - batches_names[batches_names_keys[idx]] for idx in sorted_idx - ] + sorted_idx = sorted(range(len(batches_names)), key=lambda i: batches_names_keys[i]) + batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx] batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] stacked_params_dict = dict() @@ -106,16 +98,14 @@ class BatchedOptimizer(Optimizer): # group. class Optimizer will take care of saving/loading state. state = self.state[p] p_stacked = torch.stack(batch) - grad = torch.stack([ - torch.zeros_like(p) if p.grad is None else p.grad for p in batch - ]) + grad = torch.stack([torch.zeros_like(p) if p.grad is None else p.grad for p in batch]) p_stacked.grad = grad stacked_params_dict[key] = p_stacked tuples.append((p_stacked, state, batch_names)) yield tuples # <-- calling code will do the actual optimization here! 
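# --- Illustrative sketch, not part of the patch: the parameter-batching idea
# used by BatchedOptimizer.batched_params above, reduced to a few lines.
# Parameters are keyed by (dtype, *shape) so same-shaped tensors can be
# stacked and updated by one batched kernel; `group_by_shape` and `tiny_model`
# are hypothetical names used only for this sketch.
import torch
from collections import defaultdict
from torch import nn


def group_by_shape(named_params):
    """Group parameters by (dtype, *shape), mirroring the keying scheme above."""
    batches = defaultdict(list)
    for name, p in named_params:
        batches[(str(p.dtype), *p.shape)].append((name, p))
    return batches


tiny_model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
for key, group in group_by_shape(tiny_model.named_parameters()).items():
    stacked = torch.stack([p for _, p in group])  # one (batch, *shape) tensor per group
    print(key, stacked.shape)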
- for ((stacked_params, _state, _names), batch) in zip(tuples, batches): + for (stacked_params, _state, _names), batch in zip(tuples, batches): for i, p in enumerate(batch): # batch is list of Parameter p.copy_(stacked_params[i]) @@ -164,25 +154,24 @@ class ScaledAdam(BatchedOptimizer): """ def __init__( - self, - params, - lr=3e-02, - clipping_scale=None, - betas=(0.9, 0.98), - scalar_lr_scale=0.1, - eps=1.0e-08, - param_min_rms=1.0e-05, - param_max_rms=3.0, - scalar_max=10.0, - size_update_period=4, - clipping_update_period=100, - parameters_names=None, - show_dominant_parameters=True, ): - + self, + params, + lr=3e-02, + clipping_scale=None, + betas=(0.9, 0.98), + scalar_lr_scale=0.1, + eps=1.0e-08, + param_min_rms=1.0e-05, + param_max_rms=3.0, + scalar_max=10.0, + size_update_period=4, + clipping_update_period=100, + parameters_names=None, + show_dominant_parameters=True, + ): assert parameters_names is not None, ( - "Please prepare parameters_names," - "which is a List[List[str]]. Each List[str] is for a group" - "and each str is for a parameter") + "Please prepare parameters_names,which is a List[List[str]]. Each List[str] is for a groupand each str is for a parameter" + ) defaults = dict( lr=lr, clipping_scale=clipping_scale, @@ -193,7 +182,8 @@ class ScaledAdam(BatchedOptimizer): param_max_rms=param_max_rms, scalar_max=scalar_max, size_update_period=size_update_period, - clipping_update_period=clipping_update_period, ) + clipping_update_period=clipping_update_period, + ) super(ScaledAdam, self).__init__(params, defaults) assert len(self.param_groups) == len(parameters_names) @@ -218,18 +208,13 @@ class ScaledAdam(BatchedOptimizer): batch = True - for group, group_params_names in zip(self.param_groups, - self.parameters_names): - - with self.batched_params(group["params"], - group_params_names) as batches: - + for group, group_params_names in zip(self.param_groups, self.parameters_names): + with self.batched_params(group["params"], group_params_names) as batches: # batches is list of pairs (stacked_param, state). stacked_param is like # a regular parameter, and will have a .grad, but the 1st dim corresponds to # a stacking dim, it is not a real dim. - if (len(batches[0][1]) == - 0): # if len(first state) == 0: not yet initialized + if len(batches[0][1]) == 0: # if len(first state) == 0: not yet initialized clipping_scale = 1 else: clipping_scale = self._get_clipping_scale(group, batches) @@ -239,9 +224,7 @@ class ScaledAdam(BatchedOptimizer): # grad is not going to be None, we handled that when creating the batches. grad = p.grad if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients" - ) + raise RuntimeError("ScaledAdam optimizer does not support sparse gradients") # State initialization if len(state) == 0: self._init_state(group, p, state) @@ -274,8 +257,7 @@ class ScaledAdam(BatchedOptimizer): # parameter-change "delta", which combines all forms of # update. this is equivalent to how it's done in Adam, # except for the first few steps. - state["delta"] = torch.zeros_like( - p, memory_format=torch.preserve_format) + state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format) batch_size = p.shape[0] numel = p.numel() // batch_size @@ -285,22 +267,16 @@ class ScaledAdam(BatchedOptimizer): # "param_rms" just periodically records the scalar root-mean-square value of # the parameter tensor. 
# it has a shape like (batch_size, 1, 1, 1, 1) - param_rms = ( - (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) + param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() state["param_rms"] = param_rms state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) - state["scale_grads"] = torch.zeros(size_update_period, - *param_rms.shape, **kwargs) + state["scale_grads"] = torch.zeros(size_update_period, *param_rms.shape, **kwargs) # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format) + state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) - def _get_clipping_scale(self, - group: dict, - tuples: List[Tuple[Tensor, dict, List[str]]] - ) -> float: + def _get_clipping_scale(self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]) -> float: """ Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients by this amount before applying the rest of the update. @@ -325,20 +301,18 @@ class ScaledAdam(BatchedOptimizer): clipping_update_period = group["clipping_update_period"] tot_sumsq = torch.tensor(0.0, device=first_p.device) - for (p, state, param_names) in tuples: + for p, state, param_names in tuples: grad = p.grad if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients") + raise RuntimeError("ScaledAdam optimizer does not support sparse gradients") if p.numel() == p.shape[0]: # a batch of scalars tot_sumsq += (grad**2).sum() # sum() to change shape [1] to [] else: - tot_sumsq += ((grad * state["param_rms"])**2).sum() + tot_sumsq += ((grad * state["param_rms"]) ** 2).sum() tot_norm = tot_sumsq.sqrt() if "model_norms" not in first_state: - first_state["model_norms"] = torch.zeros( - clipping_update_period, device=p.device) + first_state["model_norms"] = torch.zeros(clipping_update_period, device=p.device) first_state["model_norms"][step % clipping_update_period] = tot_norm if step % clipping_update_period == 0: @@ -350,20 +324,20 @@ class ScaledAdam(BatchedOptimizer): for n in range(0, 5): index = min( clipping_update_period - 1, - (clipping_update_period // 4) * n, ) + (clipping_update_period // 4) * n, + ) quartiles.append(sorted_norms[index].item()) median = quartiles[2] threshold = clipping_scale * median first_state["model_norm_threshold"] = threshold - percent_clipped = (first_state["num_clipped"] * 100.0 / - clipping_update_period - if "num_clipped" in first_state else 0.0) + percent_clipped = ( + first_state["num_clipped"] * 100.0 / clipping_update_period if "num_clipped" in first_state else 0.0 + ) first_state["num_clipped"] = 0 quartiles = " ".join(["%.3e" % x for x in quartiles]) logging.info( - f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " - f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" + f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" ) if step < clipping_update_period: @@ -373,25 +347,20 @@ class ScaledAdam(BatchedOptimizer): model_norm_threshold = first_state["model_norm_threshold"] except KeyError: logging.info( - "Warning: model_norm_threshold not in state: possibly " - "you changed config when restarting, adding clipping_scale option?" + "Warning: model_norm_threshold not in state: possibly you changed config when restarting, adding clipping_scale option?" 
) return 1.0 ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) if ans < 1.0: first_state["num_clipped"] += 1 if ans < 0.1: - logging.warn( - f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" - ) + logging.warn(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}") if self.show_dominant_parameters: assert p.shape[0] == len(param_names) self._show_gradient_dominating_parameter(tuples, tot_sumsq) return ans - def _show_gradient_dominating_parameter( - self, tuples: List[Tuple[Tensor, dict, List[str]]], - tot_sumsq: Tensor): + def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor): """ Show information of parameter wihch dominanting tot_sumsq. @@ -406,7 +375,7 @@ class ScaledAdam(BatchedOptimizer): from tuples, we still pass it to save some time. """ all_sumsq_orig = {} - for (p, state, batch_param_names) in tuples: + for p, state, batch_param_names in tuples: # p is a stacked batch parameters. batch_grad = p.grad if p.numel() == p.shape[0]: # a batch of scalars @@ -415,41 +384,46 @@ class ScaledAdam(BatchedOptimizer): batch_rms_orig = torch.ones(p.shape[0]) else: batch_rms_orig = state["param_rms"] - batch_sumsq_orig = ((batch_grad * batch_rms_orig)**2).sum( - dim=list(range(1, batch_grad.ndim))) - - for name, sumsq_orig, rms, grad in zip(batch_param_names, - batch_sumsq_orig, - batch_rms_orig, batch_grad): + batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(dim=list(range(1, batch_grad.ndim))) + for name, sumsq_orig, rms, grad in zip( + batch_param_names, + batch_sumsq_orig, + batch_rms_orig, + batch_grad, + ): proportion_orig = sumsq_orig / tot_sumsq all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) assert torch.isclose( sum([value[0] for value in all_sumsq_orig.values()]).cpu(), - torch.tensor(1.0), ) + torch.tensor(1.0), + ) sorted_by_proportion = { k: v for k, v in sorted( all_sumsq_orig.items(), key=lambda item: item[1][0], - reverse=True, ) + reverse=True, + ) } dominant_param_name = next(iter(sorted_by_proportion)) - (dominant_proportion, dominant_sumsq, dominant_rms, - dominant_grad, ) = sorted_by_proportion[dominant_param_name] - logging.info(f"Parameter Dominanting tot_sumsq {dominant_param_name}" - f" with proportion {dominant_proportion:.2f}," - f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" - f"={dominant_sumsq:.3e}," - f" grad_sumsq = {(dominant_grad**2).sum():.3e}," - f" orig_rms_sq={(dominant_rms**2).item():.3e}") + ( + dominant_proportion, + dominant_sumsq, + dominant_rms, + dominant_grad, + ) = sorted_by_proportion[dominant_param_name] + logging.info( + f"Parameter Dominanting tot_sumsq {dominant_param_name}" + f" with proportion {dominant_proportion:.2f}," + f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" + f"={dominant_sumsq:.3e}," + f" grad_sumsq = {(dominant_grad**2).sum():.3e}," + f" orig_rms_sq={(dominant_rms**2).item():.3e}" + ) - def _step_one_batch(self, - group: dict, - p: Tensor, - state: dict, - clipping_scale: float): + def _step_one_batch(self, group: dict, p: Tensor, state: dict, clipping_scale: float): """ Do the step for one parameter, which is actually going to be a batch of `real` parameters, with dim 0 as the batch dim. 
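# --- Illustrative sketch, not part of the patch: the clipping rule that
# _get_clipping_scale above applies, using plain tensors. The threshold is
# clipping_scale times the median gradient norm recorded over the last
# clipping_update_period steps, and the returned factor never exceeds 1.0.
# `recent_norms` and `current_norm` are hypothetical stand-ins for
# first_state["model_norms"] and tot_norm.
import torch


def clipping_factor(recent_norms: torch.Tensor, current_norm: torch.Tensor, clipping_scale: float = 2.0) -> float:
    threshold = clipping_scale * recent_norms.median()
    return min(1.0, (threshold / (current_norm + 1.0e-20)).item())


recent_norms = torch.tensor([0.8, 1.0, 1.1, 0.9, 1.2])
print(clipping_factor(recent_norms, torch.tensor(5.0)))  # 0.4 -> gradients scaled down
print(clipping_factor(recent_norms, torch.tensor(0.5)))  # 1.0 -> no clipping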
@@ -475,13 +449,10 @@ class ScaledAdam(BatchedOptimizer): if numel > 1: # Update the size/scale of p, and set param_rms scale_grads = state["scale_grads"] - scale_grads[step % size_update_period] = (p * grad).sum( - dim=list(range(1, p.ndim)), keepdim=True) + scale_grads[step % size_update_period] = (p * grad).sum(dim=list(range(1, p.ndim)), keepdim=True) if step % size_update_period == size_update_period - 1: param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) - param_rms.copy_((p**2) - .mean(dim=list(range(1, p.ndim)), keepdim=True) - .sqrt()) + param_rms.copy_((p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) if step > 0: # self._size_update() learns the overall scale on the # parameter, by shrinking or expanding it. @@ -496,11 +467,13 @@ class ScaledAdam(BatchedOptimizer): state["step"] = step + 1 - def _size_update(self, - group: dict, - scale_grads: Tensor, - p: Tensor, - state: dict) -> None: + def _size_update( + self, + group: dict, + scale_grads: Tensor, + p: Tensor, + state: dict, + ) -> None: """ Called only where p.numel() > 1, this updates the scale of the parameter. If we imagine: p = underlying_param * scale.exp(), and we are doing @@ -529,11 +502,11 @@ class ScaledAdam(BatchedOptimizer): # faster decay at this level. beta2_corr = beta2**size_update_period - scale_exp_avg_sq = state[ - "scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) + scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) scale_exp_avg_sq.mul_(beta2_corr).add_( (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` - alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...) + alpha=1 - beta2_corr, + ) # shape is (batch_size, 1, 1, ...) # The 1st time we reach here is when size_step == 1. size_step = (step + 1) // size_update_period @@ -543,8 +516,7 @@ class ScaledAdam(BatchedOptimizer): denom = scale_exp_avg_sq.sqrt() + eps - scale_step = (-size_lr * (bias_correction2**0.5) * - scale_grads.sum(dim=0) / denom) + scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom is_too_small = param_rms < param_min_rms is_too_large = param_rms > param_max_rms @@ -580,9 +552,8 @@ class ScaledAdam(BatchedOptimizer): exp_avg_sq = state["exp_avg_sq"] exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) - this_step = state["step"] - (state["zero_step"] - if "zero_step" in state else 0) - bias_correction2 = 1 - beta2**(this_step + 1) + this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0) + bias_correction2 = 1 - beta2 ** (this_step + 1) if bias_correction2 < 0.99: # note: not in-place. exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) @@ -613,7 +584,7 @@ class ScaledAdam(BatchedOptimizer): # bias_correction2 is like in Adam. Don't bother with bias_correction1; # slower update at the start will help stability anyway. 
- bias_correction2 = 1 - beta2**(state["step"] + 1) + bias_correction2 = 1 - beta2 ** (state["step"] + 1) denom = (exp_avg_sq / bias_correction2).sqrt() + eps delta = state["delta"] diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py index cab6afe9..5bffcea6 100644 --- a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py @@ -5,7 +5,6 @@ from torch.nn.functional import ( _none_or_dtype, _in_projection_packed, ) -from torch.nn import functional as F import torch # Tensor = torch.Tensor # from typing import Callable, List, Optional, Tuple, Union @@ -25,18 +24,18 @@ def multi_head_attention_forward_patched( dropout_p: float, out_proj_weight, out_proj_bias, - training = True, - key_padding_mask = None, - need_weights = True, - attn_mask = None, - use_separate_proj_weight = False, - q_proj_weight = None, - k_proj_weight = None, - v_proj_weight = None, - static_k = None, - static_v = None, - average_attn_weights = True, - is_causal = False, + training=True, + key_padding_mask=None, + need_weights=True, + attn_mask=None, + use_separate_proj_weight=False, + q_proj_weight=None, + k_proj_weight=None, + v_proj_weight=None, + static_k=None, + static_v=None, + average_attn_weights=True, + is_causal=False, cache=None, ): r""" @@ -156,9 +155,7 @@ def multi_head_attention_forward_patched( cache=cache, ) - is_batched = _mha_shape_check( - query, key, value, key_padding_mask, attn_mask, num_heads - ) + is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads) # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input # is batched, run the computation and before returning squeeze the @@ -211,45 +208,33 @@ def multi_head_attention_forward_patched( # longer causal. 
is_causal = False - assert ( - embed_dim == embed_dim_to_check - ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + assert embed_dim == embed_dim_to_check, ( + f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + ) if isinstance(embed_dim, torch.Tensor): # embed_dim can be a tensor when JIT tracing head_dim = embed_dim.div(num_heads, rounding_mode="trunc") else: head_dim = embed_dim // num_heads - assert ( - head_dim * num_heads == embed_dim - ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" + assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" if use_separate_proj_weight: # allow MHA to have different embedding dimensions when separate projection weights are used - assert ( - key.shape[:2] == value.shape[:2] - ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + assert key.shape[:2] == value.shape[:2], ( + f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + ) else: - assert ( - key.shape == value.shape - ), f"key shape {key.shape} does not match value shape {value.shape}" + assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}" # # compute in-projection # if not use_separate_proj_weight: - assert ( - in_proj_weight is not None - ), "use_separate_proj_weight is False but in_proj_weight is None" + assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None" q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) else: - assert ( - q_proj_weight is not None - ), "use_separate_proj_weight is True but q_proj_weight is None" - assert ( - k_proj_weight is not None - ), "use_separate_proj_weight is True but k_proj_weight is None" - assert ( - v_proj_weight is not None - ), "use_separate_proj_weight is True but v_proj_weight is None" + assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None" + assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None" + assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None" if in_proj_bias is None: b_q = b_k = b_v = None else: @@ -312,9 +297,7 @@ def multi_head_attention_forward_patched( f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}." 
) else: - raise RuntimeError( - f"attn_mask's dimension {attn_mask.dim()} is not supported" - ) + raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported") # add bias along batch dimension (currently second) if bias_k is not None and bias_v is not None: @@ -338,34 +321,26 @@ def multi_head_attention_forward_patched( k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1) else: # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert ( - static_k.size(0) == bsz * num_heads - ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" - assert ( - static_k.size(2) == head_dim - ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" + assert static_k.size(0) == bsz * num_heads, ( + f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" + ) + assert static_k.size(2) == head_dim, f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" k = static_k if static_v is None: v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) else: # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert ( - static_v.size(0) == bsz * num_heads - ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" - assert ( - static_v.size(2) == head_dim - ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" + assert static_v.size(0) == bsz * num_heads, ( + f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" + ) + assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" v = static_v # add zero attention along batch dimension (now first) if add_zero_attn: zero_attn_shape = (bsz * num_heads, 1, head_dim) - k = torch.cat( - [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1 - ) - v = torch.cat( - [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1 - ) + k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1) + v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1) if attn_mask is not None: attn_mask = pad(attn_mask, (0, 1)) if key_padding_mask is not None: @@ -381,9 +356,7 @@ def multi_head_attention_forward_patched( src_len, ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" key_padding_mask = ( - key_padding_mask.view(bsz, 1, 1, src_len) - .expand(-1, num_heads, -1, -1) - .reshape(bsz * num_heads, 1, src_len) + key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) ) if attn_mask is None: attn_mask = key_padding_mask @@ -402,14 +375,10 @@ def multi_head_attention_forward_patched( B, Nt, E = q.shape q_scaled = q / math.sqrt(E) - assert not ( - is_causal and attn_mask is None - ), "FIXME: is_causal not implemented for need_weights" + assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights" if attn_mask is not None: - attn_output_weights = torch.baddbmm( - attn_mask, q_scaled, k.transpose(-2, -1) - ) + attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1)) else: attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) attn_output_weights = softmax(attn_output_weights, dim=-1) @@ -418,9 +387,7 @@ def multi_head_attention_forward_patched( attn_output = torch.bmm(attn_output_weights, v) - attn_output = ( - 
attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) - ) + attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) @@ -449,13 +416,9 @@ def multi_head_attention_forward_patched( v = v.view(bsz, num_heads, src_len, head_dim) # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - attn_output = scaled_dot_product_attention( - q, k, v, attn_mask, dropout_p, is_causal - ) + attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) - attn_output = ( - attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) - ) + attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py index 14bdb550..8144c9c6 100644 --- a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py @@ -1,11 +1,9 @@ from torch.nn.functional import * from torch.nn.functional import ( - _mha_shape_check, _canonical_mask, - _none_or_dtype, - _in_projection_packed, ) + def multi_head_attention_forward_patched( query, key, @@ -34,7 +32,6 @@ def multi_head_attention_forward_patched( is_causal: bool = False, cache=None, ) -> Tuple[Tensor, Optional[Tensor]]: - # set up shape vars _, _, embed_dim = query.shape attn_mask = _canonical_mask( @@ -80,12 +77,8 @@ def multi_head_attention_forward_patched( q = q.view(num_heads, -1, head_dim).unsqueeze(0) k = k.view(num_heads, -1, head_dim).unsqueeze(0) v = v.view(num_heads, -1, head_dim).unsqueeze(0) - attn_output = scaled_dot_product_attention( - q, k, v, attn_mask, dropout_p, is_causal - ) - attn_output = ( - attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) - ) + attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) + attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(-1, 1, attn_output.size(1)) diff --git a/GPT_SoVITS/AR/modules/scaling.py b/GPT_SoVITS/AR/modules/scaling.py index 9256a8cb..aae14533 100644 --- a/GPT_SoVITS/AR/modules/scaling.py +++ b/GPT_SoVITS/AR/modules/scaling.py @@ -13,12 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging -import math import random from typing import Optional from typing import Tuple -from typing import Union import torch import torch.nn as nn @@ -61,9 +58,7 @@ class DoubleSwishFunction(torch.autograd.Function): # floors), should be expectation-preserving. floor = -0.043637 ceil = 1.2 - d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( - deriv - ) + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(deriv) if __name__ == "__main__": # for self-testing only. assert d_scaled.min() >= 0.0 @@ -153,13 +148,9 @@ def _compute_scale_factor( else: # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if # x_abs)_mean , min_abs. 
- below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp( - min=0, max=max_factor - ) + below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(min=0, max=max_factor) - above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp( - min=0, max=max_factor - ) + above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(min=0, max=max_factor) return below_threshold - above_threshold @@ -181,18 +172,16 @@ def _compute_sign_factor( else: # 0 if proportion_positive >= min_positive, else can be # as large as max_factor. - factor1 = ( - (min_positive - proportion_positive) * (gain_factor / min_positive) - ).clamp_(min=0, max=max_factor) + factor1 = ((min_positive - proportion_positive) * (gain_factor / min_positive)).clamp_(min=0, max=max_factor) if max_positive == 1.0: factor2 = 0.0 else: # 0 if self.proportion_positive <= max_positive, else can be # as large as -max_factor. - factor2 = ( - (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive)) - ).clamp_(min=0, max=max_factor) + factor2 = ((proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive))).clamp_( + min=0, max=max_factor + ) sign_factor = factor1 - factor2 # require min_positive != 0 or max_positive != 1: assert not isinstance(sign_factor, float) @@ -320,15 +309,11 @@ class ActivationBalancer(torch.nn.Module): return _no_op(x) -def BalancedDoubleSwish( - d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25 -) -> nn.Sequential: +def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25) -> nn.Sequential: """ ActivationBalancer -> DoubleSwish """ - balancer = ActivationBalancer( - d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob - ) + balancer = ActivationBalancer(d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob) return nn.Sequential( balancer, DoubleSwish(), diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py index 7921f48e..1bf21cdb 100644 --- a/GPT_SoVITS/AR/modules/transformer.py +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -42,12 +42,8 @@ class LayerNorm(nn.Module): self.eps = eps self.elementwise_affine = elementwise_affine if self.elementwise_affine: - self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) + self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) else: self.register_parameter("weight", None) self.register_parameter("bias", None) @@ -74,15 +70,10 @@ class LayerNorm(nn.Module): ) assert embedding is None - return F.layer_norm( - input, self.normalized_shape, self.weight, self.bias, self.eps - ) + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) def extra_repr(self) -> str: - return ( - "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__) - ) + return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__) class IdentityNorm(nn.Module): @@ -121,6 +112,7 @@ class TransformerEncoder(nn.Module): >>> src = torch.rand(10, 32, 512) >>> out = transformer_encoder(src) """ + __constants__ = ["norm"] def __init__(self, encoder_layer, num_layers, norm=None): @@ -218,13 +210,9 @@ class TransformerEncoderLayer(nn.Module): ) # Implementation of Feedforward model - self.linear1 = 
linear1_feedforward_cls( - d_model, dim_feedforward, **factory_kwargs - ) + self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs) self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls( - dim_feedforward, d_model, **factory_kwargs - ) + self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs) self.norm_first = norm_first self.dropout1 = nn.Dropout(dropout) @@ -291,12 +279,8 @@ class TransformerEncoderLayer(nn.Module): if src_key_padding_mask is not None: _skpm_dtype = src_key_padding_mask.dtype - if _skpm_dtype != torch.bool and not torch.is_floating_point( - src_key_padding_mask - ): - raise AssertionError( - "only bool and floating types of key_padding_mask are supported" - ) + if _skpm_dtype != torch.bool and not torch.is_floating_point(src_key_padding_mask): + raise AssertionError("only bool and floating types of key_padding_mask are supported") if self.norm_first: x = x + self._sa_block( diff --git a/GPT_SoVITS/AR/modules/transformer_onnx.py b/GPT_SoVITS/AR/modules/transformer_onnx.py index a3f68b43..fa170254 100644 --- a/GPT_SoVITS/AR/modules/transformer_onnx.py +++ b/GPT_SoVITS/AR/modules/transformer_onnx.py @@ -42,12 +42,8 @@ class LayerNorm(nn.Module): self.eps = eps self.elementwise_affine = elementwise_affine if self.elementwise_affine: - self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) + self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) else: self.register_parameter("weight", None) self.register_parameter("bias", None) @@ -74,15 +70,10 @@ class LayerNorm(nn.Module): ) assert embedding is None - return F.layer_norm( - input, self.normalized_shape, self.weight, self.bias, self.eps - ) + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) def extra_repr(self) -> str: - return ( - "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__) - ) + return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__) class IdentityNorm(nn.Module): @@ -121,6 +112,7 @@ class TransformerEncoder(nn.Module): >>> src = torch.rand(10, 32, 512) >>> out = transformer_encoder(src) """ + __constants__ = ["norm"] def __init__(self, encoder_layer, num_layers, norm=None): @@ -154,6 +146,7 @@ class TransformerEncoder(nn.Module): class TransformerEncoderLayer(nn.Module): __constants__ = ["batch_first", "norm_first"] + def __init__( self, d_model: int, @@ -184,13 +177,9 @@ class TransformerEncoderLayer(nn.Module): linear2_cls=linear2_self_attention_cls, **factory_kwargs, ) - self.linear1 = linear1_feedforward_cls( - d_model, dim_feedforward, **factory_kwargs - ) + self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs) self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls( - dim_feedforward, d_model, **factory_kwargs - ) + self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs) self.norm_first = norm_first self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py index 9c5f58fb..1003040e 100644 --- a/GPT_SoVITS/AR/text_processing/phonemizer.py +++ b/GPT_SoVITS/AR/text_processing/phonemizer.py @@ 
-30,9 +30,7 @@ class GruutPhonemizer: "«": "«", "»": "»", } - self._punctuation_regexp: str = ( - rf"([{''.join(self._special_cases_dict.keys())}])" - ) + self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" def _normalize_punctuation(self, text: str) -> str: text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) @@ -53,13 +51,8 @@ class GruutPhonemizer: def phonemize(self, text: str, espeak: bool = False) -> str: text_to_phonemize: str = self._normalize_punctuation(text) - sents: List[Sentence] = [ - sent - for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) - ] - words: List[str] = [ - self._convert_punctuation(word) for word in itertools.chain(*sents) - ] + sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] + words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] return " ".join(words) def transform(self, phonemes): diff --git a/GPT_SoVITS/AR/text_processing/symbols.py b/GPT_SoVITS/AR/text_processing/symbols.py index 7d754a78..f7ef57fa 100644 --- a/GPT_SoVITS/AR/text_processing/symbols.py +++ b/GPT_SoVITS/AR/text_processing/symbols.py @@ -3,7 +3,9 @@ PAD = "_" PUNCTUATION = ';:,.!?¡¿—…"«»“” ' LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" -IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +IPA_LETTERS = ( + "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +) SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) SPACE_ID = SYMBOLS.index(" ") SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} diff --git a/GPT_SoVITS/AR/utils/__init__.py b/GPT_SoVITS/AR/utils/__init__.py index c2eaf61a..4a9cb4d5 100644 --- a/GPT_SoVITS/AR/utils/__init__.py +++ b/GPT_SoVITS/AR/utils/__init__.py @@ -2,12 +2,12 @@ import re def str2bool(str): - return True if str.lower() == 'true' else False + return True if str.lower() == "true" else False def get_newest_ckpt(string_list): # 定义一个正则表达式模式,用于匹配字符串中的数字 - pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' + pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 extracted_info = [] @@ -18,8 +18,7 @@ def get_newest_ckpt(string_list): step = int(match.group(2)) extracted_info.append((epoch, step, string)) # 按照 epoch 后面的数字和 step 后面的数字进行排序 - sorted_info = sorted( - extracted_info, key=lambda x: (x[0], x[1]), reverse=True) + sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) # 获取最新的 ckpt 文件名 newest_ckpt = sorted_info[0][2] return newest_ckpt @@ -28,9 +27,9 @@ def get_newest_ckpt(string_list): # 文本存在且不为空时 return True def check_txt_file(file_path): try: - with open(file_path, 'r') as file: + with open(file_path, "r") as file: text = file.readline().strip() - assert text.strip() != '' + assert text.strip() != "" return text except Exception: return False diff --git a/GPT_SoVITS/AR/utils/initialize.py b/GPT_SoVITS/AR/utils/initialize.py index 17ff9f92..ee7c7138 100644 --- a/GPT_SoVITS/AR/utils/initialize.py +++ b/GPT_SoVITS/AR/utils/initialize.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """Initialize modules for espnet2 neural networks.""" + import torch from typeguard import check_argument_types diff --git a/GPT_SoVITS/AR/utils/io.py b/GPT_SoVITS/AR/utils/io.py index 52f1f3c9..a6475cb6 100644 --- a/GPT_SoVITS/AR/utils/io.py +++ b/GPT_SoVITS/AR/utils/io.py @@ -18,14 +18,10 @@ def 
save_config_to_yaml(config, path): def write_args(args, path): - args_dict = dict( - (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") - ) + args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) with open(path, "a") as args_file: args_file.write("==> torch version: {}\n".format(torch.__version__)) - args_file.write( - "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) - ) + args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) args_file.write("==> Cmd:\n") args_file.write(str(sys.argv)) args_file.write("\n==> args:\n") diff --git a/GPT_SoVITS/BigVGAN/activations.py b/GPT_SoVITS/BigVGAN/activations.py index 4f08ddab..abe3ad9e 100644 --- a/GPT_SoVITS/BigVGAN/activations.py +++ b/GPT_SoVITS/BigVGAN/activations.py @@ -23,9 +23,7 @@ class Snake(nn.Module): >>> x = a1(x) """ - def __init__( - self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False - ): + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): """ Initialization. INPUT: @@ -80,9 +78,7 @@ class SnakeBeta(nn.Module): >>> x = a1(x) """ - def __init__( - self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False - ): + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): """ Initialization. INPUT: diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py index fbc0fd8f..ea333cfa 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py @@ -20,9 +20,7 @@ class FusedAntiAliasActivation(torch.autograd.Function): @staticmethod def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): - activation_results = anti_alias_activation_cuda.forward( - inputs, up_ftr, down_ftr, alpha, beta - ) + activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta) return activation_results @@ -61,17 +59,11 @@ class Activation1d(nn.Module): if self.act.__class__.__name__ == "Snake": beta = self.act.alpha.data # Snake uses same params for alpha and beta else: - beta = ( - self.act.beta.data - ) # Snakebeta uses different params for alpha and beta + beta = self.act.beta.data # Snakebeta uses different params for alpha and beta alpha = self.act.alpha.data - if ( - not self.act.alpha_logscale - ): # Exp baked into cuda kernel, cancel it out with a log + if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log alpha = torch.log(alpha) beta = torch.log(beta) - x = FusedAntiAliasActivation.apply( - x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta - ) + x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta) return x diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py index ca5d01de..14fbf054 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py @@ -58,17 +58,13 @@ def load(): srcpath / "anti_alias_activation.cpp", srcpath / "anti_alias_activation_cuda.cu", ] - anti_alias_activation_cuda = _cpp_extention_load_helper( - "anti_alias_activation_cuda", sources, extra_cuda_flags - ) + anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags) return anti_alias_activation_cuda 
def _get_cuda_bare_metal_version(cuda_dir): - raw_output = subprocess.check_output( - [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True - ) + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) output = raw_output.split() release_idx = output.index("release") + 1 release = output[release_idx].split(".") diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py index 0fa35b0d..dc905b20 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py @@ -27,9 +27,7 @@ else: # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License # https://adefossez.github.io/julius/julius/lowpass.html # LICENSE is in incl_licenses directory. -def kaiser_sinc_filter1d( - cutoff, half_width, kernel_size -): # return filter [1,1,kernel_size] +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] even = kernel_size % 2 == 0 half_size = kernel_size // 2 diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py index a35380f5..e7928fad 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py @@ -11,18 +11,12 @@ class UpSample1d(nn.Module): def __init__(self, ratio=2, kernel_size=None): super().__init__() self.ratio = ratio - self.kernel_size = ( - int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size - ) + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size self.stride = ratio self.pad = self.kernel_size // ratio - 1 self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 - self.pad_right = ( - self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 - ) - filter = kaiser_sinc_filter1d( - cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size - ) + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size) self.register_buffer("filter", filter) # x: [B, C, T] @@ -30,9 +24,7 @@ class UpSample1d(nn.Module): _, C, _ = x.shape x = F.pad(x, (self.pad, self.pad), mode="replicate") - x = self.ratio * F.conv_transpose1d( - x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C - ) + x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) x = x[..., self.pad_left : -self.pad_right] return x @@ -42,9 +34,7 @@ class DownSample1d(nn.Module): def __init__(self, ratio=2, kernel_size=None): super().__init__() self.ratio = ratio - self.kernel_size = ( - int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size - ) + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size self.lowpass = LowPassFilter1d( cutoff=0.5 / ratio, half_width=0.6 / ratio, diff --git a/GPT_SoVITS/BigVGAN/bigvgan.py b/GPT_SoVITS/BigVGAN/bigvgan.py index 6c4a2235..febdf165 100644 --- a/GPT_SoVITS/BigVGAN/bigvgan.py +++ b/GPT_SoVITS/BigVGAN/bigvgan.py @@ -50,7 +50,7 @@ class AMPBlock1(torch.nn.Module): activation: str = None, ): super().__init__() - + self.h = h self.convs1 = nn.ModuleList( @@ -87,9 +87,7 @@ class AMPBlock1(torch.nn.Module): ) self.convs2.apply(init_weights) - self.num_layers = len(self.convs1) + len( - self.convs2 - ) # Total 
number of conv layers + self.num_layers = len(self.convs1) + len(self.convs2) # Total number of conv layers # Select which Activation1d, lazy-load cuda version to ensure backward compatibility if self.h.get("use_cuda_kernel", False): @@ -105,22 +103,14 @@ class AMPBlock1(torch.nn.Module): if activation == "snake": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.Snake( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) elif activation == "snakebeta": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.SnakeBeta( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) @@ -169,7 +159,7 @@ class AMPBlock2(torch.nn.Module): activation: str = None, ): super().__init__() - + self.h = h self.convs = nn.ModuleList( @@ -205,22 +195,14 @@ class AMPBlock2(torch.nn.Module): if activation == "snake": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.Snake( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) elif activation == "snakebeta": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.SnakeBeta( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) @@ -283,9 +265,7 @@ class BigVGAN( self.num_upsamples = len(h.upsample_rates) # Pre-conv - self.conv_pre = weight_norm( - Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) - ) + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) # Define which AMPBlock to use. BigVGAN uses AMPBlock1 as default if h.resblock == "1": @@ -293,9 +273,7 @@ class BigVGAN( elif h.resblock == "2": resblock_class = AMPBlock2 else: - raise ValueError( - f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}" - ) + raise ValueError(f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}") # Transposed conv-based upsamplers. does not apply anti-aliasing self.ups = nn.ModuleList() @@ -320,22 +298,14 @@ class BigVGAN( self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = h.upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) - ): - self.resblocks.append( - resblock_class(h, ch, k, d, activation=h.activation) - ) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock_class(h, ch, k, d, activation=h.activation)) # Post-conv activation_post = ( activations.Snake(ch, alpha_logscale=h.snake_logscale) if h.activation == "snake" - else ( - activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) - if h.activation == "snakebeta" - else None - ) + else (activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) if h.activation == "snakebeta" else None) ) if activation_post is None: raise NotImplementedError( @@ -346,9 +316,7 @@ class BigVGAN( # Whether to use bias for the final conv_post. 
Default to True for backward compatibility self.use_bias_at_final = h.get("use_bias_at_final", True) - self.conv_post = weight_norm( - Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final) - ) + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final)) # Weight initialization for i in range(len(self.ups)): @@ -451,13 +419,13 @@ class BigVGAN( # instantiate BigVGAN using h if use_cuda_kernel: print( - f"[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!" + "[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!" ) print( - f"[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!" + "[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!" ) print( - f"[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis" + "[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis" ) model = cls(h, use_cuda_kernel=use_cuda_kernel) @@ -485,7 +453,7 @@ class BigVGAN( model.load_state_dict(checkpoint_dict["generator"]) except RuntimeError: print( - f"[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!" + "[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!" 
) model.remove_weight_norm() model.load_state_dict(checkpoint_dict["generator"]) diff --git a/GPT_SoVITS/BigVGAN/discriminators.py b/GPT_SoVITS/BigVGAN/discriminators.py index ffdf327b..2d44c798 100644 --- a/GPT_SoVITS/BigVGAN/discriminators.py +++ b/GPT_SoVITS/BigVGAN/discriminators.py @@ -15,7 +15,7 @@ from torchaudio.transforms import Spectrogram, Resample from env import AttrDict from utils import get_padding import typing -from typing import Optional, List, Union, Dict, Tuple +from typing import List, Tuple class DiscriminatorP(torch.nn.Module): @@ -81,9 +81,7 @@ class DiscriminatorP(torch.nn.Module): ), ] ) - self.conv_post = norm_f( - Conv2d(int(1024 * self.d_mult), 1, (3, 1), 1, padding=(1, 0)) - ) + self.conv_post = norm_f(Conv2d(int(1024 * self.d_mult), 1, (3, 1), 1, padding=(1, 0))) def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] @@ -113,13 +111,12 @@ class MultiPeriodDiscriminator(torch.nn.Module): self.mpd_reshapes = h.mpd_reshapes print(f"mpd_reshapes: {self.mpd_reshapes}") self.discriminators = nn.ModuleList( - [ - DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) - for rs in self.mpd_reshapes - ] + [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] ) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], @@ -145,19 +142,13 @@ class DiscriminatorR(nn.Module): super().__init__() self.resolution = resolution - assert ( - len(self.resolution) == 3 - ), f"MRD layer requires list with len=3, got {self.resolution}" + assert len(self.resolution) == 3, f"MRD layer requires list with len=3, got {self.resolution}" self.lrelu_slope = 0.1 norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm if hasattr(cfg, "mrd_use_spectral_norm"): - print( - f"[INFO] overriding MRD use_spectral_norm as {cfg.mrd_use_spectral_norm}" - ) - norm_f = ( - weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm - ) + print(f"[INFO] overriding MRD use_spectral_norm as {cfg.mrd_use_spectral_norm}") + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm self.d_mult = cfg.discriminator_channel_mult if hasattr(cfg, "mrd_channel_mult"): print(f"[INFO] overriding mrd channel multiplier as {cfg.mrd_channel_mult}") @@ -203,9 +194,7 @@ class DiscriminatorR(nn.Module): ), ] ) - self.conv_post = norm_f( - nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1)) - ) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] @@ -248,14 +237,14 @@ class MultiResolutionDiscriminator(nn.Module): def __init__(self, cfg, debug=False): super().__init__() self.resolutions = cfg.resolutions - assert ( - len(self.resolutions) == 3 - ), f"MRD requires list of list with len=3, each element having a list with len=3. Got {self.resolutions}" - self.discriminators = nn.ModuleList( - [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + assert len(self.resolutions) == 3, ( + f"MRD requires list of list with len=3, each element having a list with len=3. 
Got {self.resolutions}" ) + self.discriminators = nn.ModuleList([DiscriminatorR(cfg, resolution) for resolution in self.resolutions]) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], @@ -309,25 +298,15 @@ class DiscriminatorB(nn.Module): convs = lambda: nn.ModuleList( [ weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))), - weight_norm( - nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4)) - ), - weight_norm( - nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4)) - ), - weight_norm( - nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4)) - ), - weight_norm( - nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1)) - ), + weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))), + weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))), + weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))), + weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))), ] ) self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))]) - self.conv_post = weight_norm( - nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)) - ) + self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))) def spectrogram(self, x: torch.Tensor) -> List[torch.Tensor]: # Remove DC offset @@ -376,17 +355,16 @@ class MultiBandDiscriminator(nn.Module): super().__init__() # fft_sizes (list[int]): Tuple of window lengths for FFT. Defaults to [2048, 1024, 512] if not set in h. self.fft_sizes = h.get("mbd_fft_sizes", [2048, 1024, 512]) - self.discriminators = nn.ModuleList( - [DiscriminatorB(window_length=w) for w in self.fft_sizes] - ) + self.discriminators = nn.ModuleList([DiscriminatorB(window_length=w) for w in self.fft_sizes]) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]], ]: - y_d_rs = [] y_d_gs = [] fmap_rs = [] @@ -406,7 +384,7 @@ class MultiBandDiscriminator(nn.Module): # Adapted from https://github.com/open-mmlab/Amphion/blob/main/models/vocoders/gan/discriminator/mssbcqtd.py under the MIT license. # LICENSE is in incl_licenses directory. 
class DiscriminatorCQT(nn.Module): - def __init__(self, cfg: AttrDict, hop_length: int, n_octaves:int, bins_per_octave: int): + def __init__(self, cfg: AttrDict, hop_length: int, n_octaves: int, bins_per_octave: int): super().__init__() self.cfg = cfg @@ -460,9 +438,7 @@ class DiscriminatorCQT(nn.Module): in_chs = min(self.filters_scale * self.filters, self.max_filters) for i, dilation in enumerate(self.dilations): - out_chs = min( - (self.filters_scale ** (i + 1)) * self.filters, self.max_filters - ) + out_chs = min((self.filters_scale ** (i + 1)) * self.filters, self.max_filters) self.convs.append( weight_norm( nn.Conv2d( @@ -486,9 +462,7 @@ class DiscriminatorCQT(nn.Module): in_chs, out_chs, kernel_size=(self.kernel_size[0], self.kernel_size[0]), - padding=self.get_2d_padding( - (self.kernel_size[0], self.kernel_size[0]) - ), + padding=self.get_2d_padding((self.kernel_size[0], self.kernel_size[0])), ) ) ) @@ -508,7 +482,7 @@ class DiscriminatorCQT(nn.Module): self.cqtd_normalize_volume = self.cfg.get("cqtd_normalize_volume", False) if self.cqtd_normalize_volume: print( - f"[INFO] cqtd_normalize_volume set to True. Will apply DC offset removal & peak volume normalization in CQTD!" + "[INFO] cqtd_normalize_volume set to True. Will apply DC offset removal & peak volume normalization in CQTD!" ) def get_2d_padding( @@ -580,9 +554,7 @@ class MultiScaleSubbandCQTDiscriminator(nn.Module): # Multi-scale params to loop over self.cfg["cqtd_hop_lengths"] = self.cfg.get("cqtd_hop_lengths", [512, 256, 256]) self.cfg["cqtd_n_octaves"] = self.cfg.get("cqtd_n_octaves", [9, 9, 9]) - self.cfg["cqtd_bins_per_octaves"] = self.cfg.get( - "cqtd_bins_per_octaves", [24, 36, 48] - ) + self.cfg["cqtd_bins_per_octaves"] = self.cfg.get("cqtd_bins_per_octaves", [24, 36, 48]) self.discriminators = nn.ModuleList( [ @@ -596,13 +568,14 @@ class MultiScaleSubbandCQTDiscriminator(nn.Module): ] ) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]], ]: - y_d_rs = [] y_d_gs = [] fmap_rs = [] @@ -629,13 +602,14 @@ class CombinedDiscriminator(nn.Module): super().__init__() self.discrimiantor = nn.ModuleList(list_discriminator) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]], ]: - y_d_rs = [] y_d_gs = [] fmap_rs = [] diff --git a/GPT_SoVITS/BigVGAN/inference.py b/GPT_SoVITS/BigVGAN/inference.py index a213f318..5f892a3c 100644 --- a/GPT_SoVITS/BigVGAN/inference.py +++ b/GPT_SoVITS/BigVGAN/inference.py @@ -35,9 +35,7 @@ def inference(a, h): with torch.no_grad(): for i, filname in enumerate(filelist): # Load the ground truth audio and resample if necessary - wav, sr = librosa.load( - os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True - ) + wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True) wav = torch.FloatTensor(wav).to(device) # Compute mel spectrogram from the ground truth audio x = get_mel_spectrogram(wav.unsqueeze(0), generator.h) @@ -48,9 +46,7 @@ def inference(a, h): audio = audio * MAX_WAV_VALUE audio = audio.cpu().numpy().astype("int16") - output_file = os.path.join( - a.output_dir, os.path.splitext(filname)[0] + "_generated.wav" - ) + output_file = os.path.join(a.output_dir, 
os.path.splitext(filname)[0] + "_generated.wav") write(output_file, h.sampling_rate, audio) print(output_file) diff --git a/GPT_SoVITS/BigVGAN/inference_e2e.py b/GPT_SoVITS/BigVGAN/inference_e2e.py index a39dc67a..9c0df774 100644 --- a/GPT_SoVITS/BigVGAN/inference_e2e.py +++ b/GPT_SoVITS/BigVGAN/inference_e2e.py @@ -61,9 +61,7 @@ def inference(a, h): audio = audio * MAX_WAV_VALUE audio = audio.cpu().numpy().astype("int16") - output_file = os.path.join( - a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav" - ) + output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav") write(output_file, h.sampling_rate, audio) print(output_file) diff --git a/GPT_SoVITS/BigVGAN/loss.py b/GPT_SoVITS/BigVGAN/loss.py index d5811512..c295a144 100644 --- a/GPT_SoVITS/BigVGAN/loss.py +++ b/GPT_SoVITS/BigVGAN/loss.py @@ -6,13 +6,12 @@ import torch -import torch.nn.functional as F import torch.nn as nn from librosa.filters import mel as librosa_mel_fn from scipy import signal import typing -from typing import Optional, List, Union, Dict, Tuple +from typing import List, Tuple from collections import namedtuple import math import functools @@ -117,15 +116,13 @@ class MultiScaleMelSpectrogramLoss(nn.Module): window_type, ): """ - Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from: + Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from: https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py """ B, C, T = wav.shape if match_stride: - assert ( - hop_length == window_length // 4 - ), "For match_stride, hop must equal n_fft // 4" + assert hop_length == window_length // 4, "For match_stride, hop must equal n_fft // 4" right_pad = math.ceil(T / hop_length) * hop_length - T pad = (window_length - hop_length) // 2 else: @@ -155,9 +152,7 @@ class MultiScaleMelSpectrogramLoss(nn.Module): magnitude = torch.abs(stft) nf = magnitude.shape[2] - mel_basis = self.get_mel_filters( - self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax - ) + mel_basis = self.get_mel_filters(self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax) mel_basis = torch.from_numpy(mel_basis).to(wav.device) mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T mel_spectrogram = mel_spectrogram.transpose(-1, 2) @@ -182,9 +177,7 @@ class MultiScaleMelSpectrogramLoss(nn.Module): """ loss = 0.0 - for n_mels, fmin, fmax, s in zip( - self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params - ): + for n_mels, fmin, fmax, s in zip(self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params): kwargs = { "n_mels": n_mels, "fmin": fmin, @@ -197,12 +190,8 @@ class MultiScaleMelSpectrogramLoss(nn.Module): x_mels = self.mel_spectrogram(x, **kwargs) y_mels = self.mel_spectrogram(y, **kwargs) - x_logmels = torch.log( - x_mels.clamp(min=self.clamp_eps).pow(self.pow) - ) / torch.log(torch.tensor(10.0)) - y_logmels = torch.log( - y_mels.clamp(min=self.clamp_eps).pow(self.pow) - ) / torch.log(torch.tensor(10.0)) + x_logmels = torch.log(x_mels.clamp(min=self.clamp_eps).pow(self.pow)) / torch.log(torch.tensor(10.0)) + y_logmels = torch.log(y_mels.clamp(min=self.clamp_eps).pow(self.pow)) / torch.log(torch.tensor(10.0)) loss += self.log_weight * self.loss_fn(x_logmels, y_logmels) loss += self.mag_weight * self.loss_fn(x_logmels, y_logmels) @@ -211,10 +200,7 @@ class MultiScaleMelSpectrogramLoss(nn.Module): # Loss functions -def feature_loss( - fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]] -) -> torch.Tensor: - +def feature_loss(fmap_r: 
List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]) -> torch.Tensor: loss = 0 for dr, dg in zip(fmap_r, fmap_g): for rl, gl in zip(dr, dg): @@ -226,7 +212,6 @@ def feature_loss( def discriminator_loss( disc_real_outputs: List[torch.Tensor], disc_generated_outputs: List[torch.Tensor] ) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]: - loss = 0 r_losses = [] g_losses = [] @@ -243,7 +228,6 @@ def discriminator_loss( def generator_loss( disc_outputs: List[torch.Tensor], ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - loss = 0 gen_losses = [] for dg in disc_outputs: diff --git a/GPT_SoVITS/BigVGAN/meldataset.py b/GPT_SoVITS/BigVGAN/meldataset.py index a5859b9e..dc12c987 100644 --- a/GPT_SoVITS/BigVGAN/meldataset.py +++ b/GPT_SoVITS/BigVGAN/meldataset.py @@ -86,9 +86,7 @@ def mel_spectrogram( key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}" if key not in mel_basis_cache: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) hann_window_cache[key] = torch.hann_window(win_size).to(device) @@ -96,9 +94,7 @@ def mel_spectrogram( hann_window = hann_window_cache[key] padding = (n_fft - hop_size) // 2 - y = torch.nn.functional.pad( - y.unsqueeze(1), (padding, padding), mode="reflect" - ).squeeze(1) + y = torch.nn.functional.pad(y.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1) spec = torch.stft( y, @@ -150,17 +146,13 @@ def get_dataset_filelist(a): with open(a.input_training_file, "r", encoding="utf-8") as fi: training_files = [ - os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") - for x in fi.read().split("\n") - if len(x) > 0 + os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 ] print(f"first training file: {training_files[0]}") with open(a.input_validation_file, "r", encoding="utf-8") as fi: validation_files = [ - os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") - for x in fi.read().split("\n") - if len(x) > 0 + os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 ] print(f"first validation file: {validation_files[0]}") @@ -171,9 +163,7 @@ def get_dataset_filelist(a): for x in fi.read().split("\n") if len(x) > 0 ] - print( - f"first unseen {i}th validation fileset: {unseen_validation_files[0]}" - ) + print(f"first unseen {i}th validation fileset: {unseen_validation_files[0]}") list_unseen_validation_files.append(unseen_validation_files) return training_files, validation_files, list_unseen_validation_files @@ -227,13 +217,9 @@ class MelDataset(torch.utils.data.Dataset): print("[INFO] checking dataset integrity...") for i in tqdm(range(len(self.audio_files))): - assert os.path.exists( - self.audio_files[i] - ), f"{self.audio_files[i]} not found" + assert os.path.exists(self.audio_files[i]), f"{self.audio_files[i]} not found" - def __getitem__( - self, index: int - ) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]: + def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]: try: filename = self.audio_files[index] @@ -248,17 +234,12 @@ class MelDataset(torch.utils.data.Dataset): # Obtain randomized audio chunk if source_sampling_rate != self.sampling_rate: # Adjust segment size to crop if the source sr is different - target_segment_size = math.ceil( - self.segment_size - * 
(source_sampling_rate / self.sampling_rate) - ) + target_segment_size = math.ceil(self.segment_size * (source_sampling_rate / self.sampling_rate)) else: target_segment_size = self.segment_size # Compute upper bound index for the random chunk - random_chunk_upper_bound = max( - 0, audio.shape[0] - target_segment_size - ) + random_chunk_upper_bound = max(0, audio.shape[0] - target_segment_size) # Crop or pad audio to obtain random chunk with target_segment_size if audio.shape[0] >= target_segment_size: @@ -318,9 +299,9 @@ class MelDataset(torch.utils.data.Dataset): else: # For fine-tuning, assert that the waveform is in the defined sampling_rate # Fine-tuning won't support on-the-fly resampling to be fool-proof (the dataset should have been prepared properly) - assert ( - source_sampling_rate == self.sampling_rate - ), f"For fine_tuning, waveform must be in the spcified sampling rate {self.sampling_rate}, got {source_sampling_rate}" + assert source_sampling_rate == self.sampling_rate, ( + f"For fine_tuning, waveform must be in the spcified sampling rate {self.sampling_rate}, got {source_sampling_rate}" + ) # Cast ndarray to torch tensor audio = torch.FloatTensor(audio) @@ -346,20 +327,14 @@ class MelDataset(torch.utils.data.Dataset): mel = mel[:, :, mel_start : mel_start + frames_per_seg] audio = audio[ :, - mel_start - * self.hop_size : (mel_start + frames_per_seg) - * self.hop_size, + mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size, ] # Pad pre-computed mel and audio to match length to ensuring fine-tuning without error. # NOTE: this may introduce a single-frame misalignment of the # To remove possible misalignment, it is recommended to prepare the pair where the audio length is the integer multiple of self.hop_size - mel = torch.nn.functional.pad( - mel, (0, frames_per_seg - mel.size(2)), "constant" - ) - audio = torch.nn.functional.pad( - audio, (0, self.segment_size - audio.size(1)), "constant" - ) + mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") + audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") # Compute mel_loss used by spectral regression objective. Uses self.fmax_loss instead (usually None) mel_loss = mel_spectrogram( @@ -376,9 +351,10 @@ class MelDataset(torch.utils.data.Dataset): # Shape sanity checks assert ( - audio.shape[1] == mel.shape[2] * self.hop_size - and audio.shape[1] == mel_loss.shape[2] * self.hop_size - ), f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}" + audio.shape[1] == mel.shape[2] * self.hop_size and audio.shape[1] == mel_loss.shape[2] * self.hop_size + ), ( + f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}" + ) return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) @@ -387,9 +363,7 @@ class MelDataset(torch.utils.data.Dataset): if self.fine_tuning: raise e # Terminate training if it is fine-tuning. The dataset should have been prepared properly. else: - print( - f"[WARNING] Failed to load waveform, skipping! filename: {filename} Error: {e}" - ) + print(f"[WARNING] Failed to load waveform, skipping! 
filename: {filename} Error: {e}") return self[random.randrange(len(self))] def __len__(self): diff --git a/GPT_SoVITS/BigVGAN/tests/test_activation.py b/GPT_SoVITS/BigVGAN/tests/test_activation.py index 146600e5..41348835 100644 --- a/GPT_SoVITS/BigVGAN/tests/test_activation.py +++ b/GPT_SoVITS/BigVGAN/tests/test_activation.py @@ -3,6 +3,7 @@ import os import sys + # to import modules from parent_dir parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(parent_dir) @@ -24,14 +25,10 @@ def test_anti_alias_activation(): data = torch.rand((10, 10, 200), device="cuda") # Check activations.Snake cuda vs. torch - fused_anti_alias_activation = activation1d.Activation1d( - activation=Snake(10), fused=True - ).cuda() + fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda() fused_activation_output = fused_anti_alias_activation(data) - torch_anti_alias_activation = activation1d.Activation1d( - activation=Snake(10), fused=False - ).cuda() + torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda() torch_activation_output = torch_anti_alias_activation(data) test_result = (fused_activation_output - torch_activation_output).abs() diff --git a/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py b/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py index 3e653859..4cc46b98 100644 --- a/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py +++ b/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py @@ -3,6 +3,7 @@ import os import sys + # to import modules from parent_dir parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(parent_dir) @@ -24,14 +25,10 @@ def test_anti_alias_activation(): data = torch.rand((10, 10, 200), device="cuda") # Check activations, Snake CUDA vs. Torch - fused_anti_alias_activation = activation1d.Activation1d( - activation=SnakeBeta(10), fused=True - ).cuda() + fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda() fused_activation_output = fused_anti_alias_activation(data) - torch_anti_alias_activation = activation1d.Activation1d( - activation=SnakeBeta(10), fused=False - ).cuda() + torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda() torch_activation_output = torch_anti_alias_activation(data) test_result = (fused_activation_output - torch_activation_output).abs() @@ -57,7 +54,6 @@ def test_anti_alias_activation(): ) - if __name__ == "__main__": from alias_free_activation.cuda import load diff --git a/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py b/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py index 86ad0518..8ddb29e5 100644 --- a/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py +++ b/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py @@ -42,9 +42,7 @@ def generate_soundwave(duration=5.0, sr=24000): def get_mel(x, h): - return mel_spectrogram( - x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax - ) + return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax) def load_checkpoint(filepath, device): @@ -56,9 +54,7 @@ def load_checkpoint(filepath, device): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Test script to check CUDA kernel correctness." 
- ) + parser = argparse.ArgumentParser(description="Test script to check CUDA kernel correctness.") parser.add_argument( "--checkpoint_file", type=str, @@ -91,27 +87,25 @@ if __name__ == "__main__": # define number of samples and length of mel frame to benchmark num_sample = 10 num_mel_frame = 16384 - + # CUDA kernel correctness check diff = 0.0 for i in tqdm(range(num_sample)): # Random mel data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda") - + with torch.inference_mode(): audio_original = generator_original(data) - + with torch.inference_mode(): audio_cuda_kernel = generator_cuda_kernel(data) # Both outputs should be (almost) the same test_result = (audio_original - audio_cuda_kernel).abs() diff += test_result.mean(dim=-1).item() - + diff /= num_sample - if ( - diff <= 2e-3 - ): # We can expect a small difference (~1e-3) which does not affect perceptual quality + if diff <= 2e-3: # We can expect a small difference (~1e-3) which does not affect perceptual quality print( f"\n[Success] test CUDA fused vs. plain torch BigVGAN inference" f"\n > mean_difference={diff}" @@ -125,9 +119,9 @@ if __name__ == "__main__": f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}, " f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}" ) - + del data, audio_original, audio_cuda_kernel - + # Variables for tracking total time and VRAM usage toc_total_original = 0 toc_total_cuda_kernel = 0 @@ -145,10 +139,10 @@ if __name__ == "__main__": audio_original = generator_original(data) torch.cuda.synchronize() toc = time() - tic - toc_total_original += toc + toc_total_original += toc vram_used_original_total += torch.cuda.max_memory_allocated(device="cuda") - + del data, audio_original torch.cuda.empty_cache() @@ -163,11 +157,11 @@ if __name__ == "__main__": torch.cuda.synchronize() toc = time() - tic toc_total_cuda_kernel += toc - + audio_length_total += audio_cuda_kernel.shape[-1] - + vram_used_cuda_kernel_total += torch.cuda.max_memory_allocated(device="cuda") - + del data, audio_cuda_kernel torch.cuda.empty_cache() @@ -175,8 +169,8 @@ if __name__ == "__main__": audio_second = audio_length_total / h.sampling_rate khz_original = audio_length_total / toc_total_original / 1000 khz_cuda_kernel = audio_length_total / toc_total_cuda_kernel / 1000 - vram_used_original_gb = vram_used_original_total / num_sample / (1024 ** 3) - vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024 ** 3) + vram_used_original_gb = vram_used_original_total / num_sample / (1024**3) + vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024**3) # Print results print( diff --git a/GPT_SoVITS/BigVGAN/train.py b/GPT_SoVITS/BigVGAN/train.py index 01eeb095..39718cdb 100644 --- a/GPT_SoVITS/BigVGAN/train.py +++ b/GPT_SoVITS/BigVGAN/train.py @@ -77,24 +77,18 @@ def train(rank, a, h): # Define additional discriminators. 
BigVGAN-v1 uses UnivNet's MRD as default # New in BigVGAN-v2: option to switch to new discriminators: MultiBandDiscriminator / MultiScaleSubbandCQTDiscriminator if h.get("use_mbd_instead_of_mrd", False): # Switch to MBD - print( - "[INFO] using MultiBandDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator" - ) + print("[INFO] using MultiBandDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator") # Variable name is kept as "mrd" for backward compatibility & minimal code change mrd = MultiBandDiscriminator(h).to(device) elif h.get("use_cqtd_instead_of_mrd", False): # Switch to CQTD - print( - "[INFO] using MultiScaleSubbandCQTDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator" - ) + print("[INFO] using MultiScaleSubbandCQTDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator") mrd = MultiScaleSubbandCQTDiscriminator(h).to(device) else: # Fallback to original MRD in BigVGAN-v1 mrd = MultiResolutionDiscriminator(h).to(device) # New in BigVGAN-v2: option to switch to multi-scale L1 mel loss if h.get("use_multiscale_melloss", False): - print( - "[INFO] using multi-scale Mel l1 loss of BigVGAN-v2 instead of the original single-scale loss" - ) + print("[INFO] using multi-scale Mel l1 loss of BigVGAN-v2 instead of the original single-scale loss") fn_mel_loss_multiscale = MultiScaleMelSpectrogramLoss( sampling_rate=h.sampling_rate ) # NOTE: accepts waveform as input @@ -114,9 +108,7 @@ def train(rank, a, h): if os.path.isdir(a.checkpoint_path): # New in v2.1: If the step prefix pattern-based checkpoints are not found, also check for renamed files in Hugging Face Hub to resume training - cp_g = scan_checkpoint( - a.checkpoint_path, prefix="g_", renamed_file="bigvgan_generator.pt" - ) + cp_g = scan_checkpoint(a.checkpoint_path, prefix="g_", renamed_file="bigvgan_generator.pt") cp_do = scan_checkpoint( a.checkpoint_path, prefix="do_", @@ -143,9 +135,7 @@ def train(rank, a, h): mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device) mrd = DistributedDataParallel(mrd, device_ids=[rank]).to(device) - optim_g = torch.optim.AdamW( - generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2] - ) + optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]) optim_d = torch.optim.AdamW( itertools.chain(mrd.parameters(), mpd.parameters()), h.learning_rate, @@ -156,12 +146,8 @@ def train(rank, a, h): optim_g.load_state_dict(state_dict_do["optim_g"]) optim_d.load_state_dict(state_dict_do["optim_d"]) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=h.lr_decay, last_epoch=last_epoch - ) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR( - optim_d, gamma=h.lr_decay, last_epoch=last_epoch - ) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch) # Define training and validation datasets @@ -169,9 +155,7 @@ def train(rank, a, h): unseen_validation_filelist will contain sample filepaths outside the seen training & validation dataset Example: trained on LibriTTS, validate on VCTK """ - training_filelist, validation_filelist, list_unseen_validation_filelist = ( - get_dataset_filelist(a) - ) + training_filelist, validation_filelist, list_unseen_validation_filelist = get_dataset_filelist(a) trainset = MelDataset( training_filelist, @@ -324,33 +308,26 @@ def train(rank, a, h): h.fmax_for_loss, ) min_t = min(y_mel.size(-1), 
y_g_hat_mel.size(-1)) - val_err_tot += F.l1_loss(y_mel[...,:min_t], y_g_hat_mel[...,:min_t]).item() + val_err_tot += F.l1_loss(y_mel[..., :min_t], y_g_hat_mel[..., :min_t]).item() # PESQ calculation. only evaluate PESQ if it's speech signal (nonspeech PESQ will error out) - if ( - not "nonspeech" in mode - ): # Skips if the name of dataset (in mode string) contains "nonspeech" - + if "nonspeech" not in mode: # Skips if the name of dataset (in mode string) contains "nonspeech" # Resample to 16000 for pesq y_16k = pesq_resampler(y) y_g_hat_16k = pesq_resampler(y_g_hat.squeeze(1)) y_int_16k = (y_16k[0] * MAX_WAV_VALUE).short().cpu().numpy() - y_g_hat_int_16k = ( - (y_g_hat_16k[0] * MAX_WAV_VALUE).short().cpu().numpy() - ) + y_g_hat_int_16k = (y_g_hat_16k[0] * MAX_WAV_VALUE).short().cpu().numpy() val_pesq_tot += pesq(16000, y_int_16k, y_g_hat_int_16k, "wb") # MRSTFT calculation min_t = min(y.size(-1), y_g_hat.size(-1)) - val_mrstft_tot += loss_mrstft(y_g_hat[...,:min_t], y[...,:min_t]).item() + val_mrstft_tot += loss_mrstft(y_g_hat[..., :min_t], y[..., :min_t]).item() # Log audio and figures to Tensorboard if j % a.eval_subsample == 0: # Subsample every nth from validation set if steps >= 0: sw.add_audio(f"gt_{mode}/y_{j}", y[0], steps, h.sampling_rate) - if ( - a.save_audio - ): # Also save audio to disk if --save_audio is set to True + if a.save_audio: # Also save audio to disk if --save_audio is set to True save_audio( y[0], os.path.join( @@ -373,9 +350,7 @@ def train(rank, a, h): steps, h.sampling_rate, ) - if ( - a.save_audio - ): # Also save audio to disk if --save_audio is set to True + if a.save_audio: # Also save audio to disk if --save_audio is set to True save_audio( y_g_hat[0, 0], os.path.join( @@ -487,15 +462,11 @@ def train(rank, a, h): # MPD y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach()) - loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss( - y_df_hat_r, y_df_hat_g - ) + loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g) # MRD y_ds_hat_r, y_ds_hat_g, _, _ = mrd(y, y_g_hat.detach()) - loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss( - y_ds_hat_r, y_ds_hat_g - ) + loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g) loss_disc_all = loss_disc_s + loss_disc_f @@ -505,17 +476,11 @@ def train(rank, a, h): # Whether to freeze D for initial training steps if steps >= a.freeze_step: loss_disc_all.backward() - grad_norm_mpd = torch.nn.utils.clip_grad_norm_( - mpd.parameters(), clip_grad_norm - ) - grad_norm_mrd = torch.nn.utils.clip_grad_norm_( - mrd.parameters(), clip_grad_norm - ) + grad_norm_mpd = torch.nn.utils.clip_grad_norm_(mpd.parameters(), clip_grad_norm) + grad_norm_mrd = torch.nn.utils.clip_grad_norm_(mrd.parameters(), clip_grad_norm) optim_d.step() else: - print( - f"[WARNING] skipping D training for the first {a.freeze_step} steps" - ) + print(f"[WARNING] skipping D training for the first {a.freeze_step} steps") grad_norm_mpd = 0.0 grad_norm_mrd = 0.0 @@ -523,9 +488,7 @@ def train(rank, a, h): optim_g.zero_grad() # L1 Mel-Spectrogram Loss - lambda_melloss = h.get( - "lambda_melloss", 45.0 - ) # Defaults to 45 in BigVGAN-v1 if not set + lambda_melloss = h.get("lambda_melloss", 45.0) # Defaults to 45 in BigVGAN-v1 if not set if h.get("use_multiscale_melloss", False): # uses wav for loss loss_mel = fn_mel_loss_multiscale(y, y_g_hat) * lambda_melloss else: # Uses mel for loss @@ -542,27 +505,19 @@ def train(rank, a, h): loss_gen_s, losses_gen_s = 
generator_loss(y_ds_hat_g) if steps >= a.freeze_step: - loss_gen_all = ( - loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel - ) + loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel else: - print( - f"[WARNING] using regression loss only for G for the first {a.freeze_step} steps" - ) + print(f"[WARNING] using regression loss only for G for the first {a.freeze_step} steps") loss_gen_all = loss_mel loss_gen_all.backward() - grad_norm_g = torch.nn.utils.clip_grad_norm_( - generator.parameters(), clip_grad_norm - ) + grad_norm_g = torch.nn.utils.clip_grad_norm_(generator.parameters(), clip_grad_norm) optim_g.step() if rank == 0: # STDOUT logging if steps % a.stdout_interval == 0: - mel_error = ( - loss_mel.item() / lambda_melloss - ) # Log training mel regression loss to stdout + mel_error = loss_mel.item() / lambda_melloss # Log training mel regression loss to stdout print( f"Steps: {steps:d}, " f"Gen Loss Total: {loss_gen_all:4.3f}, " @@ -577,11 +532,7 @@ def train(rank, a, h): checkpoint_path = f"{a.checkpoint_path}/g_{steps:08d}" save_checkpoint( checkpoint_path, - { - "generator": ( - generator.module if h.num_gpus > 1 else generator - ).state_dict() - }, + {"generator": (generator.module if h.num_gpus > 1 else generator).state_dict()}, ) checkpoint_path = f"{a.checkpoint_path}/do_{steps:08d}" save_checkpoint( @@ -598,9 +549,7 @@ def train(rank, a, h): # Tensorboard summary logging if steps % a.summary_interval == 0: - mel_error = ( - loss_mel.item() / lambda_melloss - ) # Log training mel regression loss to tensorboard + mel_error = loss_mel.item() / lambda_melloss # Log training mel regression loss to tensorboard sw.add_scalar("training/gen_loss_total", loss_gen_all.item(), steps) sw.add_scalar("training/mel_spec_error", mel_error, steps) sw.add_scalar("training/fm_loss_mpd", loss_fm_f.item(), steps) @@ -612,12 +561,8 @@ def train(rank, a, h): sw.add_scalar("training/disc_loss_mrd", loss_disc_s.item(), steps) sw.add_scalar("training/grad_norm_mrd", grad_norm_mrd, steps) sw.add_scalar("training/grad_norm_g", grad_norm_g, steps) - sw.add_scalar( - "training/learning_rate_d", scheduler_d.get_last_lr()[0], steps - ) - sw.add_scalar( - "training/learning_rate_g", scheduler_g.get_last_lr()[0], steps - ) + sw.add_scalar("training/learning_rate_d", scheduler_d.get_last_lr()[0], steps) + sw.add_scalar("training/learning_rate_g", scheduler_g.get_last_lr()[0], steps) sw.add_scalar("training/epoch", epoch + 1, steps) # Validation @@ -660,9 +605,7 @@ def train(rank, a, h): scheduler_d.step() if rank == 0: - print( - f"Time taken for epoch {epoch + 1} is {int(time.time() - start)} sec\n" - ) + print(f"Time taken for epoch {epoch + 1} is {int(time.time() - start)} sec\n") def main(): @@ -674,12 +617,8 @@ def main(): parser.add_argument("--input_wavs_dir", default="LibriTTS") parser.add_argument("--input_mels_dir", default="ft_dataset") - parser.add_argument( - "--input_training_file", default="tests/LibriTTS/train-full.txt" - ) - parser.add_argument( - "--input_validation_file", default="tests/LibriTTS/val-full.txt" - ) + parser.add_argument("--input_training_file", default="tests/LibriTTS/train-full.txt") + parser.add_argument("--input_validation_file", default="tests/LibriTTS/val-full.txt") parser.add_argument( "--list_input_unseen_wavs_dir", diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 5cd618e6..6ef46eb8 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1,74 +1,100 @@ -from copy import 
deepcopy +import gc import math -import os, sys, gc +import os import random -import traceback +import sys import time +import traceback +from copy import deepcopy + import torchaudio from tqdm import tqdm + now_dir = os.getcwd() sys.path.append(now_dir) -import ffmpeg import os -from typing import Generator, List, Tuple, Union +from typing import List, Tuple, Union + +import ffmpeg +import librosa import numpy as np import torch import torch.nn.functional as F import yaml -from transformers import AutoModelForMaskedLM, AutoTokenizer -from tools.audio_sr import AP_BWE from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from BigVGAN.bigvgan import BigVGAN from feature_extractor.cnhubert import CNHubert -from module.models import SynthesizerTrn, SynthesizerTrnV3 +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch +from module.models import SynthesizerTrn, SynthesizerTrnV3, Generator from peft import LoraConfig, get_peft_model -import librosa -from time import time as ttime +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new +from transformers import AutoModelForMaskedLM, AutoTokenizer + +from tools.audio_sr import AP_BWE from tools.i18n.i18n import I18nAuto, scan_language_list from tools.my_utils import load_audio -from module.mel_processing import spectrogram_torch from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.TextPreprocessor import TextPreprocessor -from BigVGAN.bigvgan import BigVGAN -from module.mel_processing import spectrogram_torch,mel_spectrogram_torch -from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language -i18n = I18nAuto(language=language) +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +i18n = I18nAuto(language=language) spec_min = -12 spec_max = 2 + + def norm_spec(x): return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + def denorm_spec(x): return (x + 1) / 2 * (spec_max - spec_min) + spec_min -mel_fn=lambda x: mel_spectrogram_torch(x, **{ - "n_fft": 1024, - "win_size": 1024, - "hop_size": 256, - "num_mels": 100, - "sampling_rate": 24000, - "fmin": 0, - "fmax": None, - "center": False -}) -def speed_change(input_audio:np.ndarray, speed:float, sr:int): +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + +mel_fn_v4 = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1280, + "win_size": 1280, + "hop_size": 320, + "num_mels": 100, + "sampling_rate": 32000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + + +def speed_change(input_audio: np.ndarray, speed: float, sr: int): # 将 NumPy 数组转换为原始 PCM 流 raw_audio = input_audio.astype(np.int16).tobytes() # 设置 ffmpeg 输入流 - input_stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1) + input_stream = ffmpeg.input("pipe:", format="s16le", acodec="pcm_s16le", ar=str(sr), ac=1) # 变速处理 - output_stream = input_stream.filter('atempo', speed) + output_stream = input_stream.filter("atempo", speed) # 输出流到管道 - out, _ = ( - output_stream.output('pipe:', format='s16le', acodec='pcm_s16le') - .run(input=raw_audio, capture_stdout=True, capture_stderr=True) + out, _ = output_stream.output("pipe:", format="s16le", 
acodec="pcm_s16le").run( + input=raw_audio, capture_stdout=True, capture_stderr=True ) # 将管道输出解码为 NumPy 数组 @@ -77,15 +103,15 @@ def speed_change(input_audio:np.ndarray, speed:float, sr:int): return processed_audio +resample_transform_dict = {} -resample_transform_dict={} -def resample(audio_tensor, sr0, device): + +def resample(audio_tensor, sr0, sr1, device): global resample_transform_dict - if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample( - sr0, 24000 - ).to(device) - return resample_transform_dict[sr0](audio_tensor) + key = "%s-%s" % (sr0, sr1) + if key not in resample_transform_dict: + resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) + return resample_transform_dict[key](audio_tensor) class DictToAttrRecursive(dict): @@ -130,7 +156,7 @@ custom: t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 -default: +v1: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu @@ -138,7 +164,7 @@ default: t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth version: v1 -default_v2: +v2: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu @@ -146,7 +172,7 @@ default_v2: t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth version: v2 -default_v3: +v3: bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base device: cpu @@ -154,13 +180,22 @@ default_v3: t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth version: v3 +v4: + bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + device: cpu + is_half: false + t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt + version: v4 + vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth """ -def set_seed(seed:int): + +def set_seed(seed: int): seed = int(seed) seed = seed if seed != -1 else random.randint(0, 2**32 - 1) print(f"Set seed to {seed}") - os.environ['PYTHONHASHSEED'] = str(seed) + os.environ["PYTHONHASHSEED"] = str(seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -178,40 +213,50 @@ def set_seed(seed:int): pass return seed + class TTS_Config: - default_configs={ - "v1":{ - "device": "cpu", - "is_half": False, - "version": "v1", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - }, - "v2":{ - "device": "cpu", - "is_half": False, - "version": "v2", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", - "vits_weights_path": 
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - }, - "v3":{ - "device": "cpu", - "is_half": False, - "version": "v3", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - }, + default_configs = { + "v1": { + "device": "cpu", + "is_half": False, + "version": "v1", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, + "v2": { + "device": "cpu", + "is_half": False, + "version": "v2", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, + "v3": { + "device": "cpu", + "is_half": False, + "version": "v3", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, + "v4": { + "device": "cpu", + "is_half": False, + "version": "v4", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, } - configs:dict = None - v1_languages:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] - v2_languages:list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"] - languages:list = v2_languages + configs: dict = None + v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] + v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"] + languages: list = v2_languages # "all_zh",#全部按中文识别 # "en",#全部按英文识别#######不变 # "all_ja",#全部按日文识别 @@ -224,33 +269,31 @@ class TTS_Config: # "auto",#多语种启动切分识别语种 # "auto_yue",#多语种启动切分识别语种 - def __init__(self, configs: Union[dict, str]=None): - + def __init__(self, configs: Union[dict, str] = None): # 设置默认配置文件路径 - configs_base_path:str = "GPT_SoVITS/configs/" + configs_base_path: str = "GPT_SoVITS/configs/" os.makedirs(configs_base_path, exist_ok=True) - self.configs_path:str = os.path.join(configs_base_path, "tts_infer.yaml") + self.configs_path: str = os.path.join(configs_base_path, "tts_infer.yaml") if configs in ["", None]: if not os.path.exists(self.configs_path): self.save_configs() print(f"Create default config file at {self.configs_path}") - configs:dict = deepcopy(self.default_configs) + configs: dict = deepcopy(self.default_configs) if isinstance(configs, str): self.configs_path = 
configs - configs:dict = self._load_configs(self.configs_path) + configs: dict = self._load_configs(self.configs_path) assert isinstance(configs, dict) version = configs.get("version", "v2").lower() - assert version in ["v1", "v2", "v3"] + assert version in ["v1", "v2", "v3", "v4"] self.default_configs[version] = configs.get(version, self.default_configs[version]) - self.configs:dict = configs.get("custom", deepcopy(self.default_configs[version])) - + self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version])) self.device = self.configs.get("device", torch.device("cpu")) if "cuda" in str(self.device) and not torch.cuda.is_available(): - print(f"Warning: CUDA is not available, set device to CPU.") + print("Warning: CUDA is not available, set device to CPU.") self.device = torch.device("cpu") self.is_half = self.configs.get("is_half", False) @@ -263,81 +306,77 @@ class TTS_Config: self.vits_weights_path = self.configs.get("vits_weights_path", None) self.bert_base_path = self.configs.get("bert_base_path", None) self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None) - self.languages = self.v1_languages if self.version=="v1" else self.v2_languages - - self.is_v3_synthesizer:bool = False + self.languages = self.v1_languages if self.version == "v1" else self.v2_languages + self.use_vocoder: bool = False if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)): - self.t2s_weights_path = self.default_configs[version]['t2s_weights_path'] + self.t2s_weights_path = self.default_configs[version]["t2s_weights_path"] print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}") if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)): - self.vits_weights_path = self.default_configs[version]['vits_weights_path'] + self.vits_weights_path = self.default_configs[version]["vits_weights_path"] print(f"fall back to default vits_weights_path: {self.vits_weights_path}") if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)): - self.bert_base_path = self.default_configs[version]['bert_base_path'] + self.bert_base_path = self.default_configs[version]["bert_base_path"] print(f"fall back to default bert_base_path: {self.bert_base_path}") if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)): - self.cnhuhbert_base_path = self.default_configs[version]['cnhuhbert_base_path'] + self.cnhuhbert_base_path = self.default_configs[version]["cnhuhbert_base_path"] print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}") self.update_configs() - self.max_sec = None - self.hz:int = 50 - self.semantic_frame_rate:str = "25hz" - self.segment_size:int = 20480 - self.filter_length:int = 2048 - self.sampling_rate:int = 32000 - self.hop_length:int = 640 - self.win_length:int = 2048 - self.n_speakers:int = 300 + self.hz: int = 50 + self.semantic_frame_rate: str = "25hz" + self.segment_size: int = 20480 + self.filter_length: int = 2048 + self.sampling_rate: int = 32000 + self.hop_length: int = 640 + self.win_length: int = 2048 + self.n_speakers: int = 300 - - - def _load_configs(self, configs_path: str)->dict: + def _load_configs(self, configs_path: str) -> dict: if os.path.exists(configs_path): ... 
else: print(i18n("路径不存在,使用默认配置")) self.save_configs(configs_path) - with open(configs_path, 'r', encoding='utf-8') as f: + with open(configs_path, "r", encoding="utf-8") as f: configs = yaml.load(f, Loader=yaml.FullLoader) return configs - def save_configs(self, configs_path:str=None)->None: - configs=deepcopy(self.default_configs) + def save_configs(self, configs_path: str = None) -> None: + configs = deepcopy(self.default_configs) if self.configs is not None: configs["custom"] = self.update_configs() if configs_path is None: configs_path = self.configs_path - with open(configs_path, 'w') as f: + with open(configs_path, "w") as f: yaml.dump(configs, f) def update_configs(self): self.config = { - "device" : str(self.device), - "is_half" : self.is_half, - "version" : self.version, - "t2s_weights_path" : self.t2s_weights_path, - "vits_weights_path" : self.vits_weights_path, - "bert_base_path" : self.bert_base_path, + "device": str(self.device), + "is_half": self.is_half, + "version": self.version, + "t2s_weights_path": self.t2s_weights_path, + "vits_weights_path": self.vits_weights_path, + "bert_base_path": self.bert_base_path, "cnhuhbert_base_path": self.cnhuhbert_base_path, } return self.config - def update_version(self, version:str)->None: + def update_version(self, version: str) -> None: self.version = version - self.languages = self.v1_languages if self.version=="v1" else self.v2_languages + self.languages = self.v1_languages if self.version == "v1" else self.v2_languages def __str__(self): self.configs = self.update_configs() - string = "TTS Config".center(100, '-') + '\n' + string = "TTS Config".center(100, "-") + "\n" for k, v in self.configs.items(): string += f"{str(k).ljust(20)}: {str(v)}\n" - string += "-" * 100 + '\n' + string += "-" * 100 + "\n" return string def __repr__(self): @@ -355,77 +394,79 @@ class TTS: if isinstance(configs, TTS_Config): self.configs = configs else: - self.configs:TTS_Config = TTS_Config(configs) + self.configs: TTS_Config = TTS_Config(configs) - self.t2s_model:Text2SemanticLightningModule = None - self.vits_model:Union[SynthesizerTrn, SynthesizerTrnV3] = None - self.bert_tokenizer:AutoTokenizer = None - self.bert_model:AutoModelForMaskedLM = None - self.cnhuhbert_model:CNHubert = None - self.bigvgan_model:BigVGAN = None - self.sr_model:AP_BWE = None - self.sr_model_not_exist:bool = False + self.t2s_model: Text2SemanticLightningModule = None + self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None + self.bert_tokenizer: AutoTokenizer = None + self.bert_model: AutoModelForMaskedLM = None + self.cnhuhbert_model: CNHubert = None + self.vocoder = None + self.sr_model: AP_BWE = None + self.sr_model_not_exist: bool = False + + self.vocoder_configs: dict = { + "sr": None, + "T_ref": None, + "T_chunk": None, + "upsample_rate": None, + "overlapped_len": None, + } self._init_models() - self.text_preprocessor:TextPreprocessor = \ - TextPreprocessor(self.bert_model, - self.bert_tokenizer, - self.configs.device) + self.text_preprocessor: TextPreprocessor = TextPreprocessor( + self.bert_model, self.bert_tokenizer, self.configs.device + ) - - self.prompt_cache:dict = { - "ref_audio_path" : None, + self.prompt_cache: dict = { + "ref_audio_path": None, "prompt_semantic": None, - "refer_spec" : [], - "prompt_text" : None, - "prompt_lang" : None, - "phones" : None, - "bert_features" : None, - "norm_text" : None, + "refer_spec": [], + "prompt_text": None, + "prompt_lang": None, + "phones": None, + "bert_features": None, + "norm_text": None, "aux_ref_audio_paths": 
[], } + self.stop_flag: bool = False + self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32 - self.stop_flag:bool = False - self.precision:torch.dtype = torch.float16 if self.configs.is_half else torch.float32 - - def _init_models(self,): + def _init_models( + self, + ): self.init_t2s_weights(self.configs.t2s_weights_path) self.init_vits_weights(self.configs.vits_weights_path) self.init_bert_weights(self.configs.bert_base_path) self.init_cnhuhbert_weights(self.configs.cnhuhbert_base_path) # self.enable_half_precision(self.configs.is_half) - - def init_cnhuhbert_weights(self, base_path: str): print(f"Loading CNHuBERT weights from {base_path}") self.cnhuhbert_model = CNHubert(base_path) - self.cnhuhbert_model=self.cnhuhbert_model.eval() + self.cnhuhbert_model = self.cnhuhbert_model.eval() self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device) - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.cnhuhbert_model = self.cnhuhbert_model.half() - - def init_bert_weights(self, base_path: str): print(f"Loading BERT weights from {base_path}") self.bert_tokenizer = AutoTokenizer.from_pretrained(base_path) self.bert_model = AutoModelForMaskedLM.from_pretrained(base_path) - self.bert_model=self.bert_model.eval() + self.bert_model = self.bert_model.eval() self.bert_model = self.bert_model.to(self.configs.device) - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.bert_model = self.bert_model.half() def init_vits_weights(self, weights_path: str): - self.configs.vits_weights_path = weights_path - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(weights_path) - path_sovits_v3=self.configs.default_configs["v3"]["vits_weights_path"] + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path) + path_sovits = self.configs.default_configs[model_version]["vits_weights_path"] - if if_lora_v3==True and os.path.exists(path_sovits_v3)==False: - info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + if if_lora_v3 == True and os.path.exists(path_sovits) == False: + info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version) raise FileExistsError(info) # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False) @@ -433,9 +474,9 @@ class TTS: hps = dict_s2["config"] hps["model"]["semantic_frame_rate"] = "25hz" - if 'enc_p.text_embedding.weight'not in dict_s2['weight']: - hps["model"]["version"] = "v2"#v3model,v2sybomls - elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps["model"]["version"] = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: hps["model"]["version"] = "v1" else: hps["model"]["version"] = "v2" @@ -455,32 +496,36 @@ class TTS: # print(f"model_version:{model_version}") # print(f'hps["model"]["version"]:{hps["model"]["version"]}') - if model_version!="v3": + if model_version not in {"v3", "v4"}: vits_model = SynthesizerTrn( self.configs.filter_length // 2 + 1, self.configs.segment_size // self.configs.hop_length, n_speakers=self.configs.n_speakers, - **kwargs + **kwargs, ) - if hasattr(vits_model, "enc_q"): - del vits_model.enc_q - self.configs.is_v3_synthesizer = False + self.configs.use_vocoder = False else: + kwargs["version"] = model_version vits_model = 
SynthesizerTrnV3( self.configs.filter_length // 2 + 1, self.configs.segment_size // self.configs.hop_length, n_speakers=self.configs.n_speakers, - **kwargs + **kwargs, ) - self.configs.is_v3_synthesizer = True - self.init_bigvgan() - + self.configs.use_vocoder = True + self.init_vocoder(model_version) + if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"): + del vits_model.enc_q - if if_lora_v3==False: - print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") + if if_lora_v3 == False: + print( + f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}" + ) else: - print(f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}") - lora_rank=dict_s2["lora_rank"] + print( + f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits)['weight'], strict=False)}" + ) + lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, @@ -488,19 +533,19 @@ class TTS: init_lora_weights=True, ) vits_model.cfm = get_peft_model(vits_model.cfm, lora_config) - print(f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") - - vits_model.cfm = vits_model.cfm.merge_and_unload() + print( + f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}" + ) + vits_model.cfm = vits_model.cfm.merge_and_unload() vits_model = vits_model.to(self.configs.device) vits_model = vits_model.eval() self.vits_model = vits_model - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.vits_model = self.vits_model.half() - def init_t2s_weights(self, weights_path: str): print(f"Loading Text2Semantic weights from {weights_path}") self.configs.t2s_weights_path = weights_path @@ -514,40 +559,85 @@ class TTS: t2s_model = t2s_model.to(self.configs.device) t2s_model = t2s_model.eval() self.t2s_model = t2s_model - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.t2s_model = self.t2s_model.half() + def init_vocoder(self, version: str): + if version == "v3": + if self.vocoder is not None and self.vocoder.__class__.__name__ == "BigVGAN": + return + if self.vocoder is not None: + self.vocoder.cpu() + del self.vocoder + self.empty_cache() - def init_bigvgan(self): - if self.bigvgan_model is not None: - return - self.bigvgan_model = BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions - # remove weight norm in the model and set to eval mode - self.bigvgan_model.remove_weight_norm() - self.bigvgan_model = self.bigvgan_model.eval() + self.vocoder = BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions + # remove weight norm in the model and set to eval mode + self.vocoder.remove_weight_norm() + + self.vocoder_configs["sr"] = 24000 + self.vocoder_configs["T_ref"] = 468 + self.vocoder_configs["T_chunk"] = 934 + self.vocoder_configs["upsample_rate"] = 256 + 
self.vocoder_configs["overlapped_len"] = 12 + + elif version == "v4": + if self.vocoder is not None and self.vocoder.__class__.__name__ == "Generator": + return + if self.vocoder is not None: + self.vocoder.cpu() + del self.vocoder + self.empty_cache() + + self.vocoder = Generator( + initial_channel=100, + resblock="1", + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + upsample_rates=[10, 6, 2, 2, 2], + upsample_initial_channel=512, + upsample_kernel_sizes=[20, 12, 4, 4, 4], + gin_channels=0, + is_bias=True, + ) + self.vocoder.remove_weight_norm() + state_dict_g = torch.load( + "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu" + ) + print("loading vocoder", self.vocoder.load_state_dict(state_dict_g)) + + self.vocoder_configs["sr"] = 48000 + self.vocoder_configs["T_ref"] = 500 + self.vocoder_configs["T_chunk"] = 1000 + self.vocoder_configs["upsample_rate"] = 480 + self.vocoder_configs["overlapped_len"] = 12 + + self.vocoder = self.vocoder.eval() if self.configs.is_half == True: - self.bigvgan_model = self.bigvgan_model.half().to(self.configs.device) + self.vocoder = self.vocoder.half().to(self.configs.device) else: - self.bigvgan_model = self.bigvgan_model.to(self.configs.device) + self.vocoder = self.vocoder.to(self.configs.device) def init_sr_model(self): if self.sr_model is not None: return try: - self.sr_model:AP_BWE=AP_BWE(self.configs.device,DictToAttrRecursive) + self.sr_model: AP_BWE = AP_BWE(self.configs.device, DictToAttrRecursive) self.sr_model_not_exist = False except FileNotFoundError: print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好")) self.sr_model_not_exist = True - def enable_half_precision(self, enable: bool = True, save: bool = True): - ''' - To enable half precision for the TTS model. - Args: - enable: bool, whether to enable half precision. + """ + To enable half precision for the TTS model. + Args: + enable: bool, whether to enable half precision. - ''' + """ if str(self.configs.device) == "cpu" and enable: print("Half precision is not supported on CPU.") return @@ -558,15 +648,15 @@ class TTS: self.configs.save_configs() if enable: if self.t2s_model is not None: - self.t2s_model =self.t2s_model.half() + self.t2s_model = self.t2s_model.half() if self.vits_model is not None: self.vits_model = self.vits_model.half() if self.bert_model is not None: - self.bert_model =self.bert_model.half() + self.bert_model = self.bert_model.half() if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.half() - if self.bigvgan_model is not None: - self.bigvgan_model = self.bigvgan_model.half() + if self.vocoder is not None: + self.vocoder = self.vocoder.half() else: if self.t2s_model is not None: self.t2s_model = self.t2s_model.float() @@ -576,15 +666,15 @@ class TTS: self.bert_model = self.bert_model.float() if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.float() - if self.bigvgan_model is not None: - self.bigvgan_model = self.bigvgan_model.float() + if self.vocoder is not None: + self.vocoder = self.vocoder.float() def set_device(self, device: torch.device, save: bool = True): - ''' - To set the device for all models. - Args: - device: torch.device, the device to use for all models. - ''' + """ + To set the device for all models. + Args: + device: torch.device, the device to use for all models. 
+ """ self.configs.device = device if save: self.configs.save_configs() @@ -596,19 +686,18 @@ class TTS: self.bert_model = self.bert_model.to(device) if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.to(device) - if self.bigvgan_model is not None: - self.bigvgan_model = self.bigvgan_model.to(device) + if self.vocoder is not None: + self.vocoder = self.vocoder.to(device) if self.sr_model is not None: self.sr_model = self.sr_model.to(device) - - def set_ref_audio(self, ref_audio_path:str): - ''' - To set the reference audio for the TTS model, - including the prompt_semantic and refer_spepc. - Args: - ref_audio_path: str, the path of the reference audio. - ''' + def set_ref_audio(self, ref_audio_path: str): + """ + To set the reference audio for the TTS model, + including the prompt_semantic and refer_spepc. + Args: + ref_audio_path: str, the path of the reference audio. + """ self._set_prompt_semantic(ref_audio_path) self._set_ref_spec(ref_audio_path) self._set_ref_audio_path(ref_audio_path) @@ -618,21 +707,22 @@ class TTS: def _set_ref_spec(self, ref_audio_path): spec = self._get_ref_spec(ref_audio_path) - if self.prompt_cache["refer_spec"] in [[],None]: - self.prompt_cache["refer_spec"]=[spec] + if self.prompt_cache["refer_spec"] in [[], None]: + self.prompt_cache["refer_spec"] = [spec] else: self.prompt_cache["refer_spec"][0] = spec def _get_ref_spec(self, ref_audio_path): raw_audio, raw_sr = torchaudio.load(ref_audio_path) - raw_audio=raw_audio.to(self.configs.device).float() + raw_audio = raw_audio.to(self.configs.device).float() self.prompt_cache["raw_audio"] = raw_audio self.prompt_cache["raw_sr"] = raw_sr audio = load_audio(ref_audio_path, int(self.configs.sampling_rate)) audio = torch.FloatTensor(audio) - maxx=audio.abs().max() - if(maxx>1):audio/=min(2,maxx) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) spec = spectrogram_torch( @@ -648,14 +738,14 @@ class TTS: spec = spec.half() return spec - def _set_prompt_semantic(self, ref_wav_path:str): + def _set_prompt_semantic(self, ref_wav_path: str): zero_wav = np.zeros( int(self.configs.sampling_rate * 0.3), dtype=np.float16 if self.configs.is_half else np.float32, ) with torch.no_grad(): wav16k, sr = librosa.load(ref_wav_path, sr=16000) - if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000): + if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000: raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) wav16k = torch.from_numpy(wav16k) zero_wav_torch = torch.from_numpy(zero_wav) @@ -666,9 +756,7 @@ class TTS: zero_wav_torch = zero_wav_torch.half() wav16k = torch.cat([wav16k, zero_wav_torch]) - hubert_feature = self.cnhuhbert_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( + hubert_feature = self.cnhuhbert_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose( 1, 2 ) # .float() codes = self.vits_model.extract_latent(hubert_feature) @@ -676,12 +764,12 @@ class TTS: prompt_semantic = codes[0, 0].to(self.configs.device) self.prompt_cache["prompt_semantic"] = prompt_semantic - def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length:int=None): + def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None): seq = sequences[0] ndim = seq.dim() if axis < 0: axis += ndim - dtype:torch.dtype = seq.dtype + dtype: torch.dtype = seq.dtype pad_value = torch.tensor(pad_value, dtype=dtype) seq_lengths = [seq.shape[axis] for 
seq in sequences] if max_length is None: @@ -697,15 +785,17 @@ class TTS: batch = torch.stack(padded_sequences) return batch - def to_batch(self, data:list, - prompt_data:dict=None, - batch_size:int=5, - threshold:float=0.75, - split_bucket:bool=True, - device:torch.device=torch.device("cpu"), - precision:torch.dtype=torch.float32, - ): - _data:list = [] + def to_batch( + self, + data: list, + prompt_data: dict = None, + batch_size: int = 5, + threshold: float = 0.75, + split_bucket: bool = True, + device: torch.device = torch.device("cpu"), + precision: torch.dtype = torch.float32, + ): + _data: list = [] index_and_len_list = [] for idx, item in enumerate(data): norm_text_len = len(item["norm_text"]) @@ -718,29 +808,28 @@ class TTS: batch_index_list_len = 0 pos = 0 - while pos =threshold) or (pos_end-pos==1): - batch_index=index_and_len_list[pos:pos_end, 0].tolist() + batch = index_and_len_list[pos:pos_end, 1].astype(np.float32) + score = batch[(pos_end - pos) // 2] / (batch.mean() + 1e-8) + if (score >= threshold) or (pos_end - pos == 1): + batch_index = index_and_len_list[pos:pos_end, 0].tolist() batch_index_list_len += len(batch_index) batch_index_list.append(batch_index) pos = pos_end break - pos_end=pos_end-1 + pos_end = pos_end - 1 assert batch_index_list_len == len(data) else: for i in range(len(data)): - if i%batch_size == 0: + if i % batch_size == 0: batch_index_list.append([]) batch_index_list[-1].append(i) - for batch_idx, index_list in enumerate(batch_index_list): item_list = [data[idx] for idx in index_list] phones_list = [] @@ -754,14 +843,14 @@ class TTS: all_phones_max_len = 0 for item in item_list: if prompt_data is not None: - all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\ - .to(dtype=precision, device=device) - all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"]).to(device) + all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1).to( + dtype=precision, device=device + ) + all_phones = torch.LongTensor(prompt_data["phones"] + item["phones"]).to(device) phones = torch.LongTensor(item["phones"]).to(device) # norm_text = prompt_data["norm_text"]+item["norm_text"] else: - all_bert_features = item["bert_features"]\ - .to(dtype=precision, device=device) + all_bert_features = item["bert_features"].to(dtype=precision, device=device) phones = torch.LongTensor(item["phones"]).to(device) all_phones = phones # norm_text = item["norm_text"] @@ -780,7 +869,6 @@ class TTS: all_phones_batch = all_phones_list all_bert_features_batch = all_bert_features_list - max_len = max(all_bert_max_len, all_phones_max_len) # phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len) #### 直接对phones和bert_features进行pad。(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略) @@ -812,8 +900,8 @@ class TTS: return _data, batch_index_list - def recovery_order(self, data:list, batch_index_list:list)->list: - ''' + def recovery_order(self, data: list, batch_index_list: list) -> list: + """ Recovery the order of the audio according to the batch_index_list. Args: @@ -822,22 +910,24 @@ class TTS: Returns: list (List[torch.Tensor]): the data in the original order. - ''' + """ length = len(sum(batch_index_list, [])) - _data = [None]*length + _data = [None] * length for i, index_list in enumerate(batch_index_list): for j, index in enumerate(index_list): _data[index] = data[i][j] return _data - def stop(self,): - ''' + def stop( + self, + ): + """ Stop the inference process. 
- ''' + """ self.stop_flag = True @torch.no_grad() - def run(self, inputs:dict): + def run(self, inputs: dict): """ Text to speech inference. @@ -870,17 +960,17 @@ class TTS: Tuple[int, np.ndarray]: sampling rate and audio data. """ ########## variables initialization ########### - self.stop_flag:bool = False - text:str = inputs.get("text", "") - text_lang:str = inputs.get("text_lang", "") - ref_audio_path:str = inputs.get("ref_audio_path", "") - aux_ref_audio_paths:list = inputs.get("aux_ref_audio_paths", []) - prompt_text:str = inputs.get("prompt_text", "") - prompt_lang:str = inputs.get("prompt_lang", "") - top_k:int = inputs.get("top_k", 5) - top_p:float = inputs.get("top_p", 1) - temperature:float = inputs.get("temperature", 1) - text_split_method:str = inputs.get("text_split_method", "cut0") + self.stop_flag: bool = False + text: str = inputs.get("text", "") + text_lang: str = inputs.get("text_lang", "") + ref_audio_path: str = inputs.get("ref_audio_path", "") + aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", []) + prompt_text: str = inputs.get("prompt_text", "") + prompt_lang: str = inputs.get("prompt_lang", "") + top_k: int = inputs.get("top_k", 5) + top_p: float = inputs.get("top_p", 1) + temperature: float = inputs.get("temperature", 1) + text_split_method: str = inputs.get("text_split_method", "cut0") batch_size = inputs.get("batch_size", 1) batch_threshold = inputs.get("batch_threshold", 0.75) speed_factor = inputs.get("speed_factor", 1.0) @@ -908,18 +998,18 @@ class TTS: split_bucket = False print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理")) - if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer): + if split_bucket and speed_factor == 1.0 and not (self.configs.use_vocoder and parallel_infer): print(i18n("分桶处理模式已开启")) - elif speed_factor!=1.0: + elif speed_factor != 1.0: print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理")) split_bucket = False - elif self.configs.is_v3_synthesizer and parallel_infer: - print(i18n("当开启并行推理模式时,SoVits V3模型不支持分桶处理,已自动关闭分桶处理")) + elif self.configs.use_vocoder and parallel_infer: + print(i18n("当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理")) split_bucket = False else: print(i18n("分桶处理模式已关闭")) - if fragment_interval<0.01: + if fragment_interval < 0.01: fragment_interval = 0.01 print(i18n("分段间隔过小,已自动设置为0.01")) @@ -931,12 +1021,15 @@ class TTS: if not no_prompt_text: assert prompt_lang in self.configs.languages - if no_prompt_text and self.configs.is_v3_synthesizer: + if no_prompt_text and self.configs.use_vocoder: raise NO_PROMPT_ERROR("prompt_text cannot be empty when using SoVITS_V3") - if ref_audio_path in [None, ""] and \ - ((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []])): - raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()") + if ref_audio_path in [None, ""] and ( + (self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []]) + ): + raise ValueError( + "ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()" + ) ###### setting reference audio and prompt text preprocessing ######## t0 = time.perf_counter() @@ -946,7 +1039,7 @@ class TTS: self.set_ref_audio(ref_audio_path) aux_ref_audio_paths = aux_ref_audio_paths if aux_ref_audio_paths is not None else [] - paths = set(aux_ref_audio_paths)&set(self.prompt_cache["aux_ref_audio_paths"]) + paths = set(aux_ref_audio_paths) & set(self.prompt_cache["aux_ref_audio_paths"]) if not (len(list(paths)) == 
len(aux_ref_audio_paths) == len(self.prompt_cache["aux_ref_audio_paths"])): self.prompt_cache["aux_ref_audio_paths"] = aux_ref_audio_paths self.prompt_cache["refer_spec"] = [self.prompt_cache["refer_spec"][0]] @@ -960,58 +1053,57 @@ class TTS: if not no_prompt_text: prompt_text = prompt_text.strip("\n") - if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_lang != "en" else "." + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_lang != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) if self.prompt_cache["prompt_text"] != prompt_text: - phones, bert_features, norm_text = \ - self.text_preprocessor.segment_and_extract_feature_for_text( - prompt_text, - prompt_lang, - self.configs.version) + phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text( + prompt_text, prompt_lang, self.configs.version + ) self.prompt_cache["prompt_text"] = prompt_text self.prompt_cache["prompt_lang"] = prompt_lang self.prompt_cache["phones"] = phones self.prompt_cache["bert_features"] = bert_features self.prompt_cache["norm_text"] = norm_text - - - ###### text preprocessing ######## t1 = time.perf_counter() - data:list = None + data: list = None if not return_fragment: data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version) if len(data) == 0: yield 16000, np.zeros(int(16000), dtype=np.int16) return - batch_index_list:list = None - data, batch_index_list = self.to_batch(data, - prompt_data=self.prompt_cache if not no_prompt_text else None, - batch_size=batch_size, - threshold=batch_threshold, - split_bucket=split_bucket, - device=self.configs.device, - precision=self.precision - ) + batch_index_list: list = None + data, batch_index_list = self.to_batch( + data, + prompt_data=self.prompt_cache if not no_prompt_text else None, + batch_size=batch_size, + threshold=batch_threshold, + split_bucket=split_bucket, + device=self.configs.device, + precision=self.precision, + ) else: - print(f'############ {i18n("切分文本")} ############') + print(f"############ {i18n('切分文本')} ############") texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method) data = [] for i in range(len(texts)): - if i%batch_size == 0: + if i % batch_size == 0: data.append([]) data[-1].append(texts[i]) def make_batch(batch_texts): batch_data = [] - print(f'############ {i18n("提取文本Bert特征")} ############') + print(f"############ {i18n('提取文本Bert特征')} ############") for text in tqdm(batch_texts): - phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(text, text_lang, self.configs.version) + phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text( + text, text_lang, self.configs.version + ) if phones is None: continue - res={ + res = { "phones": phones, "bert_features": bert_features, "norm_text": norm_text, @@ -1019,17 +1111,17 @@ class TTS: batch_data.append(res) if len(batch_data) == 0: return None - batch, _ = self.to_batch(batch_data, - prompt_data=self.prompt_cache if not no_prompt_text else None, - batch_size=batch_size, - threshold=batch_threshold, - split_bucket=False, - device=self.configs.device, - precision=self.precision - ) + batch, _ = self.to_batch( + batch_data, + prompt_data=self.prompt_cache if not no_prompt_text else None, + batch_size=batch_size, + threshold=batch_threshold, + split_bucket=False, + device=self.configs.device, + precision=self.precision, + ) return batch[0] - t2 = time.perf_counter() try: print("############ 推理 
############") @@ -1037,7 +1129,7 @@ class TTS: t_34 = 0.0 t_45 = 0.0 audio = [] - output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000 + output_sr = self.configs.sampling_rate if not self.configs.use_vocoder else self.vocoder_configs["sr"] for item in data: t3 = time.perf_counter() if return_fragment: @@ -1045,20 +1137,22 @@ class TTS: if item is None: continue - batch_phones:List[torch.LongTensor] = item["phones"] + batch_phones: List[torch.LongTensor] = item["phones"] # batch_phones:torch.LongTensor = item["phones"] - batch_phones_len:torch.LongTensor = item["phones_len"] - all_phoneme_ids:torch.LongTensor = item["all_phones"] - all_phoneme_lens:torch.LongTensor = item["all_phones_len"] - all_bert_features:torch.LongTensor = item["all_bert_features"] - norm_text:str = item["norm_text"] + batch_phones_len: torch.LongTensor = item["phones_len"] + all_phoneme_ids: torch.LongTensor = item["all_phones"] + all_phoneme_lens: torch.LongTensor = item["all_phones_len"] + all_bert_features: torch.LongTensor = item["all_bert_features"] + norm_text: str = item["norm_text"] max_len = item["max_len"] print(i18n("前端处理后的文本(每句):"), norm_text) - if no_prompt_text : + if no_prompt_text: prompt = None else: - prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device) + prompt = ( + self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device) + ) print(f"############ {i18n('预测语义Token')} ############") pred_semantic_list, idx_list = self.t2s_model.model.infer_panel( @@ -1077,8 +1171,10 @@ class TTS: t4 = time.perf_counter() t_34 += t4 - t3 - refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]] - + refer_audio_spec: torch.Tensor = [ + item.to(dtype=self.precision, device=self.configs.device) + for item in self.prompt_cache["refer_spec"] + ] batch_audio_fragment = [] @@ -1095,66 +1191,71 @@ class TTS: # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec # )) print(f"############ {i18n('合成音频')} ############") - if not self.configs.is_v3_synthesizer: + if not self.configs.use_vocoder: if speed_factor == 1.0: print(f"{i18n('并行合成中')}...") # ## vits并行推理 method 2 pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] upsample_rate = math.prod(self.vits_model.upsample_rates) - audio_frag_idx = [pred_semantic_list[i].shape[0]*2*upsample_rate for i in range(0, len(pred_semantic_list))] - audio_frag_end_idx = [ sum(audio_frag_idx[:i+1]) for i in range(0, len(audio_frag_idx))] - all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) + audio_frag_idx = [ + pred_semantic_list[i].shape[0] * 2 * upsample_rate + for i in range(0, len(pred_semantic_list)) + ] + audio_frag_end_idx = [sum(audio_frag_idx[: i + 1]) for i in range(0, len(audio_frag_idx))] + all_pred_semantic = ( + torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) + ) _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device) - _batch_audio_fragment = (self.vits_model.decode( - all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor - ).detach()[0, 0, :]) + _batch_audio_fragment = self.vits_model.decode( + all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] audio_frag_end_idx.insert(0, 0) - batch_audio_fragment= 
[_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))] + batch_audio_fragment = [ + _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]] + for i in range(1, len(audio_frag_end_idx)) + ] else: - # ## vits串行推理 + # ## vits串行推理 for i, idx in enumerate(tqdm(idx_list)): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) - _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 - audio_fragment =(self.vits_model.decode( - _pred_semantic, phones, refer_audio_spec, speed=speed_factor - ).detach()[0, 0, :]) - batch_audio_fragment.append( - audio_fragment - ) ###试试重建不带上prompt部分 + _pred_semantic = ( + pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0) + ) # .unsqueeze(0)#mq要多unsqueeze一次 + audio_fragment = self.vits_model.decode( + _pred_semantic, phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] + batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分 else: if parallel_infer: print(f"{i18n('并行合成中')}...") - audio_fragments = self.v3_synthesis_batched_infer( - idx_list, - pred_semantic_list, - batch_phones, - speed=speed_factor, - sample_steps=sample_steps - ) + audio_fragments = self.using_vocoder_synthesis_batched_infer( + idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps + ) batch_audio_fragment.extend(audio_fragments) else: for i, idx in enumerate(tqdm(idx_list)): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) - _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 - audio_fragment = self.v3_synthesis( - _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps - ) - batch_audio_fragment.append( - audio_fragment - ) + _pred_semantic = ( + pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0) + ) # .unsqueeze(0)#mq要多unsqueeze一次 + audio_fragment = self.using_vocoder_synthesis( + _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps + ) + batch_audio_fragment.append(audio_fragment) t5 = time.perf_counter() t_45 += t5 - t4 if return_fragment: print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4)) - yield self.audio_postprocess([batch_audio_fragment], - output_sr, - None, - speed_factor, - False, - fragment_interval, - super_sampling if self.configs.is_v3_synthesizer else False - ) + yield self.audio_postprocess( + [batch_audio_fragment], + output_sr, + None, + speed_factor, + False, + fragment_interval, + super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False, + ) else: audio.append(batch_audio_fragment) @@ -1167,14 +1268,15 @@ class TTS: if len(audio) == 0: yield 16000, np.zeros(int(16000), dtype=np.int16) return - yield self.audio_postprocess(audio, - output_sr, - batch_index_list, - speed_factor, - split_bucket, - fragment_interval, - super_sampling if self.configs.is_v3_synthesizer else False - ) + yield self.audio_postprocess( + audio, + output_sr, + batch_index_list, + speed_factor, + split_bucket, + fragment_interval, + super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False, + ) except Exception as e: traceback.print_exc() @@ -1193,7 +1295,7 @@ class TTS: def empty_cache(self): try: - gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 + gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 if "cuda" in str(self.configs.device): torch.cuda.empty_cache() elif str(self.configs.device) == "mps": @@ -1201,29 +1303,28 @@ class TTS: except: pass - def 
audio_postprocess(self, - audio:List[torch.Tensor], - sr:int, - batch_index_list:list=None, - speed_factor:float=1.0, - split_bucket:bool=True, - fragment_interval:float=0.3, - super_sampling:bool=False, - )->Tuple[int, np.ndarray]: + def audio_postprocess( + self, + audio: List[torch.Tensor], + sr: int, + batch_index_list: list = None, + speed_factor: float = 1.0, + split_bucket: bool = True, + fragment_interval: float = 0.3, + super_sampling: bool = False, + ) -> Tuple[int, np.ndarray]: zero_wav = torch.zeros( - int(self.configs.sampling_rate * fragment_interval), - dtype=self.precision, - device=self.configs.device - ) + int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device + ) for i, batch in enumerate(audio): for j, audio_fragment in enumerate(batch): - max_audio=torch.abs(audio_fragment).max()#简单防止16bit爆音 - if max_audio>1: audio_fragment/=max_audio - audio_fragment:torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) + max_audio = torch.abs(audio_fragment).max() # 简单防止16bit爆音 + if max_audio > 1: + audio_fragment /= max_audio + audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) audio[i][j] = audio_fragment - if split_bucket: audio = self.recovery_order(audio, batch_index_list) else: @@ -1237,11 +1338,12 @@ class TTS: t1 = time.perf_counter() self.init_sr_model() if not self.sr_model_not_exist: - audio,sr=self.sr_model(audio.unsqueeze(0),sr) - max_audio=np.abs(audio).max() - if max_audio > 1: audio /= max_audio + audio, sr = self.sr_model(audio.unsqueeze(0), sr) + max_audio = np.abs(audio).max() + if max_audio > 1: + audio /= max_audio t2 = time.perf_counter() - print(f"超采样用时:{t2-t1:.3f}s") + print(f"超采样用时:{t2 - t1:.3f}s") else: audio = audio.cpu().numpy() @@ -1255,51 +1357,54 @@ class TTS: return sr, audio - - def v3_synthesis(self, - semantic_tokens:torch.Tensor, - phones:torch.Tensor, - speed:float=1.0, - sample_steps:int=32 - ): - + def using_vocoder_synthesis( + self, semantic_tokens: torch.Tensor, phones: torch.Tensor, speed: float = 1.0, sample_steps: int = 32 + ): prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device) prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device) refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device) - fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) - ref_audio:torch.Tensor = self.prompt_cache["raw_audio"] + fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) + ref_audio: torch.Tensor = self.prompt_cache["raw_audio"] ref_sr = self.prompt_cache["raw_sr"] - ref_audio=ref_audio.to(self.configs.device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(self.configs.device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if ref_sr!=24000: - ref_audio=resample(ref_audio, ref_sr, self.configs.device) - mel2 = mel_fn(ref_audio) + # tgt_sr = self.vocoder_configs["sr"] + tgt_sr = 24000 if self.configs.version == "v3" else 32000 + if ref_sr != tgt_sr: + ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device) + + mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): - mel2 = mel2[:, :, -468:] - fea_ref = 
fea_ref[:, :, -468:] - T_min = 468 - chunk_len = 934 - T_min + T_ref = self.vocoder_configs["T_ref"] + T_chunk = self.vocoder_configs["T_chunk"] + if T_min > T_ref: + mel2 = mel2[:, :, -T_ref:] + fea_ref = fea_ref[:, :, -T_ref:] + T_min = T_ref + chunk_len = T_chunk - T_min - mel2=mel2.to(self.precision) + mel2 = mel2.to(self.precision) fea_todo, ge = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed) cfm_resss = [] idx = 0 - while (1): - fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] - if (fea_todo_chunk.shape[-1] == 0): break + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) - cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) - cfm_res = cfm_res[:, :, mel2.shape[2]:] + cfm_res = self.vits_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] fea_ref = fea_todo_chunk[:, :, -T_min:] @@ -1308,148 +1413,146 @@ class TTS: cfm_res = torch.cat(cfm_resss, 2) cfm_res = denorm_spec(cfm_res) - with torch.inference_mode(): - wav_gen = self.bigvgan_model(cfm_res) - audio=wav_gen[0][0]#.cpu().detach().numpy() - + wav_gen = self.vocoder(cfm_res) + audio = wav_gen[0][0] # .cpu().detach().numpy() + return audio - - - def v3_synthesis_batched_infer(self, - idx_list:List[int], - semantic_tokens_list:List[torch.Tensor], - batch_phones:List[torch.Tensor], - speed:float=1.0, - sample_steps:int=32 - )->List[torch.Tensor]: - + def using_vocoder_synthesis_batched_infer( + self, + idx_list: List[int], + semantic_tokens_list: List[torch.Tensor], + batch_phones: List[torch.Tensor], + speed: float = 1.0, + sample_steps: int = 32, + ) -> List[torch.Tensor]: prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device) prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device) refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device) - fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) - ref_audio:torch.Tensor = self.prompt_cache["raw_audio"] + fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) + ref_audio: torch.Tensor = self.prompt_cache["raw_audio"] ref_sr = self.prompt_cache["raw_sr"] - ref_audio=ref_audio.to(self.configs.device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(self.configs.device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if ref_sr!=24000: - ref_audio=resample(ref_audio, ref_sr, self.configs.device) - mel2 = mel_fn(ref_audio) + # tgt_sr = self.vocoder_configs["sr"] + tgt_sr = 24000 if self.configs.version == "v3" else 32000 + if ref_sr != tgt_sr: + ref_audio = resample(ref_audio, ref_sr, tgt_sr, self.configs.device) + + mel2 = mel_fn(ref_audio) if self.configs.version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): - mel2 = mel2[:, :, -468:] - fea_ref = fea_ref[:, :, -468:] - T_min = 468 - chunk_len = 934 - T_min - - mel2=mel2.to(self.precision) + T_ref = self.vocoder_configs["T_ref"] 
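# Illustrative sketch of how the chunked CFM decode above rolls its reference window:
# the reference features/mel are clipped to the last T_ref frames, each pass adds
# T_chunk - T_ref new frames, and the tail of every chunk becomes the prompt for the
# next one. Shapes are hypothetical and the CFM call is replaced by a stub.
import torch

T_ref, T_chunk = 468, 934               # stand-ins for self.vocoder_configs["T_ref"/"T_chunk"]
fea_ref = torch.randn(1, 512, T_ref)    # clipped reference features
fea_todo = torch.randn(1, 512, 2000)    # features still to be synthesized
mel2 = torch.randn(1, 100, T_ref)       # reference mel, same length as fea_ref

def cfm_stub(fea, mel_prompt):
    # Stand-in for self.vits_model.cfm.inference: one mel frame per feature frame,
    # prompt frames included.
    return torch.randn(1, 100, fea.shape[2])

chunk_len = T_chunk - T_ref
chunks, idx = [], 0
while True:
    fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len]
    if fea_todo_chunk.shape[-1] == 0:
        break
    idx += chunk_len
    fea = torch.cat([fea_ref, fea_todo_chunk], dim=2)
    cfm_res = cfm_stub(fea, mel2)[:, :, mel2.shape[2] :]   # drop the prompt frames
    mel2 = cfm_res[:, :, -T_ref:]                          # new mel prompt = tail of this chunk
    fea_ref = fea_todo_chunk[:, :, -T_ref:]                # new feature prompt likewise
    chunks.append(cfm_res)

mel = torch.cat(chunks, dim=2)  # full mel, later denormalized and fed to the vocoder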
+ T_chunk = self.vocoder_configs["T_chunk"] + if T_min > T_ref: + mel2 = mel2[:, :, -T_ref:] + fea_ref = fea_ref[:, :, -T_ref:] + T_min = T_ref + chunk_len = T_chunk - T_min + mel2 = mel2.to(self.precision) # #### batched inference - overlapped_len = 12 + overlapped_len = self.vocoder_configs["overlapped_len"] feat_chunks = [] feat_lens = [] feat_list = [] for i, idx in enumerate(idx_list): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) - semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次 + semantic_tokens = ( + semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) + ) # .unsqueeze(0)#mq要多unsqueeze一次 feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed) feat_list.append(feat) feat_lens.append(feat.shape[2]) feats = torch.cat(feat_list, 2) - feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0) + feats_padded = F.pad(feats, (overlapped_len, 0), "constant", 0) pos = 0 padding_len = 0 while True: - if pos ==0: - chunk = feats_padded[:, :, pos:pos + chunk_len] + if pos == 0: + chunk = feats_padded[:, :, pos : pos + chunk_len] else: pos = pos - overlapped_len - chunk = feats_padded[:, :, pos:pos + chunk_len] + chunk = feats_padded[:, :, pos : pos + chunk_len] pos += chunk_len - if (chunk.shape[-1] == 0): break + if chunk.shape[-1] == 0: + break # padding for the last chunk padding_len = chunk_len - chunk.shape[2] if padding_len != 0: - chunk = F.pad(chunk, (0,padding_len), "constant", 0) + chunk = F.pad(chunk, (0, padding_len), "constant", 0) feat_chunks.append(chunk) - - feat_chunks = torch.cat(feat_chunks, 0) bs = feat_chunks.shape[0] - fea_ref = fea_ref.repeat(bs,1,1) + fea_ref = fea_ref.repeat(bs, 1, 1) fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1) - pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) + pred_spec = self.vits_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) pred_spec = pred_spec[:, :, -chunk_len:] dd = pred_spec.shape[1] pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0) # pred_spec = pred_spec[..., :-padding_len] - pred_spec = denorm_spec(pred_spec) - - with torch.no_grad(): - wav_gen = self.bigvgan_model(pred_spec) - audio = wav_gen[0][0]#.cpu().detach().numpy() + with torch.no_grad(): + wav_gen = self.vocoder(pred_spec) + audio = wav_gen[0][0] # .cpu().detach().numpy() audio_fragments = [] - upsample_rate = 256 + upsample_rate = self.vocoder_configs["upsample_rate"] pos = 0 while pos < audio.shape[-1]: - audio_fragment = audio[pos:pos+chunk_len*upsample_rate] + audio_fragment = audio[pos : pos + chunk_len * upsample_rate] audio_fragments.append(audio_fragment) - pos += chunk_len*upsample_rate + pos += chunk_len * upsample_rate - audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate) - audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate] + audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate) + audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate] audio_fragments = [] for feat_len in feat_lens: - audio_fragment = audio[:feat_len*upsample_rate] + audio_fragment = audio[: feat_len * upsample_rate] audio_fragments.append(audio_fragment) - audio = audio[feat_len*upsample_rate:] - + audio = audio[feat_len * upsample_rate :] return audio_fragments - - - def sola_algorithm(self, - 
audio_fragments:List[torch.Tensor], - overlap_len:int, - ): - - for i in range(len(audio_fragments)-1): + def sola_algorithm( + self, + audio_fragments: List[torch.Tensor], + overlap_len: int, + ): + for i in range(len(audio_fragments) - 1): f1 = audio_fragments[i] - f2 = audio_fragments[i+1] + f2 = audio_fragments[i + 1] w1 = f1[-overlap_len:] w2 = f2[:overlap_len] assert w1.shape == w2.shape - corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1] + corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1] idx = corr.argmax() - f1_ = f1[:-(overlap_len-idx)] + f1_ = f1[: -(overlap_len - idx)] audio_fragments[i] = f1_ f2_ = f2[idx:] - window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype) - f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):] - audio_fragments[i+1] = f2_ - + window = torch.hann_window((overlap_len - idx) * 2, device=f1.device, dtype=f1.dtype) + f2_[: (overlap_len - idx)] = ( + window[: (overlap_len - idx)] * f2_[: (overlap_len - idx)] + + window[(overlap_len - idx) :] * f1[-(overlap_len - idx) :] + ) + audio_fragments[i + 1] = f2_ return torch.cat(audio_fragments, 0) - - - diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 0ebe553d..426929f8 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -1,8 +1,9 @@ - -import os, sys +import os +import sys import threading from tqdm import tqdm + now_dir = os.getcwd() sys.path.append(now_dir) @@ -18,17 +19,19 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_ from tools.i18n.i18n import I18nAuto, scan_language_list -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) -punctuation = set(['!', '?', '…', ',', '.', '-']) +punctuation = set(["!", "?", "…", ",", ".", "-"]) -def get_first(text:str) -> str: + +def get_first(text: str) -> str: pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]" text = re.split(pattern, text)[0].strip() return text -def merge_short_text_in_array(texts:str, threshold:int) -> list: + +def merge_short_text_in_array(texts: str, threshold: int) -> list: if (len(texts)) < 2: return texts result = [] @@ -38,7 +41,7 @@ def merge_short_text_in_array(texts:str, threshold:int) -> list: if len(text) >= threshold: result.append(text) text = "" - if (len(text) > 0): + if len(text) > 0: if len(result) == 0: result.append(text) else: @@ -46,28 +49,24 @@ def merge_short_text_in_array(texts:str, threshold:int) -> list: return result - - - class TextPreprocessor: - def __init__(self, bert_model:AutoModelForMaskedLM, - tokenizer:AutoTokenizer, device:torch.device): + def __init__(self, bert_model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, device: torch.device): self.bert_model = bert_model self.tokenizer = tokenizer self.device = device self.bert_lock = threading.RLock() - def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v2")->List[Dict]: - print(f'############ {i18n("切分文本")} ############') + def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]: + print(f"############ {i18n('切分文本')} 
############") text = self.replace_consecutive_punctuation(text) texts = self.pre_seg_text(text, lang, text_split_method) result = [] - print(f'############ {i18n("提取文本Bert特征")} ############') + print(f"############ {i18n('提取文本Bert特征')} ############") for text in tqdm(texts): phones, bert_features, norm_text = self.segment_and_extract_feature_for_text(text, lang, version) - if phones is None or norm_text=="": + if phones is None or norm_text == "": continue - res={ + res = { "phones": phones, "bert_features": bert_features, "norm_text": norm_text, @@ -75,11 +74,11 @@ class TextPreprocessor: result.append(res) return result - def pre_seg_text(self, text:str, lang:str, text_split_method:str): + def pre_seg_text(self, text: str, lang: str, text_split_method: str): text = text.strip("\n") if len(text) == 0: return [] - if (text[0] not in splits and len(get_first(text)) < 4): + if text[0] not in splits and len(get_first(text)) < 4: text = "。" + text if lang != "en" else "." + text print(i18n("实际输入的目标文本:")) print(text) @@ -95,18 +94,18 @@ class TextPreprocessor: _texts = merge_short_text_in_array(_texts, 5) texts = [] - for text in _texts: # 解决输入目标文本的空行导致报错的问题 - if (len(text.strip()) == 0): - continue + if len(text.strip()) == 0: + continue if not re.sub("\W+", "", text): # 检测一下,如果是纯符号,就跳过。 continue - if (text[-1] not in splits): text += "。" if lang != "en" else "." + if text[-1] not in splits: + text += "。" if lang != "en" else "." # 解决句子过长导致Bert报错的问题 - if (len(text) > 510): + if len(text) > 510: texts.extend(split_big_text(text)) else: texts.append(text) @@ -115,78 +114,79 @@ class TextPreprocessor: print(texts) return texts - def segment_and_extract_feature_for_text(self, text:str, language:str, version:str="v1")->Tuple[list, torch.Tensor, str]: + def segment_and_extract_feature_for_text( + self, text: str, language: str, version: str = "v1" + ) -> Tuple[list, torch.Tensor, str]: return self.get_phones_and_bert(text, language, version) - def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False): + def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False): with self.bert_lock: - if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: - # language = language.replace("all_","") - formattext = text - while " " in formattext: - formattext = formattext.replace(" ", " ") - if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return self.get_phones_and_bert(formattext,"zh",version) - else: - phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) - bert = self.get_bert_feature(norm_text, word2ph).to(self.device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return self.get_phones_and_bert(formattext,"yue",version) - else: - phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) - bert = torch.zeros( - (1024, len(phones)), - dtype=torch.float32, - ).to(self.device) - elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] - if language == "auto": - for tmp in LangSegmenter.getTexts(text): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "auto_yue": - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "zh": - tmp["lang"] 
= "yue" - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - else: - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "en": - langlist.append(tmp["lang"]) - else: - # 因无法区别中日韩文汉字,以用户输入为准 - langlist.append(language) - textlist.append(tmp["text"]) - # print(textlist) - # print(langlist) - phones_list = [] - bert_list = [] - norm_text_list = [] - for i in range(len(textlist)): - lang = langlist[i] - phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version) - bert = self.get_bert_inf(phones, word2ph, norm_text, lang) - phones_list.append(phones) - norm_text_list.append(norm_text) - bert_list.append(bert) - bert = torch.cat(bert_list, dim=1) - phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: + # language = language.replace("all_","") + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + if language == "all_zh": + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return self.get_phones_and_bert(formattext, "zh", version) + else: + phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) + bert = self.get_bert_feature(norm_text, word2ph).to(self.device) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return self.get_phones_and_bert(formattext, "yue", version) + else: + phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float32, + ).to(self.device) + elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: + textlist = [] + langlist = [] + if language == "auto": + for tmp in LangSegmenter.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + elif language == "auto_yue": + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "zh": + tmp["lang"] = "yue" + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日韩文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + # print(textlist) + # print(langlist) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version) + bert = self.get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = "".join(norm_text_list) - if not final and len(phones) < 6: - return self.get_phones_and_bert("." + text,language,version,final=True) + if not final and len(phones) < 6: + return self.get_phones_and_bert("." 
+ text, language, version, final=True) - return phones, bert, norm_text + return phones, bert, norm_text - - def get_bert_feature(self, text:str, word2ph:list)->torch.Tensor: + def get_bert_feature(self, text: str, word2ph: list) -> torch.Tensor: with torch.no_grad(): inputs = self.tokenizer(text, return_tensors="pt") for i in inputs: @@ -201,14 +201,14 @@ class TextPreprocessor: phone_level_feature = torch.cat(phone_level_feature, dim=0) return phone_level_feature.T - def clean_text_inf(self, text:str, language:str, version:str="v2"): - language = language.replace("all_","") + def clean_text_inf(self, text: str, language: str, version: str = "v2"): + language = language.replace("all_", "") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text - def get_bert_inf(self, phones:list, word2ph:list, norm_text:str, language:str): - language=language.replace("all_","") + def get_bert_inf(self, phones: list, word2ph: list, norm_text: str, language: str): + language = language.replace("all_", "") if language == "zh": feature = self.get_bert_feature(norm_text, word2ph).to(self.device) else: @@ -219,21 +219,19 @@ class TextPreprocessor: return feature - - def filter_text(self,texts): - _text=[] - if all(text in [None, " ", "\n",""] for text in texts): + def filter_text(self, texts): + _text = [] + if all(text in [None, " ", "\n", ""] for text in texts): raise ValueError(i18n("请输入有效文本")) for text in texts: - if text in [None, " ", ""]: + if text in [None, " ", ""]: pass else: _text.append(text) return _text - - def replace_consecutive_punctuation(self,text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + def replace_consecutive_punctuation(self, text): + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result diff --git a/GPT_SoVITS/TTS_infer_pack/__init__.py b/GPT_SoVITS/TTS_infer_pack/__init__.py index 74381982..8579a632 100644 --- a/GPT_SoVITS/TTS_infer_pack/__init__.py +++ b/GPT_SoVITS/TTS_infer_pack/__init__.py @@ -1 +1 @@ -from . import TTS, text_segmentation_method \ No newline at end of file +from . 
import TTS, text_segmentation_method diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py index 4ee0cfb4..fda70a49 100644 --- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py +++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py @@ -1,41 +1,57 @@ - - - - import re from typing import Callable -punctuation = set(['!', '?', '…', ',', '.', '-'," "]) +punctuation = set(["!", "?", "…", ",", ".", "-", " "]) METHODS = dict() -def get_method(name:str)->Callable: + +def get_method(name: str) -> Callable: method = METHODS.get(name, None) if method is None: raise ValueError(f"Method {name} not found") return method -def get_method_names()->list: + +def get_method_names() -> list: return list(METHODS.keys()) + def register_method(name): def decorator(func): METHODS[name] = func return func + return decorator -splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } + +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} + def split_big_text(text, max_len=510): # 定义全角和半角标点符号 punctuation = "".join(splits) # 切割文本 - segments = re.split('([' + punctuation + '])', text) - + segments = re.split("([" + punctuation + "])", text) + # 初始化结果列表和当前片段 result = [] - current_segment = '' - + current_segment = "" + for segment in segments: # 如果当前片段加上新的片段长度超过max_len,就将当前片段加入结果列表,并重置当前片段 if len(current_segment + segment) > max_len: @@ -43,13 +59,12 @@ def split_big_text(text, max_len=510): current_segment = segment else: current_segment += segment - + # 将最后一个片段加入结果列表 if current_segment: result.append(current_segment) - - return result + return result def split(todo_text): @@ -90,7 +105,7 @@ def cut1(inp): if len(split_idx) > 1: opts = [] for idx in range(len(split_idx) - 1): - opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) + opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]])) else: opts = [inp] opts = [item for item in opts if not set(item).issubset(punctuation)] @@ -123,6 +138,7 @@ def cut2(inp): opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) + # 按中文句号。切 @register_method("cut3") def cut3(inp): @@ -131,26 +147,28 @@ def cut3(inp): opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) -#按英文句号.切 + +# 按英文句号.切 @register_method("cut4") def cut4(inp): inp = inp.strip("\n") - opts = re.split(r'(? 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): + if char == "." 
and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): items.append(char) else: items.append(char) @@ -166,8 +184,6 @@ def cut5(inp): return "\n".join(opt) - -if __name__ == '__main__': +if __name__ == "__main__": method = get_method("cut5") print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。")) - diff --git a/GPT_SoVITS/configs/tts_infer.yaml b/GPT_SoVITS/configs/tts_infer.yaml index 344aae4b..20c41a20 100644 --- a/GPT_SoVITS/configs/tts_infer.yaml +++ b/GPT_SoVITS/configs/tts_infer.yaml @@ -30,3 +30,11 @@ v3: t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt version: v3 vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth +v4: + bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large + cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base + device: cpu + is_half: false + t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt + version: v4 + vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth diff --git a/GPT_SoVITS/download.py b/GPT_SoVITS/download.py index e7c8c97b..fc4ead63 100644 --- a/GPT_SoVITS/download.py +++ b/GPT_SoVITS/download.py @@ -1,5 +1,13 @@ -import os, sys +import os +import sys + now_dir = os.getcwd() sys.path.insert(0, now_dir) from text.g2pw import G2PWPinyin -g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",v_to_u=False, neutral_tone_with_five=True) \ No newline at end of file + +g2pw = G2PWPinyin( + model_dir="GPT_SoVITS/text/G2PWModel", + model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + v_to_u=False, + neutral_tone_with_five=True, +) diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py index 3f2c2968..69817a37 100644 --- a/GPT_SoVITS/export_torch_script.py +++ b/GPT_SoVITS/export_torch_script.py @@ -3,7 +3,6 @@ import argparse from typing import Optional from my_utils import load_audio -from text import cleaned_text_to_sequence import torch import torchaudio @@ -33,7 +32,8 @@ default_config = { "EOS": 1024, } -def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: + +def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: config = dict_s1["config"] config["model"]["dropout"] = float(config["model"]["dropout"]) t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) @@ -41,6 +41,7 @@ def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: t2s_model = t2s_model.eval() return t2s_model + @torch.jit.script def logits_to_probs( logits, @@ -57,39 +58,35 @@ def logits_to_probs( if previous_tokens is not None and repetition_penalty != 1.0: previous_tokens = previous_tokens.long() score = torch.gather(logits, dim=1, index=previous_tokens) - score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty - ) + score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty) logits.scatter_(dim=1, index=previous_tokens, src=score) if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 - ) + cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1) sorted_indices_to_remove = cum_probs > top_p sorted_indices_to_remove[:, 0] = False # keep at least one option - indices_to_remove = sorted_indices_to_remove.scatter( - dim=1, index=sorted_indices, src=sorted_indices_to_remove - ) + indices_to_remove = 
sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) logits = logits.masked_fill(indices_to_remove, -float("Inf")) logits = logits / max(temperature, 1e-5) if top_k is not None: v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - pivot = v[: , -1].unsqueeze(-1) + pivot = v[:, -1].unsqueeze(-1) logits = torch.where(logits < pivot, -float("Inf"), logits) probs = torch.nn.functional.softmax(logits, dim=-1) return probs + @torch.jit.script -def multinomial_sample_one_no_sync(probs_sort): +def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization q = torch.randn_like(probs_sort) return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + @torch.jit.script def sample( logits, @@ -100,15 +97,20 @@ def sample( repetition_penalty: float = 1.0, ): probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty + logits=logits, + previous_tokens=previous_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, ) idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs @torch.jit.script -def spectrogram_torch(y:Tensor, n_fft:int, sampling_rate:int, hop_size:int, win_size:int, center:bool=False): - hann_window = torch.hann_window(win_size,device=y.device,dtype=y.dtype) +def spectrogram_torch(y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False): + hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype) y = torch.nn.functional.pad( y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), @@ -158,6 +160,7 @@ class DictToAttrRecursive(dict): except KeyError: raise AttributeError(f"Attribute {item} not found") + @torch.jit.script class T2SMLP: def __init__(self, w1, b1, w2, b2): @@ -171,23 +174,24 @@ class T2SMLP: x = F.linear(x, self.w2, self.b2) return x + @torch.jit.script class T2SBlock: def __init__( - self, - num_heads: int, - hidden_dim: int, - mlp: T2SMLP, - qkv_w, - qkv_b, - out_w, - out_b, - norm_w1, - norm_b1, - norm_eps1: float, - norm_w2, - norm_b2, - norm_eps2: float, + self, + num_heads: int, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1: float, + norm_w2, + norm_b2, + norm_eps2: float, ): self.num_heads = num_heads self.mlp = mlp @@ -206,22 +210,22 @@ class T2SBlock: self.false = torch.tensor(False, dtype=torch.bool) @torch.jit.ignore - def to_mask(self, x:torch.Tensor, padding_mask:Optional[torch.Tensor]): + def to_mask(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor]): if padding_mask is None: return x - + if padding_mask.dtype == torch.bool: return x.masked_fill(padding_mask, 0) else: return x * padding_mask - - def process_prompt(self, x:torch.Tensor, attn_mask : torch.Tensor, padding_mask:Optional[torch.Tensor]=None): + + def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1) batch_size = q.shape[0] q_len = q.shape[1] kv_len = k.shape[1] - + q = self.to_mask(q, padding_mask) k_cache = self.to_mask(k, padding_mask) v_cache = self.to_mask(v, padding_mask) @@ -232,22 +236,20 @@ class T2SBlock: attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask) - attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim) 
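# Illustrative sketch of the KV-cache pattern used by T2SBlock's process_prompt and
# decode_next_token: the prompt pass attends over the whole prompt once and returns its
# K/V projections as a cache; each decode step projects only the newest token, appends
# it to the cache, and attends over everything seen so far. Single head, no masking;
# shapes and module names are hypothetical.
import torch
import torch.nn.functional as F

hidden = 512
qkv = torch.nn.Linear(hidden, 3 * hidden)

def process_prompt(x):
    # x: [batch, prompt_len, hidden]
    q, k, v = qkv(x).chunk(3, dim=-1)
    out = F.scaled_dot_product_attention(q, k, v)
    return out, k, v

def decode_next_token(x_step, k_cache, v_cache):
    # x_step: [batch, 1, hidden]
    q, k, v = qkv(x_step).chunk(3, dim=-1)
    k_cache = torch.cat([k_cache, k], dim=1)
    v_cache = torch.cat([v_cache, v], dim=1)
    out = F.scaled_dot_product_attention(q, k_cache, v_cache)
    return out, k_cache, v_cache

x = torch.randn(1, 10, hidden)              # prompt of 10 positions
y, k_cache, v_cache = process_prompt(x)
step = torch.randn(1, 1, hidden)            # one new token embedding
y_step, k_cache, v_cache = decode_next_token(step, k_cache, v_cache)
print(y.shape, y_step.shape, k_cache.shape)  # [1, 10, 512], [1, 1, 512], [1, 11, 512]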
+ attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b) if padding_mask is not None: for i in range(batch_size): # mask = padding_mask[i,:,0] - if self.false.device!= padding_mask.device: + if self.false.device != padding_mask.device: self.false = self.false.to(padding_mask.device) - idx = torch.where(padding_mask[i,:,0]==self.false)[0] - x_item = x[i,idx,:].unsqueeze(0) - attn_item = attn[i,idx,:].unsqueeze(0) + idx = torch.where(padding_mask[i, :, 0] == self.false)[0] + x_item = x[i, idx, :].unsqueeze(0) + attn_item = attn[i, idx, :].unsqueeze(0) x_item = x_item + attn_item - x_item = F.layer_norm( - x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x_item = F.layer_norm(x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x_item = x_item + self.mlp.forward(x_item) x_item = F.layer_norm( x_item, @@ -256,13 +258,11 @@ class T2SBlock: self.norm_b2, self.norm_eps2, ) - x[i,idx,:] = x_item.squeeze(0) + x[i, idx, :] = x_item.squeeze(0) x = self.to_mask(x, padding_mask) else: x = x + attn - x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x = x + self.mlp.forward(x) x = F.layer_norm( x, @@ -272,13 +272,13 @@ class T2SBlock: self.norm_eps2, ) return x, k_cache, v_cache - - def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor): + + def decode_next_token(self, x: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor): q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) k_cache = torch.cat([k_cache, k], dim=1) v_cache = torch.cat([v_cache, v], dim=1) - + batch_size = q.shape[0] q_len = q.shape[1] kv_len = k_cache.shape[1] @@ -289,14 +289,12 @@ class T2SBlock: attn = F.scaled_dot_product_attention(q, k, v) - attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim) + attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) attn = F.linear(attn, self.out_w, self.out_b) x = x + attn - x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x = x + self.mlp.forward(x) x = F.layer_norm( x, @@ -307,48 +305,46 @@ class T2SBlock: ) return x, k_cache, v_cache + @torch.jit.script class T2STransformer: - def __init__(self, num_blocks : int, blocks: list[T2SBlock]): - self.num_blocks : int = num_blocks + def __init__(self, num_blocks: int, blocks: list[T2SBlock]): + self.num_blocks: int = num_blocks self.blocks = blocks - def process_prompt( - self, x:torch.Tensor, attn_mask : torch.Tensor,padding_mask : Optional[torch.Tensor]=None): - k_cache : list[torch.Tensor] = [] - v_cache : list[torch.Tensor] = [] + def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + k_cache: list[torch.Tensor] = [] + v_cache: list[torch.Tensor] = [] for i in range(self.num_blocks): x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask) k_cache.append(k_cache_) v_cache.append(v_cache_) return x, k_cache, v_cache - def decode_next_token( - self, x:torch.Tensor, - k_cache: list[torch.Tensor], - v_cache: list[torch.Tensor]): + def 
decode_next_token(self, x: torch.Tensor, k_cache: list[torch.Tensor], v_cache: list[torch.Tensor]): for i in range(self.num_blocks): x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i]) return x, k_cache, v_cache + class VitsModel(nn.Module): def __init__(self, vits_path): super().__init__() # dict_s2 = torch.load(vits_path,map_location="cpu") dict_s2 = torch.load(vits_path) self.hps = dict_s2["config"] - if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: self.hps["model"]["version"] = "v1" else: self.hps["model"]["version"] = "v2" - + self.hps = DictToAttrRecursive(self.hps) self.hps.model.semantic_frame_rate = "25hz" self.vq_model = SynthesizerTrn( self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, - **self.hps.model + **self.hps.model, ) self.vq_model.eval() self.vq_model.load_state_dict(dict_s2["weight"], strict=False) @@ -360,12 +356,13 @@ class VitsModel(nn.Module): self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length, - center=False + center=False, ) return self.vq_model(pred_semantic, text_seq, refer, speed)[0, 0] + class T2SModel(nn.Module): - def __init__(self,raw_t2s:Text2SemanticLightningModule): + def __init__(self, raw_t2s: Text2SemanticLightningModule): super(T2SModel, self).__init__() self.model_dim = raw_t2s.model.model_dim self.embedding_dim = raw_t2s.model.embedding_dim @@ -374,7 +371,7 @@ class T2SModel(nn.Module): self.vocab_size = raw_t2s.model.vocab_size self.phoneme_vocab_size = raw_t2s.model.phoneme_vocab_size # self.p_dropout = float(raw_t2s.model.p_dropout) - self.EOS:int = int(raw_t2s.model.EOS) + self.EOS: int = int(raw_t2s.model.EOS) self.norm_first = raw_t2s.model.norm_first assert self.EOS == self.vocab_size - 1 self.hz = 50 @@ -384,7 +381,7 @@ class T2SModel(nn.Module): self.ar_text_position = raw_t2s.model.ar_text_position self.ar_audio_embedding = raw_t2s.model.ar_audio_embedding self.ar_audio_position = raw_t2s.model.ar_audio_position - + # self.t2s_transformer = T2STransformer(self.num_layers, blocks) # self.t2s_transformer = raw_t2s.model.t2s_transformer @@ -393,12 +390,7 @@ class T2SModel(nn.Module): for i in range(self.num_layers): layer = h.layers[i] - t2smlp = T2SMLP( - layer.linear1.weight, - layer.linear1.bias, - layer.linear2.weight, - layer.linear2.bias - ) + t2smlp = T2SMLP(layer.linear1.weight, layer.linear1.bias, layer.linear2.weight, layer.linear2.bias) block = T2SBlock( self.num_head, @@ -413,11 +405,11 @@ class T2SModel(nn.Module): layer.norm1.eps, layer.norm2.weight, layer.norm2.bias, - layer.norm2.eps + layer.norm2.eps, ) blocks.append(block) - + self.t2s_transformer = T2STransformer(self.num_layers, blocks) # self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) @@ -426,20 +418,27 @@ class T2SModel(nn.Module): self.max_sec = raw_t2s.config["data"]["max_sec"] self.top_k = int(raw_t2s.config["inference"]["top_k"]) self.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) - - def forward(self,prompts:LongTensor, ref_seq:LongTensor, text_seq:LongTensor, ref_bert:torch.Tensor, text_bert:torch.Tensor,top_k:LongTensor): + + def forward( + self, + prompts: LongTensor, + ref_seq: LongTensor, + text_seq: LongTensor, + ref_bert: torch.Tensor, + text_bert: torch.Tensor, + top_k: LongTensor, + ): bert = torch.cat([ref_bert.T, text_bert.T], 1) all_phoneme_ids = torch.cat([ref_seq, 
text_seq], 1) bert = bert.unsqueeze(0) x = self.ar_text_embedding(all_phoneme_ids) x = x + self.bert_proj(bert.transpose(1, 2)) - x:torch.Tensor = self.ar_text_position(x) + x: torch.Tensor = self.ar_text_position(x) early_stop_num = self.early_stop_num - - #[1,N,512] [1,N] + # [1,N,512] [1,N] # y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) y = prompts # x_example = x[:,:,0] * 0.0 @@ -465,15 +464,17 @@ class T2SModel(nn.Module): (x_len, 0), value=False, ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)\ - .unsqueeze(0)\ - .expand(bsz*self.num_head, -1, -1)\ - .view(bsz, self.num_head, src_len, src_len)\ - .to(device=x.device, dtype=torch.bool) - + xy_attn_mask = ( + torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + .unsqueeze(0) + .expand(bsz * self.num_head, -1, -1) + .view(bsz, self.num_head, src_len, src_len) + .to(device=x.device, dtype=torch.bool) + ) + idx = 0 top_k = int(top_k) - + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) logits = self.ar_predict_layer(xy_dec[:, -1]) @@ -481,23 +482,25 @@ class T2SModel(nn.Module): samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] y = torch.concat([y, samples], dim=1) y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) stop = False # for idx in range(1, 50): for idx in range(1, 1500): - #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] # y, k, v, y_emb, logits, samples = self.stage_decoder(y, k, v, y_emb, x_example) xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) logits = self.ar_predict_layer(xy_dec[:, -1]) - if(idx<11):###至少预测出10个token不然不给停止(0.4s) + if idx < 11: ###至少预测出10个token不然不给停止(0.4s) logits = logits[:, :-1] - + samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] y = torch.concat([y, samples], dim=1) - + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: stop = True if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: @@ -508,20 +511,22 @@ class T2SModel(nn.Module): break y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) - - y[0,-1] = 0 - + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) + + y[0, -1] = 0 + return y[:, -idx:].unsqueeze(0) -bert_path = os.environ.get( - "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" -) + +bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" cnhubert.cnhubert_base_path = cnhubert_base_path + @torch.jit.script -def build_phone_level_feature(res:Tensor, word2ph:IntTensor): +def build_phone_level_feature(res: Tensor, word2ph: IntTensor): phone_level_feature = [] for i in 
range(word2ph.shape[0]): repeat_feature = res[i].repeat(word2ph[i].item(), 1) @@ -530,103 +535,111 @@ def build_phone_level_feature(res:Tensor, word2ph:IntTensor): # [sum(word2ph), 1024] return phone_level_feature + class MyBertModel(torch.nn.Module): def __init__(self, bert_model): super(MyBertModel, self).__init__() self.bert = bert_model - def forward(self, input_ids:torch.Tensor, attention_mask:torch.Tensor, token_type_ids:torch.Tensor, word2ph:IntTensor): + def forward( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor, token_type_ids: torch.Tensor, word2ph: IntTensor + ): outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) # res = torch.cat(outputs["hidden_states"][-3:-2], -1)[0][1:-1] res = torch.cat(outputs[1][-3:-2], -1)[0][1:-1] return build_phone_level_feature(res, word2ph) + class SSLModel(torch.nn.Module): def __init__(self): super().__init__() self.ssl = cnhubert.get_model().model - def forward(self, ref_audio_16k)-> torch.Tensor: + def forward(self, ref_audio_16k) -> torch.Tensor: ssl_content = self.ssl(ref_audio_16k)["last_hidden_state"].transpose(1, 2) return ssl_content + class ExportSSLModel(torch.nn.Module): - def __init__(self,ssl:SSLModel): + def __init__(self, ssl: SSLModel): super().__init__() self.ssl = ssl - def forward(self, ref_audio:torch.Tensor): + def forward(self, ref_audio: torch.Tensor): return self.ssl(ref_audio) - + @torch.jit.export - def resample(self,ref_audio:torch.Tensor,src_sr:int,dst_sr:int)->torch.Tensor: - audio = resamplex(ref_audio,src_sr,dst_sr).float() + def resample(self, ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torch.Tensor: + audio = resamplex(ref_audio, src_sr, dst_sr).float() return audio + def export_bert(output_path): tokenizer = AutoTokenizer.from_pretrained(bert_path) - + text = "叹息声一声接着一声传出,木兰对着房门织布.听不见织布机织布的声音,只听见木兰在叹息.问木兰在想什么?问木兰在惦记什么?木兰答道,我也没有在想什么,也没有在惦记什么." 
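# Illustrative sketch of what build_phone_level_feature above computes: each
# character-level BERT vector is repeated word2ph[i] times so the feature sequence
# lines up with the phone sequence. The tensors below are hypothetical.
import torch

res = torch.randn(3, 1024)                            # per-character BERT features
word2ph = torch.tensor([2, 2, 1], dtype=torch.int32)  # e.g. two hanzi + one punctuation mark

phone_level = torch.cat(
    [res[i].repeat(int(word2ph[i].item()), 1) for i in range(word2ph.shape[0])], dim=0
)
print(phone_level.shape)  # torch.Size([5, 1024]) == (sum(word2ph), 1024)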
ref_bert_inputs = tokenizer(text, return_tensors="pt") word2ph = [] for c in text: - if c in [',','。',':','?',",",".","?"]: + if c in [",", "。", ":", "?", ",", ".", "?"]: word2ph.append(1) else: word2ph.append(2) - ref_bert_inputs['word2ph'] = torch.Tensor(word2ph).int() + ref_bert_inputs["word2ph"] = torch.Tensor(word2ph).int() - bert_model = AutoModelForMaskedLM.from_pretrained(bert_path,output_hidden_states=True,torchscript=True) + bert_model = AutoModelForMaskedLM.from_pretrained(bert_path, output_hidden_states=True, torchscript=True) my_bert_model = MyBertModel(bert_model) ref_bert_inputs = { - 'input_ids': ref_bert_inputs['input_ids'], - 'attention_mask': ref_bert_inputs['attention_mask'], - 'token_type_ids': ref_bert_inputs['token_type_ids'], - 'word2ph': ref_bert_inputs['word2ph'] + "input_ids": ref_bert_inputs["input_ids"], + "attention_mask": ref_bert_inputs["attention_mask"], + "token_type_ids": ref_bert_inputs["token_type_ids"], + "word2ph": ref_bert_inputs["word2ph"], } - torch._dynamo.mark_dynamic(ref_bert_inputs['input_ids'], 1) - torch._dynamo.mark_dynamic(ref_bert_inputs['attention_mask'], 1) - torch._dynamo.mark_dynamic(ref_bert_inputs['token_type_ids'], 1) - torch._dynamo.mark_dynamic(ref_bert_inputs['word2ph'], 0) + torch._dynamo.mark_dynamic(ref_bert_inputs["input_ids"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["attention_mask"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["token_type_ids"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["word2ph"], 0) - my_bert_model = torch.jit.trace(my_bert_model,example_kwarg_inputs=ref_bert_inputs) + my_bert_model = torch.jit.trace(my_bert_model, example_kwarg_inputs=ref_bert_inputs) output_path = os.path.join(output_path, "bert_model.pt") my_bert_model.save(output_path) - print('#### exported bert ####') + print("#### exported bert ####") -def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_bert_and_ssl=False, device='cpu'): + +def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_bert_and_ssl=False, device="cpu"): if not os.path.exists(output_path): os.makedirs(output_path) print(f"目录已创建: {output_path}") else: print(f"目录已存在: {output_path}") - + ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float() ssl = SSLModel() if export_bert_and_ssl: - s = ExportSSLModel(torch.jit.trace(ssl,example_inputs=(ref_audio))) + s = ExportSSLModel(torch.jit.trace(ssl, example_inputs=(ref_audio))) ssl_path = os.path.join(output_path, "ssl_model.pt") torch.jit.script(s).save(ssl_path) - print('#### exported ssl ####') + print("#### exported ssl ####") export_bert(output_path) else: s = ExportSSLModel(ssl) print(f"device: {device}") - - ref_seq_id,ref_bert_T,ref_norm_text = get_phones_and_bert(ref_text,"all_zh",'v2') + ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2") ref_seq = torch.LongTensor([ref_seq_id]).to(device) ref_bert = ref_bert_T.T.to(ref_seq.device) - text_seq_id,text_bert_T,norm_text = get_phones_and_bert("这是一条测试语音,说什么无所谓,只是给它一个例子","all_zh",'v2') + text_seq_id, text_bert_T, norm_text = get_phones_and_bert( + "这是一条测试语音,说什么无所谓,只是给它一个例子", "all_zh", "v2" + ) text_seq = torch.LongTensor([text_seq_id]).to(device) text_bert = text_bert_T.T.to(text_seq.device) ssl_content = ssl(ref_audio).to(device) - # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" + # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" vits = VitsModel(vits_path).to(device) vits.eval() @@ -634,18 +647,18 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, 
output_path, export_be # dict_s1 = torch.load(gpt_path, map_location=device) dict_s1 = torch.load(gpt_path) raw_t2s = get_raw_t2s_model(dict_s1).to(device) - print('#### get_raw_t2s_model ####') + print("#### get_raw_t2s_model ####") print(raw_t2s.config) t2s_m = T2SModel(raw_t2s) t2s_m.eval() t2s = torch.jit.script(t2s_m).to(device) - print('#### script t2s_m ####') - - print("vits.hps.data.sampling_rate:",vits.hps.data.sampling_rate) - gpt_sovits = GPT_SoVITS(t2s,vits).to(device) + print("#### script t2s_m ####") + + print("vits.hps.data.sampling_rate:", vits.hps.data.sampling_rate) + gpt_sovits = GPT_SoVITS(t2s, vits).to(device) gpt_sovits.eval() - - ref_audio_sr = s.resample(ref_audio,16000,32000).to(device) + + ref_audio_sr = s.resample(ref_audio, 16000, 32000).to(device) torch._dynamo.mark_dynamic(ssl_content, 2) torch._dynamo.mark_dynamic(ref_audio_sr, 1) @@ -658,32 +671,28 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be with torch.no_grad(): gpt_sovits_export = torch.jit.trace( - gpt_sovits, - example_inputs=( - ssl_content, - ref_audio_sr, - ref_seq, - text_seq, - ref_bert, - text_bert, - top_k)) - + gpt_sovits, example_inputs=(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, text_bert, top_k) + ) + gpt_sovits_path = os.path.join(output_path, "gpt_sovits_model.pt") gpt_sovits_export.save(gpt_sovits_path) - print('#### exported gpt_sovits ####') + print("#### exported gpt_sovits ####") + @torch.jit.script def parse_audio(ref_audio): - ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float()#.to(ref_audio.device) - ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,32000).float()#.to(ref_audio.device) - return ref_audio_16k,ref_audio_sr + ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() # .to(ref_audio.device) + ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, 32000).float() # .to(ref_audio.device) + return ref_audio_16k, ref_audio_sr + @torch.jit.script -def resamplex(ref_audio:torch.Tensor,src_sr:int,dst_sr:int)->torch.Tensor: - return torchaudio.functional.resample(ref_audio,src_sr,dst_sr).float() +def resamplex(ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torch.Tensor: + return torchaudio.functional.resample(ref_audio, src_sr, dst_sr).float() + class GPT_SoVITS(nn.Module): - def __init__(self, t2s:T2SModel,vits:VitsModel): + def __init__(self, t2s: T2SModel, vits: VitsModel): super().__init__() self.t2s = t2s self.vits = vits @@ -710,12 +719,11 @@ class GPT_SoVITS(nn.Module): def test(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--output_path', required=True, help="Path to the output directory") - + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument("--output_path", required=True, help="Path to the output 
directory") args = parser.parse_args() gpt_path = args.gpt_model @@ -726,7 +734,7 @@ def test(): tokenizer = AutoTokenizer.from_pretrained(bert_path) # bert_model = AutoModelForMaskedLM.from_pretrained(bert_path,output_hidden_states=True,torchscript=True) # bert = MyBertModel(bert_model) - my_bert = torch.jit.load("onnx/bert_model.pt",map_location='cuda') + my_bert = torch.jit.load("onnx/bert_model.pt", map_location="cuda") # dict_s1 = torch.load(gpt_path, map_location="cuda") # raw_t2s = get_raw_t2s_model(dict_s1) @@ -740,95 +748,97 @@ def test(): # ssl = ExportSSLModel(SSLModel()).to('cuda') # ssl.eval() - ssl = torch.jit.load("onnx/by/ssl_model.pt",map_location='cuda') + ssl = torch.jit.load("onnx/by/ssl_model.pt", map_location="cuda") # gpt_sovits = GPT_SoVITS(t2s,vits) - gpt_sovits = torch.jit.load("onnx/by/gpt_sovits_model.pt",map_location='cuda') + gpt_sovits = torch.jit.load("onnx/by/gpt_sovits_model.pt", map_location="cuda") - ref_seq_id,ref_bert_T,ref_norm_text = get_phones_and_bert(ref_text,"all_zh",'v2') + ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2") ref_seq = torch.LongTensor([ref_seq_id]) ref_bert = ref_bert_T.T.to(ref_seq.device) # text_seq_id,text_bert_T,norm_text = get_phones_and_bert("昨天晚上看见征兵文书,知道君主在大规模征兵,那么多卷征兵文册,每一卷上都有父亲的名字.","all_zh",'v2') text = "昨天晚上看见征兵文书,知道君主在大规模征兵,那么多卷征兵文册,每一卷上都有父亲的名字." - text_seq_id,text_bert_T,norm_text = get_phones_and_bert(text,"all_zh",'v2') - + text_seq_id, text_bert_T, norm_text = get_phones_and_bert(text, "all_zh", "v2") + test_bert = tokenizer(text, return_tensors="pt") word2ph = [] for c in text: - if c in [',','。',':','?',"?",",","."]: + if c in [",", "。", ":", "?", "?", ",", "."]: word2ph.append(1) else: word2ph.append(2) - test_bert['word2ph'] = torch.Tensor(word2ph).int() + test_bert["word2ph"] = torch.Tensor(word2ph).int() test_bert = my_bert( - test_bert['input_ids'].to('cuda'), - test_bert['attention_mask'].to('cuda'), - test_bert['token_type_ids'].to('cuda'), - test_bert['word2ph'].to('cuda') + test_bert["input_ids"].to("cuda"), + test_bert["attention_mask"].to("cuda"), + test_bert["token_type_ids"].to("cuda"), + test_bert["word2ph"].to("cuda"), ) - + text_seq = torch.LongTensor([text_seq_id]) text_bert = text_bert_T.T.to(text_seq.device) - print('text_bert:',text_bert.shape,text_bert) - print('test_bert:',test_bert.shape,test_bert) - print(torch.allclose(text_bert.to('cuda'),test_bert)) + print("text_bert:", text_bert.shape, text_bert) + print("test_bert:", test_bert.shape, test_bert) + print(torch.allclose(text_bert.to("cuda"), test_bert)) - print('text_seq:',text_seq.shape) - print('text_bert:',text_bert.shape,text_bert.type()) + print("text_seq:", text_seq.shape) + print("text_bert:", text_bert.shape, text_bert.type()) - #[1,N] - ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float().to('cuda') - print('ref_audio:',ref_audio.shape) - - ref_audio_sr = ssl.resample(ref_audio,16000,32000) - print('start ssl') + # [1,N] + ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float().to("cuda") + print("ref_audio:", ref_audio.shape) + + ref_audio_sr = ssl.resample(ref_audio, 16000, 32000) + print("start ssl") ssl_content = ssl(ref_audio) - print('start gpt_sovits:') - print('ssl_content:',ssl_content.shape) - print('ref_audio_sr:',ref_audio_sr.shape) - print('ref_seq:',ref_seq.shape) - ref_seq=ref_seq.to('cuda') - print('text_seq:',text_seq.shape) - text_seq=text_seq.to('cuda') - print('ref_bert:',ref_bert.shape) - ref_bert=ref_bert.to('cuda') - 
print('text_bert:',text_bert.shape) - text_bert=text_bert.to('cuda') + print("start gpt_sovits:") + print("ssl_content:", ssl_content.shape) + print("ref_audio_sr:", ref_audio_sr.shape) + print("ref_seq:", ref_seq.shape) + ref_seq = ref_seq.to("cuda") + print("text_seq:", text_seq.shape) + text_seq = text_seq.to("cuda") + print("ref_bert:", ref_bert.shape) + ref_bert = ref_bert.to("cuda") + print("text_bert:", text_bert.shape) + text_bert = text_bert.to("cuda") - top_k = torch.LongTensor([5]).to('cuda') + top_k = torch.LongTensor([5]).to("cuda") with torch.no_grad(): audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert, top_k) - print('start write wav') + print("start write wav") soundfile.write("out.wav", audio.detach().cpu().numpy(), 32000) import text import json -def export_symbel(version='v2'): - if version=='v1': + +def export_symbel(version="v2"): + if version == "v1": symbols = text._symbol_to_id_v1 - with open(f"onnx/symbols_v1.json", "w") as file: + with open("onnx/symbols_v1.json", "w") as file: json.dump(symbols, file, indent=4) else: symbols = text._symbol_to_id_v2 - with open(f"onnx/symbols_v2.json", "w") as file: + with open("onnx/symbols_v2.json", "w") as file: json.dump(symbols, file, indent=4) + def main(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--output_path', required=True, help="Path to the output directory") - parser.add_argument('--export_common_model', action='store_true', help="Export Bert and SSL model") - parser.add_argument('--device', help="Device to use") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument("--output_path", required=True, help="Path to the output directory") + parser.add_argument("--export_common_model", action="store_true", help="Export Bert and SSL model") + parser.add_argument("--device", help="Device to use") args = parser.parse_args() export( @@ -841,9 +851,11 @@ def main(): export_bert_and_ssl=args.export_common_model, ) + import inference_webui + if __name__ == "__main__": - inference_webui.is_half=False - inference_webui.dtype=torch.float32 + inference_webui.is_half = False + inference_webui.dtype = torch.float32 main() # test() diff --git a/GPT_SoVITS/export_torch_script_v3.py b/GPT_SoVITS/export_torch_script_v3.py index 8b73d30e..b34495a7 100644 --- a/GPT_SoVITS/export_torch_script_v3.py +++ b/GPT_SoVITS/export_torch_script_v3.py @@ -6,16 +6,16 @@ from export_torch_script import ( spectrogram_torch, ) from f5_tts.model.backbones.dit import DiT -from feature_extractor import cnhubert from inference_webui import get_phones_and_bert import librosa from module import commons -from module.mel_processing import mel_spectrogram_torch, spectral_normalize_torch +from module.mel_processing import mel_spectrogram_torch from module.models_onnx import CFM, 
SynthesizerTrnV3 import numpy as np import torch._dynamo.config import torchaudio -import logging, uvicorn +import logging +import uvicorn import torch import soundfile from librosa.filters import mel as librosa_mel_fn @@ -32,7 +32,6 @@ now_dir = os.getcwd() class MelSpectrgram(torch.nn.Module): - def __init__( self, dtype, @@ -48,14 +47,12 @@ class MelSpectrgram(torch.nn.Module): ): super().__init__() self.hann_window = torch.hann_window(1024).to(device=device, dtype=dtype) - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device) - self.n_fft:int = n_fft - self.hop_size:int = hop_size - self.win_size:int = win_size - self.center:bool = center + self.n_fft: int = n_fft + self.hop_size: int = hop_size + self.win_size: int = win_size + self.center: bool = center def forward(self, y): y = torch.nn.functional.pad( @@ -172,9 +169,7 @@ class ExportCFM(torch.nn.Module): ): T_min = fea_ref.size(2) fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) - cfm_res = self.cfm( - fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps - ) + cfm_res = self.cfm(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps) cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] fea_ref = fea_todo_chunk[:, :, -T_min:] @@ -198,6 +193,7 @@ mel_fn = lambda x: mel_spectrogram_torch( spec_min = -12 spec_max = 2 + @torch.jit.script def norm_spec(x): spec_min = -12 @@ -212,7 +208,6 @@ def denorm_spec(x): class ExportGPTSovitsHalf(torch.nn.Module): - def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3): super().__init__() self.hps = hps @@ -231,15 +226,15 @@ class ExportGPTSovitsHalf(torch.nn.Module): center=False, ) # self.dtype = dtype - self.filter_length:int = hps.data.filter_length - self.sampling_rate:int = hps.data.sampling_rate - self.hop_length:int = hps.data.hop_length - self.win_length:int = hps.data.win_length + self.filter_length: int = hps.data.filter_length + self.sampling_rate: int = hps.data.sampling_rate + self.hop_length: int = hps.data.hop_length + self.win_length: int = hps.data.win_length def forward( self, ssl_content, - ref_audio_32k:torch.FloatTensor, + ref_audio_32k: torch.FloatTensor, phoneme_ids0, phoneme_ids1, bert1, @@ -255,21 +250,17 @@ class ExportGPTSovitsHalf(torch.nn.Module): center=False, ).to(ssl_content.dtype) - codes = self.vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0) # print('extract_latent',codes.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - pred_semantic = self.t2s_m( - prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k - ) + pred_semantic = self.t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) # print('t2s_m',pred_semantic.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - ge = self.vq_model.create_ge(refer) # print('create_ge',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - + prompt_ = prompt.unsqueeze(0) fea_ref = self.vq_model(prompt_, phoneme_ids0, ge) # print('fea_ref',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) @@ -293,6 +284,7 @@ class ExportGPTSovitsHalf(torch.nn.Module): return fea_ref, fea_todo, mel2 + class GPTSoVITSV3(torch.nn.Module): def __init__(self, gpt_sovits_half, cfm, bigvgan): super().__init__() @@ -303,9 +295,9 @@ class GPTSoVITSV3(torch.nn.Module): def forward( self, ssl_content, - 
ref_audio_32k:torch.FloatTensor, - phoneme_ids0:torch.LongTensor, - phoneme_ids1:torch.LongTensor, + ref_audio_32k: torch.FloatTensor, + phoneme_ids0: torch.LongTensor, + phoneme_ids1: torch.LongTensor, bert1, bert2, top_k: torch.LongTensor, @@ -313,7 +305,9 @@ class GPTSoVITSV3(torch.nn.Module): ): # current_time = datetime.now() # print("gpt_sovits_half",current_time.strftime("%Y-%m-%d %H:%M:%S")) - fea_ref, fea_todo, mel2 = self.gpt_sovits_half(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + fea_ref, fea_todo, mel2 = self.gpt_sovits_half( + ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) chunk_len = 934 - fea_ref.shape[2] wav_gen_list = [] idx = 0 @@ -331,7 +325,13 @@ class GPTSoVITSV3(torch.nn.Module): # 经过 bigvgan 之后音频长度就是 fea_todo.shape[2] * 256 complete_len = chunk_len - fea_todo_chunk.shape[-1] if complete_len != 0: - fea_todo_chunk = torch.cat([fea_todo_chunk, torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype)], 2) + fea_todo_chunk = torch.cat( + [ + fea_todo_chunk, + torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype), + ], + 2, + ) cfm_res, fea_ref, mel2 = self.cfm(fea_ref, fea_todo_chunk, mel2, sample_steps) idx += chunk_len @@ -339,17 +339,17 @@ class GPTSoVITSV3(torch.nn.Module): cfm_res = denorm_spec(cfm_res) bigvgan_res = self.bigvgan(cfm_res) wav_gen_list.append(bigvgan_res) - + wav_gen = torch.cat(wav_gen_list, 2) return wav_gen[0][0][:wav_gen_length] + def init_bigvgan(): global bigvgan_model from BigVGAN import bigvgan bigvgan_model = bigvgan.BigVGAN.from_pretrained( - "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" - % (now_dir,), + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False, ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode @@ -467,10 +467,7 @@ def export_cfm( cfm = e_cfm.cfm B, T = mu.size(0), mu.size(1) - x = ( - torch.randn([B, cfm.in_channels, T], device=mu.device, dtype=mu.dtype) - * temperature - ) + x = torch.randn([B, cfm.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature print("x:", x.shape, x.dtype) prompt_len = prompt.size(-1) prompt_x = torch.zeros_like(x, dtype=mu.dtype) @@ -565,11 +562,7 @@ def export(): wav16k = wav16k.to(device) zero_wav_torch = zero_wav_torch.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() codes = sovits.vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0).to(device) @@ -626,10 +619,7 @@ def export(): "create_ge": refer, } - - trace_vq_model = torch.jit.trace_module( - sovits.vq_model, inputs, optimize=True - ) + trace_vq_model = torch.jit.trace_module(sovits.vq_model, inputs, optimize=True) trace_vq_model.save("onnx/ad/vq_model.pt") print(fea_ref.shape, fea_ref.dtype, ge.shape) @@ -714,9 +704,7 @@ def export(): idx += chunk_len - cfm_res, fea_ref, mel2 = export_cfm_( - fea_ref, fea_todo_chunk, mel2, sample_steps - ) + cfm_res, fea_ref, mel2 = export_cfm_(fea_ref, fea_todo_chunk, mel2, sample_steps) cfm_resss.append(cfm_res) continue @@ -726,9 +714,7 @@ def export(): with torch.inference_mode(): cmf_res_rand = torch.randn(1, 100, 
934).to(device).to(dtype) torch._dynamo.mark_dynamic(cmf_res_rand, 2) - bigvgan_model_ = torch.jit.trace( - bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,) - ) + bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,)) bigvgan_model_.save("onnx/ad/bigvgan_model.pt") wav_gen = bigvgan_model(cmf_res) print("wav_gen:", wav_gen.shape, wav_gen.dtype) @@ -748,7 +734,6 @@ def test_export( bigvgan, output, ): - # hps = sovits.hps ref_wav_path = "onnx/ad/ref.wav" speed = 1.0 @@ -773,13 +758,9 @@ def test_export( wav16k = wav16k.to(device) zero_wav_torch = zero_wav_torch.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() - ref_audio_32k,_ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k, _ = librosa.load(ref_wav_path, sr=32000) ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() phones1, bert1, norm_text1 = get_phones_and_bert( @@ -799,8 +780,18 @@ def test_export( current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") logger.info("start inference %s", current_time) - print(ssl_content.shape, ref_audio_32k.shape, phoneme_ids0.shape, phoneme_ids1.shape, bert1.shape, bert2.shape, top_k.shape) - fea_ref, fea_todo, mel2 = gpt_sovits_v3_half(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + print( + ssl_content.shape, + ref_audio_32k.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) + fea_ref, fea_todo, mel2 = gpt_sovits_v3_half( + ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) chunk_len = 934 - fea_ref.shape[2] print(fea_ref.shape, fea_todo.shape, mel2.shape) @@ -812,7 +803,6 @@ def test_export( wav_gen_length = fea_todo.shape[2] * 256 while 1: - current_time = datetime.now() print("idx:", idx, current_time.strftime("%Y-%m-%d %H:%M:%S")) fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] @@ -861,7 +851,6 @@ def test_export1( gpt_sovits_v3, output, ): - # hps = sovits.hps ref_wav_path = "onnx/ad/ref.wav" speed = 1.0 @@ -886,14 +875,10 @@ def test_export1( wav16k = wav16k.to(device) zero_wav_torch = zero_wav_torch.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() print("ssl_content:", ssl_content.shape, ssl_content.dtype) - ref_audio_32k,_ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k, _ = librosa.load(ref_wav_path, sr=32000) ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() phones1, bert1, norm_text1 = get_phones_and_bert( @@ -913,11 +898,19 @@ def test_export1( current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") logger.info("start inference %s", current_time) - print(ssl_content.shape, ref_audio_32k.shape, phoneme_ids0.shape, phoneme_ids1.shape, bert1.shape, bert2.shape, top_k.shape) + print( + ssl_content.shape, + ref_audio_32k.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) wav_gen = gpt_sovits_v3(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps) print("wav_gen:", wav_gen.shape, wav_gen.dtype) - wav_gen = 
torch.cat([wav_gen,zero_wav_torch],0) + wav_gen = torch.cat([wav_gen, zero_wav_torch], 0) audio = wav_gen.cpu().detach().numpy() logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) @@ -929,20 +922,19 @@ import time def test_(): - sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") # cfm = ExportCFM(sovits.cfm) # cfm.cfm.estimator = dit sovits.cfm = None - + cfm = torch.jit.load("onnx/ad/cfm.pt", map_location=device) # cfm = torch.jit.optimize_for_inference(cfm) cfm = cfm.half().to(device) - + cfm.eval() - logger.info(f"cfm ok") + logger.info("cfm ok") dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") # v2 的 gpt 也可以用 @@ -957,17 +949,14 @@ def test_(): t2s_m = torch.jit.script(t2s_m) t2s_m.eval() # t2s_m.top_k = 15 - logger.info(f"t2s_m ok") + logger.info("t2s_m ok") - - vq_model: torch.jit.ScriptModule = torch.jit.load( - "onnx/ad/vq_model.pt", map_location=device - ) + vq_model: torch.jit.ScriptModule = torch.jit.load("onnx/ad/vq_model.pt", map_location=device) # vq_model = torch.jit.optimize_for_inference(vq_model) # vq_model = vq_model.half().to(device) vq_model.eval() # vq_model = sovits.vq_model - logger.info(f"vq_model ok") + logger.info("vq_model ok") # gpt_sovits_v3_half = torch.jit.load("onnx/ad/gpt_sovits_v3_half.pt") # gpt_sovits_v3_half = torch.jit.optimize_for_inference(gpt_sovits_v3_half) @@ -975,7 +964,7 @@ def test_(): # gpt_sovits_v3_half = gpt_sovits_v3_half.cuda() # gpt_sovits_v3_half.eval() gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model) - logger.info(f"gpt_sovits_v3_half ok") + logger.info("gpt_sovits_v3_half ok") # init_bigvgan() # global bigvgan_model @@ -985,7 +974,7 @@ def test_(): bigvgan_model = bigvgan_model.cuda() bigvgan_model.eval() - logger.info(f"bigvgan ok") + logger.info("bigvgan ok") gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model) gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3) @@ -1020,8 +1009,9 @@ def test_(): # "out2.wav", # ) + def test_export_gpt_sovits_v3(): - gpt_sovits_v3 = torch.jit.load("onnx/ad/gpt_sovits_v3.pt",map_location=device) + gpt_sovits_v3 = torch.jit.load("onnx/ad/gpt_sovits_v3.pt", map_location=device) # test_export1( # "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 
最后还是我得了 MVP....", # gpt_sovits_v3, diff --git a/GPT_SoVITS/f5_tts/model/backbones/dit.py b/GPT_SoVITS/f5_tts/model/backbones/dit.py index ac32fa50..7d98a858 100644 --- a/GPT_SoVITS/f5_tts/model/backbones/dit.py +++ b/GPT_SoVITS/f5_tts/model/backbones/dit.py @@ -11,7 +11,6 @@ from __future__ import annotations import torch from torch import nn -import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from x_transformers.x_transformers import RotaryEmbedding @@ -28,6 +27,7 @@ from GPT_SoVITS.f5_tts.model.modules import ( from module.commons import sequence_mask + class TextEmbedding(nn.Module): def __init__(self, text_dim, conv_layers=0, conv_mult=2): super().__init__() @@ -130,26 +130,24 @@ class DiT(nn.Module): return ckpt_forward - def forward(#x, prompt_x, x_lens, t, style,cond - self,#d is channel,n is T + def forward( # x, prompt_x, x_lens, t, style,cond + self, # d is channel,n is T x0: float["b n d"], # nosied input audio # noqa: F722 cond0: float["b n d"], # masked cond audio # noqa: F722 x_lens, time: float["b"] | float[""], # time step # noqa: F821 F722 - dt_base_bootstrap, + dt_base_bootstrap, text0, # : int["b nt"] # noqa: F722#####condition feature use_grad_ckpt=False, # bool ###no-use drop_audio_cond=False, # cfg for cond audio drop_text=False, # cfg for text # mask: bool["b n"] | None = None, # noqa: F722 - ): - - x=x0.transpose(2,1) - cond=cond0.transpose(2,1) - text=text0.transpose(2,1) - mask = sequence_mask(x_lens,max_length=x.size(1)).to(x.device) + x = x0.transpose(2, 1) + cond = cond0.transpose(2, 1) + text = text0.transpose(2, 1) + mask = sequence_mask(x_lens, max_length=x.size(1)).to(x.device) batch, seq_len = x.shape[0], x.shape[1] if time.ndim == 0: @@ -158,8 +156,8 @@ class DiT(nn.Module): # t: conditioning time, c: context (text + masked cond audio), x: noised input audio t = self.time_embed(time) dt = self.d_embed(dt_base_bootstrap) - t+=dt - text_embed = self.text_embed(text, seq_len, drop_text=drop_text)###need to change + t += dt + text_embed = self.text_embed(text, seq_len, drop_text=drop_text) ###need to change x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond) rope = self.rotary_embed.forward_from_seq_len(seq_len) @@ -179,4 +177,4 @@ class DiT(nn.Module): x = self.norm_out(x, t) output = self.proj_out(x) - return output \ No newline at end of file + return output diff --git a/GPT_SoVITS/f5_tts/model/modules.py b/GPT_SoVITS/f5_tts/model/modules.py index 5f6f5cf4..9f030d96 100644 --- a/GPT_SoVITS/f5_tts/model/modules.py +++ b/GPT_SoVITS/f5_tts/model/modules.py @@ -391,6 +391,7 @@ class Attention(nn.Module): # Attention processor + # from torch.nn.attention import SDPBackend # torch.backends.cuda.enable_flash_sdp(True) class AttnProcessor: @@ -545,6 +546,7 @@ class JointAttnProcessor: # DiT Block + class DiTBlock(nn.Module): def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1): super().__init__() diff --git a/GPT_SoVITS/feature_extractor/__init__.py b/GPT_SoVITS/feature_extractor/__init__.py index 79aa9294..01ef5ddf 100644 --- a/GPT_SoVITS/feature_extractor/__init__.py +++ b/GPT_SoVITS/feature_extractor/__init__.py @@ -1,6 +1,3 @@ from . 
import cnhubert, whisper_enc -content_module_map = { - 'cnhubert': cnhubert, - 'whisper': whisper_enc -} \ No newline at end of file +content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc} diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index 013e462b..f22b8d09 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -1,14 +1,11 @@ -import time - -import librosa import torch -import torch.nn.functional as F -import soundfile as sf import os from transformers import logging as tf_logging + tf_logging.set_verbosity_error() import logging + logging.getLogger("numba").setLevel(logging.WARNING) from transformers import ( @@ -23,21 +20,19 @@ cnhubert_base_path = None class CNHubert(nn.Module): - def __init__(self, base_path:str=None): + def __init__(self, base_path: str = None): super().__init__() if base_path is None: base_path = cnhubert_base_path - if os.path.exists(base_path):... - else:raise FileNotFoundError(base_path) + if os.path.exists(base_path): + ... + else: + raise FileNotFoundError(base_path) self.model = HubertModel.from_pretrained(base_path, local_files_only=True) - self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - base_path, local_files_only=True - ) + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True) def forward(self, x): - input_values = self.feature_extractor( - x, return_tensors="pt", sampling_rate=16000 - ).input_values.to(x.device) + input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) feats = self.model(input_values)["last_hidden_state"] return feats diff --git a/GPT_SoVITS/feature_extractor/whisper_enc.py b/GPT_SoVITS/feature_extractor/whisper_enc.py index 983c3e4d..260539bc 100644 --- a/GPT_SoVITS/feature_extractor/whisper_enc.py +++ b/GPT_SoVITS/feature_extractor/whisper_enc.py @@ -19,7 +19,5 @@ def get_content(model=None, wav_16k_tensor=None): feature_len = mel.shape[-1] // 2 assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" with torch.no_grad(): - feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ - :1, :feature_len, : - ].transpose(1, 2) + feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2) return feature diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py index bd987aaf..459a3d36 100644 --- a/GPT_SoVITS/inference_cli.py +++ b/GPT_SoVITS/inference_cli.py @@ -7,13 +7,23 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights i18n = I18nAuto() -def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path): + +def synthesize( + GPT_model_path, + SoVITS_model_path, + ref_audio_path, + ref_text_path, + ref_language, + target_text_path, + target_language, + output_path, +): # Read reference text - with open(ref_text_path, 'r', encoding='utf-8') as file: + with open(ref_text_path, "r", encoding="utf-8") as file: ref_text = file.read() # Read target text - with open(target_text_path, 'r', encoding='utf-8') as file: + with open(target_text_path, "r", encoding="utf-8") as file: target_text = file.read() # Change model weights @@ -21,12 +31,16 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, change_sovits_weights(sovits_path=SoVITS_model_path) # Synthesize audio - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, 
- prompt_language=i18n(ref_language), - text=target_text, - text_language=i18n(target_language), top_p=1, temperature=1) - + synthesis_result = get_tts_wav( + ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=i18n(ref_language), + text=target_text, + text_language=i18n(target_language), + top_p=1, + temperature=1, + ) + result_list = list(synthesis_result) if result_list: @@ -35,21 +49,38 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, sf.write(output_wav_path, last_audio_data, last_sampling_rate) print(f"Audio saved to {output_wav_path}") + def main(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--ref_language', required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio") - parser.add_argument('--target_text', required=True, help="Path to the target text file") - parser.add_argument('--target_language', required=True, choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], help="Language of the target text") - parser.add_argument('--output_path', required=True, help="Path to the output directory") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument( + "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" + ) + parser.add_argument("--target_text", required=True, help="Path to the target text file") + parser.add_argument( + "--target_language", + required=True, + choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], + help="Language of the target text", + ) + parser.add_argument("--output_path", required=True, help="Path to the output directory") args = parser.parse_args() - synthesize(args.gpt_model, args.sovits_model, args.ref_audio, args.ref_text, args.ref_language, args.target_text, args.target_language, args.output_path) + synthesize( + args.gpt_model, + args.sovits_model, + args.ref_audio, + args.ref_text, + args.ref_language, + args.target_text, + args.target_language, + args.output_path, + ) -if __name__ == '__main__': + +if __name__ == "__main__": main() - diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index 2059155d..379f7fa8 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -6,6 +6,7 @@ from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QSta import soundfile as sf from tools.i18n.i18n import I18nAuto + i18n = I18nAuto() from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav @@ -18,7 +19,7 @@ class GPTSoVITSGUI(QMainWindow): def __init__(self): super().__init__() - self.setWindowTitle('GPT-SoVITS GUI') + self.setWindowTitle("GPT-SoVITS GUI") self.setGeometry(800, 450, 950, 850) self.setStyleSheet(""" @@ -61,11 +62,12 @@ class GPTSoVITSGUI(QMainWindow): border: 1px solid #45a049; box-shadow: 2px 
2px 2px rgba(0, 0, 0, 0.1); } - """) + """) license_text = ( - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " - "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " + "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." + ) license_label = QLabel(license_text) license_label.setWordWrap(True) @@ -124,14 +126,16 @@ class GPTSoVITSGUI(QMainWindow): self.output_text = QTextEdit() self.output_text.setReadOnly(True) - self.add_drag_drop_events([ - self.GPT_model_input, - self.SoVITS_model_input, - self.ref_audio_input, - self.ref_text_input, - self.target_text_input, - self.output_input, - ]) + self.add_drag_drop_events( + [ + self.GPT_model_input, + self.SoVITS_model_input, + self.ref_audio_input, + self.ref_text_input, + self.target_text_input, + self.output_input, + ] + ) self.synthesize_button = QPushButton("合成") self.synthesize_button.clicked.connect(self.synthesize) @@ -235,14 +239,14 @@ class GPTSoVITSGUI(QMainWindow): def upload_ref_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") if file_path: - with open(file_path, 'r', encoding='utf-8') as file: + with open(file_path, "r", encoding="utf-8") as file: content = file.read() self.ref_text_input.setText(content) def upload_target_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") if file_path: - with open(file_path, 'r', encoding='utf-8') as file: + with open(file_path, "r", encoding="utf-8") as file: content = file.read() self.target_text_input.setText(content) @@ -284,17 +288,19 @@ class GPTSoVITSGUI(QMainWindow): change_sovits_weights(sovits_path=SoVITS_model_path) self.SoVITS_Path = SoVITS_model_path - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=language_combobox, - text=target_text, - text_language=target_language_combobox) + synthesis_result = get_tts_wav( + ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=language_combobox, + text=target_text, + text_language=target_language_combobox, + ) result_list = list(synthesis_result) if result_list: last_sampling_rate, last_audio_data = result_list[-1] - output_wav_path = os.path.join(output_path, "output.wav") + output_wav_path = os.path.join(output_path, "output.wav") sf.write(output_wav_path, last_audio_data, last_sampling_rate) result = "Audio saved to " + output_wav_path @@ -303,8 +309,8 @@ class GPTSoVITSGUI(QMainWindow): self.output_text.append("处理结果:\n" + result) -if __name__ == '__main__': +if __name__ == "__main__": app = QApplication(sys.argv) mainWin = GPTSoVITSGUI() mainWin.show() - sys.exit(app.exec_()) \ No newline at end of file + sys.exit(app.exec_()) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index afae2cf5..46820145 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1,13 +1,24 @@ -''' +""" 按中英混合识别 按日英混合识别 多语种启动切分识别语种 全部按中文识别 全部按英文识别 全部按日文识别 -''' +""" + +import json import logging -import traceback,torchaudio,warnings +import os +import re +import sys +import traceback +import warnings + +import torch +import torchaudio +from text.LangSegmenter import LangSegmenter + logging.getLogger("markdown_it").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.ERROR) logging.getLogger("httpcore").setLevel(logging.ERROR) @@ -16,59 +27,58 @@ logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR) 
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR) -warnings.simplefilter(action='ignore', category=FutureWarning) +warnings.simplefilter(action="ignore", category=FutureWarning) -import os, re, sys, json -import pdb -import torch -from text.LangSegmenter import LangSegmenter - -try: - import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... -version=model_version=os.environ.get("version","v2") -path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" -is_exist_s2gv3=os.path.exists(path_sovits_v3) -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",path_sovits_v3] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] +version = model_version = os.environ.get("version", "v2") +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth" +is_exist_s2gv3 = os.path.exists(path_sovits_v3) +is_exist_s2gv4 = os.path.exists(path_sovits_v4) +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] - -_ =[[],[]] -for i in range(3): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) -pretrained_gpt_name,pretrained_sovits_name = _ +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ -if os.path.exists(f"./weight.json"): +if os.path.exists("./weight.json"): pass else: - with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file) + with open("./weight.json", "w", encoding="utf-8") as file: + json.dump({"GPT": {}, "SoVITS": {}}, file) -with open(f"./weight.json", 'r', encoding="utf-8") as file: +with open("./weight.json", "r", encoding="utf-8") as file: weight_data = file.read() - weight_data=json.loads(weight_data) - gpt_path = os.environ.get( - "gpt_path", weight_data.get('GPT',{}).get(version,pretrained_gpt_name)) - sovits_path = os.environ.get( - "sovits_path", weight_data.get('SoVITS',{}).get(version,pretrained_sovits_name)) - if isinstance(gpt_path,list): + weight_data = json.loads(weight_data) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) + if isinstance(gpt_path, list): gpt_path = gpt_path[0] - if isinstance(sovits_path,list): + if isinstance(sovits_path, list): sovits_path = sovits_path[0] # gpt_path = os.environ.get( # "gpt_path", 
pretrained_gpt_name # ) # sovits_path = os.environ.get("sovits_path", pretrained_sovits_name) -cnhubert_base_path = os.environ.get( - "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base" -) -bert_path = os.environ.get( - "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" -) +cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base") +bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") infer_ttswebui = os.environ.get("infer_ttswebui", 9872) infer_ttswebui = int(infer_ttswebui) is_share = os.environ.get("is_share", "False") @@ -77,18 +87,20 @@ if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() # is_half=False -punctuation = set(['!', '?', '…', ',', '.', '-'," "]) +punctuation = set(["!", "?", "…", ",", ".", "-", " "]) import gradio as gr -from transformers import AutoModelForMaskedLM, AutoTokenizer -import numpy as np import librosa +import numpy as np from feature_extractor import cnhubert +from transformers import AutoModelForMaskedLM, AutoTokenizer cnhubert.cnhubert_base_path = cnhubert_base_path -from GPT_SoVITS.module.models import SynthesizerTrn,SynthesizerTrnV3 -import numpy as np import random + +from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3 + + def set_seed(seed): if seed == -1: seed = random.randint(0, 1000000) @@ -98,18 +110,21 @@ def set_seed(seed): np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) + + # set_seed(42) +from time import time as ttime + from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from peft import LoraConfig, get_peft_model from text import cleaned_text_to_sequence from text.cleaner import clean_text -from time import time as ttime -from tools.my_utils import load_audio -from tools.i18n.i18n import I18nAuto, scan_language_list -from peft import LoraConfig, PeftModel, get_peft_model -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 @@ -120,27 +135,27 @@ else: device = "cpu" dict_language_v1 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 } dict_language_v2 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("粤语"): "all_yue",#全部按中文识别 - i18n("韩文"): "all_ko",#全部按韩文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("粤英混合"): "yue",#按粤英混合识别####不变 - i18n("韩英混合"): "ko",#按韩英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 - i18n("多语种混合(粤语)"): "auto_yue",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("粤语"): 
"all_yue", # 全部按中文识别 + i18n("韩文"): "all_ko", # 全部按韩文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("粤英混合"): "yue", # 按粤英混合识别####不变 + i18n("韩英混合"): "ko", # 按韩英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 + i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种 } -dict_language = dict_language_v1 if version =='v1' else dict_language_v2 +dict_language = dict_language_v1 if version == "v1" else dict_language_v2 tokenizer = AutoTokenizer.from_pretrained(bert_path) bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) @@ -200,87 +215,122 @@ if is_half == True: else: ssl_model = ssl_model.to(device) -resample_transform_dict={} -def resample(audio_tensor, sr0): +resample_transform_dict = {} + + +def resample(audio_tensor, sr0, sr1): global resample_transform_dict - if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample( - sr0, 24000 - ).to(device) - return resample_transform_dict[sr0](audio_tensor) + key = "%s-%s" % (sr0, sr1) + if key not in resample_transform_dict: + resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) + return resample_transform_dict[key](audio_tensor) + ###todo:put them to process_ckpt and modify my_save func (save sovits weights), gpt save weights use my_save in process_ckpt -#symbol_version-model_version-if_lora_v3 -from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new -def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): - global vq_model, hps, version, model_version, dict_language,if_lora_v3 - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) - # print(sovits_path,version, model_version, if_lora_v3) - if if_lora_v3==True and is_exist_s2gv3==False: - info= "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") +# symbol_version-model_version-if_lora_v3 +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + +v3v4set = {"v3", "v4"} + + +def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): + global vq_model, hps, version, model_version, dict_language, if_lora_v3 + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + print(sovits_path, version, model_version, if_lora_v3) + is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 + if if_lora_v3 == True and is_exist == False: + info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n( + "SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version + ) gr.Warning(info) raise FileExistsError(info) - dict_language = dict_language_v1 if version =='v1' else dict_language_v2 + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 if prompt_language is not None and text_language is not None: if prompt_language in list(dict_language.keys()): - prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language} + prompt_text_update, prompt_language_update = ( + {"__type__": "update"}, + {"__type__": "update", "value": prompt_language}, + ) else: - prompt_text_update = {'__type__':'update', 'value':''} - prompt_language_update = {'__type__':'update', 'value':i18n("中文")} + prompt_text_update = {"__type__": "update", "value": ""} + prompt_language_update = {"__type__": "update", "value": i18n("中文")} if text_language in list(dict_language.keys()): - text_update, text_language_update = {'__type__':'update'}, {'__type__':'update', 'value':text_language} + text_update, 
text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language} else: - text_update = {'__type__':'update', 'value':''} - text_language_update = {'__type__':'update', 'value':i18n("中文")} - if model_version=="v3": - visible_sample_steps=True - visible_inp_refs=False + text_update = {"__type__": "update", "value": ""} + text_language_update = {"__type__": "update", "value": i18n("中文")} + if model_version in v3v4set: + visible_sample_steps = True + visible_inp_refs = False else: - visible_sample_steps=False - visible_inp_refs=True - yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False} + visible_sample_steps = False + visible_inp_refs = True + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + { + "__type__": "update", + "visible": visible_sample_steps, + "value": 32 if model_version == "v3" else 8, + "choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], + }, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, + ) dict_s2 = load_sovits_new(sovits_path) hps = dict_s2["config"] hps = DictToAttrRecursive(hps) hps.model.semantic_frame_rate = "25hz" - if 'enc_p.text_embedding.weight'not in dict_s2['weight']: - hps.model.version = "v2"#v3model,v2sybomls - elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: hps.model.version = "v1" else: hps.model.version = "v2" - version=hps.model.version + version = hps.model.version # print("sovits版本:",hps.model.version) - if model_version!="v3": + if model_version not in v3v4set: vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model + **hps.model, ) - model_version=version + model_version = version else: + hps.model.version = model_version vq_model = SynthesizerTrnV3( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model + **hps.model, ) - if ("pretrained" not in sovits_path): + if "pretrained" not in sovits_path: try: del vq_model.enc_q - except:pass + except: + pass if is_half == True: vq_model = vq_model.half().to(device) else: vq_model = vq_model.to(device) vq_model.eval() - if if_lora_v3==False: - print("loading sovits_%s"%model_version,vq_model.load_state_dict(dict_s2["weight"], strict=False)) + if if_lora_v3 == False: + print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False)) else: - print("loading sovits_v3pretrained_G", 
vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False)) - lora_rank=dict_s2["lora_rank"] + path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 + print( + "loading sovits_%spretrained_G" % model_version, + vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False), + ) + lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, @@ -288,21 +338,43 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): init_lora_weights=True, ) vq_model.cfm = get_peft_model(vq_model.cfm, lora_config) - print("loading sovits_v3_lora%s"%(lora_rank)) + print("loading sovits_%s_lora%s" % (model_version, lora_rank)) vq_model.load_state_dict(dict_s2["weight"], strict=False) vq_model.cfm = vq_model.cfm.merge_and_unload() # torch.save(vq_model.state_dict(),"merge_win.pth") vq_model.eval() - with open("./weight.json")as f: - data=f.read() - data=json.loads(data) - data["SoVITS"][version]=sovits_path - with open("./weight.json","w")as f:f.write(json.dumps(data)) + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + { + "__type__": "update", + "visible": visible_sample_steps, + "value": 32 if model_version == "v3" else 8, + "choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], + }, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["SoVITS"][version] = sovits_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) -try:next(change_sovits_weights(sovits_path)) -except:pass +try: + next(change_sovits_weights(sovits_path)) +except: + pass + def change_gpt_weights(gpt_path): global hz, max_sec, t2s_model, config @@ -318,41 +390,91 @@ def change_gpt_weights(gpt_path): t2s_model.eval() # total = sum([param.nelement() for param in t2s_model.parameters()]) # print("Number of parameter: %.2fM" % (total / 1e6)) - with open("./weight.json")as f: - data=f.read() - data=json.loads(data) - data["GPT"][version]=gpt_path - with open("./weight.json","w")as f:f.write(json.dumps(data)) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["GPT"][version] = gpt_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) change_gpt_weights(gpt_path) -os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" -import torch,soundfile +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +import torch + now_dir = os.getcwd() -import soundfile + def init_bigvgan(): - global bigvgan_model + global bigvgan_model, hifigan_model from BigVGAN import bigvgan - bigvgan_model = bigvgan.BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is 
required to load C++ extensions # remove weight norm in the model and set to eval mode bigvgan_model.remove_weight_norm() bigvgan_model = bigvgan_model.eval() + if hifigan_model: + hifigan_model = hifigan_model.cpu() + hifigan_model = None + try: + torch.cuda.empty_cache() + except: + pass if is_half == True: bigvgan_model = bigvgan_model.half().to(device) else: bigvgan_model = bigvgan_model.to(device) -if model_version!="v3":bigvgan_model=None -else:init_bigvgan() + +def init_hifigan(): + global hifigan_model, bigvgan_model + hifigan_model = Generator( + initial_channel=100, + resblock="1", + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + upsample_rates=[10, 6, 2, 2, 2], + upsample_initial_channel=512, + upsample_kernel_sizes=[20, 12, 4, 4, 4], + gin_channels=0, + is_bias=True, + ) + hifigan_model.eval() + hifigan_model.remove_weight_norm() + state_dict_g = torch.load( + "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu" + ) + print("loading vocoder", hifigan_model.load_state_dict(state_dict_g)) + if bigvgan_model: + bigvgan_model = bigvgan_model.cpu() + bigvgan_model = None + try: + torch.cuda.empty_cache() + except: + pass + if is_half == True: + hifigan_model = hifigan_model.half().to(device) + else: + hifigan_model = hifigan_model.to(device) + + +bigvgan_model = hifigan_model = None +if model_version == "v3": + init_bigvgan() +if model_version == "v4": + init_hifigan() def get_spepc(hps, filename): # audio = load_audio(filename, int(hps.data.sampling_rate)) audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate)) audio = torch.FloatTensor(audio) - maxx=audio.abs().max() - if(maxx>1):audio/=min(2,maxx) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) spec = spectrogram_torch( @@ -365,17 +487,21 @@ def get_spepc(hps, filename): ) return spec + def clean_text_inf(text, language, version): - language = language.replace("all_","") + language = language.replace("all_", "") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text -dtype=torch.float16 if is_half == True else torch.float32 + +dtype = torch.float16 if is_half == True else torch.float32 + + def get_bert_inf(phones, word2ph, norm_text, language): - language=language.replace("all_","") + language = language.replace("all_", "") if language == "zh": - bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype) + bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype) else: bert = torch.zeros( (1024, len(phones)), @@ -385,7 +511,21 @@ def get_bert_inf(phones, word2ph, norm_text, language): return bert -splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} def get_first(text): @@ -393,24 +533,27 @@ def get_first(text): text = re.split(pattern, text)[0].strip() return text + from text import chinese -def get_phones_and_bert(text,language,version,final=False): + + +def get_phones_and_bert(text, language, version, final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: formattext = text while " " in formattext: formattext = formattext.replace(" ", " ") if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) + if 
re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"zh",version) + return get_phones_and_bert(formattext, "zh", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = get_bert_feature(norm_text, word2ph).to(device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"yue",version) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = torch.zeros( @@ -418,8 +561,8 @@ def get_phones_and_bert(text,language,version,final=False): dtype=torch.float16 if is_half == True else torch.float32, ).to(device) elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] + textlist = [] + langlist = [] if language == "auto": for tmp in LangSegmenter.getTexts(text): langlist.append(tmp["lang"]) @@ -452,30 +595,55 @@ def get_phones_and_bert(text,language,version,final=False): bert_list.append(bert) bert = torch.cat(bert_list, dim=1) phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + norm_text = "".join(norm_text_list) if not final and len(phones) < 6: - return get_phones_and_bert("." + text,language,version,final=True) + return get_phones_and_bert("." + text, language, version, final=True) - return phones,bert.to(dtype),norm_text + return phones, bert.to(dtype), norm_text + + +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch -from module.mel_processing import spectrogram_torch,mel_spectrogram_torch spec_min = -12 spec_max = 2 + + def norm_spec(x): return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + def denorm_spec(x): return (x + 1) / 2 * (spec_max - spec_min) + spec_min -mel_fn=lambda x: mel_spectrogram_torch(x, **{ - "n_fft": 1024, - "win_size": 1024, - "hop_size": 256, - "num_mels": 100, - "sampling_rate": 24000, - "fmin": 0, - "fmax": None, - "center": False -}) + + +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) +mel_fn_v4 = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1280, + "win_size": 1280, + "hop_size": 320, + "num_mels": 100, + "sampling_rate": 32000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + def merge_short_text_in_array(texts, threshold): if (len(texts)) < 2: @@ -487,50 +655,77 @@ def merge_short_text_in_array(texts, threshold): if len(text) >= threshold: result.append(text) text = "" - if (len(text) > 0): + if len(text) > 0: if len(result) == 0: result.append(text) else: result[len(result) - 1] += text return result -sr_model=None -def audio_sr(audio,sr): + +sr_model = None + + +def audio_sr(audio, sr): global sr_model - if sr_model==None: + if sr_model == None: from tools.audio_sr import AP_BWE + try: - sr_model=AP_BWE(device,DictToAttrRecursive) + sr_model = AP_BWE(device, DictToAttrRecursive) except FileNotFoundError: 
gr.Warning(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好")) - return audio.cpu().detach().numpy(),sr - return sr_model(audio,sr) + return audio.cpu().detach().numpy(), sr + return sr_model(audio, sr) ##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature # cache_tokens={}#暂未实现清理机制 -cache= {} -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False,inp_refs=None,sample_steps=8,if_sr=False,pause_second=0.3): +cache = {} + + +def get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, +): global cache - if ref_wav_path:pass - else:gr.Warning(i18n('请上传参考音频')) - if text:pass - else:gr.Warning(i18n('请填入推理文本')) + if ref_wav_path: + pass + else: + gr.Warning(i18n("请上传参考音频")) + if text: + pass + else: + gr.Warning(i18n("请填入推理文本")) t = [] if prompt_text is None or len(prompt_text) == 0: ref_free = True - if model_version=="v3": - ref_free=False#s2v3暂不支持ref_free + if model_version in v3v4set: + ref_free = False # s2v3暂不支持ref_free else: - if_sr=False + if_sr = False t0 = ttime() prompt_language = dict_language[prompt_language] text_language = dict_language[text_language] - if not ref_free: prompt_text = prompt_text.strip("\n") - if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_language != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) text = text.strip("\n") # if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." 
+ text @@ -548,7 +743,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, if not ref_free: with torch.no_grad(): wav16k, sr = librosa.load(ref_wav_path, sr=16000) - if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000): + if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000: gr.Warning(i18n("参考音频在3~10秒范围外,请更换!")) raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) wav16k = torch.from_numpy(wav16k) @@ -557,27 +752,23 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, else: wav16k = wav16k.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() codes = vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0).to(device) t1 = ttime() - t.append(t1-t0) + t.append(t1 - t0) - if (how_to_cut == i18n("凑四句一切")): + if how_to_cut == i18n("凑四句一切"): text = cut1(text) - elif (how_to_cut == i18n("凑50字一切")): + elif how_to_cut == i18n("凑50字一切"): text = cut2(text) - elif (how_to_cut == i18n("按中文句号。切")): + elif how_to_cut == i18n("按中文句号。切"): text = cut3(text) - elif (how_to_cut == i18n("按英文句号.切")): + elif how_to_cut == i18n("按英文句号.切"): text = cut4(text) - elif (how_to_cut == i18n("按标点符号切")): + elif how_to_cut == i18n("按标点符号切"): text = cut5(text) while "\n\n" in text: text = text.replace("\n\n", "\n") @@ -588,19 +779,20 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, audio_opt = [] ###s2v3暂不支持ref_free if not ref_free: - phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language, version) + phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language, version) - for i_text,text in enumerate(texts): + for i_text, text in enumerate(texts): # 解决输入目标文本的空行导致报错的问题 - if (len(text.strip()) == 0): + if len(text.strip()) == 0: continue - if (text[-1] not in splits): text += "。" if text_language != "en" else "." + if text[-1] not in splits: + text += "。" if text_language != "en" else "." 
print(i18n("实际输入的目标文本(每句):"), text) - phones2,bert2,norm_text2=get_phones_and_bert(text, text_language, version) + phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version) print(i18n("前端处理后的文本(每句):"), norm_text2) if not ref_free: bert = torch.cat([bert1, bert2], 1) - all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) + all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) else: bert = bert2 all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) @@ -611,7 +803,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, t2 = ttime() # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature) # print(cache.keys(),if_freeze) - if(i_text in cache and if_freeze==True):pred_semantic=cache[i_text] + if i_text in cache and if_freeze == True: + pred_semantic = cache[i_text] else: with torch.no_grad(): pred_semantic, idx = t2s_model.model.infer_panel( @@ -626,88 +819,103 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, early_stop_num=hz * max_sec, ) pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) - cache[i_text]=pred_semantic + cache[i_text] = pred_semantic t3 = ttime() ###v3不存在以下逻辑和inp_refs - if model_version!="v3": - refers=[] - if(inp_refs): + if model_version not in v3v4set: + refers = [] + if inp_refs: for path in inp_refs: try: refer = get_spepc(hps, path.name).to(dtype).to(device) refers.append(refer) except: traceback.print_exc() - if(len(refers)==0):refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] - audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers,speed=speed)[0][0]#.cpu().detach().numpy() + if len(refers) == 0: + refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] + audio = vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed + )[0][0] # .cpu().detach().numpy() else: refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) - phoneme_ids0=torch.LongTensor(phones1).to(device).unsqueeze(0) - phoneme_ids1=torch.LongTensor(phones2).to(device).unsqueeze(0) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) # print(11111111, phoneme_ids0, phoneme_ids1) - fea_ref,ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) + fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) ref_audio, sr = torchaudio.load(ref_wav_path) - ref_audio=ref_audio.to(device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if sr!=24000: - ref_audio=resample(ref_audio,sr) + tgt_sr = 24000 if model_version == "v3" else 32000 + if sr != tgt_sr: + ref_audio = resample(ref_audio, sr, tgt_sr) # print("ref_audio",ref_audio.abs().mean()) - mel2 = mel_fn(ref_audio) + mel2 = mel_fn(ref_audio) if model_version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): - mel2 = mel2[:, :, -468:] - fea_ref = fea_ref[:, :, -468:] - T_min = 468 - chunk_len = 934 - T_min - # print("fea_ref",fea_ref,fea_ref.shape) - # print("mel2",mel2) - mel2=mel2.to(dtype) - fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge,speed) - # 
print("fea_todo",fea_todo) - # print("ge",ge.abs().mean()) + Tref = 468 if model_version == "v3" else 500 + Tchunk = 934 if model_version == "v3" else 1000 + if T_min > Tref: + mel2 = mel2[:, :, -Tref:] + fea_ref = fea_ref[:, :, -Tref:] + T_min = Tref + chunk_len = Tchunk - T_min + mel2 = mel2.to(dtype) + fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) cfm_resss = [] idx = 0 - while (1): - fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] - if (fea_todo_chunk.shape[-1] == 0): break + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) - # set_seed(123) - cfm_res = vq_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) - cfm_res = cfm_res[:, :, mel2.shape[2]:] + cfm_res = vq_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] - # print("fea", fea) - # print("mel2in", mel2) fea_ref = fea_todo_chunk[:, :, -T_min:] cfm_resss.append(cfm_res) - cmf_res = torch.cat(cfm_resss, 2) - cmf_res = denorm_spec(cmf_res) - if bigvgan_model==None:init_bigvgan() + cfm_res = torch.cat(cfm_resss, 2) + cfm_res = denorm_spec(cfm_res) + if model_version == "v3": + if bigvgan_model == None: + init_bigvgan() + else: # v4 + if hifigan_model == None: + init_hifigan() + vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model with torch.inference_mode(): - wav_gen = bigvgan_model(cmf_res) - audio=wav_gen[0][0]#.cpu().detach().numpy() - max_audio=torch.abs(audio).max()#简单防止16bit爆音 - if max_audio>1:audio=audio/max_audio + wav_gen = vocoder_model(cfm_res) + audio = wav_gen[0][0] # .cpu().detach().numpy() + max_audio = torch.abs(audio).max() # 简单防止16bit爆音 + if max_audio > 1: + audio = audio / max_audio audio_opt.append(audio) - audio_opt.append(zero_wav_torch)#zero_wav + audio_opt.append(zero_wav_torch) # zero_wav t4 = ttime() - t.extend([t2 - t1,t3 - t2, t4 - t3]) + t.extend([t2 - t1, t3 - t2, t4 - t3]) t1 = ttime() print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))) - audio_opt=torch.cat(audio_opt, 0)#np.concatenate - sr=hps.data.sampling_rate if model_version!="v3"else 24000 - if if_sr==True and sr==24000: - print(i18n("音频超分中")) - audio_opt,sr=audio_sr(audio_opt.unsqueeze(0),sr) - max_audio=np.abs(audio_opt).max() - if max_audio > 1: audio_opt /= max_audio + audio_opt = torch.cat(audio_opt, 0) # np.concatenate + if model_version in {"v1", "v2"}: + opt_sr = 32000 + elif model_version == "v3": + opt_sr = 24000 else: - audio_opt=audio_opt.cpu().detach().numpy() - yield sr, (audio_opt * 32767).astype(np.int16) + opt_sr = 48000 # v4 + if if_sr == True and opt_sr == 24000: + print(i18n("音频超分中")) + audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr) + max_audio = np.abs(audio_opt).max() + if max_audio > 1: + audio_opt /= max_audio + else: + audio_opt = audio_opt.cpu().detach().numpy() + yield opt_sr, (audio_opt * 32767).astype(np.int16) def split(todo_text): @@ -737,7 +945,7 @@ def cut1(inp): if len(split_idx) > 1: opts = [] for idx in range(len(split_idx) - 1): - opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) + opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]])) else: opts = [inp] opts = [item for item in opts if not set(item).issubset(punctuation)] @@ -773,11 +981,12 
@@ def cut3(inp): inp = inp.strip("\n") opts = ["%s" % item for item in inp.strip("。").split("。")] opts = [item for item in opts if not set(item).issubset(punctuation)] - return "\n".join(opts) + return "\n".join(opts) + def cut4(inp): inp = inp.strip("\n") - opts = re.split(r'(? 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): + if char == "." and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): items.append(char) else: items.append(char) @@ -809,17 +1018,18 @@ def cut5(inp): def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts + def process_text(texts): - _text=[] - if all(text in [None, " ", "\n",""] for text in texts): + _text = [] + if all(text in [None, " ", "\n", ""] for text in texts): raise ValueError(i18n("请输入有效文本")) for text in texts: - if text in [None, " ", ""]: + if text in [None, " ", ""]: pass else: _text.append(text) @@ -828,102 +1038,239 @@ def process_text(texts): def change_choices(): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) - return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } -SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] -GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) def get_weights_names(GPT_weight_root, SoVITS_weight_root): SoVITS_names = [i for i in pretrained_sovits_name] for path in SoVITS_weight_root: for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) GPT_names = [i for i in pretrained_gpt_name] for path in GPT_weight_root: for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) return SoVITS_names, GPT_names SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) -def html_center(text, label='p'): + +def html_center(text, label="p"): return f"""
<div style="text-align: center; margin: 100; padding: 50;">
                <{label} style="margin: 0; padding: 0;">{text}</{label}>
                </div>
""" -def html_left(text, label='p'): + +def html_left(text, label="p"): return f"""
<div style="text-align: left; margin: 0; padding: 0;">
                <{label} style="margin: 0; padding: 0;">{text}</{label}>
                </div>
""" -with gr.Blocks(title="GPT-SoVITS WebUI") as app: +with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app: gr.Markdown( - value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Group(): - gr.Markdown(html_center(i18n("模型切换"),'h3')) + gr.Markdown(html_center(i18n("模型切换"), "h3")) with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True, scale=14) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True, scale=14) + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, + interactive=True, + scale=14, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + scale=14, + ) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14) refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - gr.Markdown(html_center(i18n("*请上传并填写参考信息"),'h3')) + gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3")) with gr.Row(): inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13) with gr.Column(scale=13): - ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式,使用了会报错。"), value=False, interactive=True, show_label=True,scale=1) - gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"
"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))) - prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1) + ref_text_free = gr.Checkbox( + label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。") + + i18n("v3暂不支持该模式,使用了会报错。"), + value=False, + interactive=True if model_version not in v3v4set else False, + show_label=True, + scale=1, + ) + gr.Markdown( + html_left( + i18n("使用无参考文本模式时建议使用微调的GPT") + + "
" + + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。") + ) + ) + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1) with gr.Column(scale=14): prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"), + label=i18n("参考音频的语种"), + choices=list(dict_language.keys()), + value=i18n("中文"), ) - inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple")if model_version!="v3"else gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple",visible=False) - sample_steps = gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=32,choices=[4,8,16,32],visible=True)if model_version=="v3"else gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),choices=[4,8,16,32],visible=False,value=32) - if_sr_Checkbox=gr.Checkbox(label=i18n("v3输出如果觉得闷可以试试开超分"), value=False, interactive=True, show_label=True,visible=False if model_version!="v3"else True) - gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3')) + inp_refs = ( + gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + ) + if model_version not in v3v4set + else gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + visible=False, + ) + ) + sample_steps = ( + gr.Radio( + label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), + value=32 if model_version == "v3" else 8, + choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], + visible=True, + ) + if model_version in v3v4set + else gr.Radio( + label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), + choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], + visible=False, + value=32 if model_version == "v3" else 8, + ) + ) + if_sr_Checkbox = gr.Checkbox( + label=i18n("v3输出如果觉得闷可以试试开超分"), + value=False, + interactive=True, + show_label=True, + visible=False if model_version != "v3" else True, + ) + gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) with gr.Row(): with gr.Column(scale=13): text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26) with gr.Column(scale=7): text_language = gr.Dropdown( - label=i18n("需要合成的语种")+i18n(".限制范围越小判别效果越好。"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1 - ) + label=i18n("需要合成的语种") + i18n(".限制范围越小判别效果越好。"), + choices=list(dict_language.keys()), + value=i18n("中文"), + scale=1, + ) how_to_cut = gr.Dropdown( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, scale=1 - ) + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + scale=1, + ) gr.Markdown(value=html_center(i18n("语速调整,高为更快"))) - if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), value=False, interactive=True,show_label=True, scale=1) + if_freeze = gr.Checkbox( + label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), + value=False, + interactive=True, + show_label=True, + scale=1, + ) with gr.Row(): - speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True, scale=1) - pause_second_slider = 
gr.Slider(minimum=0.1,maximum=0.5,step=0.01,label=i18n("句间停顿秒数"),value=0.3,interactive=True, scale=1) + speed = gr.Slider( + minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1 + ) + pause_second_slider = gr.Slider( + minimum=0.1, + maximum=0.5, + step=0.01, + label=i18n("句间停顿秒数"), + value=0.3, + interactive=True, + scale=1, + ) gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):"))) - top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=15,interactive=True, scale=1) - top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True, scale=1) - temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True, scale=1) + top_k = gr.Slider( + minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True, scale=1 + ) + top_p = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True, scale=1 + ) + temperature = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1 + ) # with gr.Column(): # gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。")) # phoneme=gr.Textbox(label=i18n("音素框"), value="") # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") with gr.Row(): - inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25) + inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) output = gr.Audio(label=i18n("输出的语音"), scale=14) inference_button.click( get_tts_wav, - [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider], + [ + inp_ref, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_text_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr_Checkbox, + pause_second_slider, + ], [output], ) - SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox]) + SoVITS_dropdown.change( + change_sovits_weights, + [SoVITS_dropdown, prompt_language, text_language], + [ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + if_sr_Checkbox, + inference_button, + ], + ) GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) @@ -942,11 +1289,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: # button5.click(cut5, [text_inp], [text_opt]) # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))) -if __name__ == '__main__': - app.queue().launch(#concurrency_count=511, max_size=1022 +if __name__ == "__main__": + app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, server_port=infer_ttswebui, - quiet=True, + # quiet=True, ) diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 9017aa49..0b9525e8 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -1,14 +1,21 @@ -''' +""" 按中英混合识别 按日英混合识别 多语种启动切分识别语种 全部按中文识别 全部按英文识别 全部按日文识别 -''' +""" + +import json +import logging +import os import random -import os, re, logging, json +import 
re import sys + +import torch + now_dir = os.getcwd() sys.path.append(now_dir) sys.path.append("%s/GPT_SoVITS" % (now_dir)) @@ -20,13 +27,6 @@ logging.getLogger("httpx").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) -import pdb -import torch - -try: - import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... infer_ttswebui = os.environ.get("infer_ttswebui", 9872) @@ -41,15 +41,16 @@ gpt_path = os.environ.get("gpt_path", None) sovits_path = os.environ.get("sovits_path", None) cnhubert_base_path = os.environ.get("cnhubert_base_path", None) bert_path = os.environ.get("bert_path", None) -version=os.environ.get("version","v2") +version = model_version = os.environ.get("version", "v2") import gradio as gr -from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR from TTS_infer_pack.text_segmentation_method import get_method +from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config + from tools.i18n.i18n import I18nAuto, scan_language_list -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) @@ -66,30 +67,30 @@ else: # device = "cpu" dict_language_v1 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 } dict_language_v2 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("粤语"): "all_yue",#全部按中文识别 - i18n("韩文"): "all_ko",#全部按韩文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("粤英混合"): "yue",#按粤英混合识别####不变 - i18n("韩英混合"): "ko",#按韩英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 - i18n("多语种混合(粤语)"): "auto_yue",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("粤语"): "all_yue", # 全部按中文识别 + i18n("韩文"): "all_ko", # 全部按韩文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("粤英混合"): "yue", # 按粤英混合识别####不变 + i18n("韩英混合"): "ko", # 按韩英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 + i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种 } -dict_language = dict_language_v1 if version =='v1' else dict_language_v2 +dict_language = dict_language_v1 if version == "v1" else dict_language_v2 cut_method = { - i18n("不切"):"cut0", + i18n("不切"): "cut0", i18n("凑四句一切"): "cut1", i18n("凑50字一切"): "cut2", i18n("按中文句号。切"): "cut3", @@ -116,22 +117,33 @@ gpt_path = tts_config.t2s_weights_path sovits_path = tts_config.vits_weights_path version = tts_config.version -def inference(text, text_lang, - ref_audio_path, - aux_ref_audio_paths, - prompt_text, - prompt_lang, top_k, - top_p, temperature, - text_split_method, batch_size, - speed_factor, ref_text_free, - split_bucket,fragment_interval, - seed, keep_random, parallel_infer, - repetition_penalty, sample_steps, super_sampling, - ): +def inference( + text, + text_lang, + 
ref_audio_path, + aux_ref_audio_paths, + prompt_text, + prompt_lang, + top_k, + top_p, + temperature, + text_split_method, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + keep_random, + parallel_infer, + repetition_penalty, + sample_steps, + super_sampling, +): seed = -1 if keep_random else seed actual_seed = seed if seed not in [-1, "", None] else random.randint(0, 2**32 - 1) - inputs={ + inputs = { "text": text, "text_lang": dict_language[text_lang], "ref_audio_path": ref_audio_path, @@ -142,12 +154,12 @@ def inference(text, text_lang, "top_p": top_p, "temperature": temperature, "text_split_method": cut_method[text_split_method], - "batch_size":int(batch_size), - "speed_factor":float(speed_factor), - "split_bucket":split_bucket, - "return_fragment":False, - "fragment_interval":fragment_interval, - "seed":actual_seed, + "batch_size": int(batch_size), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "return_fragment": False, + "fragment_interval": fragment_interval, + "seed": actual_seed, "parallel_infer": parallel_infer, "repetition_penalty": repetition_penalty, "sample_steps": int(sample_steps), @@ -157,11 +169,12 @@ def inference(text, text_lang, for item in tts_pipeline.run(inputs): yield item, actual_seed except NO_PROMPT_ERROR: - gr.Warning(i18n('V3不支持无参考文本模式,请填写参考文本!')) + gr.Warning(i18n("V3不支持无参考文本模式,请填写参考文本!")) + def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts @@ -169,52 +182,72 @@ def custom_sort_key(s): def change_choices(): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) - return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} - -path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",path_sovits_v3] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] - -_ =[[],[]] -for i in range(3): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) -pretrained_gpt_name,pretrained_sovits_name = _ + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } -if os.path.exists(f"./weight.json"): +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth" +is_exist_s2gv3 = os.path.exists(path_sovits_v3) +is_exist_s2gv4 = os.path.exists(path_sovits_v4) +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + 
"GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ + +if os.path.exists("./weight.json"): pass else: - with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file) + with open("./weight.json", "w", encoding="utf-8") as file: + json.dump({"GPT": {}, "SoVITS": {}}, file) -with open(f"./weight.json", 'r', encoding="utf-8") as file: +with open("./weight.json", "r", encoding="utf-8") as file: weight_data = file.read() - weight_data=json.loads(weight_data) - gpt_path = os.environ.get( - "gpt_path", weight_data.get('GPT',{}).get(version,pretrained_gpt_name)) - sovits_path = os.environ.get( - "sovits_path", weight_data.get('SoVITS',{}).get(version,pretrained_sovits_name)) - if isinstance(gpt_path,list): + weight_data = json.loads(weight_data) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) + if isinstance(gpt_path, list): gpt_path = gpt_path[0] - if isinstance(sovits_path,list): + if isinstance(sovits_path, list): sovits_path = sovits_path[0] +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) -SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] -GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) def get_weights_names(GPT_weight_root, SoVITS_weight_root): SoVITS_names = [i for i in pretrained_sovits_name] for path in SoVITS_weight_root: for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) GPT_names = [i for i in pretrained_gpt_name] for path in GPT_weight_root: for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) return SoVITS_names, GPT_names @@ -222,72 +255,128 @@ SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) from process_ckpt import get_sovits_version_from_path_fast -def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): - global version, dict_language - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) - if if_lora_v3 and not os.path.exists(path_sovits_v3): - info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") +v3v4set = {"v3", "v4"} + + +def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): + global version, model_version, dict_language, if_lora_v3 + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + # print(sovits_path,version, model_version, if_lora_v3) + is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 + path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 + if if_lora_v3 == True and is_exist == False: + info = 
path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version) gr.Warning(info) raise FileExistsError(info) - - tts_pipeline.init_vits_weights(sovits_path) - - dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2 + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 if prompt_language is not None and text_language is not None: if prompt_language in list(dict_language.keys()): - prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language} + prompt_text_update, prompt_language_update = ( + {"__type__": "update"}, + {"__type__": "update", "value": prompt_language}, + ) else: - prompt_text_update = {'__type__':'update', 'value':''} - prompt_language_update = {'__type__':'update', 'value':i18n("中文")} + prompt_text_update = {"__type__": "update", "value": ""} + prompt_language_update = {"__type__": "update", "value": i18n("中文")} if text_language in list(dict_language.keys()): - text_update, text_language_update = {'__type__':'update'}, {'__type__':'update', 'value':text_language} + text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language} else: - text_update = {'__type__':'update', 'value':''} - text_language_update = {'__type__':'update', 'value':i18n("中文")} - if model_version=="v3": - visible_sample_steps=True - visible_inp_refs=False + text_update = {"__type__": "update", "value": ""} + text_language_update = {"__type__": "update", "value": i18n("中文")} + if model_version in v3v4set: + visible_sample_steps = True + visible_inp_refs = False else: - visible_sample_steps=False - visible_inp_refs=True - yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False} + visible_sample_steps = False + visible_inp_refs = True + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, + ) - with open("./weight.json")as f: - data=f.read() - data=json.loads(data) - data["SoVITS"][version]=sovits_path - with open("./weight.json","w")as f:f.write(json.dumps(data)) + tts_pipeline.init_vits_weights(sovits_path) + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version not in v3v4set else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) + with open("./weight.json") as f: + data = f.read() + 
data = json.loads(data) + data["SoVITS"][version] = sovits_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) -with gr.Blocks(title="GPT-SoVITS WebUI") as app: + +with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app: gr.Markdown( - value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Column(): # with gr.Group(): gr.Markdown(value=i18n("模型切换")) with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True) + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + ) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - with gr.Row(): with gr.Column(): gr.Markdown(value=i18n("*请上传并填写参考信息")) with gr.Row(): inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频,超过会报错!)"), type="filepath") - inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple") + inp_refs = gr.File( + label=i18n("辅参考音频(可选多个,或不选)"), + file_count="multiple", + visible=True if model_version != "v3" else False, + ) prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2) with gr.Row(): prompt_language = gr.Dropdown( label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文") ) with gr.Column(): - ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True) - gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"
"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")) + ref_text_free = gr.Checkbox( + label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), + value=False, + interactive=True if model_version != "v3" else False, + show_label=True, + ) + gr.Markdown( + i18n("使用无参考文本模式时建议使用微调的GPT") + + "
" + + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。") + ) with gr.Column(): gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) @@ -296,42 +385,66 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: label=i18n("需要合成的文本的语种"), choices=list(dict_language.keys()), value=i18n("中文") ) - with gr.Group(): gr.Markdown(value=i18n("推理设置")) with gr.Row(): - with gr.Column(): with gr.Row(): - batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True) - sample_steps = gr.Radio(label=i18n("采样步数(仅对V3生效)"),value=32,choices=[4,8,16,32],visible=True) + batch_size = gr.Slider( + minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True + ) + sample_steps = gr.Radio( + label=i18n("采样步数(仅对V3/4生效)"), value=32, choices=[4, 8, 16, 32, 64, 128], visible=True + ) with gr.Row(): - fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True) - speed_factor = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label="语速",value=1.0,interactive=True) + fragment_interval = gr.Slider( + minimum=0.01, maximum=1, step=0.01, label=i18n("分段间隔(秒)"), value=0.3, interactive=True + ) + speed_factor = gr.Slider( + minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True + ) with gr.Row(): - top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) - top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) + top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True) + top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True) with gr.Row(): - temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) - repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True) - + temperature = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True + ) + repetition_penalty = gr.Slider( + minimum=0, maximum=2, step=0.05, label=i18n("重复惩罚"), value=1.35, interactive=True + ) + with gr.Column(): with gr.Row(): how_to_cut = gr.Dropdown( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, scale=1 - ) - super_sampling = gr.Checkbox(label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True) + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + scale=1, + ) + super_sampling = gr.Checkbox( + label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True + ) with gr.Row(): parallel_infer = gr.Checkbox(label=i18n("并行推理"), value=True, interactive=True, show_label=True) - split_bucket = gr.Checkbox(label=i18n("数据分桶(并行推理时会降低一点计算量)"), value=True, interactive=True, show_label=True) + split_bucket = gr.Checkbox( + label=i18n("数据分桶(并行推理时会降低一点计算量)"), + value=True, + interactive=True, + show_label=True, + ) with gr.Row(): - - seed = gr.Number(label=i18n("随机种子"),value=-1) + seed = gr.Number(label=i18n("随机种子"), value=-1) keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True) output = gr.Audio(label=i18n("输出的语音")) @@ -339,40 +452,78 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: inference_button = gr.Button(i18n("合成语音"), variant="primary") 
stop_infer = gr.Button(i18n("终止合成"), variant="primary") - inference_button.click( inference, [ - text,text_language, inp_ref, inp_refs, - prompt_text, prompt_language, - top_k, top_p, temperature, - how_to_cut, batch_size, - speed_factor, ref_text_free, - split_bucket,fragment_interval, - seed, keep_random, parallel_infer, - repetition_penalty, sample_steps, super_sampling, - ], + text, + text_language, + inp_ref, + inp_refs, + prompt_text, + prompt_language, + top_k, + top_p, + temperature, + how_to_cut, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + keep_random, + parallel_infer, + repetition_penalty, + sample_steps, + super_sampling, + ], [output, seed], ) stop_infer.click(tts_pipeline.stop, [], []) - SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language]) + SoVITS_dropdown.change( + change_sovits_weights, + [SoVITS_dropdown, prompt_language, text_language], + [ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + inference_button, + ], + ) # GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) with gr.Group(): - gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) + gr.Markdown( + value=i18n( + "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。" + ) + ) with gr.Row(): text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="", lines=4) with gr.Column(): _how_to_cut = gr.Radio( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, - ) - cut_text= gr.Button(i18n("切分"), variant="primary") + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + ) + cut_text = gr.Button(i18n("切分"), variant="primary") def to_cut(text_inp, how_to_cut): - if len(text_inp.strip()) == 0 or text_inp==[]: + if len(text_inp.strip()) == 0 or text_inp == []: return "" method = get_method(cut_method[how_to_cut]) return method(text_inp) @@ -381,11 +532,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt]) gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -if __name__ == '__main__': - app.queue().launch(#concurrency_count=511, max_size=1022 +if __name__ == "__main__": + app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, server_port=infer_ttswebui, - quiet=True, + # quiet=True, ) diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index a2e9e515..341de4ac 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -18,7 +18,7 @@ class Encoder(nn.Module): p_dropout=0.0, window_size=4, isflow=False, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -56,9 +56,7 @@ class Encoder(nn.Module): ) self.norm_layers_2.append(LayerNorm(hidden_channels)) if isflow: - cond_layer = torch.nn.Conv1d( - kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 - ) + cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1) self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) self.cond_layer = weight_norm_modules(cond_layer, name="weight") 
self.gin_channels = kwargs["gin_channels"] @@ -74,9 +72,7 @@ class Encoder(nn.Module): x = self.cond_pre(x) cond_offset = i * 2 * self.hidden_channels g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - x = commons.fused_add_tanh_sigmoid_multiply( - x, g_l, torch.IntTensor([self.hidden_channels]) - ) + x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels])) y = self.attn_layers[i](x, x, attn_mask) y = self.drop(y) x = self.norm_layers_1[i](x + y) @@ -99,7 +95,7 @@ class Decoder(nn.Module): p_dropout=0.0, proximal_bias=False, proximal_init=True, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -131,9 +127,7 @@ class Decoder(nn.Module): ) self.norm_layers_0.append(LayerNorm(hidden_channels)) self.encdec_attn_layers.append( - MultiHeadAttention( - hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout - ) + MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout) ) self.norm_layers_1.append(LayerNorm(hidden_channels)) self.ffn_layers.append( @@ -153,9 +147,7 @@ class Decoder(nn.Module): x: decoder input h: encoder output """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask for i in range(self.n_layers): @@ -211,14 +203,8 @@ class MultiHeadAttention(nn.Module): if window_size is not None: n_heads_rel = 1 if heads_share else n_heads rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) + self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) nn.init.xavier_uniform_(self.conv_q.weight) nn.init.xavier_uniform_(self.conv_k.weight) @@ -247,46 +233,28 @@ class MultiHeadAttention(nn.Module): scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) if self.window_size is not None: - assert ( - t_s == t_t - ), "Relative attention is only available for self-attention." + assert t_s == t_t, "Relative attention is only available for self-attention." key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) + rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings) scores_local = self._relative_position_to_absolute_position(rel_logits) scores = scores + scores_local if self.proximal_bias: assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) + scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) if mask is not None: scores = scores.masked_fill(mask == 0, -1e4) if self.block_length is not None: - assert ( - t_s == t_t - ), "Local attention is only available for self-attention." 
- block_mask = ( - torch.ones_like(scores) - .triu(-self.block_length) - .tril(self.block_length) - ) + assert t_s == t_t, "Local attention is only available for self-attention." + block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) scores = scores.masked_fill(block_mask == 0, -1e4) p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] p_attn = self.drop(p_attn) output = torch.matmul(p_attn, value) if self.window_size is not None: relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s - ) - output = output + self._matmul_with_relative_values( - relative_weights, value_relative_embeddings - ) - output = ( - output.transpose(2, 3).contiguous().view(b, d, t_t) - ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] return output, p_attn def _matmul_with_relative_values(self, x, y): @@ -320,9 +288,7 @@ class MultiHeadAttention(nn.Module): ) else: padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] + used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] return used_relative_embeddings def _relative_position_to_absolute_position(self, x): @@ -336,14 +302,10 @@ class MultiHeadAttention(nn.Module): # Concat extra elements so to add up to shape (len+1, 2*len-1). x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) # Reshape and slice out the padded elements. 
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :] return x_final def _absolute_position_to_relative_position(self, x): @@ -353,9 +315,7 @@ class MultiHeadAttention(nn.Module): """ batch, heads, length, _ = x.size() # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) # add 0's in the beginning that will skew the elements after reshape x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) @@ -537,9 +497,7 @@ class Depthwise_Separable_TransposeConv1D(nn.Module): def weight_norm_modules(module, name="weight", dim=0): - if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( - module, Depthwise_Separable_TransposeConv1D - ): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D): module.weight_norm() return module else: @@ -547,9 +505,7 @@ def weight_norm_modules(module, name="weight", dim=0): def remove_weight_norm_modules(module, name="weight"): - if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( - module, Depthwise_Separable_TransposeConv1D - ): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D): module.remove_weight_norm() else: remove_weight_norm(module, name) @@ -567,7 +523,7 @@ class FFT(nn.Module): proximal_bias=False, proximal_init=True, isflow=False, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -579,9 +535,7 @@ class FFT(nn.Module): self.proximal_bias = proximal_bias self.proximal_init = proximal_init if isflow: - cond_layer = torch.nn.Conv1d( - kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 - ) + cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1) self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) self.cond_layer = weight_norm_modules(cond_layer, name="weight") self.gin_channels = kwargs["gin_channels"] @@ -622,18 +576,14 @@ class FFT(nn.Module): if g is not None: g = self.cond_layer(g) - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) x = x * x_mask for i in range(self.n_layers): if g is not None: x = self.cond_pre(x) cond_offset = i * 2 * self.hidden_channels g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - x = commons.fused_add_tanh_sigmoid_multiply( - x, g_l, torch.IntTensor([self.hidden_channels]) - ) + x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels])) y = self.self_attn_layers[i](x, x, self_attn_mask) y = self.drop(y) x = self.norm_layers_0[i](x + y) diff --git a/GPT_SoVITS/module/attentions_onnx.py b/GPT_SoVITS/module/attentions_onnx.py index 097b1b9c..9961f987 100644 --- a/GPT_SoVITS/module/attentions_onnx.py +++ b/GPT_SoVITS/module/attentions_onnx.py @@ -7,6 +7,7 @@ from module import commons from typing import Optional + class LayerNorm(nn.Module): def __init__(self, channels, eps=1e-5): super().__init__() @@ -43,7 +44,7 @@ class Encoder(nn.Module): p_dropout=0.0, window_size=4, isflow=True, - **kwargs + **kwargs, ): super().__init__() 
self.hidden_channels = hidden_channels @@ -65,13 +66,9 @@ class Encoder(nn.Module): if self.gin_channels != 0: self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels) # vits2 says 3rd block, so idx is 2 by default - self.cond_layer_idx = ( - kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 - ) + self.cond_layer_idx = kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 logging.debug(self.gin_channels, self.cond_layer_idx) - assert ( - self.cond_layer_idx < self.n_layers - ), "cond_layer_idx should be less than n_layers" + assert self.cond_layer_idx < self.n_layers, "cond_layer_idx should be less than n_layers" self.drop = nn.Dropout(p_dropout) self.attn_layers = nn.ModuleList() self.norm_layers_1 = nn.ModuleList() @@ -117,11 +114,13 @@ class Encoder(nn.Module): # x = self.norm_layers_2[i](x + y) # x = x * x_mask # return x - + def forward(self, x, x_mask): attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask - for attn_layers,norm_layers_1,ffn_layers,norm_layers_2 in zip(self.attn_layers,self.norm_layers_1,self.ffn_layers,self.norm_layers_2): + for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zip( + self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 + ): y = attn_layers(x, x, attn_mask) y = self.drop(y) x = norm_layers_1(x + y) @@ -170,14 +169,8 @@ class MultiHeadAttention(nn.Module): if window_size is not None: n_heads_rel = 1 if heads_share else n_heads rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) + self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) nn.init.xavier_uniform_(self.conv_q.weight) nn.init.xavier_uniform_(self.conv_k.weight) @@ -187,7 +180,7 @@ class MultiHeadAttention(nn.Module): self.conv_k.weight.copy_(self.conv_q.weight) self.conv_k.bias.copy_(self.conv_q.bias) - def forward(self, x, c, attn_mask:Optional[torch.Tensor]=None): + def forward(self, x, c, attn_mask: Optional[torch.Tensor] = None): q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) @@ -198,7 +191,7 @@ class MultiHeadAttention(nn.Module): x = self.conv_o(x) return x - def attention(self, query, key, value, mask:Optional[torch.Tensor]=None): + def attention(self, query, key, value, mask: Optional[torch.Tensor] = None): # reshape [b, d, t] -> [b, n_h, t, d_k] b, d, t_s, _ = (*key.size(), query.size(2)) query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3) @@ -223,8 +216,8 @@ class MultiHeadAttention(nn.Module): relative_weights = self._absolute_position_to_relative_position(p_attn) value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) - - output = (output.transpose(2, 3).contiguous().view(b, d, -1)) + + output = output.transpose(2, 3).contiguous().view(b, d, -1) return output, p_attn def _matmul_with_relative_values(self, x, y): @@ -248,19 +241,17 @@ class MultiHeadAttention(nn.Module): def _get_relative_embeddings(self, relative_embeddings, length): max_relative_position = 2 * self.window_size + 1 # Pad first before slice to avoid using cond ops. 
- pad_l = torch.zeros((1), dtype = torch.int64) + length - (self.window_size + 1) - pad_s = torch.zeros((1), dtype = torch.int64) + (self.window_size + 1) - length - pad_length = torch.max(pad_l, other=torch.zeros((1), dtype = torch.int64)) - slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype = torch.int64)) + pad_l = torch.zeros((1), dtype=torch.int64) + length - (self.window_size + 1) + pad_s = torch.zeros((1), dtype=torch.int64) + (self.window_size + 1) - length + pad_length = torch.max(pad_l, other=torch.zeros((1), dtype=torch.int64)) + slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype=torch.int64)) slice_end_position = slice_start_position + 2 * length - 1 padded_relative_embeddings = F.pad( relative_embeddings, commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), ) - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] + used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] return used_relative_embeddings def _relative_position_to_absolute_position(self, x): @@ -274,14 +265,10 @@ class MultiHeadAttention(nn.Module): # Concat extra elements so to add up to shape (len+1, 2*len-1). x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) # Reshape and slice out the padded elements. - x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :] return x_final def _absolute_position_to_relative_position(self, x): @@ -291,9 +278,7 @@ class MultiHeadAttention(nn.Module): """ batch, heads, length, _ = x.size() # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) # add 0's in the beginning that will skew the elements after reshape x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) @@ -351,7 +336,7 @@ class FFN(nn.Module): x = self.drop(x) x = self.conv_2(self.padding(x * x_mask)) return x * x_mask - + def padding(self, x): return self._same_padding(x) @@ -395,12 +380,6 @@ class MRTE(nn.Module): ssl_enc = self.c_pre(ssl_enc * ssl_mask) text_enc = self.text_pre(text * text_mask) - x = ( - self.cross_attention( - ssl_enc * ssl_mask, text_enc * text_mask, attn_mask - ) - + ssl_enc - + ge - ) + x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge x = self.c_post(x * ssl_mask) return x diff --git a/GPT_SoVITS/module/commons.py b/GPT_SoVITS/module/commons.py index 6083535f..20392f91 100644 --- a/GPT_SoVITS/module/commons.py +++ b/GPT_SoVITS/module/commons.py @@ -28,9 +28,7 @@ def intersperse(lst, item): def kl_divergence(m_p, logs_p, m_q, logs_q): """KL(P||Q)""" kl = (logs_q - logs_p) - 0.5 - kl += ( - 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) - ) + kl += 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) return kl @@ -67,9 +65,7 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4): def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): position = torch.arange(length, 
dtype=torch.float) num_timescales = channels // 2 - log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( - num_timescales - 1 - ) + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (num_timescales - 1) inv_timescales = min_timescale * torch.exp( torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment ) diff --git a/GPT_SoVITS/module/core_vq.py b/GPT_SoVITS/module/core_vq.py index a5e22d66..b7dab317 100644 --- a/GPT_SoVITS/module/core_vq.py +++ b/GPT_SoVITS/module/core_vq.py @@ -30,6 +30,7 @@ # SOFTWARE. """Core vector quantization implementation.""" + import typing as tp from einops import rearrange, repeat @@ -121,9 +122,7 @@ class EuclideanCodebook(nn.Module): ): super().__init__() self.decay = decay - init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = ( - uniform_init if not kmeans_init else torch.zeros - ) + init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros embed = init_fn(codebook_size, dim) self.codebook_size = codebook_size @@ -151,9 +150,7 @@ class EuclideanCodebook(nn.Module): # broadcast_tensors(self.buffers()) def replace_(self, samples, mask): - modified_codebook = torch.where( - mask[..., None], sample_vectors(samples, self.codebook_size), self.embed - ) + modified_codebook = torch.where(mask[..., None], sample_vectors(samples, self.codebook_size), self.embed) self.embed.data.copy_(modified_codebook) def expire_codes_(self, batch_samples): @@ -174,11 +171,7 @@ class EuclideanCodebook(nn.Module): def quantize(self, x): embed = self.embed.t() - dist = -( - x.pow(2).sum(1, keepdim=True) - - 2 * x @ embed - + embed.pow(2).sum(0, keepdim=True) - ) + dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True)) embed_ind = dist.max(dim=-1).indices return embed_ind @@ -222,8 +215,7 @@ class EuclideanCodebook(nn.Module): embed_sum = x.t() @ embed_onehot ema_inplace(self.embed_avg, embed_sum.t(), self.decay) cluster_size = ( - laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) - * self.cluster_size.sum() + laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) * self.cluster_size.sum() ) embed_normalized = self.embed_avg / cluster_size.unsqueeze(1) self.embed.data.copy_(embed_normalized) @@ -264,12 +256,8 @@ class VectorQuantization(nn.Module): _codebook_dim: int = default(codebook_dim, dim) requires_projection = _codebook_dim != dim - self.project_in = ( - nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity() - ) - self.project_out = ( - nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity() - ) + self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity() + self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity() self.epsilon = epsilon self.commitment_weight = commitment_weight @@ -330,13 +318,9 @@ class ResidualVectorQuantization(nn.Module): def __init__(self, *, num_quantizers, **kwargs): super().__init__() - self.layers = nn.ModuleList( - [VectorQuantization(**kwargs) for _ in range(num_quantizers)] - ) + self.layers = nn.ModuleList([VectorQuantization(**kwargs) for _ in range(num_quantizers)]) - def forward( - self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None - ): + def forward(self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None): quantized_out = 0.0 residual = x @@ -359,9 +343,7 @@ class ResidualVectorQuantization(nn.Module): 
out_losses, out_indices = map(torch.stack, (all_losses, all_indices)) return quantized_out, out_indices, out_losses, out_quantized - def encode( - self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None - ) -> torch.Tensor: + def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor: residual = x all_indices = [] n_q = n_q or len(self.layers) diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 6ceca203..11f6b092 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -1,24 +1,18 @@ -import time -import logging import os import random import traceback -import numpy as np import torch import torch.utils.data from tqdm import tqdm -from module import commons -from module.mel_processing import spectrogram_torch,spec_to_mel_torch +from module.mel_processing import spectrogram_torch, spec_to_mel_torch from text import cleaned_text_to_sequence -from utils import load_wav_to_torch, load_filepaths_and_text import torch.nn.functional as F -from functools import lru_cache -import requests -from scipy.io import wavfile -from io import BytesIO from tools.my_utils import load_audio -version = os.environ.get('version',None) + +version = os.environ.get("version", None) + + # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): """ @@ -43,7 +37,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if (len(tmp) != 4): + if len(tmp) != 4: continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -51,7 +45,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if (leng < min_num): + if leng < min_num: self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -76,7 +70,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") phoneme_ids = cleaned_text_to_sequence(phoneme, version) except Exception: print(f"{audiopath} not in self.phoneme_data !") @@ -111,7 +105,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") - if (ssl.shape[-1] != spec.shape[-1]): + if ssl.shape[-1] != spec.shape[-1]: typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -129,8 +123,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, - center=False) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) spec = torch.squeeze(spec, 0) return spec, audio_norm @@ -146,12 +141,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): return len(self.audiopaths_sid_text) def random_slice(self, ssl, wav, mel): - assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ( - "first", ssl.shape, wav.shape) + assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, 
("first", ssl.shape, wav.shape) len_mel = mel.shape[1] if self.val: - reference_mel = mel[:, :len_mel // 3] + reference_mel = mel[:, : len_mel // 3] return reference_mel, ssl, wav, mel dir = random.randint(0, 1) sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2)) @@ -159,20 +153,29 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): if dir == 0: reference_mel = mel[:, :sep_point] ssl = ssl[:, :, sep_point:] - wav2 = wav[:, sep_point * self.hop_length:] + wav2 = wav[:, sep_point * self.hop_length :] mel = mel[:, sep_point:] else: reference_mel = mel[:, sep_point:] ssl = ssl[:, :, :sep_point] - wav2 = wav[:, :sep_point * self.hop_length] + wav2 = wav[:, : sep_point * self.hop_length] mel = mel[:, :sep_point] assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, ( - ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir) + ssl.shape, + wav.shape, + wav2.shape, + mel.shape, + sep_point, + self.hop_length, + sep_point * self.hop_length, + dir, + ) return reference_mel, ssl, wav2, mel -class TextAudioSpeakerCollate(): - """ Zero-pads model inputs and targets - """ + + +class TextAudioSpeakerCollate: + """Zero-pads model inputs and targets""" def __init__(self, return_ids=False): self.return_ids = return_ids @@ -184,9 +187,7 @@ class TextAudioSpeakerCollate(): batch: [text_normalized, spec_normalized, wav_normalized, sid] """ # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) @@ -214,22 +215,24 @@ class TextAudioSpeakerCollate(): row = batch[ids_sorted_decreasing[i]] ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav + wav_padded[i, :, : wav.size(1)] = wav wav_lengths[i] = wav.size(1) text = row[3] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths + + class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -253,7 +256,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if (len(tmp) != 4): + if len(tmp) != 4: continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -261,7 +264,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if (leng < min_num): + if leng < min_num: self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -286,7 +289,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") phoneme_ids = cleaned_text_to_sequence(phoneme, version) except Exception: print(f"{audiopath} not in self.phoneme_data !") @@ -313,15 +316,16 @@ class 
TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo self.audiopaths_sid_text = audiopaths_sid_text_new self.lengths = lengths - self.spec_min=-12 - self.spec_max=2 + self.spec_min = -12 + self.spec_max = 2 + + self.filter_length_mel = self.win_length_mel = 1024 + self.hop_length_mel = 256 + self.n_mel_channels = 100 + self.sampling_rate_mel = 24000 + self.mel_fmin = 0 + self.mel_fmax = None - self.filter_length_mel=self.win_length_mel=1024 - self.hop_length_mel=256 - self.n_mel_channels=100 - self.sampling_rate_mel=24000 - self.mel_fmin=0 - self.mel_fmax=None def norm_spec(self, x): return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 @@ -332,7 +336,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") - if (ssl.shape[-1] != spec.shape[-1]): + if ssl.shape[-1] != spec.shape[-1]: typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -347,25 +351,35 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): return (ssl, spec, mel, text) def get_audio(self, filename): - audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768 - audio=torch.FloatTensor(audio_array)#/32768 + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - audio_array24 = load_audio(filename,24000)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 - audio24=torch.FloatTensor(audio_array24)#/32768 + audio_array24 = load_audio( + filename, 24000 + ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 + audio24 = torch.FloatTensor(audio_array24) # /32768 audio_norm24 = audio24 audio_norm24 = audio_norm24.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) spec = torch.squeeze(spec, 0) - - spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False) - mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax) + spec1 = spectrogram_torch( + audio_norm24, + self.filter_length_mel, + self.sampling_rate_mel, + self.hop_length_mel, + self.win_length_mel, + center=False, + ) + mel = spec_to_mel_torch( + spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax + ) mel = torch.squeeze(mel, 0) - mel=self.norm_spec(mel) + mel = self.norm_spec(mel) # print(1111111,spec.shape,mel.shape) return spec, mel @@ -379,9 +393,10 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): def __len__(self): return len(self.audiopaths_sid_text) -class TextAudioSpeakerCollateV3(): - """ Zero-pads model inputs and targets - """ + + +class TextAudioSpeakerCollateV3: + """Zero-pads model inputs and targets""" def __init__(self, return_ids=False): self.return_ids = return_ids @@ -392,12 +407,10 @@ class TextAudioSpeakerCollateV3(): ------ batch: [text_normalized, spec_normalized, wav_normalized, sid] """ - #ssl, spec, wav,mel, text + # ssl, 
spec, wav,mel, text # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) -#(ssl, spec,mel, text) + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) + # (ssl, spec,mel, text) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1)) @@ -411,7 +424,7 @@ class TextAudioSpeakerCollateV3(): # max_wav_len = max([x[2].size(1) for x in batch]) max_text_len = max([x[3].size(0) for x in batch]) - max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320 + max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320 ssl_lengths = torch.LongTensor(len(batch)) spec_lengths = torch.LongTensor(len(batch)) @@ -422,7 +435,7 @@ class TextAudioSpeakerCollateV3(): spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) - text_padded = torch.LongTensor(len(batch), max_text_len) + text_padded = torch.LongTensor(len(batch), max_text_len) # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) spec_padded.zero_() @@ -435,11 +448,11 @@ class TextAudioSpeakerCollateV3(): row = batch[ids_sorted_decreasing[i]] # ssl, spec, wav,mel, text ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) # wav = row[2] @@ -447,15 +460,228 @@ class TextAudioSpeakerCollateV3(): # wav_lengths[i] = wav.size(1) mel = row[2] - mel_padded[i, :, :mel.size(1)] = mel + mel_padded[i, :, : mel.size(1)] = mel mel_lengths[i] = mel.size(1) text = row[3] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths - return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths + return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths + + +class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset): + """ + 1) loads audio, speaker_id, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
+ """ + + def __init__(self, hparams, val=False): + exp_dir = hparams.exp_dir + self.path2 = "%s/2-name2text.txt" % exp_dir + self.path4 = "%s/4-cnhubert" % exp_dir + self.path5 = "%s/5-wav32k" % exp_dir + assert os.path.exists(self.path2) + assert os.path.exists(self.path4) + assert os.path.exists(self.path5) + names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀 + names5 = set(os.listdir(self.path5)) + self.phoneme_data = {} + with open(self.path2, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + for line in lines: + tmp = line.split("\t") + if len(tmp) != 4: + continue + self.phoneme_data[tmp[0]] = [tmp[1]] + + self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5) + tmp = self.audiopaths_sid_text + leng = len(tmp) + min_num = 100 + if leng < min_num: + self.audiopaths_sid_text = [] + for _ in range(max(2, int(min_num / leng))): + self.audiopaths_sid_text += tmp + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.val = val + + random.seed(1234) + random.shuffle(self.audiopaths_sid_text) + + print("phoneme_data_len:", len(self.phoneme_data.keys())) + print("wav_data_len:", len(self.audiopaths_sid_text)) + + audiopaths_sid_text_new = [] + lengths = [] + skipped_phone = 0 + skipped_dur = 0 + for audiopath in tqdm(self.audiopaths_sid_text): + try: + phoneme = self.phoneme_data[audiopath][0] + phoneme = phoneme.split(" ") + phoneme_ids = cleaned_text_to_sequence(phoneme, version) + except Exception: + print(f"{audiopath} not in self.phoneme_data !") + skipped_phone += 1 + continue + + size = os.path.getsize("%s/%s" % (self.path5, audiopath)) + duration = size / self.sampling_rate / 2 + + if duration == 0: + print(f"Zero duration for {audiopath}, skipping...") + skipped_dur += 1 + continue + + if 54 > duration > 0.6 or self.val: + audiopaths_sid_text_new.append([audiopath, phoneme_ids]) + lengths.append(size // (2 * self.hop_length)) + else: + skipped_dur += 1 + continue + + print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur) + print("total left: ", len(audiopaths_sid_text_new)) + assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo + self.audiopaths_sid_text = audiopaths_sid_text_new + self.lengths = lengths + self.spec_min = -12 + self.spec_max = 2 + + self.filter_length_mel = self.win_length_mel = 1280 + self.hop_length_mel = 320 + self.n_mel_channels = 100 + self.sampling_rate_mel = 32000 + self.mel_fmin = 0 + self.mel_fmax = None + + def norm_spec(self, x): + return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 + + def get_audio_text_speaker_pair(self, audiopath_sid_text): + audiopath, phoneme_ids = audiopath_sid_text + text = torch.FloatTensor(phoneme_ids) + try: + spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath)) + with torch.no_grad(): + ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") + if ssl.shape[-1] != spec.shape[-1]: + typee = ssl.dtype + ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) + ssl.requires_grad = False + except: + traceback.print_exc() + mel = torch.zeros(100, 192) + # wav = torch.zeros(1, 96 * self.hop_length) + spec = torch.zeros(1025, 96) + ssl = torch.zeros(1, 768, 96) + text = text[-1:] + print("load audio or ssl error!!!!!!", audiopath) + return (ssl, spec, mel, text) + + def 
get_audio(self, filename): + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio = torch.FloatTensor(audio_array) # /32768 + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) + spec = torch.squeeze(spec, 0) + spec1 = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False) + mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None) + mel = self.norm_spec(torch.squeeze(mel, 0)) + return spec, mel + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def __getitem__(self, index): + # with torch.no_grad(): + return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) + + def __len__(self): + return len(self.audiopaths_sid_text) + + +class TextAudioSpeakerCollateV4: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text, audio and speaker identities + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized, sid] + """ + # ssl, spec, wav,mel, text + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) + # (ssl, spec,mel, text) + max_ssl_len = max([x[0].size(2) for x in batch]) + max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) + max_spec_len = max([x[1].size(1) for x in batch]) + max_spec_len = int(2 * ((max_spec_len // 2) + 1)) + # max_wav_len = max([x[2].size(1) for x in batch]) + max_text_len = max([x[3].size(0) for x in batch]) + + ssl_lengths = torch.LongTensor(len(batch)) + spec_lengths = torch.LongTensor(len(batch)) + text_lengths = torch.LongTensor(len(batch)) + # wav_lengths = torch.LongTensor(len(batch)) + mel_lengths = torch.LongTensor(len(batch)) + + spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) + mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len * 2) + ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) + text_padded = torch.LongTensor(len(batch), max_text_len) + # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) + + spec_padded.zero_() + mel_padded.zero_() + ssl_padded.zero_() + text_padded.zero_() + # wav_padded.zero_() + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + # ssl, spec, wav,mel, text + ssl = row[0] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] + ssl_lengths[i] = ssl.size(2) + + spec = row[1] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + # wav = row[2] + # wav_padded[i, :, :wav.size(1)] = wav + # wav_lengths[i] = wav.size(1) + + mel = row[2] + mel_padded[i, :, : mel.size(1)] = mel + mel_lengths[i] = mel.size(1) + + text = row[3] + text_padded[i, : text.size(0)] = text + text_lengths[i] = text.size(0) + + # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths + return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths + + class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -479,7 +705,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if 
(len(tmp) != 4): + if len(tmp) != 4: continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -487,7 +713,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if (leng < min_num): + if leng < min_num: self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -512,7 +738,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") phoneme_ids = cleaned_text_to_sequence(phoneme, version) except Exception: print(f"{audiopath} not in self.phoneme_data !") @@ -539,15 +765,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo self.audiopaths_sid_text = audiopaths_sid_text_new self.lengths = lengths - self.spec_min=-12 - self.spec_max=2 + self.spec_min = -12 + self.spec_max = 2 + + self.filter_length_mel = self.win_length_mel = 1024 + self.hop_length_mel = 256 + self.n_mel_channels = 100 + self.sampling_rate_mel = 24000 + self.mel_fmin = 0 + self.mel_fmax = None - self.filter_length_mel=self.win_length_mel=1024 - self.hop_length_mel=256 - self.n_mel_channels=100 - self.sampling_rate_mel=24000 - self.mel_fmin=0 - self.mel_fmax=None def norm_spec(self, x): return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 @@ -555,10 +782,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): audiopath, phoneme_ids = audiopath_sid_text text = torch.FloatTensor(phoneme_ids) try: - spec, mel,wav = self.get_audio("%s/%s" % (self.path5, audiopath)) + spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") - if (ssl.shape[-1] != spec.shape[-1]): + if ssl.shape[-1] != spec.shape[-1]: typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -573,27 +800,37 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): return (ssl, spec, wav, mel, text) def get_audio(self, filename): - audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768 - audio=torch.FloatTensor(audio_array)#/32768 + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - audio_array24 = load_audio(filename,24000)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 - audio24=torch.FloatTensor(audio_array24)#/32768 + audio_array24 = load_audio( + filename, 24000 + ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 + audio24 = torch.FloatTensor(audio_array24) # /32768 audio_norm24 = audio24 audio_norm24 = audio_norm24.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) spec = torch.squeeze(spec, 0) - - spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False) - mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax) + spec1 = spectrogram_torch( + 
audio_norm24, + self.filter_length_mel, + self.sampling_rate_mel, + self.hop_length_mel, + self.win_length_mel, + center=False, + ) + mel = spec_to_mel_torch( + spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax + ) mel = torch.squeeze(mel, 0) - mel=self.norm_spec(mel) + mel = self.norm_spec(mel) # print(1111111,spec.shape,mel.shape) - return spec, mel,audio_norm + return spec, mel, audio_norm def get_sid(self, sid): sid = torch.LongTensor([int(sid)]) @@ -605,9 +842,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): def __len__(self): return len(self.audiopaths_sid_text) -class TextAudioSpeakerCollateV3b(): - """ Zero-pads model inputs and targets - """ + + +class TextAudioSpeakerCollateV3b: + """Zero-pads model inputs and targets""" def __init__(self, return_ids=False): self.return_ids = return_ids @@ -618,12 +856,10 @@ class TextAudioSpeakerCollateV3b(): ------ batch: [text_normalized, spec_normalized, wav_normalized, sid] """ - #ssl, spec, wav,mel, text + # ssl, spec, wav,mel, text # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) -#(ssl, spec,mel, text) + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) + # (ssl, spec,mel, text) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1)) @@ -636,7 +872,7 @@ class TextAudioSpeakerCollateV3b(): max_spec_len = int(2 * ((max_spec_len // 2) + 1)) max_wav_len = max([x[2].size(1) for x in batch]) max_text_len = max([x[4].size(0) for x in batch]) - max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320 + max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320 ssl_lengths = torch.LongTensor(len(batch)) spec_lengths = torch.LongTensor(len(batch)) @@ -647,7 +883,7 @@ class TextAudioSpeakerCollateV3b(): spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) mel_padded = torch.FloatTensor(len(batch), batch[0][3].size(0), max_mel_len) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) - text_padded = torch.LongTensor(len(batch), max_text_len) + text_padded = torch.LongTensor(len(batch), max_text_len) wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) spec_padded.zero_() @@ -660,28 +896,40 @@ class TextAudioSpeakerCollateV3b(): row = batch[ids_sorted_decreasing[i]] # ssl, spec, wav,mel, text ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav + wav_padded[i, :, : wav.size(1)] = wav wav_lengths[i] = wav.size(1) mel = row[3] - mel_padded[i, :, :mel.size(1)] = mel + mel_padded[i, :, : mel.size(1)] = mel mel_lengths[i] = mel.size(1) text = row[4] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) - return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths + return ( + ssl_padded, + spec_padded, + mel_padded, + ssl_lengths, + spec_lengths, + text_padded, + text_lengths, + wav_padded, + wav_lengths, + mel_lengths, + ) # return ssl_padded, spec_padded,mel_padded, 
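# --- Illustrative sketch, not code from this diff: the right-zero-padding
# pattern shared by the TextAudioSpeakerCollate* classes above. The helper name
# pad_feature_batch is hypothetical. Variable-length [C, T] features are sorted
# by length, copied into a zeroed [B, C, T_max] buffer, and the true lengths are
# returned so masks can be rebuilt downstream.
import torch

def pad_feature_batch(feats):                 # feats: list of [C, T_i] tensors
    lengths = torch.LongTensor([f.size(1) for f in feats])
    order = torch.argsort(lengths, descending=True)
    t_max = int(lengths.max())
    out = torch.zeros(len(feats), feats[0].size(0), t_max)
    for i, idx in enumerate(order):
        f = feats[idx]
        out[i, :, : f.size(1)] = f
    return out, lengths[order]

batch = [torch.randn(80, t) for t in (37, 52, 41)]
padded, lens = pad_feature_batch(batch)
print(padded.shape, lens.tolist())            # torch.Size([3, 80, 52]) [52, 41, 37]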
ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths + class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): """ Maintain similar input lengths in a batch. @@ -745,12 +993,12 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): num_samples_bucket = self.num_samples_per_bucket[i] rem = num_samples_bucket - len_bucket - ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] + ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)] - ids_bucket = ids_bucket[self.rank::self.num_replicas] + ids_bucket = ids_bucket[self.rank :: self.num_replicas] for j in range(len(ids_bucket) // self.batch_size): - batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] + batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]] batches.append(batch) if self.shuffle: @@ -777,4 +1025,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): return -1 def __len__(self): - return self.num_samples // self.batch_size \ No newline at end of file + return self.num_samples // self.batch_size diff --git a/GPT_SoVITS/module/losses.py b/GPT_SoVITS/module/losses.py index b23fc8c1..2b642db1 100644 --- a/GPT_SoVITS/module/losses.py +++ b/GPT_SoVITS/module/losses.py @@ -1,7 +1,6 @@ import math import torch -from torch.nn import functional as F def feature_loss(fmap_r, fmap_g): @@ -66,8 +65,6 @@ def mle_loss(z, m, logs, logdet, mask): torch.exp(-2 * logs) * ((z - m) ** 2) ) # neg normal likelihood w/o the constant term l = l - torch.sum(logdet) # log jacobian determinant - l = l / torch.sum( - torch.ones_like(z) * mask - ) # averaging across batch, channel and time axes + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term return l diff --git a/GPT_SoVITS/module/mel_processing.py b/GPT_SoVITS/module/mel_processing.py index d94b0456..62c7b40e 100644 --- a/GPT_SoVITS/module/mel_processing.py +++ b/GPT_SoVITS/module/mel_processing.py @@ -1,16 +1,5 @@ -import math -import os -import random import torch -from torch import nn -import torch.nn.functional as F import torch.utils.data -import numpy as np -import librosa -import librosa.util as librosa_util -from librosa.util import normalize, pad_center, tiny -from scipy.signal import get_window -from scipy.io.wavfile import read from librosa.filters import mel as librosa_mel_fn MAX_WAV_VALUE = 32768.0 @@ -49,31 +38,31 @@ hann_window = {} def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - if torch.min(y) < -1.0: + if torch.min(y) < -1.2: print("min value is ", torch.min(y)) - if torch.max(y) > 1.0: + if torch.max(y) > 1.2: print("max value is ", torch.max(y)) global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + # wnsize_dtype_device = str(win_size) + '_' + dtype_device + key = "%s-%s-%s-%s-%s" % (dtype_device, n_fft, sampling_rate, hop_size, win_size) + # if wnsize_dtype_device not in hann_window: + if key not in hann_window: + # hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + hann_window[key] = 
torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" ) y = y.squeeze(1) + # spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], spec = torch.stft( y, n_fft, hop_length=hop_size, win_length=win_size, - window=hann_window[wnsize_dtype_device], + window=hann_window[key], center=center, pad_mode="reflect", normalized=False, @@ -81,54 +70,55 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) return_complex=False, ) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8) return spec def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): global mel_basis dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=spec.dtype, device=spec.device - ) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + # fmax_dtype_device = str(fmax) + '_' + dtype_device + key = "%s-%s-%s-%s-%s-%s" % (dtype_device, n_fft, num_mels, sampling_rate, fmin, fmax) + # if fmax_dtype_device not in mel_basis: + if key not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + # mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + mel_basis[key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + # spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = torch.matmul(mel_basis[key], spec) spec = spectral_normalize_torch(spec) return spec -def mel_spectrogram_torch( - y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False -): - if torch.min(y) < -1.0: +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.2: print("min value is ", torch.min(y)) - if torch.max(y) > 1.0: + if torch.max(y) > 1.2: print("max value is ", torch.max(y)) global mel_basis, hann_window dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_size) + "_" + dtype_device + # fmax_dtype_device = str(fmax) + '_' + dtype_device + fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s" % ( + dtype_device, + n_fft, + num_mels, + sampling_rate, + hop_size, + win_size, + fmin, + fmax, + ) + # wnsize_dtype_device = str(win_size) + '_' + dtype_device + wnsize_dtype_device = fmax_dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=y.dtype, device=y.device - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) y = 
torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" ) y = y.squeeze(1) @@ -145,7 +135,7 @@ def mel_spectrogram_torch( return_complex=False, ) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8) spec = torch.matmul(mel_basis[fmax_dtype_device], spec) spec = spectral_normalize_torch(spec) diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 33bd6077..3e37f0ff 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -1,9 +1,7 @@ import warnings + warnings.filterwarnings("ignore") -import copy import math -import os -import pdb import torch from torch import nn @@ -13,16 +11,18 @@ from module import commons from module import modules from module import attentions from f5_tts.model import DiT -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from module.commons import init_weights, get_padding from module.mrte_model import MRTE from module.quantize import ResidualVectorQuantizer + # from text import symbols from text import symbols as symbols_v1 from text import symbols2 as symbols_v2 from torch.cuda.amp import autocast -import contextlib,random +import contextlib +import random class StochasticDurationPredictor(nn.Module): @@ -48,29 +48,21 @@ class StochasticDurationPredictor(nn.Module): self.flows = nn.ModuleList() self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -91,10 +83,7 @@ class StochasticDurationPredictor(nn.Module): h_w = self.post_pre(w) h_w = self.post_convs(h_w, x_mask) h_w = self.post_proj(h_w) * x_mask - e_q = ( - torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) - * x_mask - ) + e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask z_q = e_q for flow in self.post_flows: z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) @@ -102,13 +91,8 @@ class StochasticDurationPredictor(nn.Module): z_u, z1 = torch.split(z_q, [1, 1], 1) u = torch.sigmoid(z_u) * x_mask z0 = (w - u) * x_mask - logdet_tot_q += torch.sum( 
- (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] - ) - logq = ( - torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - - logdet_tot_q - ) + logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) + logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) @@ -117,18 +101,12 @@ class StochasticDurationPredictor(nn.Module): for flow in flows: z, logdet = flow(z, x_mask, g=x, reverse=reverse) logdet_tot = logdet_tot + logdet - nll = ( - torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - - logdet_tot - ) + nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot return nll + logq # [b] else: flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = ( - torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) - * noise_scale - ) + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale for flow in flows: z = flow(z, x_mask, g=x, reverse=reverse) z0, z1 = torch.split(z, [1, 1], 1) @@ -137,9 +115,7 @@ class StochasticDurationPredictor(nn.Module): class DurationPredictor(nn.Module): - def __init__( - self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 - ): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): super().__init__() self.in_channels = in_channels @@ -149,13 +125,9 @@ class DurationPredictor(nn.Module): self.gin_channels = gin_channels self.drop = nn.Dropout(p_dropout) - self.conv_1 = nn.Conv1d( - in_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d( - filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) @@ -190,7 +162,7 @@ class TextEncoder(nn.Module): kernel_size, p_dropout, latent_channels=192, - version = "v2", + version="v2", ): super().__init__() self.out_channels = out_channels @@ -237,26 +209,22 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, y, y_lengths, text, text_lengths, ge, speed=1,test=None): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( - y.dtype - ) + def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=None): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) y = self.ssl_proj(y * y_mask) * y_mask y = self.encoder_ssl(y * y_mask, y_mask) - text_mask = torch.unsqueeze( - commons.sequence_mask(text_lengths, text.size(1)), 1 - ).to(y.dtype) + text_mask = torch.unsqueeze(commons.sequence_mask(text_lengths, text.size(1)), 1).to(y.dtype) if test == 1: text[:, :] = 0 text = self.text_embedding(text).transpose(1, 2) text = self.encoder_text(text * text_mask, text_mask) y = self.mrte(y, y_mask, text, text_mask, ge) y = self.encoder2(y * y_mask, y_mask) - if(speed!=1): - y = F.interpolate(y, size=int(y.shape[-1] / speed)+1, mode="linear") + if speed != 1: + y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear") y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest") 
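# --- Illustrative sketch with dummy tensors, not code from this diff: the speed
# control used a few lines up in TextEncoder.forward. Stretching or squeezing
# the encoded features along time with F.interpolate changes how many acoustic
# frames the decoder will produce; the mask is resampled with "nearest" so it
# stays binary.
import torch
import torch.nn.functional as F

y = torch.randn(1, 192, 100)                  # [B, hidden, T] encoder output
y_mask = torch.ones(1, 1, 100)
speed = 1.25                                  # >1 speaks faster (fewer frames)
y_fast = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear")
mask_fast = F.interpolate(y_mask, size=y_fast.shape[-1], mode="nearest")
print(y_fast.shape, mask_fast.shape)          # torch.Size([1, 192, 81]) torch.Size([1, 1, 81])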
stats = self.proj(y) * y_mask m, logs = torch.split(stats, self.out_channels, dim=1) @@ -360,9 +328,7 @@ class PosteriorEncoder(nn.Module): def forward(self, x, x_lengths, g=None): if g != None: g = g.detach() - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -372,14 +338,9 @@ class PosteriorEncoder(nn.Module): class Encoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): + def __init__( + self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0 + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -394,7 +355,7 @@ class Encoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels, 1) def forward(self, x, x_lengths, g=None): - if(g!=None): + if g != None: g = g.detach() x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask @@ -402,6 +363,7 @@ class Encoder(nn.Module): stats = self.proj(x) * x_mask return stats, x_mask + class WNEncoder(nn.Module): def __init__( self, @@ -434,9 +396,7 @@ class WNEncoder(nn.Module): self.norm = modules.LayerNorm(out_channels) def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) out = self.proj(x) * x_mask @@ -455,13 +415,12 @@ class Generator(torch.nn.Module): upsample_initial_channel, upsample_kernel_sizes, gin_channels=0, + is_bias=False, ): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 self.ups = nn.ModuleList() @@ -481,12 +440,10 @@ class Generator(torch.nn.Module): self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): self.resblocks.append(resblock(ch, k, d)) - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=is_bias) self.ups.apply(init_weights) if gin_channels != 0: @@ -636,9 +593,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): @@ -738,10 +693,7 @@ class Quantizer(torch.nn.Module): super(Quantizer, self).__init__() assert embed_dim % n_code_groups == 0 self.quantizer_modules = nn.ModuleList( - [ - Quantizer_module(n_codes, embed_dim // n_code_groups) - for _ in range(n_code_groups) - ] + [Quantizer_module(n_codes, 
embed_dim // n_code_groups) for _ in range(n_code_groups)] ) self.n_code_groups = n_code_groups self.embed_dim = embed_dim @@ -759,9 +711,7 @@ class Quantizer(torch.nn.Module): z_q.append(_z_q) min_indicies.append(_min_indicies) # B * T, z_q = torch.cat(z_q, -1).reshape(xin.shape) - loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( - (z_q - xin.detach()) ** 2 - ) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2) z_q = xin + (z_q - xin).detach() z_q = z_q.transpose(1, 2) codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) @@ -801,13 +751,9 @@ class CodePredictor(nn.Module): self.p_dropout = p_dropout self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) - self.ref_enc = modules.MelStyleEncoder( - ssl_dim, style_vector_dim=hidden_channels - ) + self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) + self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) self.n_q = n_q @@ -820,9 +766,7 @@ class CodePredictor(nn.Module): x = x + g x = self.encoder(x * x_mask, x_mask) x = self.out_proj(x * x_mask) * x_mask - logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( - 2, 3 - ) + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3) target = codes[1:].transpose(0, 1) if not infer: logits = logits.reshape(-1, self.dims) @@ -870,8 +814,8 @@ class SynthesizerTrn(nn.Module): use_sdp=True, semantic_frame_rate=None, freeze_quantizer=None, - version = "v2", - **kwargs + version="v2", + **kwargs, ): super().__init__() self.spec_channels = spec_channels @@ -902,7 +846,7 @@ class SynthesizerTrn(nn.Module): n_layers, kernel_size, p_dropout, - version = version, + version=version, ) self.dec = Generator( inter_channels, @@ -923,12 +867,10 @@ class SynthesizerTrn(nn.Module): 16, gin_channels=gin_channels, ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels - ) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) # self.version=os.environ.get("version","v1") - if(self.version=="v1"): + if self.version == "v1": self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels) else: self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) @@ -945,13 +887,11 @@ class SynthesizerTrn(nn.Module): self.freeze_quantizer = freeze_quantizer def forward(self, ssl, y, y_lengths, text, text_lengths): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( - y.dtype - ) - if(self.version=="v1"): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + if self.version == "v1": ge = self.ref_enc(y * y_mask, y_mask) else: - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) with autocast(enabled=False): maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext() with maybe_no_grad: @@ -959,24 +899,16 @@ class SynthesizerTrn(nn.Module): self.ssl_proj.eval() self.quantizer.eval() ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, quantized_list = self.quantizer( - ssl, layers=[0] - ) + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, 
layers=[0]) if self.semantic_frame_rate == "25hz": - quantized = F.interpolate( - quantized, size=int(quantized.shape[-1] * 2), mode="nearest" - ) + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") - x, m_p, logs_p, y_mask = self.enc_p( - quantized, y_lengths, text, text_lengths, ge - ) + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) z_p = self.flow(z, y_mask, g=ge) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) + z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=ge) return ( o, @@ -989,24 +921,18 @@ class SynthesizerTrn(nn.Module): ) def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, noise_scale=0.5): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( - y.dtype - ) - if(self.version=="v1"): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + if self.version == "v1": ge = self.ref_enc(y * y_mask, y_mask) else: - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) ssl = self.ssl_proj(ssl) quantized, codes, commit_loss, _ = self.quantizer(ssl, layers=[0]) if self.semantic_frame_rate == "25hz": - quantized = F.interpolate( - quantized, size=int(quantized.shape[-1] * 2), mode="nearest" - ) + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") - x, m_p, logs_p, y_mask = self.enc_p( - quantized, y_lengths, text, text_lengths, ge, test=test - ) + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, test=test) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) @@ -1015,39 +941,34 @@ class SynthesizerTrn(nn.Module): return o, y_mask, (z, z_p, m_p, logs_p) @torch.no_grad() - def decode(self, codes, text, refer, noise_scale=0.5,speed=1): + def decode(self, codes, text, refer, noise_scale=0.5, speed=1): def get_ge(refer): ge = None if refer is not None: refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) - refer_mask = torch.unsqueeze( - commons.sequence_mask(refer_lengths, refer.size(2)), 1 - ).to(refer.dtype) - if (self.version == "v1"): + refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) + if self.version == "v1": ge = self.ref_enc(refer * refer_mask, refer_mask) else: ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) return ge - if(type(refer)==list): - ges=[] + + if type(refer) == list: + ges = [] for _refer in refer: - ge=get_ge(_refer) + ge = get_ge(_refer) ges.append(ge) - ge=torch.stack(ges,0).mean(0) + ge = torch.stack(ges, 0).mean(0) else: - ge=get_ge(refer) + ge = get_ge(refer) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) if self.semantic_frame_rate == "25hz": - quantized = F.interpolate( - quantized, size=int(quantized.shape[-1] * 2), mode="nearest" - ) - x, m_p, logs_p, y_mask = self.enc_p( - quantized, y_lengths, text, text_lengths, ge,speed - ) + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, 
y_mask, g=ge, reverse=True) @@ -1059,11 +980,10 @@ class SynthesizerTrn(nn.Module): ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) return codes.transpose(0, 1) + + class CFM(torch.nn.Module): - def __init__( - self, - in_channels,dit - ): + def __init__(self, in_channels, dit): super().__init__() self.sigma_min = 1e-6 @@ -1077,41 +997,54 @@ class CFM(torch.nn.Module): def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0): """Forward diffusion""" B, T = mu.size(0), mu.size(1) - x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) * temperature + x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature prompt_len = prompt.size(-1) - prompt_x = torch.zeros_like(x,dtype=mu.dtype) + prompt_x = torch.zeros_like(x, dtype=mu.dtype) prompt_x[..., :prompt_len] = prompt[..., :prompt_len] x[..., :prompt_len] = 0 - mu=mu.transpose(2,1) + mu = mu.transpose(2, 1) t = 0 d = 1 / n_timesteps for j in range(n_timesteps): - t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t - d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d + t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t + d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d # v_pred = model(x, t_tensor, d_tensor, **extra_args) - v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu, use_grad_ckpt=False,drop_audio_cond=False,drop_text=False).transpose(2, 1) - if inference_cfg_rate>1e-5: - neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1) - v_pred=v_pred+(v_pred-neg)*inference_cfg_rate + v_pred = self.estimator( + x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False + ).transpose(2, 1) + if inference_cfg_rate > 1e-5: + neg = self.estimator( + x, + prompt_x, + x_lens, + t_tensor, + d_tensor, + mu, + use_grad_ckpt=False, + drop_audio_cond=True, + drop_text=True, + ).transpose(2, 1) + v_pred = v_pred + (v_pred - neg) * inference_cfg_rate x = x + d * v_pred t = t + d x[:, :, :prompt_len] = 0 return x + def forward(self, x1, x_lens, prompt_lens, mu, use_grad_ckpt): b, _, t = x1.shape t = torch.rand([b], device=mu.device, dtype=x1.dtype) - x0 = torch.randn_like(x1,device=mu.device) + x0 = torch.randn_like(x1, device=mu.device) vt = x1 - x0 xt = x0 + t[:, None, None] * vt - dt = torch.zeros_like(t,device=mu.device) + dt = torch.zeros_like(t, device=mu.device) prompt = torch.zeros_like(x1) for i in range(b): - prompt[i, :, :prompt_lens[i]] = x1[i, :, :prompt_lens[i]] - xt[i, :, :prompt_lens[i]] = 0 - gailv=0.3# if ttime()>1736250488 else 0.1 + prompt[i, :, : prompt_lens[i]] = x1[i, :, : prompt_lens[i]] + xt[i, :, : prompt_lens[i]] = 0 + gailv = 0.3 # if ttime()>1736250488 else 0.1 if random.random() < gailv: base = torch.randint(2, 8, (t.shape[0],), device=mu.device) - d = 1/torch.pow(2, base) + d = 1 / torch.pow(2, base) d_input = d.clone() d_input[d_input < 1e-2] = 0 # with torch.no_grad(): @@ -1119,52 +1052,55 @@ class CFM(torch.nn.Module): # v_pred_1 = self.diffusion(xt, t, d_input, cond=conditioning).detach() x_mid = xt + d[:, None, None] * v_pred_1 # v_pred_2 = self.diffusion(x_mid, t+d, d_input, cond=conditioning).detach() - v_pred_2 = self.estimator(x_mid, prompt, x_lens, t+d, d_input, mu, use_grad_ckpt).transpose(2, 1).detach() + v_pred_2 = self.estimator(x_mid, prompt, x_lens, t + d, 
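# --- Illustrative sketch, not code from this diff: the Euler sampling loop used
# by CFM.inference above, including the classifier-free-guidance combination
# v + (v - v_uncond) * rate. `toy_estimator` is a stand-in for the DiT velocity
# model, and prompt masking/zeroing is omitted for brevity.
import torch

def toy_estimator(x, t, cond, drop_cond=False):
    # pretend velocity field; with dropped conditioning it ignores `cond`
    return -x if drop_cond else (cond - x)

def sample(cond, n_timesteps=8, cfg_rate=0.5, temperature=1.0):
    x = torch.randn_like(cond) * temperature
    t, d = 0.0, 1.0 / n_timesteps
    for _ in range(n_timesteps):
        t_tensor = torch.full((x.shape[0],), t)
        v = toy_estimator(x, t_tensor, cond)
        if cfg_rate > 1e-5:
            neg = toy_estimator(x, t_tensor, cond, drop_cond=True)
            v = v + (v - neg) * cfg_rate
        x = x + d * v                         # explicit Euler step along the flow
        t += d
    return x

print(sample(torch.ones(2, 4)).shape)         # torch.Size([2, 4])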
d_input, mu, use_grad_ckpt).transpose(2, 1).detach() vt = (v_pred_1 + v_pred_2) / 2 vt = vt.detach() - dt = 2*d + dt = 2 * d - vt_pred = self.estimator(xt, prompt, x_lens, t,dt, mu, use_grad_ckpt).transpose(2,1) + vt_pred = self.estimator(xt, prompt, x_lens, t, dt, mu, use_grad_ckpt).transpose(2, 1) loss = 0 for i in range(b): - loss += self.criterion(vt_pred[i, :, prompt_lens[i]:x_lens[i]], vt[i, :, prompt_lens[i]:x_lens[i]]) + loss += self.criterion(vt_pred[i, :, prompt_lens[i] : x_lens[i]], vt[i, :, prompt_lens[i] : x_lens[i]]) loss /= b return loss + def set_no_grad(net_g): for name, param in net_g.named_parameters(): - param.requires_grad=False + param.requires_grad = False + class SynthesizerTrnV3(nn.Module): """ Synthesizer for Training """ - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - version="v3", - **kwargs): - + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + version="v3", + **kwargs, + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -1185,132 +1121,133 @@ class SynthesizerTrnV3(nn.Module): self.gin_channels = gin_channels self.version = version - self.model_dim=512 + self.model_dim = 512 self.use_sdp = use_sdp - self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback - self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback # self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, # upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) # self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, # gin_channels=gin_channels) # self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) - self.freeze_quantizer=freeze_quantizer - inter_channels2=512 - self.bridge=nn.Sequential( - nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), - nn.LeakyReLU() - ) - self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) - 
self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) - self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim - if self.freeze_quantizer==True: + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) + self.freeze_quantizer = freeze_quantizer + inter_channels2 = 512 + self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU()) + self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels) + self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1) + self.cfm = CFM( + 100, + DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)), + ) # text_dim is condition feature dim + if self.freeze_quantizer == True: set_no_grad(self.ssl_proj) set_no_grad(self.quantizer) set_no_grad(self.enc_p) - def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths, use_grad_ckpt):#ssl_lengths no need now + def forward( + self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths, use_grad_ckpt + ): # ssl_lengths no need now with autocast(enabled=False): y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext() with maybe_no_grad: if self.freeze_quantizer: - self.ssl_proj.eval()# + self.ssl_proj.eval() # self.quantizer.eval() self.enc_p.eval() ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, quantized_list = self.quantizer( - ssl, layers=[0] - ) - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0]) + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT - fea, y_mask_ = self.wns1(fea, mel_lengths, ge)##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate. - B=ssl.shape[0] - prompt_len_max = mel_lengths*2/3 + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT + fea, y_mask_ = self.wns1( + fea, mel_lengths, ge + ) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate. 
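# Illustrative sketch (hypothetical helper, not taken from this repo; only torch assumed):
# how a random prompt prefix feeds the flow-matching (CFM) objective used just below.
# Up to 2/3 of each target mel is kept as an in-context prompt; the straight-line velocity
# vt = x1 - x0 is the regression target on the remaining frames.
import torch

def demo_cfm_inputs(mel: torch.Tensor, mel_lengths: torch.Tensor):
    """Minimal sketch: draw per-sample prompt lengths and build CFM training inputs."""
    B, C, T = mel.shape
    prompt_len_max = mel_lengths * 2 / 3                    # same 2/3 cap as the code below
    prompt_len = (torch.rand(B) * prompt_len_max).floor().long()
    t = torch.rand(B, dtype=mel.dtype)                      # flow-matching time in [0, 1)
    x0 = torch.randn_like(mel)                              # noise endpoint of the path
    vt = mel - x0                                           # velocity target (x1 - x0)
    xt = x0 + t[:, None, None] * vt                         # point on the straight path
    prompt = torch.zeros_like(mel)
    for i in range(B):
        prompt[i, :, : prompt_len[i]] = mel[i, :, : prompt_len[i]]
        xt[i, :, : prompt_len[i]] = 0                       # prompt region is given, not predicted
    return xt, prompt, t, vt                                # an estimator would predict vt from these

# xt, prompt, t, vt = demo_cfm_inputs(torch.randn(2, 100, 250), torch.tensor([250.0, 200.0]))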
+ B = ssl.shape[0] + prompt_len_max = mel_lengths * 2 / 3 prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long) - minn=min(mel.shape[-1],fea.shape[-1]) - mel=mel[:,:,:minn] - fea=fea[:,:,:minn] - cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt) + minn = min(mel.shape[-1], fea.shape[-1]) + mel = mel[:, :, :minn] + fea = fea[:, :, :minn] + cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt) return cfm_loss @torch.no_grad() - def decode_encp(self, codes,text, refer,ge=None,speed=1): + def decode_encp(self, codes, text, refer, ge=None, speed=1): # print(2333333,refer.shape) # ge=None - if(ge==None): + if ge == None: refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) - ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) - y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device) - if speed==1: - sizee=int(codes.size(2)*2.5*1.5) + ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) + y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device) + if speed == 1: + sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4)) else: - sizee=int(codes.size(2)*2.5*1.5/speed)+1 + sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4) / speed) + 1 y_lengths1 = torch.LongTensor([sizee]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT - x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge,speed) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed) + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) - return fea,ge + return fea, ge def extract_latent(self, x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) + return codes.transpose(0, 1) + class SynthesizerTrnV3b(nn.Module): """ Synthesizer for Training """ - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - **kwargs): - + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + **kwargs, + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -1330,47 +1267,52 @@ class 
SynthesizerTrnV3b(nn.Module): self.n_speakers = n_speakers self.gin_channels = gin_channels - self.model_dim=512 + self.model_dim = 512 self.use_sdp = use_sdp - self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback - self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback - self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, - gin_channels=gin_channels) + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels + ) self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) - self.freeze_quantizer=freeze_quantizer + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) + self.freeze_quantizer = freeze_quantizer - inter_channels2=512 - self.bridge=nn.Sequential( - nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), - nn.LeakyReLU() - ) - self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) - self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) - self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim + inter_channels2 = 512 + self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU()) + self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels) + self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1) + self.cfm = CFM( + 100, + DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)), + ) # text_dim is condition feature dim - - def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths):#ssl_lengths no need now + def forward(self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths): # ssl_lengths no need now with autocast(enabled=False): y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) # ge = self.ref_enc(y * y_mask, y_mask)#change back, new spec setting is whole 24k # ge=None maybe_no_grad = torch.no_grad() if self.freeze_quantizer else 
contextlib.nullcontext() @@ -1379,51 +1321,59 @@ class SynthesizerTrnV3b(nn.Module): self.ssl_proj.eval() self.quantizer.eval() ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, quantized_list = self.quantizer( - ssl, layers=[0] - ) - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0]) + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) z_p = self.flow(z, y_mask, g=ge) z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=ge) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT fea, y_mask_ = self.wns1(fea, mel_lengths, ge) learned_mel = self.linear_mel(fea) - B=ssl.shape[0] - prompt_len_max = mel_lengths*2/3 - prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long)# - minn=min(mel.shape[-1],fea.shape[-1]) - mel=mel[:,:,:minn] - fea=fea[:,:,:minn] - cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea)#fea==cond,y_lengths==target_mel_lengths#ge not need - return commit_loss,cfm_loss,F.mse_loss(learned_mel, mel),o, ids_slice, y_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), quantized + B = ssl.shape[0] + prompt_len_max = mel_lengths * 2 / 3 + prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long) # + minn = min(mel.shape[-1], fea.shape[-1]) + mel = mel[:, :, :minn] + fea = fea[:, :, :minn] + cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea) # fea==cond,y_lengths==target_mel_lengths#ge not need + return ( + commit_loss, + cfm_loss, + F.mse_loss(learned_mel, mel), + o, + ids_slice, + y_mask, + y_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + quantized, + ) @torch.no_grad() - def decode_encp(self, codes,text, refer,ge=None): + def decode_encp(self, codes, text, refer, ge=None): # print(2333333,refer.shape) # ge=None - if(ge==None): + if ge == None: refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) - ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) - y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device) - y_lengths1 = torch.LongTensor([int(codes.size(2)*2.5*1.5)]).to(codes.device) + ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) + y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device) + y_lengths1 = torch.LongTensor([int(codes.size(2) * 2.5 * 1.5)]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) - return fea,ge + return fea, ge def extract_latent(self, 
x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) + return codes.transpose(0, 1) diff --git a/GPT_SoVITS/module/models_onnx.py b/GPT_SoVITS/module/models_onnx.py index 1c240564..8a3ad13f 100644 --- a/GPT_SoVITS/module/models_onnx.py +++ b/GPT_SoVITS/module/models_onnx.py @@ -1,4 +1,3 @@ -import copy import math from typing import Optional import torch @@ -11,14 +10,14 @@ from module import attentions_onnx as attentions from f5_tts.model import DiT -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from module.commons import init_weights, get_padding from module.quantize import ResidualVectorQuantizer + # from text import symbols from text import symbols as symbols_v1 from text import symbols2 as symbols_v2 -from torch.cuda.amp import autocast class StochasticDurationPredictor(nn.Module): @@ -44,29 +43,21 @@ class StochasticDurationPredictor(nn.Module): self.flows = nn.ModuleList() self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -87,10 +78,7 @@ class StochasticDurationPredictor(nn.Module): h_w = self.post_pre(w) h_w = self.post_convs(h_w, x_mask) h_w = self.post_proj(h_w) * x_mask - e_q = ( - torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) - * x_mask - ) + e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask z_q = e_q for flow in self.post_flows: z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) @@ -98,13 +86,8 @@ class StochasticDurationPredictor(nn.Module): z_u, z1 = torch.split(z_q, [1, 1], 1) u = torch.sigmoid(z_u) * x_mask z0 = (w - u) * x_mask - logdet_tot_q += torch.sum( - (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] - ) - logq = ( - torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - - logdet_tot_q - ) + logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) + logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) @@ -113,18 +96,12 @@ class StochasticDurationPredictor(nn.Module): for flow in flows: 
z, logdet = flow(z, x_mask, g=x, reverse=reverse) logdet_tot = logdet_tot + logdet - nll = ( - torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - - logdet_tot - ) + nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot return nll + logq # [b] else: flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = ( - torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) - * noise_scale - ) + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale for flow in flows: z = flow(z, x_mask, g=x, reverse=reverse) z0, z1 = torch.split(z, [1, 1], 1) @@ -133,9 +110,7 @@ class StochasticDurationPredictor(nn.Module): class DurationPredictor(nn.Module): - def __init__( - self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 - ): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): super().__init__() self.in_channels = in_channels @@ -145,13 +120,9 @@ class DurationPredictor(nn.Module): self.gin_channels = gin_channels self.drop = nn.Dropout(p_dropout) - self.conv_1 = nn.Conv1d( - in_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d( - filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) @@ -234,7 +205,7 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, y, text, ge, speed=1): - y_mask = torch.ones_like(y[:1,:1,:]) + y_mask = torch.ones_like(y[:1, :1, :]) y = self.ssl_proj(y * y_mask) * y_mask y = self.encoder_ssl(y * y_mask, y_mask) @@ -246,8 +217,8 @@ class TextEncoder(nn.Module): y = self.mrte(y, y_mask, text, text_mask, ge) y = self.encoder2(y * y_mask, y_mask) - if(speed!=1): - y = F.interpolate(y, size=int(y.shape[-1] / speed)+1, mode="linear") + if speed != 1: + y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear") y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest") stats = self.proj(y) * y_mask @@ -333,9 +304,7 @@ class PosteriorEncoder(nn.Module): def forward(self, x, x_lengths, g=None): if g != None: g = g.detach() - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -345,14 +314,9 @@ class PosteriorEncoder(nn.Module): class Encoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): + def __init__( + self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0 + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -367,7 +331,7 @@ class Encoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels, 1) def forward(self, x, x_lengths, g=None): - if(g!=None): + if g != None: g = g.detach() x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask @@ -375,6 +339,7 @@ class 
Encoder(nn.Module): stats = self.proj(x) * x_mask return stats, x_mask + class WNEncoder(nn.Module): def __init__( self, @@ -407,9 +372,7 @@ class WNEncoder(nn.Module): self.norm = modules.LayerNorm(out_channels) def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) out = self.proj(x) * x_mask @@ -432,9 +395,7 @@ class Generator(torch.nn.Module): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 self.ups = nn.ModuleList() @@ -454,9 +415,7 @@ class Generator(torch.nn.Module): self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) @@ -465,7 +424,7 @@ class Generator(torch.nn.Module): if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - def forward(self, x, g:Optional[torch.Tensor]=None): + def forward(self, x, g: Optional[torch.Tensor] = None): x = self.conv_pre(x) if g is not None: x = x + self.cond(g) @@ -609,9 +568,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): @@ -711,10 +668,7 @@ class Quantizer(torch.nn.Module): super(Quantizer, self).__init__() assert embed_dim % n_code_groups == 0 self.quantizer_modules = nn.ModuleList( - [ - Quantizer_module(n_codes, embed_dim // n_code_groups) - for _ in range(n_code_groups) - ] + [Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups)] ) self.n_code_groups = n_code_groups self.embed_dim = embed_dim @@ -732,9 +686,7 @@ class Quantizer(torch.nn.Module): z_q.append(_z_q) min_indicies.append(_min_indicies) # B * T, z_q = torch.cat(z_q, -1).reshape(xin.shape) - loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( - (z_q - xin.detach()) ** 2 - ) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2) z_q = xin + (z_q - xin).detach() z_q = z_q.transpose(1, 2) codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) @@ -774,13 +726,9 @@ class CodePredictor(nn.Module): self.p_dropout = p_dropout self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) - self.ref_enc = modules.MelStyleEncoder( - ssl_dim, style_vector_dim=hidden_channels - ) + self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) + self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, 
kernel_size, p_dropout) self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) self.n_q = n_q @@ -793,9 +741,7 @@ class CodePredictor(nn.Module): x = x + g x = self.encoder(x * x_mask, x_mask) x = self.out_proj(x * x_mask) * x_mask - logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( - 2, 3 - ) + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3) target = codes[1:].transpose(0, 1) if not infer: logits = logits.reshape(-1, self.dims) @@ -844,7 +790,7 @@ class SynthesizerTrn(nn.Module): semantic_frame_rate=None, freeze_quantizer=None, version="v2", - **kwargs + **kwargs, ): super().__init__() self.spec_channels = spec_channels @@ -896,9 +842,7 @@ class SynthesizerTrn(nn.Module): # 16, # gin_channels=gin_channels, # ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels - ) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) # self.version=os.environ.get("version","v1") if self.version == "v1": @@ -923,9 +867,9 @@ class SynthesizerTrn(nn.Module): # self.enc_p.encoder_text.requires_grad_(False) # self.enc_p.mrte.requires_grad_(False) - def forward(self, codes, text, refer,noise_scale=0.5, speed=1): - refer_mask = torch.ones_like(refer[:1,:1,:]) - if (self.version == "v1"): + def forward(self, codes, text, refer, noise_scale=0.5, speed=1): + refer_mask = torch.ones_like(refer[:1, :1, :]) + if self.version == "v1": ge = self.ref_enc(refer * refer_mask, refer_mask) else: ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) @@ -935,10 +879,8 @@ class SynthesizerTrn(nn.Module): dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0) quantized = dquantized.contiguous().view(1, self.ssl_dim, -1) - x, m_p, logs_p, y_mask = self.enc_p( - quantized, text, ge, speed - ) - + x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed) + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) @@ -951,11 +893,9 @@ class SynthesizerTrn(nn.Module): quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) return codes.transpose(0, 1) + class CFM(torch.nn.Module): - def __init__( - self, - in_channels,dit - ): + def __init__(self, in_channels, dit): super().__init__() # self.sigma_min = 1e-6 @@ -965,27 +905,34 @@ class CFM(torch.nn.Module): # self.criterion = torch.nn.MSELoss() - def forward(self, mu:torch.Tensor, x_lens:torch.LongTensor, prompt:torch.Tensor, n_timesteps:torch.LongTensor, temperature:float=1.0): + def forward( + self, + mu: torch.Tensor, + x_lens: torch.LongTensor, + prompt: torch.Tensor, + n_timesteps: torch.LongTensor, + temperature: float = 1.0, + ): """Forward diffusion""" B, T = mu.size(0), mu.size(1) - x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) + x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype) ntimesteps = int(n_timesteps) prompt_len = prompt.size(-1) - prompt_x = torch.zeros_like(x,dtype=mu.dtype) + prompt_x = torch.zeros_like(x, dtype=mu.dtype) prompt_x[..., :prompt_len] = prompt[..., :prompt_len] x[..., :prompt_len] = 0.0 - mu=mu.transpose(2,1) - t = torch.tensor(0.0,dtype=x.dtype,device=x.device) - d = torch.tensor(1.0/ntimesteps,dtype=x.dtype,device=x.device) - d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d + mu = mu.transpose(2, 1) + t = torch.tensor(0.0, dtype=x.dtype, device=x.device) + d = torch.tensor(1.0 / ntimesteps, dtype=x.dtype, 
device=x.device) + d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d for j in range(ntimesteps): - t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t + t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t # d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d # v_pred = model(x, t_tensor, d_tensor, **extra_args) - v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu).transpose(2, 1) + v_pred = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu).transpose(2, 1) # if inference_cfg_rate>1e-5: # neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1) # v_pred=v_pred+(v_pred-neg)*inference_cfg_rate @@ -997,47 +944,51 @@ class CFM(torch.nn.Module): def set_no_grad(net_g): for name, param in net_g.named_parameters(): - param.requires_grad=False + param.requires_grad = False + @torch.jit.script_if_tracing def compile_codes_length(codes): y_lengths1 = torch.LongTensor([codes.size(2)]).to(codes.device) return y_lengths1 * 2.5 * 1.5 + @torch.jit.script_if_tracing def compile_ref_length(refer): refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) return refer_lengths + class SynthesizerTrnV3(nn.Module): """ Synthesizer for Training """ - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - version="v3", - **kwargs): - + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + version="v3", + **kwargs, + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -1058,41 +1009,38 @@ class SynthesizerTrnV3(nn.Module): self.gin_channels = gin_channels self.version = version - self.model_dim=512 + self.model_dim = 512 self.use_sdp = use_sdp - self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback - self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback # self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, # upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) # self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, # gin_channels=gin_channels) # self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = 
semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) freeze_quantizer - inter_channels2=512 - self.bridge=nn.Sequential( - nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), - nn.LeakyReLU() - ) - self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) - self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) - self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim - if freeze_quantizer==True: + inter_channels2 = 512 + self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU()) + self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels) + self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1) + self.cfm = CFM( + 100, + DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)), + ) # text_dim is condition feature dim + if freeze_quantizer == True: set_no_grad(self.ssl_proj) set_no_grad(self.quantizer) set_no_grad(self.enc_p) @@ -1100,24 +1048,23 @@ class SynthesizerTrnV3(nn.Module): def create_ge(self, refer): refer_lengths = compile_ref_length(refer) refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) - ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) + ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) return ge - def forward(self, codes, text,ge,speed=1): + def forward(self, codes, text, ge, speed=1): + y_lengths1 = compile_codes_length(codes) - y_lengths1=compile_codes_length(codes) - quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT - x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge,speed) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT + x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed) + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) return fea def extract_latent(self, x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) \ No newline at end of file + return codes.transpose(0, 1) diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index f4447455..7493f0b4 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -52,11 +52,7 @@ class ConvReluNorm(nn.Module): self.conv_layers = nn.ModuleList() self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) self.norm_layers.append(LayerNorm(hidden_channels)) self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) for _ in 
range(n_layers - 1): @@ -156,9 +152,7 @@ class WN(torch.nn.Module): self.drop = nn.Dropout(p_dropout) if gin_channels != 0: - cond_layer = torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) + cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") for i in range(n_layers): @@ -479,9 +473,7 @@ class ConvFlow(nn.Module): self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) - self.proj = nn.Conv1d( - filter_channels, self.half_channels * (num_bins * 3 - 1), 1 - ) + self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) self.proj.weight.data.zero_() self.proj.bias.data.zero_() @@ -495,9 +487,7 @@ class ConvFlow(nn.Module): h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels) unnormalized_derivatives = h[..., 2 * self.num_bins :] x1, logabsdet = piecewise_rational_quadratic_transform( @@ -616,9 +606,7 @@ class MultiHeadAttention(nn.Module): self.w_ks = nn.Linear(d_model, n_head * d_k) self.w_vs = nn.Linear(d_model, n_head * d_v) - self.attention = ScaledDotProductAttention( - temperature=np.power(d_model, 0.5), dropout=dropout - ) + self.attention = ScaledDotProductAttention(temperature=np.power(d_model, 0.5), dropout=dropout) self.fc = nn.Linear(n_head * d_v, d_model) self.dropout = nn.Dropout(dropout) @@ -649,9 +637,7 @@ class MultiHeadAttention(nn.Module): output, attn = self.attention(q, k, v, mask=slf_mask) output = output.view(n_head, sz_b, len_x, d_v) - output = ( - output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) - ) # b x lq x (n*dv) + output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) # b x lq x (n*dv) output = self.fc(output) @@ -741,9 +727,7 @@ class MelStyleEncoder(nn.Module): if mask is not None: mask = (mask.int() == 0).squeeze(1) max_len = x.shape[1] - slf_attn_mask = ( - mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None - ) + slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None # spectral x = self.spectral(x) @@ -785,9 +769,7 @@ class MelStyleEncoderVAE(nn.Module): mu = self.fc1(enc_out) logvar = self.fc2(enc_out) posterior = D.Normal(mu, torch.exp(logvar)) - kl_divergence = D.kl_divergence( - posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar)) - ) + kl_divergence = D.kl_divergence(posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar))) loss_kl = kl_divergence.mean() z = posterior.rsample() @@ -825,9 +807,7 @@ class ActNorm(nn.Module): def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs): if x_mask is None: - x_mask = torch.ones(x.size(0), 1, x.size(2)).to( - device=x.device, dtype=x.dtype - ) + x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype) x_len = torch.sum(x_mask, [1, 2]) if not self.initialized: self.initialize(x, x_mask) @@ -856,9 +836,7 @@ class ActNorm(nn.Module): v = m_sq - (m**2) logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) - bias_init = ( - (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) - ) + bias_init = (-m * 
torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) self.bias.data.copy_(bias_init) @@ -873,9 +851,7 @@ class InvConvNear(nn.Module): self.n_split = n_split self.no_jacobian = no_jacobian - w_init = torch.linalg.qr( - torch.FloatTensor(self.n_split, self.n_split).normal_() - )[0] + w_init = torch.linalg.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] self.weight = nn.Parameter(w_init) @@ -890,11 +866,7 @@ class InvConvNear(nn.Module): x_len = torch.sum(x_mask, [1, 2]) x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) - x = ( - x.permute(0, 1, 3, 2, 4) - .contiguous() - .view(b, self.n_split, c // self.n_split, t) - ) + x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t) if reverse: if hasattr(self, "weight_inv"): diff --git a/GPT_SoVITS/module/mrte_model.py b/GPT_SoVITS/module/mrte_model.py index b0cd242c..e889b7e9 100644 --- a/GPT_SoVITS/module/mrte_model.py +++ b/GPT_SoVITS/module/mrte_model.py @@ -31,32 +31,15 @@ class MRTE(nn.Module): text_enc = self.text_pre(text * text_mask) if test != None: if test == 0: - x = ( - self.cross_attention( - ssl_enc * ssl_mask, text_enc * text_mask, attn_mask - ) - + ssl_enc - + ge - ) + x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge elif test == 1: x = ssl_enc + ge elif test == 2: - x = ( - self.cross_attention( - ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask - ) - + ge - ) + x = self.cross_attention(ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask) + ge else: raise ValueError("test should be 0,1,2") else: - x = ( - self.cross_attention( - ssl_enc * ssl_mask, text_enc * text_mask, attn_mask - ) - + ssl_enc - + ge - ) + x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge x = self.c_post(x * ssl_mask) return x @@ -70,9 +53,7 @@ class SpeakerEncoder(torch.nn.Module): model_embedding_size=256, ): super(SpeakerEncoder, self).__init__() - self.lstm = nn.LSTM( - mel_n_channels, model_hidden_size, model_num_layers, batch_first=True - ) + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() diff --git a/GPT_SoVITS/module/quantize.py b/GPT_SoVITS/module/quantize.py index f9a5c632..0afed835 100644 --- a/GPT_SoVITS/module/quantize.py +++ b/GPT_SoVITS/module/quantize.py @@ -7,7 +7,6 @@ """Residual vector quantizer implementation.""" from dataclasses import dataclass, field -import math import typing as tp import torch @@ -88,14 +87,10 @@ class ResidualVectorQuantizer(nn.Module): raise ValueError( f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." ) - quantized, codes, commit_loss, quantized_list = self.vq( - x, n_q=n_q, layers=layers - ) + quantized, codes, commit_loss, quantized_list = self.vq(x, n_q=n_q, layers=layers) return quantized, codes, torch.mean(commit_loss), quantized_list - def encode( - self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None - ) -> torch.Tensor: + def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor: """Encode a given input tensor with the specified sample rate at the given bandwidth. 
The RVQ encode method sets the appropriate number of quantizer to use and returns indices for each quantizer. diff --git a/GPT_SoVITS/module/transforms.py b/GPT_SoVITS/module/transforms.py index a11f799e..16b54985 100644 --- a/GPT_SoVITS/module/transforms.py +++ b/GPT_SoVITS/module/transforms.py @@ -37,7 +37,7 @@ def piecewise_rational_quadratic_transform( min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative, - **spline_kwargs + **spline_kwargs, ) return outputs, logabsdet @@ -175,8 +175,7 @@ def rational_quadratic_spline( theta_one_minus_theta = root * (1 - root) denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta ) derivative_numerator = input_delta.pow(2) * ( input_derivatives_plus_one * root.pow(2) @@ -190,12 +189,9 @@ def rational_quadratic_spline( theta = (inputs - input_cumwidths) / input_bin_widths theta_one_minus_theta = theta * (1 - theta) - numerator = input_heights * ( - input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta - ) + numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta) denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta ) outputs = input_cumheights + numerator / denominator diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index 43aac19a..fd680135 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -1,23 +1,22 @@ -from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 -from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule import torch import torchaudio -from torch import nn +from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule from feature_extractor import cnhubert +from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 +from torch import nn cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" cnhubert.cnhubert_base_path = cnhubert_base_path ssl_model = cnhubert.get_model() -from text import cleaned_text_to_sequence -import soundfile -from tools.my_utils import load_audio -import os import json +import os + +import soundfile +from text import cleaned_text_to_sequence + def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - hann_window = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + hann_window = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) y = torch.nn.functional.pad( y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), @@ -73,7 +72,7 @@ class T2SEncoder(nn.Module): super().__init__() self.encoder = t2s.onnx_encoder self.vits = vits - + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): codes = self.vits.extract_latent(ssl_content) prompt_semantic = codes[0, 0] @@ -102,22 +101,22 @@ class T2SModel(nn.Module): self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model) self.first_stage_decoder = self.t2s_model.first_stage_decoder self.stage_decoder = self.t2s_model.stage_decoder - #self.t2s_model = torch.jit.script(self.t2s_model) + # self.t2s_model = torch.jit.script(self.t2s_model) def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): early_stop_num = 
self.t2s_model.early_stop_num - #[1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] + # [1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) prefix_len = prompts.shape[1] - #[1,N,512] [1,N] + # [1,N,512] [1,N] y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) stop = False for idx in range(1, 1500): - #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] enco = self.stage_decoder(y, k, v, y_emb, x_example) y, k, v, y_emb, logits, samples = enco if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: @@ -131,13 +130,11 @@ class T2SModel(nn.Module): return y[:, -idx:].unsqueeze(0) def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False): - #self.onnx_encoder = torch.jit.script(self.onnx_encoder) + # self.onnx_encoder = torch.jit.script(self.onnx_encoder) if dynamo: export_options = torch.onnx.ExportOptions(dynamic_shapes=True) onnx_encoder_export_output = torch.onnx.dynamo_export( - self.onnx_encoder, - (ref_seq, text_seq, ref_bert, text_bert, ssl_content), - export_options=export_options + self.onnx_encoder, (ref_seq, text_seq, ref_bert, text_bert, ssl_content), export_options=export_options ) onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx") return @@ -149,13 +146,13 @@ class T2SModel(nn.Module): input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"], output_names=["x", "prompts"], dynamic_axes={ - "ref_seq": {1 : "ref_length"}, - "text_seq": {1 : "text_length"}, - "ref_bert": {0 : "ref_length"}, - "text_bert": {0 : "text_length"}, - "ssl_content": {2 : "ssl_length"}, + "ref_seq": {1: "ref_length"}, + "text_seq": {1: "text_length"}, + "ref_bert": {0: "ref_length"}, + "text_bert": {0: "text_length"}, + "ssl_content": {2: "ssl_length"}, }, - opset_version=16 + opset_version=16, ) x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) @@ -166,11 +163,11 @@ class T2SModel(nn.Module): input_names=["x", "prompts"], output_names=["y", "k", "v", "y_emb", "x_example"], dynamic_axes={ - "x": {1 : "x_length"}, - "prompts": {1 : "prompts_length"}, + "x": {1: "x_length"}, + "prompts": {1: "prompts_length"}, }, verbose=False, - opset_version=16 + opset_version=16, ) y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) @@ -181,38 +178,38 @@ class T2SModel(nn.Module): input_names=["iy", "ik", "iv", "iy_emb", "ix_example"], output_names=["y", "k", "v", "y_emb", "logits", "samples"], dynamic_axes={ - "iy": {1 : "iy_length"}, - "ik": {1 : "ik_length"}, - "iv": {1 : "iv_length"}, - "iy_emb": {1 : "iy_emb_length"}, - "ix_example": {1 : "ix_example_length"}, + "iy": {1: "iy_length"}, + "ik": {1: "ik_length"}, + "iv": {1: "iv_length"}, + "iy_emb": {1: "iy_emb_length"}, + "ix_example": {1: "ix_example_length"}, }, verbose=False, - opset_version=16 + opset_version=16, ) class VitsModel(nn.Module): def __init__(self, vits_path): super().__init__() - dict_s2 = torch.load(vits_path,map_location="cpu") + dict_s2 = torch.load(vits_path, map_location="cpu") self.hps = dict_s2["config"] - if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: self.hps["model"]["version"] = "v1" else: self.hps["model"]["version"] = "v2" - + self.hps = DictToAttrRecursive(self.hps) 
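# Illustrative sketch (hypothetical standalone helper; only torch assumed): the check
# above keys the symbol-set version off the text-embedding vocabulary size stored in
# the SoVITS checkpoint: 322 rows maps to "v1", anything else to "v2".
import torch

def detect_sovits_symbol_version(vits_path: str) -> str:
    """Return "v1" or "v2" from the text-embedding table size in a SoVITS checkpoint."""
    dict_s2 = torch.load(vits_path, map_location="cpu")
    vocab_rows = dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0]
    return "v1" if vocab_rows == 322 else "v2"

# version = detect_sovits_symbol_version("SoVITS_weights/example.pth")  # hypothetical path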
self.hps.model.semantic_frame_rate = "25hz" self.vq_model = SynthesizerTrn( self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, - **self.hps.model + **self.hps.model, ) self.vq_model.eval() self.vq_model.load_state_dict(dict_s2["weight"], strict=False) - + def forward(self, text_seq, pred_semantic, ref_audio): refer = spectrogram_torch( ref_audio, @@ -220,7 +217,7 @@ class VitsModel(nn.Module): self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length, - center=False + center=False, ) return self.vq_model(pred_semantic, text_seq, refer)[0, 0] @@ -230,18 +227,22 @@ class GptSoVits(nn.Module): super().__init__() self.vits = vits self.t2s = t2s - + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, debug=False): pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) audio = self.vits(text_seq, pred_semantic, ref_audio) if debug: import onnxruntime + sess = onnxruntime.InferenceSession("onnx/koharu/koharu_vits.onnx", providers=["CPU"]) - audio1 = sess.run(None, { - "text_seq" : text_seq.detach().cpu().numpy(), - "pred_semantic" : pred_semantic.detach().cpu().numpy(), - "ref_audio" : ref_audio.detach().cpu().numpy() - }) + audio1 = sess.run( + None, + { + "text_seq": text_seq.detach().cpu().numpy(), + "pred_semantic": pred_semantic.detach().cpu().numpy(), + "ref_audio": ref_audio.detach().cpu().numpy(), + }, + ) return audio, audio1 return audio @@ -255,12 +256,12 @@ class GptSoVits(nn.Module): input_names=["text_seq", "pred_semantic", "ref_audio"], output_names=["audio"], dynamic_axes={ - "text_seq": {1 : "text_length"}, - "pred_semantic": {2 : "pred_length"}, - "ref_audio": {1 : "audio_length"}, + "text_seq": {1: "text_length"}, + "pred_semantic": {2: "pred_length"}, + "ref_audio": {1: "audio_length"}, }, opset_version=17, - verbose=False + verbose=False, ) @@ -278,14 +279,67 @@ def export(vits_path, gpt_path, project_name, vits_model="v2"): gpt = T2SModel(gpt_path, vits) gpt_sovits = GptSoVits(vits, gpt) ssl = SSLModel() - ref_seq = torch.LongTensor([cleaned_text_to_sequence(["n", "i2", "h", "ao3", ",", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"],version=vits_model)]) - text_seq = torch.LongTensor([cleaned_text_to_sequence(["w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"],version=vits_model)]) + ref_seq = torch.LongTensor( + [ + cleaned_text_to_sequence( + [ + "n", + "i2", + "h", + "ao3", + ",", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + ], + version=vits_model, + ) + ] + ) + text_seq = torch.LongTensor( + [ + cleaned_text_to_sequence( + [ + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + ], + version=vits_model, + ) + ] + ) ref_bert = torch.randn((ref_seq.shape[1], 1024)).float() text_bert = torch.randn((text_seq.shape[1], 1024)).float() ref_audio = torch.randn((1, 48000 * 5)).float() # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float() - ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float() - ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,vits.hps.data.sampling_rate).float() + ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() + ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, 
vits.hps.data.sampling_rate).float() try: os.mkdir(f"onnx/{project_name}") @@ -326,8 +380,8 @@ def export(vits_path, gpt_path, project_name, vits_model="v2"): } MoeVSConfJson = json.dumps(MoeVSConf) - with open(f"onnx/{project_name}.json", 'w') as MoeVsConfFile: - json.dump(MoeVSConf, MoeVsConfFile, indent = 4) + with open(f"onnx/{project_name}.json", "w") as MoeVsConfFile: + json.dump(MoeVSConf, MoeVsConfFile, indent=4) if __name__ == "__main__": @@ -341,4 +395,4 @@ if __name__ == "__main__": exp_path = "nahida" export(vits_path, gpt_path, exp_path) - # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) \ No newline at end of file + # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index bdeacc7b..8d83e79a 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -8,19 +8,17 @@ exp_name = os.environ.get("exp_name") i_part = os.environ.get("i_part") all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] opt_dir = os.environ.get("opt_dir") bert_pretrained_dir = os.environ.get("bert_pretrained_dir") import torch + is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -version = os.environ.get('version', None) -import sys, numpy as np, traceback, pdb +version = os.environ.get("version", None) +import traceback import os.path -from glob import glob -from tqdm import tqdm from text.cleaner import clean_text from transformers import AutoModelForMaskedLM, AutoTokenizer -import numpy as np from tools.my_utils import clean_path # inp_text=sys.argv[1] @@ -36,13 +34,13 @@ from time import time as ttime import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - tmp_path="%s%s.pth"%(ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) + tmp_path = "%s%s.pth" % (ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) @@ -56,8 +54,10 @@ if os.path.exists(txt_path) == False: # device = "mps" else: device = "cpu" - if os.path.exists(bert_pretrained_dir):... - else:raise FileNotFoundError(bert_pretrained_dir) + if os.path.exists(bert_pretrained_dir): + ... 
+ else: + raise FileNotFoundError(bert_pretrained_dir) tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) if is_half == True: @@ -86,12 +86,10 @@ if os.path.exists(txt_path) == False: def process(data, res): for name, text, lan in data: try: - name=clean_path(name) + name = clean_path(name) name = os.path.basename(name) print(name) - phones, word2ph, norm_text = clean_text( - text.replace("%", "-").replace("¥", ","), lan, version - ) + phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("¥", ","), lan, version) path_bert = "%s/%s.pt" % (bert_dir, name) if os.path.exists(path_bert) == False and lan == "zh": bert_feature = get_bert_feature(norm_text, word2ph) @@ -131,9 +129,7 @@ if os.path.exists(txt_path) == False: wav_name, spk_name, language, text = line.split("|") # todo.append([name,text,"zh"]) if language in language_v1_to_language_v2.keys(): - todo.append( - [wav_name, text, language_v1_to_language_v2.get(language, language)] - ) + todo.append([wav_name, text, language_v1_to_language_v2.get(language, language)]) else: print(f"\033[33m[Waring] The {language = } of {wav_name} is not supported for training.\033[0m") except: diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 27b61f27..3a84c014 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -1,25 +1,31 @@ # -*- coding: utf-8 -*- -import sys,os -inp_text= os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") +import sys +import os + +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] from feature_extractor import cnhubert -opt_dir= os.environ.get("opt_dir") -cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") + +opt_dir = os.environ.get("opt_dir") +cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir") import torch + is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -import pdb,traceback,numpy as np,logging +import traceback +import numpy as np from scipy.io import wavfile import librosa + now_dir = os.getcwd() sys.path.append(now_dir) -from tools.my_utils import load_audio,clean_path +from tools.my_utils import load_audio, clean_path # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path @@ -34,90 +40,95 @@ from tools.my_utils import load_audio,clean_path from time import time as ttime import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - tmp_path="%s%s.pth"%(ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) + tmp_path = "%s%s.pth" % (ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) 
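# Illustrative sketch (hypothetical helper using only the standard library and torch):
# the my_save() above sidesteps torch.save failing on some non-ASCII destination paths
# by writing to an ASCII temp file in the working directory first and then moving it
# into place; a generic variant of the same pattern built on tempfile could look like:
import os
import shutil
import tempfile

import torch

def save_tensor_safely(obj, dst_path: str) -> None:
    """torch.save via an ASCII temp file, then shutil.move to the final (possibly non-ASCII) path."""
    fd, tmp_path = tempfile.mkstemp(suffix=".pth", dir=".")
    os.close(fd)
    torch.save(obj, tmp_path)
    shutil.move(tmp_path, dst_path)

# save_tensor_safely(torch.zeros(3), "中文目录/features.pt")  # target dir must already exist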
-hubert_dir="%s/4-cnhubert"%(opt_dir) -wav32dir="%s/5-wav32k"%(opt_dir) -os.makedirs(opt_dir,exist_ok=True) -os.makedirs(hubert_dir,exist_ok=True) -os.makedirs(wav32dir,exist_ok=True) -maxx=0.95 -alpha=0.5 +hubert_dir = "%s/4-cnhubert" % (opt_dir) +wav32dir = "%s/5-wav32k" % (opt_dir) +os.makedirs(opt_dir, exist_ok=True) +os.makedirs(hubert_dir, exist_ok=True) +os.makedirs(wav32dir, exist_ok=True) + +maxx = 0.95 +alpha = 0.5 if torch.cuda.is_available(): device = "cuda:0" # elif torch.backends.mps.is_available(): # device = "mps" else: device = "cpu" -model=cnhubert.get_model() +model = cnhubert.get_model() # is_half=False -if(is_half==True): - model=model.half().to(device) +if is_half == True: + model = model.half().to(device) else: model = model.to(device) -nan_fails=[] -def name2go(wav_name,wav_path): - hubert_path="%s/%s.pt"%(hubert_dir,wav_name) - if(os.path.exists(hubert_path)):return +nan_fails = [] + + +def name2go(wav_name, wav_path): + hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) + if os.path.exists(hubert_path): + return tmp_audio = load_audio(wav_path, 32000) tmp_max = np.abs(tmp_audio).max() if tmp_max > 2.2: print("%s-filtered,%s" % (wav_name, tmp_max)) return - tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio - tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio - tmp_audio = librosa.resample( - tmp_audio32b, orig_sr=32000, target_sr=16000 - )#不是重采样问题 + tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * tmp_audio + tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * tmp_audio + tmp_audio = librosa.resample(tmp_audio32b, orig_sr=32000, target_sr=16000) # 不是重采样问题 tensor_wav16 = torch.from_numpy(tmp_audio) - if (is_half == True): - tensor_wav16=tensor_wav16.half().to(device) + if is_half == True: + tensor_wav16 = tensor_wav16.half().to(device) else: tensor_wav16 = tensor_wav16.to(device) - ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) - if np.isnan(ssl.detach().numpy()).sum()!= 0: - nan_fails.append((wav_name,wav_path)) - print("nan filtered:%s"%wav_name) + ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu() # torch.Size([1, 768, 215]) + if np.isnan(ssl.detach().numpy()).sum() != 0: + nan_fails.append((wav_name, wav_path)) + print("nan filtered:%s" % wav_name) return wavfile.write( - "%s/%s"%(wav32dir,wav_name), + "%s/%s" % (wav32dir, wav_name), 32000, tmp_audio32.astype("int16"), ) - my_save(ssl,hubert_path) + my_save(ssl, hubert_path) -with open(inp_text,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") -for line in lines[int(i_part)::int(all_parts)]: +with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + +for line in lines[int(i_part) :: int(all_parts)]: try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name=clean_path(wav_name) - if (inp_wav_dir != "" and inp_wav_dir != None): + wav_name = clean_path(wav_name) + if inp_wav_dir != "" and inp_wav_dir != None: wav_name = os.path.basename(wav_name) - wav_path = "%s/%s"%(inp_wav_dir, wav_name) + wav_path = "%s/%s" % (inp_wav_dir, wav_name) else: - wav_path=wav_name + wav_path = wav_name wav_name = os.path.basename(wav_name) - name2go(wav_name,wav_path) + name2go(wav_name, wav_path) except: - print(line,traceback.format_exc()) + print(line, 
traceback.format_exc()) -if(len(nan_fails)>0 and is_half==True): - is_half=False - model=model.float() +if len(nan_fails) > 0 and is_half == True: + is_half = False + model = model.float() for wav in nan_fails: try: - name2go(wav[0],wav[1]) + name2go(wav[0], wav[1]) except: - print(wav_name,traceback.format_exc()) + print(wav_name, traceback.format_exc()) diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index b213a8a6..ddb0607c 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -5,13 +5,15 @@ exp_name = os.environ.get("exp_name") i_part = os.environ.get("i_part") all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] opt_dir = os.environ.get("opt_dir") pretrained_s2G = os.environ.get("pretrained_s2G") s2config_path = os.environ.get("s2config_path") -if os.path.exists(pretrained_s2G):... -else:raise FileNotFoundError(pretrained_s2G) +if os.path.exists(pretrained_s2G): + ... +else: + raise FileNotFoundError(pretrained_s2G) # version=os.environ.get("version","v2") size = os.path.getsize(pretrained_s2G) if size < 82978 * 1024: @@ -25,23 +27,22 @@ elif size < 700 * 1024 * 1024: else: version = "v3" import torch + is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -import math, traceback -import multiprocessing -import sys, pdb +import traceback +import sys now_dir = os.getcwd() sys.path.append(now_dir) -from random import shuffle -import torch.multiprocessing as mp -from glob import glob -from tqdm import tqdm -import logging, librosa, utils -if version!="v3": +import logging +import utils + +if version != "v3": from module.models import SynthesizerTrn else: from module.models import SynthesizerTrnV3 as SynthesizerTrn from tools.my_utils import clean_path + logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G @@ -70,7 +71,7 @@ if os.path.exists(semantic_path) == False: hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, version=version, - **hps.model + **hps.model, ) if is_half == True: vq_model = vq_model.half().to(device) @@ -107,7 +108,7 @@ if os.path.exists(semantic_path) == False: try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name=clean_path(wav_name) + wav_name = clean_path(wav_name) wav_name = os.path.basename(wav_name) # name2go(name,lines1) name2go(wav_name, lines1) diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 36ef4347..1c458a4d 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -1,37 +1,45 @@ import traceback from collections import OrderedDict from time import time as ttime -import shutil,os +import shutil +import os import torch from tools.i18n.i18n import I18nAuto i18n = I18nAuto() -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s.pth"%(ttime()) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) -''' +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s.pth" % (ttime()) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +""" 00:v1 01:v2 02:v3 
03:v3lora +04:v4lora - -''' +""" from io import BytesIO -def my_save2(fea,path): + + +def my_save2(fea, path, cfm_version): bio = BytesIO() torch.save(fea, bio) bio.seek(0) data = bio.getvalue() - data = b'03' + data[2:]###temp for v3lora only, todo - with open(path, "wb") as f: f.write(data) + byte = b"03" if cfm_version == "v3" else b"04" + data = byte + data[2:] + with open(path, "wb") as f: + f.write(data) -def savee(ckpt, name, epoch, steps, hps,lora_rank=None): + +def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None): try: opt = OrderedDict() opt["weight"] = {} @@ -42,49 +50,58 @@ def savee(ckpt, name, epoch, steps, hps,lora_rank=None): opt["config"] = hps opt["info"] = "%sepoch_%siteration" % (epoch, steps) if lora_rank: - opt["lora_rank"]=lora_rank - my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) + opt["lora_rank"] = lora_rank + my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version) else: my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) return "Success." except: return traceback.format_exc() -head2version={ - b'00':["v1","v1",False], - b'01':["v2","v2",False], - b'02':["v2","v3",False], - b'03':["v2","v3",True], + +head2version = { + b"00": ["v1", "v1", False], + b"01": ["v2", "v2", False], + b"02": ["v2", "v3", False], + b"03": ["v2", "v3", True], + b"04": ["v2", "v4", True], } -hash_pretrained_dict={ - "dc3c97e17592963677a4a1681f30c653":["v2","v2",False],#s2G488k.pth#sovits_v1_pretrained - "43797be674a37c1c83ee81081941ed0f":["v2","v3",False],#s2Gv3.pth#sovits_v3_pretrained - "6642b37f3dbb1f76882b69937c95a5f3":["v2","v2",False],#s2G2333K.pth#sovits_v2_pretrained +hash_pretrained_dict = { + "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained + "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained + "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained + "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained } import hashlib + + def get_hash_from_file(sovits_path): - with open(sovits_path,"rb")as f:data=f.read(8192) + with open(sovits_path, "rb") as f: + data = f.read(8192) hash_md5 = hashlib.md5() hash_md5.update(data) return hash_md5.hexdigest() + + def get_sovits_version_from_path_fast(sovits_path): ###1-if it is pretrained sovits models, by hash - hash=get_hash_from_file(sovits_path) + hash = get_hash_from_file(sovits_path) if hash in hash_pretrained_dict: return hash_pretrained_dict[hash] - ###2-new weights or old weights, by head - with open(sovits_path,"rb")as f:version=f.read(2) - if version!=b"PK": + ###2-new weights, by head + with open(sovits_path, "rb") as f: + version = f.read(2) + if version != b"PK": return head2version[version] ###3-old weights, by file size - if_lora_v3=False - size=os.path.getsize(sovits_path) - ''' + if_lora_v3 = False + size = os.path.getsize(sovits_path) + """ v1weights:about 82942KB half thr:82978KB v2weights:about 83014KB v3weights:about 750MB - ''' + """ if size < 82978 * 1024: model_version = version = "v1" elif size < 700 * 1024 * 1024: @@ -92,15 +109,16 @@ def get_sovits_version_from_path_fast(sovits_path): else: version = "v2" model_version = "v3" - return version,model_version,if_lora_v3 + return version, model_version, if_lora_v3 + def load_sovits_new(sovits_path): - f=open(sovits_path,"rb") - meta=f.read(2) - if meta!="PK": - data = b'PK' + f.read() + f = open(sovits_path, "rb") + meta = f.read(2) + if meta != "PK": + data = b"PK" + 
f.read() bio = BytesIO() bio.write(data) bio.seek(0) return torch.load(bio, map_location="cpu", weights_only=False) - return torch.load(sovits_path,map_location="cpu", weights_only=False) \ No newline at end of file + return torch.load(sovits_path, map_location="cpu", weights_only=False) diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py index 4311db98..1176f0bc 100644 --- a/GPT_SoVITS/s1_train.py +++ b/GPT_SoVITS/s1_train.py @@ -1,31 +1,28 @@ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py import os -import pdb if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] import argparse import logging +import platform from pathlib import Path -import torch, platform -from pytorch_lightning import seed_everything -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger -from pytorch_lightning.strategies import DDPStrategy +import torch from AR.data.data_module import Text2SemanticDataModule from AR.models.t2s_lightning_module import Text2SemanticLightningModule from AR.utils.io import load_yaml_config +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger +from pytorch_lightning.strategies import DDPStrategy logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("matplotlib").setLevel(logging.WARNING) torch.set_float32_matmul_precision("high") -from AR.utils import get_newest_ckpt - from collections import OrderedDict -from time import time as ttime -import shutil + +from AR.utils import get_newest_ckpt from process_ckpt import my_save @@ -37,7 +34,7 @@ class my_model_ckpt(ModelCheckpoint): if_save_every_weights, half_weights_save_dir, exp_name, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.if_save_latest = if_save_latest @@ -50,10 +47,7 @@ class my_model_ckpt(ModelCheckpoint): # if not self._should_skip_saving_checkpoint(trainer) and self._should_save_on_train_epoch_end(trainer): if self._should_save_on_train_epoch_end(trainer): monitor_candidates = self._monitor_candidates(trainer) - if ( - self._every_n_epochs >= 1 - and (trainer.current_epoch + 1) % self._every_n_epochs == 0 - ): + if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0: if ( self.if_save_latest == True ): ####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt @@ -75,7 +69,7 @@ class my_model_ckpt(ModelCheckpoint): to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) # torch.save( # print(os.environ) - if(os.environ.get("LOCAL_RANK","0")=="0"): + if os.environ.get("LOCAL_RANK", "0") == "0": my_save( to_save_od, "%s/%s-e%s.ckpt" @@ -112,7 +106,7 @@ def main(args): dirpath=ckpt_dir, ) logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir) - os.environ["MASTER_ADDR"]="localhost" + os.environ["MASTER_ADDR"] = "localhost" os.environ["USE_LIBUV"] = "0" trainer: Trainer = Trainer( max_epochs=config["train"]["epochs"], @@ -123,9 +117,9 @@ def main(args): devices=-1 if torch.cuda.is_available() else 1, benchmark=False, fast_dev_run=False, - strategy = DDPStrategy( - process_group_backend="nccl" if platform.system() != "Windows" else "gloo" - ) if torch.cuda.is_available() else "auto", + strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo") + if torch.cuda.is_available() + 
else "auto", precision=config["train"]["precision"], logger=logger, num_sanity_val_steps=0, @@ -133,9 +127,7 @@ def main(args): use_distributed_sampler=False, # 非常简单的修改,但解决了采用自定义的 bucket_sampler 下训练步数不一致的问题! ) - model: Text2SemanticLightningModule = Text2SemanticLightningModule( - config, output_dir - ) + model: Text2SemanticLightningModule = Text2SemanticLightningModule(config, output_dir) data_module: Text2SemanticDataModule = Text2SemanticDataModule( config, diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index ea863b0e..ab461186 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -1,36 +1,41 @@ import warnings + warnings.filterwarnings("ignore") -import utils, os +import os + +import utils + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging, traceback logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) from random import randint -from module import commons +from module import commons from module.data_utils import ( - TextAudioSpeakerLoader, - TextAudioSpeakerCollate, DistributedBucketSampler, + TextAudioSpeakerCollate, + TextAudioSpeakerLoader, ) -from module.models import ( - SynthesizerTrn, - MultiPeriodDiscriminator, -) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss +from module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from module.models import ( + MultiPeriodDiscriminator, + SynthesizerTrn, +) from process_ckpt import savee torch.backends.cudnn.benchmark = False @@ -46,7 +51,6 @@ device = "cpu" # cuda以外的设备,等mps优化后加入 def main(): - if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: @@ -74,7 +78,7 @@ def run(rank, n_gpus, hps): writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank, @@ -128,19 +132,27 @@ def run(rank, n_gpus, hps): # batch_size=1, pin_memory=True, # drop_last=False, collate_fn=collate_fn) - net_g = SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).to(device) + net_g = ( + SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) + if torch.cuda.is_available() + else SynthesizerTrn( + 
hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + ) - net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) + net_d = ( + MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) + if torch.cuda.is_available() + else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) + ) for name, param in net_g.named_parameters(): if not param.requires_grad: print(name, "not requires_grad") @@ -193,7 +205,7 @@ def run(rank, n_gpus, hps): try: # 如果能加载自动resume _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_*.pth"), + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "D_*.pth"), net_d, optim_d, ) # D多半加载没事 @@ -201,11 +213,11 @@ def run(rank, n_gpus, hps): logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_*.pth"), + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), net_g, optim_g, ) - epoch_str+=1 + epoch_str += 1 global_step = (epoch_str - 1) * len(train_loader) # epoch_str = 1 # global_step = 0 @@ -213,37 +225,55 @@ def run(rank, n_gpus, hps): # traceback.print_exc() epoch_str = 1 global_step = 0 - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - print("loaded pretrained %s" % hps.train.pretrained_s2G, + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, net_g.module.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) if torch.cuda.is_available() else net_g.load_state_dict( + ) + if torch.cuda.is_available() + else net_g.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) + ), ) ##测试不加载优化器 - if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D): + if ( + hps.train.pretrained_s2D != "" + and hps.train.pretrained_s2D != None + and os.path.exists(hps.train.pretrained_s2D) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2D) - print("loaded pretrained %s" % hps.train.pretrained_s2D, + print( + "loaded pretrained %s" % hps.train.pretrained_s2D, net_d.module.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] - ) if torch.cuda.is_available() else net_d.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"], ) + if torch.cuda.is_available() + else net_d.load_state_dict( + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"], + ), ) # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 
scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 + optim_g, + gamma=hps.train.lr_decay, + last_epoch=-1, ) scheduler_d = torch.optim.lr_scheduler.ExponentialLR( - optim_d, gamma=hps.train.lr_decay, last_epoch=-1 + optim_d, + gamma=hps.train.lr_decay, + last_epoch=-1, ) for _ in range(epoch_str): scheduler_g.step() @@ -285,9 +315,7 @@ def run(rank, n_gpus, hps): print("training done") -def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers -): +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): net_g, net_d = nets optim_g, optim_d = optims # scheduler_g, scheduler_d = schedulers @@ -311,17 +339,38 @@ def train_and_evaluate( text_lengths, ) in enumerate(tqdm(train_loader)): if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), ) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( - rank, non_blocking=True + y, y_lengths = ( + y.cuda( + rank, + non_blocking=True, + ), + y_lengths.cuda( + rank, + non_blocking=True, + ), ) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), ) else: spec, spec_lengths = spec.to(device), spec_lengths.to(device) @@ -350,9 +399,7 @@ def train_and_evaluate( hps.data.mel_fmin, hps.data.mel_fmax, ) - y_mel = commons.slice_segments( - mel, ids_slice, hps.train.segment_size // hps.data.hop_length - ) + y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1), hps.data.filter_length, @@ -364,15 +411,14 @@ def train_and_evaluate( hps.data.mel_fmax, ) - y = commons.slice_segments( - y, ids_slice * hps.data.hop_length, hps.train.segment_size - ) # slice + y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) with autocast(enabled=False): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g + y_d_hat_r, + y_d_hat_g, ) loss_disc_all = loss_disc optim_d.zero_grad() @@ -405,7 +451,8 @@ def train_and_evaluate( losses = [loss_disc, loss_gen, loss_fm, loss_mel, kl_ssl, loss_kl] logger.info( "Train Epoch: {} [{:.0f}%]".format( - epoch, 100.0 * batch_idx / len(train_loader) + epoch, + 100.0 * batch_idx / len(train_loader), ) ) logger.info([x.item() for x in losses] + [global_step, lr]) @@ -429,25 +476,37 @@ def train_and_evaluate( # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - image_dict=None - try:###Some people installed the wrong version of matplotlib. + image_dict = None + try: ###Some people installed the wrong version of matplotlib. 
image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy( - y_mel[0].data.cpu().numpy() + y_mel[0].data.cpu().numpy(), ), "slice/mel_gen": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].data.cpu().numpy() + y_hat_mel[0].data.cpu().numpy(), ), "all/mel": utils.plot_spectrogram_to_numpy( - mel[0].data.cpu().numpy() + mel[0].data.cpu().numpy(), ), "all/stats_ssl": utils.plot_spectrogram_to_numpy( - stats_ssl[0].data.cpu().numpy() + stats_ssl[0].data.cpu().numpy(), ), } - except:pass - if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,) - else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,) + except: + pass + if image_dict: + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + else: + utils.summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: @@ -457,7 +516,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(global_step) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(global_step), ), ) utils.save_checkpoint( @@ -466,7 +526,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(global_step) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "D_{}.pth".format(global_step), ), ) else: @@ -476,7 +537,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(233333333333) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(233333333333), ), ) utils.save_checkpoint( @@ -485,7 +547,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(233333333333) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "D_{}.pth".format(233333333333), ), ) if rank == 0 and hps.train.if_save_every_weights == True: @@ -540,10 +603,24 @@ def evaluate(hps, generator, eval_loader, writer_eval): ssl = ssl.to(device) text, text_lengths = text.to(device), text_lengths.to(device) for test in [0, 1]: - y_hat, mask, *_ = generator.module.infer( - ssl, spec, spec_lengths, text, text_lengths, test=test - ) if torch.cuda.is_available() else generator.infer( - ssl, spec, spec_lengths, text, text_lengths, test=test + y_hat, mask, *_ = ( + generator.module.infer( + ssl, + spec, + spec_lengths, + text, + text_lengths, + test=test, + ) + if torch.cuda.is_available() + else generator.infer( + ssl, + spec, + spec_lengths, + text, + text_lengths, + test=test, + ) ) y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length @@ -568,19 +645,19 @@ def evaluate(hps, generator, eval_loader, writer_eval): image_dict.update( { f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].cpu().numpy() - ) + y_hat_mel[0].cpu().numpy(), + ), } ) audio_dict.update( - {f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]]} + { + f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]], + }, ) image_dict.update( { - f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy( - mel[0].cpu().numpy() - ) - } + f"gt/mel_{batch_idx}": 
utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()), + }, ) audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) diff --git a/GPT_SoVITS/s2_train_v3.py b/GPT_SoVITS/s2_train_v3.py index 9933deef..71d21967 100644 --- a/GPT_SoVITS/s2_train_v3.py +++ b/GPT_SoVITS/s2_train_v3.py @@ -1,36 +1,41 @@ import warnings + warnings.filterwarnings("ignore") -import utils, os +import os + +import utils + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + import torch -from torch.nn import functional as F +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging, traceback logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) from random import randint -from module import commons +from module import commons +from module.data_utils import ( + DistributedBucketSampler, +) +from module.data_utils import ( + TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, +) from module.data_utils import ( TextAudioSpeakerLoaderV3 as TextAudioSpeakerLoader, - TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, - DistributedBucketSampler, ) from module.models import ( SynthesizerTrnV3 as SynthesizerTrn, - MultiPeriodDiscriminator, ) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch from process_ckpt import savee torch.backends.cudnn.benchmark = False @@ -46,7 +51,6 @@ device = "cpu" # cuda以外的设备,等mps优化后加入 def main(): - if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: @@ -74,7 +78,7 @@ def run(rank, n_gpus, hps): writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank, @@ -128,17 +132,21 @@ def run(rank, n_gpus, hps): # batch_size=1, pin_memory=True, # drop_last=False, collate_fn=collate_fn) - net_g = SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).to(device) + net_g = ( + SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) + if torch.cuda.is_available() + else SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + ) # net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else 
MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) # for name, param in net_g.named_parameters(): @@ -146,7 +154,7 @@ def run(rank, n_gpus, hps): # print(name, "not requires_grad") optim_g = torch.optim.AdamW( - filter(lambda p: p.requires_grad, net_g.parameters()),###默认所有层lr一致 + filter(lambda p: p.requires_grad, net_g.parameters()), ###默认所有层lr一致 hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps, @@ -174,11 +182,11 @@ def run(rank, n_gpus, hps): # logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_*.pth"), + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), net_g, optim_g, ) - epoch_str+=1 + epoch_str += 1 global_step = (epoch_str - 1) * len(train_loader) # epoch_str = 1 # global_step = 0 @@ -186,17 +194,24 @@ def run(rank, n_gpus, hps): # traceback.print_exc() epoch_str = 1 global_step = 0 - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - print("loaded pretrained %s" % hps.train.pretrained_s2G, + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, net_g.module.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) if torch.cuda.is_available() else net_g.load_state_dict( + ) + if torch.cuda.is_available() + else net_g.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) + ), ) ##测试不加载优化器 # if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D): # if rank == 0: @@ -212,9 +227,7 @@ def run(rank, n_gpus, hps): # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 - ) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR( # optim_d, gamma=hps.train.lr_decay, last_epoch=-1 # ) @@ -224,7 +237,7 @@ def run(rank, n_gpus, hps): scaler = GradScaler(enabled=hps.train.fp16_run) - net_d=optim_d=scheduler_d=None + net_d = optim_d = scheduler_d = None print("start training from epoch %s" % epoch_str) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: @@ -260,7 +273,16 @@ def run(rank, n_gpus, hps): def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers + rank, + epoch, + hps, + nets, + optims, + schedulers, + scaler, + loaders, + logger, + writers, ): net_g, net_d = nets optim_g, optim_d = optims @@ -284,19 +306,33 @@ def train_and_evaluate( # text, # text_lengths, # ) in enumerate(tqdm(train_loader)): - for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate(tqdm(train_loader)): + for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, 
mel_lengths) in enumerate( + tqdm(train_loader) + ): if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True - ) - mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda( - rank, non_blocking=True + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), ) + mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), ) else: spec, spec_lengths = spec.to(device), spec_lengths.to(device) @@ -307,8 +343,18 @@ def train_and_evaluate( text, text_lengths = text.to(device), text_lengths.to(device) with autocast(enabled=hps.train.fp16_run): - cfm_loss = net_g(ssl, spec, mel,ssl_lengths,spec_lengths, text, text_lengths,mel_lengths, use_grad_ckpt=hps.train.grad_ckpt) - loss_gen_all=cfm_loss + cfm_loss = net_g( + ssl, + spec, + mel, + ssl_lengths, + spec_lengths, + text, + text_lengths, + mel_lengths, + use_grad_ckpt=hps.train.grad_ckpt, + ) + loss_gen_all = cfm_loss optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) @@ -318,12 +364,15 @@ def train_and_evaluate( if rank == 0: if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] + lr = optim_g.param_groups[0]["lr"] # losses = [commit_loss,cfm_loss,mel_loss,loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] losses = [cfm_loss] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. 
* batch_idx / len(train_loader))) + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, + 100.0 * batch_idx / len(train_loader), + ) + ) logger.info([x.item() for x in losses] + [global_step, lr]) scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g} @@ -337,7 +386,8 @@ def train_and_evaluate( writer=writer, global_step=global_step, # images=image_dict, - scalars=scalar_dict) + scalars=scalar_dict, + ) # if global_step % hps.train.eval_interval == 0: # # evaluate(hps, net_g, eval_loader, writer_eval) @@ -347,7 +397,6 @@ def train_and_evaluate( # # if keep_ckpts > 0: # # utils.clean_checkpoints(path_to_models=hps.s2_ckpt_dir, n_ckpts_to_keep=keep_ckpts, sort_by_time=True) - global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: @@ -357,7 +406,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(global_step) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(global_step), ), ) # utils.save_checkpoint( @@ -376,7 +426,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(233333333333) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(233333333333), ), ) # utils.save_checkpoint( diff --git a/GPT_SoVITS/s2_train_v3_lora.py b/GPT_SoVITS/s2_train_v3_lora.py index 75b3415b..4d8d23db 100644 --- a/GPT_SoVITS/s2_train_v3_lora.py +++ b/GPT_SoVITS/s2_train_v3_lora.py @@ -1,38 +1,43 @@ import warnings + warnings.filterwarnings("ignore") -import utils, os +import os + +import utils + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + import torch -from torch.nn import functional as F +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging, traceback logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) +from collections import OrderedDict as od from random import randint + from module import commons -from peft import LoraConfig, PeftModel, get_peft_model from module.data_utils import ( - TextAudioSpeakerLoaderV3 as TextAudioSpeakerLoader, - TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, DistributedBucketSampler, + TextAudioSpeakerCollateV3, + TextAudioSpeakerLoaderV3, + TextAudioSpeakerCollateV4, + TextAudioSpeakerLoaderV4, ) from module.models import ( SynthesizerTrnV3 as SynthesizerTrn, - MultiPeriodDiscriminator, ) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from peft import LoraConfig, get_peft_model from process_ckpt import savee -from collections import OrderedDict as od + torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = False ###反正A100fp32更快,那试试tf32吧 @@ -46,7 +51,6 @@ device = "cpu" # 
cuda以外的设备,等mps优化后加入 def main(): - if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: @@ -65,7 +69,7 @@ def main(): def run(rank, n_gpus, hps): - global global_step,no_grad_names,save_root,lora_rank + global global_step, no_grad_names, save_root, lora_rank if rank == 0: logger = utils.get_logger(hps.data.exp_dir) logger.info(hps) @@ -74,7 +78,7 @@ def run(rank, n_gpus, hps): writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank, @@ -83,6 +87,8 @@ def run(rank, n_gpus, hps): if torch.cuda.is_available(): torch.cuda.set_device(rank) + TextAudioSpeakerLoader = TextAudioSpeakerLoaderV3 if hps.model.version == "v3" else TextAudioSpeakerLoaderV4 + TextAudioSpeakerCollate = TextAudioSpeakerCollateV3 if hps.model.version == "v3" else TextAudioSpeakerCollateV4 train_dataset = TextAudioSpeakerLoader(hps.data) ######## train_sampler = DistributedBucketSampler( train_dataset, @@ -122,21 +128,24 @@ def run(rank, n_gpus, hps): persistent_workers=True, prefetch_factor=4, ) - save_root="%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir,hps.model.version,hps.train.lora_rank) - os.makedirs(save_root,exist_ok=True) - lora_rank=int(hps.train.lora_rank) + save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank) + os.makedirs(save_root, exist_ok=True) + lora_rank = int(hps.train.lora_rank) lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, lora_alpha=lora_rank, init_lora_weights=True, ) - def get_model(hps):return SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ) + + def get_model(hps): + return SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + def get_optim(net_g): return torch.optim.AdamW( filter(lambda p: p.requires_grad, net_g.parameters()), ###默认所有层lr一致 @@ -144,61 +153,66 @@ def run(rank, n_gpus, hps): betas=hps.train.betas, eps=hps.train.eps, ) - def model2cuda(net_g,rank): + + def model2cuda(net_g, rank): if torch.cuda.is_available(): net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True) else: net_g = net_g.to(device) return net_g - try:# 如果能加载自动resume + + try: # 如果能加载自动resume net_g = get_model(hps) net_g.cfm = get_peft_model(net_g.cfm, lora_config) - net_g=model2cuda(net_g,rank) - optim_g=get_optim(net_g) + net_g = model2cuda(net_g, rank) + optim_g = get_optim(net_g) # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( utils.latest_checkpoint_path(save_root, "G_*.pth"), net_g, optim_g, ) - epoch_str+=1 + epoch_str += 1 global_step = (epoch_str - 1) * len(train_loader) except: # 如果首次不能加载,加载pretrain # traceback.print_exc() epoch_str = 1 global_step = 0 net_g = get_model(hps) - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - 
print("loaded pretrained %s" % hps.train.pretrained_s2G, + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, net_g.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) + ), ) net_g.cfm = get_peft_model(net_g.cfm, lora_config) - net_g=model2cuda(net_g,rank) + net_g = model2cuda(net_g, rank) optim_g = get_optim(net_g) - no_grad_names=set() + no_grad_names = set() for name, param in net_g.named_parameters(): if not param.requires_grad: - no_grad_names.add(name.replace("module.","")) + no_grad_names.add(name.replace("module.", "")) # print(name, "not requires_grad") # print(no_grad_names) # os._exit(233333) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 - ) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) for _ in range(epoch_str): scheduler_g.step() scaler = GradScaler(enabled=hps.train.fp16_run) - net_d=optim_d=scheduler_d=None - print("start training from epoch %s"%epoch_str) + net_d = optim_d = scheduler_d = None + print("start training from epoch %s" % epoch_str) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: train_and_evaluate( @@ -230,9 +244,8 @@ def run(rank, n_gpus, hps): scheduler_g.step() print("training done") -def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers -): + +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): net_g, net_d = nets optim_g, optim_d = optims # scheduler_g, scheduler_d = schedulers @@ -244,18 +257,32 @@ def train_and_evaluate( global global_step net_g.train() - for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate(tqdm(train_loader)): + for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate( + tqdm(train_loader) + ): if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True - ) - mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda( - rank, non_blocking=True + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), ) + mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), ) else: spec, spec_lengths = spec.to(device), spec_lengths.to(device) @@ -265,8 +292,18 @@ def train_and_evaluate( text, text_lengths = text.to(device), text_lengths.to(device) with autocast(enabled=hps.train.fp16_run): - cfm_loss = net_g(ssl, spec, mel,ssl_lengths,spec_lengths, text, text_lengths,mel_lengths, use_grad_ckpt=hps.train.grad_ckpt) - loss_gen_all=cfm_loss + cfm_loss = net_g( + ssl, + spec, + mel, + ssl_lengths, + spec_lengths, + text, + text_lengths, + mel_lengths, + use_grad_ckpt=hps.train.grad_ckpt, + ) + loss_gen_all = cfm_loss optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) @@ -276,18 +313,17 @@ def train_and_evaluate( if rank == 0: if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] + lr = optim_g.param_groups[0]["lr"] losses 
= [cfm_loss] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. * batch_idx / len(train_loader))) + logger.info("Train Epoch: {} [{:.0f}%]".format(epoch, 100.0 * batch_idx / len(train_loader))) logger.info([x.item() for x in losses] + [global_step, lr]) scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g} utils.summarize( writer=writer, global_step=global_step, - scalars=scalar_dict) + scalars=scalar_dict, + ) global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: @@ -297,9 +333,7 @@ def train_and_evaluate( optim_g, hps.train.learning_rate, epoch, - os.path.join( - save_root, "G_{}.pth".format(global_step) - ), + os.path.join(save_root, "G_{}.pth".format(global_step)), ) else: utils.save_checkpoint( @@ -307,21 +341,19 @@ def train_and_evaluate( optim_g, hps.train.learning_rate, epoch, - os.path.join( - save_root, "G_{}.pth".format(233333333333) - ), + os.path.join(save_root, "G_{}.pth".format(233333333333)), ) if rank == 0 and hps.train.if_save_every_weights == True: if hasattr(net_g, "module"): ckpt = net_g.module.state_dict() else: ckpt = net_g.state_dict() - sim_ckpt=od() + sim_ckpt = od() for key in ckpt: # if "cfm"not in key: # print(key) if key not in no_grad_names: - sim_ckpt[key]=ckpt[key].half().cpu() + sim_ckpt[key] = ckpt[key].half().cpu() logger.info( "saving ckpt %s_e%s:%s" % ( @@ -329,10 +361,12 @@ def train_and_evaluate( epoch, savee( sim_ckpt, - hps.name + "_e%s_s%s_l%s" % (epoch, global_step,lora_rank), + hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank), epoch, global_step, - hps,lora_rank=lora_rank + hps, + cfm_version=hps.model.version, + lora_rank=lora_rank, ), ) ) diff --git a/GPT_SoVITS/text/LangSegmenter/__init__.py b/GPT_SoVITS/text/LangSegmenter/__init__.py index 6fe60957..0a764905 100644 --- a/GPT_SoVITS/text/LangSegmenter/__init__.py +++ b/GPT_SoVITS/text/LangSegmenter/__init__.py @@ -1 +1 @@ -from .langsegmenter import LangSegmenter \ No newline at end of file +from .langsegmenter import LangSegmenter diff --git a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py index 0971bea7..88b93794 100644 --- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py +++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py @@ -172,5 +172,4 @@ if __name__ == "__main__": print(LangSegmenter.getTexts(text)) text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。" - print(LangSegmenter.getTexts(text)) - + print(LangSegmenter.getTexts(text)) \ No newline at end of file diff --git a/GPT_SoVITS/text/__init__.py b/GPT_SoVITS/text/__init__.py index 2791d7ab..82df1fbb 100644 --- a/GPT_SoVITS/text/__init__.py +++ b/GPT_SoVITS/text/__init__.py @@ -10,18 +10,19 @@ from text import symbols2 as symbols_v2 _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} + def cleaned_text_to_sequence(cleaned_text, version=None): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
Args: text: string to convert to a sequence Returns: List of integers corresponding to the symbols in the text - ''' - if version is None:version=os.environ.get('version', 'v2') - if version == "v1": - phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] - else: - phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] - - return phones + """ + if version is None: + version = os.environ.get("version", "v2") + if version == "v1": + phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] + else: + phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] + return phones diff --git a/GPT_SoVITS/text/cantonese.py b/GPT_SoVITS/text/cantonese.py index 970be20f..1f07c414 100644 --- a/GPT_SoVITS/text/cantonese.py +++ b/GPT_SoVITS/text/cantonese.py @@ -1,6 +1,5 @@ # reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py -import sys import re import cn2an import ToJyutping @@ -99,9 +98,7 @@ def replace_punctuation(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text @@ -115,7 +112,9 @@ def text_normalize(text): return dest_text -punctuation_set=set(punctuation) +punctuation_set = set(punctuation) + + def jyuping_to_initials_finals_tones(jyuping_syllables): initials_finals = [] tones = [] @@ -160,12 +159,14 @@ def jyuping_to_initials_finals_tones(jyuping_syllables): assert len(initials_finals) == len(tones) ###魔改为辅音+带音调的元音 - phones=[] - for a,b in zip(initials_finals,tones): - if(b not in [-1,0]):###防止粤语和普通话重合开头加Y,如果是标点,不加。 - todo="%s%s"%(a,b) - else:todo=a - if(todo not in punctuation_set):todo="Y%s"%todo + phones = [] + for a, b in zip(initials_finals, tones): + if b not in [-1, 0]: ###防止粤语和普通话重合开头加Y,如果是标点,不加。 + todo = "%s%s" % (a, b) + else: + todo = a + if todo not in punctuation_set: + todo = "Y%s" % todo phones.append(todo) # return initials_finals, tones, word2ph @@ -218,4 +219,4 @@ if __name__ == "__main__": # phones, tones, word2ph = g2p(text) phones, word2ph = g2p(text) # print(phones, tones, word2ph) - print(phones, word2ph) \ No newline at end of file + print(phones, word2ph) diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py index 55dc9974..ce44215f 100644 --- a/GPT_SoVITS/text/chinese.py +++ b/GPT_SoVITS/text/chinese.py @@ -1,5 +1,4 @@ import os -import pdb import re import cn2an @@ -17,7 +16,9 @@ pinyin_to_symbol_map = { for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() } -import jieba_fast, logging +import jieba_fast +import logging + jieba_fast.setLogLevel(logging.CRITICAL) import jieba_fast.posseg as psg @@ -37,7 +38,7 @@ rep_map = { "/": ",", "—": "-", "~": "…", - "~":"…", + "~": "…", } tone_modifier = ToneSandhi() @@ -49,9 +50,7 @@ def replace_punctuation(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text @@ -62,17 +61,15 @@ def replace_punctuation_with_en(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + 
"".join(punctuation) + r"]+", "", replaced_text) return replaced_text def replace_consecutive_punctuation(text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result @@ -87,9 +84,7 @@ def _get_initials_finals(word): initials = [] finals = [] orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) - orig_finals = lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 - ) + orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for c, v in zip(orig_initials, orig_finals): initials.append(c) finals.append(v) diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py index 2b4599da..612aa3a5 100644 --- a/GPT_SoVITS/text/chinese2.py +++ b/GPT_SoVITS/text/chinese2.py @@ -1,10 +1,9 @@ import os -import pdb import re import cn2an from pypinyin import lazy_pinyin, Style -from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals +from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials from text.symbols import punctuation from text.tone_sandhi import ToneSandhi @@ -18,18 +17,26 @@ pinyin_to_symbol_map = { for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() } -import jieba_fast, logging +import jieba_fast +import logging + jieba_fast.setLogLevel(logging.CRITICAL) import jieba_fast.posseg as psg # is_g2pw_str = os.environ.get("is_g2pw", "True")##默认开启 # is_g2pw = False#True if is_g2pw_str.lower() == 'true' else False -is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False +is_g2pw = True # True if is_g2pw_str.lower() == 'true' else False if is_g2pw: # print("当前使用g2pw进行拼音推理") from text.g2pw import G2PWPinyin, correct_pronunciation + parent_directory = os.path.dirname(current_file_path) - g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True) + g2pw = G2PWPinyin( + model_dir="GPT_SoVITS/text/G2PWModel", + model_source=os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"), + v_to_u=False, + neutral_tone_with_five=True, + ) rep_map = { ":": ",", @@ -46,7 +53,7 @@ rep_map = { "/": ",", "—": "-", "~": "…", - "~":"…", + "~": "…", } tone_modifier = ToneSandhi() @@ -58,9 +65,7 @@ def replace_punctuation(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text @@ -77,9 +82,7 @@ def _get_initials_finals(word): finals = [] orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) - orig_finals = lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 - ) + orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for c, v in zip(orig_initials, orig_finals): initials.append(c) @@ -87,31 +90,66 @@ def _get_initials_finals(word): return initials, finals -must_erhua = { - "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" -} +must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", 
"媳妇儿"} not_erhua = { - "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", - "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", - "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", - "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿", "少儿" + "虐儿", + "为儿", + "护儿", + "瞒儿", + "救儿", + "替儿", + "有儿", + "一儿", + "我儿", + "俺儿", + "妻儿", + "拐儿", + "聋儿", + "乞儿", + "患儿", + "幼儿", + "孤儿", + "婴儿", + "婴幼儿", + "连体儿", + "脑瘫儿", + "流浪儿", + "体弱儿", + "混血儿", + "蜜雪儿", + "舫儿", + "祖儿", + "美儿", + "应采儿", + "可儿", + "侄儿", + "孙儿", + "侄孙儿", + "女儿", + "男儿", + "红孩儿", + "花儿", + "虫儿", + "马儿", + "鸟儿", + "猪儿", + "猫儿", + "狗儿", + "少儿", } -def _merge_erhua(initials: list[str], - finals: list[str], - word: str, - pos: str) -> list[list[str]]: + + +def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) -> list[list[str]]: """ Do erhub. """ # fix er1 for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': - finals[i] = 'er2' + if i == len(finals) - 1 and word[i] == "儿" and phn == "er1": + finals[i] = "er2" # 发音 - if word not in must_erhua and (word in not_erhua or - pos in {"a", "j", "nr"}): + if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}): return initials, finals # "……" 等情况直接返回 @@ -124,9 +162,13 @@ def _merge_erhua(initials: list[str], new_initials = [] new_finals = [] for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn in { - "er2", "er5" - } and word[-2:] not in not_erhua and new_finals: + if ( + i == len(finals) - 1 + and word[i] == "儿" + and phn in {"er2", "er5"} + and word[-2:] not in not_erhua + and new_finals + ): phn = "er" + new_finals[-1][-1] new_initials.append(initials[i]) @@ -160,7 +202,7 @@ def _g2p(segments): # assert len(sub_initials) == len(sub_finals) == len(word) initials = sum(initials, []) finals = sum(finals, []) - print("pypinyin结果",initials,finals) + print("pypinyin结果", initials, finals) else: # g2pw采用整句推理 pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3) @@ -171,19 +213,19 @@ def _g2p(segments): sub_finals = [] now_word_length = pre_word_length + len(word) - if pos == 'eng': + if pos == "eng": pre_word_length = now_word_length continue word_pinyins = pinyins[pre_word_length:now_word_length] # 多音字消歧 - word_pinyins = correct_pronunciation(word,word_pinyins) + word_pinyins = correct_pronunciation(word, word_pinyins) for pinyin in word_pinyins: if pinyin[0].isalpha(): sub_initials.append(to_initials(pinyin)) - sub_finals.append(to_finals_tone3(pinyin,neutral_tone_with_five=True)) + sub_finals.append(to_finals_tone3(pinyin, neutral_tone_with_five=True)) else: sub_initials.append(pinyin) sub_finals.append(pinyin) @@ -259,18 +301,18 @@ def replace_punctuation_with_en(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text + def replace_consecutive_punctuation(text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result + def text_normalize(text): # 
https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization tx = TextNormalizer() @@ -283,6 +325,7 @@ def text_normalize(text): dest_text = replace_consecutive_punctuation(dest_text) return dest_text + # 不排除英文的文本格式化 def mix_text_normalize(text): # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 98535f27..7ba8f376 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -19,55 +19,57 @@ special = [ def clean_text(text, language, version=None): - if version is None:version=os.environ.get('version', 'v2') + if version is None: + version = os.environ.get("version", "v2") if version == "v1": symbols = symbols_v1.symbols language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} else: symbols = symbols_v2.symbols - language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} + language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} - if(language not in language_module_map): - language="en" - text=" " + if language not in language_module_map: + language = "en" + text = " " for special_s, special_l, target_symbol in special: if special_s in text and language == special_l: return clean_special(text, language, special_s, target_symbol, version) - language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]]) - if hasattr(language_module,"text_normalize"): + language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) + if hasattr(language_module, "text_normalize"): norm_text = language_module.text_normalize(text) else: - norm_text=text - if language == "zh" or language=="yue":########## + norm_text = text + if language == "zh" or language == "yue": ########## phones, word2ph = language_module.g2p(norm_text) assert len(phones) == sum(word2ph) assert len(norm_text) == len(word2ph) elif language == "en": phones = language_module.g2p(norm_text) if len(phones) < 4: - phones = [','] + phones + phones = [","] + phones word2ph = None else: phones = language_module.g2p(norm_text) word2ph = None - phones = ['UNK' if ph not in symbols else ph for ph in phones] + phones = ["UNK" if ph not in symbols else ph for ph in phones] return phones, word2ph, norm_text def clean_special(text, language, special_s, target_symbol, version=None): - if version is None:version=os.environ.get('version', 'v2') + if version is None: + version = os.environ.get("version", "v2") if version == "v1": symbols = symbols_v1.symbols language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} else: symbols = symbols_v2.symbols - language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} + language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} """ 特殊静音段sp符号处理 """ text = text.replace(special_s, ",") - language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]]) + language_module = __import__("text." 
+ language_module_map[language], fromlist=[language_module_map[language]]) norm_text = language_module.text_normalize(text) phones = language_module.g2p(norm_text) new_ph = [] @@ -81,8 +83,9 @@ def clean_special(text, language, special_s, target_symbol, version=None): def text_to_sequence(text, language, version=None): - version = os.environ.get('version',version) - if version is None:version='v2' + version = os.environ.get("version", version) + if version is None: + version = "v2" phones = clean_text(text) return cleaned_text_to_sequence(phones, version) diff --git a/GPT_SoVITS/text/en_normalization/expend.py b/GPT_SoVITS/text/en_normalization/expend.py index c0bad755..bbd607cd 100644 --- a/GPT_SoVITS/text/en_normalization/expend.py +++ b/GPT_SoVITS/text/en_normalization/expend.py @@ -9,17 +9,17 @@ import unicodedata # 后缀计量单位替换表 measurement_map = { "m": ["meter", "meters"], - 'km': ["kilometer", "kilometers"], + "km": ["kilometer", "kilometers"], "km/h": ["kilometer per hour", "kilometers per hour"], "ft": ["feet", "feet"], "L": ["liter", "liters"], "tbsp": ["tablespoon", "tablespoons"], - 'tsp': ["teaspoon", "teaspoons"], + "tsp": ["teaspoon", "teaspoons"], "h": ["hour", "hours"], "min": ["minute", "minutes"], "s": ["second", "seconds"], "°C": ["degree celsius", "degrees celsius"], - "°F": ["degree fahrenheit", "degrees fahrenheit"] + "°F": ["degree fahrenheit", "degrees fahrenheit"], } @@ -27,41 +27,42 @@ measurement_map = { _inflect = inflect.engine() # 转化数字序数词 -_ordinal_number_re = re.compile(r'\b([0-9]+)\. ') +_ordinal_number_re = re.compile(r"\b([0-9]+)\. ") # 我听说好像对于数字正则识别其实用 \d 会好一点 -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") # 时间识别 -_time_re = re.compile(r'\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b') +_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b") # 后缀计量单位识别 -_measurement_re = re.compile(r'\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b') +_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b") # 前后 £ 识别 ( 写了识别两边某一边的,但是不知道为什么失败了┭┮﹏┭┮ ) -_pounds_re_start = re.compile(r'£([0-9\.\,]*[0-9]+)') -_pounds_re_end = re.compile(r'([0-9\.\,]*[0-9]+)£') +_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)") +_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£") # 前后 $ 识别 -_dollars_re_start = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_dollars_re_end = re.compile(r'([(0-9\.\,]*[0-9]+)\$') +_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)") +_dollars_re_end = re.compile(r"([(0-9\.\,]*[0-9]+)\$") # 小数的识别 -_decimal_number_re = re.compile(r'([0-9]+\.\s*[0-9]+)') +_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)") # 分数识别 (形式 "3/4" ) -_fraction_re = re.compile(r'([0-9]+/[0-9]+)') +_fraction_re = re.compile(r"([0-9]+/[0-9]+)") # 序数词识别 -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") # 数字处理 -_number_re = re.compile(r'[0-9]+') +_number_re = re.compile(r"[0-9]+") + def _convert_ordinal(m): """ - 标准化序数词, 例如: 1. 2. 3. 4. 5. 6. + 标准化序数词, 例如: 1. 2. 3. 4. 5. 6. Examples: input: "1. " output: "1st" @@ -70,24 +71,26 @@ def _convert_ordinal(m): ordinal = _inflect.ordinal(m.group(1)) return ordinal + ", " + def _remove_commas(m): - return m.group(1).replace(',', '') + return m.group(1).replace(",", "") + def _expand_time(m): """ 将 24 小时制的时间转换为 12 小时制的时间表示方式。 - + Examples: input: "13:00 / 4:00 / 13:30" output: "one o'clock p.m. / four o'clock am. / one thirty p.m." """ hours, minutes = map(int, m.group(1, 2)) - period = 'a.m.' 
if hours < 12 else 'p.m.' + period = "a.m." if hours < 12 else "p.m." if hours > 12: hours -= 12 hour_word = _inflect.number_to_words(hours) - minute_word = _inflect.number_to_words(minutes) if minutes != 0 else '' + minute_word = _inflect.number_to_words(minutes) if minutes != 0 else "" if minutes == 0: return f"{hour_word} o'clock {period}" @@ -103,7 +106,7 @@ def _expand_measurement(m): sign = m.group(3) ptr = 1 # 想不到怎么方便的取数字,又懒得改正则,诶,1.2 反正也是复数读法,干脆直接去掉 "." - num = int(m.group(1).replace(sign, '').replace(".",'')) + num = int(m.group(1).replace(sign, "").replace(".", "")) decimal_part = m.group(2) # 上面判断的漏洞,比如 0.1 的情况,在这里排除了 if decimal_part == None and num == 1: @@ -116,23 +119,24 @@ def _expand_pounds(m): 没找到特别规范的说明,和美元的处理一样,其实可以把两个合并在一起 """ match = m.group(1) - parts = match.split('.') + parts = match.split(".") if len(parts) > 2: - return match + ' pounds' # Unexpected format + return match + " pounds" # Unexpected format pounds = int(parts[0]) if parts[0] else 0 - pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0 if pounds and pence: - pound_unit = 'pound' if pounds == 1 else 'pounds' - penny_unit = 'penny' if pence == 1 else 'pence' - return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit) + pound_unit = "pound" if pounds == 1 else "pounds" + penny_unit = "penny" if pence == 1 else "pence" + return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit) elif pounds: - pound_unit = 'pound' if pounds == 1 else 'pounds' - return '%s %s' % (pounds, pound_unit) + pound_unit = "pound" if pounds == 1 else "pounds" + return "%s %s" % (pounds, pound_unit) elif pence: - penny_unit = 'penny' if pence == 1 else 'pence' - return '%s %s' % (pence, penny_unit) + penny_unit = "penny" if pence == 1 else "pence" + return "%s %s" % (pence, penny_unit) else: - return 'zero pounds' + return "zero pounds" + def _expand_dollars(m): """ @@ -142,23 +146,24 @@ def _expand_dollars(m): output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents" """ match = m.group(1) - parts = match.split('.') + parts = match.split(".") if len(parts) > 2: - return match + ' dollars' # Unexpected format + return match + " dollars" # Unexpected format dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0 if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit) + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit) elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) + dollar_unit = "dollar" if dollars == 1 else "dollars" + return "%s %s" % (dollars, dollar_unit) elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s" % (cents, cent_unit) else: - return 'zero dollars' + return "zero dollars" + # 小数的处理 def _expand_decimal_number(m): @@ -168,11 +173,11 @@ def _expand_decimal_number(m): output: "thirteen point two three four" """ match = m.group(1) - parts = match.split('.') + parts = match.split(".") words = [] # 遍历字符串中的每个字符 for char in 
parts[1]: - if char == '.': + if char == ".": words.append("point") else: words.append(char) @@ -186,7 +191,7 @@ def _expend_fraction(m): 规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法. 规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves. Examples: - + | Written | Said | |:---:|:---:| | 1/3 | one third | @@ -196,39 +201,41 @@ def _expend_fraction(m): | 3/2 | three halves | """ match = m.group(0) - numerator, denominator = map(int, match.split('/')) + numerator, denominator = map(int, match.split("/")) numerator_part = _inflect.number_to_words(numerator) if denominator == 2: if numerator == 1: - denominator_part = 'half' + denominator_part = "half" else: - denominator_part = 'halves' + denominator_part = "halves" elif denominator == 1: - return f'{numerator_part}' + return f"{numerator_part}" else: denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator)) if numerator > 1: - denominator_part += 's' + denominator_part += "s" + + return f"{numerator_part} {denominator_part}" - return f'{numerator_part} {denominator_part}' def _expand_ordinal(m): return _inflect.number_to_words(m.group(0)) + def _expand_number(m): num = int(m.group(0)) if num > 1000 and num < 3000: if num == 2000: - return 'two thousand' + return "two thousand" elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) + return "two thousand " + _inflect.number_to_words(num % 100) elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' + return _inflect.number_to_words(num // 100) + " hundred" else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") else: - return _inflect.number_to_words(num, andword='') + return _inflect.number_to_words(num, andword="") def normalize(text): @@ -238,7 +245,7 @@ def normalize(text): """ text = re.sub(_ordinal_number_re, _convert_ordinal, text) - text = re.sub(r'(? 
Dict[str, np.array]: +def prepare_onnx_input( + tokenizer, + labels: List[str], + char2phonemes: Dict[str, List[int]], + chars: List[str], + texts: List[str], + query_ids: List[int], + use_mask: bool = False, + window_size: int = None, + max_len: int = 512, +) -> Dict[str, np.array]: if window_size is not None: truncated_texts, truncated_query_ids = _truncate_texts( - window_size=window_size, texts=texts, query_ids=query_ids) + window_size=window_size, texts=texts, query_ids=query_ids + ) input_ids = [] token_type_ids = [] attention_masks = [] @@ -50,33 +54,27 @@ def prepare_onnx_input(tokenizer, query_id = (truncated_query_ids if window_size else query_ids)[idx] try: - tokens, text2token, token2text = tokenize_and_map( - tokenizer=tokenizer, text=text) + tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text) except Exception: print(f'warning: text "{text}" is invalid') return {} text, query_id, tokens, text2token, token2text = _truncate( - max_len=max_len, - text=text, - query_id=query_id, - tokens=tokens, - text2token=text2token, - token2text=token2text) + max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text + ) - processed_tokens = ['[CLS]'] + tokens + ['[SEP]'] + processed_tokens = ["[CLS]"] + tokens + ["[SEP]"] - input_id = list( - np.array(tokenizer.convert_tokens_to_ids(processed_tokens))) - token_type_id = list(np.zeros((len(processed_tokens), ), dtype=int)) - attention_mask = list(np.ones((len(processed_tokens), ), dtype=int)) + input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens))) + token_type_id = list(np.zeros((len(processed_tokens),), dtype=int)) + attention_mask = list(np.ones((len(processed_tokens),), dtype=int)) query_char = text[query_id] - phoneme_mask = [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] \ - if use_mask else [1] * len(labels) + phoneme_mask = ( + [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels) + ) char_id = chars.index(query_char) - position_id = text2token[ - query_id] + 1 # [CLS] token locate at first place + position_id = text2token[query_id] + 1 # [CLS] token locate at first place input_ids.append(input_id) token_type_ids.append(token_type_id) @@ -86,18 +84,17 @@ def prepare_onnx_input(tokenizer, position_ids.append(position_id) outputs = { - 'input_ids': np.array(input_ids).astype(np.int64), - 'token_type_ids': np.array(token_type_ids).astype(np.int64), - 'attention_masks': np.array(attention_masks).astype(np.int64), - 'phoneme_masks': np.array(phoneme_masks).astype(np.float32), - 'char_ids': np.array(char_ids).astype(np.int64), - 'position_ids': np.array(position_ids).astype(np.int64), + "input_ids": np.array(input_ids).astype(np.int64), + "token_type_ids": np.array(token_type_ids).astype(np.int64), + "attention_masks": np.array(attention_masks).astype(np.int64), + "phoneme_masks": np.array(phoneme_masks).astype(np.float32), + "char_ids": np.array(char_ids).astype(np.int64), + "position_ids": np.array(position_ids).astype(np.int64), } return outputs -def _truncate_texts(window_size: int, texts: List[str], - query_ids: List[int]) -> Tuple[List[str], List[int]]: +def _truncate_texts(window_size: int, texts: List[str], query_ids: List[int]) -> Tuple[List[str], List[int]]: truncated_texts = [] truncated_query_ids = [] for text, query_id in zip(texts, query_ids): @@ -111,12 +108,9 @@ def _truncate_texts(window_size: int, texts: List[str], return truncated_texts, 
truncated_query_ids -def _truncate(max_len: int, - text: str, - query_id: int, - tokens: List[str], - text2token: List[int], - token2text: List[Tuple[int]]): +def _truncate( + max_len: int, text: str, query_id: int, tokens: List[str], text2token: List[int], token2text: List[Tuple[int]] +): truncate_len = max_len - 2 if len(tokens) <= truncate_len: return (text, query_id, tokens, text2token, token2text) @@ -137,14 +131,16 @@ def _truncate(max_len: int, start = token2text[token_start][0] end = token2text[token_end - 1][1] - return (text[start:end], query_id - start, tokens[token_start:token_end], [ - i - token_start if i is not None else None - for i in text2token[start:end] - ], [(s - start, e - start) for s, e in token2text[token_start:token_end]]) + return ( + text[start:end], + query_id - start, + tokens[token_start:token_end], + [i - token_start if i is not None else None for i in text2token[start:end]], + [(s - start, e - start) for s, e in token2text[token_start:token_end]], + ) -def get_phoneme_labels(polyphonic_chars: List[List[str]] - ) -> Tuple[List[str], Dict[str, List[int]]]: +def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]: labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) char2phonemes = {} for char, phoneme in polyphonic_chars: @@ -154,13 +150,11 @@ def get_phoneme_labels(polyphonic_chars: List[List[str]] return labels, char2phonemes -def get_char_phoneme_labels(polyphonic_chars: List[List[str]] - ) -> Tuple[List[str], Dict[str, List[int]]]: - labels = sorted( - list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars]))) +def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]: + labels = sorted(list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars]))) char2phonemes = {} for char, phoneme in polyphonic_chars: if char not in char2phonemes: char2phonemes[char] = [] - char2phonemes[char].append(labels.index(f'{char} {phoneme}')) + char2phonemes[char].append(labels.index(f"{char} {phoneme}")) return labels, char2phonemes diff --git a/GPT_SoVITS/text/g2pw/g2pw.py b/GPT_SoVITS/text/g2pw/g2pw.py index e81b24de..08525e91 100644 --- a/GPT_SoVITS/text/g2pw/g2pw.py +++ b/GPT_SoVITS/text/g2pw/g2pw.py @@ -17,17 +17,25 @@ PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep") class G2PWPinyin(Pinyin): - def __init__(self, model_dir='G2PWModel/', model_source=None, - enable_non_tradional_chinese=True, - v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs): + def __init__( + self, + model_dir="G2PWModel/", + model_source=None, + enable_non_tradional_chinese=True, + v_to_u=False, + neutral_tone_with_five=False, + tone_sandhi=False, + **kwargs, + ): self._g2pw = G2PWOnnxConverter( model_dir=model_dir, - style='pinyin', + style="pinyin", model_source=model_source, enable_non_tradional_chinese=enable_non_tradional_chinese, ) self._converter = Converter( - self._g2pw, v_to_u=v_to_u, + self._g2pw, + v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, ) @@ -37,31 +45,25 @@ class G2PWPinyin(Pinyin): class Converter(UltimateConverter): - def __init__(self, g2pw_instance, v_to_u=False, - neutral_tone_with_five=False, - tone_sandhi=False, **kwargs): + def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs): super(Converter, self).__init__( - v_to_u=v_to_u, - neutral_tone_with_five=neutral_tone_with_five, - tone_sandhi=tone_sandhi, 
**kwargs) + v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, **kwargs + ) self._g2pw = g2pw_instance def convert(self, words, style, heteronym, errors, strict, **kwargs): pys = [] if RE_HANS.match(words): - pys = self._to_pinyin(words, style=style, heteronym=heteronym, - errors=errors, strict=strict) + pys = self._to_pinyin(words, style=style, heteronym=heteronym, errors=errors, strict=strict) post_data = self.post_pinyin(words, heteronym, pys) if post_data is not None: pys = post_data - pys = self.convert_styles( - pys, words, style, heteronym, errors, strict) + pys = self.convert_styles(pys, words, style, heteronym, errors, strict) else: - py = self.handle_nopinyin(words, style=style, errors=errors, - heteronym=heteronym, strict=strict) + py = self.handle_nopinyin(words, style=style, errors=errors, heteronym=heteronym, strict=strict) if py: pys.extend(py) @@ -73,13 +75,11 @@ class Converter(UltimateConverter): g2pw_pinyin = self._g2pw(han) if not g2pw_pinyin: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑 - return super(Converter, self).convert( - han, Style.TONE, heteronym, errors, strict, **kwargs) + return super(Converter, self).convert(han, Style.TONE, heteronym, errors, strict, **kwargs) for i, item in enumerate(g2pw_pinyin[0]): if item is None: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑 - py = super(Converter, self).convert( - han[i], Style.TONE, heteronym, errors, strict, **kwargs) + py = super(Converter, self).convert(han[i], Style.TONE, heteronym, errors, strict, **kwargs) pinyins.extend(py) else: pinyins.append([to_tone(item)]) @@ -104,7 +104,7 @@ def _remove_dup_and_empty(lst_list): if lst: new_lst_list.append(lst) else: - new_lst_list.append(['']) + new_lst_list.append([""]) return new_lst_list @@ -127,17 +127,17 @@ def get_dict(): def read_dict(): polyphonic_dict = {} - with open(PP_DICT_PATH,encoding="utf-8") as f: + with open(PP_DICT_PATH, encoding="utf-8") as f: line = f.readline() while line: - key, value_str = line.split(':') + key, value_str = line.split(":") value = eval(value_str.strip()) polyphonic_dict[key.strip()] = value line = f.readline() - with open(PP_FIX_DICT_PATH,encoding="utf-8") as f: + with open(PP_FIX_DICT_PATH, encoding="utf-8") as f: line = f.readline() while line: - key, value_str = line.split(':') + key, value_str = line.split(":") value = eval(value_str.strip()) polyphonic_dict[key.strip()] = value line = f.readline() diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index dcb4604d..92827395 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -1,45 +1,45 @@ # This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw # This code is modified from https://github.com/GitYCC/g2pW -import warnings -warnings.filterwarnings("ignore") import json import os -import zipfile,requests -from typing import Any -from typing import Dict -from typing import List -from typing import Tuple +import warnings +import zipfile +from typing import Any, Dict, List, Tuple import numpy as np import onnxruntime -onnxruntime.set_default_logger_severity(3) +import requests +import torch from opencc import OpenCC -from transformers import AutoTokenizer -from pypinyin import pinyin -from pypinyin import Style +from pypinyin import Style, pinyin +from transformers.models.auto.tokenization_auto import AutoTokenizer -from .dataset import get_char_phoneme_labels -from .dataset import get_phoneme_labels -from .dataset import prepare_onnx_input -from 
.utils import load_config from ..zh_normalization.char_convert import tranditional_to_simplified +from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input +from .utils import load_config -model_version = '1.1' +onnxruntime.set_default_logger_severity(3) +onnxruntime.preload_dlls() +warnings.filterwarnings("ignore") + +model_version = "1.1" -def predict(session, onnx_input: Dict[str, Any], - labels: List[str]) -> Tuple[List[str], List[float]]: +def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[List[str], List[float]]: all_preds = [] all_confidences = [] - probs = session.run([], { - "input_ids": onnx_input['input_ids'], - "token_type_ids": onnx_input['token_type_ids'], - "attention_mask": onnx_input['attention_masks'], - "phoneme_mask": onnx_input['phoneme_masks'], - "char_ids": onnx_input['char_ids'], - "position_ids": onnx_input['position_ids'] - })[0] + probs = session.run( + [], + { + "input_ids": onnx_input["input_ids"], + "token_type_ids": onnx_input["token_type_ids"], + "attention_mask": onnx_input["attention_masks"], + "phoneme_mask": onnx_input["phoneme_masks"], + "char_ids": onnx_input["char_ids"], + "position_ids": onnx_input["position_ids"], + }, + )[0] preds = np.argmax(probs, axis=1).tolist() max_probs = [] @@ -51,17 +51,17 @@ def predict(session, onnx_input: Dict[str, Any], return all_preds, all_confidences -def download_and_decompress(model_dir: str='G2PWModel/'): +def download_and_decompress(model_dir: str = "G2PWModel/"): if not os.path.exists(model_dir): parent_directory = os.path.dirname(model_dir) - zip_dir = os.path.join(parent_directory,"G2PWModel_1.1.zip") - extract_dir = os.path.join(parent_directory,"G2PWModel_1.1") - extract_dir_new = os.path.join(parent_directory,"G2PWModel") + zip_dir = os.path.join(parent_directory, "G2PWModel_1.1.zip") + extract_dir = os.path.join(parent_directory, "G2PWModel_1.1") + extract_dir_new = os.path.join(parent_directory, "G2PWModel") print("Downloading g2pw model...") - modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" + modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" with requests.get(modelscope_url, stream=True) as r: r.raise_for_status() - with open(zip_dir, 'wb') as f: + with open(zip_dir, "wb") as f: for chunk in r.iter_content(chunk_size=8192): if chunk: f.write(chunk) @@ -69,59 +69,80 @@ def download_and_decompress(model_dir: str='G2PWModel/'): print("Extracting g2pw model...") with zipfile.ZipFile(zip_dir, "r") as zip_ref: zip_ref.extractall(parent_directory) - + os.rename(extract_dir, extract_dir_new) return model_dir + class G2PWOnnxConverter: - def __init__(self, - model_dir: str='G2PWModel/', - style: str='bopomofo', - model_source: str=None, - enable_non_tradional_chinese: bool=False): + def __init__( + self, + model_dir: str = "G2PWModel/", + style: str = "bopomofo", + model_source: str = None, + enable_non_tradional_chinese: bool = False, + ): uncompress_path = download_and_decompress(model_dir) sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL - sess_options.intra_op_num_threads = 2 + sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0 try: - self.session_g2pW = 
onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + self.session_g2pW = onnxruntime.InferenceSession( + os.path.join(uncompress_path, "g2pW.onnx"), + sess_options=sess_options, + providers=["CUDAExecutionProvider", "CPUExecutionProvider"], + ) except: - self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CPUExecutionProvider']) - self.config = load_config( - config_path=os.path.join(uncompress_path, 'config.py'), - use_default=True) + self.session_g2pW = onnxruntime.InferenceSession( + os.path.join(uncompress_path, "g2pW.onnx"), + sess_options=sess_options, + providers=["CPUExecutionProvider"], + ) + self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True) self.model_source = model_source if model_source else self.config.model_source self.enable_opencc = enable_non_tradional_chinese self.tokenizer = AutoTokenizer.from_pretrained(self.model_source) - polyphonic_chars_path = os.path.join(uncompress_path, - 'POLYPHONIC_CHARS.txt') - monophonic_chars_path = os.path.join(uncompress_path, - 'MONOPHONIC_CHARS.txt') + polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt") + monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt") self.polyphonic_chars = [ - line.split('\t') - for line in open(polyphonic_chars_path, encoding='utf-8').read() - .strip().split('\n') + line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n") ] self.non_polyphonic = { - '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听', '噢' + "一", + "不", + "和", + "咋", + "嗲", + "剖", + "差", + "攢", + "倒", + "難", + "奔", + "勁", + "拗", + "肖", + "瘙", + "誒", + "泊", + "听", + "噢", } - self.non_monophonic = {'似', '攢'} + self.non_monophonic = {"似", "攢"} self.monophonic_chars = [ - line.split('\t') - for line in open(monophonic_chars_path, encoding='utf-8').read() - .strip().split('\n') + line.split("\t") for line in open(monophonic_chars_path, encoding="utf-8").read().strip().split("\n") ] - self.labels, self.char2phonemes = get_char_phoneme_labels( - polyphonic_chars=self.polyphonic_chars - ) if self.config.use_char_phoneme else get_phoneme_labels( - polyphonic_chars=self.polyphonic_chars) + self.labels, self.char2phonemes = ( + get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars) + if self.config.use_char_phoneme + else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars) + ) self.chars = sorted(list(self.char2phonemes.keys())) @@ -130,41 +151,29 @@ class G2PWOnnxConverter: if char in self.polyphonic_chars_new: self.polyphonic_chars_new.remove(char) - self.monophonic_chars_dict = { - char: phoneme - for char, phoneme in self.monophonic_chars - } + self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars} for char in self.non_monophonic: if char in self.monophonic_chars_dict: self.monophonic_chars_dict.pop(char) - self.pos_tags = [ - 'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI' - ] + self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"] - with open( - os.path.join(uncompress_path, - 'bopomofo_to_pinyin_wo_tune_dict.json'), - 'r', - encoding='utf-8') as fr: + with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr: self.bopomofo_convert_dict = json.load(fr) 
self.style_convert_func = { - 'bopomofo': lambda x: x, - 'pinyin': self._convert_bopomofo_to_pinyin, + "bopomofo": lambda x: x, + "pinyin": self._convert_bopomofo_to_pinyin, }[style] - with open( - os.path.join(uncompress_path, 'char_bopomofo_dict.json'), - 'r', - encoding='utf-8') as fr: + with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr: self.char_bopomofo_dict = json.load(fr) if self.enable_opencc: - self.cc = OpenCC('s2tw') + self.cc = OpenCC("s2tw") def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str: tone = bopomofo[-1] - assert tone in '12345' + assert tone in "12345" component = self.bopomofo_convert_dict.get(bopomofo[:-1]) if component: return component + tone @@ -184,8 +193,7 @@ class G2PWOnnxConverter: translated_sentences.append(translated_sent) sentences = translated_sentences - texts, query_ids, sent_ids, partial_results = self._prepare_data( - sentences=sentences) + texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences) if len(texts) == 0: # sentences no polyphonic words return partial_results @@ -198,14 +206,12 @@ class G2PWOnnxConverter: texts=texts, query_ids=query_ids, use_mask=self.config.use_mask, - window_size=None) + window_size=None, + ) - preds, confidences = predict( - session=self.session_g2pW, - onnx_input=onnx_input, - labels=self.labels) + preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels) if self.config.use_char_phoneme: - preds = [pred.split(' ')[1] for pred in preds] + preds = [pred.split(" ")[1] for pred in preds] results = partial_results for sent_id, query_id, pred in zip(sent_ids, query_ids, preds): @@ -213,15 +219,12 @@ class G2PWOnnxConverter: return results - def _prepare_data( - self, sentences: List[str] - ) -> Tuple[List[str], List[int], List[int], List[List[str]]]: + def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]: texts, query_ids, sent_ids, partial_results = [], [], [], [] for sent_id, sent in enumerate(sentences): # pypinyin works well for Simplified Chinese than Traditional Chinese sent_s = tranditional_to_simplified(sent) - pypinyin_result = pinyin( - sent_s, neutral_tone_with_five=True, style=Style.TONE3) + pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3) partial_result = [None] * len(sent) for i, char in enumerate(sent): if char in self.polyphonic_chars_new: @@ -229,8 +232,7 @@ class G2PWOnnxConverter: query_ids.append(i) sent_ids.append(sent_id) elif char in self.monophonic_chars_dict: - partial_result[i] = self.style_convert_func( - self.monophonic_chars_dict[char]) + partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char]) elif char in self.char_bopomofo_dict: partial_result[i] = pypinyin_result[i][0] # partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0]) diff --git a/GPT_SoVITS/text/g2pw/utils.py b/GPT_SoVITS/text/g2pw/utils.py index ba9ce51b..a86b2bc0 100644 --- a/GPT_SoVITS/text/g2pw/utils.py +++ b/GPT_SoVITS/text/g2pw/utils.py @@ -15,6 +15,7 @@ Credits This code is modified from https://github.com/GitYCC/g2pW """ + import os import re @@ -24,14 +25,14 @@ def wordize_and_map(text: str): index_map_from_text_to_word = [] index_map_from_word_to_text = [] while len(text) > 0: - match_space = re.match(r'^ +', text) + match_space = re.match(r"^ +", text) if match_space: space_str = match_space.group(0) index_map_from_text_to_word += [None] * len(space_str) - text = 
text[len(space_str):] + text = text[len(space_str) :] continue - match_en = re.match(r'^[a-zA-Z0-9]+', text) + match_en = re.match(r"^[a-zA-Z0-9]+", text) if match_en: en_word = match_en.group(0) @@ -42,7 +43,7 @@ def wordize_and_map(text: str): index_map_from_text_to_word += [len(words)] * len(en_word) words.append(en_word) - text = text[len(en_word):] + text = text[len(en_word) :] else: word_start_pos = len(index_map_from_text_to_word) word_end_pos = word_start_pos + 1 @@ -63,15 +64,14 @@ def tokenize_and_map(tokenizer, text: str): for word, (word_start, word_end) in zip(words, word2text): word_tokens = tokenizer.tokenize(word) - if len(word_tokens) == 0 or word_tokens == ['[UNK]']: + if len(word_tokens) == 0 or word_tokens == ["[UNK]"]: index_map_from_token_to_text.append((word_start, word_end)) - tokens.append('[UNK]') + tokens.append("[UNK]") else: current_word_start = word_start for word_token in word_tokens: - word_token_len = len(re.sub(r'^##', '', word_token)) - index_map_from_token_to_text.append( - (current_word_start, current_word_start + word_token_len)) + word_token_len = len(re.sub(r"^##", "", word_token)) + index_map_from_token_to_text.append((current_word_start, current_word_start + word_token_len)) current_word_start = current_word_start + word_token_len tokens.append(word_token) @@ -85,53 +85,51 @@ def tokenize_and_map(tokenizer, text: str): def _load_config(config_path: os.PathLike): import importlib.util - spec = importlib.util.spec_from_file_location('__init__', config_path) + + spec = importlib.util.spec_from_file_location("__init__", config_path) config = importlib.util.module_from_spec(spec) spec.loader.exec_module(config) return config default_config_dict = { - 'manual_seed': 1313, - 'model_source': 'bert-base-chinese', - 'window_size': 32, - 'num_workers': 2, - 'use_mask': True, - 'use_char_phoneme': False, - 'use_conditional': True, - 'param_conditional': { - 'affect_location': 'softmax', - 'bias': True, - 'char-linear': True, - 'pos-linear': False, - 'char+pos-second': True, - 'char+pos-second_lowrank': False, - 'lowrank_size': 0, - 'char+pos-second_fm': False, - 'fm_size': 0, - 'fix_mode': None, - 'count_json': 'train.count.json' + "manual_seed": 1313, + "model_source": "bert-base-chinese", + "window_size": 32, + "num_workers": 2, + "use_mask": True, + "use_char_phoneme": False, + "use_conditional": True, + "param_conditional": { + "affect_location": "softmax", + "bias": True, + "char-linear": True, + "pos-linear": False, + "char+pos-second": True, + "char+pos-second_lowrank": False, + "lowrank_size": 0, + "char+pos-second_fm": False, + "fm_size": 0, + "fix_mode": None, + "count_json": "train.count.json", }, - 'lr': 5e-5, - 'val_interval': 200, - 'num_iter': 10000, - 'use_focal': False, - 'param_focal': { - 'alpha': 0.0, - 'gamma': 0.7 + "lr": 5e-5, + "val_interval": 200, + "num_iter": 10000, + "use_focal": False, + "param_focal": {"alpha": 0.0, "gamma": 0.7}, + "use_pos": True, + "param_pos ": { + "weight": 0.1, + "pos_joint_training": True, + "train_pos_path": "train.pos", + "valid_pos_path": "dev.pos", + "test_pos_path": "test.pos", }, - 'use_pos': True, - 'param_pos ': { - 'weight': 0.1, - 'pos_joint_training': True, - 'train_pos_path': 'train.pos', - 'valid_pos_path': 'dev.pos', - 'test_pos_path': 'test.pos' - } } -def load_config(config_path: os.PathLike, use_default: bool=False): +def load_config(config_path: os.PathLike, use_default: bool = False): config = _load_config(config_path) if use_default: for attr, val in default_config_dict.items(): diff 
--git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py index e023ce7a..a54d0cf0 100644 --- a/GPT_SoVITS/text/japanese.py +++ b/GPT_SoVITS/text/japanese.py @@ -2,43 +2,51 @@ import re import os import hashlib + try: import pyopenjtalk + current_file_path = os.path.dirname(__file__) # 防止win下无法读取模型 - if os.name == 'nt': + if os.name == "nt": python_dir = os.getcwd() OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8") - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', OPEN_JTALK_DICT_DIR)): - if (OPEN_JTALK_DICT_DIR[:len(python_dir)].upper() == python_dir.upper()): - OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR,python_dir)) + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", OPEN_JTALK_DICT_DIR)): + if OPEN_JTALK_DICT_DIR[: len(python_dir)].upper() == python_dir.upper(): + OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR, python_dir)) else: import shutil - if not os.path.exists('TEMP'): - os.mkdir('TEMP') + + if not os.path.exists("TEMP"): + os.mkdir("TEMP") if not os.path.exists(os.path.join("TEMP", "ja")): os.mkdir(os.path.join("TEMP", "ja")) if os.path.exists(os.path.join("TEMP", "ja", "open_jtalk_dic")): shutil.rmtree(os.path.join("TEMP", "ja", "open_jtalk_dic")) - shutil.copytree(pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"), os.path.join("TEMP", "ja", "open_jtalk_dic"), ) + shutil.copytree( + pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"), + os.path.join("TEMP", "ja", "open_jtalk_dic"), + ) OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic") pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8") - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', current_file_path)): - if (current_file_path[:len(python_dir)].upper() == python_dir.upper()): - current_file_path = os.path.join(os.path.relpath(current_file_path,python_dir)) + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", current_file_path)): + if current_file_path[: len(python_dir)].upper() == python_dir.upper(): + current_file_path = os.path.join(os.path.relpath(current_file_path, python_dir)) else: - if not os.path.exists('TEMP'): - os.mkdir('TEMP') + if not os.path.exists("TEMP"): + os.mkdir("TEMP") if not os.path.exists(os.path.join("TEMP", "ja")): os.mkdir(os.path.join("TEMP", "ja")) if not os.path.exists(os.path.join("TEMP", "ja", "ja_userdic")): os.mkdir(os.path.join("TEMP", "ja", "ja_userdic")) - shutil.copyfile(os.path.join(current_file_path, "ja_userdic", "userdict.csv"),os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv")) + shutil.copyfile( + os.path.join(current_file_path, "ja_userdic", "userdict.csv"), + os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"), + ) current_file_path = os.path.join("TEMP", "ja") - def get_hash(fp: str) -> str: hash_md5 = hashlib.md5() with open(fp, "rb") as f: @@ -51,21 +59,26 @@ try: USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5") # 如果没有用户词典,就生成一个;如果有,就检查md5,如果不一样,就重新生成 if os.path.exists(USERDIC_CSV_PATH): - if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r",encoding='utf-8').read(): + if ( + not os.path.exists(USERDIC_BIN_PATH) + or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read() + ): pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH) - with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f: + with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f: f.write(get_hash(USERDIC_CSV_PATH)) if os.path.exists(USERDIC_BIN_PATH): - 
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH) -except Exception as e: + pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH) +except Exception: # print(e) import pyopenjtalk + # failed to load user dictionary, ignore. pass from text.symbols import punctuation + # Regular expression matching Japanese without punctuation marks: _japanese_characters = re.compile( r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" @@ -123,9 +136,9 @@ def post_replace_ph(ph): def replace_consecutive_punctuation(text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result @@ -152,7 +165,7 @@ def preprocess_jap(text, with_prosody=False): text += p.split(" ") if i < len(marks): - if marks[i] == " ":# 防止意外的UNK + if marks[i] == " ": # 防止意外的UNK continue text += [marks[i].replace(" ", "")] return text @@ -165,6 +178,7 @@ def text_normalize(text): text = replace_consecutive_punctuation(text) return text + # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): """Extract phoneme + prosoody symbol sequence from input full-context labels. @@ -241,6 +255,7 @@ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): return phones + # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py def _numeric_feature_by_regex(regex, s): match = re.search(regex, s) @@ -248,6 +263,7 @@ def _numeric_feature_by_regex(regex, s): return -50 return int(match.group(1)) + def g2p(norm_text, with_prosody=True): phones = preprocess_jap(norm_text, with_prosody) phones = [post_replace_ph(i) for i in phones] diff --git a/GPT_SoVITS/text/korean.py b/GPT_SoVITS/text/korean.py index daae41fc..254b05cf 100644 --- a/GPT_SoVITS/text/korean.py +++ b/GPT_SoVITS/text/korean.py @@ -9,39 +9,43 @@ import importlib import os # 防止win下无法读取模型 -if os.name == 'nt': +if os.name == "nt": + class win_G2p(G2p): def check_mecab(self): super().check_mecab() spam_spec = importlib.util.find_spec("eunjeon") non_found = spam_spec is None if non_found: - print(f'you have to install eunjeon. install it...') + print("you have to install eunjeon. 
install it...") else: installpath = spam_spec.submodule_search_locations[0] - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)): - + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)): import sys from eunjeon import Mecab as _Mecab + class Mecab(_Mecab): def get_dicpath(installpath): - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)): + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)): import shutil - python_dir = os.getcwd() - if (installpath[:len(python_dir)].upper() == python_dir.upper()): - dicpath = os.path.join(os.path.relpath(installpath,python_dir),'data','mecabrc') - else: - if not os.path.exists('TEMP'): - os.mkdir('TEMP') - if not os.path.exists(os.path.join('TEMP', 'ko')): - os.mkdir(os.path.join('TEMP', 'ko')) - if os.path.exists(os.path.join('TEMP', 'ko', 'ko_dict')): - shutil.rmtree(os.path.join('TEMP', 'ko', 'ko_dict')) - shutil.copytree(os.path.join(installpath, 'data'), os.path.join('TEMP', 'ko', 'ko_dict')) - dicpath = os.path.join('TEMP', 'ko', 'ko_dict', 'mecabrc') + python_dir = os.getcwd() + if installpath[: len(python_dir)].upper() == python_dir.upper(): + dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc") + else: + if not os.path.exists("TEMP"): + os.mkdir("TEMP") + if not os.path.exists(os.path.join("TEMP", "ko")): + os.mkdir(os.path.join("TEMP", "ko")) + if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")): + shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict")) + + shutil.copytree( + os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict") + ) + dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc") else: - dicpath=os.path.abspath(os.path.join(installpath, 'data/mecabrc')) + dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc")) return dicpath def __init__(self, dicpath=get_dicpath(installpath)): @@ -52,97 +56,108 @@ if os.name == 'nt': G2p = win_G2p -from text.symbols2 import symbols +from text.symbols2 import symbols # This is a list of Korean classifiers preceded by pure Korean numerals. 
-_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' +_korean_classifiers = ( + "군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통" +) # List of (hangul, hangul divided) pairs: -_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ - # ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule - # ('ㄵ', 'ㄴㅈ'), - # ('ㄶ', 'ㄴㅎ'), - # ('ㄺ', 'ㄹㄱ'), - # ('ㄻ', 'ㄹㅁ'), - # ('ㄼ', 'ㄹㅂ'), - # ('ㄽ', 'ㄹㅅ'), - # ('ㄾ', 'ㄹㅌ'), - # ('ㄿ', 'ㄹㅍ'), - # ('ㅀ', 'ㄹㅎ'), - # ('ㅄ', 'ㅂㅅ'), - ('ㅘ', 'ㅗㅏ'), - ('ㅙ', 'ㅗㅐ'), - ('ㅚ', 'ㅗㅣ'), - ('ㅝ', 'ㅜㅓ'), - ('ㅞ', 'ㅜㅔ'), - ('ㅟ', 'ㅜㅣ'), - ('ㅢ', 'ㅡㅣ'), - ('ㅑ', 'ㅣㅏ'), - ('ㅒ', 'ㅣㅐ'), - ('ㅕ', 'ㅣㅓ'), - ('ㅖ', 'ㅣㅔ'), - ('ㅛ', 'ㅣㅗ'), - ('ㅠ', 'ㅣㅜ') -]] +_hangul_divided = [ + (re.compile("%s" % x[0]), x[1]) + for x in [ + # ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule + # ('ㄵ', 'ㄴㅈ'), + # ('ㄶ', 'ㄴㅎ'), + # ('ㄺ', 'ㄹㄱ'), + # ('ㄻ', 'ㄹㅁ'), + # ('ㄼ', 'ㄹㅂ'), + # ('ㄽ', 'ㄹㅅ'), + # ('ㄾ', 'ㄹㅌ'), + # ('ㄿ', 'ㄹㅍ'), + # ('ㅀ', 'ㄹㅎ'), + # ('ㅄ', 'ㅂㅅ'), + ("ㅘ", "ㅗㅏ"), + ("ㅙ", "ㅗㅐ"), + ("ㅚ", "ㅗㅣ"), + ("ㅝ", "ㅜㅓ"), + ("ㅞ", "ㅜㅔ"), + ("ㅟ", "ㅜㅣ"), + ("ㅢ", "ㅡㅣ"), + ("ㅑ", "ㅣㅏ"), + ("ㅒ", "ㅣㅐ"), + ("ㅕ", "ㅣㅓ"), + ("ㅖ", "ㅣㅔ"), + ("ㅛ", "ㅣㅗ"), + ("ㅠ", "ㅣㅜ"), + ] +] # List of (Latin alphabet, hangul) pairs: -_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ - ('a', '에이'), - ('b', '비'), - ('c', '시'), - ('d', '디'), - ('e', '이'), - ('f', '에프'), - ('g', '지'), - ('h', '에이치'), - ('i', '아이'), - ('j', '제이'), - ('k', '케이'), - ('l', '엘'), - ('m', '엠'), - ('n', '엔'), - ('o', '오'), - ('p', '피'), - ('q', '큐'), - ('r', '아르'), - ('s', '에스'), - ('t', '티'), - ('u', '유'), - ('v', '브이'), - ('w', '더블유'), - ('x', '엑스'), - ('y', '와이'), - ('z', '제트') -]] +_latin_to_hangul = [ + (re.compile("%s" % x[0], re.IGNORECASE), x[1]) + for x in [ + ("a", "에이"), + ("b", "비"), + ("c", "시"), + ("d", "디"), + ("e", "이"), + ("f", "에프"), + ("g", "지"), + ("h", "에이치"), + ("i", "아이"), + ("j", "제이"), + ("k", "케이"), + ("l", "엘"), + ("m", "엠"), + ("n", "엔"), + ("o", "오"), + ("p", "피"), + ("q", "큐"), + ("r", "아르"), + ("s", "에스"), + ("t", "티"), + ("u", "유"), + ("v", "브이"), + ("w", "더블유"), + ("x", "엑스"), + ("y", "와이"), + ("z", "제트"), + ] +] # List of (ipa, lazy ipa) pairs: -_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ - ('t͡ɕ','ʧ'), - ('d͡ʑ','ʥ'), - ('ɲ','n^'), - ('ɕ','ʃ'), - ('ʷ','w'), - ('ɭ','l`'), - ('ʎ','ɾ'), - ('ɣ','ŋ'), - ('ɰ','ɯ'), - ('ʝ','j'), - ('ʌ','ə'), - ('ɡ','g'), - ('\u031a','#'), - ('\u0348','='), - ('\u031e',''), - ('\u0320',''), - ('\u0339','') -]] +_ipa_to_lazy_ipa = [ + (re.compile("%s" % x[0], re.IGNORECASE), x[1]) + for x in [ + ("t͡ɕ", "ʧ"), + ("d͡ʑ", "ʥ"), + ("ɲ", "n^"), + ("ɕ", "ʃ"), + ("ʷ", "w"), + ("ɭ", "l`"), + ("ʎ", "ɾ"), + ("ɣ", "ŋ"), + ("ɰ", "ɯ"), + ("ʝ", "j"), + ("ʌ", "ə"), + ("ɡ", "g"), + ("\u031a", "#"), + ("\u0348", "="), + ("\u031e", ""), + ("\u0320", ""), + ("\u0339", ""), + ] +] def fix_g2pk2_error(text): new_text = "" i = 0 while i < len(text) - 4: - if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == 'ㄹ': - new_text += text[i:i+3] + ' ' + 'ㄴ' + if (text[i : i + 3] == "ㅇㅡㄹ" or text[i : i + 3] == "ㄹㅡㄹ") and text[i + 3] == " " and text[i + 4] == "ㄹ": + new_text += text[i : i + 3] + " " + "ㄴ" i += 5 else: new_text += text[i] @@ -166,20 +181,20 @@ def divide_hangul(text): def hangul_number(num, sino=True): - '''Reference https://github.com/Kyubyong/g2pK''' - num = re.sub(',', '', num) + """Reference https://github.com/Kyubyong/g2pK""" + num = re.sub(",", "", num) - if num == '0': - 
return '영' - if not sino and num == '20': - return '스무' + if num == "0": + return "영" + if not sino and num == "20": + return "스무" - digits = '123456789' - names = '일이삼사오육칠팔구' + digits = "123456789" + names = "일이삼사오육칠팔구" digit2name = {d: n for d, n in zip(digits, names)} - modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' - decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' + modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉" + decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔" digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} @@ -188,75 +203,75 @@ def hangul_number(num, sino=True): i = len(num) - i - 1 if sino: if i == 0: - name = digit2name.get(digit, '') + name = digit2name.get(digit, "") elif i == 1: - name = digit2name.get(digit, '') + '십' - name = name.replace('일십', '십') + name = digit2name.get(digit, "") + "십" + name = name.replace("일십", "십") else: if i == 0: - name = digit2mod.get(digit, '') + name = digit2mod.get(digit, "") elif i == 1: - name = digit2dec.get(digit, '') - if digit == '0': + name = digit2dec.get(digit, "") + if digit == "0": if i % 4 == 0: - last_three = spelledout[-min(3, len(spelledout)):] - if ''.join(last_three) == '': - spelledout.append('') + last_three = spelledout[-min(3, len(spelledout)) :] + if "".join(last_three) == "": + spelledout.append("") continue else: - spelledout.append('') + spelledout.append("") continue if i == 2: - name = digit2name.get(digit, '') + '백' - name = name.replace('일백', '백') + name = digit2name.get(digit, "") + "백" + name = name.replace("일백", "백") elif i == 3: - name = digit2name.get(digit, '') + '천' - name = name.replace('일천', '천') + name = digit2name.get(digit, "") + "천" + name = name.replace("일천", "천") elif i == 4: - name = digit2name.get(digit, '') + '만' - name = name.replace('일만', '만') + name = digit2name.get(digit, "") + "만" + name = name.replace("일만", "만") elif i == 5: - name = digit2name.get(digit, '') + '십' - name = name.replace('일십', '십') + name = digit2name.get(digit, "") + "십" + name = name.replace("일십", "십") elif i == 6: - name = digit2name.get(digit, '') + '백' - name = name.replace('일백', '백') + name = digit2name.get(digit, "") + "백" + name = name.replace("일백", "백") elif i == 7: - name = digit2name.get(digit, '') + '천' - name = name.replace('일천', '천') + name = digit2name.get(digit, "") + "천" + name = name.replace("일천", "천") elif i == 8: - name = digit2name.get(digit, '') + '억' + name = digit2name.get(digit, "") + "억" elif i == 9: - name = digit2name.get(digit, '') + '십' + name = digit2name.get(digit, "") + "십" elif i == 10: - name = digit2name.get(digit, '') + '백' + name = digit2name.get(digit, "") + "백" elif i == 11: - name = digit2name.get(digit, '') + '천' + name = digit2name.get(digit, "") + "천" elif i == 12: - name = digit2name.get(digit, '') + '조' + name = digit2name.get(digit, "") + "조" elif i == 13: - name = digit2name.get(digit, '') + '십' + name = digit2name.get(digit, "") + "십" elif i == 14: - name = digit2name.get(digit, '') + '백' + name = digit2name.get(digit, "") + "백" elif i == 15: - name = digit2name.get(digit, '') + '천' + name = digit2name.get(digit, "") + "천" spelledout.append(name) - return ''.join(elem for elem in spelledout) + return "".join(elem for elem in spelledout) def number_to_hangul(text): - '''Reference https://github.com/Kyubyong/g2pK''' - tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) + """Reference https://github.com/Kyubyong/g2pK""" + tokens = set(re.findall(r"(\d[\d,]*)([\uac00-\ud71f]+)", text)) for token in tokens: num, classifier = token 
if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: spelledout = hangul_number(num, sino=False) else: spelledout = hangul_number(num, sino=True) - text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') + text = text.replace(f"{num}{classifier}", f"{spelledout}{classifier}") # digit by digit for remaining digits - digits = '0123456789' - names = '영일이삼사오육칠팔구' + digits = "0123456789" + names = "영일이삼사오육칠팔구" for d, n in zip(digits, names): text = text.replace(d, n) return text @@ -265,19 +280,23 @@ def number_to_hangul(text): def korean_to_lazy_ipa(text): text = latin_to_hangul(text) text = number_to_hangul(text) - text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) + text = re.sub("[\uac00-\ud7af]+", lambda x: ko_pron.romanise(x.group(0), "ipa").split("] ~ [")[0], text) for regex, replacement in _ipa_to_lazy_ipa: text = re.sub(regex, replacement, text) return text -_g2p=G2p() + +_g2p = G2p() + + def korean_to_ipa(text): text = latin_to_hangul(text) text = number_to_hangul(text) text = _g2p(text) text = fix_g2pk2_error(text) text = korean_to_lazy_ipa(text) - return text.replace('ʧ','tʃ').replace('ʥ','dʑ') + return text.replace("ʧ", "tʃ").replace("ʥ", "dʑ") + def post_replace_ph(ph): rep_map = { @@ -301,12 +320,13 @@ def post_replace_ph(ph): ph = "停" return ph + def g2p(text): text = latin_to_hangul(text) text = _g2p(text) text = divide_hangul(text) text = fix_g2pk2_error(text) - text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) + text = re.sub(r"([\u3131-\u3163])$", r"\1.", text) # text = "".join([post_replace_ph(i) for i in text]) text = [post_replace_ph(i) for i in text] return text @@ -314,4 +334,4 @@ def g2p(text): if __name__ == "__main__": text = "안녕하세요" - print(g2p(text)) \ No newline at end of file + print(g2p(text)) diff --git a/GPT_SoVITS/text/symbols.py b/GPT_SoVITS/text/symbols.py index 70499492..b012882b 100644 --- a/GPT_SoVITS/text/symbols.py +++ b/GPT_SoVITS/text/symbols.py @@ -1,5 +1,3 @@ -import os - # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 punctuation.append("-") diff --git a/GPT_SoVITS/text/symbols2.py b/GPT_SoVITS/text/symbols2.py index a4423505..2f159d2b 100644 --- a/GPT_SoVITS/text/symbols2.py +++ b/GPT_SoVITS/text/symbols2.py @@ -1,5 +1,3 @@ -import os - # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 punctuation.append("-") @@ -396,24 +394,404 @@ arpa = { "SH", } -ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停' +ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停" # ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ ' -yue_symbols={'Yeot3', 'Yip1', 'Yyu3', 'Yeng4', 'Yut5', 'Yaan5', 'Ym5', 'Yaan6', 'Yang1', 'Yun4', 'Yon2', 'Yui5', 'Yun2', 'Yat3', 'Ye', 'Yeot1', 'Yoeng5', 'Yoek2', 'Yam2', 'Yeon6', 'Yu6', 'Yiu3', 'Yaang6', 'Yp5', 'Yai4', 'Yoek4', 'Yit6', 'Yam5', 'Yoeng6', 'Yg1', 'Yk3', 'Yoe4', 'Yam3', 'Yc', 'Yyu4', 'Yyut1', 'Yiu4', 'Ying3', 'Yip3', 'Yaap3', 'Yau3', 'Yan4', 'Yau1', 'Yap4', 'Yk6', 'Yok3', 'Yai1', 'Yeot6', 'Yan2', 'Yoek6', 'Yt1', 'Yoi1', 'Yit5', 'Yn4', 'Yaau3', 'Yau4', 'Yuk6', 'Ys', 'Yuk', 'Yin6', 'Yung6', 'Ya', 'You', 'Yaai5', 'Yau5', 'Yoi3', 'Yaak3', 'Yaat3', 'Ying2', 'Yok5', 'Yeng2', 'Yyut3', 'Yam1', 'Yip5', 'You1', 'Yam6', 'Yaa5', 'Yi6', 'Yek4', 'Yyu2', 'Yuk5', 'Yaam1', 'Yang2', 'Yai', 'Yiu6', 'Yin4', 'Yok4', 'Yot3', 'Yui2', 'Yeoi5', 'Yyun6', 'Yyu5', 'Yoi5', 'Yeot2', 'Yim4', 'Yeoi2', 'Yaan1', 'Yang6', 'Yong1', 'Yaang4', 'Yung5', 'Yeon1', 'Yin2', 'Ya3', 'Yaang3', 'Yg', 'Yk2', 
'Yaau5', 'Yut1', 'Yt5', 'Yip4', 'Yung4', 'Yj', 'Yong3', 'Ya1', 'Yg6', 'Yaau6', 'Yit3', 'Yun3', 'Ying1', 'Yn2', 'Yg4', 'Yl', 'Yp3', 'Yn3', 'Yak1', 'Yang5', 'Yoe6', 'You2', 'Yap2', 'Yak2', 'Yt3', 'Yot5', 'Yim2', 'Yi1', 'Yn6', 'Yaat5', 'Yaam3', 'Yoek5', 'Ye3', 'Yeon4', 'Yaa2', 'Yu3', 'Yim6', 'Ym', 'Yoe3', 'Yaai2', 'Ym2', 'Ya6', 'Yeng6', 'Yik4', 'Yot4', 'Yaai4', 'Yyun3', 'Yu1', 'Yoeng1', 'Yaap2', 'Yuk3', 'Yoek3', 'Yeng5', 'Yeoi1', 'Yiu2', 'Yok1', 'Yo1', 'Yoek1', 'Yoeng2', 'Yeon5', 'Yiu1', 'Yoeng4', 'Yuk2', 'Yat4', 'Yg5', 'Yut4', 'Yan6', 'Yin3', 'Yaa6', 'Yap1', 'Yg2', 'Yoe5', 'Yt4', 'Ya5', 'Yo4', 'Yyu1', 'Yak3', 'Yeon2', 'Yong4', 'Ym1', 'Ye2', 'Yaang5', 'Yoi2', 'Yeng3', 'Yn', 'Yyut4', 'Yau', 'Yaak2', 'Yaan4', 'Yek2', 'Yin1', 'Yi5', 'Yoe2', 'Yei5', 'Yaat6', 'Yak5', 'Yp6', 'Yok6', 'Yei2', 'Yaap1', 'Yyut5', 'Yi4', 'Yim1', 'Yk5', 'Ye4', 'Yok2', 'Yaam6', 'Yat2', 'Yon6', 'Yei3', 'Yyu6', 'Yeot5', 'Yk4', 'Yai6', 'Yd', 'Yg3', 'Yei6', 'Yau2', 'Yok', 'Yau6', 'Yung3', 'Yim5', 'Yut6', 'Yit1', 'Yon3', 'Yat1', 'Yaam2', 'Yyut2', 'Yui6', 'Yt2', 'Yek6', 'Yt', 'Ye6', 'Yang3', 'Ying6', 'Yaau1', 'Yeon3', 'Yng', 'Yh', 'Yang4', 'Ying5', 'Yaap6', 'Yoeng3', 'Yyun4', 'You3', 'Yan5', 'Yat5', 'Yot1', 'Yun1', 'Yi3', 'Yaa1', 'Yaap4', 'You6', 'Yaang2', 'Yaap5', 'Yaa3', 'Yaak6', 'Yeng1', 'Yaak1', 'Yo5', 'Yoi4', 'Yam4', 'Yik1', 'Ye1', 'Yai5', 'Yung1', 'Yp2', 'Yui4', 'Yaak4', 'Yung2', 'Yak4', 'Yaat4', 'Yeoi4', 'Yut2', 'Yin5', 'Yaau4', 'Yap6', 'Yb', 'Yaam4', 'Yw', 'Yut3', 'Yong2', 'Yt6', 'Yaai6', 'Yap5', 'Yik5', 'Yun6', 'Yaam5', 'Yun5', 'Yik3', 'Ya2', 'Yyut6', 'Yon4', 'Yk1', 'Yit4', 'Yak6', 'Yaan2', 'Yuk1', 'Yai2', 'Yik2', 'Yaat2', 'Yo3', 'Ykw', 'Yn5', 'Yaa', 'Ye5', 'Yu4', 'Yei1', 'Yai3', 'Yyun5', 'Yip2', 'Yaau2', 'Yiu5', 'Ym4', 'Yeoi6', 'Yk', 'Ym6', 'Yoe1', 'Yeoi3', 'Yon', 'Yuk4', 'Yaai3', 'Yaa4', 'Yot6', 'Yaang1', 'Yei4', 'Yek1', 'Yo', 'Yp', 'Yo6', 'Yp4', 'Yan3', 'Yoi', 'Yap3', 'Yek3', 'Yim3', 'Yz', 'Yot2', 'Yoi6', 'Yit2', 'Yu5', 'Yaan3', 'Yan1', 'Yon5', 'Yp1', 'Yong5', 'Ygw', 'Yak', 'Yat6', 'Ying4', 'Yu2', 'Yf', 'Ya4', 'Yon1', 'You4', 'Yik6', 'Yui1', 'Yaat1', 'Yeot4', 'Yi2', 'Yaai1', 'Yek5', 'Ym3', 'Yong6', 'You5', 'Yyun1', 'Yn1', 'Yo2', 'Yip6', 'Yui3', 'Yaak5', 'Yyun2'} +yue_symbols = { + "Yeot3", + "Yip1", + "Yyu3", + "Yeng4", + "Yut5", + "Yaan5", + "Ym5", + "Yaan6", + "Yang1", + "Yun4", + "Yon2", + "Yui5", + "Yun2", + "Yat3", + "Ye", + "Yeot1", + "Yoeng5", + "Yoek2", + "Yam2", + "Yeon6", + "Yu6", + "Yiu3", + "Yaang6", + "Yp5", + "Yai4", + "Yoek4", + "Yit6", + "Yam5", + "Yoeng6", + "Yg1", + "Yk3", + "Yoe4", + "Yam3", + "Yc", + "Yyu4", + "Yyut1", + "Yiu4", + "Ying3", + "Yip3", + "Yaap3", + "Yau3", + "Yan4", + "Yau1", + "Yap4", + "Yk6", + "Yok3", + "Yai1", + "Yeot6", + "Yan2", + "Yoek6", + "Yt1", + "Yoi1", + "Yit5", + "Yn4", + "Yaau3", + "Yau4", + "Yuk6", + "Ys", + "Yuk", + "Yin6", + "Yung6", + "Ya", + "You", + "Yaai5", + "Yau5", + "Yoi3", + "Yaak3", + "Yaat3", + "Ying2", + "Yok5", + "Yeng2", + "Yyut3", + "Yam1", + "Yip5", + "You1", + "Yam6", + "Yaa5", + "Yi6", + "Yek4", + "Yyu2", + "Yuk5", + "Yaam1", + "Yang2", + "Yai", + "Yiu6", + "Yin4", + "Yok4", + "Yot3", + "Yui2", + "Yeoi5", + "Yyun6", + "Yyu5", + "Yoi5", + "Yeot2", + "Yim4", + "Yeoi2", + "Yaan1", + "Yang6", + "Yong1", + "Yaang4", + "Yung5", + "Yeon1", + "Yin2", + "Ya3", + "Yaang3", + "Yg", + "Yk2", + "Yaau5", + "Yut1", + "Yt5", + "Yip4", + "Yung4", + "Yj", + "Yong3", + "Ya1", + "Yg6", + "Yaau6", + "Yit3", + "Yun3", + "Ying1", + "Yn2", + "Yg4", + "Yl", + "Yp3", + "Yn3", + "Yak1", + "Yang5", + "Yoe6", + "You2", + "Yap2", + "Yak2", + "Yt3", + "Yot5", + "Yim2", + "Yi1", + 
"Yn6", + "Yaat5", + "Yaam3", + "Yoek5", + "Ye3", + "Yeon4", + "Yaa2", + "Yu3", + "Yim6", + "Ym", + "Yoe3", + "Yaai2", + "Ym2", + "Ya6", + "Yeng6", + "Yik4", + "Yot4", + "Yaai4", + "Yyun3", + "Yu1", + "Yoeng1", + "Yaap2", + "Yuk3", + "Yoek3", + "Yeng5", + "Yeoi1", + "Yiu2", + "Yok1", + "Yo1", + "Yoek1", + "Yoeng2", + "Yeon5", + "Yiu1", + "Yoeng4", + "Yuk2", + "Yat4", + "Yg5", + "Yut4", + "Yan6", + "Yin3", + "Yaa6", + "Yap1", + "Yg2", + "Yoe5", + "Yt4", + "Ya5", + "Yo4", + "Yyu1", + "Yak3", + "Yeon2", + "Yong4", + "Ym1", + "Ye2", + "Yaang5", + "Yoi2", + "Yeng3", + "Yn", + "Yyut4", + "Yau", + "Yaak2", + "Yaan4", + "Yek2", + "Yin1", + "Yi5", + "Yoe2", + "Yei5", + "Yaat6", + "Yak5", + "Yp6", + "Yok6", + "Yei2", + "Yaap1", + "Yyut5", + "Yi4", + "Yim1", + "Yk5", + "Ye4", + "Yok2", + "Yaam6", + "Yat2", + "Yon6", + "Yei3", + "Yyu6", + "Yeot5", + "Yk4", + "Yai6", + "Yd", + "Yg3", + "Yei6", + "Yau2", + "Yok", + "Yau6", + "Yung3", + "Yim5", + "Yut6", + "Yit1", + "Yon3", + "Yat1", + "Yaam2", + "Yyut2", + "Yui6", + "Yt2", + "Yek6", + "Yt", + "Ye6", + "Yang3", + "Ying6", + "Yaau1", + "Yeon3", + "Yng", + "Yh", + "Yang4", + "Ying5", + "Yaap6", + "Yoeng3", + "Yyun4", + "You3", + "Yan5", + "Yat5", + "Yot1", + "Yun1", + "Yi3", + "Yaa1", + "Yaap4", + "You6", + "Yaang2", + "Yaap5", + "Yaa3", + "Yaak6", + "Yeng1", + "Yaak1", + "Yo5", + "Yoi4", + "Yam4", + "Yik1", + "Ye1", + "Yai5", + "Yung1", + "Yp2", + "Yui4", + "Yaak4", + "Yung2", + "Yak4", + "Yaat4", + "Yeoi4", + "Yut2", + "Yin5", + "Yaau4", + "Yap6", + "Yb", + "Yaam4", + "Yw", + "Yut3", + "Yong2", + "Yt6", + "Yaai6", + "Yap5", + "Yik5", + "Yun6", + "Yaam5", + "Yun5", + "Yik3", + "Ya2", + "Yyut6", + "Yon4", + "Yk1", + "Yit4", + "Yak6", + "Yaan2", + "Yuk1", + "Yai2", + "Yik2", + "Yaat2", + "Yo3", + "Ykw", + "Yn5", + "Yaa", + "Ye5", + "Yu4", + "Yei1", + "Yai3", + "Yyun5", + "Yip2", + "Yaau2", + "Yiu5", + "Ym4", + "Yeoi6", + "Yk", + "Ym6", + "Yoe1", + "Yeoi3", + "Yon", + "Yuk4", + "Yaai3", + "Yaa4", + "Yot6", + "Yaang1", + "Yei4", + "Yek1", + "Yo", + "Yp", + "Yo6", + "Yp4", + "Yan3", + "Yoi", + "Yap3", + "Yek3", + "Yim3", + "Yz", + "Yot2", + "Yoi6", + "Yit2", + "Yu5", + "Yaan3", + "Yan1", + "Yon5", + "Yp1", + "Yong5", + "Ygw", + "Yak", + "Yat6", + "Ying4", + "Yu2", + "Yf", + "Ya4", + "Yon1", + "You4", + "Yik6", + "Yui1", + "Yaat1", + "Yeot4", + "Yi2", + "Yaai1", + "Yek5", + "Ym3", + "Yong6", + "You5", + "Yyun1", + "Yn1", + "Yo2", + "Yip6", + "Yui3", + "Yaak5", + "Yyun2", +} # symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)+list(ko_symbols)#+list(yue_symbols)###直接这么加yue顺序乱了 symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) symbols = sorted(set(symbols)) # print(len(symbols)) -symbols+=["[","]"]##日文新增上升下降调型 -symbols+=sorted(list(ko_symbols)) -symbols+=sorted(list(yue_symbols))##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复 +symbols += ["[", "]"] ##日文新增上升下降调型 +symbols += sorted(list(ko_symbols)) +symbols += sorted(list(yue_symbols)) ##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复 # print(len(symbols)) if __name__ == "__main__": print(len(symbols)) -''' +""" 粤语: 732-353=379 韩文+粤语: 732-322=410 -''' +""" diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index f6b0a94a..964ea385 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -510,12 +510,7 @@ class ToneSandhi: # e.g. 
走了, 看着, 去过 elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: finals[-1] = finals[-1][:-1] + "5" - elif ( - len(word) > 1 - and word[-1] in "们子" - and pos in {"r", "n"} - and word not in self.must_not_neural_tone_words - ): + elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words: finals[-1] = finals[-1][:-1] + "5" # e.g. 桌上, 地下, 家里 elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: @@ -525,25 +520,18 @@ class ToneSandhi: finals[-1] = finals[-1][:-1] + "5" # 个做量词 elif ( - ge_idx >= 1 - and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") ) or word == "个": finals[ge_idx] = finals[ge_idx][:-1] + "5" else: - if ( - word in self.must_neural_tone_words - or word[-2:] in self.must_neural_tone_words - ): + if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: finals[-1] = finals[-1][:-1] + "5" word_list = self._split_word(word) finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] for i, word in enumerate(word_list): # conventional neural in Chinese - if ( - word in self.must_neural_tone_words - or word[-2:] in self.must_neural_tone_words - ): + if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: finals_list[i][-1] = finals_list[i][-1][:-1] + "5" finals = sum(finals_list, []) return finals @@ -561,9 +549,7 @@ class ToneSandhi: def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: # "一" in number sequences, e.g. 一零零, 二一零 - if word.find("一") != -1 and all( - [item.isnumeric() for item in word if item != "一"] - ): + if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]): return finals # "一" between reduplication words shold be yi5, e.g. 
看一看 elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: @@ -697,13 +683,10 @@ class ToneSandhi: return new_seg # the first and the second words are all_tone_three - def _merge_continuous_three_tones( - self, seg: List[Tuple[str, str]] - ) -> List[Tuple[str, str]]: + def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) - for (word, pos) in seg + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) @@ -715,10 +698,7 @@ class ToneSandhi: and not merge_last[i - 1] ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if ( - not self._is_reduplication(seg[i - 1][0]) - and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 - ): + if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -732,13 +712,10 @@ class ToneSandhi: return len(word) == 2 and word[0] == word[1] # the last char of first word and the first char of second word is tone_three - def _merge_continuous_three_tones_2( - self, seg: List[Tuple[str, str]] - ) -> List[Tuple[str, str]]: + def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) - for (word, pos) in seg + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) @@ -750,10 +727,7 @@ class ToneSandhi: and not merge_last[i - 1] ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if ( - not self._is_reduplication(seg[i - 1][0]) - and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 - ): + if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: diff --git a/GPT_SoVITS/text/zh_normalization/char_convert.py b/GPT_SoVITS/text/zh_normalization/char_convert.py index dcf95d72..5b57ed97 100644 --- a/GPT_SoVITS/text/zh_normalization/char_convert.py +++ b/GPT_SoVITS/text/zh_normalization/char_convert.py @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters. 
-""" -simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄
悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪
烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂
麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' +"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters.""" -traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭
艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥
渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃
鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' +simplified_charcters = "制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊
掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺
洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉
駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤" + +traditional_characters = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡
榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴
隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒
頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤" assert len(simplified_charcters) == len(simplified_charcters) @@ -28,13 +28,11 @@ for i, item in enumerate(simplified_charcters): def tranditional_to_simplified(text: str) -> str: - return "".join( - [t2s_dict[item] if item in t2s_dict else item for item in text]) + return "".join([t2s_dict[item] if item in t2s_dict else item for item in text]) def simplified_to_traditional(text: str) -> str: - return "".join( - [s2t_dict[item] if item in s2t_dict else item for item in text]) + return "".join([s2t_dict[item] if item in s2t_dict else item for item in text]) if __name__ == "__main__": diff --git a/GPT_SoVITS/text/zh_normalization/chronology.py b/GPT_SoVITS/text/zh_normalization/chronology.py index ea4558e2..2a6f66c2 100644 --- a/GPT_SoVITS/text/zh_normalization/chronology.py +++ b/GPT_SoVITS/text/zh_normalization/chronology.py @@ -21,25 +21,29 @@ from .num import verbalize_digit def _time_num2str(num_string: str) -> str: """A special case for verbalizing number in time.""" - result = num2str(num_string.lstrip('0')) - if num_string.startswith('0'): - result = DIGITS['0'] + result + result = num2str(num_string.lstrip("0")) + if num_string.startswith("0"): + result = DIGITS["0"] + result return result # 时刻表达式 -RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' - r':([0-5][0-9])' - r'(:([0-5][0-9]))?') +RE_TIME = re.compile( + r"([0-1]?[0-9]|2[0-3])" + r":([0-5][0-9])" + r"(:([0-5][0-9]))?" +) # 时间范围,如8:30-12:30 -RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' - r':([0-5][0-9])' - r'(:([0-5][0-9]))?' - r'(~|-)' - r'([0-1]?[0-9]|2[0-3])' - r':([0-5][0-9])' - r'(:([0-5][0-9]))?') +RE_TIME_RANGE = re.compile( + r"([0-1]?[0-9]|2[0-3])" + r":([0-5][0-9])" + r"(:([0-5][0-9]))?" + r"(~|-)" + r"([0-1]?[0-9]|2[0-3])" + r":([0-5][0-9])" + r"(:([0-5][0-9]))?" +) def replace_time(match) -> str: @@ -62,31 +66,33 @@ def replace_time(match) -> str: second_2 = match.group(9) result = f"{num2str(hour)}点" - if minute.lstrip('0'): + if minute.lstrip("0"): if int(minute) == 30: result += "半" else: result += f"{_time_num2str(minute)}分" - if second and second.lstrip('0'): + if second and second.lstrip("0"): result += f"{_time_num2str(second)}秒" if is_range: result += "至" result += f"{num2str(hour_2)}点" - if minute_2.lstrip('0'): + if minute_2.lstrip("0"): if int(minute) == 30: result += "半" else: result += f"{_time_num2str(minute_2)}分" - if second_2 and second_2.lstrip('0'): + if second_2 and second_2.lstrip("0"): result += f"{_time_num2str(second_2)}秒" return result -RE_DATE = re.compile(r'(\d{4}|\d{2})年' - r'((0?[1-9]|1[0-2])月)?' - r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') +RE_DATE = re.compile( + r"(\d{4}|\d{2})年" + r"((0?[1-9]|1[0-2])月)?" + r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?" 
+) def replace_date(match) -> str: @@ -110,8 +116,7 @@ def replace_date(match) -> str: # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 -RE_DATE2 = re.compile( - r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') +RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])") def replace_date2(match) -> str: diff --git a/GPT_SoVITS/text/zh_normalization/constants.py b/GPT_SoVITS/text/zh_normalization/constants.py index 6423ad74..4218a551 100644 --- a/GPT_SoVITS/text/zh_normalization/constants.py +++ b/GPT_SoVITS/text/zh_normalization/constants.py @@ -18,10 +18,7 @@ from pypinyin.constants import SUPPORT_UCS4 # 全角半角转换 # 英文字符全角 -> 半角映射表 (num: 52) -F2H_ASCII_LETTERS = { - ord(char) + 65248: ord(char) - for char in string.ascii_letters -} +F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters} # 英文字符半角 -> 全角映射表 H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} @@ -37,26 +34,29 @@ F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} # 空格 (num: 1) -F2H_SPACE = {'\u3000': ' '} -H2F_SPACE = {' ': '\u3000'} +F2H_SPACE = {"\u3000": " "} +H2F_SPACE = {" ": "\u3000"} # 非"有拼音的汉字"的字符串,可用于NSW提取 if SUPPORT_UCS4: - RE_NSW = re.compile(r'(?:[^' - r'\u3007' # 〇 - r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] - r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] - r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] - r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] - r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] - r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] - r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] - r'])+') + RE_NSW = re.compile( + r"(?:[^" + r"\u3007" # 〇 + r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] + r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] + r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] + r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF] + r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F] + r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D] + r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F] + r"])+" + ) else: RE_NSW = re.compile( # pragma: no cover - r'(?:[^' - r'\u3007' # 〇 - r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] - r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] - r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] - r'])+') + r"(?:[^" + r"\u3007" # 〇 + r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] + r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] + r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] + r"])+" + ) diff --git a/GPT_SoVITS/text/zh_normalization/num.py b/GPT_SoVITS/text/zh_normalization/num.py index c0460a03..c3af4d6a 100644 --- a/GPT_SoVITS/text/zh_normalization/num.py +++ b/GPT_SoVITS/text/zh_normalization/num.py @@ -15,23 +15,26 @@ Rules to verbalize numbers into Chinese characters. 
https://zh.wikipedia.org/wiki/中文数字#現代中文 """ + import re from collections import OrderedDict from typing import List -DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} -UNITS = OrderedDict({ - 1: '十', - 2: '百', - 3: '千', - 4: '万', - 8: '亿', -}) +DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")} +UNITS = OrderedDict( + { + 1: "十", + 2: "百", + 3: "千", + 4: "万", + 8: "亿", + } +) -COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)" # 分数表达式 -RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') +RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)") def replace_frac(match) -> str: @@ -52,7 +55,7 @@ def replace_frac(match) -> str: # 百分数表达式 -RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') +RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%") def replace_percentage(match) -> str: @@ -72,7 +75,7 @@ def replace_percentage(match) -> str: # 整数表达式 # 带负号的整数 -10 -RE_INTEGER = re.compile(r'(-)' r'(\d+)') +RE_INTEGER = re.compile(r"(-)" r"(\d+)") def replace_negative_num(match) -> str: @@ -92,7 +95,7 @@ def replace_negative_num(match) -> str: # 编号-无符号整形 # 00078 -RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') +RE_DEFAULT_NUM = re.compile(r"\d{3}\d*") def replace_default_num(match): @@ -110,15 +113,11 @@ def replace_default_num(match): # RE_ASMD = re.compile( # r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))') RE_ASMD = re.compile( - r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))') + r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))" +) + +asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"} -asmd_map = { - '+': '加', - '-': '减', - '×': '乘', - '÷': '除', - '=': '等于' -} def replace_asmd(match) -> str: """ @@ -132,24 +131,25 @@ def replace_asmd(match) -> str: # 次方专项 -RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+') +RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+") power_map = { - '⁰': '0', - '¹': '1', - '²': '2', - '³': '3', - '⁴': '4', - '⁵': '5', - '⁶': '6', - '⁷': '7', - '⁸': '8', - '⁹': '9', - 'ˣ': 'x', - 'ʸ': 'y', - 'ⁿ': 'n' + "⁰": "0", + "¹": "1", + "²": "2", + "³": "3", + "⁴": "4", + "⁵": "5", + "⁶": "6", + "⁷": "7", + "⁸": "8", + "⁹": "9", + "ˣ": "x", + "ʸ": "y", + "ⁿ": "n", } + def replace_power(match) -> str: """ Args: @@ -166,10 +166,10 @@ def replace_power(match) -> str: # 数字表达式 # 纯小数 -RE_DECIMAL_NUM = 
re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') +RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))") # 正整数 + 量词 RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS) -RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') +RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))") def replace_positive_quantifier(match) -> str: @@ -220,7 +220,9 @@ RE_RANGE = re.compile( [-~] # 匹配范围分隔符 ((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数) (?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符 - """, re.VERBOSE) + """, + re.VERBOSE, +) def replace_range(match) -> str: @@ -239,7 +241,9 @@ def replace_range(match) -> str: # ~至表达式 RE_TO_RANGE = re.compile( - r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)') + r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)" +) + def replace_to_range(match) -> str: """ @@ -248,71 +252,66 @@ def replace_to_range(match) -> str: Returns: str """ - result = match.group(0).replace('~', '至') + result = match.group(0).replace("~", "至") return result -def _get_value(value_string: str, use_zero: bool=True) -> List[str]: - stripped = value_string.lstrip('0') +def _get_value(value_string: str, use_zero: bool = True) -> List[str]: + stripped = value_string.lstrip("0") if len(stripped) == 0: return [] elif len(stripped) == 1: if use_zero and len(stripped) < len(value_string): - return [DIGITS['0'], DIGITS[stripped]] + return [DIGITS["0"], DIGITS[stripped]] else: return [DIGITS[stripped]] else: - largest_unit = next( - power for power in reversed(UNITS.keys()) if power < len(stripped)) + largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped)) first_part = value_string[:-largest_unit] second_part = value_string[-largest_unit:] - return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( - second_part) + return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part) def verbalize_cardinal(value_string: str) -> str: if not value_string: - return '' + return "" # 000 -> '零' , 0 -> '零' - value_string = value_string.lstrip('0') + value_string = value_string.lstrip("0") if len(value_string) == 0: - return DIGITS['0'] + return DIGITS["0"] result_symbols = _get_value(value_string) # verbalized number starting with '一十*' is abbreviated as `十*` - if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ - '1'] and result_symbols[1] == UNITS[1]: + if len(result_symbols) >= 2 and result_symbols[0] == DIGITS["1"] and result_symbols[1] == UNITS[1]: result_symbols = result_symbols[1:] - return ''.join(result_symbols) + return "".join(result_symbols) def verbalize_digit(value_string: str, alt_one=False) -> str: result_symbols = [DIGITS[digit] for digit in value_string] - result = ''.join(result_symbols) + result = "".join(result_symbols) if alt_one: result = result.replace("一", "幺") return result def num2str(value_string: str) -> str: - integer_decimal = value_string.split('.') + integer_decimal = value_string.split(".") if len(integer_decimal) == 1: integer = integer_decimal[0] - decimal = '' + decimal = "" elif len(integer_decimal) == 2: integer, decimal = integer_decimal else: - raise ValueError( - f"The value string: '${value_string}' has more than one point in it." 
- ) + raise ValueError(f"The value string: '${value_string}' has more than one point in it.") result = verbalize_cardinal(integer) - decimal = decimal.rstrip('0') + decimal = decimal.rstrip("0") if decimal: # '.22' is verbalized as '零点二二' # '3.20' is verbalized as '三点二 result = result if result else "零" - result += '点' + verbalize_digit(decimal) + result += "点" + verbalize_digit(decimal) return result diff --git a/GPT_SoVITS/text/zh_normalization/phonecode.py b/GPT_SoVITS/text/zh_normalization/phonecode.py index 51835112..3560ac2e 100644 --- a/GPT_SoVITS/text/zh_normalization/phonecode.py +++ b/GPT_SoVITS/text/zh_normalization/phonecode.py @@ -21,10 +21,8 @@ from .num import verbalize_digit # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 # 联通:130、131、132、156、155、186、185、176 # 电信:133、153、189、180、181、177 -RE_MOBILE_PHONE = re.compile( - r"(? str: if mobile: - sp_parts = phone_string.strip('+').split() - result = ','.join( - [verbalize_digit(part, alt_one=True) for part in sp_parts]) + sp_parts = phone_string.strip("+").split() + result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: - sil_parts = phone_string.split('-') - result = ','.join( - [verbalize_digit(part, alt_one=True) for part in sil_parts]) + sil_parts = phone_string.split("-") + result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/GPT_SoVITS/text/zh_normalization/quantifier.py b/GPT_SoVITS/text/zh_normalization/quantifier.py index 598030e4..1e7f2aab 100644 --- a/GPT_SoVITS/text/zh_normalization/quantifier.py +++ b/GPT_SoVITS/text/zh_normalization/quantifier.py @@ -17,7 +17,7 @@ from .num import num2str # 温度表达式,温度会影响负号的读法 # -3°C 零下三度 -RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') +RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)") measure_dict = { "cm2": "平方厘米", "cm²": "平方厘米", @@ -35,7 +35,7 @@ measure_dict = { "ml": "毫升", "m": "米", "mm": "毫米", - "s": "秒" + "s": "秒", } diff --git a/GPT_SoVITS/text/zh_normalization/text_normlization.py b/GPT_SoVITS/text/zh_normalization/text_normlization.py index 400b30ff..099b01bd 100644 --- a/GPT_SoVITS/text/zh_normalization/text_normlization.py +++ b/GPT_SoVITS/text/zh_normalization/text_normlization.py @@ -56,9 +56,9 @@ from .quantifier import replace_measure from .quantifier import replace_temperature -class TextNormalizer(): +class TextNormalizer: def __init__(self): - self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') + self.SENTENCE_SPLITOR = re.compile(r"([:、,;。?!,;?!][”’]?)") def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. 
@@ -71,66 +71,64 @@ class TextNormalizer(): if lang == "zh": text = text.replace(" ", "") # 过滤掉特殊字符 - text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text) - text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) + text = re.sub(r"[——《》【】<>{}()()#&@“”^_|\\]", "", text) + text = self.SENTENCE_SPLITOR.sub(r"\1\n", text) text = text.strip() - sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] + sentences = [sentence.strip() for sentence in re.split(r"\n+", text)] return sentences def _post_replace(self, sentence: str) -> str: - sentence = sentence.replace('/', '每') + sentence = sentence.replace("/", "每") # sentence = sentence.replace('~', '至') # sentence = sentence.replace('~', '至') - sentence = sentence.replace('①', '一') - sentence = sentence.replace('②', '二') - sentence = sentence.replace('③', '三') - sentence = sentence.replace('④', '四') - sentence = sentence.replace('⑤', '五') - sentence = sentence.replace('⑥', '六') - sentence = sentence.replace('⑦', '七') - sentence = sentence.replace('⑧', '八') - sentence = sentence.replace('⑨', '九') - sentence = sentence.replace('⑩', '十') - sentence = sentence.replace('α', '阿尔法') - sentence = sentence.replace('β', '贝塔') - sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') - sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') - sentence = sentence.replace('ε', '艾普西龙') - sentence = sentence.replace('ζ', '捷塔') - sentence = sentence.replace('η', '依塔') - sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') - sentence = sentence.replace('ι', '艾欧塔') - sentence = sentence.replace('κ', '喀帕') - sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') - sentence = sentence.replace('μ', '缪') - sentence = sentence.replace('ν', '拗') - sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') - sentence = sentence.replace('ο', '欧米克伦') - sentence = sentence.replace('π', '派').replace('Π', '派') - sentence = sentence.replace('ρ', '肉') - sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( - 'σ', '西格玛') - sentence = sentence.replace('τ', '套') - sentence = sentence.replace('υ', '宇普西龙') - sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') - sentence = sentence.replace('χ', '器') - sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') - sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') + sentence = sentence.replace("①", "一") + sentence = sentence.replace("②", "二") + sentence = sentence.replace("③", "三") + sentence = sentence.replace("④", "四") + sentence = sentence.replace("⑤", "五") + sentence = sentence.replace("⑥", "六") + sentence = sentence.replace("⑦", "七") + sentence = sentence.replace("⑧", "八") + sentence = sentence.replace("⑨", "九") + sentence = sentence.replace("⑩", "十") + sentence = sentence.replace("α", "阿尔法") + sentence = sentence.replace("β", "贝塔") + sentence = sentence.replace("γ", "伽玛").replace("Γ", "伽玛") + sentence = sentence.replace("δ", "德尔塔").replace("Δ", "德尔塔") + sentence = sentence.replace("ε", "艾普西龙") + sentence = sentence.replace("ζ", "捷塔") + sentence = sentence.replace("η", "依塔") + sentence = sentence.replace("θ", "西塔").replace("Θ", "西塔") + sentence = sentence.replace("ι", "艾欧塔") + sentence = sentence.replace("κ", "喀帕") + sentence = sentence.replace("λ", "拉姆达").replace("Λ", "拉姆达") + sentence = sentence.replace("μ", "缪") + sentence = sentence.replace("ν", "拗") + sentence = sentence.replace("ξ", "克西").replace("Ξ", "克西") + sentence = sentence.replace("ο", "欧米克伦") + sentence = sentence.replace("π", "派").replace("Π", "派") + sentence = sentence.replace("ρ", "肉") + sentence = 
sentence.replace("ς", "西格玛").replace("Σ", "西格玛").replace("σ", "西格玛") + sentence = sentence.replace("τ", "套") + sentence = sentence.replace("υ", "宇普西龙") + sentence = sentence.replace("φ", "服艾").replace("Φ", "服艾") + sentence = sentence.replace("χ", "器") + sentence = sentence.replace("ψ", "普赛").replace("Ψ", "普赛") + sentence = sentence.replace("ω", "欧米伽").replace("Ω", "欧米伽") # 兜底数学运算,顺便兼容懒人用语 - sentence = sentence.replace('+', '加') - sentence = sentence.replace('-', '减') - sentence = sentence.replace('×', '乘') - sentence = sentence.replace('÷', '除') - sentence = sentence.replace('=', '等') + sentence = sentence.replace("+", "加") + sentence = sentence.replace("-", "减") + sentence = sentence.replace("×", "乘") + sentence = sentence.replace("÷", "除") + sentence = sentence.replace("=", "等") # re filter special characters, have one more character "-" than line 68 - sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence) + sentence = re.sub(r"[-——《》【】<=>{}()()#&@“”^_|\\]", "", sentence) return sentence def normalize_sentence(self, sentence: str) -> str: # basic character conversions sentence = tranditional_to_simplified(sentence) - sentence = sentence.translate(F2H_ASCII_LETTERS).translate( - F2H_DIGITS).translate(F2H_SPACE) + sentence = sentence.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS).translate(F2H_SPACE) # number related NSW verbalization sentence = RE_DATE.sub(replace_date, sentence) @@ -161,8 +159,7 @@ class TextNormalizer(): sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) - sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, - sentence) + sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence) sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) sentence = RE_NUMBER.sub(replace_number, sentence) sentence = self._post_replace(sentence) diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py index 177eda14..f6f388ac 100644 --- a/GPT_SoVITS/utils.py +++ b/GPT_SoVITS/utils.py @@ -1,24 +1,22 @@ -import os -import glob -import sys import argparse -import logging +import glob import json +import logging +import os import subprocess +import sys import traceback import librosa import numpy as np -from scipy.io.wavfile import read import torch -import logging logging.getLogger("numba").setLevel(logging.ERROR) logging.getLogger("matplotlib").setLevel(logging.ERROR) MATPLOTLIB_FLAG = False -logging.basicConfig(stream=sys.stdout, level=logging.ERROR) +logging.basicConfig(stream=sys.stdout, level=logging.INFO) logger = logging @@ -27,11 +25,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") iteration = checkpoint_dict["iteration"] learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None - and not skip_optimizer - and checkpoint_dict["optimizer"] is not None - ): + if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None: optimizer.load_state_dict(checkpoint_dict["optimizer"]) saved_state_dict = checkpoint_dict["model"] if hasattr(model, "module"): @@ -50,9 +44,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False ) except: traceback.print_exc() - print( - "error, %s is not in the checkpoint" % k - ) # shape不对也会,比如text_embedding当cleaner修改时 + print("error, %s is not in the checkpoint" % k) # shape不对也会,比如text_embedding当cleaner修改时 new_state_dict[k] = v if hasattr(model, "module"): 
model.module.load_state_dict(new_state_dict) @@ -60,25 +52,28 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False model.load_state_dict(new_state_dict) print("load ") logger.info( - "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration) + "Loaded checkpoint '{}' (iteration {})".format( + checkpoint_path, + iteration, + ) ) return model, optimizer, learning_rate, iteration -from time import time as ttime + import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s.pth"%(ttime()) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) +from time import time as ttime + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s.pth" % (ttime()) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at iteration {} to {}".format( - iteration, checkpoint_path - ) - ) + logger.info("Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path)) if hasattr(model, "module"): state_dict = model.module.state_dict() else: @@ -132,7 +127,6 @@ def plot_spectrogram_to_numpy(spectrogram): mpl_logger = logging.getLogger("matplotlib") mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt - import numpy as np fig, ax = plt.subplots(figsize=(10, 2)) im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") @@ -158,11 +152,13 @@ def plot_alignment_to_numpy(alignment, info=None): mpl_logger = logging.getLogger("matplotlib") mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt - import numpy as np fig, ax = plt.subplots(figsize=(6, 4)) im = ax.imshow( - alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + alignment.transpose(), + aspect="auto", + origin="lower", + interpolation="none", ) fig.colorbar(im, ax=ax) xlabel = "Decoder timestep" @@ -199,9 +195,7 @@ def get_hparams(init=True, stage=1): default="./configs/s2.json", help="JSON file for configuration", ) - parser.add_argument( - "-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir" - ) + parser.add_argument("-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir") parser.add_argument( "-rs", "--resume_step", @@ -250,11 +244,7 @@ def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_tim """ import re - ckpts_files = [ - f - for f in os.listdir(path_to_models) - if os.path.isfile(os.path.join(path_to_models, f)) - ] + ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] name_key = lambda _f: int(re.compile("._(\d+)\.pth").match(_f).group(1)) time_key = lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)) sort_key = time_key if sort_by_time else name_key @@ -263,8 +253,7 @@ def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_tim key=sort_key, ) to_del = [ - os.path.join(path_to_models, fn) - for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + os.path.join(path_to_models, fn) for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) ] del_info = lambda fn: logger.info(f".. 
Free up space by deleting ckpt {fn}") del_routine = lambda x: [os.remove(x), del_info(x)] @@ -296,7 +285,7 @@ def check_git_hash(model_dir): if not os.path.exists(os.path.join(source_dir, ".git")): logger.warn( "{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir + source_dir, ) ) return @@ -309,7 +298,8 @@ def check_git_hash(model_dir): if saved_hash != cur_hash: logger.warn( "git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8] + saved_hash[:8], + cur_hash[:8], ) ) else: @@ -319,13 +309,13 @@ def check_git_hash(model_dir): def get_logger(model_dir, filename="train.log"): global logger logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.ERROR) + logger.setLevel(logging.INFO) formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") if not os.path.exists(model_dir): os.makedirs(model_dir) h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.ERROR) + h.setLevel(logging.INFO) h.setFormatter(formatter) logger.addHandler(h) return logger @@ -366,6 +356,6 @@ class HParams: if __name__ == "__main__": print( load_wav_to_torch( - "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac" + "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac", ) ) diff --git a/GPT_SoVITS_Inference.ipynb b/GPT_SoVITS_Inference.ipynb deleted file mode 100644 index 1b8ec64d..00000000 --- a/GPT_SoVITS_Inference.ipynb +++ /dev/null @@ -1,153 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "himHYZmra7ix" - }, - "source": [ - "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e9b7iFV3dm1f" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", - "%cd GPT-SoVITS\n", - "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n", - "!pip install -r extra-req.txt --no-deps\n", - "!pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "0NgxXg5sjv7z" - }, - "outputs": [], - "source": [ - "# @title Download pretrained models 下载预训练模型\n", - "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n", - "!mkdir -p /content/GPT-SoVITS/tools/uvr5\n", - "%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", - "%cd /content/GPT-SoVITS/tools/damo_asr/models\n", - "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", - "# @title UVR5 pretrains 安装uvr5模型\n", - "%cd /content/GPT-SoVITS/tools/uvr5\n", - "!git clone https://huggingface.co/Delik/uvr5_weights\n", - "!git config core.sparseCheckout true\n", - "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "cPDEH-9czOJF" - }, - "outputs": [], - "source": [ - "#@title Create 
folder models 创建文件夹模型\n", - "import os\n", - "base_directory = \"/content/GPT-SoVITS\"\n", - "folder_names = [\"SoVITS_weights\", \"GPT_weights\"]\n", - "\n", - "for folder_name in folder_names:\n", - " if os.path.exists(os.path.join(base_directory, folder_name)):\n", - " print(f\"The folder '{folder_name}' already exists. (文件夹'{folder_name}'已经存在。)\")\n", - " else:\n", - " os.makedirs(os.path.join(base_directory, folder_name))\n", - " print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n", - "\n", - "print(\"All folders have been created. (所有文件夹均已创建。)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "vbZY-LnM0tzq" - }, - "outputs": [], - "source": [ - "import requests\n", - "import zipfile\n", - "import shutil\n", - "import os\n", - "\n", - "#@title Import model 导入模型 (HuggingFace)\n", - "hf_link = 'https://huggingface.co/modelloosrvcc/Nagisa_Shingetsu_GPT-SoVITS/resolve/main/Nagisa.zip' #@param {type: \"string\"}\n", - "\n", - "output_path = '/content/'\n", - "\n", - "response = requests.get(hf_link)\n", - "with open(output_path + 'file.zip', 'wb') as file:\n", - " file.write(response.content)\n", - "\n", - "with zipfile.ZipFile(output_path + 'file.zip', 'r') as zip_ref:\n", - " zip_ref.extractall(output_path)\n", - "\n", - "os.remove(output_path + \"file.zip\")\n", - "\n", - "source_directory = output_path\n", - "SoVITS_destination_directory = '/content/GPT-SoVITS/SoVITS_weights'\n", - "GPT_destination_directory = '/content/GPT-SoVITS/GPT_weights'\n", - "\n", - "for filename in os.listdir(source_directory):\n", - " if filename.endswith(\".pth\"):\n", - " source_path = os.path.join(source_directory, filename)\n", - " destination_path = os.path.join(SoVITS_destination_directory, filename)\n", - " shutil.move(source_path, destination_path)\n", - "\n", - "for filename in os.listdir(source_directory):\n", - " if filename.endswith(\".ckpt\"):\n", - " source_path = os.path.join(source_directory, filename)\n", - " destination_path = os.path.join(GPT_destination_directory, filename)\n", - " shutil.move(source_path, destination_path)\n", - "\n", - "print(f'Model downloaded. 
(模型已下载。)')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "4oRGUzkrk8C7" - }, - "outputs": [], - "source": [ - "# @title launch WebUI 启动WebUI\n", - "!/usr/local/bin/pip install ipykernel\n", - "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", - "%cd /content/GPT-SoVITS/\n", - "!/usr/local/bin/python webui.py" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/README.md b/README.md index 8be71a65..b32d2fd9 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,15 @@ For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/ ### Tested Environments -- Python 3.9, PyTorch 2.0.1, CUDA 11 -- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3 -- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon) -- Python 3.9, PyTorch 2.2.2, CPU devices - -_Note: numba==0.56.4 requires py<3.11_ +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -60,33 +63,41 @@ If you are a Windows user (tested with win>=10), you can [download the integrate ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh +bash install.sh --device --source [--download-uvr5] ``` ### macOS **Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.** -1. Install Xcode command-line tools by running `xcode-select --install`. -2. Install FFmpeg by running `brew install ffmpeg`. -3. Install the program by running the following commands: +Install the program by running the following commands: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +bash install.sh --device --source [--download-uvr5] ``` ### Install Manually +#### Install Dependences + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### Install FFmpeg ##### Conda Users ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -95,14 +106,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows Users -Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. 
+Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root -Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only) +Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ##### MacOS Users @@ -110,45 +120,64 @@ Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Ko brew install ffmpeg ``` -#### Install Dependences +### Running GPT-SoVITS with Docker + +#### Docker Image Selection + +Due to rapid development in the codebase and a slower Docker image release cycle, please: + +- Check [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) for the latest available image tags +- Choose an appropriate image tag for your environment +- `Lite` means the Docker image does not include ASR models and UVR5 models. You can manually download the UVR5 models, while the program will automatically download the ASR models as needed +- The appropriate architecture image (amd64/arm64) will be automatically pulled during Docker Compose +- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes + +#### Environment Variables + +- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage. + +#### Shared Memory Configuration + +On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory. + +#### Choosing a Service + +The `docker-compose.yaml` defines two services: + +- `GPT-SoVITS-CU126` & `GPT-SoVITS-CU128`: Full version with all features. +- `GPT-SoVITS-CU126-Lite` & `GPT-SoVITS-CU128-Lite`: Lightweight version with reduced dependencies and functionality. + +To run a specific service with Docker Compose, use: ```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +docker compose run --service-ports ``` -### Using Docker +#### Building the Docker Image Locally -#### docker-compose.yaml configuration +If you want to build the image yourself, use: -0. Regarding image tags: Due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) for the currently packaged latest images and select as per your situation, or alternatively, build locally using a Dockerfile according to your own needs. -1. Environment Variables: - - is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation. -2. Volumes Configuration,The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content. -3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation. -4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances. 
- -#### Running with docker compose - -``` -docker compose -f "docker-compose.yaml" up -d +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -#### Running with docker command +#### Accessing the Running Container (Bash Shell) -As above, modify the corresponding parameters based on your actual situation, then run the following command: +Once the container is running in the background, you can access it using: -``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +```bash +docker exec -it bash ``` ## Pretrained Models +**If `install.sh` runs successfully, you may skip No.1,2,3** + **Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).** 1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`. -2. Download G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only) +2. Download G2PW models from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only) 3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. @@ -165,7 +194,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker The TTS annotation .list file format: ``` + vocal_path|speaker_name|language|text + ``` Language dictionary: @@ -179,7 +210,9 @@ Language dictionary: Example: ``` + D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. + ``` ## Finetune and inference @@ -209,12 +242,12 @@ Or maunally switch version in WebUI #### Path Auto-filling is now supported - 1. Fill in the audio path - 2. Slice the audio into small chunks - 3. Denoise(optinal) - 4. ASR - 5. Proofreading ASR transcriptions - 6. Go to the next Tab, then finetune the model +1. Fill in the audio path +2. Slice the audio into small chunks +3. Denoise(optinal) +4. ASR +5. Proofreading ASR transcriptions +6. Go to the next Tab, then finetune the model ### Open Inference WebUI @@ -256,9 +289,9 @@ Use v2 from v1 environment: 2. Clone the latest codes from github. -3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`. +3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`. 
- Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. + Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.) ## V3 Release Notes @@ -268,7 +301,7 @@ New Features: 2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression. - [more details]() + [more details]() Use v3 from v2 environment: @@ -276,10 +309,25 @@ Use v3 from v2 environment: 2. Clone the latest codes from github. -3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`. +3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`. additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt) +## V4 Release Notes + +New Features: + +1. Version 4 fixes the issue of metallic artifacts in Version 3 caused by non-integer multiple upsampling, and natively outputs 48k audio to prevent muffled sound (whereas Version 3 only natively outputs 24k audio). The author considers Version 4 a direct replacement for Version 3, though further testing is still needed. + [more details]() + +Use v4 from v1/v2/v3 environment: + +1. `pip install -r requirements.txt` to update some packages + +2. Clone the latest codes from github. + +3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`. 
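As a quick aid to the v4 download steps above, here is a small, hypothetical Python check (not part of the repository) that verifies the files listed in step 3 ended up under `GPT_SoVITS/pretrained_models/`; the helper and its messages are illustrative only, and the paths assume the layout named in that step.

```python
# Hypothetical sanity check for the v4 pretrained weights described above.
# Paths follow step 3 (gsv-v4-pretrained/s2v4.ckpt and gsv-v4-pretrained/vocoder.pth
# placed under GPT_SoVITS/pretrained_models); adjust if your layout differs.
from pathlib import Path

REQUIRED_V4_FILES = [
    Path("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2v4.ckpt"),
    Path("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth"),
]

missing = [p for p in REQUIRED_V4_FILES if not p.is_file()]
if missing:
    print("Missing v4 pretrained files:")
    for p in missing:
        print(f"  {p}")
else:
    print("All v4 pretrained files are in place.")
```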
+ ## Todo List - [x] **High Priority:** @@ -304,7 +352,7 @@ Use v3 from v2 environment: Use the command line to open the WebUI for UVR5 -``` +```bash python tools/uvr5/webui.py "" ``` @@ -315,7 +363,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- This is how the audio segmentation of the dataset is done using the command line -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -327,7 +375,7 @@ python audio_slicer.py \ This is how dataset ASR processing is done using the command line(Only Chinese) -``` +```bash python tools/asr/funasr_asr.py -i -o ``` @@ -335,7 +383,7 @@ ASR processing is performed through Faster_Whisper(ASR marking except Chinese) (No progress bars, GPU performance may cause time delays) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` diff --git a/api.py b/api.py index d92d9c8b..4b79d501 100644 --- a/api.py +++ b/api.py @@ -140,9 +140,9 @@ RESP: 无 """ - import argparse -import os,re +import os +import re import sys now_dir = os.getcwd() @@ -152,10 +152,11 @@ sys.path.append("%s/GPT_SoVITS" % (now_dir)) import signal from text.LangSegmenter import LangSegmenter from time import time as ttime -import torch, torchaudio +import torch +import torchaudio import librosa import soundfile as sf -from fastapi import FastAPI, Request, Query, HTTPException +from fastapi import FastAPI, Request, Query from fastapi.responses import StreamingResponse, JSONResponse import uvicorn from transformers import AutoModelForMaskedLM, AutoTokenizer @@ -163,12 +164,11 @@ import numpy as np from feature_extractor import cnhubert from io import BytesIO from module.models import SynthesizerTrn, SynthesizerTrnV3 -from peft import LoraConfig, PeftModel, get_peft_model +from peft import LoraConfig, get_peft_model from AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence from text.cleaner import clean_text from module.mel_processing import spectrogram_torch -from tools.my_utils import load_audio import config as global_config import logging import subprocess @@ -201,7 +201,11 @@ def is_full(*items): # 任意一项为空返回False def init_bigvgan(): global bigvgan_model from BigVGAN import bigvgan - bigvgan_model = bigvgan.BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode bigvgan_model.remove_weight_norm() bigvgan_model = bigvgan_model.eval() @@ -211,57 +215,71 @@ def init_bigvgan(): bigvgan_model = bigvgan_model.to(device) -resample_transform_dict={} +resample_transform_dict = {} + + def resample(audio_tensor, sr0): global resample_transform_dict if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample( - sr0, 24000 - ).to(device) + resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) return resample_transform_dict[sr0](audio_tensor) -from module.mel_processing import spectrogram_torch,mel_spectrogram_torch +from module.mel_processing import mel_spectrogram_torch + spec_min = -12 spec_max = 2 + + def norm_spec(x): return (x - spec_min) / (spec_max - 
spec_min) * 2 - 1 + + def denorm_spec(x): return (x + 1) / 2 * (spec_max - spec_min) + spec_min -mel_fn=lambda x: mel_spectrogram_torch(x, **{ - "n_fft": 1024, - "win_size": 1024, - "hop_size": 256, - "num_mels": 100, - "sampling_rate": 24000, - "fmin": 0, - "fmax": None, - "center": False -}) -sr_model=None -def audio_sr(audio,sr): +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + + +sr_model = None + + +def audio_sr(audio, sr): global sr_model - if sr_model==None: + if sr_model == None: from tools.audio_sr import AP_BWE + try: - sr_model=AP_BWE(device,DictToAttrRecursive) + sr_model = AP_BWE(device, DictToAttrRecursive) except FileNotFoundError: logger.info("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载") - return audio.cpu().detach().numpy(),sr - return sr_model(audio,sr) + return audio.cpu().detach().numpy(), sr + return sr_model(audio, sr) class Speaker: - def __init__(self, name, gpt, sovits, phones = None, bert = None, prompt = None): + def __init__(self, name, gpt, sovits, phones=None, bert=None, prompt=None): self.name = name self.sovits = sovits self.gpt = gpt self.phones = phones self.bert = bert self.prompt = prompt - + + speaker_list = {} @@ -270,22 +288,25 @@ class Sovits: self.vq_model = vq_model self.hps = hps -from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new -def get_sovits_weights(sovits_path): - path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" - is_exist_s2gv3=os.path.exists(path_sovits_v3) - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) - if if_lora_v3==True and is_exist_s2gv3==False: +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + + +def get_sovits_weights(sovits_path): + path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + is_exist_s2gv3 = os.path.exists(path_sovits_v3) + + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + if if_lora_v3 == True and is_exist_s2gv3 == False: logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") dict_s2 = load_sovits_new(sovits_path) hps = dict_s2["config"] hps = DictToAttrRecursive(hps) hps.model.semantic_frame_rate = "25hz" - if 'enc_p.text_embedding.weight' not in dict_s2['weight']: - hps.model.version = "v2"#v3model,v2sybomls - elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: hps.model.version = "v1" else: hps.model.version = "v2" @@ -294,27 +315,28 @@ def get_sovits_weights(sovits_path): hps.model.version = "v3" model_params_dict = vars(hps.model) - if model_version!="v3": + if model_version != "v3": vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **model_params_dict + **model_params_dict, ) else: vq_model = SynthesizerTrnV3( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **model_params_dict + **model_params_dict, ) init_bigvgan() - model_version=hps.model.version + model_version = hps.model.version logger.info(f"模型版本: {model_version}") - if ("pretrained" not in sovits_path): + if "pretrained" not in sovits_path: try: del vq_model.enc_q - except:pass + except: + pass if is_half == True: 
vq_model = vq_model.half().to(device) else: @@ -324,7 +346,7 @@ def get_sovits_weights(sovits_path): vq_model.load_state_dict(dict_s2["weight"], strict=False) else: vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False) - lora_rank=dict_s2["lora_rank"] + lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, @@ -340,13 +362,17 @@ def get_sovits_weights(sovits_path): sovits = Sovits(vq_model, hps) return sovits + class Gpt: def __init__(self, max_sec, t2s_model): self.max_sec = max_sec self.t2s_model = t2s_model + global hz hz = 50 + + def get_gpt_weights(gpt_path): dict_s1 = torch.load(gpt_path, map_location="cpu") config = dict_s1["config"] @@ -363,7 +389,8 @@ def get_gpt_weights(gpt_path): gpt = Gpt(max_sec, t2s_model) return gpt -def change_gpt_sovits_weights(gpt_path,sovits_path): + +def change_gpt_sovits_weights(gpt_path, sovits_path): try: gpt = get_gpt_weights(gpt_path) sovits = get_sovits_weights(sovits_path) @@ -392,16 +419,16 @@ def get_bert_feature(text, word2ph): def clean_text_inf(text, language, version): - language = language.replace("all_","") + language = language.replace("all_", "") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text def get_bert_inf(phones, word2ph, norm_text, language): - language=language.replace("all_","") + language = language.replace("all_", "") if language == "zh": - bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype) + bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype) else: bert = torch.zeros( (1024, len(phones)), @@ -410,24 +437,27 @@ def get_bert_inf(phones, word2ph, norm_text, language): return bert + from text import chinese -def get_phones_and_bert(text,language,version,final=False): + + +def get_phones_and_bert(text, language, version, final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: formattext = text while " " in formattext: formattext = formattext.replace(" ", " ") if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"zh",version) + return get_phones_and_bert(formattext, "zh", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = get_bert_feature(norm_text, word2ph).to(device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"yue",version) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = torch.zeros( @@ -435,8 +465,8 @@ def get_phones_and_bert(text,language,version,final=False): dtype=torch.float16 if is_half == True else torch.float32, ).to(device) elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] + textlist = [] + langlist = [] if language == 
"auto": for tmp in LangSegmenter.getTexts(text): langlist.append(tmp["lang"]) @@ -467,12 +497,12 @@ def get_phones_and_bert(text,language,version,final=False): bert_list.append(bert) bert = torch.cat(bert_list, dim=1) phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + norm_text = "".join(norm_text_list) if not final and len(phones) < 6: - return get_phones_and_bert("." + text,language,version,final=True) + return get_phones_and_bert("." + text, language, version, final=True) - return phones,bert.to(torch.float16 if is_half == True else torch.float32),norm_text + return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text class DictToAttrRecursive(dict): @@ -504,15 +534,21 @@ class DictToAttrRecursive(dict): def get_spepc(hps, filename): - audio,_ = librosa.load(filename, int(hps.data.sampling_rate)) + audio, _ = librosa.load(filename, sr=int(hps.data.sampling_rate)) audio = torch.FloatTensor(audio) - maxx=audio.abs().max() - if(maxx>1): - audio/=min(2,maxx) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length, - hps.data.win_length, center=False) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) return spec @@ -546,10 +582,11 @@ def pack_ogg(audio_bytes, data, rate): # Or split the whole audio data into smaller audio segment to avoid stack overflow? def handle_pack_ogg(): - with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file: + with sf.SoundFile(audio_bytes, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: audio_file.write(data) import threading + # See: https://docs.python.org/3/library/threading.html # The stack size of this thread is at least 32768 # If stack overflow error still occurs, just modify the `stack_size`. 
@@ -581,35 +618,47 @@ def pack_raw(audio_bytes, data, rate): def pack_wav(audio_bytes, rate): if is_int32: - data = np.frombuffer(audio_bytes.getvalue(),dtype=np.int32) + data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int32) wav_bytes = BytesIO() - sf.write(wav_bytes, data, rate, format='WAV', subtype='PCM_32') + sf.write(wav_bytes, data, rate, format="WAV", subtype="PCM_32") else: - data = np.frombuffer(audio_bytes.getvalue(),dtype=np.int16) + data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int16) wav_bytes = BytesIO() - sf.write(wav_bytes, data, rate, format='WAV') + sf.write(wav_bytes, data, rate, format="WAV") return wav_bytes def pack_aac(audio_bytes, data, rate): if is_int32: - pcm = 's32le' - bit_rate = '256k' + pcm = "s32le" + bit_rate = "256k" else: - pcm = 's16le' - bit_rate = '128k' - process = subprocess.Popen([ - 'ffmpeg', - '-f', pcm, # 输入16位有符号小端整数PCM - '-ar', str(rate), # 设置采样率 - '-ac', '1', # 单声道 - '-i', 'pipe:0', # 从管道读取输入 - '-c:a', 'aac', # 音频编码器为AAC - '-b:a', bit_rate, # 比特率 - '-vn', # 不包含视频 - '-f', 'adts', # 输出AAC数据流格式 - 'pipe:1' # 将输出写入管道 - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + pcm = "s16le" + bit_rate = "128k" + process = subprocess.Popen( + [ + "ffmpeg", + "-f", + pcm, # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + bit_rate, # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) out, _ = process.communicate(input=data.tobytes()) audio_bytes.write(out) @@ -632,7 +681,7 @@ def cut_text(text, punc): items = re.split(f"({punds})", text) mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])] # 在句子不存在符号或句尾无符号的时候保证文本完整 - if len(items)%2 == 1: + if len(items) % 2 == 1: mergeitems.append(items[-1]) text = "\n".join(mergeitems) @@ -646,8 +695,38 @@ def only_punc(text): return not any(t.isalnum() or t.isalpha() for t in text) -splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, top_k= 15, top_p = 0.6, temperature = 0.6, speed = 1, inp_refs = None, sample_steps = 32, if_sr = False, spk = "default"): +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} + + +def get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + top_k=15, + top_p=0.6, + temperature=0.6, + speed=1, + inp_refs=None, + sample_steps=32, + if_sr=False, + spk="default", +): infer_sovits = speaker_list[spk].sovits vq_model = infer_sovits.vq_model hps = infer_sovits.hps @@ -659,7 +738,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, t0 = ttime() prompt_text = prompt_text.strip("\n") - if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_language != "en" else "." 
prompt_language, text = prompt_language, text.strip("\n") dtype = torch.float16 if is_half == True else torch.float32 zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32) @@ -667,7 +747,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, wav16k, sr = librosa.load(ref_wav_path, sr=16000) wav16k = torch.from_numpy(wav16k) zero_wav_torch = torch.from_numpy(zero_wav) - if (is_half == True): + if is_half == True: wav16k = wav16k.half().to(device) zero_wav_torch = zero_wav_torch.half().to(device) else: @@ -680,15 +760,15 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, prompt = prompt_semantic.unsqueeze(0).to(device) if version != "v3": - refers=[] - if(inp_refs): + refers = [] + if inp_refs: for path in inp_refs: try: refer = get_spepc(hps, path).to(dtype).to(device) refers.append(refer) except Exception as e: logger.error(e) - if(len(refers)==0): + if len(refers) == 0: refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] else: refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) @@ -707,7 +787,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, continue audio_opt = [] - if (text[-1] not in splits): text += "。" if text_language != "en" else "." + if text[-1] not in splits: + text += "。" if text_language != "en" else "." phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version) bert = torch.cat([bert1, bert2], 1) @@ -722,56 +803,62 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, prompt, bert, # prompt_phone_len=ph_offset, - top_k = top_k, - top_p = top_p, - temperature = temperature, - early_stop_num=hz * max_sec) + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) t3 = ttime() if version != "v3": - audio = \ - vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), - refers,speed=speed).detach().cpu().numpy()[ - 0, 0] ###试试重建不带上prompt部分 + audio = ( + vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 else: - phoneme_ids0=torch.LongTensor(phones1).to(device).unsqueeze(0) - phoneme_ids1=torch.LongTensor(phones2).to(device).unsqueeze(0) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) # print(11111111, phoneme_ids0, phoneme_ids1) - fea_ref,ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) + fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) ref_audio, sr = torchaudio.load(ref_wav_path) - ref_audio=ref_audio.to(device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if sr!=24000: - ref_audio=resample(ref_audio,sr) + if sr != 24000: + ref_audio = resample(ref_audio, sr) # print("ref_audio",ref_audio.abs().mean()) mel2 = mel_fn(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): + if T_min > 468: mel2 = mel2[:, :, -468:] fea_ref = fea_ref[:, :, -468:] T_min = 468 chunk_len = 934 - T_min # print("fea_ref",fea_ref,fea_ref.shape) # print("mel2",mel2) - mel2=mel2.to(dtype) - fea_todo, ge = 
vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge,speed) + mel2 = mel2.to(dtype) + fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) # print("fea_todo",fea_todo) # print("ge",ge.abs().mean()) cfm_resss = [] idx = 0 - while (1): - fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] - if (fea_todo_chunk.shape[-1] == 0): break + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) # set_seed(123) - cfm_res = vq_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) - cfm_res = cfm_res[:, :, mel2.shape[2]:] + cfm_res = vq_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] # print("fea", fea) # print("mel2in", mel2) @@ -779,14 +866,15 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, cfm_resss.append(cfm_res) cmf_res = torch.cat(cfm_resss, 2) cmf_res = denorm_spec(cmf_res) - if bigvgan_model==None:init_bigvgan() + if bigvgan_model == None: + init_bigvgan() with torch.inference_mode(): wav_gen = bigvgan_model(cmf_res) - audio=wav_gen[0][0].cpu().detach().numpy() + audio = wav_gen[0][0].cpu().detach().numpy() - max_audio=np.abs(audio).max() - if max_audio>1: - audio/=max_audio + max_audio = np.abs(audio).max() + if max_audio > 1: + audio /= max_audio audio_opt.append(audio) audio_opt.append(zero_wav) audio_opt = np.concatenate(audio_opt, 0) @@ -795,29 +883,29 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, sr = hps.data.sampling_rate if version != "v3" else 24000 if if_sr and sr == 24000: audio_opt = torch.from_numpy(audio_opt).float().to(device) - audio_opt,sr=audio_sr(audio_opt.unsqueeze(0),sr) - max_audio=np.abs(audio_opt).max() - if max_audio > 1: audio_opt /= max_audio + audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr) + max_audio = np.abs(audio_opt).max() + if max_audio > 1: + audio_opt /= max_audio sr = 48000 if is_int32: - audio_bytes = pack_audio(audio_bytes,(audio_opt * 2147483647).astype(np.int32),sr) + audio_bytes = pack_audio(audio_bytes, (audio_opt * 2147483647).astype(np.int32), sr) else: - audio_bytes = pack_audio(audio_bytes,(audio_opt * 32768).astype(np.int16),sr) - # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + audio_bytes = pack_audio(audio_bytes, (audio_opt * 32768).astype(np.int16), sr) + # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) if stream_mode == "normal": audio_bytes, audio_chunk = read_clean_buffer(audio_bytes) yield audio_chunk - - if not stream_mode == "normal": + + if not stream_mode == "normal": if media_type == "wav": sr = 48000 if if_sr else 24000 sr = hps.data.sampling_rate if version != "v3" else sr - audio_bytes = pack_wav(audio_bytes,sr) + audio_bytes = pack_wav(audio_bytes, sr) yield audio_bytes.getvalue() - def handle_control(command): if command == "restart": os.execl(g_config.python_exec, g_config.python_exec, *sys.argv) @@ -828,7 +916,9 @@ def handle_control(command): def handle_change(path, text, language): if is_empty(path, text, language): - return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400) + return JSONResponse( + {"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, 
status_code=400 + ) if path != "" or path is not None: default_refer.path = path @@ -842,15 +932,31 @@ def handle_change(path, text, language): logger.info(f"当前默认参考音频语种: {default_refer.language}") logger.info(f"is_ready: {default_refer.is_ready()}") - return JSONResponse({"code": 0, "message": "Success"}, status_code=200) -def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr): +def handle( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + cut_punc, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, +): if ( - refer_wav_path == "" or refer_wav_path is None - or prompt_text == "" or prompt_text is None - or prompt_language == "" or prompt_language is None + refer_wav_path == "" + or refer_wav_path is None + or prompt_text == "" + or prompt_text is None + or prompt_language == "" + or prompt_language is None ): refer_wav_path, prompt_text, prompt_language = ( default_refer.path, @@ -860,17 +966,31 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cu if not default_refer.is_ready(): return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400) - if not sample_steps in [4,8,16,32]: + if sample_steps not in [4, 8, 16, 32]: sample_steps = 32 if cut_punc == None: - text = cut_text(text,default_cut_punc) + text = cut_text(text, default_cut_punc) else: - text = cut_text(text,cut_punc) - - return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr), media_type="audio/"+media_type) - + text = cut_text(text, cut_punc) + return StreamingResponse( + get_tts_wav( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, + ), + media_type="audio/" + media_type, + ) # -------------------------------- @@ -886,7 +1006,7 @@ dict_language = { "粤英混合": "yue", "日英混合": "ja", "韩英混合": "ko", - "多语种混合": "auto", #多语种启动切分识别语种 + "多语种混合": "auto", # 多语种启动切分识别语种 "多语种混合(粤语)": "auto_yue", "all_zh": "all_zh", "all_yue": "all_yue", @@ -903,7 +1023,7 @@ dict_language = { # logger logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG) -logger = logging.getLogger('uvicorn') +logger = logging.getLogger("uvicorn") # 获取配置 g_config = global_config.Config() @@ -919,8 +1039,12 @@ parser.add_argument("-dl", "--default_refer_language", type=str, default="", hel parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu") parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0") parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") -parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度") -parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度") +parser.add_argument( + "-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度" +) +parser.add_argument( + "-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度" +) # bool值的用法为 `python ./api.py -fp ...` # 此时 full_precision==True, half_precision==False parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="流式返回模式, close / normal / keepalive") @@ 
-972,14 +1096,14 @@ if args.full_precision and args.half_precision: logger.info(f"半精: {is_half}") # 流式返回模式 -if args.stream_mode.lower() in ["normal","n"]: +if args.stream_mode.lower() in ["normal", "n"]: stream_mode = "normal" logger.info("流式返回已开启") else: stream_mode = "close" # 音频编码格式 -if args.media_type.lower() in ["aac","ogg"]: +if args.media_type.lower() in ["aac", "ogg"]: media_type = args.media_type.lower() elif stream_mode == "close": media_type = "wav" @@ -988,12 +1112,12 @@ else: logger.info(f"编码格式: {media_type}") # 音频数据类型 -if args.sub_type.lower() == 'int32': +if args.sub_type.lower() == "int32": is_int32 = True - logger.info(f"数据类型: int32") + logger.info("数据类型: int32") else: is_int32 = False - logger.info(f"数据类型: int16") + logger.info("数据类型: int16") # 初始化模型 cnhubert.cnhubert_base_path = cnhubert_base_path @@ -1006,8 +1130,7 @@ if is_half: else: bert_model = bert_model.to(device) ssl_model = ssl_model.to(device) -change_gpt_sovits_weights(gpt_path = gpt_path, sovits_path = sovits_path) - +change_gpt_sovits_weights(gpt_path=gpt_path, sovits_path=sovits_path) # -------------------------------- @@ -1015,21 +1138,21 @@ change_gpt_sovits_weights(gpt_path = gpt_path, sovits_path = sovits_path) # -------------------------------- app = FastAPI() + @app.post("/set_model") async def set_model(request: Request): json_post_raw = await request.json() return change_gpt_sovits_weights( - gpt_path = json_post_raw.get("gpt_model_path"), - sovits_path = json_post_raw.get("sovits_model_path") + gpt_path=json_post_raw.get("gpt_model_path"), sovits_path=json_post_raw.get("sovits_model_path") ) @app.get("/set_model") async def set_model( - gpt_model_path: str = None, - sovits_model_path: str = None, + gpt_model_path: str = None, + sovits_model_path: str = None, ): - return change_gpt_sovits_weights(gpt_path = gpt_model_path, sovits_path = sovits_model_path) + return change_gpt_sovits_weights(gpt_path=gpt_model_path, sovits_path=sovits_model_path) @app.post("/control") @@ -1047,18 +1170,12 @@ async def control(command: str = None): async def change_refer(request: Request): json_post_raw = await request.json() return handle_change( - json_post_raw.get("refer_wav_path"), - json_post_raw.get("prompt_text"), - json_post_raw.get("prompt_language") + json_post_raw.get("refer_wav_path"), json_post_raw.get("prompt_text"), json_post_raw.get("prompt_language") ) @app.get("/change_refer") -async def change_refer( - refer_wav_path: str = None, - prompt_text: str = None, - prompt_language: str = None -): +async def change_refer(refer_wav_path: str = None, prompt_text: str = None, prompt_language: str = None): return handle_change(refer_wav_path, prompt_text, prompt_language) @@ -1078,27 +1195,41 @@ async def tts_endpoint(request: Request): json_post_raw.get("speed", 1.0), json_post_raw.get("inp_refs", []), json_post_raw.get("sample_steps", 32), - json_post_raw.get("if_sr", False) + json_post_raw.get("if_sr", False), ) @app.get("/") async def tts_endpoint( - refer_wav_path: str = None, - prompt_text: str = None, - prompt_language: str = None, - text: str = None, - text_language: str = None, - cut_punc: str = None, - top_k: int = 15, - top_p: float = 1.0, - temperature: float = 1.0, - speed: float = 1.0, - inp_refs: list = Query(default=[]), - sample_steps: int = 32, - if_sr: bool = False + refer_wav_path: str = None, + prompt_text: str = None, + prompt_language: str = None, + text: str = None, + text_language: str = None, + cut_punc: str = None, + top_k: int = 15, + top_p: float = 1.0, + temperature: float = 1.0, + 
speed: float = 1.0, + inp_refs: list = Query(default=[]), + sample_steps: int = 32, + if_sr: bool = False, ): - return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr) + return handle( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + cut_punc, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, + ) if __name__ == "__main__": diff --git a/api_v2.py b/api_v2.py index 3a8566ac..87082074 100644 --- a/api_v2.py +++ b/api_v2.py @@ -78,7 +78,7 @@ GET: ``` http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt ``` -RESP: +RESP: 成功: 返回"success", http code 200 失败: 返回包含错误信息的 json, http code 400 @@ -92,11 +92,12 @@ GET: http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth ``` -RESP: +RESP: 成功: 返回"success", http code 200 失败: 返回包含错误信息的 json, http code 400 - + """ + import os import sys import traceback @@ -112,16 +113,15 @@ import wave import signal import numpy as np import soundfile as sf -from fastapi import FastAPI, Request, HTTPException, Response +from fastapi import FastAPI, Response from fastapi.responses import StreamingResponse, JSONResponse -from fastapi import FastAPI, UploadFile, File import uvicorn from io import BytesIO from tools.i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names -from fastapi.responses import StreamingResponse from pydantic import BaseModel + # print(sys.path) i18n = I18nAuto() cut_method_names = get_cut_method_names() @@ -145,6 +145,8 @@ print(tts_config) tts_pipeline = TTS(tts_config) APP = FastAPI() + + class TTS_Request(BaseModel): text: str = None text_lang: str = None @@ -152,58 +154,73 @@ class TTS_Request(BaseModel): aux_ref_audio_paths: list = None prompt_lang: str = None prompt_text: str = "" - top_k:int = 5 - top_p:float = 1 - temperature:float = 1 - text_split_method:str = "cut5" - batch_size:int = 1 - batch_threshold:float = 0.75 - split_bucket:bool = True - speed_factor:float = 1.0 - fragment_interval:float = 0.3 - seed:int = -1 - media_type:str = "wav" - streaming_mode:bool = False - parallel_infer:bool = True - repetition_penalty:float = 1.35 - sample_steps:int = 32 - super_sampling:bool = False + top_k: int = 5 + top_p: float = 1 + temperature: float = 1 + text_split_method: str = "cut5" + batch_size: int = 1 + batch_threshold: float = 0.75 + split_bucket: bool = True + speed_factor: float = 1.0 + fragment_interval: float = 0.3 + seed: int = -1 + media_type: str = "wav" + streaming_mode: bool = False + parallel_infer: bool = True + repetition_penalty: float = 1.35 + sample_steps: int = 32 + super_sampling: bool = False + ### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files -def pack_ogg(io_buffer:BytesIO, data:np.ndarray, rate:int): - with sf.SoundFile(io_buffer, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file: +def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int): + with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: audio_file.write(data) return io_buffer -def pack_raw(io_buffer:BytesIO, data:np.ndarray, rate:int): +def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int): io_buffer.write(data.tobytes()) return io_buffer -def pack_wav(io_buffer:BytesIO, 
data:np.ndarray, rate:int): +def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int): io_buffer = BytesIO() - sf.write(io_buffer, data, rate, format='wav') + sf.write(io_buffer, data, rate, format="wav") return io_buffer -def pack_aac(io_buffer:BytesIO, data:np.ndarray, rate:int): - process = subprocess.Popen([ - 'ffmpeg', - '-f', 's16le', # 输入16位有符号小端整数PCM - '-ar', str(rate), # 设置采样率 - '-ac', '1', # 单声道 - '-i', 'pipe:0', # 从管道读取输入 - '-c:a', 'aac', # 音频编码器为AAC - '-b:a', '192k', # 比特率 - '-vn', # 不包含视频 - '-f', 'adts', # 输出AAC数据流格式 - 'pipe:1' # 将输出写入管道 - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + +def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int): + process = subprocess.Popen( + [ + "ffmpeg", + "-f", + "s16le", # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + "192k", # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) out, _ = process.communicate(input=data.tobytes()) io_buffer.write(out) return io_buffer -def pack_audio(io_buffer:BytesIO, data:np.ndarray, rate:int, media_type:str): + +def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str): if media_type == "ogg": io_buffer = pack_ogg(io_buffer, data, rate) elif media_type == "aac": @@ -216,7 +233,6 @@ def pack_audio(io_buffer:BytesIO, data:np.ndarray, rate:int, media_type:str): return io_buffer - # from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000): # This will create a wave header then append the frame input @@ -233,7 +249,7 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=3 return wav_buf.read() -def handle_control(command:str): +def handle_control(command: str): if command == "restart": os.execl(sys.executable, sys.executable, *argv) elif command == "exit": @@ -241,43 +257,52 @@ def handle_control(command:str): exit(0) -def check_params(req:dict): - text:str = req.get("text", "") - text_lang:str = req.get("text_lang", "") - ref_audio_path:str = req.get("ref_audio_path", "") - streaming_mode:bool = req.get("streaming_mode", False) - media_type:str = req.get("media_type", "wav") - prompt_lang:str = req.get("prompt_lang", "") - text_split_method:str = req.get("text_split_method", "cut5") +def check_params(req: dict): + text: str = req.get("text", "") + text_lang: str = req.get("text_lang", "") + ref_audio_path: str = req.get("ref_audio_path", "") + streaming_mode: bool = req.get("streaming_mode", False) + media_type: str = req.get("media_type", "wav") + prompt_lang: str = req.get("prompt_lang", "") + text_split_method: str = req.get("text_split_method", "cut5") if ref_audio_path in [None, ""]: return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"}) if text in [None, ""]: return JSONResponse(status_code=400, content={"message": "text is required"}) - if (text_lang in [None, ""]) : + if text_lang in [None, ""]: return JSONResponse(status_code=400, content={"message": "text_lang is required"}) elif text_lang.lower() not in tts_config.languages: - return JSONResponse(status_code=400, content={"message": f"text_lang: {text_lang} is not supported in version {tts_config.version}"}) - if (prompt_lang in [None, ""]) : + return JSONResponse( + status_code=400, + content={"message": f"text_lang: 
{text_lang} is not supported in version {tts_config.version}"}, + ) + if prompt_lang in [None, ""]: return JSONResponse(status_code=400, content={"message": "prompt_lang is required"}) elif prompt_lang.lower() not in tts_config.languages: - return JSONResponse(status_code=400, content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"}) + return JSONResponse( + status_code=400, + content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"}, + ) if media_type not in ["wav", "raw", "ogg", "aac"]: return JSONResponse(status_code=400, content={"message": f"media_type: {media_type} is not supported"}) - elif media_type == "ogg" and not streaming_mode: + elif media_type == "ogg" and not streaming_mode: return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"}) - + if text_split_method not in cut_method_names: - return JSONResponse(status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"}) + return JSONResponse( + status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"} + ) return None -async def tts_handle(req:dict): + +async def tts_handle(req: dict): """ Text to speech handler. - + Args: - req (dict): + req (dict): { "text": "", # str.(required) text to be synthesized "text_lang: "", # str.(required) language of the text to be synthesized @@ -298,14 +323,14 @@ async def tts_handle(req:dict): "media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac". "streaming_mode": False, # bool. whether to return a streaming response. "parallel_infer": True, # bool.(optional) whether to use parallel inference. - "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. + "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. "sample_steps": 32, # int. number of sampling steps for VITS model V3. - "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. } returns: StreamingResponse: audio stream response. 
""" - + streaming_mode = req.get("streaming_mode", False) return_fragment = req.get("return_fragment", False) media_type = req.get("media_type", "wav") @@ -316,12 +341,13 @@ async def tts_handle(req:dict): if streaming_mode or return_fragment: req["return_fragment"] = True - + try: - tts_generator=tts_pipeline.run(req) - + tts_generator = tts_pipeline.run(req) + if streaming_mode: - def streaming_generator(tts_generator:Generator, media_type:str): + + def streaming_generator(tts_generator: Generator, media_type: str): if_frist_chunk = True for sr, chunk in tts_generator: if if_frist_chunk and media_type == "wav": @@ -329,19 +355,22 @@ async def tts_handle(req:dict): media_type = "raw" if_frist_chunk = False yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue() + # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}" - return StreamingResponse(streaming_generator(tts_generator, media_type, ), media_type=f"audio/{media_type}") - + return StreamingResponse( + streaming_generator( + tts_generator, + media_type, + ), + media_type=f"audio/{media_type}", + ) + else: sr, audio_data = next(tts_generator) audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue() return Response(audio_data, media_type=f"audio/{media_type}") except Exception as e: - return JSONResponse(status_code=400, content={"message": f"tts failed", "Exception": str(e)}) - - - - + return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)}) @APP.get("/control") @@ -351,32 +380,31 @@ async def control(command: str = None): handle_control(command) - @APP.get("/tts") async def tts_get_endpoint( - text: str = None, - text_lang: str = None, - ref_audio_path: str = None, - aux_ref_audio_paths:list = None, - prompt_lang: str = None, - prompt_text: str = "", - top_k:int = 5, - top_p:float = 1, - temperature:float = 1, - text_split_method:str = "cut0", - batch_size:int = 1, - batch_threshold:float = 0.75, - split_bucket:bool = True, - speed_factor:float = 1.0, - fragment_interval:float = 0.3, - seed:int = -1, - media_type:str = "wav", - streaming_mode:bool = False, - parallel_infer:bool = True, - repetition_penalty:float = 1.35, - sample_steps:int =32, - super_sampling:bool = False - ): + text: str = None, + text_lang: str = None, + ref_audio_path: str = None, + aux_ref_audio_paths: list = None, + prompt_lang: str = None, + prompt_text: str = "", + top_k: int = 5, + top_p: float = 1, + temperature: float = 1, + text_split_method: str = "cut0", + batch_size: int = 1, + batch_threshold: float = 0.75, + split_bucket: bool = True, + speed_factor: float = 1.0, + fragment_interval: float = 0.3, + seed: int = -1, + media_type: str = "wav", + streaming_mode: bool = False, + parallel_infer: bool = True, + repetition_penalty: float = 1.35, + sample_steps: int = 32, + super_sampling: bool = False, +): req = { "text": text, "text_lang": text_lang.lower(), @@ -388,21 +416,21 @@ async def tts_get_endpoint( "top_p": top_p, "temperature": temperature, "text_split_method": text_split_method, - "batch_size":int(batch_size), - "batch_threshold":float(batch_threshold), - "speed_factor":float(speed_factor), - "split_bucket":split_bucket, - "fragment_interval":fragment_interval, - "seed":seed, - "media_type":media_type, - "streaming_mode":streaming_mode, - "parallel_infer":parallel_infer, - "repetition_penalty":float(repetition_penalty), - "sample_steps":int(sample_steps), - "super_sampling":super_sampling + "batch_size": int(batch_size), 
+ "batch_threshold": float(batch_threshold), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "fragment_interval": fragment_interval, + "seed": seed, + "media_type": media_type, + "streaming_mode": streaming_mode, + "parallel_infer": parallel_infer, + "repetition_penalty": float(repetition_penalty), + "sample_steps": int(sample_steps), + "super_sampling": super_sampling, } return await tts_handle(req) - + @APP.post("/tts") async def tts_post_endpoint(request: TTS_Request): @@ -415,7 +443,7 @@ async def set_refer_aduio(refer_audio_path: str = None): try: tts_pipeline.set_ref_audio(refer_audio_path) except Exception as e: - return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)}) + return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)}) return JSONResponse(status_code=200, content={"message": "success"}) @@ -425,18 +453,19 @@ async def set_refer_aduio(refer_audio_path: str = None): # # 检查文件类型,确保是音频文件 # if not audio_file.content_type.startswith("audio/"): # return JSONResponse(status_code=400, content={"message": "file type is not supported"}) - + # os.makedirs("uploaded_audio", exist_ok=True) # save_path = os.path.join("uploaded_audio", audio_file.filename) # # 保存音频文件到服务器上的一个目录 # with open(save_path , "wb") as buffer: # buffer.write(await audio_file.read()) - + # tts_pipeline.set_ref_audio(save_path) # except Exception as e: # return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)}) # return JSONResponse(status_code=200, content={"message": "success"}) + @APP.get("/set_gpt_weights") async def set_gpt_weights(weights_path: str = None): try: @@ -444,7 +473,7 @@ async def set_gpt_weights(weights_path: str = None): return JSONResponse(status_code=400, content={"message": "gpt weight path is required"}) tts_pipeline.init_t2s_weights(weights_path) except Exception as e: - return JSONResponse(status_code=400, content={"message": f"change gpt weight failed", "Exception": str(e)}) + return JSONResponse(status_code=400, content={"message": "change gpt weight failed", "Exception": str(e)}) return JSONResponse(status_code=200, content={"message": "success"}) @@ -456,17 +485,16 @@ async def set_sovits_weights(weights_path: str = None): return JSONResponse(status_code=400, content={"message": "sovits weight path is required"}) tts_pipeline.init_vits_weights(weights_path) except Exception as e: - return JSONResponse(status_code=400, content={"message": f"change sovits weight failed", "Exception": str(e)}) + return JSONResponse(status_code=400, content={"message": "change sovits weight failed", "Exception": str(e)}) return JSONResponse(status_code=200, content={"message": "success"}) - if __name__ == "__main__": try: - if host == 'None': # 在调用时使用 -a None 参数,可以让api监听双栈 + if host == "None": # 在调用时使用 -a None 参数,可以让api监听双栈 host = None uvicorn.run(app=APP, host=host, port=port, workers=1) - except Exception as e: + except Exception: traceback.print_exc() os.kill(os.getpid(), signal.SIGTERM) exit(0) diff --git a/colab_webui.ipynb b/colab_webui.ipynb deleted file mode 100644 index 226cc214..00000000 --- a/colab_webui.ipynb +++ /dev/null @@ -1,98 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_o6a8GS2lWQM" - }, - "source": [ - "环境配置 environment" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": { - "id": "e9b7iFV3dm1f" - }, - "outputs": [], - "source": [ - "!pip install -q condacolab\n", - "# Setting up condacolab and installing packages\n", - "import condacolab\n", - "condacolab.install_from_url(\"https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh\")\n", - "%cd -q /content\n", - "!git clone https://github.com/RVC-Boss/GPT-SoVITS\n", - "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n", - "%cd -q /content/GPT-SoVITS\n", - "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n", - "!/usr/local/bin/pip install -r extra-req.txt --no-deps\n", - "!/usr/local/bin/pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0NgxXg5sjv7z" - }, - "outputs": [], - "source": [ - "# @title Download pretrained models 下载预训练模型\n", - "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n", - "!mkdir -p /content/GPT-SoVITS/tools/uvr5\n", - "%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", - "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", - "%cd /content/GPT-SoVITS/tools/damo_asr/models\n", - "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", - "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", - "# @title UVR5 pretrains 安装uvr5模型\n", - "%cd /content/GPT-SoVITS/tools/uvr5\n", - "%rm -r uvr5_weights\n", - "!git clone https://huggingface.co/Delik/uvr5_weights\n", - "!git config core.sparseCheckout true\n", - "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4oRGUzkrk8C7" - }, - "outputs": [], - "source": [ - "# @title launch WebUI 启动WebUI\n", - "!/usr/local/bin/pip install ipykernel\n", - "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", - "%cd /content/GPT-SoVITS/\n", - "!/usr/local/bin/python webui.py" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "include_colab_link": true, - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/config.py b/config.py index 1f741285..5f90c5cd 100644 --- a/config.py +++ b/config.py @@ -1,4 +1,5 @@ -import sys,os +import sys +import os import torch @@ -6,9 +7,9 @@ import torch sovits_path = "" gpt_path = "" is_half_str = os.environ.get("is_half", "True") -is_half = True if is_half_str.lower() == 'true' else False -is_share_str = os.environ.get("is_share","False") -is_share= True if is_share_str.lower() == 'true' else False +is_half = True if is_half_str.lower() == "true" else False +is_share_str = os.environ.get("is_share", "False") +is_share = True if is_share_str.lower() == "true" else False cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" @@ -32,16 +33,18 @@ api_port = 9880 if infer_device == "cuda": gpu_name = torch.cuda.get_device_name(0) if ( - ("16" in gpu_name and "V100" not in gpu_name.upper()) - or "P40" in gpu_name.upper() - or "P10" in gpu_name.upper() - or "1060" in gpu_name - or "1070" in gpu_name - or "1080" in gpu_name + ("16" in 
gpu_name and "V100" not in gpu_name.upper()) + or "P40" in gpu_name.upper() + or "P10" in gpu_name.upper() + or "1060" in gpu_name + or "1070" in gpu_name + or "1080" in gpu_name ): - is_half=False + is_half = False + +if infer_device == "cpu": + is_half = False -if(infer_device=="cpu"):is_half=False class Config: def __init__(self): diff --git a/docker-compose.yaml b/docker-compose.yaml index aca8ab9e..9703d0c7 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,32 +1,95 @@ -version: '3.8' +version: "3.8" services: - gpt-sovits: - image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. - container_name: gpt-sovits-container - environment: - - is_half=False - - is_share=False - volumes: - - ./output:/workspace/output - - ./logs:/workspace/logs - - ./SoVITS_weights:/workspace/SoVITS_weights - - ./reference:/workspace/reference - working_dir: /workspace + GPT-SoVITS-CU126: + image: xxxxrt666/gpt-sovits:latest-cu126 + container_name: GPT-SoVITS-CU126 ports: - - "9880:9880" - "9871:9871" - "9872:9872" - "9873:9873" - "9874:9874" - shm_size: 16G - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: "all" - capabilities: [gpu] - stdin_open: true + - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + environment: + - is_half=true tty: true + stdin_open: true + shm_size: "16g" restart: unless-stopped + runtime: nvidia + GPT-SoVITS-CU126-Lite: + image: xxxxrt666/gpt-sovits:latest-cu126-lite + container_name: GPT-SoVITS-CU126-Lite + ports: + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + - tools/asr/models:/workspace/models/asr_models + - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights + environment: + - is_half=true + tty: true + stdin_open: true + shm_size: "16g" + restart: unless-stopped + runtime: nvidia + GPT-SoVITS-CU128: + image: xxxxrt666/gpt-sovits:latest-cu128 + container_name: GPT-SoVITS-CU128 + ports: + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + environment: + - is_half=true + tty: true + stdin_open: true + shm_size: "16g" + restart: unless-stopped + runtime: nvidia + GPT-SoVITS-CU128-Lite: + image: xxxxrt666/gpt-sovits:latest-cu128-lite + container_name: GPT-SoVITS-CU128-Lite + ports: + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - 
/dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + - tools/asr/models:/workspace/models/asr_models + - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights + environment: + - is_half=true + tty: true + stdin_open: true + shm_size: "16g" + restart: unless-stopped + runtime: nvidia \ No newline at end of file diff --git a/docker_build.sh b/docker_build.sh new file mode 100644 index 00000000..354599d2 --- /dev/null +++ b/docker_build.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +set -e + +if ! command -v docker &>/dev/null; then + echo "Docker Not Found" + exit 1 +fi + +trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR + +LITE=false +CUDA_VERSION=12.6 + +print_help() { + echo "Usage: bash docker_build.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)" + echo " --lite Build a Lite Image" + echo " -h, --help Show this help message and exit" + echo "" + echo "Examples:" + echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper" +} + +# Show help if no arguments provided +if [[ $# -eq 0 ]]; then + print_help + exit 0 +fi + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --cuda) + case "$2" in + 12.6) + CUDA_VERSION=12.6 + ;; + 12.8) + CUDA_VERSION=12.8 + ;; + *) + echo "Error: Invalid CUDA_VERSION: $2" + echo "Choose From: [12.6, 12.8]" + exit 1 + ;; + esac + shift 2 + ;; + --lite) + LITE=true + shift + ;; + *) + echo "Unknown Argument: $1" + echo "Use -h or --help to see available options." + exit 1 + ;; + esac +done + +TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64") + +if [ $LITE = true ]; then + TORCH_BASE="lite" +else + TORCH_BASE="full" +fi + +docker build \ + --build-arg CUDA_VERSION=$CUDA_VERSION \ + --build-arg LITE=$LITE \ + --build-arg TARGETPLATFORM="$TARGETPLATFORM" \ + --build-arg TORCH_BASE=$TORCH_BASE \ + -t "${USER}/gpt-sovits:local" \ + . diff --git a/dockerbuild.sh b/dockerbuild.sh deleted file mode 100755 index 3a4a1e18..00000000 --- a/dockerbuild.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# 获取当前日期,格式为 YYYYMMDD -DATE=$(date +%Y%m%d) -# 获取最新的 Git commit 哈希值的前 7 位 -COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) - -# 构建 full 版本的镜像 -docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH - - -# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) -docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite . 
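The old dockerbuild.sh being deleted in this hunk is superseded by the new docker_build.sh and the rewritten docker-compose.yaml earlier in this patch. Under those files, a typical local build or prebuilt-image run might look like the sketch below; the CUDA version and service name are examples drawn from the options they define:

```bash
# Build a local image with the new script (tagged ${USER}/gpt-sovits:local);
# --cuda accepts 12.6 or 12.8, and --lite drops the bundled ASR/UVR5 weights.
bash docker_build.sh --cuda 12.6 --lite

# Or run one of the four prebuilt services from docker-compose.yaml,
# publishing its mapped ports (9871-9874 and 9880):
docker compose run --service-ports GPT-SoVITS-CU126-Lite
```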
-# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md index 4666f6e2..cd1d50d0 100644 --- a/docs/cn/Changelog_CN.md +++ b/docs/cn/Changelog_CN.md @@ -1,12 +1,12 @@ ### 20240121更新 -1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网 +1-config添加is_share, 诸如colab等场景可以将此改为True, 来使得webui映射到公网 2-WebUI添加英文系统英文翻译适配 -3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载 +3-cmd-asr自动判断是否已自带damo模型, 如不在默认目录上将从modelscope自带下载 -4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) +4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) 5-清理TEMP文件夹缓存音频等文件 @@ -14,11 +14,11 @@ ### 20240122更新 -1-修复过短输出文件返回重复参考音频的问题。 +1-修复过短输出文件返回重复参考音频的问题. -2-经测试,英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符)。 +2-经测试, 英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符). -3-音频路径检查。如果尝试读取输入错的路径报错路径不存在,而非ffmpeg错误。 +3-音频路径检查.如果尝试读取输入错的路径报错路径不存在, 而非ffmpeg错误. ### 20240123更新 @@ -44,7 +44,7 @@ 6-支持mac训练推理 -7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。 +7-自动识别不支持半精度的卡强制单精度.cpu推理下强制单精度. ### 20240128更新 @@ -85,7 +85,7 @@ 1-修复asr路径尾缀带/保存文件名报错 -2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题,例如:xx.xx%(带百分号类),元/吨 会读成 元吨 而不是元每吨,下划线不再会报错 +2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨,下划线不再会报错 ### 20240207更新 @@ -93,7 +93,7 @@ 2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403 -3-[修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理,16系显卡会inf)](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) +3-[修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理, 16系显卡会inf)](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) 4-优化英文文本前端 @@ -105,19 +105,19 @@ ### 20240208更新 -1-GPT训练卡死(win10 1909)和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体)GPT训练报错,[尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)。 +1-GPT训练卡死 (win10 1909) 和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体) GPT训练报错, [尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b). ### 20240212更新 -1-faster whisper和funasr逻辑优化。faster whisper转镜像站下载,规避huggingface连不上的问题。 +1-faster whisper和funasr逻辑优化.faster whisper转镜像站下载, 规避huggingface连不上的问题. -2-DPO Loss实验性训练选项开启,通过构造负样本训练缓解GPT重复漏字问题。推理界面公开几个推理参数。 https://github.com/RVC-Boss/GPT-SoVITS/pull/457 +2-DPO Loss实验性训练选项开启, 通过构造负样本训练缓解GPT重复漏字问题.推理界面公开几个推理参数. https://github.com/RVC-Boss/GPT-SoVITS/pull/457 ### 20240214更新 -1-训练支持中文实验名(原来会报错) +1-训练支持中文实验名 (原来会报错) -2-DPO训练改为可勾选选项而非必须。如勾选batch size自动减半。修复推理界面新参数不传参的问题。 +2-DPO训练改为可勾选选项而非必须.如勾选batch size自动减半.修复推理界面新参数不传参的问题. ### 20240216更新 @@ -127,7 +127,7 @@ ### 20240221更新 -1-数据处理添加语音降噪选项(降噪为只剩16k采样率,除非底噪很大先不急着用哦。) +1-数据处理添加语音降噪选项 (降噪为只剩16k采样率, 除非底噪很大先不急着用哦). 
2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509 @@ -137,7 +137,7 @@ ### 20240306更新 -1-推理加速50%(RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested)https://github.com/RVC-Boss/GPT-SoVITS/pull/672 +1-推理加速50% (RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested) https://github.com/RVC-Boss/GPT-SoVITS/pull/672 2-如果用faster whisper非中文ASR不再需要先下中文funasr模型 @@ -151,7 +151,7 @@ 2个重点 -1-修复sovits训练未冻结vq的问题(可能造成效果下降) +1-修复sovits训练未冻结vq的问题 (可能造成效果下降) 2-增加一个快速推理分支 @@ -171,21 +171,21 @@ ### 20240610 -小问题修复: +小问题修复: 1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169 -2-uvr5中的mdxnet去混响cmd格式修复,兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) +2-uvr5中的mdxnet去混响cmd格式修复, 兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) 3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159 -大问题修复: +大问题修复: -4-修复了webui的GPT中文微调没读到bert导致和推理不一致,训练太多可能效果还会变差的问题。如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) +4-修复了webui的GPT中文微调没读到bert导致和推理不一致, 训练太多可能效果还会变差的问题.如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) ### 20240706 -小问题修复: +小问题修复: 1-[修正CPU推理默认bs小数](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) @@ -197,9 +197,9 @@ 5-移除冗余my_utils https://github.com/RVC-Boss/GPT-SoVITS/pull/1251 -重点: +重点: -6-倍速推理代码经过验证后推理效果和base完全一致,合并进main。使用的代码:https://github.com/RVC-Boss/GPT-SoVITS/pull/672 。支持无参考文本模式也倍速。 +6-倍速推理代码经过验证后推理效果和base完全一致, 合并进main.使用的代码: https://github.com/RVC-Boss/GPT-SoVITS/pull/672 .支持无参考文本模式也倍速. 后面会逐渐验证快速推理分支的推理改动的一致性 @@ -211,20 +211,20 @@ 3-修复GPT训练的step计算逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/756 -重点: +重点: -4-[支持合成语速调节。支持冻结随机性只调节语速,](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2)并将其更新到api.py上https://github.com/RVC-Boss/GPT-SoVITS/pull/1340 +4-[支持合成语速调节.支持冻结随机性只调节语速, ](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2)并将其更新到api.py上https://github.com/RVC-Boss/GPT-SoVITS/pull/1340 ### 20240806 -1-增加bs-roformer人声伴奏分离模型支持。 https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理。](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) +1-增加bs-roformer人声伴奏分离模型支持. https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理.](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) -2-更好的中文文本前端。 https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑(v2版本特供)。 https://github.com/RVC-Boss/GPT-SoVITS/pull/488 +2-更好的中文文本前端. https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑 (v2版本特供). 
https://github.com/RVC-Boss/GPT-SoVITS/pull/488 3-自动填充下一步的文件路径 https://github.com/RVC-Boss/GPT-SoVITS/pull/1355 -4-增加喂饭逻辑,用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) +4-增加喂饭逻辑, 用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) 5-增加粤语ASR支持 [8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) @@ -234,11 +234,11 @@ ### 20240821 -1-fast_inference分支合并进main:https://github.com/RVC-Boss/GPT-SoVITS/pull/1490 +1-fast_inference分支合并进main: https://github.com/RVC-Boss/GPT-SoVITS/pull/1490 -2-支持通过ssml标签优化数字、电话、时间日期等:https://github.com/RVC-Boss/GPT-SoVITS/issues/1508 +2-支持通过ssml标签优化数字、电话、时间日期等: https://github.com/RVC-Boss/GPT-SoVITS/issues/1508 -3-api修复优化:https://github.com/RVC-Boss/GPT-SoVITS/pull/1503 +3-api修复优化: https://github.com/RVC-Boss/GPT-SoVITS/pull/1503 4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422 @@ -246,11 +246,11 @@ ### 20250211 -增加gpt-sovits-v3模型,需要14G显存可以微调 +增加gpt-sovits-v3模型, 需要14G显存可以微调 ### 20250212 -sovits-v3微调支持开启梯度检查点,需要12G显存可以微调https://github.com/RVC-Boss/GPT-SoVITS/pull/2040 +sovits-v3微调支持开启梯度检查点, 需要12G显存可以微调https://github.com/RVC-Boss/GPT-SoVITS/pull/2040 ### 20250214 @@ -266,7 +266,7 @@ sovits-v3微调支持开启梯度检查点,需要12G显存可以微调https:// ### 20250223 -1-sovits-v3微调支持lora训练,需要8G显存可以微调,效果比全参微调更好 +1-sovits-v3微调支持lora训练, 需要8G显存可以微调, 效果比全参微调更好 2-人声背景音分离增加mel band roformer模型支持https://github.com/RVC-Boss/GPT-SoVITS/pull/2078 @@ -274,11 +274,11 @@ sovits-v3微调支持开启梯度检查点,需要12G显存可以微调https:// https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT-SoVITS/pull/2114 -修复中文路径下mecab的报错(具体表现为日文韩文、文本混合语种切分可能会遇到的报错) +修复中文路径下mecab的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错) ### 20250227 -针对v3生成24k音频感觉闷的问题https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 https://github.com/RVC-Boss/GPT-SoVITS/issues/2117 ,支持使用24k to 48k的音频超分模型缓解。 +针对v3生成24k音频感觉闷的问题https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 https://github.com/RVC-Boss/GPT-SoVITS/issues/2117 ,支持使用24k to 48k的音频超分模型缓解. ### 20250228 @@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT 修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122 修复v3sovits未传参以支持调节语速 + +### 202503 + +修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8 + +修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa + +修复其他若干bug + +重点更新: + +1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa + +2-整合包修复onnxruntime GPU推理的支持, 影响: (1) g2pw有个onnx模型原先是CPU推理现在用GPU, 显著降低推理的CPU瓶颈 (2) foxjoy去混响模型现在可使用GPU推理 diff --git a/docs/cn/README.md b/docs/cn/README.md index c6738cdb..832e75f4 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -1,7 +1,7 @@

GPT-SoVITS-WebUI

-强大的少样本语音转换与语音合成Web用户界面。
+强大的少样本语音转换与语音合成Web用户界面.

[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) @@ -20,73 +20,84 @@ --- -## 功能: +## 功能 -1. **零样本文本到语音(TTS):** 输入 5 秒的声音样本,即刻体验文本到语音转换。 +1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换. -2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型,提升声音相似度和真实感。 +2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感. -3. **跨语言支持:** 支持与训练数据集不同语言的推理,目前支持英语、日语、韩语、粤语和中文。 +3. **跨语言支持:** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文. -4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注,协助初学者创建训练数据集和 GPT/SoVITS 模型。 +4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型. **查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)** -未见过的说话者 few-shot 微调演示: +未见过的说话者 few-shot 微调演示: -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + **用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** ## 安装 -中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验。 +中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验. ### 测试通过的环境 -- Python 3.9,PyTorch 2.0.1,CUDA 11 -- Python 3.10.13,PyTorch 2.1.2,CUDA 12.3 -- Python 3.9,Pytorch 2.2.2,macOS 14.4.1(Apple 芯片) -- Python 3.9,PyTorch 2.2.2,CPU 设备 - -_注: numba==0.56.4 需要 python<3.11_ +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows -如果你是 Windows 用户(已在 win>=10 上测试),可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。 +如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI. -**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO)。** +**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).** ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh +bash install.sh --device --source [--download-uvr5] ``` ### macOS -**注:在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型,所以我们暂时使用 CPU 进行训练。** +**注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.** -1. 运行 `xcode-select --install` 安装 Xcode command-line tools。 -2. 运行 `brew install ffmpeg` 安装 FFmpeg。 -3. 
完成上述步骤后,运行以下的命令来安装本项目: +运行以下的命令来安装本项目: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +bash install.sh --device --source [--download-uvr5] ``` ### 手动安装 +#### 安装依赖 + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### 安装 FFmpeg ##### Conda 用户 ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -95,14 +106,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows 用户 -下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。 +下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下 -安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS) +安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境 ##### MacOS 用户 @@ -110,67 +120,84 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### 安装依赖 +### 运行 GPT-SoVITS (使用 Docker) + +#### Docker 镜像选择 + +由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意: + +- 前往 [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 查看最新可用的镜像标签(tags) +- 根据你的运行环境选择合适的镜像标签 +- `Lite` Docker 镜像不包含 ASR 模型和 UVR5 模型. 你可以自行下载 UVR5 模型, ASR 模型则会在需要时由程序自动下载 +- 在使用 Docker Compose 时, 会自动拉取适配的架构镜像 (amd64 或 arm64) +- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像 + +#### 环境变量 + +- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用 + +#### 共享内存配置 + +在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`) + +#### 选择服务 + +`docker-compose.yaml` 文件定义了两个主要服务类型: + +- `GPT-SoVITS-CU126` 与 `GPT-SoVITS-CU128`:完整版, 包含所有功能 +- `GPT-SoVITS-CU126-Lite` 与 `GPT-SoVITS-CU128-Lite`:轻量版, 依赖更少, 功能略有删减 + +如需使用 Docker Compose 运行指定服务, 请执行: ```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +docker compose run --service-ports ``` -### 在 Docker 中使用 +#### 本地构建 Docker 镜像 -#### docker-compose.yaml 设置 +如果你希望自行构建镜像, 请使用以下命令: -0. image 的标签:由于代码库更新很快,镜像的打包和测试又很慢,所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) 查看当前打包好的最新的镜像并根据自己的情况选用,或者在本地根据您自己的需求通过 Dockerfile 进行构建。 -1. 环境变量: - -- is_half: 半精度/双精度控制。在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时,一般都是它引起的,可以根据实际情况来调整为 True 或者 False。 - -2. Volume 设置,容器内的应用根目录设置为 /workspace。 默认的 docker-compose.yaml 中列出了一些实际的例子,便于上传/下载内容。 -3. shm_size:Windows 下的 Docker Desktop 默认可用内存过小,会导致运行异常,根据自己情况酌情设置。 -4. 
deploy 小节下的 gpu 相关内容,请根据您的系统和实际情况酌情设置。 - -#### 通过 docker compose 运行 - -``` -docker compose -f "docker-compose.yaml" up -d +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -#### 通过 docker 命令运行 +#### 访问运行中的容器 (Bash Shell) -同上,根据您自己的实际情况修改对应的参数,然后运行如下命令: +当容器在后台运行时, 你可以通过以下命令进入容器: -``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +```bash +docker exec -it bash ``` ## 预训练模型 -**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX)。** +**若成功运行`install.sh`可跳过 No.1,2,3** -1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。 +**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).** -2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文 TTS) +1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中. -3. 对于 UVR5(人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。 +2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS) - - 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。 +3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中. - - 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。 + - 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型. -4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。 + - 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_mand_roformer`、`bs_roformer`.如果未指定, 将从配置文中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对. -5. 对于英语或日语 ASR(额外功能),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将其放置在 `tools/asr/models` 目录中。此外,[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。 +4. 
对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中. + +5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间. ## 数据集格式 -文本到语音(TTS)注释 .list 文件格式: +文本到语音 (TTS) 注释 .list 文件格式: ``` vocal_path|speaker_name|language|text ``` -语言字典: +语言字典: - 'zh': 中文 - 'ja': 日语 @@ -178,10 +205,10 @@ vocal_path|speaker_name|language|text - 'ko': 韩语 - 'yue': 粤语 -示例: +示例: ``` -D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。 +D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神. ``` ## 微调与推理 @@ -211,12 +238,12 @@ python webui.py v1 #### 现已支持自动填充路径 - 1. 填入训练音频路径 - 2. 切割音频 - 3. 进行降噪(可选) - 4. 进行ASR - 5. 校对标注 - 6. 前往下一个窗口,点击训练 +1. 填入训练音频路径 +2. 切割音频 +3. 进行降噪(可选) +4. 进行 ASR +5. 校对标注 +6. 前往下一个窗口,点击训练 ### 打开推理 WebUI @@ -248,7 +275,7 @@ python webui.py 3. 底模由 2k 小时扩展至 5k 小时 -4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好 +4. 对低音质参考音频 (尤其是来源于网络的高频严重缺失、听着很闷的音频) 合成出来音质更好 详见[wiki]() @@ -258,17 +285,17 @@ python webui.py 2. 需要克隆 github 上的最新代码 -3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下 +3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下 - 中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) + 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) ## V3 更新说明 新模型特点: -1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大) +1. 音色相似度更像, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大) -2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感 +2. GPT 合成更稳定, 重复漏字更少, 也更容易跑出丰富情感 详见[wiki]() @@ -278,46 +305,46 @@ python webui.py 2. 需要克隆 github 上的最新代码 -3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下 +3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下 - 如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt) + 如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt) ## 待办事项清单 -- [x] **高优先级:** +- [x] **高优先级:** - - [x] 日语和英语的本地化。 - - [x] 用户指南。 - - [x] 日语和英语数据集微调训练。 + - [x] 日语和英语的本地化. + - [x] 用户指南. + - [x] 日语和英语数据集微调训练. - [ ] **功能:** - - [x] 零样本声音转换(5 秒)/ 少样本声音转换(1 分钟)。 - - [x] TTS 语速控制。 - - [ ] ~~增强的 TTS 情感控制。~~ - - [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布。 - - [x] 改进英语和日语文本前端。 - - [ ] 开发体积小和更大的 TTS 模型。 - - [x] Colab 脚本。 - - [x] 扩展训练数据集(从 2k 小时到 10k 小时)。 - - [x] 更好的 sovits 基础模型(增强的音频质量)。 - - [ ] 模型混合。 + - [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟). + - [x] TTS 语速控制. 
+ - [ ] ~~增强的 TTS 情感控制.~~ + - [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布. + - [x] 改进英语和日语文本前端. + - [ ] 开发体积小和更大的 TTS 模型. + - [x] Colab 脚本. + - [x] 扩展训练数据集 (从 2k 小时到 10k 小时). + - [x] 更好的 sovits 基础模型 (增强的音频质量). + - [ ] 模型混合. -## (附加)命令行运行方式 +## (附加) 命令行运行方式 使用命令行打开 UVR5 的 WebUI -``` +```bash python tools/uvr5/webui.py "" ``` - 这是使用命令行完成数据集的音频切分的方式 -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -327,17 +354,17 @@ python audio_slicer.py \ --hop_size ``` -这是使用命令行完成数据集 ASR 处理的方式(仅限中文) +这是使用命令行完成数据集 ASR 处理的方式 (仅限中文) -``` +```bash python tools/asr/funasr_asr.py -i -o ``` -通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记) +通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记) -(没有进度条,GPU 性能可能会导致时间延迟) +(没有进度条, GPU 性能可能会导致时间延迟) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` @@ -345,7 +372,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ## 致谢 -特别感谢以下项目和贡献者: +特别感谢以下项目和贡献者: ### 理论研究 @@ -384,7 +411,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p - [FunASR](https://github.com/alibaba-damo-academy/FunASR) - [AP-BWE](https://github.com/yxlu-0102/AP-BWE) -感谢 @Naozumi520 提供粤语训练集,并在粤语相关知识方面给予指导。 +感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导. ## 感谢所有贡献者的努力 diff --git a/docs/ja/Changelog_JA.md b/docs/ja/Changelog_JA.md index 53c28ff1..1173c081 100644 --- a/docs/ja/Changelog_JA.md +++ b/docs/ja/Changelog_JA.md @@ -1,221 +1,221 @@ ### 20240121 更新 -1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます。 -2. WebUI に英語システムの英語翻訳を追加しました。 -3. `cmd-asr`は FunASR モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします。 -4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます(長さ 0 のサンプルをフィルタリングなど) -5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します。 -6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました。 +1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます. +2. WebUI に英語システムの英語翻訳を追加しました. +3. `cmd-asr`は FunASR モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします. +4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます (長さ 0 のサンプルをフィルタリングなど) +5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します. +6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました. ### 20240122 更新 -1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました。 -2. 英語-日本語学習がスムーズに進む QA を完了しました。(ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります) -3. オーディオパスをチェックします。間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます。これは ffmpeg モジュールのエラーではありません。 +1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました. +2. 英語-日本語学習がスムーズに進む QA を完了しました. (ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります) +3. オーディオパスをチェックします.間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます.これは ffmpeg モジュールのエラーではありません. ### 20240123 更新 -1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました。 -2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました。 -3. モデルファイルのソートロジックを最適化しました。 -4. 中国語の分析に `jieba_fast` を `jieba` に置き換えました。 +1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました. +2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました. +3. モデルファイルのソートロジックを最適化しました. +4. 中国語の分析に `jieba_fast` を `jieba` に置き換えました. ### 20240126 更新 -1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします。 -2. 出力で選択的な分割モードをサポートします。 -3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました。 -4. 複数の改行による推論エラーを修正しました。 -5. 推論インターフェースから不要なログを削除しました。 -6. MacOS での学習と推論をサポートします。 -7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します。 +1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします. +2. 出力で選択的な分割モードをサポートします. +3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました. +4. 複数の改行による推論エラーを修正しました. +5. 推論インターフェースから不要なログを削除しました. +6. MacOS での学習と推論をサポートします. +7. 
半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します. ### 20240128 更新 -1. 数字を漢字で読む問題を修正しました。 -2. 文章の先頭の一部の単語が欠落する問題を修正しました。 -3. 不適切な長さのリファレンスオーディオを制限しました。 -4. GPT 学習時の ckpt が保存されない問題を修正しました。 -5. Dockerfile のモデルダウンロードプロセスを改善しました。 +1. 数字を漢字で読む問題を修正しました. +2. 文章の先頭の一部の単語が欠落する問題を修正しました. +3. 不適切な長さのリファレンスオーディオを制限しました. +4. GPT 学習時の ckpt が保存されない問題を修正しました. +5. Dockerfile のモデルダウンロードプロセスを改善しました. ### 20240129 更新 -1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました。 -2. Colab でも使用可能なバージョンをテストして更新しました。 -3. ModelScope FunASR リポジトリの古いバージョンで git クローンを行う際のインターフェース不整合エラーの問題を修正しました。 +1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました. +2. Colab でも使用可能なバージョンをテストして更新しました. +3. ModelScope FunASR リポジトリの古いバージョンで git クローンを行う際のインターフェース不整合エラーの問題を修正しました. ### 20240130 更新 -1. パスと関連する文字列を解析して、二重引用符を自動的に削除します。また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません。 -2. 中国語と英語、日本語と英語の混合出力をサポートします。 -3. 出力で選択的な分割モードをサポートします。 +1. パスと関連する文字列を解析して、二重引用符を自動的に削除します.また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません. +2. 中国語と英語、日本語と英語の混合出力をサポートします. +3. 出力で選択的な分割モードをサポートします. ### 20240201 更新 -1. UVR5 形式の読み取りエラーによる分離失敗を修正しました。 -2. 中国語・日本語・英語の混合テキストに対する自動分割と言語認識をサポートしました。 +1. UVR5 形式の読み取りエラーによる分離失敗を修正しました. +2. 中国語・日本語・英語の混合テキストに対する自動分割と言語認識をサポートしました. ### 20240202 更新 -1. ASRパスが `/` で終わることによるファイル名保存エラーの問題を修正しました。 -2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) で PaddleSpeech の Normalizer を導入し、"xx.xx%"(パーセント記号)の読み取りや"元/吨"が"元吨"ではなく"元每吨"と読まれる問題、アンダースコアエラーを修正しました。 +1. ASRパスが `/` で終わることによるファイル名保存エラーの問題を修正しました. +2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) で PaddleSpeech の Normalizer を導入し、"xx.xx%" (パーセント記号) の読み取りや"元/吨"が"元吨"ではなく"元每吨"と読まれる問題、アンダースコアエラーを修正しました. ### 20240207 更新 -1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) で報告された中国語推論品質の低下を引き起こした言語パラメータの混乱を修正しました。 -2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) で UVR5 を librosa のより高いバージョンに適応させました。 -3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) で、`is_half` パラメータがブール値に変換されず、常に半精度推論が行われ、16 シリーズの GPU で `inf` が発生する UVR5 inf everywhereエラーを修正しました。 -4. 英語テキストフロントエンドを最適化しました。 -5. Gradio の依存関係を修正しました。 -6. データセット準備中にルートディレクトリが空白の場合、`.list` フルパスの自動読み取りをサポートしました。 -7. 日本語と英語のために Faster Whisper ASR を統合しました。 +1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) で報告された中国語推論品質の低下を引き起こした言語パラメータの混乱を修正しました. +2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) で UVR5 を librosa のより高いバージョンに適応させました. +3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) で、`is_half` パラメータがブール値に変換されず、常に半精度推論が行われ、16 シリーズの GPU で `inf` が発生する UVR5 inf everywhereエラーを修正しました. +4. 英語テキストフロントエンドを最適化しました. +5. Gradio の依存関係を修正しました. +6. データセット準備中にルートディレクトリが空白の場合、`.list` フルパスの自動読み取りをサポートしました. +7. 日本語と英語のために Faster Whisper ASR を統合しました. ### 20240208 更新 -1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) で、Windows 10 1909 および [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)(繁体字中国語システム言語)での GPT トレーニングのハングを修正する試みを行いました。 +1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) で、Windows 10 1909 および [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (繁体字中国語システム言語) での GPT トレーニングのハングを修正する試みを行いました. ### 20240212 更新 -1. Faster Whisper と FunASR のロジックを最適化し、Faster Whisper をミラーダウンロードに切り替えて Hugging Face の接続問題を回避しました。 -2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) で、GPT の繰り返しと文字欠落を軽減するために、トレーニング中に負のサンプルを構築する実験的なDPO Lossトレーニングオプションを有効にし、いくつかの推論パラメータを推論WebUIで利用可能にしました。 +1. 
Faster Whisper と FunASR のロジックを最適化し、Faster Whisper をミラーダウンロードに切り替えて Hugging Face の接続問題を回避しました. +2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) で、GPT の繰り返しと文字欠落を軽減するために、トレーニング中に負のサンプルを構築する実験的なDPO Lossトレーニングオプションを有効にし、いくつかの推論パラメータを推論WebUIで利用可能にしました. ### 20240214 更新 -1. トレーニングで中国語の実験名をサポート(以前はエラーが発生していました)。 -2. DPOトレーニングを必須ではなくオプション機能に変更。選択された場合、バッチサイズは自動的に半分になります。推論 WebUI で新しいパラメータが渡されない問題を修正しました。 +1. トレーニングで中国語の実験名をサポート (以前はエラーが発生していました). +2. DPOトレーニングを必須ではなくオプション機能に変更.選択された場合、バッチサイズは自動的に半分になります.推論 WebUI で新しいパラメータが渡されない問題を修正しました. ### 20240216 更新 -1. 参照テキストなしでの入力をサポート。 -2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) で報告された中国語フロントエンドのバグを修正しました。 +1. 参照テキストなしでの入力をサポート. +2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) で報告された中国語フロントエンドのバグを修正しました. ### 20240221 更新 -1. データ処理中のノイズ低減オプションを追加(ノイズ低減は16kHzサンプリングレートのみを残します;背景ノイズが大きい場合にのみ使用してください)。 -2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) で中国語と日本語のフロントエンド処理を最適化しました。 -3. Mac CPU 推論を MPS ではなく CPU を使用するように切り替え、パフォーマンスを向上させました。 -4. Colab のパブリック URL の問題を修正しました。 +1. データ処理中のノイズ低減オプションを追加 (ノイズ低減は16kHzサンプリングレートのみを残します;背景ノイズが大きい場合にのみ使用してください). +2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) で中国語と日本語のフロントエンド処理を最適化しました. +3. Mac CPU 推論を MPS ではなく CPU を使用するように切り替え、パフォーマンスを向上させました. +4. Colab のパブリック URL の問題を修正しました. ### 20240306 更新 -1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) で推論速度を50%向上させました(RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 でテスト)。 -2. Faster Whisper非中国語ASRを使用する際、最初に中国語FunASRモデルをダウンロードする必要がなくなりました。 -3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) で UVR5 残響除去モデルの設定が逆になっていた問題を修正しました。 -4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) で、CUDA が利用できない場合に Faster Whisper の自動 CPU 推論を有効にしました。 -5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) で、Mac での適切なCPU推論を確保するために `is_half` チェックを修正しました。 +1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) で推論速度を50%向上させました (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 でテスト). +2. Faster Whisper非中国語ASRを使用する際、最初に中国語FunASRモデルをダウンロードする必要がなくなりました. +3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) で UVR5 残響除去モデルの設定が逆になっていた問題を修正しました. +4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) で、CUDA が利用できない場合に Faster Whisper の自動 CPU 推論を有効にしました. +5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) で、Mac での適切なCPU推論を確保するために `is_half` チェックを修正しました. ### 202403/202404/202405 更新 #### マイナー修正: -1. 参照テキストなしモードの問題を修正しました。 -2. 中国語と英語のテキストフロントエンドを最適化しました。 -3. API フォーマットを改善しました。 -4. CMD フォーマットの問題を修正しました。 -5. トレーニングデータ処理中のサポートされていない言語に対するエラープロンプトを追加しました。 -6. Hubert 抽出のバグを修正しました。 +1. 参照テキストなしモードの問題を修正しました. +2. 中国語と英語のテキストフロントエンドを最適化しました. +3. API フォーマットを改善しました. +4. CMD フォーマットの問題を修正しました. +5. トレーニングデータ処理中のサポートされていない言語に対するエラープロンプトを追加しました. +6. Hubert 抽出のバグを修正しました. #### メジャー修正: -1. SoVITS トレーニングで VQ を凍結せずに品質低下を引き起こす問題を修正しました。 -2. クイック推論ブランチを追加しました。 +1. SoVITS トレーニングで VQ を凍結せずに品質低下を引き起こす問題を修正しました. +2. クイック推論ブランチを追加しました. ### 20240610 更新 #### マイナー修正: -1. 
[PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)で、純粋な句読点および複数の句読点を含むテキスト入力のロジックを改善しました。 -2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)で、UVR5 の MDXNet デリバブをサポートする CMD フォーマットを修正し、スペースを含むパスをサポートしました。 -3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)で、`s2_train.py` の SoVITS トレーニングのプログレスバーロジックを修正しました。 +1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)で、純粋な句読点および複数の句読点を含むテキスト入力のロジックを改善しました. +2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)で、UVR5 の MDXNet デリバブをサポートする CMD フォーマットを修正し、スペースを含むパスをサポートしました. +3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)で、`s2_train.py` の SoVITS トレーニングのプログレスバーロジックを修正しました. #### メジャー修正: -4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) で、WebUI の GPT ファインチューニングが中国語入力テキストの BERT 特徴を読み取らず、推論との不一致や品質低下の可能性を修正しました。 - **注意: 以前に大量のデータでファインチューニングを行った場合、品質向上のためにモデルを再調整することをお勧めします。** +4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) で、WebUI の GPT ファインチューニングが中国語入力テキストの BERT 特徴を読み取らず、推論との不一致や品質低下の可能性を修正しました. + **注意: 以前に大量のデータでファインチューニングを行った場合、品質向上のためにモデルを再調整することをお勧めします.** ### 20240706 更新 #### マイナー修正: -1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) で、CPU 推論のデフォルトバッチサイズの小数点問題を修正しました。 -2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) で、ノイズ除去またはASRが例外に遭遇した場合に、すべての保留中のオーディオファイルが終了する問題を修正しました。 -3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) で、句読点で分割する際の小数点分割の問題を修正しました。 -4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) で、マルチGPUトレーニングのマルチプロセス保存ロジックを修正しました。 -5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) で、不要な `my_utils` を削除しました。 +1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) で、CPU 推論のデフォルトバッチサイズの小数点問題を修正しました. +2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) で、ノイズ除去またはASRが例外に遭遇した場合に、すべての保留中のオーディオファイルが終了する問題を修正しました. +3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) で、句読点で分割する際の小数点分割の問題を修正しました. +4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) で、マルチGPUトレーニングのマルチプロセス保存ロジックを修正しました. +5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) で、不要な `my_utils` を削除しました. #### メジャー修正: -6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) の加速推論コードが検証され、メインブランチにマージされ、ベースとの推論効果の一貫性が確保されました。 - また、参照テキストなしモードでの加速推論もサポートしています。 +6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) の加速推論コードが検証され、メインブランチにマージされ、ベースとの推論効果の一貫性が確保されました. + また、参照テキストなしモードでの加速推論もサポートしています. -**今後の更新では、`fast_inference`ブランチの変更の一貫性を継続的に検証します**。 +**今後の更新では、`fast_inference`ブランチの変更の一貫性を継続的に検証します**. ### 20240727 更新 #### マイナー修正: -1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) で、不要な i18n コードをクリーンアップしました。 -2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) で、ユーザーファイルパスの末尾のスラッシュがコマンドラインエラーを引き起こす問題を修正しました。 -3. 
[PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) で、GPT トレーニングのステップ計算ロジックを修正しました。 +1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) で、不要な i18n コードをクリーンアップしました. +2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) で、ユーザーファイルパスの末尾のスラッシュがコマンドラインエラーを引き起こす問題を修正しました. +3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) で、GPT トレーニングのステップ計算ロジックを修正しました. #### メジャー修正: -4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) で、合成のスピーチレート調整をサポートしました。 - スピーチレートのみを調整しながらランダム性を固定できるようになりました。 +4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) で、合成のスピーチレート調整をサポートしました. + スピーチレートのみを調整しながらランダム性を固定できるようになりました. ### 20240806 更新 -1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306)、[PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer ボーカルアコムパニ分離モデルのサポートを追加しました。[Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 推論を有効にしました。 -2. 中国語テキストフロントエンドを改善しました。 - - [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 多音字のサポートを追加(v2 のみ); +1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306)、[PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer ボーカルアコムパニ分離モデルのサポートを追加しました.[Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 推論を有効にしました. +2. 中国語テキストフロントエンドを改善しました. + - [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 多音字のサポートを追加 (v2 のみ); - [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 量詞を追加; - [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 四則演算と基本数式のサポート; - - [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 混合テキストエラーを修正。 -3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUIでオーディオ処理時にパスを自動入力しました。 -4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 認識ロジックを最適化しました。 -5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました。 -6. GPT-SoVITS v2 のサポートを追加しました。 -7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました。 + - [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 混合テキストエラーを修正. +3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUIでオーディオ処理時にパスを自動入力しました. +4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 認識ロジックを最適化しました. +5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました. +6. GPT-SoVITS v2 のサポートを追加しました. +7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました. ### 20240821 更新 -1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました。 -2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました。 -3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました。 -4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました。 +1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました. 
+2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました. +3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました. +4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました. ### 20250211 更新 -1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました。SoVITS v3のファインチューニングには14GBのGPUメモリが必要です。 +1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました.SoVITS v3のファインチューニングには14GBのGPUメモリが必要です. ### 20250212 更新 -- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3のファインチューニングにグラデーションチェックポイントを追加、12GBのGPUメモリが必要です。 +- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3のファインチューニングにグラデーションチェックポイントを追加、12GBのGPUメモリが必要です. ### 20250214 更新 -- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 多言語混合テキスト分割戦略の最適化 **A**。 - - `split-lang`を言語分割ツールとして追加し、多言語混合テキストの分割能力を向上させました。 +- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 多言語混合テキスト分割戦略の最適化 **A**. + - `split-lang`を言語分割ツールとして追加し、多言語混合テキストの分割能力を向上させました. ### 20250217 更新 -- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) テキスト内の数字と英語の処理ロジックを最適化。 +- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) テキスト内の数字と英語の処理ロジックを最適化. ### 20250218 更新 -- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 多言語混合テキスト分割戦略の最適化 **B**。 +- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 多言語混合テキスト分割戦略の最適化 **B**. ### 20250223 更新 -1. LoRAトレーニングがSoVITS V3のファインチューニングに対応しました。8GBのGPUメモリが必要で、結果はフルパラメータファインチューニングより優れています。 -2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) ボーカルと楽器分離のためにMel Band RoFormerモデルを追加しました。 +1. LoRAトレーニングがSoVITS V3のファインチューニングに対応しました.8GBのGPUメモリが必要で、結果はフルパラメータファインチューニングより優れています. +2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) ボーカルと楽器分離のためにMel Band RoFormerモデルを追加しました. ### 20250226 更新 -1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windowsでの非英語ディレクトリによる問題を修正しました。 - - `langsegmenter`を使用して韓国語の問題を修正。 -2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windowsでの非英語ディレクトリによる問題を修正しました。 - - `langsegmenter`を使用して韓国語/日本語の問題を修正。 +1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windowsでの非英語ディレクトリによる問題を修正しました. + - `langsegmenter`を使用して韓国語の問題を修正. +2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windowsでの非英語ディレクトリによる問題を修正しました. + - `langsegmenter`を使用して韓国語/日本語の問題を修正. ### 20250227 更新 -- V3モデルで24Kオーディオを生成する際に発生するこもった音の問題を緩和するために、24Kから48Kのオーディオ超解像モデルを追加しました。[Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085)、[Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)で報告されています。 \ No newline at end of file +- V3モデルで24Kオーディオを生成する際に発生するこもった音の問題を緩和するために、24Kから48Kのオーディオ超解像モデルを追加しました.[Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085)、[Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)で報告されています. \ No newline at end of file diff --git a/docs/ja/README.md b/docs/ja/README.md index 7a1b0a1e..8cedb0d9 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -1,7 +1,7 @@

GPT-SoVITS-WebUI

-パワフルなFew-Shot音声変換・音声合成 WebUI。

+パワフルなFew-Shot音声変換・音声合成 WebUI.

[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) @@ -20,13 +20,13 @@ ## 機能: -1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。 +1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます. -2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。 +2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上. -3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。 +3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています. -4. **WebUI ツール:** 統合されたツールは、音声と伴奏(BGM 等)の分離、トレーニングセットの自動セグメンテーション、ASR(中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。 +4. **WebUI ツール:** 統合されたツールは、音声と伴奏 (BGM 等) の分離、トレーニングセットの自動セグメンテーション、ASR (中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます. **[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!** @@ -40,47 +40,58 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### テスト済みの環境 -- Python 3.9, PyTorch 2.0.1, CUDA 11 -- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3 -- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon) -- Python 3.9, PyTorch 2.2.2, CPU デバイス - -_注記: numba==0.56.4 は py<3.11 が必要です_ +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows -Windows ユーザー:(Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。 +Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します. ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh +bash install.sh --device --source [--download-uvr5] ``` ### macOS -**注:Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。** +**注: Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します.** -1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。 -2. `brew install ffmpeg` を実行して FFmpeg をインストールします。 -3. 
上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。 +以下のコマンドを実行してこのプロジェクトをインストールします: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +bash install.sh --device --source [--download-uvr5] ``` ### 手動インストール -#### FFmpeg をインストールします。 +#### 依存関係をインストールします + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + +#### FFmpeg をインストールします ##### Conda ユーザー ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -89,12 +100,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows ユーザー -[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。 +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます + +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください ##### MacOS ユーザー @@ -102,55 +114,72 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### 依存関係をインストールします +### GPT-SoVITS の実行 (Docker 使用) + +#### Docker イメージの選択 + +コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください: + +- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) で最新のイメージタグを確認してください +- 環境に合った適切なイメージタグを選択してください +- `Lite` とは、Docker イメージに ASR モデルおよび UVR5 モデルが含まれていないことを意味します. UVR5 モデルは手動でダウンロードし、ASR モデルは必要に応じてプログラムが自動的にダウンロードします +- Docker Compose 実行時に、対応するアーキテクチャ (amd64 または arm64) のイメージが自動的に取得されます +- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です + +#### 環境変数 + +- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます + +#### 共有メモリの設定 + +Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします + +#### サービスの選択 + +`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています: + +- `GPT-SoVITS-CU126` および `GPT-SoVITS-CU128`:すべての機能を含むフルバージョン +- `GPT-SoVITS-CU126-Lite` および `GPT-SoVITS-CU128-Lite`:依存関係を削減した軽量バージョン + +特定のサービスを Docker Compose で実行するには、以下のコマンドを使用します: ```bash -pip install -r extra-req.txt --no-deps -pip install -r requirementx.txt +docker compose run --service-ports ``` -### Docker の使用 +#### Docker イメージのローカルビルド -#### docker-compose.yaml の設定 +自分でイメージをビルドするには、以下のコマンドを使ってください: -0. イメージのタグについて:コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください。 -1. 環境変数: - - - `is_half`:半精度/倍精度の制御。"SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です。実際の状況に応じて True または False に調整してください。 - -2. ボリューム設定:コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます。デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています。 -3. `shm_size`:Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります。状況に応じて適宜設定してください。 -4. 
`deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください。 - -#### docker compose で実行する - -```markdown -docker compose -f "docker-compose.yaml" up -d +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -#### docker コマンドで実行する +#### 実行中のコンテナへアクセス (Bash Shell) -上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します: +コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます: -```markdown -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +```bash +docker exec -it bash ``` ## 事前訓練済みモデル -1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。 +**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.** -2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語 TTS のみ) +1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください. -3. UVR5(ボーカル/伴奏(BGM 等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。 +2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ) - - UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。 +3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください. - - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。 + - UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます. -4. 中国語 ASR(追加機能)の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。 + - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_mand_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです. -5. 
英語または日本語の ASR(追加機能)を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。 +4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください. + +5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります. ## データセット形式 @@ -178,8 +207,8 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. #### 統合パッケージ利用者 -`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。 -V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。 +`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します. +V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください. #### その他 @@ -193,24 +222,24 @@ V1 に切り替えたい場合は python webui.py v1 <言語(オプション)> ``` -または WebUI で手動でバージョンを切り替えてください。 +または WebUI で手動でバージョンを切り替えてください. ### 微調整 #### パス自動補完のサポート - 1. 音声パスを入力する - 2. 音声を小さなチャンクに分割する - 3. ノイズ除去(オプション) - 4. ASR - 5. ASR転写を校正する - 6. 次のタブに移動し、モデルを微調整する +1. 音声パスを入力する +2. 音声を小さなチャンクに分割する +3. ノイズ除去 (オプション) +4. ASR +5. ASR 転写を校正する +6. 次のタブに移動し、モデルを微調整する ### 推論 WebUI を開く #### 統合パッケージ利用者 -`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。 +`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます. #### その他 @@ -224,7 +253,7 @@ python GPT_SoVITS/inference_webui.py <言語(オプション)> python webui.py ``` -その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。 +その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます. ## V2 リリースノート @@ -246,66 +275,66 @@ V1 環境から V2 を使用するには: 2. 最新のコードを github からクローン -3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置 +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置 - 中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) + 中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) ## V3 リリースノート 新機能: -1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。 +1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます). -2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。 +2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました. [詳細情報はこちら]() v2 環境から v3 を使用する方法: -1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。 +1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します. -2. GitHub から最新のコードをクローンします。 +2. GitHub から最新のコードをクローンします. -3. 
v3 の事前学習済みモデル(s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。 +3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します. - 追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。 + 追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください. ## Todo リスト - [x] **優先度 高:** - - [x] 日本語と英語でのローカライズ。 - - [x] ユーザーガイド。 - - [x] 日本語データセットと英語データセットのファインチューニングトレーニング。 + - [x] 日本語と英語でのローカライズ. + - [x] ユーザーガイド. + - [x] 日本語データセットと英語データセットのファインチューニングトレーニング. - [ ] **機能:** - - [x] ゼロショット音声変換(5 秒)/数ショット音声変換(1 分)。 - - [x] TTS スピーキングスピードコントロール。 - - [ ] ~~TTS の感情コントロールの強化。~~ - - [ ] SoVITS トークン入力を語彙の確率分布に変更する実験。 - - [x] 英語と日本語のテキストフロントエンドを改善。 - - [ ] 小型と大型の TTS モデルを開発する。 - - [x] Colab のスクリプト。 - - [ ] トレーニングデータセットを拡張する(2k→10k)。 - - [x] より良い sovits ベースモデル(音質向上) + - [x] ゼロショット音声変換 (5 秒) /数ショット音声変換 (1 分). + - [x] TTS スピーキングスピードコントロール. + - [ ] ~~TTS の感情コントロールの強化.~~ + - [ ] SoVITS トークン入力を語彙の確率分布に変更する実験. + - [x] 英語と日本語のテキストフロントエンドを改善. + - [ ] 小型と大型の TTS モデルを開発する. + - [x] Colab のスクリプト. + - [ ] トレーニングデータセットを拡張する (2k→10k). + - [x] より良い sovits ベースモデル (音質向上) - [ ] モデルミックス ## (追加の) コマンドラインから実行する方法 コマンド ラインを使用して UVR5 の WebUI を開きます -``` +```bash python tools/uvr5/webui.py "" ``` - -コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。 +コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです. -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -317,15 +346,15 @@ python audio_slicer.py \ コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ) -``` +```bash python tools/asr/funasr_asr.py -i -o ``` ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング) -(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります) +(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` @@ -333,7 +362,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ## クレジット -特に以下のプロジェクトと貢献者に感謝します: +特に以下のプロジェクトと貢献者に感謝します: ### 理論研究 @@ -372,7 +401,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p - [FunASR](https://github.com/alibaba-damo-academy/FunASR) - [AP-BWE](https://github.com/yxlu-0102/AP-BWE) -@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます。 +@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます. 
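The command-line examples in the README sections above are rendered here with their angle-bracket placeholders stripped, so the ASR invocations read as bare flags. A minimal sketch of filled-in runs, assuming hypothetical input/output directories; the language code and precision values are illustrative only, not prescribed by the patch:

```bash
# Chinese ASR with FunASR: -i input directory, -o output directory (paths are hypothetical)
python tools/asr/funasr_asr.py -i output/slicer_opt -o output/asr_opt

# Non-Chinese ASR with Faster Whisper: -l language code, -p precision (values shown are examples)
python ./tools/asr/fasterwhisper_asr.py -i output/slicer_opt -o output/asr_opt -l ja -p float16
```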
## すべてのコントリビューターに感謝します diff --git a/docs/ko/README.md b/docs/ko/README.md index 4b2fb632..e6419a11 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -40,12 +40,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### 테스트 통과 환경 -- Python 3.9, PyTorch 2.0.1, CUDA 11 -- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3 -- Python 3.9, Pytorch 2.2.2, macOS 14.4.1 (Apple Slilicon) -- Python 3.9, PyTorch 2.2.2, CPU 장치 - -_참고: numba==0.56.4 는 python<3.11 을 필요로 합니다._ +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -54,33 +57,41 @@ Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다 ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh +bash install.sh --device --source [--download-uvr5] ``` ### macOS **주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.** -1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요. -2. `brew install ffmpeg` 명령어를 실행하여 FFmpeg를 설치합니다. -3. 위의 단계를 완료한 후, 다음 명령어를 실행하여 이 프로젝트를 설치하세요. +다음 명령어를 실행하여 이 프로젝트를 설치하세요 ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +bash install.sh --device --source [--download-uvr5] ``` ### 수동 설치 +#### 의존성 설치 + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### FFmpeg 설치 ##### Conda 사용자 ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -89,14 +100,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows 사용자 -[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다. +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다 -[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용) +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 ##### MacOS 사용자 @@ -104,52 +114,66 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### 의존성 설치 +### GPT-SoVITS 실행하기 (Docker 사용) + +#### Docker 이미지 선택 + +코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요: + +- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits)에서 최신 이미지 태그를 확인하세요 +- 환경에 맞는 적절한 이미지 태그를 선택하세요 +- `Lite` 는 Docker 이미지에 ASR 모델과 UVR5 모델이 포함되어 있지 않음을 의미합니다. UVR5 모델은 사용자가 직접 다운로드해야 하며, ASR 모델은 필요 시 프로그램이 자동으로 다운로드합니다 +- Docker Compose 실행 시, 해당 아키텍처에 맞는 이미지(amd64 또는 arm64)가 자동으로 다운로드됩니다 +- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다 + +#### 환경 변수 + +- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다 + +#### 공유 메모리 설정 + +Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 
시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다 + +#### 서비스 선택 + +`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다: + +- `GPT-SoVITS-CU126` 및 `GPT-SoVITS-CU128`: 전체 기능을 포함한 풀 버전 +- `GPT-SoVITS-CU126-Lite` 및 `GPT-SoVITS-CU128-Lite`: 의존성이 줄어든 경량 버전 + +특정 서비스를 Docker Compose로 실행하려면 다음 명령을 사용하세요: ```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +docker compose run --service-ports ``` -### Docker에서 사용 +#### Docker 이미지 직접 빌드하기 -#### docker-compose.yaml 설정 +직접 이미지를 빌드하려면 다음 명령어를 사용하세요: -0. 이미지 태그: 코드 저장소가 빠르게 업데이트되고 패키지가 느리게 빌드되고 테스트되므로, 현재 빌드된 최신 도커 이미지를 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)에서 확인하고 필요에 따라 Dockerfile을 사용하여 로컬에서 빌드할 수 있습니다. - -1. 환경 변수: - -- is_half: 반정밀/배정밀 제어. "SSL 추출" 단계에서 4-cnhubert/5-wav32k 디렉토리의 내용을 올바르게 생성할 수 없는 경우, 일반적으로 이것 때문입니다. 실제 상황에 따라 True 또는 False로 조정할 수 있습니다. - -2. 볼륨 설정, 컨테이너 내의 애플리케이션 루트 디렉토리를 /workspace로 설정합니다. 기본 docker-compose.yaml에는 실제 예제가 나열되어 있으므로 업로드/다운로드를 쉽게 할 수 있습니다. - -3. shm_size: Windows의 Docker Desktop의 기본 사용 가능한 메모리가 너무 작아 오류가 발생할 수 있으므로 실제 상황에 따라 조정합니다. - -4. deploy 섹션의 gpu 관련 내용은 시스템 및 실제 상황에 따라 조정합니다. - -#### docker compose로 실행 - -``` -docker compose -f "docker-compose.yaml" up -d +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -#### docker 명령으로 실행 +#### 실행 중인 컨테이너 접속하기 (Bash Shell) -위와 동일하게 실제 상황에 맞게 매개변수를 수정한 다음 다음 명령을 실행합니다: +컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다: -``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +```bash +docker exec -it bash ``` ## 사전 학습된 모델 +**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.** + 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요. -2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용) +2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용) 3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요. - - UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다. + - UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다. - 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다. 
@@ -204,12 +228,12 @@ python webui.py v1 <언어(옵션)> #### 경로 자동 채우기가 지원됩니다 - 1. 오디오 경로를 입력하십시오. - 2. 오디오를 작은 청크로 분할하십시오. - 3. 노이즈 제거(옵션) - 4. ASR 수행 - 5. ASR 전사를 교정하십시오. - 6. 다음 탭으로 이동하여 모델을 미세 조정하십시오. +1. 오디오 경로를 입력하십시오. +2. 오디오를 작은 청크로 분할하십시오. +3. 노이즈 제거(옵션) +4. ASR 수행 +5. ASR 전사를 교정하십시오. +6. 다음 탭으로 이동하여 모델을 미세 조정하십시오. ### 추론 WebUI 열기 @@ -251,9 +275,9 @@ V1 환경에서 V2를 사용하려면: 2. github에서 최신 코드를 클론하십시오. -3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오. +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오. - 중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) + 중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) ## V3 릴리스 노트 @@ -271,7 +295,7 @@ v2 환경에서 v3 사용하기: 2. 최신 코드를 github 에서 클론합니다. -3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다. +3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다. 추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요. @@ -300,7 +324,7 @@ v2 환경에서 v3 사용하기: 명령줄을 사용하여 UVR5용 WebUI 열기 -``` +```bash python tools/uvr5/webui.py "" ``` @@ -311,7 +335,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다. -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -323,7 +347,7 @@ python audio_slicer.py \ 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당). 
-``` +```bash python tools/asr/funasr_asr.py -i -o ``` @@ -331,7 +355,7 @@ ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행 (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` diff --git a/docs/tr/README.md b/docs/tr/README.md index 2e16e730..0a8ee4a8 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -42,12 +42,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### Test Edilmiş Ortamlar -- Python 3.9, PyTorch 2.0.1, CUDA 11 -- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3 -- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silikon) -- Python 3.9, PyTorch 2.2.2, CPU cihazları - -_Not: numba==0.56.4, py<3.11 gerektirir_ +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -56,33 +59,41 @@ Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh +bash install.sh --device --source [--download-uvr5] ``` ### macOS **Not: Mac'lerde GPU'larla eğitilen modeller, diğer cihazlarda eğitilenlere göre önemli ölçüde daha düşük kalitede sonuç verir, bu nedenle geçici olarak CPU'lar kullanıyoruz.** -1. `xcode-select --install` komutunu çalıştırarak Xcode komut satırı araçlarını yükleyin. -2. FFmpeg'i yüklemek için `brew install ffmpeg` komutunu çalıştırın. -3. Aşağıdaki komutları çalıştırarak programı yükleyin: +Aşağıdaki komutları çalıştırarak programı yükleyin: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +bash install.sh --device --source [--download-uvr5] ``` ### El ile Yükleme +#### Bağımlılıkları Yükleme + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### FFmpeg'i Yükleme ##### Conda Kullanıcıları ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -91,12 +102,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows Kullanıcıları -[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin. 
+[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin + +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin ##### MacOS Kullanıcıları @@ -104,47 +116,66 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### Bağımlılıkları Yükleme +### GPT-SoVITS Çalıştırma (Docker Kullanarak) + +#### Docker İmajı Seçimi + +Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin: + +- En güncel kullanılabilir imaj etiketlerini görmek için [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) adresini kontrol edin +- Ortamınıza uygun bir imaj etiketi seçin +- `Lite`, Docker imajında ASR modelleri ve UVR5 modellerinin bulunmadığı anlamına gelir. UVR5 modellerini manuel olarak indirebilirsiniz; ASR modelleri ise gerektiğinde program tarafından otomatik olarak indirilir +- Docker Compose sırasında, uygun mimariye (amd64 veya arm64) ait imaj otomatik olarak indirilir +- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz + +#### Ortam Değişkenleri + +- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU’nuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın. + +#### Paylaşılan Bellek Yapılandırması + +Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir. + +#### Servis Seçimi + +`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır: + +- `GPT-SoVITS-CU126` ve `GPT-SoVITS-CU128`: Tüm özellikleri içeren tam sürüm. +- `GPT-SoVITS-CU126-Lite` ve `GPT-SoVITS-CU128-Lite`: Daha az bağımlılığa ve sınırlı işlevselliğe sahip hafif sürüm. + +Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanın: ```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt +docker compose run --service-ports ``` -### Docker Kullanarak +#### Docker İmajını Yerel Olarak Oluşturma -#### docker-compose.yaml yapılandırması +Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın: -0. Görüntü etiketleri hakkında: Kod tabanındaki hızlı güncellemeler ve görüntüleri paketleme ve test etme işleminin yavaş olması nedeniyle, lütfen şu anda paketlenmiş en son görüntüleri kontrol etmek için [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) adresini kontrol edin ve durumunuza göre seçim yapın veya alternatif olarak, kendi ihtiyaçlarınıza göre bir Dockerfile kullanarak yerel olarak oluşturun. -1. Ortam Değişkenleri: - - is_half: Yarım hassasiyet/çift hassasiyeti kontrol eder. Bu genellikle "SSL çıkarma" adımı sırasında 4-cnhubert/5-wav32k dizinleri altındaki içeriğin doğru şekilde oluşturulmamasının nedenidir. Gerçek durumunuza göre True veya False olarak ayarlayın. -2. Birim Yapılandırması,Kapsayıcı içindeki uygulamanın kök dizini /workspace olarak ayarlanmıştır. Varsayılan docker-compose.yaml, içerik yükleme/indirme için bazı pratik örnekler listeler. -3. shm_size: Windows üzerinde Docker Desktop için varsayılan kullanılabilir bellek çok küçüktür, bu da anormal işlemlere neden olabilir. Kendi durumunuza göre ayarlayın. -4. Dağıtım bölümü altında, GPU ile ilgili ayarlar sisteminize ve gerçek koşullara göre dikkatlice ayarlanmalıdır. 
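The Docker commands in these README diffs likewise lose their placeholders (the Compose service to run and the name of the running container). A minimal sketch of the intended workflow, assuming the service names defined in `docker-compose.yaml` as listed above; the actual container name reported by `docker ps` may differ:

```bash
# Optionally build the image locally (CUDA 12.6, full variant)
bash docker_build.sh --cuda 12.6

# Run one of the services defined in docker-compose.yaml
docker compose run --service-ports GPT-SoVITS-CU126

# Attach a shell to an already-running container (container name is an assumption; check `docker ps`)
docker exec -it GPT-SoVITS-CU126 bash
```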
- -#### docker compose ile çalıştırma - -``` -docker compose -f "docker-compose.yaml" up -d +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -#### docker komutu ile çalıştırma +#### Çalışan Konteynere Erişim (Bash Shell) -Yukarıdaki gibi, ilgili parametreleri gerçek durumunuza göre değiştirin, ardından aşağıdaki komutu çalıştırın: +Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz: -``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +```bash +docker exec -it bash ``` ## Önceden Eğitilmiş Modeller +**Eğer `install.sh` başarıyla çalıştırılırsa, No.1,2,3 adımını atlayabilirsiniz.** + 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin. -2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için) +2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için) 3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin. - - UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır. + - UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır. - Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir. @@ -201,12 +232,12 @@ veya WebUI'de manuel olarak sürüm değiştirin. #### Yol Otomatik Doldurma artık destekleniyor - 1. Ses yolunu doldurun - 2. Sesi küçük parçalara ayırın - 3. Gürültü azaltma (isteğe bağlı) - 4. ASR - 5. 
ASR transkripsiyonlarını düzeltin - 6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın +1. Ses yolunu doldurun +2. Sesi küçük parçalara ayırın +3. Gürültü azaltma (isteğe bağlı) +4. ASR +5. ASR transkripsiyonlarını düzeltin +6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın ### Çıkarım WebUI'sini Açın @@ -248,9 +279,9 @@ V1 ortamından V2'yi kullanmak için: 2. github'dan en son kodları klonlayın. -3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin. +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin. - Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) + Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) ## V3 Sürüm Notları @@ -266,9 +297,9 @@ V1 ortamından V2'yi kullanmak için: 1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin. -2. GitHub’dan en son kodları klonlayın. +2. GitHub'dan en son kodları klonlayın. -3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin. +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin. ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz. @@ -296,7 +327,7 @@ V1 ortamından V2'yi kullanmak için: UVR5 için Web Arayüzünü açmak için komut satırını kullanın -``` +```bash python tools/uvr5/webui.py "" ``` @@ -307,7 +338,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -319,7 +350,7 @@ python audio_slicer.py \ Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince) -``` +```bash python tools/asr/funasr_asr.py -i -o <çıktı> ``` @@ -327,7 +358,7 @@ ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışınd (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o <çıktı> -l ``` @@ -374,7 +405,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o <çıktı> -l - [FunASR](https://github.com/alibaba-damo-academy/FunASR) - [AP-BWE](https://github.com/yxlu-0102/AP-BWE) -@Naozumi520’ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım. 
+@Naozumi520'ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım. ## Tüm katkıda bulunanlara çabaları için teşekkürler diff --git a/go-webui.bat b/go-webui.bat index 398f6d9b..c1c81089 100644 --- a/go-webui.bat +++ b/go-webui.bat @@ -1,2 +1,6 @@ -runtime\python.exe webui.py zh_CN +set "SCRIPT_DIR=%~dp0" +set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +cd /d "%SCRIPT_DIR%" +set "PATH=%SCRIPT_DIR%\runtime;%PATH%" +runtime\python.exe -I webui.py zh_CN pause diff --git a/go-webui.ps1 b/go-webui.ps1 index 6e8dce25..09103425 100644 --- a/go-webui.ps1 +++ b/go-webui.ps1 @@ -1,4 +1,7 @@ $ErrorActionPreference = "SilentlyContinue" chcp 65001 -& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py" zh_CN +Set-Location $PSScriptRoot +$runtimePath = Join-Path $PSScriptRoot "runtime" +$env:PATH = "$runtimePath;$env:PATH" +& "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN pause diff --git a/gpt-sovits_kaggle.ipynb b/gpt-sovits_kaggle.ipynb index 67ad473b..764c23c4 100644 --- a/gpt-sovits_kaggle.ipynb +++ b/gpt-sovits_kaggle.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "9fd922fb", + "metadata": {}, + "source": [ + "# Deprecated" + ] + }, { "cell_type": "code", "execution_count": null, @@ -101,21 +109,31 @@ "import time\n", "import socket\n", "import urllib.request\n", + "\n", + "\n", "def iframe_thread(port):\n", " while True:\n", " time.sleep(0.5)\n", - " sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", + " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + " result = sock.connect_ex((\"127.0.0.1\", port))\n", " if result == 0:\n", " break\n", " sock.close()\n", "\n", " from colorama import Fore, Style\n", - " print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. RESET_ALL)\n", + " print(\n", + " Fore.GREEN + \"\\nIP: \",\n", + " Fore.RED,\n", + " urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n", + " \"\\n\",\n", + " Style.RESET_ALL,\n", + " )\n", " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "threading.Thread (target=iframe_thread, daemon=True, args=(9874,)).start()\n", + " print(line.decode(), end=\"\")\n", + "\n", + "\n", + "threading.Thread(target=iframe_thread, daemon=True, args=(9874,)).start()\n", "\n", "!python webui.py" ] @@ -143,26 +161,32 @@ "# 开启推理页面\n", "%cd /kaggle/working/GPT-SoVITS/\n", "!npm install -g localtunnel\n", - "import subprocess\n", "import threading\n", - "import time\n", - "import socket\n", - "import urllib.request\n", + "\n", + "\n", "def iframe_thread(port):\n", " while True:\n", " time.sleep(0.5)\n", - " sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", + " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + " result = sock.connect_ex((\"127.0.0.1\", port))\n", " if result == 0:\n", " break\n", " sock.close()\n", "\n", " from colorama import Fore, Style\n", - " print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. 
RESET_ALL)\n", + " print(\n", + " Fore.GREEN + \"\\nIP: \",\n", + " Fore.RED,\n", + " urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n", + " \"\\n\",\n", + " Style.RESET_ALL,\n", + " )\n", " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "threading.Thread (target=iframe_thread, daemon=True, args=(9872,)).start()\n", + " print(line.decode(), end=\"\")\n", + "\n", + "\n", + "threading.Thread(target=iframe_thread, daemon=True, args=(9872,)).start()\n", "\n", "!python ./GPT_SoVITS/inference_webui.py" ] diff --git a/install.sh b/install.sh index 66a3a496..eba18683 100644 --- a/install.sh +++ b/install.sh @@ -1,37 +1,238 @@ #!/bin/bash +# cd into GPT-SoVITS Base Path +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + set -e -# 安装构建工具 -# Install build tools -echo "Installing GCC..." -conda install -c conda-forge gcc=14 -y - -echo "Installing G++..." -conda install -c conda-forge gxx -y - -echo "Installing ffmpeg and cmake..." -conda install ffmpeg cmake -y - -# 设置编译环境 -# Set up build environment -export CMAKE_MAKE_PROGRAM="$CONDA_PREFIX/bin/cmake" -export CC="$CONDA_PREFIX/bin/gcc" -export CXX="$CONDA_PREFIX/bin/g++" - -echo "Checking for CUDA installation..." -if command -v nvidia-smi &>/dev/null; then - USE_CUDA=true - echo "CUDA found." -else - echo "CUDA not found." - USE_CUDA=false +if ! command -v conda &>/dev/null; then + echo "Conda Not Found" + exit 1 fi -if [ "$USE_CUDA" = false ]; then +trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR + +USE_CUDA=false +USE_ROCM=false +USE_CPU=false +WORKFLOW=${WORKFLOW:-"false"} + +USE_HF=false +USE_HF_MIRROR=false +USE_MODELSCOPE=false +DOWNLOAD_UVR5=false + +print_help() { + echo "Usage: bash install.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --device CU126|CU128|ROCM|MPS|CPU Specify the Device (REQUIRED)" + echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)" + echo " --download-uvr5 Enable downloading the UVR5 model" + echo " -h, --help Show this help message and exit" + echo "" + echo "Examples:" + echo " bash install.sh --source HF --download-uvr5" + echo " bash install.sh --source ModelScope" +} + +# Show help if no arguments provided +if [[ $# -eq 0 ]]; then + print_help + exit 0 +fi + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --source) + case "$2" in + HF) + USE_HF=true + ;; + HF-Mirror) + USE_HF_MIRROR=true + ;; + ModelScope) + USE_MODELSCOPE=true + ;; + *) + echo "Error: Invalid Download Source: $2" + echo "Choose From: [HF, HF-Mirror, ModelScope]" + exit 1 + ;; + esac + shift 2 + ;; + --device) + case "$2" in + CU126) + CUDA=126 + USE_CUDA=true + ;; + CU128) + CUDA=128 + USE_CUDA=true + ;; + ROCM) + USE_ROCM=true + ;; + MPS) + USE_CPU=true + ;; + CPU) + USE_CPU=true + ;; + *) + echo "Error: Invalid Device: $2" + echo "Choose From: [CU126, CU128, ROCM, MPS, CPU]" + exit 1 + ;; + esac + shift 2 + ;; + --download-uvr5) + DOWNLOAD_UVR5=true + shift + ;; + -h | --help) + print_help + exit 0 + ;; + *) + echo "Unknown Argument: $1" + echo "Use -h or --help to see available options." + exit 1 + ;; + esac +done + +if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then + echo "Error: Device is REQUIRED" + echo "" + print_help + exit 1 +fi + +if ! $USE_HF && ! $USE_HF_MIRROR && ! 
$USE_MODELSCOPE; then + echo "Error: Download Source is REQUIRED" + echo "" + print_help + exit 1 +fi + +# 安装构建工具 +# Install build tools +if [ "$(uname)" != "Darwin" ]; then + gcc_major_version=$(command -v gcc >/dev/null 2>&1 && gcc -dumpversion | cut -d. -f1 || echo 0) + if [ "$gcc_major_version" -lt 11 ]; then + echo "Installing GCC & G++..." + conda install -c conda-forge gcc=11 gxx=11 -q -y + else + echo "GCC >=11" + fi +else + if ! xcode-select -p &>/dev/null; then + echo "Installing Xcode Command Line Tools..." + xcode-select --install + fi + echo "Waiting For Xcode Command Line Tools Installation Complete..." + while true; do + sleep 20 + + if xcode-select -p &>/dev/null; then + echo "Xcode Command Line Tools Installed" + break + else + echo "Installing,Please Wait..." + fi + done + conda install -c conda-forge -q -y +fi + +echo "Installing ffmpeg and cmake..." +conda install ffmpeg cmake make -q -y + +echo "Installing unzip..." +conda install unzip -y --quiet + +if [ "$USE_HF" = "true" ]; then + echo "Download Model From HuggingFace" + PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" + NLTK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + PYOPENJTALK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" +elif [ "$USE_HF_MIRROR" = "true" ]; then + echo "Download Model From HuggingFace-Mirror" + PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" + G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" + UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" + NLTK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + PYOPENJTALK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" +elif [ "$USE_MODELSCOPE" = "true" ]; then + echo "Download Model From ModelScope" + PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip" + G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip" + UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip" + NLTK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip" + PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz" +fi + +if [ "$WORKFLOW" = "true" ]; then + WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +else + WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +fi + +if find -L "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then + echo "Pretrained Model Exists" +else + echo "Download Pretrained Models" + "${WGET_CMD[@]}" "$PRETRINED_URL" + + unzip -q -o pretrained_models.zip -d GPT_SoVITS + rm -rf pretrained_models.zip +fi + +if [ ! 
-d "GPT_SoVITS/text/G2PWModel" ]; then + echo "Download G2PWModel" + "${WGET_CMD[@]}" "$G2PW_URL" + + unzip -q -o G2PWModel.zip -d GPT_SoVITS/text + rm -rf G2PWModel.zip +else + echo "G2PWModel Exists" +fi + +if [ "$DOWNLOAD_UVR5" = "true" ]; then + if find -L "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then + echo "UVR5 Model Exists" + else + echo "Download UVR5 Model" + "${WGET_CMD[@]}" "$UVR5_URL" + + unzip -q -o uvr5_weights.zip -d tools/uvr5 + rm -rf uvr5_weights.zip + fi +fi + +if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then + echo "Checking for CUDA installation..." + if command -v nvidia-smi &>/dev/null; then + echo "CUDA found." + else + USE_CUDA=false + USE_CPU=true + echo "CUDA not found." + fi +fi + +if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then echo "Checking for ROCm installation..." if [ -d "/opt/rocm" ]; then - USE_ROCM=true echo "ROCm found." if grep -qi "microsoft" /proc/version; then echo "You are running WSL." @@ -41,20 +242,28 @@ if [ "$USE_CUDA" = false ]; then IS_WSL=false fi else - echo "ROCm not found." USE_ROCM=false + USE_CPU=true + echo "ROCm not found." fi fi -if [ "$USE_CUDA" = true ]; then +if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then echo "Installing PyTorch with CUDA support..." - conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia -elif [ "$USE_ROCM" = true ]; then + if [ "$CUDA" = 128 ]; then + pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 + elif [ "$CUDA" = 126 ]; then + pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu126 + fi +elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then echo "Installing PyTorch with ROCm support..." - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 -else + pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 +elif [ "$USE_CPU" = true ] && [ "$WORKFLOW" = false ]; then echo "Installing PyTorch for CPU..." - conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch + pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cpu +elif [ "$WORKFLOW" = false ]; then + echo "Unknown Err" + exit 1 fi echo "Installing Python dependencies from requirements.txt..." @@ -63,40 +272,20 @@ echo "Installing Python dependencies from requirements.txt..." 
# Refresh environment hash -r -# pyopenjtalk Installation -conda install jq -y +pip install -r extra-req.txt --no-deps --quiet -OS_TYPE=$(uname) +pip install -r requirements.txt --quiet -PACKAGE_NAME="pyopenjtalk" +PY_PREFIX=$(python -c "import sys; print(sys.prefix)") +PYOPENJTALK_PREFIX=$(python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))") -VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version) +"${WGET_CMD[@]}" "$NLTK_URL" -O nltk_data.zip +unzip -q -o nltk_data -d "$PY_PREFIX" +rm -rf nltk_data.zip -wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz" - -TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz) -DIR_NAME="${TAR_FILE%.tar.gz}" - -tar -xzf "$TAR_FILE" -rm "$TAR_FILE" - -CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt" - -if [[ "$OS_TYPE" == "darwin"* ]]; then - sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE" -else - sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE" -fi - -tar -czf "$TAR_FILE" "$DIR_NAME" - -pip install "$TAR_FILE" - -rm -rf "$TAR_FILE" "$DIR_NAME" - -pip install -r extra-req.txt --no-deps - -pip install -r requirements.txt +"${WGET_CMD[@]}" "$PYOPENJTALK_URL" -O open_jtalk_dic_utf_8-1.11.tar.gz +tar -xvzf open_jtalk_dic_utf_8-1.11.tar.gz -C "$PYOPENJTALK_PREFIX" +rm -rf open_jtalk_dic_utf_8-1.11.tar.gz if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then echo "Update to WSL compatible runtime lib..." diff --git a/requirements.txt b/requirements.txt index e02069ac..07431a06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,19 @@ -numpy==1.23.4 +--no-binary=opencc +numpy<2.0 scipy tensorboard -librosa==0.9.2 -numba==0.56.4 -pytorch-lightning>2.0 -gradio>=4.0,<=4.24.0 +librosa==0.10.2 +numba +pytorch-lightning>=2.4 +gradio<5 ffmpeg-python -onnxruntime; sys_platform == 'darwin' -onnxruntime-gpu; sys_platform != 'darwin' +onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64" +onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" tqdm funasr==1.0.27 cn2an pypinyin -pyopenjtalk>=0.3.4 +pyopenjtalk>=0.4.1 g2p_en torchaudio modelscope==1.10.0 @@ -25,16 +26,15 @@ psutil jieba_fast jieba split-lang -fast_langdetect>=0.3.0 +fast_langdetect>=0.3.1 wordsegment rotary_embedding_torch ToJyutping g2pk2 ko_pron -opencc; sys_platform != 'linux' -opencc==1.1.1; sys_platform == 'linux' +opencc python_mecab_ko; sys_platform != 'win32' -fastapi<0.112.2 +fastapi[standard]>=0.115.2 x_transformers torchmetrics<=1.5 pydantic<=2.10.6 diff --git a/tools/AP_BWE_main/datasets1/dataset.py b/tools/AP_BWE_main/datasets1/dataset.py index b5ccd43a..40f993b5 100644 --- a/tools/AP_BWE_main/datasets1/dataset.py +++ b/tools/AP_BWE_main/datasets1/dataset.py @@ -5,24 +5,31 @@ import torchaudio import torch.utils.data import torchaudio.functional as aF -def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True): +def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True): hann_window = torch.hann_window(win_size).to(audio.device) - stft_spec = torch.stft(audio, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, - center=center, pad_mode='reflect', normalized=False, return_complex=True) - log_amp = torch.log(torch.abs(stft_spec)+1e-4) + stft_spec = torch.stft( + audio, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + 
pad_mode="reflect", + normalized=False, + return_complex=True, + ) + log_amp = torch.log(torch.abs(stft_spec) + 1e-4) pha = torch.angle(stft_spec) - com = torch.stack((torch.exp(log_amp)*torch.cos(pha), - torch.exp(log_amp)*torch.sin(pha)), dim=-1) + com = torch.stack((torch.exp(log_amp) * torch.cos(pha), torch.exp(log_amp) * torch.sin(pha)), dim=-1) return log_amp, pha, com def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True): - amp = torch.exp(log_amp) - com = torch.complex(amp*torch.cos(pha), amp*torch.sin(pha)) + com = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha)) hann_window = torch.hann_window(win_size).to(com.device) audio = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center) @@ -30,18 +37,28 @@ def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True): def get_dataset_filelist(a): - with open(a.input_training_file, 'r', encoding='utf-8') as fi: - training_indexes = [x.split('|')[0] for x in fi.read().split('\n') if len(x) > 0] + with open(a.input_training_file, "r", encoding="utf-8") as fi: + training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] - with open(a.input_validation_file, 'r', encoding='utf-8') as fi: - validation_indexes = [x.split('|')[0] for x in fi.read().split('\n') if len(x) > 0] + with open(a.input_validation_file, "r", encoding="utf-8") as fi: + validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] return training_indexes, validation_indexes class Dataset(torch.utils.data.Dataset): - def __init__(self, training_indexes, wavs_dir, segment_size, hr_sampling_rate, lr_sampling_rate, - split=True, shuffle=True, n_cache_reuse=1, device=None): + def __init__( + self, + training_indexes, + wavs_dir, + segment_size, + hr_sampling_rate, + lr_sampling_rate, + split=True, + shuffle=True, + n_cache_reuse=1, + device=None, + ): self.audio_indexes = training_indexes random.seed(1234) if shuffle: @@ -59,7 +76,7 @@ class Dataset(torch.utils.data.Dataset): def __getitem__(self, index): filename = self.audio_indexes[index] if self._cache_ref_count == 0: - audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + '.wav')) + audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + ".wav")) self.cached_wav = audio self._cache_ref_count = self.n_cache_reuse else: @@ -79,14 +96,13 @@ class Dataset(torch.utils.data.Dataset): if audio_hr.size(1) >= self.segment_size: max_audio_start = audio_hr.size(1) - self.segment_size audio_start = random.randint(0, max_audio_start) - audio_hr = audio_hr[:, audio_start: audio_start+self.segment_size] - audio_lr = audio_lr[:, audio_start: audio_start+self.segment_size] + audio_hr = audio_hr[:, audio_start : audio_start + self.segment_size] + audio_lr = audio_lr[:, audio_start : audio_start + self.segment_size] else: - audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), 'constant') - audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), 'constant') + audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), "constant") + audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), "constant") return (audio_hr.squeeze(), audio_lr.squeeze()) def __len__(self): - return len(self.audio_indexes) diff --git a/tools/AP_BWE_main/models/model.py b/tools/AP_BWE_main/models/model.py index 0c235d69..e5386001 100644 --- 
a/tools/AP_BWE_main/models/model.py +++ b/tools/AP_BWE_main/models/model.py @@ -1,20 +1,26 @@ import torch import torch.nn.functional as F import torch.nn as nn -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from torch.nn.utils import weight_norm, spectral_norm + + # from utils import init_weights, get_padding def get_padding(kernel_size, dilation=1): - return int((kernel_size*dilation - dilation)/2) + return int((kernel_size * dilation - dilation) / 2) + + def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ if classname.find("Conv") != -1: m.weight.data.normal_(mean, std) + import numpy as np from typing import Tuple, List LRELU_SLOPE = 0.1 + class ConvNeXtBlock(nn.Module): """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. @@ -30,24 +36,24 @@ class ConvNeXtBlock(nn.Module): def __init__( self, dim: int, - layer_scale_init_value= None, - adanorm_num_embeddings = None, + layer_scale_init_value=None, + adanorm_num_embeddings=None, ): super().__init__() self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv self.adanorm = adanorm_num_embeddings is not None - + self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, dim*3) # pointwise/1x1 convs, implemented with linear layers + self.pwconv1 = nn.Linear(dim, dim * 3) # pointwise/1x1 convs, implemented with linear layers self.act = nn.GELU() - self.pwconv2 = nn.Linear(dim*3, dim) + self.pwconv2 = nn.Linear(dim * 3, dim) self.gamma = ( nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) if layer_scale_init_value > 0 else None ) - def forward(self, x, cond_embedding_id = None) : + def forward(self, x, cond_embedding_id=None): residual = x x = self.dwconv(x) x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) @@ -72,11 +78,11 @@ class APNet_BWE_Model(torch.nn.Module): super(APNet_BWE_Model, self).__init__() self.h = h self.adanorm_num_embeddings = None - layer_scale_init_value = 1 / h.ConvNeXt_layers + layer_scale_init_value = 1 / h.ConvNeXt_layers - self.conv_pre_mag = nn.Conv1d(h.n_fft//2+1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) + self.conv_pre_mag = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) self.norm_pre_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) - self.conv_pre_pha = nn.Conv1d(h.n_fft//2+1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) + self.conv_pre_pha = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) self.norm_pre_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) self.convnext_mag = nn.ModuleList( @@ -104,9 +110,9 @@ class APNet_BWE_Model(torch.nn.Module): self.norm_post_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) self.norm_post_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) self.apply(self._init_weights) - self.linear_post_mag = nn.Linear(h.ConvNeXt_channels, h.n_fft//2+1) - self.linear_post_pha_r = nn.Linear(h.ConvNeXt_channels, h.n_fft//2+1) - self.linear_post_pha_i = nn.Linear(h.ConvNeXt_channels, h.n_fft//2+1) + self.linear_post_mag = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1) + self.linear_post_pha_r = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1) + self.linear_post_pha_i = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1) def _init_weights(self, m): if isinstance(m, (nn.Conv1d, nn.Linear)): @@ -114,7 +120,6 @@ class APNet_BWE_Model(torch.nn.Module): nn.init.constant_(m.bias, 0) def forward(self, mag_nb, pha_nb): - x_mag = self.conv_pre_mag(mag_nb) 
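> Illustration (not part of the patch): a self-contained check of the `get_padding` helper defined earlier in `models/model.py`. With stride 1, the computed padding keeps the frame count unchanged, which is what `conv_pre_mag`/`conv_pre_pha` rely on with their kernel size of 7. The tensor shape below is arbitrary.

```python
import torch
import torch.nn as nn

def get_padding(kernel_size, dilation=1):
    # same formula as in models/model.py
    return int((kernel_size * dilation - dilation) / 2)

x = torch.randn(1, 16, 100)                                # (batch, channels, frames)
conv = nn.Conv1d(16, 16, 7, 1, padding=get_padding(7, 1))  # padding = 3 for kernel_size 7
print(conv(x).shape)                                       # torch.Size([1, 16, 100])
```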
x_pha = self.conv_pre_pha(pha_nb) x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2) @@ -134,11 +139,9 @@ class APNet_BWE_Model(torch.nn.Module): x_pha_i = self.linear_post_pha_i(x_pha) pha_wb = torch.atan2(x_pha_i, x_pha_r).transpose(1, 2) - com_wb = torch.stack((torch.exp(mag_wb)*torch.cos(pha_wb), - torch.exp(mag_wb)*torch.sin(pha_wb)), dim=-1) - - return mag_wb, pha_wb, com_wb + com_wb = torch.stack((torch.exp(mag_wb) * torch.cos(pha_wb), torch.exp(mag_wb) * torch.sin(pha_wb)), dim=-1) + return mag_wb, pha_wb, com_wb class DiscriminatorP(torch.nn.Module): @@ -146,13 +149,15 @@ class DiscriminatorP(torch.nn.Module): super(DiscriminatorP, self).__init__() self.period = period norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), - ]) + self.convs = nn.ModuleList( + [ + norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): @@ -160,13 +165,13 @@ class DiscriminatorP(torch.nn.Module): # 1d to 2d b, c, t = x.shape - if t % self.period != 0: # pad first + if t % self.period != 0: # pad first n_pad = self.period - (t % self.period) x = F.pad(x, (0, n_pad), "reflect") t = t + n_pad x = x.view(b, c, t // self.period, self.period) - for i,l in enumerate(self.convs): + for i, l in enumerate(self.convs): x = l(x) x = F.leaky_relu(x, LRELU_SLOPE) if i > 0: @@ -181,13 +186,15 @@ class DiscriminatorP(torch.nn.Module): class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self): super(MultiPeriodDiscriminator, self).__init__() - self.discriminators = nn.ModuleList([ - DiscriminatorP(2), - DiscriminatorP(3), - DiscriminatorP(5), - DiscriminatorP(7), - DiscriminatorP(11), - ]) + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) def forward(self, y, y_hat): y_d_rs = [] @@ -264,8 +271,8 @@ class DiscriminatorAR(nn.Module): self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None ) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] - x=x.squeeze(1) - + x = x.squeeze(1) + x = self.spectrogram(x) x = x.unsqueeze(1) for l in self.convs: @@ -358,8 +365,8 @@ class DiscriminatorPR(nn.Module): self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None ) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] - x=x.squeeze(1) - + x = x.squeeze(1) + x = self.spectrogram(x) x = x.unsqueeze(1) for l in self.convs: @@ -407,11 +414,11 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs): r_losses = [] g_losses = [] for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - r_loss = torch.mean(torch.clamp(1 
- dr, min=0)) - g_loss = torch.mean(torch.clamp(1 + dg, min=0)) - loss += r_loss + g_loss - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) + r_loss = torch.mean(torch.clamp(1 - dr, min=0)) + g_loss = torch.mean(torch.clamp(1 + dg, min=0)) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) return loss, r_losses, g_losses @@ -420,35 +427,37 @@ def generator_loss(disc_outputs): loss = 0 gen_losses = [] for dg in disc_outputs: - l = torch.mean(torch.clamp(1 - dg, min=0)) - gen_losses.append(l) - loss += l + l = torch.mean(torch.clamp(1 - dg, min=0)) + gen_losses.append(l) + loss += l return loss, gen_losses def phase_losses(phase_r, phase_g): - ip_loss = torch.mean(anti_wrapping_function(phase_r - phase_g)) gd_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=1) - torch.diff(phase_g, dim=1))) iaf_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=2) - torch.diff(phase_g, dim=2))) return ip_loss, gd_loss, iaf_loss -def anti_wrapping_function(x): +def anti_wrapping_function(x): return torch.abs(x - torch.round(x / (2 * np.pi)) * 2 * np.pi) + def stft_mag(audio, n_fft=2048, hop_length=512): hann_window = torch.hann_window(n_fft).to(audio.device) stft_spec = torch.stft(audio, n_fft, hop_length, window=hann_window, return_complex=True) stft_mag = torch.abs(stft_spec) - return(stft_mag) + return stft_mag + def cal_snr(pred, target): snr = (20 * torch.log10(torch.norm(target, dim=-1) / torch.norm(pred - target, dim=-1).clamp(min=1e-8))).mean() return snr + def cal_lsd(pred, target): sp = torch.log10(stft_mag(pred).square().clamp(1e-8)) st = torch.log10(stft_mag(target).square().clamp(1e-8)) diff --git a/tools/asr/config.py b/tools/asr/config.py index 4b0d37ae..c04069b2 100644 --- a/tools/asr/config.py +++ b/tools/asr/config.py @@ -1,33 +1,36 @@ import os + def check_fw_local_models(): - ''' + """ 启动时检查本地是否有 Faster Whisper 模型. 
- ''' + """ model_size_list = [ - "tiny", "tiny.en", - "base", "base.en", - "small", "small.en", - "medium", "medium.en", - "large", "large-v1", - "large-v2", "large-v3"] + "tiny", + "tiny.en", + "base", + "base.en", + "small", + "small.en", + "medium", + "medium.en", + "large", + "large-v1", + "large-v2", + "large-v3", + ] for i, size in enumerate(model_size_list): - if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): - model_size_list[i] = size + '-local' + if os.path.exists(f"tools/asr/models/faster-whisper-{size}"): + model_size_list[i] = size + "-local" return model_size_list + asr_dict = { - "达摩 ASR (中文)": { - 'lang': ['zh','yue'], - 'size': ['large'], - 'path': 'funasr_asr.py', - 'precision': ['float32'] - }, + "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, "Faster Whisper (多语种)": { - 'lang': ['auto', 'zh', 'en', 'ja', 'ko', 'yue'], - 'size': check_fw_local_models(), - 'path': 'fasterwhisper_asr.py', - 'precision': ['float32', 'float16', 'int8'] + "lang": ["auto", "zh", "en", "ja", "ko", "yue"], + "size": check_fw_local_models(), + "path": "fasterwhisper_asr.py", + "precision": ["float32", "float16", "int8"], }, } - diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py index d46cbbd7..27cabbc2 100644 --- a/tools/asr/fasterwhisper_asr.py +++ b/tools/asr/fasterwhisper_asr.py @@ -2,7 +2,7 @@ import argparse import os import traceback -os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" import torch @@ -10,7 +10,9 @@ from faster_whisper import WhisperModel from tqdm import tqdm from tools.asr.config import check_fw_local_models +from tools.my_utils import load_cudnn +# fmt: off language_code_list = [ "af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", @@ -32,82 +34,99 @@ language_code_list = [ "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi", "yi", "yo", "zh", "yue", - "auto"] + "auto"] +# fmt: on + def execute_asr(input_folder, output_folder, model_size, language, precision): - if '-local' in model_size: + if "-local" in model_size: model_size = model_size[:-6] - model_path = f'tools/asr/models/faster-whisper-{model_size}' + model_path = f"tools/asr/models/faster-whisper-{model_size}" else: model_path = model_size - if language == 'auto': - language = None #不设置语种由模型自动输出概率最高的语种 - print("loading faster whisper model:",model_size,model_path) - device = 'cuda' if torch.cuda.is_available() else 'cpu' + if language == "auto": + language = None # 不设置语种由模型自动输出概率最高的语种 + print("loading faster whisper model:", model_size, model_path) + device = "cuda" if torch.cuda.is_available() else "cpu" try: model = WhisperModel(model_path, device=device, compute_type=precision) except: return print(traceback.format_exc()) - + input_file_names = os.listdir(input_folder) input_file_names.sort() output = [] output_file_name = os.path.basename(input_folder) - + for file_name in tqdm(input_file_names): try: file_path = os.path.join(input_folder, file_name) segments, info = model.transcribe( - audio = file_path, - beam_size = 5, - vad_filter = True, - vad_parameters = dict(min_silence_duration_ms=700), - language = language) - text = '' + audio=file_path, + beam_size=5, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=700), + language=language, + ) + text = "" if info.language == "zh": print("检测为中文文本, 转 FunASR 处理") - if("only_asr" not in globals()): - from tools.asr.funasr_asr import 
only_asr #如果用英文就不需要导入下载模型 + if "only_asr" not in globals(): + from tools.asr.funasr_asr import only_asr # 如果用英文就不需要导入下载模型 text = only_asr(file_path, language=info.language.lower()) - if text == '': + if text == "": for segment in segments: text += segment.text output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}") except: print(traceback.format_exc()) - + output_folder = output_folder or "output/asr_opt" os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") with open(output_file_path, "w", encoding="utf-8") as f: f.write("\n".join(output)) print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") return output_file_path -if __name__ == '__main__': + +load_cudnn() + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large-v3', - choices=check_fw_local_models(), - help="Model Size of Faster Whisper") - parser.add_argument("-l", "--language", type=str, default='ja', - choices=language_code_list, - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32','int8'], - help="fp16, int8 or fp32") + parser.add_argument( + "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." + ) + parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") + parser.add_argument( + "-s", + "--model_size", + type=str, + default="large-v3", + choices=check_fw_local_models(), + help="Model Size of Faster Whisper", + ) + parser.add_argument( + "-l", "--language", type=str, default="ja", choices=language_code_list, help="Language of the audio files." 
+ ) + parser.add_argument( + "-p", + "--precision", + type=str, + default="float16", + choices=["float16", "float32", "int8"], + help="fp16, int8 or fp32", + ) cmd = parser.parse_args() output_file_path = execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, - precision = cmd.precision, + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + model_size=cmd.model_size, + language=cmd.language, + precision=cmd.precision, ) diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py index fe520e24..b0ffceb0 100644 --- a/tools/asr/funasr_asr.py +++ b/tools/asr/funasr_asr.py @@ -9,31 +9,41 @@ import traceback from funasr import AutoModel from tqdm import tqdm -funasr_models = {} # 存储模型避免重复加载 +funasr_models = {} # 存储模型避免重复加载 + def only_asr(input_file, language): try: model = create_model(language) text = model.generate(input=input_file)[0]["text"] except: - text = '' + text = "" print(traceback.format_exc()) return text + def create_model(language="zh"): - path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' - path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' - path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" + path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch" + path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" + path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" vad_model_revision = punc_model_revision = "v2.0.4" if language == "zh": - path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' - path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" + path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" + path_asr = ( + path_asr + if os.path.exists(path_asr) + else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" + ) model_revision = "v2.0.4" elif language == "yue": - path_asr = 'tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online' - path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" + path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" + path_asr = ( + path_asr + if os.path.exists(path_asr) + else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" + ) model_revision = "master" path_vad = path_punc = None vad_model_revision = punc_model_revision = None @@ -45,25 +55,26 @@ def create_model(language="zh"): return funasr_models[language] else: model = AutoModel( - model = path_asr, - model_revision = model_revision, - vad_model = path_vad, - vad_model_revision = vad_model_revision, - punc_model = path_punc, - punc_model_revision = punc_model_revision, + model=path_asr, + model_revision=model_revision, + vad_model=path_vad, + vad_model_revision=vad_model_revision, + punc_model=path_punc, + punc_model_revision=punc_model_revision, ) print(f"FunASR 模型加载完成: {language.upper()}") funasr_models[language] = model return model + def execute_asr(input_folder, output_folder, model_size, language): 
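> Illustration (not part of the patch): the Faster-Whisper entry point shown above can also be driven from Python instead of the CLI. The folder paths below are hypothetical; the argument values mirror the argparse defaults in `fasterwhisper_asr.py`, and the snippet assumes it is run from the repository root so the `tools` package resolves.

```python
from tools.asr.fasterwhisper_asr import execute_asr

list_path = execute_asr(
    input_folder="output/slicer_opt",   # folder of sliced WAV files (assumed path)
    output_folder="output/asr_opt",     # <name>.list annotation file is written here
    model_size="large-v3",              # or a "-local" size found under tools/asr/models/
    language="ja",                      # "auto" lets the model pick the language
    precision="float16",
)
print(list_path)                        # absolute path to the generated .list file
```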
input_file_names = os.listdir(input_folder) input_file_names.sort() - + output = [] output_file_name = os.path.basename(input_folder) - + model = create_model(language) for file_name in tqdm(input_file_names): @@ -77,29 +88,31 @@ def execute_asr(input_folder, output_folder, model_size, language): output_folder = output_folder or "output/asr_opt" os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") with open(output_file_path, "w", encoding="utf-8") as f: f.write("\n".join(output)) print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") return output_file_path -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large', - help="Model Size of FunASR is Large") - parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh','yue','auto'], - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 + parser.add_argument( + "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." + ) + parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") + parser.add_argument("-s", "--model_size", type=str, default="large", help="Model Size of FunASR is Large") + parser.add_argument( + "-l", "--language", type=str, default="zh", choices=["zh", "yue", "auto"], help="Language of the audio files." 
+ ) + parser.add_argument( + "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" + ) # 还没接入 cmd = parser.parse_args() execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + model_size=cmd.model_size, + language=cmd.language, ) diff --git a/tools/audio_sr.py b/tools/audio_sr.py index 009ad26a..58df6d20 100644 --- a/tools/audio_sr.py +++ b/tools/audio_sr.py @@ -1,50 +1,44 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import sys,os -import traceback -AP_BWE_main_dir_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'AP_BWE_main') +import sys +import os + +AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main") sys.path.append(AP_BWE_main_dir_path) -import glob -import argparse import json -from re import S import torch -import numpy as np -import torchaudio -import time import torchaudio.functional as aF # from attrdict import AttrDict####will be bug in py3.10 from datasets1.dataset import amp_pha_stft, amp_pha_istft from models.model import APNet_BWE_Model -import soundfile as sf -import matplotlib.pyplot as plt -from rich.progress import track -class AP_BWE(): - def __init__(self,device,DictToAttrRecursive,checkpoint_file=None): - if checkpoint_file==None: - checkpoint_file="%s/24kto48k/g_24kto48k.zip"%(AP_BWE_main_dir_path) - if os.path.exists(checkpoint_file)==False: + +class AP_BWE: + def __init__(self, device, DictToAttrRecursive, checkpoint_file=None): + if checkpoint_file == None: + checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path) + if os.path.exists(checkpoint_file) == False: raise FileNotFoundError - config_file = os.path.join(os.path.split(checkpoint_file)[0], 'config.json') - with open(config_file) as f:data = f.read() + config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json") + with open(config_file) as f: + data = f.read() json_config = json.loads(data) # h = AttrDict(json_config) h = DictToAttrRecursive(json_config) model = APNet_BWE_Model(h).to(device) - state_dict = torch.load(checkpoint_file,map_location="cpu",weights_only=False) - model.load_state_dict(state_dict['generator']) + state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False) + model.load_state_dict(state_dict["generator"]) model.eval() - self.device=device - self.model=model - self.h=h + self.device = device + self.model = model + self.h = h def to(self, *arg, **kwargs): self.model.to(*arg, **kwargs) self.device = self.model.conv_pre_mag.weight.device return self - def __call__(self, audio,orig_sampling_rate): + def __call__(self, audio, orig_sampling_rate): with torch.no_grad(): # audio, orig_sampling_rate = torchaudio.load(inp_path) # audio = audio.to(self.device) @@ -53,4 +47,4 @@ class AP_BWE(): amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb) audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size) # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16') - return audio_hr_g.squeeze().cpu().numpy(),self.h.hr_sampling_rate + return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate diff --git a/tools/cmd-denoise.py b/tools/cmd-denoise.py index 1fdcab6d..bbf68476 100644 --- a/tools/cmd-denoise.py +++ b/tools/cmd-denoise.py @@ -1,33 +1,38 @@ -import os,argparse 
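> Illustration (not part of the patch): a sketch of how the `AP_BWE` super-resolution wrapper above is typically used. The `DictToAttrRecursive` class below is only a minimal stand-in for the helper the repo passes in (a dict with attribute access), the input file name is hypothetical, and the default 24k→48k checkpoint is assumed to have been downloaded per the readme referenced earlier.

```python
import torch
import torchaudio

from tools.audio_sr import AP_BWE

class DictToAttrRecursive(dict):
    # minimal stand-in: the wrapper only needs attribute-style access to the config
    def __init__(self, d):
        super().__init__(d)
        for k, v in d.items():
            setattr(self, k, DictToAttrRecursive(v) if isinstance(v, dict) else v)

device = "cuda" if torch.cuda.is_available() else "cpu"
sr_model = AP_BWE(device, DictToAttrRecursive)        # loads the default 24k->48k generator

audio, sr = torchaudio.load("ref_24k.wav")            # hypothetical low-bandwidth input
audio_48k, sr_out = sr_model(audio.to(device), sr)    # numpy waveform plus 48000
```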
+import os +import argparse import traceback from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from tqdm import tqdm -path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' -path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" -ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) -def execute_denoise(input_folder,output_folder): - os.makedirs(output_folder,exist_ok=True) +path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k" +path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" +ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise) + + +def execute_denoise(input_folder, output_folder): + os.makedirs(output_folder, exist_ok=True) # print(input_folder) # print(list(os.listdir(input_folder).sort())) for name in tqdm(os.listdir(input_folder)): try: - ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) + ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name)) except: traceback.print_exc() -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 + parser.add_argument( + "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." + ) + parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") + parser.add_argument( + "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" + ) # 还没接入 cmd = parser.parse_args() execute_denoise( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - ) \ No newline at end of file + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + ) diff --git a/tools/i18n/i18n.py b/tools/i18n/i18n.py index e256941a..4cd123f3 100644 --- a/tools/i18n/i18n.py +++ b/tools/i18n/i18n.py @@ -2,23 +2,27 @@ import json import locale import os -I18N_JSON_DIR : os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale') +I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") + def load_language_list(language): with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: language_list = json.load(f) return language_list + def scan_language_list(): language_list = [] for name in os.listdir(I18N_JSON_DIR): - if name.endswith(".json"):language_list.append(name.split('.')[0]) + if name.endswith(".json"): + language_list.append(name.split(".")[0]) return language_list + class I18nAuto: def __init__(self, language=None): if language in ["Auto", None]: - language = locale.getdefaultlocale()[0] + language = locale.getdefaultlocale()[0] # getlocale can't identify the system's language ((None, None)) if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): language = "en_US" @@ -31,6 +35,7 @@ class I18nAuto: def __repr__(self): return "Use Language: " + self.language + if __name__ == "__main__": - i18n = I18nAuto(language='en_US') - print(i18n) \ No newline at end of file + i18n = I18nAuto(language="en_US") + 
print(i18n) diff --git a/tools/i18n/scan_i18n.py b/tools/i18n/scan_i18n.py index d2bd12b7..0f12091d 100644 --- a/tools/i18n/scan_i18n.py +++ b/tools/i18n/scan_i18n.py @@ -4,21 +4,18 @@ import json import os from collections import OrderedDict -I18N_JSON_DIR : os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale') -DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言 -TITLE_LEN : int = 60 # 标题显示长度 -KEY_LEN : int = 30 # 键名显示长度 -SHOW_KEYS : bool = False # 是否显示键信息 -SORT_KEYS : bool = False # 是否按全局键名写入文件 +I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") +DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言 +TITLE_LEN: int = 60 # 标题显示长度 +KEY_LEN: int = 30 # 键名显示长度 +SHOW_KEYS: bool = False # 是否显示键信息 +SORT_KEYS: bool = False # 是否按全局键名写入文件 + def extract_i18n_strings(node): i18n_strings = [] - if ( - isinstance(node, ast.Call) - and isinstance(node.func, ast.Name) - and node.func.id == "i18n" - ): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "i18n": for arg in node.args: if isinstance(arg, ast.Str): i18n_strings.append(arg.s) @@ -28,6 +25,7 @@ def extract_i18n_strings(node): return i18n_strings + def scan_i18n_strings(): """ scan the directory for all .py files (recursively) @@ -43,7 +41,7 @@ def scan_i18n_strings(): if "I18nAuto" in code: tree = ast.parse(code) i18n_strings = extract_i18n_strings(tree) - print(f"{filename.ljust(KEY_LEN*3//2)}: {len(i18n_strings)}") + print(f"{filename.ljust(KEY_LEN * 3 // 2)}: {len(i18n_strings)}") if SHOW_KEYS: print("\n".join([s for s in i18n_strings])) strings.extend(i18n_strings) @@ -51,9 +49,10 @@ def scan_i18n_strings(): print(f"\033[31m[Failed] Error occur at {filename}: {e}\033[0m") code_keys = set(strings) - print(f"{'Total Unique'.ljust(KEY_LEN*3//2)}: {len(code_keys)}") + print(f"{'Total Unique'.ljust(KEY_LEN * 3 // 2)}: {len(code_keys)}") return code_keys + def update_i18n_json(json_file, standard_keys): standard_keys = sorted(standard_keys) print(f" Process {json_file} ".center(TITLE_LEN, "=")) @@ -89,8 +88,10 @@ def update_i18n_json(json_file, standard_keys): sorted( json_data.items(), key=lambda x: ( - list(standard_keys).index(x[0]) if x[0] in standard_keys and not x[1].startswith('#!') else len(json_data), - ) + list(standard_keys).index(x[0]) + if x[0] in standard_keys and not x[1].startswith("#!") + else len(json_data), + ), ) ) # 打印处理后的 JSON 条目数 @@ -111,21 +112,26 @@ def update_i18n_json(json_file, standard_keys): # 打印是否有重复的值 for value, keys in duplicate_items.items(): if len(keys) > 1: - print("\n".join([f"\033[31m{'[Failed] Duplicate Value'.ljust(KEY_LEN)}: {key} -> {value}\033[0m" for key in keys])) + print( + "\n".join( + [f"\033[31m{'[Failed] Duplicate Value'.ljust(KEY_LEN)}: {key} -> {value}\033[0m" for key in keys] + ) + ) if num_miss_translation > 0: print(f"\033[31m{'[Failed] Missing Translation'.ljust(KEY_LEN)}: {num_miss_translation}\033[0m") else: - print(f"\033[32m[Passed] All Keys Translated\033[0m") + print("\033[32m[Passed] All Keys Translated\033[0m") # 将处理后的结果写入 JSON 文件 with open(json_file, "w", encoding="utf-8") as f: json.dump(json_data, f, ensure_ascii=False, indent=4, sort_keys=SORT_KEYS) f.write("\n") - print(f" Updated {json_file} ".center(TITLE_LEN, "=") + '\n') + print(f" Updated {json_file} ".center(TITLE_LEN, "=") + "\n") + if __name__ == "__main__": code_keys = scan_i18n_strings() for json_file in os.listdir(I18N_JSON_DIR): if json_file.endswith(r".json"): json_file = os.path.join(I18N_JSON_DIR, json_file) - 
update_i18n_json(json_file, code_keys) \ No newline at end of file + update_i18n_json(json_file, code_keys) diff --git a/tools/my_utils.py b/tools/my_utils.py index 33692485..59a7cd3c 100644 --- a/tools/my_utils.py +++ b/tools/my_utils.py @@ -1,10 +1,17 @@ -import platform,os,traceback +import ctypes +import os +import sys +from pathlib import Path + import ffmpeg -import numpy as np import gradio as gr -from tools.i18n.i18n import I18nAuto +import numpy as np import pandas as pd -i18n = I18nAuto(language=os.environ.get('language','Auto')) + +from tools.i18n.i18n import I18nAuto + +i18n = I18nAuto(language=os.environ.get("language", "Auto")) + def load_audio(file, sr): try: @@ -12,46 +19,54 @@ def load_audio(file, sr): # This launches a subprocess to decode audio while down-mixing and resampling as necessary. # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 - if os.path.exists(file) == False: - raise RuntimeError( - "You input a wrong audio path that does not exists, please fix it!" - ) + if os.path.exists(file) is False: + raise RuntimeError("You input a wrong audio path that does not exists, please fix it!") out, _ = ( ffmpeg.input(file, threads=0) .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) - except Exception as e: - traceback.print_exc() + except Exception: + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True) + ) # Expose the Error raise RuntimeError(i18n("音频加载失败")) return np.frombuffer(out, np.float32).flatten() -def clean_path(path_str:str): - if path_str.endswith(('\\','/')): +def clean_path(path_str: str): + if path_str.endswith(("\\", "/")): return clean_path(path_str[0:-1]) - path_str = path_str.replace('/', os.sep).replace('\\', os.sep) - return path_str.strip(" \'\n\"\u202a")#path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") + path_str = path_str.replace("/", os.sep).replace("\\", os.sep) + return path_str.strip( + " '\n\"\u202a" + ) # path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") -def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False): - files_status=[] +def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False): + files_status = [] if is_train == True and file_list: - file_list.append(os.path.join(file_list[0],'2-name2text.txt')) - file_list.append(os.path.join(file_list[0],'3-bert')) - file_list.append(os.path.join(file_list[0],'4-cnhubert')) - file_list.append(os.path.join(file_list[0],'5-wav32k')) - file_list.append(os.path.join(file_list[0],'6-name2semantic.tsv')) + file_list.append(os.path.join(file_list[0], "2-name2text.txt")) + file_list.append(os.path.join(file_list[0], "3-bert")) + file_list.append(os.path.join(file_list[0], "4-cnhubert")) + file_list.append(os.path.join(file_list[0], "5-wav32k")) + file_list.append(os.path.join(file_list[0], "6-name2semantic.tsv")) for file in file_list: - if os.path.exists(file):files_status.append(True) - else:files_status.append(False) - if sum(files_status)!=len(files_status): + if os.path.exists(file): + files_status.append(True) + else: + files_status.append(False) + if sum(files_status) != len(files_status): if is_train: - for file,status in zip(file_list,files_status): - if status:pass - else:gr.Warning(file) - 
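> Illustration (not part of the patch): the two helpers defined above work together — `clean_path` strips the quotes, whitespace, and direction marks users tend to paste around paths, and `load_audio` decodes any ffmpeg-readable file to a mono float32 array at the requested rate. The pasted path below is hypothetical.

```python
from tools.my_utils import clean_path, load_audio

raw = ' "C:\\voice\\sample take 1.wav"\n'   # a path pasted with quotes and a trailing newline
path = clean_path(raw)                      # -> C:\voice\sample take 1.wav (on Windows)
wav = load_audio(path, 32000)               # 1-D numpy.float32 array resampled to 32 kHz
print(wav.shape, wav.dtype)
```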
gr.Warning(i18n('以下文件或文件夹不存在')) + for file, status in zip(file_list, files_status): + if status: + pass + else: + gr.Warning(file) + gr.Warning(i18n("以下文件或文件夹不存在")) return False elif is_dataset_processing: if files_status[0]: @@ -60,56 +75,157 @@ def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing gr.Warning(file_list[0]) elif not files_status[1] and file_list[1]: gr.Warning(file_list[1]) - gr.Warning(i18n('以下文件或文件夹不存在')) + gr.Warning(i18n("以下文件或文件夹不存在")) return False else: if file_list[0]: gr.Warning(file_list[0]) - gr.Warning(i18n('以下文件或文件夹不存在')) + gr.Warning(i18n("以下文件或文件夹不存在")) else: - gr.Warning(i18n('路径不能为空')) + gr.Warning(i18n("路径不能为空")) return False return True -def check_details(path_list=None,is_train=False,is_dataset_processing=False): + +def check_details(path_list=None, is_train=False, is_dataset_processing=False): if is_dataset_processing: list_path, audio_path = path_list - if (not list_path.endswith('.list')): - gr.Warning(i18n('请填入正确的List路径')) + if not list_path.endswith(".list"): + gr.Warning(i18n("请填入正确的List路径")) return if audio_path: if not os.path.isdir(audio_path): - gr.Warning(i18n('请填入正确的音频文件夹路径')) + gr.Warning(i18n("请填入正确的音频文件夹路径")) return - with open(list_path,"r",encoding="utf8")as f: - line=f.readline().strip("\n").split("\n") + with open(list_path, "r", encoding="utf8") as f: + line = f.readline().strip("\n").split("\n") wav_name, _, __, ___ = line[0].split("|") - wav_name=clean_path(wav_name) - if (audio_path != "" and audio_path != None): + wav_name = clean_path(wav_name) + if audio_path != "" and audio_path != None: wav_name = os.path.basename(wav_name) - wav_path = "%s/%s"%(audio_path, wav_name) + wav_path = "%s/%s" % (audio_path, wav_name) else: - wav_path=wav_name + wav_path = wav_name if os.path.exists(wav_path): ... else: - gr.Warning(i18n('路径错误')) + gr.Warning(i18n("路径错误")) return if is_train: - path_list.append(os.path.join(path_list[0],'2-name2text.txt')) - path_list.append(os.path.join(path_list[0],'4-cnhubert')) - path_list.append(os.path.join(path_list[0],'5-wav32k')) - path_list.append(os.path.join(path_list[0],'6-name2semantic.tsv')) + path_list.append(os.path.join(path_list[0], "2-name2text.txt")) + path_list.append(os.path.join(path_list[0], "4-cnhubert")) + path_list.append(os.path.join(path_list[0], "5-wav32k")) + path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv")) phone_path, hubert_path, wav_path, semantic_path = path_list[1:] - with open(phone_path,'r',encoding='utf-8') as f: - if f.read(1):... - else:gr.Warning(i18n('缺少音素数据集')) - if os.listdir(hubert_path):... - else:gr.Warning(i18n('缺少Hubert数据集')) - if os.listdir(wav_path):... - else:gr.Warning(i18n('缺少音频数据集')) - df = pd.read_csv( - semantic_path, delimiter="\t", encoding="utf-8" - ) - if len(df) >= 1:... - else:gr.Warning(i18n('缺少语义数据集')) + with open(phone_path, "r", encoding="utf-8") as f: + if f.read(1): + ... + else: + gr.Warning(i18n("缺少音素数据集")) + if os.listdir(hubert_path): + ... + else: + gr.Warning(i18n("缺少Hubert数据集")) + if os.listdir(wav_path): + ... + else: + gr.Warning(i18n("缺少音频数据集")) + df = pd.read_csv(semantic_path, delimiter="\t", encoding="utf-8") + if len(df) >= 1: + ... 
+ else: + gr.Warning(i18n("缺少语义数据集")) + + +def load_cudnn(): + import torch + + if not torch.cuda.is_available(): + print("[INFO] CUDA is not available, skipping cuDNN setup.") + return + + if sys.platform == "win32": + torch_lib_dir = Path(torch.__file__).parent / "lib" + if torch_lib_dir.exists(): + os.add_dll_directory(str(torch_lib_dir)) + print(f"[INFO] Added DLL directory: {torch_lib_dir}") + matching_files = sorted(torch_lib_dir.glob("cudnn_cnn*.dll")) + if not matching_files: + print(f"[ERROR] No cudnn_cnn*.dll found in {torch_lib_dir}") + return + for dll_path in matching_files: + dll_name = os.path.basename(dll_path) + try: + ctypes.CDLL(dll_name) + print(f"[INFO] Loaded: {dll_name}") + except OSError as e: + print(f"[WARNING] Failed to load {dll_name}: {e}") + else: + print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}") + + elif sys.platform == "linux": + site_packages = Path(torch.__file__).resolve().parents[1] + cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib" + + if not cudnn_dir.exists(): + print(f"[ERROR] cudnn dir not found: {cudnn_dir}") + return + + matching_files = sorted(cudnn_dir.glob("libcudnn_cnn*.so*")) + if not matching_files: + print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}") + return + + for so_path in matching_files: + try: + ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore + print(f"[INFO] Loaded: {so_path}") + except OSError as e: + print(f"[WARNING] Failed to load {so_path}: {e}") + + +def load_nvrtc(): + import torch + + if not torch.cuda.is_available(): + print("[INFO] CUDA is not available, skipping nvrtc setup.") + return + + if sys.platform == "win32": + torch_lib_dir = Path(torch.__file__).parent / "lib" + if torch_lib_dir.exists(): + os.add_dll_directory(str(torch_lib_dir)) + print(f"[INFO] Added DLL directory: {torch_lib_dir}") + matching_files = sorted(torch_lib_dir.glob("nvrtc*.dll")) + if not matching_files: + print(f"[ERROR] No nvrtc*.dll found in {torch_lib_dir}") + return + for dll_path in matching_files: + dll_name = os.path.basename(dll_path) + try: + ctypes.CDLL(dll_name) + print(f"[INFO] Loaded: {dll_name}") + except OSError as e: + print(f"[WARNING] Failed to load {dll_name}: {e}") + else: + print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}") + + elif sys.platform == "linux": + site_packages = Path(torch.__file__).resolve().parents[1] + nvrtc_dir = site_packages / "nvidia" / "cuda_nvrtc" / "lib" + + if not nvrtc_dir.exists(): + print(f"[ERROR] nvrtc dir not found: {nvrtc_dir}") + return + + matching_files = sorted(nvrtc_dir.glob("libnvrtc*.so*")) + if not matching_files: + print(f"[ERROR] No libnvrtc*.so* found in {nvrtc_dir}") + return + + for so_path in matching_files: + try: + ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore + print(f"[INFO] Loaded: {so_path}") + except OSError as e: + print(f"[WARNING] Failed to load {so_path}: {e}") diff --git a/tools/slice_audio.py b/tools/slice_audio.py index 8a06292d..66fafa93 100644 --- a/tools/slice_audio.py +++ b/tools/slice_audio.py @@ -1,30 +1,34 @@ -import os,sys,numpy as np +import os +import sys +import numpy as np import traceback from scipy.io import wavfile + # parent_directory = os.path.dirname(os.path.abspath(__file__)) # sys.path.append(parent_directory) from tools.my_utils import load_audio from slicer2 import Slicer -def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): - os.makedirs(opt_root,exist_ok=True) + +def slice(inp, opt_root, threshold, min_length, 
min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part): + os.makedirs(opt_root, exist_ok=True) if os.path.isfile(inp): - input=[inp] + input = [inp] elif os.path.isdir(inp): - input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] + input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] else: return "输入路径存在但既不是文件也不是文件夹" slicer = Slicer( sr=32000, # 长音频采样率 - threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 - min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 - min_interval= int(min_interval), # 最短切割间隔 - hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) - max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 + threshold=int(threshold), # 音量小于这个值视作静音的备选切割点 + min_length=int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 + min_interval=int(min_interval), # 最短切割间隔 + hop_size=int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) + max_sil_kept=int(max_sil_kept), # 切完后静音最多留多长 ) - _max=float(_max) - alpha=float(alpha) - for inp_path in input[int(i_part)::int(all_part)]: + _max = float(_max) + alpha = float(alpha) + for inp_path in input[int(i_part) :: int(all_part)]: # print(inp_path) try: name = os.path.basename(inp_path) @@ -32,7 +36,8 @@ def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_ # print(audio.shape) for chunk, start, end in slicer.slice(audio): # start和end是帧数 tmp_max = np.abs(chunk).max() - if(tmp_max>1):chunk/=tmp_max + if tmp_max > 1: + chunk /= tmp_max chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk wavfile.write( "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), @@ -41,8 +46,8 @@ def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_ (chunk * 32767).astype(np.int16), ) except: - print(inp_path,"->fail->",traceback.format_exc()) + print(inp_path, "->fail->", traceback.format_exc()) return "执行完毕,请检查输出文件" -print(slice(*sys.argv[1:])) +print(slice(*sys.argv[1:])) diff --git a/tools/slicer2.py b/tools/slicer2.py index ba6794b6..8d80f1b4 100644 --- a/tools/slicer2.py +++ b/tools/slicer2.py @@ -46,13 +46,9 @@ class Slicer: max_sil_kept: int = 5000, ): if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) + raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size") if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) + raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size") min_interval = sr * min_interval / 1000 self.threshold = 10 ** (threshold / 20.0) self.hop_size = round(sr * hop_size / 1000) @@ -63,13 +59,9 @@ class Slicer: def _apply_slice(self, waveform, begin, end): if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] + return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)] else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] + return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)] # @timeit def slice(self, waveform): @@ -79,9 +71,7 @@ class Slicer: samples = waveform if samples.shape[0] <= self.min_length: return [waveform] - rms_list = get_rms( - y=samples, frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) + rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) 
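load_cudnn() and load_nvrtc(), added to my_utils.py above, pre-load the cuDNN and NVRTC shared libraries bundled with the installed torch wheel (os.add_dll_directory plus ctypes.CDLL on Windows, RTLD_GLOBAL dlopen on Linux). A hedged sketch of how an entry script might call them; invoking them before other CUDA-dependent imports is an assumption about intent, not something the patch itself mandates:

# Sketch: resolve cuDNN/NVRTC from torch's bundled copies early in an entry script.
from tools.my_utils import load_cudnn, load_nvrtc

load_cudnn()   # no-op if CUDA is unavailable; otherwise dlopens cudnn_cnn*.dll / libcudnn_cnn*.so*
load_nvrtc()   # same pattern for nvrtc*.dll / libnvrtc*.so*
# Imports that follow can now resolve those libraries from torch's lib directory.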
sil_tags = [] silence_start = None clip_start = 0 @@ -97,10 +87,7 @@ class Slicer: continue # Clear recorded silence start if interval is not enough or clip is too short is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) + need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length if not is_leading_silence and not need_slice_middle: silence_start = None continue @@ -113,21 +100,10 @@ class Slicer: sil_tags.append((pos, pos)) clip_start = pos elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() + pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin() pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) + pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept if silence_start == 0: sil_tags.append((0, pos_r)) clip_start = pos_r @@ -135,17 +111,8 @@ class Slicer: sil_tags.append((min(pos_l, pos), max(pos_r, pos))) clip_start = max(pos_r, pos) else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) + pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept if silence_start == 0: sil_tags.append((0, pos_r)) else: @@ -154,28 +121,33 @@ class Slicer: silence_start = None # Deal with trailing silence. total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): + if silence_start is not None and total_frames - silence_start >= self.min_interval: silence_end = min(total_frames, silence_start + self.max_sil_kept) pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start sil_tags.append((pos, total_frames + 1)) # Apply and return slices. 
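The Slicer constructor shown above converts its dB threshold and millisecond hop into a linear amplitude and a per-frame sample count before the silence scan. A small worked example of that arithmetic (the parameter values are illustrative, not defaults required by the patch):

# Worked example of the conversions in Slicer.__init__ (illustrative values).
sr = 32000            # sample rate slice_audio.py uses for the long audio
threshold_db = -34.0  # RMS below this level is treated as a silence candidate
hop_size_ms = 10

threshold_linear = 10 ** (threshold_db / 20.0)      # ≈ 0.0200 linear amplitude
hop_size_samples = round(sr * hop_size_ms / 1000)   # 320 samples between RMS frames
print(f"threshold={threshold_linear:.4f}, hop={hop_size_samples} samples")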
####音频+起始时间+终止时间 if len(sil_tags) == 0: - return [[waveform,0,int(total_frames*self.hop_size)]] + return [[waveform, 0, int(total_frames * self.hop_size)]] else: chunks = [] if sil_tags[0][0] > 0: - chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)]) + chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)]) for i in range(len(sil_tags) - 1): chunks.append( - [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)] + [ + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), + int(sil_tags[i][1] * self.hop_size), + int(sil_tags[i + 1][0] * self.hop_size), + ] ) if sil_tags[-1][1] < total_frames: chunks.append( - [self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)] + [ + self._apply_slice(waveform, sil_tags[-1][1], total_frames), + int(sil_tags[-1][1] * self.hop_size), + int(total_frames * self.hop_size), + ] ) return chunks @@ -189,9 +161,7 @@ def main(): parser = ArgumentParser() parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument( - "--out", type=str, help="Output directory of the sliced audio clips" - ) + parser.add_argument("--out", type=str, help="Output directory of the sliced audio clips") parser.add_argument( "--db_thresh", type=float, @@ -249,8 +219,7 @@ def main(): soundfile.write( os.path.join( out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + "%s_%d.wav" % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), ), chunk, sr, diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py index 9ae6c7c4..4244449f 100644 --- a/tools/subfix_webui.py +++ b/tools/subfix_webui.py @@ -1,4 +1,4 @@ -import argparse,os +import argparse import copy import json import os @@ -6,11 +6,13 @@ import uuid try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... -import librosa + analytics.version_check = lambda: None +except: + ... 
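Taken together, the hunks above have slice_audio.py feed load_audio output into Slicer.slice and write out each returned [chunk, start, end] triple, where start and end are frame indices already multiplied by hop_size (i.e. sample offsets). A condensed sketch of that flow; file paths, slicing parameters, and the tools.slicer2 import path are placeholders/assumptions:

# Condensed sketch of the slice_audio.py flow above (paths, parameters and import layout are assumptions).
import os
import numpy as np
from scipy.io import wavfile

from tools.my_utils import load_audio
from tools.slicer2 import Slicer   # slice_audio.py itself imports it as `from slicer2 import Slicer`

slicer = Slicer(sr=32000, threshold=-34, min_length=4000, min_interval=300, hop_size=10, max_sil_kept=500)
audio = load_audio("input.wav", 32000)
name = os.path.basename("input.wav")
os.makedirs("out", exist_ok=True)
for chunk, start, end in slicer.slice(audio):   # start/end: sample offsets at 32 kHz
    peak = np.abs(chunk).max()
    if peak > 1:                                # same clipping guard as slice_audio.py
        chunk /= peak
    wavfile.write("out/%s_%010d_%010d.wav" % (name, start, end), 32000, (chunk * 32767).astype(np.int16))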
+ import gradio as gr +import librosa import numpy as np import soundfile @@ -33,15 +35,10 @@ def reload_data(index, batch): g_index = index global g_batch g_batch = batch - datas = g_data_json[index:index+batch] + datas = g_data_json[index : index + batch] output = [] for d in datas: - output.append( - { - g_json_key_text: d[g_json_key_text], - g_json_key_path: d[g_json_key_path] - } - ) + output.append({g_json_key_text: d[g_json_key_text], g_json_key_path: d[g_json_key_path]}) return output @@ -50,17 +47,13 @@ def b_change_index(index, batch): g_index, g_batch = index, batch datas = reload_data(index, batch) output = [] - for i , _ in enumerate(datas): + for i, _ in enumerate(datas): output.append( # gr.Textbox( # label=f"Text {i+index}", # value=_[g_json_key_text]#text # ) - { - "__type__":"update", - "label":f"Text {i+index}", - "value":_[g_json_key_text] - } + {"__type__": "update", "label": f"Text {i + index}", "value": _[g_json_key_text]} ) for _ in range(g_batch - len(datas)): output.append( @@ -68,11 +61,7 @@ def b_change_index(index, batch): # label=f"Text", # value="" # ) - { - "__type__": "update", - "label": f"Text", - "value": "" - } + {"__type__": "update", "label": "Text", "value": ""} ) for _ in datas: output.append(_[g_json_key_path]) @@ -86,7 +75,7 @@ def b_change_index(index, batch): def b_next_index(index, batch): b_save_file() if (index + batch) <= g_max_json_index: - return index + batch , *b_change_index(index + batch, batch) + return index + batch, *b_change_index(index + batch, batch) else: return index, *b_change_index(index, batch) @@ -94,7 +83,7 @@ def b_next_index(index, batch): def b_previous_index(index, batch): b_save_file() if (index - batch) >= 0: - return index - batch , *b_change_index(index - batch, batch) + return index - batch, *b_change_index(index - batch, batch) else: return 0, *b_change_index(0, batch) @@ -104,8 +93,8 @@ def b_submit_change(*text_list): change = False for i, new_text in enumerate(text_list): if g_index + i <= g_max_json_index: - new_text = new_text.strip()+' ' - if (g_data_json[g_index + i][g_json_key_text] != new_text): + new_text = new_text.strip() + " " + if g_data_json[g_index + i][g_json_key_text] != new_text: g_data_json[g_index + i][g_json_key_text] = new_text change = True if change: @@ -119,18 +108,22 @@ def b_delete_audio(*checkbox_list): change = False for i, checkbox in reversed(list(enumerate(checkbox_list))): if g_index + i < len(g_data_json): - if (checkbox == True): + if checkbox == True: g_data_json.pop(g_index + i) change = True - - g_max_json_index = len(g_data_json)-1 + + g_max_json_index = len(g_data_json) - 1 if g_index > g_max_json_index: g_index = g_max_json_index g_index = g_index if g_index >= 0 else 0 if change: b_save_file() # return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) - return {"value":g_index,"__type__":"update","maximum":(g_max_json_index if g_max_json_index>=0 else 0)},*b_change_index(g_index, g_batch) + return { + "value": g_index, + "__type__": "update", + "maximum": (g_max_json_index if g_max_json_index >= 0 else 0), + }, *b_change_index(g_index, g_batch) def b_invert_selection(*checkbox_list): @@ -143,18 +136,18 @@ def get_next_path(filename): base_name = os.path.splitext(os.path.basename(filename))[0] for i in range(100): new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") - if not os.path.exists(new_path) : + if not os.path.exists(new_path): return new_path - return os.path.join(base_dir, 
f'{str(uuid.uuid4())}.wav') + return os.path.join(base_dir, f"{str(uuid.uuid4())}.wav") def b_audio_split(audio_breakpoint, *checkbox_list): - global g_data_json , g_max_json_index + global g_data_json, g_max_json_index checked_index = [] for i, checkbox in enumerate(checkbox_list): - if (checkbox == True and g_index+i < len(g_data_json)): + if checkbox == True and g_index + i < len(g_data_json): checked_index.append(g_index + i) - if len(checked_index) == 1 : + if len(checked_index) == 1: index = checked_index[0] audio_json = copy.deepcopy(g_data_json[index]) path = audio_json[g_json_key_path] @@ -162,7 +155,7 @@ def b_audio_split(audio_breakpoint, *checkbox_list): audio_maxframe = len(data) break_frame = int(audio_breakpoint * sample_rate) - if (break_frame >= 1 and break_frame < audio_maxframe): + if break_frame >= 1 and break_frame < audio_maxframe: audio_first = data[0:break_frame] audio_second = data[break_frame:] nextpath = get_next_path(path) @@ -174,19 +167,20 @@ def b_audio_split(audio_breakpoint, *checkbox_list): g_max_json_index = len(g_data_json) - 1 # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch) + return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch) + def b_merge_audio(interval_r, *checkbox_list): - global g_data_json , g_max_json_index + global g_data_json, g_max_json_index b_save_file() checked_index = [] audios_path = [] audios_text = [] for i, checkbox in enumerate(checkbox_list): - if (checkbox == True and g_index+i < len(g_data_json)): + if checkbox == True and g_index + i < len(g_data_json): checked_index.append(g_index + i) - - if (len(checked_index)>1): + + if len(checked_index) > 1: for i in checked_index: audios_path.append(g_data_json[i][g_json_key_path]) audios_text.append(g_data_json[i][g_json_key_text]) @@ -202,7 +196,7 @@ def b_merge_audio(interval_r, *checkbox_list): for i, path in enumerate(audios_path): data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) l_sample_rate = sample_rate - if (i > 0): + if i > 0: silence = np.zeros(int(l_sample_rate * interval_r)) audio_list.append(silence) @@ -213,32 +207,32 @@ def b_merge_audio(interval_r, *checkbox_list): soundfile.write(base_path, audio_concat, l_sample_rate) b_save_file() - + g_max_json_index = len(g_data_json) - 1 - + # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch) + return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch) def b_save_json(): - with open(g_load_file,'w', encoding="utf-8") as file: + with open(g_load_file, "w", encoding="utf-8") as file: for data in g_data_json: - file.write(f'{json.dumps(data, ensure_ascii = False)}\n') + file.write(f"{json.dumps(data, ensure_ascii=False)}\n") def b_save_list(): - with open(g_load_file,'w', encoding="utf-8") as file: + with open(g_load_file, "w", encoding="utf-8") as file: for data in g_data_json: wav_path = data["wav_path"] speaker_name = data["speaker_name"] language = data["language"] text = data["text"] - file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') + file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip() + "\n") def b_load_json(): global g_data_json, g_max_json_index - with 
open(g_load_file, 'r', encoding="utf-8") as file: + with open(g_load_file, "r", encoding="utf-8") as file: g_data_json = file.readlines() g_data_json = [json.loads(line) for line in g_data_json] g_max_json_index = len(g_data_json) - 1 @@ -246,19 +240,14 @@ def b_load_json(): def b_load_list(): global g_data_json, g_max_json_index - with open(g_load_file, 'r', encoding="utf-8") as source: + with open(g_load_file, "r", encoding="utf-8") as source: data_list = source.readlines() for _ in data_list: - data = _.split('|') - if (len(data) == 4): + data = _.split("|") + if len(data) == 4: wav_path, speaker_name, language, text = data g_data_json.append( - { - 'wav_path':wav_path, - 'speaker_name':speaker_name, - 'language':language, - 'text':text.strip() - } + {"wav_path": wav_path, "speaker_name": speaker_name, "language": language, "text": text.strip()} ) else: print("error line:", data) @@ -283,17 +272,17 @@ def set_global(load_json, load_list, json_key_text, json_key_path, batch): global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch g_batch = int(batch) - - if (load_json != "None"): + + if load_json != "None": g_load_format = "json" g_load_file = load_json - elif (load_list != "None"): + elif load_list != "None": g_load_format = "list" g_load_file = load_list else: g_load_format = "list" g_load_file = "demo.list" - + g_json_key_text = json_key_text g_json_key_path = json_key_path @@ -301,21 +290,20 @@ def set_global(load_json, load_list, json_key_text, json_key_path, batch): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Process some integers.') - parser.add_argument('--load_json', default="None", help='source file, like demo.json') - parser.add_argument('--is_share', default="False", help='whether webui is_share=True') - parser.add_argument('--load_list', default="None", help='source file, like demo.list') - parser.add_argument('--webui_port_subfix', default=9871, help='source file, like demo.list') - parser.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text') - parser.add_argument('--json_key_path', default="wav_path", help='the path key name in json, Default: wav_path') - parser.add_argument('--g_batch', default=10, help='max number g_batch wav to display, Default: 10') + parser = argparse.ArgumentParser(description="Process some integers.") + parser.add_argument("--load_json", default="None", help="source file, like demo.json") + parser.add_argument("--is_share", default="False", help="whether webui is_share=True") + parser.add_argument("--load_list", default="None", help="source file, like demo.list") + parser.add_argument("--webui_port_subfix", default=9871, help="source file, like demo.list") + parser.add_argument("--json_key_text", default="text", help="the text key name in json, Default: text") + parser.add_argument("--json_key_path", default="wav_path", help="the path key name in json, Default: wav_path") + parser.add_argument("--g_batch", default=10, help="max number g_batch wav to display, Default: 10") args = parser.parse_args() set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch) - - with gr.Blocks() as demo: + with gr.Blocks(analytics_enabled=False) as demo: with gr.Row(): btn_change_index = gr.Button("Change Index") btn_submit_change = gr.Button("Submit Text") @@ -323,79 +311,50 @@ if __name__ == "__main__": btn_delete_audio = gr.Button("Delete Audio") btn_previous_index = gr.Button("Previous Index") btn_next_index = gr.Button("Next 
Index") - + with gr.Row(): - index_slider = gr.Slider( - minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3 - ) + index_slider = gr.Slider(minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3) splitpoint_slider = gr.Slider( - minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3 + minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3 ) btn_audio_split = gr.Button("Split Audio", scale=1) btn_save_json = gr.Button("Save File", visible=True, scale=1) btn_invert_selection = gr.Button("Invert Selection", scale=1) - + with gr.Row(): with gr.Column(): - for _ in range(0,g_batch): + for _ in range(0, g_batch): with gr.Row(): - text = gr.Textbox( - label = "Text", - visible = True, - scale=5 - ) - audio_output = gr.Audio( - label="Output Audio", - visible = True, - scale=5 - ) - audio_check = gr.Checkbox( - label="Yes", - show_label = True, - info = "Choose Audio", - scale=1 - ) + text = gr.Textbox(label="Text", visible=True, scale=5) + audio_output = gr.Audio(label="Output Audio", visible=True, scale=5) + audio_check = gr.Checkbox(label="Yes", show_label=True, info="Choose Audio", scale=1) g_text_list.append(text) g_audio_list.append(audio_output) g_checkbox_list.append(audio_check) - - with gr.Row(): batchsize_slider = gr.Slider( - minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False - ) - interval_slider = gr.Slider( - minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3 + minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False ) + interval_slider = gr.Slider(minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3) btn_theme_dark = gr.Button("Light Theme", link="?__theme=light", scale=1) btn_theme_light = gr.Button("Dark Theme", link="?__theme=dark", scale=1) - + btn_change_index.click( b_change_index, inputs=[ index_slider, batchsize_slider, ], - outputs=[ - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[*g_text_list, *g_audio_list, *g_checkbox_list], ) - btn_submit_change.click( b_submit_change, inputs=[ *g_text_list, ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_previous_index.click( @@ -404,82 +363,39 @@ if __name__ == "__main__": index_slider, batchsize_slider, ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) - + btn_next_index.click( b_next_index, inputs=[ index_slider, batchsize_slider, ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_delete_audio.click( b_delete_audio, - inputs=[ - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] + inputs=[*g_checkbox_list], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_merge_audio.click( b_merge_audio, - inputs=[ - interval_slider, - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] + inputs=[interval_slider, *g_checkbox_list], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_audio_split.click( b_audio_split, - inputs=[ - splitpoint_slider, - *g_checkbox_list - ], - outputs=[ - 
index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] + inputs=[splitpoint_slider, *g_checkbox_list], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) - btn_invert_selection.click( - b_invert_selection, - inputs=[ - *g_checkbox_list - ], - outputs=[ - *g_checkbox_list - ] - ) + btn_invert_selection.click(b_invert_selection, inputs=[*g_checkbox_list], outputs=[*g_checkbox_list]) - btn_save_json.click( - b_save_file - ) + btn_save_json.click(b_save_file) demo.load( b_change_index, @@ -487,17 +403,13 @@ if __name__ == "__main__": index_slider, batchsize_slider, ], - outputs=[ - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[*g_text_list, *g_audio_list, *g_checkbox_list], ) - + demo.launch( server_name="0.0.0.0", inbrowser=True, - quiet=True, + # quiet=True, share=eval(args.is_share), - server_port=int(args.webui_port_subfix) - ) \ No newline at end of file + server_port=int(args.webui_port_subfix), + ) diff --git a/tools/uvr5/bs_roformer/attend.py b/tools/uvr5/bs_roformer/attend.py index 2e3555a9..29dad18e 100644 --- a/tools/uvr5/bs_roformer/attend.py +++ b/tools/uvr5/bs_roformer/attend.py @@ -7,23 +7,22 @@ import torch.nn.functional as F def exists(val): return val is not None + def default(v, d): return v if exists(v) else d + class Attend(nn.Module): - def __init__( - self, - dropout = 0., - flash = False, - scale = None - ): + def __init__(self, dropout=0.0, flash=False, scale=None): super().__init__() self.scale = scale self.dropout = dropout self.attn_dropout = nn.Dropout(dropout) self.flash = flash - assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above' + assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), ( + "in order to use flash attention, you must be using pytorch 2.0 or above" + ) def flash_attn(self, q, k, v): # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device @@ -34,7 +33,7 @@ class Attend(nn.Module): # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - return F.scaled_dot_product_attention(q, k, v,dropout_p = self.dropout if self.training else 0.) 
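subfix_webui.py, reformatted above, round-trips annotation files in which every line is wav_path|speaker_name|language|text (see b_save_list / b_load_list). A minimal sketch of writing and re-reading one such .list line; the file name and field values are placeholders:

# Sketch: the pipe-separated .list format handled by b_save_list / b_load_list above.
entry = {
    "wav_path": "output/slicer_opt/example_0000000000_0000123456.wav",  # placeholder
    "speaker_name": "speaker1",
    "language": "ZH",
    "text": "你好。",
}

with open("demo.list", "w", encoding="utf-8") as f:
    f.write("{wav_path}|{speaker_name}|{language}|{text}".format(**entry).strip() + "\n")

with open("demo.list", "r", encoding="utf-8") as f:
    wav_path, speaker_name, language, text = f.readline().split("|")
    print(wav_path, speaker_name, language, text.strip())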
+ return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0) def forward(self, q, k, v): """ @@ -54,7 +53,7 @@ class Attend(nn.Module): # similarity - sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale + sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale # attention @@ -63,6 +62,6 @@ class Attend(nn.Module): # aggregate values - out = einsum(f"b h i j, b h j d -> b h i d", attn, v) + out = einsum("b h i j, b h j d -> b h i d", attn, v) return out diff --git a/tools/uvr5/bs_roformer/bs_roformer.py b/tools/uvr5/bs_roformer/bs_roformer.py index 45c46d51..8c9d1f30 100644 --- a/tools/uvr5/bs_roformer/bs_roformer.py +++ b/tools/uvr5/bs_roformer/bs_roformer.py @@ -1,14 +1,14 @@ from functools import partial import torch -from torch import nn, einsum, Tensor +from torch import nn from torch.nn import Module, ModuleList import torch.nn.functional as F from bs_roformer.attend import Attend from torch.utils.checkpoint import checkpoint -from typing import Tuple, Optional, List, Callable +from typing import Tuple, Optional, Callable # from beartype.typing import Tuple, Optional, List, Callable # from beartype import beartype @@ -19,6 +19,7 @@ from einops.layers.torch import Rearrange # helper functions + def exists(val): return val is not None @@ -37,14 +38,15 @@ def unpack_one(t, ps, pattern): # norm + def l2norm(t): - return F.normalize(t, dim = -1, p = 2) + return F.normalize(t, dim=-1, p=2) class RMSNorm(Module): def __init__(self, dim): super().__init__() - self.scale = dim ** 0.5 + self.scale = dim**0.5 self.gamma = nn.Parameter(torch.ones(dim)) def forward(self, x): @@ -53,13 +55,9 @@ class RMSNorm(Module): # attention + class FeedForward(Module): - def __init__( - self, - dim, - mult=4, - dropout=0. - ): + def __init__(self, dim, mult=4, dropout=0.0): super().__init__() dim_inner = int(dim * mult) self.net = nn.Sequential( @@ -68,7 +66,7 @@ class FeedForward(Module): nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), - nn.Dropout(dropout) + nn.Dropout(dropout), ) def forward(self, x): @@ -76,18 +74,10 @@ class FeedForward(Module): class Attention(Module): - def __init__( - self, - dim, - heads=8, - dim_head=64, - dropout=0., - rotary_embed=None, - flash=True - ): + def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True): super().__init__() self.heads = heads - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 dim_inner = heads * dim_head self.rotary_embed = rotary_embed @@ -99,15 +89,12 @@ class Attention(Module): self.to_gates = nn.Linear(dim, heads) - self.to_out = nn.Sequential( - nn.Linear(dim_inner, dim, bias=False), - nn.Dropout(dropout) - ) + self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)) def forward(self, x): x = self.norm(x) - q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads) + q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads) if exists(self.rotary_embed): q = self.rotary_embed.rotate_queries_or_keys(q) @@ -116,9 +103,9 @@ class Attention(Module): out = self.attend(q, k, v) gates = self.to_gates(x) - out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid() + out = out * rearrange(gates, "b n h -> b h n 1").sigmoid() - out = rearrange(out, 'b h n d -> b n (h d)') + out = rearrange(out, "b h n d -> b n (h d)") return self.to_out(out) @@ -128,42 +115,22 @@ class LinearAttention(Module): """ # @beartype - def __init__( - self, - *, - dim, - dim_head=32, - heads=8, - 
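The Attend module reformatted above takes pre-projected q/k/v tensors in (batch, heads, seq, head_dim) layout and runs either F.scaled_dot_product_attention (flash=True, PyTorch >= 2.0) or the explicit einsum path. A small smoke-test sketch; the tools.uvr5.bs_roformer.attend import path is an assumption — inside the project the module is imported as bs_roformer.attend with tools/uvr5 on sys.path:

# Sketch: smoke-testing Attend on random tensors (shapes follow its b h n d convention).
import torch
from tools.uvr5.bs_roformer.attend import Attend  # assumed import path; see note above

attend = Attend(dropout=0.0, flash=False)  # flash=True routes through F.scaled_dot_product_attention (torch >= 2.0)
q = torch.randn(1, 8, 128, 64)             # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 8, 128, 64)
v = torch.randn(1, 8, 128, 64)
print(attend(q, k, v).shape)               # torch.Size([1, 8, 128, 64])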
scale=8, - flash=False, - dropout=0. - ): + def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0): super().__init__() dim_inner = dim_head * heads self.norm = RMSNorm(dim) self.to_qkv = nn.Sequential( - nn.Linear(dim, dim_inner * 3, bias=False), - Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads) + nn.Linear(dim, dim_inner * 3, bias=False), Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads) ) self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) - self.attend = Attend( - scale=scale, - dropout=dropout, - flash=flash - ) + self.attend = Attend(scale=scale, dropout=dropout, flash=flash) - self.to_out = nn.Sequential( - Rearrange('b h d n -> b n (h d)'), - nn.Linear(dim_inner, dim, bias=False) - ) + self.to_out = nn.Sequential(Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)) - def forward( - self, - x - ): + def forward(self, x): x = self.norm(x) q, k, v = self.to_qkv(x) @@ -178,19 +145,19 @@ class LinearAttention(Module): class Transformer(Module): def __init__( - self, - *, - dim, - depth, - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - ff_mult=4, - norm_output=True, - rotary_embed=None, - flash_attn=True, - linear_attn=False + self, + *, + dim, + depth, + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + ff_mult=4, + norm_output=True, + rotary_embed=None, + flash_attn=True, + linear_attn=False, ): super().__init__() self.layers = ModuleList([]) @@ -199,18 +166,20 @@ class Transformer(Module): if linear_attn: attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn) else: - attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, - rotary_embed=rotary_embed, flash=flash_attn) + attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + rotary_embed=rotary_embed, + flash=flash_attn, + ) - self.layers.append(ModuleList([ - attn, - FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - ])) + self.layers.append(ModuleList([attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)])) self.norm = RMSNorm(dim) if norm_output else nn.Identity() def forward(self, x): - for attn, ff in self.layers: x = attn(x) + x x = ff(x) + x @@ -220,22 +189,16 @@ class Transformer(Module): # bandsplit module + class BandSplit(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...] 
- ): + def __init__(self, dim, dim_inputs: Tuple[int, ...]): super().__init__() self.dim_inputs = dim_inputs self.to_features = ModuleList([]) for dim_in in dim_inputs: - net = nn.Sequential( - RMSNorm(dim_in), - nn.Linear(dim_in, dim) - ) + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) self.to_features.append(net) @@ -250,13 +213,7 @@ class BandSplit(Module): return torch.stack(outs, dim=-2) -def MLP( - dim_in, - dim_out, - dim_hidden=None, - depth=1, - activation=nn.Tanh -): +def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh): dim_hidden = default(dim_hidden, dim_in) net = [] @@ -277,13 +234,7 @@ def MLP( class MaskEstimator(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...], - depth, - mlp_expansion_factor=4 - ): + def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4): super().__init__() self.dim_inputs = dim_inputs self.to_freqs = ModuleList([]) @@ -292,10 +243,7 @@ class MaskEstimator(Module): for dim_in in dim_inputs: net = [] - mlp = nn.Sequential( - MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), - nn.GLU(dim=-1) - ) + mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)) self.to_freqs.append(mlp) @@ -314,53 +262,106 @@ class MaskEstimator(Module): # main class DEFAULT_FREQS_PER_BANDS = ( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 12, 12, 12, 12, 12, 12, 12, 12, - 24, 24, 24, 24, 24, 24, 24, 24, - 48, 48, 48, 48, 48, 48, 48, 48, - 128, 129, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 12, + 12, + 12, + 12, + 12, + 12, + 12, + 12, + 24, + 24, + 24, + 24, + 24, + 24, + 24, + 24, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 128, + 129, ) class BSRoformer(Module): - # @beartype def __init__( - self, - dim, - *, - depth, - stereo=False, - num_stems=1, - time_transformer_depth=2, - freq_transformer_depth=2, - linear_transformer_depth=0, - freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS, - # in the paper, they divide into ~60 bands, test with 1 for starters - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - flash_attn=True, - dim_freqs_in=1025, - stft_n_fft=2048, - stft_hop_length=512, - # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction - stft_win_length=2048, - stft_normalized=False, - stft_window_fn: Optional[Callable] = None, - mask_estimator_depth=2, - multi_stft_resolution_loss_weight=1., - multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), - multi_stft_hop_size=147, - multi_stft_normalized=False, - multi_stft_window_fn: Callable = torch.hann_window, - mlp_expansion_factor=4, - use_torch_checkpoint=False, - skip_connection=False, + self, + dim, + *, + depth, + stereo=False, + num_stems=1, + time_transformer_depth=2, + freq_transformer_depth=2, + linear_transformer_depth=0, + freqs_per_bands: Tuple[int, ...] 
= DEFAULT_FREQS_PER_BANDS, + # in the paper, they divide into ~60 bands, test with 1 for starters + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + flash_attn=True, + dim_freqs_in=1025, + stft_n_fft=2048, + stft_hop_length=512, + # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction + stft_win_length=2048, + stft_normalized=False, + stft_window_fn: Optional[Callable] = None, + mask_estimator_depth=2, + multi_stft_resolution_loss_weight=1.0, + multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), + multi_stft_hop_size=147, + multi_stft_normalized=False, + multi_stft_window_fn: Callable = torch.hann_window, + mlp_expansion_factor=4, + use_torch_checkpoint=False, + skip_connection=False, ): super().__init__() @@ -379,7 +380,7 @@ class BSRoformer(Module): attn_dropout=attn_dropout, ff_dropout=ff_dropout, flash_attn=flash_attn, - norm_output=False + norm_output=False, ) time_rotary_embed = RotaryEmbedding(dim=dim_head) @@ -400,26 +401,23 @@ class BSRoformer(Module): self.final_norm = RMSNorm(dim) self.stft_kwargs = dict( - n_fft=stft_n_fft, - hop_length=stft_hop_length, - win_length=stft_win_length, - normalized=stft_normalized + n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized ) self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length) - freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_win_length), return_complex=True).shape[1] + freqs = torch.stft( + torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_win_length), return_complex=True + ).shape[1] assert len(freqs_per_bands) > 1 - assert sum( - freqs_per_bands) == freqs, f'the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}' + assert sum(freqs_per_bands) == freqs, ( + f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}" + ) freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands) - self.band_split = BandSplit( - dim=dim, - dim_inputs=freqs_per_bands_with_complex - ) + self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex) self.mask_estimators = nn.ModuleList([]) @@ -440,17 +438,9 @@ class BSRoformer(Module): self.multi_stft_n_fft = stft_n_fft self.multi_stft_window_fn = multi_stft_window_fn - self.multi_stft_kwargs = dict( - hop_length=multi_stft_hop_size, - normalized=multi_stft_normalized - ) + self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized) - def forward( - self, - raw_audio, - target=None, - return_loss_breakdown=False - ): + def forward(self, raw_audio, target=None, return_loss_breakdown=False): """ einops @@ -469,14 +459,16 @@ class BSRoformer(Module): x_is_mps = True if device.type == "mps" else False if raw_audio.ndim == 2: - raw_audio = rearrange(raw_audio, 'b t -> b 1 t') + raw_audio = rearrange(raw_audio, "b t -> b 1 t") channels = raw_audio.shape[1] - assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)' + assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), ( + "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). 
also need to be False if mono (channel dimension of 1)" + ) # to stft - raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t') + raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t") stft_window = self.stft_window_fn(device=device) @@ -485,16 +477,21 @@ class BSRoformer(Module): try: stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True) except: - stft_repr = torch.stft(raw_audio.cpu() if x_is_mps else raw_audio, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=True).to(device) + stft_repr = torch.stft( + raw_audio.cpu() if x_is_mps else raw_audio, + **self.stft_kwargs, + window=stft_window.cpu() if x_is_mps else stft_window, + return_complex=True, + ).to(device) stft_repr = torch.view_as_real(stft_repr) - stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c') + stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c") # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting - stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c') + stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c") - x = rearrange(stft_repr, 'b f t c -> b t (f c)') + x = rearrange(stft_repr, "b f t c -> b t (f c)") if self.use_torch_checkpoint: x = checkpoint(self.band_split, x, use_reentrant=False) @@ -505,16 +502,15 @@ class BSRoformer(Module): store = [None] * len(self.layers) for i, transformer_block in enumerate(self.layers): - if len(transformer_block) == 3: linear_transformer, time_transformer, freq_transformer = transformer_block - x, ft_ps = pack([x], 'b * d') + x, ft_ps = pack([x], "b * d") if self.use_torch_checkpoint: x = checkpoint(linear_transformer, x, use_reentrant=False) else: x = linear_transformer(x) - x, = unpack(x, ft_ps, 'b * d') + (x,) = unpack(x, ft_ps, "b * d") else: time_transformer, freq_transformer = transformer_block @@ -523,24 +519,24 @@ class BSRoformer(Module): for j in range(i): x = x + store[j] - x = rearrange(x, 'b t f d -> b f t d') - x, ps = pack([x], '* t d') + x = rearrange(x, "b t f d -> b f t d") + x, ps = pack([x], "* t d") if self.use_torch_checkpoint: x = checkpoint(time_transformer, x, use_reentrant=False) else: x = time_transformer(x) - x, = unpack(x, ps, '* t d') - x = rearrange(x, 'b f t d -> b t f d') - x, ps = pack([x], '* f d') + (x,) = unpack(x, ps, "* t d") + x = rearrange(x, "b f t d -> b t f d") + x, ps = pack([x], "* f d") if self.use_torch_checkpoint: x = checkpoint(freq_transformer, x, use_reentrant=False) else: x = freq_transformer(x) - x, = unpack(x, ps, '* f d') + (x,) = unpack(x, ps, "* f d") if self.skip_connection: store[i] = x @@ -553,11 +549,11 @@ class BSRoformer(Module): mask = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators], dim=1) else: mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) - mask = rearrange(mask, 'b n t (f c) -> b n f t c', c=2) + mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2) # modulate frequency representation - stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c') + stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c") # complex number multiplication @@ -568,18 +564,26 @@ class BSRoformer(Module): # istft - stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels) + stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels) # same as torch.stft() fix for MacOS MPS above try: - recon_audio = 
torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=raw_audio.shape[-1]) + recon_audio = torch.istft( + stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=raw_audio.shape[-1] + ) except: - recon_audio = torch.istft(stft_repr.cpu() if x_is_mps else stft_repr, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=False, length=raw_audio.shape[-1]).to(device) + recon_audio = torch.istft( + stft_repr.cpu() if x_is_mps else stft_repr, + **self.stft_kwargs, + window=stft_window.cpu() if x_is_mps else stft_window, + return_complex=False, + length=raw_audio.shape[-1], + ).to(device) - recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', s=self.audio_channels, n=num_stems) + recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems) if num_stems == 1: - recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t') + recon_audio = rearrange(recon_audio, "b 1 s t -> b s t") # if a target is passed in, calculate loss for learning @@ -590,13 +594,13 @@ class BSRoformer(Module): assert target.ndim == 4 and target.shape[1] == self.num_stems if target.ndim == 2: - target = rearrange(target, '... t -> ... 1 t') + target = rearrange(target, "... t -> ... 1 t") - target = target[..., :recon_audio.shape[-1]] # protect against lost length on istft + target = target[..., : recon_audio.shape[-1]] # protect against lost length on istft loss = F.l1_loss(recon_audio, target) - multi_stft_resolution_loss = 0. + multi_stft_resolution_loss = 0.0 for window_size in self.multi_stft_resolutions_window_sizes: res_stft_kwargs = dict( @@ -607,8 +611,8 @@ class BSRoformer(Module): **self.multi_stft_kwargs, ) - recon_Y = torch.stft(rearrange(recon_audio, '... s t -> (... s) t'), **res_stft_kwargs) - target_Y = torch.stft(rearrange(target, '... s t -> (... s) t'), **res_stft_kwargs) + recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs) + target_Y = torch.stft(rearrange(target, "... s t -> (... 
s) t"), **res_stft_kwargs) multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y) @@ -619,4 +623,4 @@ class BSRoformer(Module): if not return_loss_breakdown: return total_loss - return total_loss, (loss, multi_stft_resolution_loss) \ No newline at end of file + return total_loss, (loss, multi_stft_resolution_loss) diff --git a/tools/uvr5/bs_roformer/mel_band_roformer.py b/tools/uvr5/bs_roformer/mel_band_roformer.py index 7b6c4850..78a39040 100644 --- a/tools/uvr5/bs_roformer/mel_band_roformer.py +++ b/tools/uvr5/bs_roformer/mel_band_roformer.py @@ -1,14 +1,14 @@ from functools import partial import torch -from torch import nn, einsum, Tensor +from torch import nn from torch.nn import Module, ModuleList import torch.nn.functional as F from bs_roformer.attend import Attend from torch.utils.checkpoint import checkpoint -from typing import Tuple, Optional, List, Callable +from typing import Tuple, Optional, Callable # from beartype.typing import Tuple, Optional, List, Callable # from beartype import beartype @@ -22,6 +22,7 @@ from librosa import filters # helper functions + def exists(val): return val is not None @@ -38,9 +39,9 @@ def unpack_one(t, ps, pattern): return unpack(t, ps, pattern)[0] -def pad_at_dim(t, pad, dim=-1, value=0.): - dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1) - zeros = ((0, 0) * dims_from_right) +def pad_at_dim(t, pad, dim=-1, value=0.0): + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right return F.pad(t, (*zeros, *pad), value=value) @@ -50,10 +51,11 @@ def l2norm(t): # norm + class RMSNorm(Module): def __init__(self, dim): super().__init__() - self.scale = dim ** 0.5 + self.scale = dim**0.5 self.gamma = nn.Parameter(torch.ones(dim)) def forward(self, x): @@ -62,13 +64,9 @@ class RMSNorm(Module): # attention + class FeedForward(Module): - def __init__( - self, - dim, - mult=4, - dropout=0. 
- ): + def __init__(self, dim, mult=4, dropout=0.0): super().__init__() dim_inner = int(dim * mult) self.net = nn.Sequential( @@ -77,7 +75,7 @@ class FeedForward(Module): nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), - nn.Dropout(dropout) + nn.Dropout(dropout), ) def forward(self, x): @@ -85,18 +83,10 @@ class FeedForward(Module): class Attention(Module): - def __init__( - self, - dim, - heads=8, - dim_head=64, - dropout=0., - rotary_embed=None, - flash=True - ): + def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True): super().__init__() self.heads = heads - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 dim_inner = heads * dim_head self.rotary_embed = rotary_embed @@ -108,15 +98,12 @@ class Attention(Module): self.to_gates = nn.Linear(dim, heads) - self.to_out = nn.Sequential( - nn.Linear(dim_inner, dim, bias=False), - nn.Dropout(dropout) - ) + self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)) def forward(self, x): x = self.norm(x) - q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads) + q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads) if exists(self.rotary_embed): q = self.rotary_embed.rotate_queries_or_keys(q) @@ -125,9 +112,9 @@ class Attention(Module): out = self.attend(q, k, v) gates = self.to_gates(x) - out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid() + out = out * rearrange(gates, "b n h -> b h n 1").sigmoid() - out = rearrange(out, 'b h n d -> b n (h d)') + out = rearrange(out, "b h n d -> b n (h d)") return self.to_out(out) @@ -137,42 +124,22 @@ class LinearAttention(Module): """ # @beartype - def __init__( - self, - *, - dim, - dim_head=32, - heads=8, - scale=8, - flash=False, - dropout=0. 
- ): + def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0): super().__init__() dim_inner = dim_head * heads self.norm = RMSNorm(dim) self.to_qkv = nn.Sequential( - nn.Linear(dim, dim_inner * 3, bias=False), - Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads) + nn.Linear(dim, dim_inner * 3, bias=False), Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads) ) self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) - self.attend = Attend( - scale=scale, - dropout=dropout, - flash=flash - ) + self.attend = Attend(scale=scale, dropout=dropout, flash=flash) - self.to_out = nn.Sequential( - Rearrange('b h d n -> b n (h d)'), - nn.Linear(dim_inner, dim, bias=False) - ) + self.to_out = nn.Sequential(Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)) - def forward( - self, - x - ): + def forward(self, x): x = self.norm(x) q, k, v = self.to_qkv(x) @@ -187,19 +154,19 @@ class LinearAttention(Module): class Transformer(Module): def __init__( - self, - *, - dim, - depth, - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - ff_mult=4, - norm_output=True, - rotary_embed=None, - flash_attn=True, - linear_attn=False + self, + *, + dim, + depth, + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + ff_mult=4, + norm_output=True, + rotary_embed=None, + flash_attn=True, + linear_attn=False, ): super().__init__() self.layers = ModuleList([]) @@ -208,18 +175,20 @@ class Transformer(Module): if linear_attn: attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn) else: - attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, - rotary_embed=rotary_embed, flash=flash_attn) + attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + rotary_embed=rotary_embed, + flash=flash_attn, + ) - self.layers.append(ModuleList([ - attn, - FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - ])) + self.layers.append(ModuleList([attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)])) self.norm = RMSNorm(dim) if norm_output else nn.Identity() def forward(self, x): - for attn, ff in self.layers: x = attn(x) + x x = ff(x) + x @@ -229,22 +198,16 @@ class Transformer(Module): # bandsplit module + class BandSplit(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...] 
- ): + def __init__(self, dim, dim_inputs: Tuple[int, ...]): super().__init__() self.dim_inputs = dim_inputs self.to_features = ModuleList([]) for dim_in in dim_inputs: - net = nn.Sequential( - RMSNorm(dim_in), - nn.Linear(dim_in, dim) - ) + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) self.to_features.append(net) @@ -259,13 +222,7 @@ class BandSplit(Module): return torch.stack(outs, dim=-2) -def MLP( - dim_in, - dim_out, - dim_hidden=None, - depth=1, - activation=nn.Tanh -): +def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh): dim_hidden = default(dim_hidden, dim_in) net = [] @@ -286,13 +243,7 @@ def MLP( class MaskEstimator(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...], - depth, - mlp_expansion_factor=4 - ): + def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4): super().__init__() self.dim_inputs = dim_inputs self.to_freqs = ModuleList([]) @@ -301,10 +252,7 @@ class MaskEstimator(Module): for dim_in in dim_inputs: net = [] - mlp = nn.Sequential( - MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), - nn.GLU(dim=-1) - ) + mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)) self.to_freqs.append(mlp) @@ -322,43 +270,43 @@ class MaskEstimator(Module): # main class -class MelBandRoformer(Module): +class MelBandRoformer(Module): # @beartype def __init__( - self, - dim, - *, - depth, - stereo=False, - num_stems=1, - time_transformer_depth=2, - freq_transformer_depth=2, - linear_transformer_depth=0, - num_bands=60, - dim_head=64, - heads=8, - attn_dropout=0.1, - ff_dropout=0.1, - flash_attn=True, - dim_freqs_in=1025, - sample_rate=44100, # needed for mel filter bank from librosa - stft_n_fft=2048, - stft_hop_length=512, - # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction - stft_win_length=2048, - stft_normalized=False, - stft_window_fn: Optional[Callable] = None, - mask_estimator_depth=1, - multi_stft_resolution_loss_weight=1., - multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), - multi_stft_hop_size=147, - multi_stft_normalized=False, - multi_stft_window_fn: Callable = torch.hann_window, - match_input_audio_length=False, # if True, pad output tensor to match length of input tensor - mlp_expansion_factor=4, - use_torch_checkpoint=False, - skip_connection=False, + self, + dim, + *, + depth, + stereo=False, + num_stems=1, + time_transformer_depth=2, + freq_transformer_depth=2, + linear_transformer_depth=0, + num_bands=60, + dim_head=64, + heads=8, + attn_dropout=0.1, + ff_dropout=0.1, + flash_attn=True, + dim_freqs_in=1025, + sample_rate=44100, # needed for mel filter bank from librosa + stft_n_fft=2048, + stft_hop_length=512, + # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction + stft_win_length=2048, + stft_normalized=False, + stft_window_fn: Optional[Callable] = None, + mask_estimator_depth=1, + multi_stft_resolution_loss_weight=1.0, + multi_stft_resolutions_window_sizes: Tuple[int, ...] 
= (4096, 2048, 1024, 512, 256), + multi_stft_hop_size=147, + multi_stft_normalized=False, + multi_stft_window_fn: Callable = torch.hann_window, + match_input_audio_length=False, # if True, pad output tensor to match length of input tensor + mlp_expansion_factor=4, + use_torch_checkpoint=False, + skip_connection=False, ): super().__init__() @@ -376,7 +324,7 @@ class MelBandRoformer(Module): dim_head=dim_head, attn_dropout=attn_dropout, ff_dropout=ff_dropout, - flash_attn=flash_attn + flash_attn=flash_attn, ) time_rotary_embed = RotaryEmbedding(dim=dim_head) @@ -397,13 +345,12 @@ class MelBandRoformer(Module): self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length) self.stft_kwargs = dict( - n_fft=stft_n_fft, - hop_length=stft_hop_length, - win_length=stft_win_length, - normalized=stft_normalized + n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized ) - freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_n_fft), return_complex=True).shape[1] + freqs = torch.stft( + torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_n_fft), return_complex=True + ).shape[1] # create mel filter bank # with librosa.filters.mel as in section 2 of paper @@ -414,43 +361,40 @@ class MelBandRoformer(Module): # for some reason, it doesn't include the first freq? just force a value for now - mel_filter_bank[0][0] = 1. + mel_filter_bank[0][0] = 1.0 # In some systems/envs we get 0.0 instead of ~1.9e-18 in the last position, # so let's force a positive value - mel_filter_bank[-1, -1] = 1. + mel_filter_bank[-1, -1] = 1.0 # binary as in paper (then estimated masks are averaged for overlapping regions) freqs_per_band = mel_filter_bank > 0 - assert freqs_per_band.any(dim=0).all(), 'all frequencies need to be covered by all bands for now' + assert freqs_per_band.any(dim=0).all(), "all frequencies need to be covered by all bands for now" - repeated_freq_indices = repeat(torch.arange(freqs), 'f -> b f', b=num_bands) + repeated_freq_indices = repeat(torch.arange(freqs), "f -> b f", b=num_bands) freq_indices = repeated_freq_indices[freqs_per_band] if stereo: - freq_indices = repeat(freq_indices, 'f -> f s', s=2) + freq_indices = repeat(freq_indices, "f -> f s", s=2) freq_indices = freq_indices * 2 + torch.arange(2) - freq_indices = rearrange(freq_indices, 'f s -> (f s)') + freq_indices = rearrange(freq_indices, "f s -> (f s)") - self.register_buffer('freq_indices', freq_indices, persistent=False) - self.register_buffer('freqs_per_band', freqs_per_band, persistent=False) + self.register_buffer("freq_indices", freq_indices, persistent=False) + self.register_buffer("freqs_per_band", freqs_per_band, persistent=False) - num_freqs_per_band = reduce(freqs_per_band, 'b f -> b', 'sum') - num_bands_per_freq = reduce(freqs_per_band, 'b f -> f', 'sum') + num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum") + num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum") - self.register_buffer('num_freqs_per_band', num_freqs_per_band, persistent=False) - self.register_buffer('num_bands_per_freq', num_bands_per_freq, persistent=False) + self.register_buffer("num_freqs_per_band", num_freqs_per_band, persistent=False) + self.register_buffer("num_bands_per_freq", num_bands_per_freq, persistent=False) # band split and mask estimator freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in num_freqs_per_band.tolist()) - self.band_split = BandSplit( - dim=dim, - 
dim_inputs=freqs_per_bands_with_complex - ) + self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex) self.mask_estimators = nn.ModuleList([]) @@ -471,19 +415,11 @@ class MelBandRoformer(Module): self.multi_stft_n_fft = stft_n_fft self.multi_stft_window_fn = multi_stft_window_fn - self.multi_stft_kwargs = dict( - hop_length=multi_stft_hop_size, - normalized=multi_stft_normalized - ) + self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized) self.match_input_audio_length = match_input_audio_length - def forward( - self, - raw_audio, - target=None, - return_loss_breakdown=False - ): + def forward(self, raw_audio, target=None, return_loss_breakdown=False): """ einops @@ -499,28 +435,29 @@ class MelBandRoformer(Module): device = raw_audio.device if raw_audio.ndim == 2: - raw_audio = rearrange(raw_audio, 'b t -> b 1 t') + raw_audio = rearrange(raw_audio, "b t -> b 1 t") batch, channels, raw_audio_length = raw_audio.shape istft_length = raw_audio_length if self.match_input_audio_length else None - assert (not self.stereo and channels == 1) or ( - self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)' + assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), ( + "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)" + ) # to stft - raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t') + raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t") stft_window = self.stft_window_fn(device=device) stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True) stft_repr = torch.view_as_real(stft_repr) - stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c') + stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c") # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting - stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c') + stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c") # index out all frequencies for all frequency ranges across bands ascending in one go @@ -532,7 +469,7 @@ class MelBandRoformer(Module): # fold the complex (real and imag) into the frequencies dimension - x = rearrange(x, 'b f t c -> b t (f c)') + x = rearrange(x, "b f t c -> b t (f c)") if self.use_torch_checkpoint: x = checkpoint(self.band_split, x, use_reentrant=False) @@ -543,16 +480,15 @@ class MelBandRoformer(Module): store = [None] * len(self.layers) for i, transformer_block in enumerate(self.layers): - if len(transformer_block) == 3: linear_transformer, time_transformer, freq_transformer = transformer_block - x, ft_ps = pack([x], 'b * d') + x, ft_ps = pack([x], "b * d") if self.use_torch_checkpoint: x = checkpoint(linear_transformer, x, use_reentrant=False) else: x = linear_transformer(x) - x, = unpack(x, ft_ps, 'b * d') + (x,) = unpack(x, ft_ps, "b * d") else: time_transformer, freq_transformer = transformer_block @@ -561,24 +497,24 @@ class MelBandRoformer(Module): for j in range(i): x = x + store[j] - x = rearrange(x, 'b t f d -> b f t d') - x, ps = pack([x], '* t d') + x = rearrange(x, "b t f d -> b f t d") + x, ps = pack([x], "* t d") if self.use_torch_checkpoint: x = checkpoint(time_transformer, x, use_reentrant=False) else: x = 
time_transformer(x) - x, = unpack(x, ps, '* t d') - x = rearrange(x, 'b f t d -> b t f d') - x, ps = pack([x], '* f d') + (x,) = unpack(x, ps, "* t d") + x = rearrange(x, "b f t d -> b t f d") + x, ps = pack([x], "* f d") if self.use_torch_checkpoint: x = checkpoint(freq_transformer, x, use_reentrant=False) else: x = freq_transformer(x) - x, = unpack(x, ps, '* f d') + (x,) = unpack(x, ps, "* f d") if self.skip_connection: store[i] = x @@ -588,11 +524,11 @@ class MelBandRoformer(Module): masks = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators], dim=1) else: masks = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) - masks = rearrange(masks, 'b n t (f c) -> b n f t c', c=2) + masks = rearrange(masks, "b n t (f c) -> b n f t c", c=2) # modulate frequency representation - stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c') + stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c") # complex number multiplication @@ -603,12 +539,12 @@ class MelBandRoformer(Module): # need to average the estimated mask for the overlapped frequencies - scatter_indices = repeat(self.freq_indices, 'f -> b n f t', b=batch, n=num_stems, t=stft_repr.shape[-1]) + scatter_indices = repeat(self.freq_indices, "f -> b n f t", b=batch, n=num_stems, t=stft_repr.shape[-1]) - stft_repr_expanded_stems = repeat(stft_repr, 'b 1 ... -> b n ...', n=num_stems) + stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=num_stems) masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(2, scatter_indices, masks) - denom = repeat(self.num_bands_per_freq, 'f -> (f r) 1', r=channels) + denom = repeat(self.num_bands_per_freq, "f -> (f r) 1", r=channels) masks_averaged = masks_summed / denom.clamp(min=1e-8) @@ -618,15 +554,16 @@ class MelBandRoformer(Module): # istft - stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels) + stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels) - recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, - length=istft_length) + recon_audio = torch.istft( + stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=istft_length + ) - recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', b=batch, s=self.audio_channels, n=num_stems) + recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", b=batch, s=self.audio_channels, n=num_stems) if num_stems == 1: - recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t') + recon_audio = rearrange(recon_audio, "b 1 s t -> b s t") # if a target is passed in, calculate loss for learning @@ -637,13 +574,13 @@ class MelBandRoformer(Module): assert target.ndim == 4 and target.shape[1] == self.num_stems if target.ndim == 2: - target = rearrange(target, '... t -> ... 1 t') + target = rearrange(target, "... t -> ... 1 t") - target = target[..., :recon_audio.shape[-1]] # protect against lost length on istft + target = target[..., : recon_audio.shape[-1]] # protect against lost length on istft loss = F.l1_loss(recon_audio, target) - multi_stft_resolution_loss = 0. + multi_stft_resolution_loss = 0.0 for window_size in self.multi_stft_resolutions_window_sizes: res_stft_kwargs = dict( @@ -654,8 +591,8 @@ class MelBandRoformer(Module): **self.multi_stft_kwargs, ) - recon_Y = torch.stft(rearrange(recon_audio, '... s t -> (... s) t'), **res_stft_kwargs) - target_Y = torch.stft(rearrange(target, '... s t -> (... 
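The scatter_add_ averaging over overlapping bands in the forward pass above is easy to lose in the diff noise. A toy version with made-up shapes (real-valued here, where the model works on complex-viewed spectrograms) looks like this:

import torch

freqs, t = 6, 4
freq_indices = torch.tensor([0, 1, 2, 2, 3, 4, 4, 5])        # band-wise bin list; bins 2 and 4 are shared
num_bands_per_freq = torch.bincount(freq_indices, minlength=freqs).float()

masks = torch.randn(1, 1, len(freq_indices), t)              # (batch, stems, band-bins, time)
scatter_idx = freq_indices.view(1, 1, -1, 1).expand(1, 1, -1, t)

# each band writes its mask into the full-resolution grid, then every bin is
# divided by the number of bands that cover it
summed = torch.zeros(1, 1, freqs, t).scatter_add_(2, scatter_idx, masks)
averaged = summed / num_bands_per_freq.view(1, 1, -1, 1).clamp(min=1e-8)
print(averaged.shape)    # torch.Size([1, 1, 6, 4])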
s) t'), **res_stft_kwargs) + recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs) + target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs) multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y) diff --git a/tools/uvr5/bsroformer.py b/tools/uvr5/bsroformer.py index 9ac09a94..ddcbfa74 100644 --- a/tools/uvr5/bsroformer.py +++ b/tools/uvr5/bsroformer.py @@ -1,28 +1,31 @@ # This code is modified from https://github.com/ZFTurbo/ -import librosa -from tqdm import tqdm import os -import torch +import warnings + +import librosa import numpy as np import soundfile as sf +import torch import torch.nn as nn import yaml -import warnings +from tqdm import tqdm + warnings.filterwarnings("ignore") class Roformer_Loader: def get_config(self, config_path): - with open(config_path, 'r', encoding='utf-8') as f: + with open(config_path, "r", encoding="utf-8") as f: # use fullloader to load tag !!python/tuple, code can be improved config = yaml.load(f, Loader=yaml.FullLoader) return config def get_default_config(self): default_config = None - if self.model_type == 'bs_roformer': + if self.model_type == "bs_roformer": # Use model_bs_roformer_ep_368_sdr_12.9628.yaml and model_bs_roformer_ep_317_sdr_12.9755.yaml as default configuration files # Other BS_Roformer models may not be compatible + # fmt: off default_config = { "audio": {"chunk_size": 352800, "sample_rate": 44100}, "model": { @@ -51,9 +54,10 @@ class Roformer_Loader: "multi_stft_normalized": False, }, "training": {"instruments": ["vocals", "other"], "target_instrument": "vocals"}, - "inference": {"batch_size": 2, "num_overlap": 2} + "inference": {"batch_size": 2, "num_overlap": 2}, } - elif self.model_type == 'mel_band_roformer': + # fmt: on + elif self.model_type == "mel_band_roformer": # Use model_mel_band_roformer_ep_3005_sdr_11.4360.yaml as default configuration files # Other Mel_Band_Roformer models may not be compatible default_config = { @@ -82,29 +86,30 @@ class Roformer_Loader: "multi_stft_resolution_loss_weight": 1.0, "multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256), "multi_stft_hop_size": 147, - "multi_stft_normalized": False + "multi_stft_normalized": False, }, "training": {"instruments": ["vocals", "other"], "target_instrument": "vocals"}, - "inference": {"batch_size": 2, "num_overlap": 2} + "inference": {"batch_size": 2, "num_overlap": 2}, } + return default_config - def get_model_from_config(self): - if self.model_type == 'bs_roformer': + if self.model_type == "bs_roformer": from bs_roformer.bs_roformer import BSRoformer + model = BSRoformer(**dict(self.config["model"])) - elif self.model_type == 'mel_band_roformer': + elif self.model_type == "mel_band_roformer": from bs_roformer.mel_band_roformer import MelBandRoformer + model = MelBandRoformer(**dict(self.config["model"])) else: - print('Error: Unknown model: {}'.format(self.model_type)) + print("Error: Unknown model: {}".format(self.model_type)) model = None return model - def demix_track(self, model, mix, device): - C = self.config["audio"]["chunk_size"] # chunk_size + C = self.config["audio"]["chunk_size"] # chunk_size N = self.config["inference"]["num_overlap"] fade_size = C // 10 step = int(C // N) @@ -116,7 +121,7 @@ class Roformer_Loader: # Do pad from the beginning and end to account floating window results better if length_init > 2 * border and (border > 0): - mix = nn.functional.pad(mix, (border, border), mode='reflect') + mix = nn.functional.pad(mix, (border, 
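The multi-resolution STFT term closed out at the top of this hunk can be sketched on its own. This is a hedged, simplified version: `n_fft_floor` stands in for the model's stft_n_fft, the loss is taken on the real view of the complex STFTs, and the per-term weight is omitted.

import torch
import torch.nn.functional as F

def multi_res_stft_l1(recon, target, window_sizes=(1024, 512, 256), hop=147, n_fft_floor=512):
    # L1 between complex STFTs of two (batched) mono signals at several window sizes
    loss = torch.zeros(())
    for win in window_sizes:
        n_fft = max(win, n_fft_floor)
        window = torch.hann_window(win)
        kwargs = dict(n_fft=n_fft, win_length=win, hop_length=hop, window=window, return_complex=True)
        R = torch.view_as_real(torch.stft(recon, **kwargs))
        T = torch.view_as_real(torch.stft(target, **kwargs))
        loss = loss + F.l1_loss(R, T)
    return loss

recon = torch.randn(2, 44100)     # (batch, samples), dummy audio
target = torch.randn(2, 44100)
print(multi_res_stft_l1(recon, target))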
border), mode="reflect") # Prepare windows arrays (do 1 time for speed up). This trick repairs click problems on the edges of segment window_size = C @@ -125,17 +130,17 @@ class Roformer_Loader: window_start = torch.ones(window_size) window_middle = torch.ones(window_size) window_finish = torch.ones(window_size) - window_start[-fade_size:] *= fadeout # First audio chunk, no fadein - window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout + window_start[-fade_size:] *= fadeout # First audio chunk, no fadein + window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout window_middle[-fade_size:] *= fadeout window_middle[:fade_size] *= fadein - with torch.amp.autocast('cuda'): + with torch.amp.autocast("cuda"): with torch.inference_mode(): if self.config["training"]["target_instrument"] is None: req_shape = (len(self.config["training"]["instruments"]),) + tuple(mix.shape) else: - req_shape = (1, ) + tuple(mix.shape) + req_shape = (1,) + tuple(mix.shape) result = torch.zeros(req_shape, dtype=torch.float32) counter = torch.zeros(req_shape, dtype=torch.float32) @@ -143,15 +148,15 @@ class Roformer_Loader: batch_data = [] batch_locations = [] while i < mix.shape[1]: - part = mix[:, i:i + C].to(device) + part = mix[:, i : i + C].to(device) length = part.shape[-1] if length < C: if length > C // 2 + 1: - part = nn.functional.pad(input=part, pad=(0, C - length), mode='reflect') + part = nn.functional.pad(input=part, pad=(0, C - length), mode="reflect") else: - part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0) + part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode="constant", value=0) if self.is_half: - part=part.half() + part = part.half() batch_data.append(part) batch_locations.append((i, length)) i += step @@ -170,8 +175,8 @@ class Roformer_Loader: for j in range(len(batch_locations)): start, l = batch_locations[j] - result[..., start:start+l] += x[j][..., :l].cpu() * window[..., :l] - counter[..., start:start+l] += window[..., :l] + result[..., start : start + l] += x[j][..., :l].cpu() * window[..., :l] + counter[..., start : start + l] += window[..., :l] batch_data = [] batch_locations = [] @@ -191,7 +196,6 @@ class Roformer_Loader: else: return {k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources)} - def run_folder(self, input, vocal_root, others_root, format): self.model.eval() path = input @@ -200,20 +204,20 @@ class Roformer_Loader: file_base_name = os.path.splitext(os.path.basename(path))[0] sample_rate = 44100 - if 'sample_rate' in self.config["audio"]: - sample_rate = self.config["audio"]['sample_rate'] + if "sample_rate" in self.config["audio"]: + sample_rate = self.config["audio"]["sample_rate"] try: mix, sr = librosa.load(path, sr=sample_rate, mono=False) except Exception as e: - print('Can read track: {}'.format(path)) - print('Error message: {}'.format(str(e))) + print("Can read track: {}".format(path)) + print("Error message: {}".format(str(e))) return # in case if model only supports mono tracks isstereo = self.config["model"].get("stereo", True) if not isstereo and len(mix.shape) != 1: - mix = np.mean(mix, axis=0) # if more than 2 channels, take mean + mix = np.mean(mix, axis=0) # if more than 2 channels, take mean print("Warning: Track has more than 1 channels, but model is mono, taking mean of all channels.") mix_orig = mix.copy() @@ -226,7 +230,7 @@ class Roformer_Loader: # other instruments are caculated by subtracting target instrument from mixture target_instrument = 
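The chunked inference with fade windows reformatted in demix_track above is worth a self-contained sketch: process fixed-size overlapping chunks, cross-fade them, and divide by the accumulated window weight. The reflective border padding is omitted, `separate` is a stand-in for the model call, and the sizes are toy values.

import torch

def separate(chunk):
    # stand-in for the source-separation model; identity keeps the demo verifiable
    return chunk

def demix(mix, chunk=8, overlap=2, fade=2):
    step = chunk // overlap
    fadein = torch.linspace(0.0, 1.0, fade)
    fadeout = torch.linspace(1.0, 0.0, fade)

    win_mid = torch.ones(chunk)
    win_mid[:fade] *= fadein
    win_mid[-fade:] *= fadeout
    win_first = torch.ones(chunk)          # first chunk: no fade-in
    win_first[-fade:] *= fadeout
    win_last = torch.ones(chunk)           # last chunk(s): no fade-out
    win_last[:fade] *= fadein

    result = torch.zeros_like(mix)
    counter = torch.zeros_like(mix)
    total = mix.shape[-1]
    for start in range(0, total, step):
        part = mix[..., start:start + chunk]
        length = part.shape[-1]
        if start == 0:
            window = win_first
        elif start + chunk >= total:
            window = win_last
        else:
            window = win_mid
        out = separate(part)
        result[..., start:start + length] += out[..., :length] * window[:length]
        counter[..., start:start + length] += window[:length]
    return result / counter.clamp(min=1e-8)

mix = torch.randn(2, 32)                   # stereo toy signal
print(torch.allclose(demix(mix), mix))     # identity model round-trips the input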
self.config["training"]["target_instrument"] other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument] - other = mix_orig - res[target_instrument] # caculate other instruments + other = mix_orig - res[target_instrument] # caculate other instruments path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, target_instrument) path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other_instruments[0]) @@ -237,11 +241,10 @@ class Roformer_Loader: vocal_inst = self.config["training"]["instruments"][0] path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, vocal_inst) self.save_audio(path_vocal, res[vocal_inst].T, sr, format) - for other in self.config["training"]["instruments"][1:]: # save other instruments + for other in self.config["training"]["instruments"][1:]: # save other instruments path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other) self.save_audio(path_other, res[other].T, sr, format) - def save_audio(self, path, data, sr, format): # input path should be endwith '.wav' if format in ["wav", "flac"]: @@ -250,10 +253,11 @@ class Roformer_Loader: sf.write(path, data, sr) else: sf.write(path, data, sr) - os.system("ffmpeg -i \"{}\" -vn \"{}\" -q:a 2 -y".format(path, path[:-3] + format)) - try: os.remove(path) - except: pass - + os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format)) + try: + os.remove(path) + except: + pass def __init__(self, model_path, config_path, device, is_half): self.device = device @@ -270,7 +274,9 @@ class Roformer_Loader: if not os.path.exists(config_path): if self.model_type is None: # if model_type is still None, raise an error - raise ValueError("Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again.") + raise ValueError( + "Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again." 
+ ) self.config = self.get_default_config() else: # if there is a configuration file @@ -289,12 +295,10 @@ class Roformer_Loader: state_dict = torch.load(model_path, map_location="cpu") model.load_state_dict(state_dict) - if(is_half==False): + if is_half == False: self.model = model.to(device) else: self.model = model.half().to(device) - def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False): self.run_folder(input, vocal_root, others_root, format) - diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/tools/uvr5/lib/lib_v5/dataset.py index cfd01a17..1a30eec7 100644 --- a/tools/uvr5/lib/lib_v5/dataset.py +++ b/tools/uvr5/lib/lib_v5/dataset.py @@ -32,18 +32,10 @@ def make_pair(mix_dir, inst_dir): input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] X_list = sorted( - [ - os.path.join(mix_dir, fname) - for fname in os.listdir(mix_dir) - if os.path.splitext(fname)[1] in input_exts - ] + [os.path.join(mix_dir, fname) for fname in os.listdir(mix_dir) if os.path.splitext(fname)[1] in input_exts] ) y_list = sorted( - [ - os.path.join(inst_dir, fname) - for fname in os.listdir(inst_dir) - if os.path.splitext(fname)[1] in input_exts - ] + [os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts] ) filelist = list(zip(X_list, y_list)) @@ -65,14 +57,10 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): train_filelist = filelist[:-val_size] val_filelist = filelist[-val_size:] else: - train_filelist = [ - pair for pair in filelist if list(pair) not in val_filelist - ] + train_filelist = [pair for pair in filelist if list(pair) not in val_filelist] elif split_mode == "subdirs": if len(val_filelist) != 0: - raise ValueError( - "The `val_filelist` option is not available in `subdirs` mode" - ) + raise ValueError("The `val_filelist` option is not available in `subdirs` mode") train_filelist = make_pair( os.path.join(dataset_dir, "training/mixtures"), @@ -91,9 +79,7 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): perm = np.random.permutation(len(X)) for i, idx in enumerate(tqdm(perm)): if np.random.uniform() < reduction_rate: - y[idx] = spec_utils.reduce_vocal_aggressively( - X[idx], y[idx], reduction_mask - ) + y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask) if np.random.uniform() < 0.5: # swap channel @@ -152,9 +138,7 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): patch_list = [] - patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( - cropsize, sr, hop_length, n_fft, offset - ) + patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset) os.makedirs(patch_dir, exist_ok=True) for i, (X_path, y_path) in enumerate(tqdm(filelist)): diff --git a/tools/uvr5/lib/lib_v5/layers.py b/tools/uvr5/lib/lib_v5/layers.py index 4fc1b5cb..2b9101e0 100644 --- a/tools/uvr5/lib/lib_v5/layers.py +++ b/tools/uvr5/lib/lib_v5/layers.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,24 +89,14 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = 
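For reference, a typical call into the loader refactored in the bsroformer.py hunks above could look like the following. The checkpoint and config file names are assumptions (they mirror the default-config comments in the diff, not guaranteed paths), and the import assumes the script runs from tools/uvr5, as webui.py does.

import torch
from bsroformer import Roformer_Loader

device = "cuda" if torch.cuda.is_available() else "cpu"
loader = Roformer_Loader(
    model_path="tools/uvr5/uvr5_weights/model_bs_roformer_ep_368_sdr_12.9628.ckpt",   # assumed checkpoint
    config_path="tools/uvr5/uvr5_weights/model_bs_roformer_ep_368_sdr_12.9628.yaml",  # assumed config
    device=device,
    is_half=(device == "cuda"),
)
# _path_audio_(input, others_root, vocal_root, format) writes
# "<basename>_vocals.wav"-style stems into the two output directories
loader._path_audio_("input.wav", "output/others", "output/vocals", "wav")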
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/tools/uvr5/lib/lib_v5/layers_123812KB.py index 4fc1b5cb..2b9101e0 100644 --- a/tools/uvr5/lib/lib_v5/layers_123812KB.py +++ b/tools/uvr5/lib/lib_v5/layers_123812KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,24 +89,14 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/tools/uvr5/lib/lib_v5/layers_123821KB.py index 4fc1b5cb..2b9101e0 100644 --- a/tools/uvr5/lib/lib_v5/layers_123821KB.py +++ b/tools/uvr5/lib/lib_v5/layers_123821KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = 
Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,24 +89,14 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/tools/uvr5/lib/lib_v5/layers_33966KB.py index 9b127bc6..43977778 100644 --- a/tools/uvr5/lib/lib_v5/layers_33966KB.py +++ b/tools/uvr5/lib/lib_v5/layers_33966KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,30 +89,16 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", 
align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/tools/uvr5/lib/lib_v5/layers_537227KB.py index 9b127bc6..43977778 100644 --- a/tools/uvr5/lib/lib_v5/layers_537227KB.py +++ b/tools/uvr5/lib/lib_v5/layers_537227KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,30 +89,16 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/tools/uvr5/lib/lib_v5/layers_537238KB.py index 9b127bc6..43977778 100644 --- a/tools/uvr5/lib/lib_v5/layers_537238KB.py +++ b/tools/uvr5/lib/lib_v5/layers_537238KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,30 +89,16 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - 
self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/tools/uvr5/lib/lib_v5/layers_new.py index 44153b6a..7d7005c0 100644 --- a/tools/uvr5/lib/lib_v5/layers_new.py +++ b/tools/uvr5/lib/lib_v5/layers_new.py @@ -40,9 +40,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) @@ -72,23 +70,15 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) - self.conv3 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[2], dilations[2], activ=activ - ) + self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ) self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) @@ -106,12 +96,8 @@ class LSTMModule(nn.Module): def __init__(self, nin_conv, nin_lstm, nout_lstm): super(LSTMModule, self).__init__() self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) - self.lstm = nn.LSTM( - input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True - ) - self.dense = nn.Sequential( - nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() - ) + self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True) + self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()) def forward(self, x): N, _, nbins, nframes = x.size() diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/tools/uvr5/lib/lib_v5/model_param_init.py 
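The ASPP blocks reformatted across these near-identical layers_*.py files all follow one pattern: a pooled branch, a 1x1 branch, several dilated 3x3 branches, then a 1x1 bottleneck over the concatenation. A compact sketch, using plain Conv2d + BatchNorm + ReLU in place of the repo's Conv2DBNActiv/SeperableConv2DBNActiv helpers:

import torch
import torch.nn as nn
import torch.nn.functional as F

def conv_bn_act(nin, nout, ksize=1, dilation=1):
    pad = dilation * (ksize - 1) // 2
    return nn.Sequential(
        nn.Conv2d(nin, nout, ksize, padding=pad, dilation=dilation, bias=False),
        nn.BatchNorm2d(nout),
        nn.ReLU(inplace=True),
    )

class MiniASPP(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16)):
        super().__init__()
        # pooled branch collapses the frequency axis, then is upsampled back
        self.pool_branch = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), conv_bn_act(nin, nin))
        self.branches = nn.ModuleList(
            [conv_bn_act(nin, nin)] + [conv_bn_act(nin, nin, 3, d) for d in dilations]
        )
        self.bottleneck = conv_bn_act(nin * (2 + len(dilations)), nout)

    def forward(self, x):
        _, _, h, w = x.shape
        feats = [F.interpolate(self.pool_branch(x), size=(h, w), mode="bilinear", align_corners=True)]
        feats += [branch(x) for branch in self.branches]
        return self.bottleneck(torch.cat(feats, dim=1))

y = MiniASPP(8, 16)(torch.randn(1, 8, 32, 64))
print(y.shape)   # torch.Size([1, 16, 32, 64])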
index b995c0bf..79b94d33 100644 --- a/tools/uvr5/lib/lib_v5/model_param_init.py +++ b/tools/uvr5/lib/lib_v5/model_param_init.py @@ -1,5 +1,4 @@ import json -import os import pathlib default_param = {} @@ -48,9 +47,7 @@ class ModelParameters(object): import zipfile with zipfile.ZipFile(config_path, "r") as zip: - self.param = json.loads( - zip.read("param.json"), object_pairs_hook=int_keys - ) + self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys) elif ".json" == pathlib.Path(config_path).suffix: with open(config_path, "r") as f: self.param = json.loads(f.read(), object_pairs_hook=int_keys) @@ -65,5 +62,5 @@ class ModelParameters(object): "stereo_n", "reverse", ]: - if not k in self.param: + if k not in self.param: self.param[k] = False diff --git a/tools/uvr5/lib/lib_v5/nets.py b/tools/uvr5/lib/lib_v5/nets.py index 5da3948c..42d7807a 100644 --- a/tools/uvr5/lib/lib_v5/nets.py +++ b/tools/uvr5/lib/lib_v5/nets.py @@ -3,8 +3,6 @@ import torch import torch.nn.functional as F from torch import nn -from . import spec_utils - class BaseASPPNet(nn.Module): def __init__(self, nin, ch, dilations=(4, 8, 16)): diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/tools/uvr5/lib/lib_v5/nets_537227KB.py index 823b44fb..9bb1df1e 100644 --- a/tools/uvr5/lib/lib_v5/nets_537227KB.py +++ b/tools/uvr5/lib/lib_v5/nets_537227KB.py @@ -1,4 +1,3 @@ -import numpy as np import torch import torch.nn.functional as F from torch import nn diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/tools/uvr5/lib/lib_v5/nets_537238KB.py index 823b44fb..9bb1df1e 100644 --- a/tools/uvr5/lib/lib_v5/nets_537238KB.py +++ b/tools/uvr5/lib/lib_v5/nets_537238KB.py @@ -1,4 +1,3 @@ -import numpy as np import torch import torch.nn.functional as F from torch import nn diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/tools/uvr5/lib/lib_v5/nets_new.py index 1c0f4fa9..ba1a5599 100644 --- a/tools/uvr5/lib/lib_v5/nets_new.py +++ b/tools/uvr5/lib/lib_v5/nets_new.py @@ -6,9 +6,7 @@ from . 
import layers_new class BaseNet(nn.Module): - def __init__( - self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) - ): + def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))): super(BaseNet, self).__init__() self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) @@ -56,21 +54,15 @@ class CascadedNet(nn.Module): layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), ) - self.stg1_high_band_net = BaseNet( - 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 - ) + self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2) self.stg2_low_band_net = nn.Sequential( BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), ) - self.stg2_high_band_net = BaseNet( - nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 - ) + self.stg2_high_band_net = BaseNet(nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2) - self.stg3_full_band_net = BaseNet( - 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm - ) + self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm) self.out = nn.Conv2d(nout, 2, 1, bias=False) self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) diff --git a/tools/uvr5/lib/lib_v5/spec_utils.py b/tools/uvr5/lib/lib_v5/spec_utils.py index da072e4b..4d987cd8 100644 --- a/tools/uvr5/lib/lib_v5/spec_utils.py +++ b/tools/uvr5/lib/lib_v5/spec_utils.py @@ -27,9 +27,7 @@ def crop_center(h1, h2): return h1 -def wave_to_spectrogram( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): +def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): if reverse: wave_left = np.flip(np.asfortranarray(wave[0])) wave_right = np.flip(np.asfortranarray(wave[1])) @@ -43,7 +41,7 @@ def wave_to_spectrogram( wave_left = np.asfortranarray(wave[0]) wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) spec = np.asfortranarray([spec_left, spec_right]) @@ -51,9 +49,7 @@ def wave_to_spectrogram( return spec -def wave_to_spectrogram_mt( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): +def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): import threading if reverse: @@ -103,21 +99,13 @@ def combine_spectrograms(specs, mp): raise ValueError("Too much bins") # lowpass fiter - if ( - mp.param["pre_filter_start"] > 0 - ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: + if mp.param["pre_filter_start"] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: if bands_n == 1: - spec_c = fft_lp_filter( - spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"] - ) + spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) else: gp = 1 - for b in range( - mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] - ): - g = math.pow( - 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 - ) + for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): + g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) gp = g spec_c[:, b, :] *= g @@ -189,9 +177,7 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, 
fade_size=32): else: e += fade_size - mag[:, :, s + fade_size : e - fade_size] += ref[ - :, :, s + fade_size : e - fade_size - ] + mag[:, :, s + fade_size : e - fade_size] += ref[:, :, s + fade_size : e - fade_size] old_e = e return mag @@ -207,9 +193,7 @@ def cache_or_load(mix_path, inst_path, mp): mix_basename = os.path.splitext(os.path.basename(mix_path))[0] inst_basename = os.path.splitext(os.path.basename(inst_path))[0] - cache_dir = "mph{}".format( - hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() - ) + cache_dir = "mph{}".format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()) mix_cache_dir = os.path.join("cache", cache_dir) inst_cache_dir = os.path.join("cache", cache_dir) @@ -230,31 +214,27 @@ def cache_or_load(mix_path, inst_path, mp): if d == len(mp.param["band"]): # high-end band X_wave[d], _ = librosa.load( - mix_path, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"] + mix_path, sr=bp["sr"], mono=False, dtype=np.float32, res_type=bp["res_type"] ) y_wave[d], _ = librosa.load( inst_path, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) else: # lower bands X_wave[d] = librosa.resample( X_wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) y_wave[d] = librosa.resample( y_wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) @@ -302,9 +282,7 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): if reverse: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) elif mid_side_b2: return np.asfortranarray( [ @@ -326,9 +304,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): global wave_left wave_left = librosa.istft(**kwargs) - thread = threading.Thread( - target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} - ) + thread = threading.Thread(target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}) thread.start() wave_right = librosa.istft(spec_right, hop_length=hop_length) thread.join() @@ -336,9 +312,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): if reverse: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) elif mid_side_b2: return np.asfortranarray( [ @@ -357,21 +331,15 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): for d in range(1, bands_n + 1): bp = mp.param["band"][d] - spec_s = np.ndarray( - shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex - ) + spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) h = bp["crop_stop"] - 
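The per-band loading in cache_or_load above follows a single idea: decode the audio once at the highest band's sample rate, then derive each lower band by resampling the band above it, and STFT each band with its own n_fft/hop. A runnable sketch with invented band parameters (the real values come from the lib_v5/modelparams/*.json files, and noise stands in for the decoded file):

import numpy as np
import librosa

band = {
    1: {"sr": 11025, "hop": 128, "n_fft": 512, "res_type": "soxr_hq"},
    2: {"sr": 22050, "hop": 256, "n_fft": 1024, "res_type": "soxr_hq"},
    3: {"sr": 44100, "hop": 512, "n_fft": 2048, "res_type": "soxr_hq"},
}

waves, specs = {}, {}
for d in sorted(band, reverse=True):                     # highest band first
    bp = band[d]
    if d == len(band):
        # the repo calls librosa.load(path, sr=bp["sr"], mono=False, ...) here;
        # two seconds of stereo noise stand in for the decoded file
        waves[d] = np.random.randn(2, bp["sr"] * 2).astype(np.float32)
    else:
        # lower bands are produced by resampling the band above
        waves[d] = librosa.resample(waves[d + 1], orig_sr=band[d + 1]["sr"],
                                    target_sr=bp["sr"], res_type=bp["res_type"])
    specs[d] = np.stack([librosa.stft(np.asfortranarray(ch), n_fft=bp["n_fft"], hop_length=bp["hop"])
                         for ch in waves[d]])

for d, s in specs.items():
    print(d, s.shape)        # (channels, n_fft // 2 + 1, frames) per band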
bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ - :, offset : offset + h, : - ] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] offset += h if d == bands_n: # higher if extra_bins_h: # if --high_end_process bypass max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ - :, :extra_bins_h, : - ] + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] if bp["hpf_start"] > 0: spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) if bands_n == 1: @@ -405,9 +373,9 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): mp.param["mid_side_b2"], mp.param["reverse"], ), - orig_sr = bp["sr"], - target_sr = sr, - res_type = "sinc_fastest", + orig_sr=bp["sr"], + target_sr=sr, + res_type="sinc_fastest", ) else: # mid spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) @@ -456,10 +424,7 @@ def mirroring(a, spec_m, input_high_end, mp): np.abs( spec_m[ :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, + mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :, ] ), @@ -467,19 +432,14 @@ def mirroring(a, spec_m, input_high_end, mp): ) mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - return np.where( - np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror - ) + return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) if "mirroring2" == a: mirror = np.flip( np.abs( spec_m[ :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, + mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :, ] ), @@ -528,7 +488,6 @@ def istft(spec, hl): if __name__ == "__main__": import argparse - import sys import time import cv2 @@ -573,10 +532,10 @@ if __name__ == "__main__": if d == len(mp.param["band"]): # high-end band wave[d], _ = librosa.load( args.input[i], - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) if len(wave[d].shape) == 1: # mono to stereo @@ -584,9 +543,9 @@ if __name__ == "__main__": else: # lower bands wave[d] = librosa.resample( wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) spec[d] = wave_to_spectrogram( diff --git a/tools/uvr5/lib/utils.py b/tools/uvr5/lib/utils.py index 5e8cd22f..0166d52b 100644 --- a/tools/uvr5/lib/utils.py +++ b/tools/uvr5/lib/utils.py @@ -27,9 +27,7 @@ def inference(X_spec, device, model, aggressiveness, data): data : dic configs """ - def _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True - ): + def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True): model.eval() with torch.no_grad(): preds = [] @@ -39,9 +37,7 @@ def inference(X_spec, device, model, aggressiveness, data): total_iterations = sum(iterations) for i in tqdm(range(n_window)): start = i * roi_size - X_mag_window = X_mag_pad[ - None, :, :, start : start + data["window_size"] - ] + X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]] X_mag_window = torch.from_numpy(X_mag_window) if is_half: X_mag_window = X_mag_window.half() @@ -76,9 +72,7 @@ def 
inference(X_spec, device, model, aggressiveness, data): is_half = True else: is_half = False - pred = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) + pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) pred = pred[:, :, :n_frame] if data["tta"]: @@ -88,9 +82,7 @@ def inference(X_spec, device, model, aggressiveness, data): X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - pred_tta = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) + pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) pred_tta = pred_tta[:, :, roi_size // 2 :] pred_tta = pred_tta[:, :, :n_frame] diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py index 372db25b..e1098273 100644 --- a/tools/uvr5/mdxnet.py +++ b/tools/uvr5/mdxnet.py @@ -13,9 +13,7 @@ cpu = torch.device("cpu") class ConvTDFNetTrim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): + def __init__(self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024): super(ConvTDFNetTrim, self).__init__() self.dim_f = dim_f @@ -24,17 +22,13 @@ class ConvTDFNetTrim: self.hop = hop self.n_bins = self.n_fft // 2 + 1 self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) + self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device) self.target_name = target_name self.blender = "blender" in model_name self.dim_c = 4 out_c = self.dim_c * 4 if target_name == "*" else self.dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) + self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device) self.n = L // 2 @@ -50,28 +44,18 @@ class ConvTDFNetTrim: ) x = torch.view_as_real(x) x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, self.dim_c, self.n_bins, self.dim_t] - ) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, self.dim_c, self.n_bins, self.dim_t]) return x[:, :, : self.dim_f] def istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) + freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad x = torch.cat([x, freq_pad], -2) c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) + x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t]) x = x.permute([0, 2, 3, 1]) x = x.contiguous() x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) + x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True) return x.reshape([-1, c, self.chunk_size]) @@ -93,9 +77,7 @@ class Predictor: logger.info(ort.get_available_providers()) self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) + self.model_ = get_models(device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft) self.model = ort.InferenceSession( os.path.join(args.onnx, self.model_.target_name + ".onnx"), providers=[ @@ -152,9 +134,7 @@ class Predictor: trim = model.n_fft // 2 gen_size = model.chunk_size - 2 * trim pad = gen_size - n_sample % gen_size - mix_p 
= np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) + mix_p = np.concatenate((np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1) mix_waves = [] i = 0 while i < n_sample + pad: @@ -172,15 +152,8 @@ class Predictor: ) tar_waves = model.istft(torch.tensor(spec_pred)) else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) + tar_waves = model.istft(torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])) + tar_signal = tar_waves[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).numpy()[:, :-pad] start = 0 if mix == 0 else margin_size end = None if mix == list(mixes.keys())[::-1][0] else -margin_size @@ -207,9 +180,7 @@ class Predictor: sources = self.demix(mix.T) opt = sources[0].T if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) + sf.write("%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate) sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) else: path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) @@ -219,18 +190,14 @@ class Predictor: opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal) - ) + os.system("ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)) if os.path.exists(opt_path_vocal): try: os.remove(path_vocal) except: pass if os.path.exists(path_other): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other) - ) + os.system("ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)) if os.path.exists(opt_path_other): try: os.remove(path_other) @@ -240,7 +207,7 @@ class Predictor: class MDXNetDereverb: def __init__(self, chunks): - self.onnx = "%s/uvr5_weights/onnx_dereverb_By_FoxJoy"%os.path.dirname(os.path.abspath(__file__)) + self.onnx = "%s/uvr5_weights/onnx_dereverb_By_FoxJoy" % os.path.dirname(os.path.abspath(__file__)) self.shifts = 10 # 'Predict with randomised equivariant stabilisation' self.mixing = "min_mag" # ['default','min_mag','max_mag'] self.chunks = chunks diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py index 640392a4..4ca8a3b7 100644 --- a/tools/uvr5/vr.py +++ b/tools/uvr5/vr.py @@ -1,6 +1,8 @@ -import os,sys +import os + parent_directory = os.path.dirname(os.path.abspath(__file__)) -import logging,pdb +import logging + logger = logging.getLogger(__name__) import librosa @@ -27,7 +29,7 @@ class AudioPre: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json"%parent_directory) + mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json" % parent_directory) model = Nets.CascadedASPPNet(mp.param["bins"] * 2) cpk = torch.load(model_path, map_location="cpu") model.load_state_dict(cpk) @@ -40,9 +42,7 @@ class AudioPre: self.mp = mp self.model = model - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False - ): + def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False): if ins_root is None and vocal_root is None: return "No save root." 
name = os.path.basename(music_file) @@ -61,19 +61,19 @@ class AudioPre: _, ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 music_file, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) if X_wave[d].ndim == 1: X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) else: # lower bands X_wave[d] = librosa.core.resample( X_wave[d + 1], - orig_sr = self.mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=self.mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) # Stft of wave source X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( @@ -89,9 +89,7 @@ class AudioPre: input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] + input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :] X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) aggresive_set = float(self.data["agg"] / 100) @@ -100,9 +98,7 @@ class AudioPre: "split_bin": self.mp.param["band"][1]["crop_stop"], } with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) + pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data) # Postprocess if self.data["postprocess"]: pred_inv = np.clip(X_mag - pred, 0, np.inf) @@ -111,13 +107,11 @@ class AudioPre: v_spec_m = X_spec_m - y_spec_m if is_hp3 == True: - ins_root,vocal_root = vocal_root,ins_root + ins_root, vocal_root = vocal_root, ins_root if ins_root is not None: if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], y_spec_m, input_high_end, self.mp) wav_instrument = spec_utils.cmb_spectrogram_to_wave( y_spec_m, self.mp, input_high_end_h, input_high_end_ ) @@ -138,9 +132,7 @@ class AudioPre: self.mp.param["sr"], ) # else: - path = os.path.join( - ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(ins_root, head + "{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_instrument) * 32768).astype("int16"), @@ -160,12 +152,8 @@ class AudioPre: else: head = "vocal_" if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], v_spec_m, input_high_end, self.mp) + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) else: wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) logger.info("%s vocals done" % name) @@ -179,9 +167,7 @@ class AudioPre: self.mp.param["sr"], ) else: - path = os.path.join( - vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_vocals) * 32768).astype("int16"), @@ -210,7 +196,7 @@ class AudioPreDeEcho: "agg": agg, "high_end_process": "mirroring", } - mp 
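The post-processing reshuffled in _path_audio_ above boils down to: recombine the predicted instrumental magnitude with the mixture's phase, take the vocals as the complex residual, and write int16 PCM. A toy illustration with random data (the clip before the int16 cast is an added safety step, not from the repo):

import numpy as np

rng = np.random.default_rng(0)
X_mag = np.abs(rng.standard_normal((2, 1025, 100)))       # |mixture| (channels, bins, frames)
X_phase = np.exp(1j * rng.uniform(-np.pi, np.pi, X_mag.shape))
pred = X_mag * rng.uniform(0.0, 1.0, X_mag.shape)         # stand-in for the model's instrumental magnitude

X_spec = X_mag * X_phase
y_spec = pred * X_phase                                   # instrumental: predicted magnitude + mixture phase
v_spec = X_spec - y_spec                                  # vocals: complementary residual

wav = rng.uniform(-1.0, 1.0, 44100).astype(np.float32)    # stand-in for cmb_spectrogram_to_wave output
pcm16 = (wav * 32768).clip(-32768, 32767).astype("int16") # scaled to int16 before sf.write
print(y_spec.shape, v_spec.shape, pcm16.dtype)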
= ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json"%parent_directory) + mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json" % parent_directory) nout = 64 if "DeReverb" in model_path else 48 model = CascadedNet(mp.param["bins"] * 2, nout) cpk = torch.load(model_path, map_location="cpu") @@ -245,19 +231,19 @@ class AudioPreDeEcho: _, ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 music_file, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) if X_wave[d].ndim == 1: X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) else: # lower bands X_wave[d] = librosa.core.resample( X_wave[d + 1], - orig_sr = self.mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=self.mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) # Stft of wave source X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( @@ -273,9 +259,7 @@ class AudioPreDeEcho: input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] + input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :] X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) aggresive_set = float(self.data["agg"] / 100) @@ -284,9 +268,7 @@ class AudioPreDeEcho: "split_bin": self.mp.param["band"][1]["crop_stop"], } with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) + pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data) # Postprocess if self.data["postprocess"]: pred_inv = np.clip(X_mag - pred, 0, np.inf) @@ -296,9 +278,7 @@ class AudioPreDeEcho: if ins_root is not None: if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], y_spec_m, input_high_end, self.mp) wav_instrument = spec_utils.cmb_spectrogram_to_wave( y_spec_m, self.mp, input_high_end_h, input_high_end_ ) @@ -315,9 +295,7 @@ class AudioPreDeEcho: self.mp.param["sr"], ) # else: - path = os.path.join( - ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_instrument) * 32768).astype("int16"), @@ -333,12 +311,8 @@ class AudioPreDeEcho: pass if vocal_root is not None: if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], v_spec_m, input_high_end, self.mp) + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) else: wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) logger.info("%s vocals done" % name) @@ -352,9 +326,7 @@ class AudioPreDeEcho: self.mp.param["sr"], ) else: - path = os.path.join( - vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(vocal_root, 
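# A minimal, self-contained sketch of the export convention the sf.write() calls
# above rely on: audio is processed as float32 in [-1, 1] and written out as
# 16-bit PCM by scaling with 32768. The file name and the 44100 Hz default below
# are illustrative assumptions, not values taken from the patch.
import numpy as np
import soundfile as sf

def write_int16(path: str, wav: np.ndarray, sr: int = 44100) -> None:
    wav = np.asarray(wav, dtype=np.float32)
    # Scale, then clip before casting so peaks at exactly 1.0 cannot wrap around.
    pcm = np.clip(wav * 32768, -32768, 32767).astype(np.int16)
    sf.write(path, pcm, sr)

# Hypothetical usage mirroring the calls above:
# write_int16("output/uvr5_opt/vocal_example_10.wav", wav_vocals, mp.param["sr"])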
"instrument_{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_vocals) * 32768).astype("int16"), diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index ce52af5c..f5f8d3f6 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -1,23 +1,22 @@ -import os -import traceback,gradio as gr import logging +import os +import traceback + +import gradio as gr + from tools.i18n.i18n import I18nAuto from tools.my_utils import clean_path + i18n = I18nAuto() logger = logging.getLogger(__name__) -import librosa,ffmpeg -import soundfile as sf -import torch import sys + +import ffmpeg +import torch +from bsroformer import Roformer_Loader from mdxnet import MDXNetDereverb from vr import AudioPre, AudioPreDeEcho -from bsroformer import Roformer_Loader - -try: - import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... weight_uvr5_root = "tools/uvr5/uvr5_weights" uvr5_names = [] @@ -25,21 +24,24 @@ for name in os.listdir(weight_uvr5_root): if name.endswith(".pth") or name.endswith(".ckpt") or "onnx" in name: uvr5_names.append(name.replace(".pth", "").replace(".ckpt", "")) -device=sys.argv[1] -is_half=eval(sys.argv[2]) -webui_port_uvr5=int(sys.argv[3]) -is_share=eval(sys.argv[4]) +device = sys.argv[1] +is_half = eval(sys.argv[2]) +webui_port_uvr5 = int(sys.argv[3]) +is_share = eval(sys.argv[4]) -def html_left(text, label='p'): + +def html_left(text, label="p"): return f"""
<{label} style="margin: 0; padding: 0;">{text}</{label}>
""" -def html_center(text, label='p'): + +def html_center(text, label="p"): return f"""
<{label} style="margin: 0; padding: 0;">{text}</{label}>
""" + def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] try: @@ -52,13 +54,15 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format elif "roformer" in model_name.lower(): func = Roformer_Loader pre_fun = func( - model_path = os.path.join(weight_uvr5_root, model_name + ".ckpt"), - config_path = os.path.join(weight_uvr5_root, model_name + ".yaml"), - device = device, - is_half=is_half + model_path=os.path.join(weight_uvr5_root, model_name + ".ckpt"), + config_path=os.path.join(weight_uvr5_root, model_name + ".yaml"), + device=device, + is_half=is_half, ) if not os.path.exists(os.path.join(weight_uvr5_root, model_name + ".yaml")): - infos.append("Warning: You are using a model without a configuration file. The program will automatically use the default configuration file. However, the default configuration file cannot guarantee that all models will run successfully. You can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again. (For example, the configuration file corresponding to the model 'bs_roformer_ep_368_sdr_12.9628.ckpt' should be 'bs_roformer_ep_368_sdr_12.9628.yaml'.) Or you can just ignore this warning.") + infos.append( + "Warning: You are using a model without a configuration file. The program will automatically use the default configuration file. However, the default configuration file cannot guarantee that all models will run successfully. You can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again. (For example, the configuration file corresponding to the model 'bs_roformer_ep_368_sdr_12.9628.ckpt' should be 'bs_roformer_ep_368_sdr_12.9628.yaml'.) Or you can just ignore this warning." 
+ ) yield "\n".join(infos) else: func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho @@ -74,19 +78,15 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format paths = [path.name for path in paths] for path in paths: inp_path = os.path.join(inp_root, path) - if(os.path.isfile(inp_path)==False):continue + if os.path.isfile(inp_path) == False: + continue need_reformat = 1 done = 0 try: info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): + if info["streams"][0]["channels"] == 2 and info["streams"][0]["sample_rate"] == "44100": need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0,is_hp3 - ) + pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal, format0, is_hp3) done = 1 except: need_reformat = 1 @@ -96,21 +96,15 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format os.path.join(os.environ["TEMP"]), os.path.basename(inp_path), ) - os.system( - f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y' - ) + os.system(f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y') inp_path = tmp_path try: if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0,is_hp3 - ) + pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal, format0, is_hp3) infos.append("%s->Success" % (os.path.basename(inp_path))) yield "\n".join(infos) except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) + infos.append("%s->%s" % (os.path.basename(inp_path), traceback.format_exc())) yield "\n".join(infos) except: infos.append(traceback.format_exc()) @@ -130,83 +124,101 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format torch.cuda.empty_cache() yield "\n".join(infos) -with gr.Blocks(title="UVR5 WebUI") as app: + +with gr.Blocks(title="UVR5 WebUI", analytics_enabled=False) as app: gr.Markdown( - value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
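# An illustrative sketch (under stated assumptions, not the patch's own helper) of
# the "probe, then reformat" step uvr() performs: inputs that are not already
# 44.1 kHz stereo are re-encoded to 16-bit PCM before separation. The temp
# directory name is a placeholder.
import os
import subprocess
import ffmpeg  # ffmpeg-python

def ensure_stereo_44100(inp_path: str, tmp_dir: str = "TEMP") -> str:
    info = ffmpeg.probe(inp_path, cmd="ffprobe")
    stream = info["streams"][0]
    if stream.get("channels") == 2 and stream.get("sample_rate") == "44100":
        return inp_path  # already in the expected format
    os.makedirs(tmp_dir, exist_ok=True)
    tmp_path = os.path.join(tmp_dir, os.path.basename(inp_path))
    # Re-encode to 2-channel 44.1 kHz signed 16-bit PCM, overwriting any stale temp file.
    subprocess.run(
        ["ffmpeg", "-i", inp_path, "-vn", "-acodec", "pcm_s16le", "-ac", "2", "-ar", "44100", tmp_path, "-y"],
        check=True,
    )
    return tmp_path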
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Group(): - gr.Markdown(html_center(i18n("伴奏人声分离&去混响&去回声"),'h2')) + gr.Markdown(html_center(i18n("伴奏人声分离&去混响&去回声"), "h2")) with gr.Group(): - gr.Markdown( - value=html_left(i18n("人声伴奏分离批量处理, 使用UVR5模型。") + "
" + \ - i18n("合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。")+ "
" + \ - i18n("模型分为三类:") + "
" + \ - i18n("1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;") + "
" + \ - i18n("2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;") + "
" + \ - i18n("3、去混响、去延迟模型(by FoxJoy):") + "
  " + \ - i18n("(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;") + "
 " + \ - i18n("(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。") + "
" + \ - i18n("去混响/去延迟,附:") + "
" + \ - i18n("1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;") + "
" + \ - i18n("2、MDX-Net-Dereverb模型挺慢的;") + "
" + \ - i18n("3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"),'h4') - ) - with gr.Row(): - with gr.Column(): - model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) - dir_wav_input = gr.Textbox( - label=i18n("输入待处理音频文件夹路径"), - placeholder="C:\\Users\\Desktop\\todo-songs", - ) - wav_inputs = gr.File( - file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") - ) - with gr.Column(): - agg = gr.Slider( - minimum=0, - maximum=20, - step=1, - label=i18n("人声提取激进程度"), - value=10, - interactive=True, - visible=False, # 先不开放调整 - ) - opt_vocal_root = gr.Textbox( - label=i18n("指定输出主人声文件夹"), value="output/uvr5_opt" - ) - opt_ins_root = gr.Textbox( - label=i18n("指定输出非主人声文件夹"), value="output/uvr5_opt" - ) - format0 = gr.Radio( - label=i18n("导出文件格式"), - choices=["wav", "flac", "mp3", "m4a"], - value="flac", - interactive=True, - ) - with gr.Column(): - with gr.Row(): - but2 = gr.Button(i18n("转换"), variant="primary") - with gr.Row(): - vc_output4 = gr.Textbox(label=i18n("输出信息"),lines=3) - but2.click( - uvr, - [ - model_choose, - dir_wav_input, - opt_vocal_root, - wav_inputs, - opt_ins_root, - agg, - format0, - ], - [vc_output4], - api_name="uvr_convert", + gr.Markdown( + value=html_left( + i18n("人声伴奏分离批量处理, 使用UVR5模型。") + + "
" + + i18n( + "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。" ) -app.queue().launch(#concurrency_count=511, max_size=1022 + + "
" + + i18n("模型分为三类:") + + "
" + + i18n( + "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;" + ) + + "
" + + i18n("2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;") + + "
" + + i18n("3、去混响、去延迟模型(by FoxJoy):") + + "
  " + + i18n("(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;") + + "
 " + + i18n( + "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。" + ) + + "
" + + i18n("去混响/去延迟,附:") + + "
" + + i18n("1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;") + + "
" + + i18n("2、MDX-Net-Dereverb模型挺慢的;") + + "
" + + i18n("3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"), + "h4", + ) + ) + with gr.Row(): + with gr.Column(): + model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) + dir_wav_input = gr.Textbox( + label=i18n("输入待处理音频文件夹路径"), + placeholder="C:\\Users\\Desktop\\todo-songs", + ) + wav_inputs = gr.File( + file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") + ) + with gr.Column(): + agg = gr.Slider( + minimum=0, + maximum=20, + step=1, + label=i18n("人声提取激进程度"), + value=10, + interactive=True, + visible=False, # 先不开放调整 + ) + opt_vocal_root = gr.Textbox(label=i18n("指定输出主人声文件夹"), value="output/uvr5_opt") + opt_ins_root = gr.Textbox(label=i18n("指定输出非主人声文件夹"), value="output/uvr5_opt") + format0 = gr.Radio( + label=i18n("导出文件格式"), + choices=["wav", "flac", "mp3", "m4a"], + value="flac", + interactive=True, + ) + with gr.Column(): + with gr.Row(): + but2 = gr.Button(i18n("转换"), variant="primary") + with gr.Row(): + vc_output4 = gr.Textbox(label=i18n("输出信息"), lines=3) + but2.click( + uvr, + [ + model_choose, + dir_wav_input, + opt_vocal_root, + wav_inputs, + opt_ins_root, + agg, + format0, + ], + [vc_output4], + api_name="uvr_convert", + ) +app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, server_port=webui_port_uvr5, - quiet=True, + # quiet=True, ) diff --git a/webui.py b/webui.py index b73ed89e..43f33249 100644 --- a/webui.py +++ b/webui.py @@ -1,25 +1,36 @@ -import os,sys -if len(sys.argv)==1:sys.argv.append('v2') -version="v1"if sys.argv[1]=="v1" else"v2" -os.environ["version"]=version +import os +import sys + +if len(sys.argv) == 1: + sys.argv.append("v2") +version = "v1" if sys.argv[1] == "v1" else "v2" +os.environ["version"] = version now_dir = os.getcwd() sys.path.insert(0, now_dir) import warnings + warnings.filterwarnings("ignore") -import json,yaml,torch,pdb,re,shutil +import json import platform -import psutil +import re +import shutil import signal -os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO' + +import psutil +import torch +import yaml + +os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO" torch.manual_seed(233333) tmp = os.path.join(now_dir, "TEMP") os.makedirs(tmp, exist_ok=True) os.environ["TEMP"] = tmp -if(os.path.exists(tmp)): +if os.path.exists(tmp): for name in os.listdir(tmp): - if(name=="jieba.cache"):continue - path="%s/%s"%(tmp,name) - delete=os.remove if os.path.isfile(path) else shutil.rmtree + if name == "jieba.cache": + continue + path = "%s/%s" % (tmp, name) + delete = os.remove if os.path.isfile(path) else shutil.rmtree try: delete(path) except Exception as e: @@ -27,12 +38,14 @@ if(os.path.exists(tmp)): pass import site import traceback + site_packages_roots = [] for path in site.getsitepackages(): if "packages" in path: site_packages_roots.append(path) -if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] -#os.environ["OPENBLAS_NUM_THREADS"] = "4" +if site_packages_roots == []: + site_packages_roots = ["%s/runtime/Lib/site-packages" % now_dir] +# os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" os.environ["all_proxy"] = "" for site_packages_root in site_packages_roots: @@ -45,29 +58,43 @@ for site_packages_root in site_packages_roots: % (now_dir, now_dir, now_dir, now_dir, now_dir, now_dir) ) break - except PermissionError as e: + except PermissionError: traceback.print_exc() -from tools import my_utils import shutil -import pdb import subprocess from subprocess import Popen -import signal -from config 
import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share + +from config import ( + exp_root, + infer_device, + is_half, + is_share, + python_exec, + webui_port_infer_tts, + webui_port_main, + webui_port_subfix, + webui_port_uvr5, +) +from tools import my_utils from tools.i18n.i18n import I18nAuto, scan_language_list -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" -os.environ["language"]=language + +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" +os.environ["language"] = language i18n = I18nAuto(language=language) -from scipy.io import wavfile -from tools.my_utils import load_audio, check_for_existance, check_details from multiprocessing import cpu_count + +from tools.my_utils import check_details, check_for_existance + # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... + + analytics.version_check = lambda: None +except: + ... import gradio as gr -n_cpu=cpu_count() + +n_cpu = cpu_count() ngpu = torch.cuda.device_count() gpu_infos = [] @@ -75,25 +102,65 @@ mem = [] if_gpu_ok = False # 判断是否有能用来训练和加速推理的N卡 -ok_gpu_keywords={"10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060","H","600","506","507","508","509"} -set_gpu_numbers=set() +ok_gpu_keywords = { + "10", + "16", + "20", + "30", + "40", + "A2", + "A3", + "A4", + "P4", + "A50", + "500", + "A60", + "70", + "80", + "90", + "M4", + "T4", + "TITAN", + "L4", + "4060", + "H", + "600", + "506", + "507", + "508", + "509", +} +set_gpu_numbers = set() if torch.cuda.is_available() or ngpu != 0: for i in range(ngpu): gpu_name = torch.cuda.get_device_name(i) - if any(value in gpu_name.upper()for value in ok_gpu_keywords): + if any(value in gpu_name.upper() for value in ok_gpu_keywords): # A10#A100#V100#A40#P40#M40#K80#A4500 if_gpu_ok = True # 至少有一张能用的N卡 gpu_infos.append("%s\t%s" % (i, gpu_name)) set_gpu_numbers.add(i) - mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4)) + mem.append(int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)) # # 判断是否支持mps加速 # if torch.backends.mps.is_available(): # if_gpu_ok = True # gpu_infos.append("%s\t%s" % ("0", "Apple GPU")) # mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存 + +v3v4set = {"v3", "v4"} + + def set_default(): - global default_batch_size,default_max_batch_size,gpu_info,default_sovits_epoch,default_sovits_save_every_epoch,max_sovits_epoch,max_sovits_save_every_epoch,default_batch_size_s1,if_force_ckpt + global \ + default_batch_size, \ + default_max_batch_size, \ + gpu_info, \ + default_sovits_epoch, \ + default_sovits_save_every_epoch, \ + max_sovits_epoch, \ + max_sovits_save_every_epoch, \ + default_batch_size_s1, \ + if_force_ckpt if_force_ckpt = False if if_gpu_ok and len(gpu_infos) > 0: gpu_info = "\n".join(gpu_infos) @@ -117,100 +184,141 @@ def set_default(): # minmem = 14 # except RuntimeError as _: # print("显存不足以开启V3训练") - default_batch_size = minmem // 2 if version!="v3"else minmem//8 - default_batch_size_s1=minmem // 2 + default_batch_size = minmem // 2 if version not in v3v4set else minmem // 8 + default_batch_size_s1 = minmem // 2 else: - gpu_info = ("%s\t%s" % ("0", "CPU")) + gpu_info = "%s\t%s" % ("0", "CPU") gpu_infos.append("%s\t%s" % ("0", "CPU")) set_gpu_numbers.add(0) - default_batch_size = 
default_batch_size_s1 = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 4) - if version!="v3": - default_sovits_epoch=8 - default_sovits_save_every_epoch=4 - max_sovits_epoch=25#40 - max_sovits_save_every_epoch=25#10 + default_batch_size = default_batch_size_s1 = int(psutil.virtual_memory().total / 1024 / 1024 / 1024 / 4) + if version not in v3v4set: + default_sovits_epoch = 8 + default_sovits_save_every_epoch = 4 + max_sovits_epoch = 25 # 40 + max_sovits_save_every_epoch = 25 # 10 else: - default_sovits_epoch=2 - default_sovits_save_every_epoch=1 - max_sovits_epoch=3#40 - max_sovits_save_every_epoch=3#10 + default_sovits_epoch = 2 + default_sovits_save_every_epoch = 1 + max_sovits_epoch = 20 # 40 # 3 + max_sovits_save_every_epoch = 10 # 10 # 3 default_batch_size = max(1, default_batch_size) default_batch_size_s1 = max(1, default_batch_size_s1) default_max_batch_size = default_batch_size * 3 + set_default() gpus = "-".join([i[0] for i in gpu_infos]) -default_gpu_numbers=str(sorted(list(set_gpu_numbers))[0]) -def fix_gpu_number(input):#将越界的number强制改到界内 +default_gpu_numbers = str(sorted(list(set_gpu_numbers))[0]) + + +def fix_gpu_number(input): # 将越界的number强制改到界内 try: - if(int(input)not in set_gpu_numbers):return default_gpu_numbers - except:return input + if int(input) not in set_gpu_numbers: + return default_gpu_numbers + except: + return input return input + + def fix_gpu_numbers(inputs): - output=[] + output = [] try: - for input in inputs.split(","):output.append(str(fix_gpu_number(input))) + for input in inputs.split(","): + output.append(str(fix_gpu_number(input))) return ",".join(output) except: return inputs -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth","GPT_SoVITS/pretrained_models/s2Gv3.pth"] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] -pretrained_model_list = (pretrained_sovits_name[int(version[-1])-1],pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D"),pretrained_gpt_name[int(version[-1])-1],"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large","GPT_SoVITS/pretrained_models/chinese-hubert-base") +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] -_ = '' +pretrained_model_list = ( + pretrained_sovits_name[int(version[-1]) - 1], + pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + pretrained_gpt_name[int(version[-1]) - 1], + "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + "GPT_SoVITS/pretrained_models/chinese-hubert-base", +) + +_ = "" for i in pretrained_model_list: if "s2Dv3" not in i and os.path.exists(i) == False: - _ += f'\n {i}' + _ += f"\n {i}" if _: - print("warning: ", i18n('以下模型不存在:') + _) + print("warning: ", i18n("以下模型不存在:") + _) + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + 
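# An illustrative sketch of the defaulting logic in set_default() above, under the
# patch's assumption that usable NVIDIA cards are recognised by name keywords and
# that the default batch size is derived from the smallest card's VRAM (divided by
# 8 for v3/v4 models, by 2 otherwise). The keyword set is trimmed for brevity.
import psutil
import torch

OK_GPU_KEYWORDS = {"10", "16", "20", "30", "40", "A2", "T4", "L4", "TITAN", "H", "90"}

def default_batch_size(version: str = "v2") -> int:
    mems = []
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i).upper()
        if any(k in name for k in OK_GPU_KEYWORDS):
            # total_memory is in bytes; +0.4 rounds e.g. a 23.6 GiB card up to 24.
            mems.append(int(torch.cuda.get_device_properties(i).total_memory / 1024**3 + 0.4))
    if mems:
        minmem = min(mems)
        return max(1, minmem // 8 if version in {"v3", "v4"} else minmem // 2)
    # CPU fallback: a quarter of system RAM in GiB, as in set_default() above.
    return max(1, int(psutil.virtual_memory().total / 1024**3 / 4))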
_[0].append(pretrained_gpt_name[i]) + else: + _[0].append("") ##没有下pretrained模型的,说不定他们是想自己从零训底模呢 + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) + else: + _[-1].append("") +pretrained_gpt_name, pretrained_sovits_name = _ + +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] +for root in SoVITS_weight_root + GPT_weight_root: + os.makedirs(root, exist_ok=True) -_ = [[],[]] -for i in range(3): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - else:_[0].append("")##没有下pretrained模型的,说不定他们是想自己从零训底模呢 - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) - else:_[-1].append("") -pretrained_gpt_name,pretrained_sovits_name = _ -SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] -GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] -for root in SoVITS_weight_root+GPT_weight_root: - os.makedirs(root,exist_ok=True) def get_weights_names(): - SoVITS_names = [name for name in pretrained_sovits_name if name!=""] + SoVITS_names = [name for name in pretrained_sovits_name if name != ""] for path in SoVITS_weight_root: for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) - GPT_names = [name for name in pretrained_gpt_name if name!=""] + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) + GPT_names = [name for name in pretrained_gpt_name if name != ""] for path in GPT_weight_root: for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) return SoVITS_names, GPT_names -SoVITS_names,GPT_names = get_weights_names() -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) + +SoVITS_names, GPT_names = get_weights_names() +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) + def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts + def change_choices(): SoVITS_names, GPT_names = get_weights_names() - return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"} + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } + + +p_label = None +p_uvr5 = None +p_asr = None +p_denoise = None +p_tts_inference = None -p_label=None -p_uvr5=None -p_asr=None -p_denoise=None -p_tts_inference=None def kill_proc_tree(pid, including_parent=True): try: @@ -231,16 +339,20 @@ def kill_proc_tree(pid, including_parent=True): except OSError: pass -system=platform.system() + +system = platform.system() + + def kill_process(pid, process_name=""): - if(system=="Windows"): + if system == "Windows": cmd = "taskkill /t /f /pid %s" % pid # os.system(cmd) - subprocess.run(cmd,shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) else: kill_proc_tree(pid) print(process_name + i18n("进程已终止")) + def process_info(process_name="", indicator=""): if indicator == "opened": 
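# A small self-contained example of the natural-sort helper used for the model
# dropdowns above: digit runs are split out so "..._e10.pth" sorts after
# "..._e2.pth". The sample names are made up.
import re

def custom_sort_key(s: str):
    parts = re.split(r"(\d+)", s)
    return [int(p) if p.isdigit() else p for p in parts]

names = ["exp_e10.pth", "exp_e2.pth", "exp_e1.pth"]
print(sorted(names, key=custom_sort_key))  # ['exp_e1.pth', 'exp_e2.pth', 'exp_e10.pth']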
return process_name + i18n("已开启") @@ -263,298 +375,548 @@ def process_info(process_name="", indicator=""): else: return process_name + process_name_subfix = i18n("音频标注WebUI") + + def change_label(path_list): global p_label if p_label is None: check_for_existance([path_list]) path_list = my_utils.clean_path(path_list) - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share) - yield process_info(process_name_subfix, "opened"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} + cmd = '"%s" -s tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % ( + python_exec, + path_list, + webui_port_subfix, + is_share, + ) + yield ( + process_info(process_name_subfix, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_label = Popen(cmd, shell=True) else: kill_process(p_label.pid, process_name_subfix) p_label = None - yield process_info(process_name_subfix, "closed"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} + yield ( + process_info(process_name_subfix, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + process_name_uvr5 = i18n("人声分离WebUI") + + def change_uvr5(): global p_uvr5 if p_uvr5 is None: - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share) - yield process_info(process_name_uvr5, "opened"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} + cmd = '"%s" -s tools/uvr5/webui.py "%s" %s %s %s' % ( + python_exec, + infer_device, + is_half, + webui_port_uvr5, + is_share, + ) + yield ( + process_info(process_name_uvr5, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_uvr5 = Popen(cmd, shell=True) else: kill_process(p_uvr5.pid, process_name_uvr5) p_uvr5 = None - yield process_info(process_name_uvr5, "closed"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} + yield ( + process_info(process_name_uvr5, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + process_name_tts = i18n("TTS推理WebUI") -def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path, batched_infer_enabled): + + +def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled): global p_tts_inference if batched_infer_enabled: - cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language) + cmd = '"%s" -s GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) else: - cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) - #####v3暂不支持加速推理 - if version=="v3": - cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) + cmd = '"%s" -s GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) + # #####v3暂不支持加速推理 + # if version=="v3": + # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) if p_tts_inference is None: - os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path) - os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path) - os.environ["cnhubert_base_path"]=cnhubert_base_path - os.environ["bert_path"]=bert_path - os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_number(gpu_number) - 
os.environ["is_half"]=str(is_half) - os.environ["infer_ttswebui"]=str(webui_port_infer_tts) - os.environ["is_share"]=str(is_share) - yield process_info(process_name_tts, "opened"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} + os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) + os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path) + os.environ["cnhubert_base_path"] = cnhubert_base_path + os.environ["bert_path"] = bert_path + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_number(gpu_number) + os.environ["is_half"] = str(is_half) + os.environ["infer_ttswebui"] = str(webui_port_infer_tts) + os.environ["is_share"] = str(is_share) + yield ( + process_info(process_name_tts, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_tts_inference = Popen(cmd, shell=True) else: kill_process(p_tts_inference.pid, process_name_tts) p_tts_inference = None - yield process_info(process_name_tts, "closed"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} + yield ( + process_info(process_name_tts, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + from tools.asr.config import asr_dict process_name_asr = i18n("语音识别") + + def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_precision): global p_asr if p_asr is None: - asr_inp_dir=my_utils.clean_path(asr_inp_dir) - asr_opt_dir=my_utils.clean_path(asr_opt_dir) + asr_inp_dir = my_utils.clean_path(asr_inp_dir) + asr_opt_dir = my_utils.clean_path(asr_opt_dir) check_for_existance([asr_inp_dir]) - cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' + cmd = f'"{python_exec}" -s tools/asr/{asr_dict[asr_model]["path"]}' cmd += f' -i "{asr_inp_dir}"' cmd += f' -o "{asr_opt_dir}"' - cmd += f' -s {asr_model_size}' - cmd += f' -l {asr_lang}' + cmd += f" -s {asr_model_size}" + cmd += f" -l {asr_lang}" cmd += f" -p {asr_precision}" output_file_name = os.path.basename(asr_inp_dir) output_folder = asr_opt_dir or "output/asr_opt" - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - yield process_info(process_name_asr, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") + yield ( + process_info(process_name_asr, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) print(cmd) p_asr = Popen(cmd, shell=True) p_asr.wait() p_asr = None - yield process_info(process_name_asr, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update", "value": output_file_path}, {"__type__": "update", "value": output_file_path}, {"__type__": "update", "value": asr_inp_dir} + yield ( + process_info(process_name_asr, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": output_file_path}, + {"__type__": "update", "value": output_file_path}, + {"__type__": "update", "value": asr_inp_dir}, + ) else: - yield process_info(process_name_asr, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": 
True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_asr, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close_asr(): global p_asr if p_asr is not None: kill_process(p_asr.pid, process_name_asr) p_asr = None - return process_info(process_name_asr, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_asr, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + process_name_denoise = i18n("语音降噪") + + def open_denoise(denoise_inp_dir, denoise_opt_dir): global p_denoise - if(p_denoise==None): - denoise_inp_dir=my_utils.clean_path(denoise_inp_dir) - denoise_opt_dir=my_utils.clean_path(denoise_opt_dir) + if p_denoise == None: + denoise_inp_dir = my_utils.clean_path(denoise_inp_dir) + denoise_opt_dir = my_utils.clean_path(denoise_opt_dir) check_for_existance([denoise_inp_dir]) - cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32") + cmd = '"%s" -s tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % ( + python_exec, + denoise_inp_dir, + denoise_opt_dir, + "float16" if is_half == True else "float32", + ) - yield process_info(process_name_denoise, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_denoise, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) print(cmd) p_denoise = Popen(cmd, shell=True) p_denoise.wait() - p_denoise=None - yield process_info(process_name_denoise, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update", "value": denoise_opt_dir}, {"__type__": "update", "value": denoise_opt_dir} + p_denoise = None + yield ( + process_info(process_name_denoise, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": denoise_opt_dir}, + {"__type__": "update", "value": denoise_opt_dir}, + ) else: - yield process_info(process_name_denoise, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_denoise, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close_denoise(): global p_denoise if p_denoise is not None: kill_process(p_denoise.pid, process_name_denoise) p_denoise = None - return process_info(process_name_denoise, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_denoise, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -p_train_SoVITS=None + +p_train_SoVITS = None process_name_sovits = i18n("SoVITS训练") -def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D,if_grad_ckpt,lora_rank): + + +def open1Ba( + batch_size, + total_epoch, + exp_name, + 
text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + if_grad_ckpt, + lora_rank, +): global p_train_SoVITS - if(p_train_SoVITS==None): - with open("GPT_SoVITS/configs/s2.json")as f: - data=f.read() - data=json.loads(data) - s2_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s2_%s"%(s2_dir,version),exist_ok=True) - if check_for_existance([s2_dir],is_train=True): - check_details([s2_dir],is_train=True) - if(is_half==False): - data["train"]["fp16_run"]=False - batch_size=max(1,batch_size//2) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["train"]["text_low_lr_rate"]=text_low_lr_rate - data["train"]["pretrained_s2G"]=pretrained_s2G - data["train"]["pretrained_s2D"]=pretrained_s2D - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["save_every_epoch"]=save_every_epoch - data["train"]["gpu_numbers"]=gpu_numbers1Ba - data["train"]["grad_ckpt"]=if_grad_ckpt - data["train"]["lora_rank"]=lora_rank - data["model"]["version"]=version - data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir - data["save_weight_dir"]=SoVITS_weight_root[int(version[-1])-1] - data["name"]=exp_name - data["version"]=version - tmp_config_path="%s/tmp_s2.json"%tmp - with open(tmp_config_path,"w")as f:f.write(json.dumps(data)) - if version in ["v1","v2"]: - cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path) + if p_train_SoVITS == None: + with open("GPT_SoVITS/configs/s2.json") as f: + data = f.read() + data = json.loads(data) + s2_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s2_%s" % (s2_dir, version), exist_ok=True) + if check_for_existance([s2_dir], is_train=True): + check_details([s2_dir], is_train=True) + if is_half == False: + data["train"]["fp16_run"] = False + batch_size = max(1, batch_size // 2) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["train"]["text_low_lr_rate"] = text_low_lr_rate + data["train"]["pretrained_s2G"] = pretrained_s2G + data["train"]["pretrained_s2D"] = pretrained_s2D + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["save_every_epoch"] = save_every_epoch + data["train"]["gpu_numbers"] = gpu_numbers1Ba + data["train"]["grad_ckpt"] = if_grad_ckpt + data["train"]["lora_rank"] = lora_rank + data["model"]["version"] = version + data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir + data["save_weight_dir"] = SoVITS_weight_root[int(version[-1]) - 1] + data["name"] = exp_name + data["version"] = version + tmp_config_path = "%s/tmp_s2.json" % tmp + with open(tmp_config_path, "w") as f: + f.write(json.dumps(data)) + if version in ["v1", "v2"]: + cmd = '"%s" -s GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path) else: - cmd = '"%s" GPT_SoVITS/s2_train_v3_lora.py --config "%s"'%(python_exec,tmp_config_path) - yield process_info(process_name_sovits, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + cmd = '"%s" -s GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path) + yield ( + process_info(process_name_sovits, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) print(cmd) p_train_SoVITS = Popen(cmd, shell=True) p_train_SoVITS.wait() p_train_SoVITS = None - 
yield process_info(process_name_sovits, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + SoVITS_dropdown_update, GPT_dropdown_update = change_choices() + yield ( + process_info(process_name_sovits, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + SoVITS_dropdown_update, + GPT_dropdown_update, + ) else: - yield process_info(process_name_sovits, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_sovits, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close1Ba(): global p_train_SoVITS if p_train_SoVITS is not None: kill_process(p_train_SoVITS.pid, process_name_sovits) p_train_SoVITS = None - return process_info(process_name_sovits, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_sovits, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -p_train_GPT=None + +p_train_GPT = None process_name_gpt = i18n("GPT训练") -def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1): + + +def open1Bb( + batch_size, + total_epoch, + exp_name, + if_dpo, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers, + pretrained_s1, +): global p_train_GPT - if(p_train_GPT==None): - with open("GPT_SoVITS/configs/s1longer.yaml"if version=="v1"else "GPT_SoVITS/configs/s1longer-v2.yaml")as f: - data=f.read() - data=yaml.load(data, Loader=yaml.FullLoader) - s1_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True) - if check_for_existance([s1_dir],is_train=True): - check_details([s1_dir],is_train=True) - if(is_half==False): - data["train"]["precision"]="32" + if p_train_GPT == None: + with open( + "GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml" + ) as f: + data = f.read() + data = yaml.load(data, Loader=yaml.FullLoader) + s1_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s1" % (s1_dir), exist_ok=True) + if check_for_existance([s1_dir], is_train=True): + check_details([s1_dir], is_train=True) + if is_half == False: + data["train"]["precision"] = "32" batch_size = max(1, batch_size // 2) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["pretrained_s1"]=pretrained_s1 - data["train"]["save_every_n_epoch"]=save_every_epoch - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_dpo"]=if_dpo - data["train"]["half_weights_save_dir"]=GPT_weight_root[int(version[-1])-1] - data["train"]["exp_name"]=exp_name - data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir - data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir - data["output_dir"]="%s/logs_s1_%s"%(s1_dir,version) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["pretrained_s1"] = pretrained_s1 + data["train"]["save_every_n_epoch"] = save_every_epoch + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_dpo"] = if_dpo + data["train"]["half_weights_save_dir"] = GPT_weight_root[int(version[-1]) - 1] + data["train"]["exp_name"] = exp_name + 
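# A condensed sketch of the pattern open1Ba / open1Bb follow above: read a template
# config, patch the run-specific fields, write it to a temp file, then launch the
# training script on it and wait. Only the fields shown in the patch are used here;
# everything else (paths, defaults) is illustrative.
import json
import os
from subprocess import Popen

def launch_s2_training(exp_name: str, batch_size: int, total_epoch: int,
                       python_exec: str = "python", exp_root: str = "logs",
                       tmp: str = "TEMP") -> int:
    with open("GPT_SoVITS/configs/s2.json") as f:
        data = json.load(f)
    s2_dir = os.path.join(exp_root, exp_name)
    os.makedirs(s2_dir, exist_ok=True)
    data["train"]["batch_size"] = batch_size
    data["train"]["epochs"] = total_epoch
    data["name"] = exp_name
    data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir
    os.makedirs(tmp, exist_ok=True)
    tmp_config_path = os.path.join(tmp, "tmp_s2.json")
    with open(tmp_config_path, "w") as f:
        json.dump(data, f)
    cmd = f'"{python_exec}" -s GPT_SoVITS/s2_train.py --config "{tmp_config_path}"'
    print(cmd)
    p = Popen(cmd, shell=True)
    return p.wait()  # blocks until training exits; the WebUI yields status updates around this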
data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir + data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir + data["output_dir"] = "%s/logs_s1_%s" % (s1_dir, version) # data["version"]=version - os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_numbers(gpu_numbers.replace("-",",")) - os.environ["hz"]="25hz" - tmp_config_path="%s/tmp_s1.yaml"%tmp - with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False)) + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_numbers(gpu_numbers.replace("-", ",")) + os.environ["hz"] = "25hz" + tmp_config_path = "%s/tmp_s1.yaml" % tmp + with open(tmp_config_path, "w") as f: + f.write(yaml.dump(data, default_flow_style=False)) # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) - cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path) - yield process_info(process_name_gpt, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + cmd = '"%s" -s GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path) + yield ( + process_info(process_name_gpt, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) print(cmd) p_train_GPT = Popen(cmd, shell=True) p_train_GPT.wait() p_train_GPT = None - yield process_info(process_name_gpt, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + SoVITS_dropdown_update, GPT_dropdown_update = change_choices() + yield ( + process_info(process_name_gpt, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + SoVITS_dropdown_update, + GPT_dropdown_update, + ) else: - yield process_info(process_name_gpt, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_gpt, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close1Bb(): global p_train_GPT if p_train_GPT is not None: kill_process(p_train_GPT.pid, process_name_gpt) p_train_GPT = None - return process_info(process_name_gpt, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_gpt, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps_slice=[] + +ps_slice = [] process_name_slice = i18n("语音切分") -def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts): + + +def open_slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, n_parts): global ps_slice inp = my_utils.clean_path(inp) opt_root = my_utils.clean_path(opt_root) check_for_existance([inp]) - if(os.path.exists(inp)==False): - yield i18n("输入路径不存在"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + if os.path.exists(inp) == False: + yield ( + i18n("输入路径不存在"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) return - if os.path.isfile(inp):n_parts=1 - 
elif os.path.isdir(inp):pass + if os.path.isfile(inp): + n_parts = 1 + elif os.path.isdir(inp): + pass else: - yield i18n("输入路径存在但不可用"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + i18n("输入路径存在但不可用"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) return - if (ps_slice == []): + if ps_slice == []: for i_part in range(n_parts): - cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts) + cmd = '"%s" -s tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % ( + python_exec, + inp, + opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + i_part, + n_parts, + ) print(cmd) p = Popen(cmd, shell=True) ps_slice.append(p) - yield process_info(process_name_slice, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_slice, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) for p in ps_slice: p.wait() - ps_slice=[] - yield process_info(process_name_slice, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update", "value": opt_root}, {"__type__": "update", "value": opt_root}, {"__type__": "update", "value": opt_root} + ps_slice = [] + yield ( + process_info(process_name_slice, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": opt_root}, + {"__type__": "update", "value": opt_root}, + {"__type__": "update", "value": opt_root}, + ) else: - yield process_info(process_name_slice, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_slice, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close_slice(): global ps_slice - if (ps_slice != []): + if ps_slice != []: for p_slice in ps_slice: try: kill_process(p_slice.pid, process_name_slice) except: traceback.print_exc() - ps_slice=[] - return process_info(process_name_slice, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps_slice = [] + return ( + process_info(process_name_slice, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1a=[] + +ps1a = [] process_name_1a = i18n("文本分词与特征提取") -def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): + + +def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): global ps1a inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1a == []): - opt_dir="%s/%s"%(exp_root,exp_name) - 
config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1a == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { "i_part": str(i_part), "all_parts": str(all_parts), "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - "is_half": str(is_half) + "is_half": str(is_half), } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1a.append(p) - yield process_info(process_name_1a, "running"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1a, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) for p in ps1a: p.wait() opt = [] @@ -566,13 +928,26 @@ def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): path_text = "%s/2-name2text.txt" % opt_dir with open(path_text, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - ps1a=[] + ps1a = [] if len("".join(opt)) > 0: - yield process_info(process_name_1a, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1a, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1a, "failed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1a, "failed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1a, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1a, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1a(): global ps1a @@ -583,27 +958,34 @@ def close1a(): except: traceback.print_exc() ps1a = [] - return process_info(process_name_1a, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_1a, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1b=[] + +ps1b = [] process_name_1b = i18n("语音自监督特征提取") -def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): + + +def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): global ps1b inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1b == []): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir": "%s/%s"%(exp_root,exp_name), - "cnhubert_base_dir":ssl_pretrained_dir, - "is_half": 
str(is_half) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1b == []: + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": "%s/%s" % (exp_root, exp_name), + "cnhubert_base_dir": ssl_pretrained_dir, + "is_half": str(is_half), } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -613,48 +995,68 @@ def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1b.append(p) - yield process_info(process_name_1b, "running"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1b, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) for p in ps1b: p.wait() - ps1b=[] - yield process_info(process_name_1b, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1b = [] + yield ( + process_info(process_name_1b, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1b, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1b, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1b(): global ps1b - if (ps1b != []): + if ps1b != []: for p1b in ps1b: try: kill_process(p1b.pid, process_name_1b) except: traceback.print_exc() - ps1b=[] - return process_info(process_name_1b, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1b = [] + return ( + process_info(process_name_1b, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1c=[] + +ps1c = [] process_name_1c = i18n("语义Token提取") -def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): + + +def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): global ps1c inp_text = my_utils.clean_path(inp_text) - if check_for_existance([inp_text,''], is_dataset_processing=True): - check_details([inp_text,''], is_dataset_processing=True) - if (ps1c == []): - opt_dir="%s/%s"%(exp_root,exp_name) - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", - "is_half": str(is_half) + if check_for_existance([inp_text, ""], is_dataset_processing=True): + check_details([inp_text, ""], is_dataset_processing=True) + if ps1c == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", + "is_half": str(is_half), } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -664,11 +1066,15 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): } ) os.environ.update(config) - cmd = 
'"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1c.append(p) - yield process_info(process_name_1c, "running"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1c, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) for p in ps1c: p.wait() opt = ["item_name\tsemantic_audio"] @@ -680,46 +1086,75 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): os.remove(semantic_path) with open(path_semantic, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - ps1c=[] - yield process_info(process_name_1c, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1c = [] + yield ( + process_info(process_name_1c, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1c, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1c, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1c(): global ps1c - if (ps1c != []): + if ps1c != []: for p1c in ps1c: try: kill_process(p1c.pid, process_name_1c) except: traceback.print_exc() - ps1c=[] - return process_info(process_name_1c, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1c = [] + return ( + process_info(process_name_1c, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1abc=[] + +ps1abc = [] process_name_1abc = i18n("训练集格式化一键三连") -def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path): + + +def open1abc( + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + ssl_pretrained_dir, + pretrained_s2G_path, +): global ps1abc inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1abc == []): - opt_dir="%s/%s"%(exp_root,exp_name) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1abc == []: + opt_dir = "%s/%s" % (exp_root, exp_name) try: #############################1a - path_text="%s/2-name2text.txt" % opt_dir - if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, - "is_half": str(is_half) + path_text = "%s/2-name2text.txt" % opt_dir + if os.path.exists(path_text) == False or ( + os.path.exists(path_text) == True + and len(open(path_text, "r", encoding="utf8").read().strip("\n").split("\n")) < 2 + ): + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, + "is_half": str(is_half), } - gpu_names=gpu_numbers1a.split("-") - all_parts=len(gpu_names) + 
gpu_names = gpu_numbers1a.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -729,34 +1164,43 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield i18n("进度") + ": 1A-Doing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() + yield ( + i18n("进度") + ": 1A-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() opt = [] - for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) + for i_part in range(all_parts): # txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) - with open(txt_path, "r",encoding="utf8") as f: + with open(txt_path, "r", encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(txt_path) - with open(path_text, "w",encoding="utf8") as f: + with open(path_text, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") assert len("".join(opt)) > 0, process_info(process_name_1a, "failed") - yield i18n("进度") + ": 1A-Done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] + yield ( + i18n("进度") + ": 1A-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] #############################1b - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "cnhubert_base_dir":ssl_pretrained_dir, + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "cnhubert_base_dir": ssl_pretrained_dir, } - gpu_names=gpu_numbers1Ba.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1Ba.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -766,26 +1210,37 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield i18n("进度") + ": 1A-Done, 1B-Doing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() - yield i18n("进度") + ": 1A-Done, 1B-Done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] + yield ( + i18n("进度") + ": 1A-Done, 1B-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() + yield ( + i18n("进度") + ": 1A-Done, 1B-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] #############################1c path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)): - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", + if os.path.exists(path_semantic) == False or ( + os.path.exists(path_semantic) 
== True and os.path.getsize(path_semantic) < 31 + ): + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", } - gpu_names=gpu_numbers1c.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1c.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -795,338 +1250,736 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield i18n("进度") + ": 1A-Done, 1B-Done, 1C-Doing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() + yield ( + i18n("进度") + ": 1A-Done, 1B-Done, 1C-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() opt = ["item_name\tsemantic_audio"] for i_part in range(all_parts): semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) - with open(semantic_path, "r",encoding="utf8") as f: + with open(semantic_path, "r", encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(semantic_path) - with open(path_semantic, "w",encoding="utf8") as f: + with open(path_semantic, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - yield i18n("进度") + ": 1A-Done, 1B-Done, 1C-Done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + i18n("进度") + ": 1A-Done, 1B-Done, 1C-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) ps1abc = [] - yield process_info(process_name_1abc, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1abc, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) except: traceback.print_exc() close1abc() - yield process_info(process_name_1abc, "failed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1abc, "failed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1abc, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1abc, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1abc(): global ps1abc - if (ps1abc != []): + if ps1abc != []: for p1abc in ps1abc: try: kill_process(p1abc.pid, process_name_1abc) except: traceback.print_exc() - ps1abc=[] - return process_info(process_name_1abc, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1abc = [] + return ( + process_info(process_name_1abc, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + def switch_version(version_): - os.environ["version"]=version_ + os.environ["version"] = version_ global version version = version_ - if pretrained_sovits_name[int(version[-1])-1] !='' and pretrained_gpt_name[int(version[-1])-1] !='':... + if pretrained_sovits_name[int(version[-1]) - 1] != "" and pretrained_gpt_name[int(version[-1]) - 1] != "": + ... 
else: - gr.Warning(i18n('未下载模型') + ": " + version.upper()) + gr.Warning(i18n("未下载模型") + ": " + version.upper()) set_default() - return {'__type__': 'update', 'value': pretrained_sovits_name[int(version[-1])-1]}, \ - {'__type__': 'update', 'value': pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D")}, \ - {'__type__': 'update', 'value': pretrained_gpt_name[int(version[-1])-1]}, \ - {'__type__': 'update', 'value': pretrained_gpt_name[int(version[-1])-1]}, \ - {'__type__': 'update', 'value': pretrained_sovits_name[int(version[-1])-1]}, \ - {'__type__': 'update', "value": default_batch_size, "maximum": default_max_batch_size}, \ - {'__type__': 'update', "value": default_sovits_epoch, "maximum": max_sovits_epoch}, \ - {'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \ - {'__type__': 'update', "visible": True if version!="v3"else False}, \ - {'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \ - {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ - {'__type__': 'update', "visible": True if version== "v3" else False} + return ( + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D")}, + {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": default_batch_size, "maximum": default_max_batch_size}, + {"__type__": "update", "value": default_sovits_epoch, "maximum": max_sovits_epoch}, + {"__type__": "update", "value": default_sovits_save_every_epoch, "maximum": max_sovits_save_every_epoch}, + {"__type__": "update", "visible": True if version not in v3v4set else False}, + { + "__type__": "update", + "value": False if not if_force_ckpt else True, + "interactive": True if not if_force_ckpt else False, + }, + {"__type__": "update", "interactive": True, "value": False}, + {"__type__": "update", "visible": True if version in v3v4set else False}, + ) # {'__type__': 'update', "interactive": False if version in v3v4set else True, "value": False}, \ ####batch infer -if os.path.exists('GPT_SoVITS/text/G2PWModel'):... + +if os.path.exists("GPT_SoVITS/text/G2PWModel"): + ... else: - cmd = '"%s" GPT_SoVITS/download.py'%python_exec + cmd = '"%s" -s GPT_SoVITS/download.py' % python_exec p = Popen(cmd, shell=True) p.wait() -def sync(text): - return {'__type__': 'update', 'value': text} -with gr.Blocks(title="GPT-SoVITS WebUI") as app: +def sync(text): + return {"__type__": "update", "value": text} + + +with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app: gr.Markdown( - value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - ) - gr.Markdown( - value= - i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e" + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) + gr.Markdown(value=i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e") with gr.Tabs(): - with gr.TabItem("0-"+i18n("前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 - gr.Markdown(value="0a-"+i18n("UVR5人声伴奏分离&去混响去延迟工具")) + with gr.TabItem("0-" + i18n("前置数据集获取工具")): # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 + gr.Markdown(value="0a-" + i18n("UVR5人声伴奏分离&去混响去延迟工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): uvr5_info = gr.Textbox(label=process_info(process_name_uvr5, "info")) - open_uvr5 = gr.Button(value=process_info(process_name_uvr5, "open"),variant="primary",visible=True) - close_uvr5 = gr.Button(value=process_info(process_name_uvr5, "close"),variant="primary",visible=False) + open_uvr5 = gr.Button(value=process_info(process_name_uvr5, "open"), variant="primary", visible=True) + close_uvr5 = gr.Button(value=process_info(process_name_uvr5, "close"), variant="primary", visible=False) - gr.Markdown(value="0b-"+i18n("语音切分工具")) + gr.Markdown(value="0b-" + i18n("语音切分工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="") - slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt") + slice_inp_path = gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"), value="") + slice_opt_root = gr.Textbox(label=i18n("切分后的子音频的输出根目录"), value="output/slicer_opt") with gr.Row(): - threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34") - min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000") - min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300") - hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10") - max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500") + threshold = gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"), value="-34") + min_length = gr.Textbox( + label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"), + value="4000", + ) + min_interval = gr.Textbox(label=i18n("min_interval:最短切割间隔"), value="300") + hop_size = gr.Textbox( + label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"), + value="10", + ) + max_sil_kept = gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"), value="500") with gr.Row(): - _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True) - alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True) + _max = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("max:归一化后最大值多少"), + value=0.9, + interactive=True, + ) + alpha = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("alpha_mix:混多少比例归一化后音频进来"), + value=0.25, + interactive=True, + ) with gr.Row(): - n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True) + n_process = gr.Slider( + minimum=1, maximum=n_cpu, step=1, label=i18n("切割使用的进程数"), value=4, interactive=True + ) slicer_info = gr.Textbox(label=process_info(process_name_slice, "info")) - open_slicer_button = gr.Button(value=process_info(process_name_slice, "open"),variant="primary",visible=True) - close_slicer_button = gr.Button(value=process_info(process_name_slice, "close"),variant="primary",visible=False) + open_slicer_button = gr.Button( + value=process_info(process_name_slice, "open"), variant="primary", visible=True + ) + close_slicer_button = gr.Button( + 
value=process_info(process_name_slice, "close"), variant="primary", visible=False + ) - gr.Markdown(value="0bb-"+i18n("语音降噪工具")) + gr.Markdown(value="0bb-" + i18n("语音降噪工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - denoise_input_dir=gr.Textbox(label=i18n("输入文件夹路径"),value="") - denoise_output_dir=gr.Textbox(label=i18n("输出文件夹路径"),value="output/denoise_opt") + denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="") + denoise_output_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/denoise_opt") with gr.Row(): denoise_info = gr.Textbox(label=process_info(process_name_denoise, "info")) - open_denoise_button = gr.Button(value=process_info(process_name_denoise, "open"),variant="primary",visible=True) - close_denoise_button = gr.Button(value=process_info(process_name_denoise, "close"),variant="primary",visible=False) + open_denoise_button = gr.Button( + value=process_info(process_name_denoise, "open"), variant="primary", visible=True + ) + close_denoise_button = gr.Button( + value=process_info(process_name_denoise, "close"), variant="primary", visible=False + ) - gr.Markdown(value="0c-"+i18n("语音识别工具")) + gr.Markdown(value="0c-" + i18n("语音识别工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - asr_inp_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True) + asr_inp_dir = gr.Textbox( + label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True + ) asr_opt_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/asr_opt", interactive=True) with gr.Row(): - asr_model = gr.Dropdown(label=i18n("ASR 模型"), choices=list(asr_dict.keys()), interactive=True, value="达摩 ASR (中文)") - asr_size = gr.Dropdown(label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large") - asr_lang = gr.Dropdown(label=i18n("ASR 语言设置"), choices=["zh","yue"], interactive=True, value="zh") - asr_precision = gr.Dropdown(label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32") + asr_model = gr.Dropdown( + label=i18n("ASR 模型"), + choices=list(asr_dict.keys()), + interactive=True, + value="达摩 ASR (中文)", + ) + asr_size = gr.Dropdown( + label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large" + ) + asr_lang = gr.Dropdown( + label=i18n("ASR 语言设置"), choices=["zh", "yue"], interactive=True, value="zh" + ) + asr_precision = gr.Dropdown( + label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32" + ) with gr.Row(): asr_info = gr.Textbox(label=process_info(process_name_asr, "info")) - open_asr_button = gr.Button(value=process_info(process_name_asr, "open"),variant="primary",visible=True) - close_asr_button = gr.Button(value=process_info(process_name_asr, "close"),variant="primary",visible=False) + open_asr_button = gr.Button( + value=process_info(process_name_asr, "open"), variant="primary", visible=True + ) + close_asr_button = gr.Button( + value=process_info(process_name_asr, "close"), variant="primary", visible=False + ) - def change_lang_choices(key): #根据选择的模型修改可选的语言 - return {"__type__": "update", "choices": asr_dict[key]['lang'], "value": asr_dict[key]['lang'][0]} - def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 - return {"__type__": "update", "choices": asr_dict[key]['size'], "value": asr_dict[key]['size'][-1]} - def change_precision_choices(key): #根据选择的模型修改可选的语言 - if key =="Faster Whisper (多语种)": + def change_lang_choices(key): # 根据选择的模型修改可选的语言 + return {"__type__": "update", "choices": asr_dict[key]["lang"], "value": asr_dict[key]["lang"][0]} + + def change_size_choices(key): # 
根据选择的模型修改可选的模型尺寸 + return {"__type__": "update", "choices": asr_dict[key]["size"], "value": asr_dict[key]["size"][-1]} + + def change_precision_choices(key): # 根据选择的模型修改可选的语言 + if key == "Faster Whisper (多语种)": if default_batch_size <= 4: - precision = 'int8' + precision = "int8" elif is_half: - precision = 'float16' + precision = "float16" else: - precision = 'float32' + precision = "float32" else: - precision = 'float32' - return {"__type__": "update", "choices": asr_dict[key]['precision'], "value": precision} + precision = "float32" + return {"__type__": "update", "choices": asr_dict[key]["precision"], "value": precision} + asr_model.change(change_lang_choices, [asr_model], [asr_lang]) asr_model.change(change_size_choices, [asr_model], [asr_size]) asr_model.change(change_precision_choices, [asr_model], [asr_precision]) - gr.Markdown(value="0d-"+i18n("语音文本校对标注工具")) + gr.Markdown(value="0d-" + i18n("语音文本校对标注工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - path_list = gr.Textbox(label=i18n("标注文件路径 (含文件后缀 *.list)"), value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", interactive=True) + path_list = gr.Textbox( + label=i18n("标注文件路径 (含文件后缀 *.list)"), + value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", + interactive=True, + ) label_info = gr.Textbox(label=process_info(process_name_subfix, "info")) - open_label = gr.Button(value=process_info(process_name_subfix, "open"),variant="primary",visible=True) - close_label = gr.Button(value=process_info(process_name_subfix, "close"),variant="primary",visible=False) + open_label = gr.Button(value=process_info(process_name_subfix, "open"), variant="primary", visible=True) + close_label = gr.Button( + value=process_info(process_name_subfix, "close"), variant="primary", visible=False + ) - open_label.click(change_label, [path_list], [label_info,open_label,close_label]) - close_label.click(change_label, [path_list], [label_info,open_label,close_label]) - open_uvr5.click(change_uvr5, [], [uvr5_info,open_uvr5,close_uvr5]) - close_uvr5.click(change_uvr5, [], [uvr5_info,open_uvr5,close_uvr5]) + open_label.click(change_label, [path_list], [label_info, open_label, close_label]) + close_label.click(change_label, [path_list], [label_info, open_label, close_label]) + open_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) + close_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) with gr.TabItem(i18n("1-GPT-SoVITS-TTS")): with gr.Row(): with gr.Row(): exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True) gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False) - version_checkbox = gr.Radio(label=i18n("版本"),value=version,choices=['v1','v2','v3']) + version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"]) # , "v3" with gr.Row(): - pretrained_s2G = gr.Textbox(label=i18n("预训练SoVITS-G模型路径"), value=pretrained_sovits_name[int(version[-1])-1], interactive=True, lines=2, max_lines=3,scale=9) - pretrained_s2D = gr.Textbox(label=i18n("预训练SoVITS-D模型路径"), value=pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D"), interactive=True, lines=2, max_lines=3,scale=9) - pretrained_s1 = gr.Textbox(label=i18n("预训练GPT模型路径"), value=pretrained_gpt_name[int(version[-1])-1], interactive=True, lines=2, max_lines=3,scale=10) + pretrained_s2G = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=True, + lines=2, + max_lines=3, + scale=9, + ) + pretrained_s2D = gr.Textbox( + label=i18n("预训练SoVITS-D模型路径"), 
+ value=pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + interactive=True, + lines=2, + max_lines=3, + scale=9, + ) + pretrained_s1 = gr.Textbox( + label=i18n("预训练GPT模型路径"), + value=pretrained_gpt_name[int(version[-1]) - 1], + interactive=True, + lines=2, + max_lines=3, + scale=10, + ) - with gr.TabItem("1A-"+i18n("训练集格式化工具")): + with gr.TabItem("1A-" + i18n("训练集格式化工具")): gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹")) with gr.Row(): with gr.Row(): - inp_text = gr.Textbox(label=i18n("*文本标注文件"),value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True,scale=10) + inp_text = gr.Textbox( + label=i18n("*文本标注文件"), + value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list", + interactive=True, + scale=10, + ) with gr.Row(): inp_wav_dir = gr.Textbox( label=i18n("*训练集音频文件目录"), # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", interactive=True, - placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。"), scale=10 + placeholder=i18n( + "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。" + ), + scale=10, ) - gr.Markdown(value="1Aa-"+process_name_1a) + gr.Markdown(value="1Aa-" + process_name_1a) with gr.Row(): with gr.Row(): - gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) + gpu_numbers1a = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - bert_pretrained_dir = gr.Textbox(label=i18n("预训练中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False,lines=2) + bert_pretrained_dir = gr.Textbox( + label=i18n("预训练中文BERT模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + interactive=False, + lines=2, + ) with gr.Row(): - button1a_open = gr.Button(value=process_info(process_name_1a, "open"),variant="primary",visible=True) - button1a_close = gr.Button(value=process_info(process_name_1a, "close"),variant="primary",visible=False) + button1a_open = gr.Button( + value=process_info(process_name_1a, "open"), variant="primary", visible=True + ) + button1a_close = gr.Button( + value=process_info(process_name_1a, "close"), variant="primary", visible=False + ) with gr.Row(): - info1a=gr.Textbox(label=process_info(process_name_1a, "info")) + info1a = gr.Textbox(label=process_info(process_name_1a, "info")) - gr.Markdown(value="1Ab-"+process_name_1b) + gr.Markdown(value="1Ab-" + process_name_1b) with gr.Row(): with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - cnhubert_base_dir = gr.Textbox(label=i18n("预训练SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False,lines=2) + cnhubert_base_dir = gr.Textbox( + label=i18n("预训练SSL模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-hubert-base", + interactive=False, + lines=2, + ) with gr.Row(): - button1b_open = gr.Button(value=process_info(process_name_1b, "open"),variant="primary",visible=True) - button1b_close = gr.Button(value=process_info(process_name_1b, "close"),variant="primary",visible=False) + button1b_open = gr.Button( + value=process_info(process_name_1b, "open"), variant="primary", visible=True + ) + button1b_close = gr.Button( + value=process_info(process_name_1b, "close"), variant="primary", visible=False + ) with gr.Row(): - 
info1b=gr.Textbox(label=process_info(process_name_1b, "info")) + info1b = gr.Textbox(label=process_info(process_name_1b, "info")) - gr.Markdown(value="1Ac-"+process_name_1c) + gr.Markdown(value="1Ac-" + process_name_1c) with gr.Row(): with gr.Row(): - gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) + gpu_numbers1c = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - pretrained_s2G_ = gr.Textbox(label=i18n("预训练SoVITS-G模型路径"), value=pretrained_sovits_name[int(version[-1])-1], interactive=False,lines=2) + pretrained_s2G_ = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=False, + lines=2, + ) with gr.Row(): - button1c_open = gr.Button(value=process_info(process_name_1c, "open"),variant="primary",visible=True) - button1c_close = gr.Button(value=process_info(process_name_1c, "close"),variant="primary",visible=False) + button1c_open = gr.Button( + value=process_info(process_name_1c, "open"), variant="primary", visible=True + ) + button1c_close = gr.Button( + value=process_info(process_name_1c, "close"), variant="primary", visible=False + ) with gr.Row(): - info1c=gr.Textbox(label=process_info(process_name_1c, "info")) + info1c = gr.Textbox(label=process_info(process_name_1c, "info")) - gr.Markdown(value="1Aabc-"+process_name_1abc) + gr.Markdown(value="1Aabc-" + process_name_1abc) with gr.Row(): with gr.Row(): - button1abc_open = gr.Button(value=process_info(process_name_1abc, "open"),variant="primary",visible=True) - button1abc_close = gr.Button(value=process_info(process_name_1abc, "close"),variant="primary",visible=False) + button1abc_open = gr.Button( + value=process_info(process_name_1abc, "open"), variant="primary", visible=True + ) + button1abc_close = gr.Button( + value=process_info(process_name_1abc, "close"), variant="primary", visible=False + ) with gr.Row(): - info1abc=gr.Textbox(label=process_info(process_name_1abc, "info")) + info1abc = gr.Textbox(label=process_info(process_name_1abc, "info")) - pretrained_s2G.change(sync,[pretrained_s2G],[pretrained_s2G_]) - open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], [asr_info,open_asr_button,close_asr_button,path_list,inp_text,inp_wav_dir]) - close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button]) - open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button,asr_inp_dir,denoise_input_dir,inp_wav_dir]) - close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button]) - open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button,asr_inp_dir,inp_wav_dir]) - close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button]) + pretrained_s2G.change(sync, [pretrained_s2G], [pretrained_s2G_]) + open_asr_button.click( + open_asr, + [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], + [asr_info, open_asr_button, close_asr_button, path_list, inp_text, inp_wav_dir], + ) + close_asr_button.click(close_asr, [], [asr_info, open_asr_button, close_asr_button]) + open_slicer_button.click( + open_slice, + [ + slice_inp_path, + slice_opt_root, + threshold, + min_length, + min_interval, + 
hop_size, + max_sil_kept, + _max, + alpha, + n_process, + ], + [slicer_info, open_slicer_button, close_slicer_button, asr_inp_dir, denoise_input_dir, inp_wav_dir], + ) + close_slicer_button.click(close_slice, [], [slicer_info, open_slicer_button, close_slicer_button]) + open_denoise_button.click( + open_denoise, + [denoise_input_dir, denoise_output_dir], + [denoise_info, open_denoise_button, close_denoise_button, asr_inp_dir, inp_wav_dir], + ) + close_denoise_button.click(close_denoise, [], [denoise_info, open_denoise_button, close_denoise_button]) - button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close]) - button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close]) - button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close]) - button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close]) - button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close]) - button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close]) - button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close]) - button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close]) + button1a_open.click( + open1a, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1a, bert_pretrained_dir], + [info1a, button1a_open, button1a_close], + ) + button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close]) + button1b_open.click( + open1b, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], + [info1b, button1b_open, button1b_close], + ) + button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close]) + button1c_open.click( + open1c, [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close] + ) + button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close]) + button1abc_open.click( + open1abc, + [ + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + cnhubert_base_dir, + pretrained_s2G, + ], + [info1abc, button1abc_open, button1abc_close], + ) + button1abc_close.click(close1abc, [], [info1abc, button1abc_open, button1abc_close]) - with gr.TabItem("1B-"+i18n("微调训练")): - gr.Markdown(value="1Ba-"+i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")) + with gr.TabItem("1B-" + i18n("微调训练")): + gr.Markdown(value="1Ba-" + i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")) with gr.Row(): with gr.Column(): with gr.Row(): - batch_size = gr.Slider(minimum=1,maximum=default_max_batch_size,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) - total_epoch = gr.Slider(minimum=1,maximum=max_sovits_epoch,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=default_sovits_epoch,interactive=True) + batch_size = gr.Slider( + minimum=1, + maximum=default_max_batch_size, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, + ) + total_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_epoch, + step=1, + label=i18n("总训练轮数total_epoch,不建议太高"), + value=default_sovits_epoch, + interactive=True, + ) with gr.Row(): - text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,visible=True if version!="v3"else False)#v3 not need - lora_rank 
= gr.Radio(label=i18n("LoRA秩"), value="32", choices=['16', '32', '64', '128'],visible=True if version=="v3"else False)#v1v2 not need - save_every_epoch = gr.Slider(minimum=1,maximum=max_sovits_save_every_epoch,step=1,label=i18n("保存频率save_every_epoch"),value=default_sovits_save_every_epoch,interactive=True) + text_low_lr_rate = gr.Slider( + minimum=0.2, + maximum=0.6, + step=0.05, + label=i18n("文本模块学习率权重"), + value=0.4, + visible=True if version not in v3v4set else False, + ) # v3v4 not need + lora_rank = gr.Radio( + label=i18n("LoRA秩"), + value="32", + choices=["16", "32", "64", "128"], + visible=True if version in v3v4set else False, + ) # v1v2 not need + save_every_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_save_every_epoch, + step=1, + label=i18n("保存频率save_every_epoch"), + value=default_sovits_save_every_epoch, + interactive=True, + ) with gr.Column(): with gr.Column(): - if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) - if_grad_ckpt = gr.Checkbox(label="v3是否开启梯度检查点节省显存占用", value=False, interactive=True if version == "v3" else False, show_label=True,visible=False) # 只有V3s2可以用 + if_save_latest = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + if_grad_ckpt = gr.Checkbox( + label="v3是否开启梯度检查点节省显存占用", + value=False, + interactive=True if version in v3v4set else False, + show_label=True, + visible=False, + ) # 只有V3s2可以用 with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True) + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) with gr.Row(): with gr.Row(): - button1Ba_open = gr.Button(value=process_info(process_name_sovits, "open"),variant="primary",visible=True) - button1Ba_close = gr.Button(value=process_info(process_name_sovits, "close"),variant="primary",visible=False) + button1Ba_open = gr.Button( + value=process_info(process_name_sovits, "open"), variant="primary", visible=True + ) + button1Ba_close = gr.Button( + value=process_info(process_name_sovits, "close"), variant="primary", visible=False + ) with gr.Row(): - info1Ba=gr.Textbox(label=process_info(process_name_sovits, "info")) - gr.Markdown(value="1Bb-"+i18n("GPT 训练: 模型权重文件在 GPT_weights/")) + info1Ba = gr.Textbox(label=process_info(process_name_sovits, "info")) + gr.Markdown(value="1Bb-" + i18n("GPT 训练: 模型权重文件在 GPT_weights/")) with gr.Row(): with gr.Column(): with gr.Row(): - batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size_s1,interactive=True) - total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True) + batch_size1Bb = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size_s1, + interactive=True, + ) + total_epoch1Bb = gr.Slider( + minimum=2, + maximum=50, + step=1, + label=i18n("总训练轮数total_epoch"), + value=15, + interactive=True, + ) with gr.Row(): - save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True) - if_dpo = gr.Checkbox(label=i18n("是否开启DPO训练选项(实验性)"), value=False, 
interactive=True, show_label=True) + save_every_epoch1Bb = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + interactive=True, + ) + if_dpo = gr.Checkbox( + label=i18n("是否开启DPO训练选项(实验性)"), + value=False, + interactive=True, + show_label=True, + ) with gr.Column(): with gr.Column(): - if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) + if_save_latest1Bb = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights1Bb = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) with gr.Row(): - gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True) + gpu_numbers1Bb = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) with gr.Row(): with gr.Row(): - button1Bb_open = gr.Button(value=process_info(process_name_gpt, "open"),variant="primary",visible=True) - button1Bb_close = gr.Button(value=process_info(process_name_gpt, "close"),variant="primary",visible=False) + button1Bb_open = gr.Button( + value=process_info(process_name_gpt, "open"), variant="primary", visible=True + ) + button1Bb_close = gr.Button( + value=process_info(process_name_gpt, "close"), variant="primary", visible=False + ) with gr.Row(): - info1Bb=gr.Textbox(label=process_info(process_name_gpt, "info")) + info1Bb = gr.Textbox(label=process_info(process_name_gpt, "info")) - button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D,if_grad_ckpt,lora_rank], [info1Ba,button1Ba_open,button1Ba_close]) - button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close]) - button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close]) - button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close]) + button1Ba_close.click(close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close]) + button1Bb_close.click(close1Bb, [], [info1Bb, button1Bb_open, button1Bb_close]) - with gr.TabItem("1C-"+i18n("推理")): - gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。")) + with gr.TabItem("1C-" + i18n("推理")): + gr.Markdown( + value=i18n( + "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。" + ) + ) with gr.Row(): with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name[0],interactive=True) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name[0],interactive=True) + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=pretrained_gpt_name[0], + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=pretrained_sovits_name[0], + interactive=True, + ) with gr.Row(): - gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) + 
gpu_number_1C = gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") - refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown]) + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) with gr.Row(): with gr.Row(): - batched_infer_enabled = gr.Checkbox(label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True) + batched_infer_enabled = gr.Checkbox( + label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True + ) with gr.Row(): - open_tts = gr.Button(value=process_info(process_name_tts, "open"),variant='primary',visible=True) - close_tts = gr.Button(value=process_info(process_name_tts, "close"),variant='primary',visible=False) + open_tts = gr.Button( + value=process_info(process_name_tts, "open"), variant="primary", visible=True + ) + close_tts = gr.Button( + value=process_info(process_name_tts, "close"), variant="primary", visible=False + ) with gr.Row(): tts_info = gr.Textbox(label=process_info(process_name_tts, "info")) - open_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts]) - close_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts]) + open_tts.click( + change_tts_inference, + [ + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + batched_infer_enabled, + ], + [tts_info, open_tts, close_tts], + ) + close_tts.click( + change_tts_inference, + [ + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + batched_infer_enabled, + ], + [tts_info, open_tts, close_tts], + ) + button1Ba_open.click( + open1Ba, + [ + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + if_grad_ckpt, + lora_rank, + ], + [info1Ba, button1Ba_open, button1Ba_close, SoVITS_dropdown, GPT_dropdown], + ) + button1Bb_open.click( + open1Bb, + [ + batch_size1Bb, + total_epoch1Bb, + exp_name, + if_dpo, + if_save_latest1Bb, + if_save_every_weights1Bb, + save_every_epoch1Bb, + gpu_numbers1Bb, + pretrained_s1, + ], + [info1Bb, button1Bb_open, button1Bb_close, SoVITS_dropdown, GPT_dropdown], + ) + version_checkbox.change( + switch_version, + [version_checkbox], + [ + pretrained_s2G, + pretrained_s2D, + pretrained_s1, + GPT_dropdown, + SoVITS_dropdown, + batch_size, + total_epoch, + save_every_epoch, + text_low_lr_rate, + if_grad_ckpt, + batched_infer_enabled, + lora_rank, + ], + ) - version_checkbox.change(switch_version,[version_checkbox],[pretrained_s2G,pretrained_s2D,pretrained_s1,GPT_dropdown,SoVITS_dropdown,batch_size,total_epoch,save_every_epoch,text_low_lr_rate, if_grad_ckpt, batched_infer_enabled, lora_rank]) + with gr.TabItem(i18n("2-GPT-SoVITS-变声")): + gr.Markdown(value=i18n("施工中,请静候佳音")) - with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音")) - - app.queue().launch(#concurrency_count=511, max_size=1022 + app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, server_port=webui_port_main, - quiet=True, + # quiet=True, )
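The dataset-preparation handlers in this hunk (open1a/open1b/open1c and open1abc) all share one convention: the "-"-separated GPU string is split into shards, each shard is handed to a worker subprocess through the i_part/all_parts/_CUDA_VISIBLE_DEVICES environment variables, and the parent waits before merging the per-part output files. The following is a minimal sketch of that convention, not code from the patch: the wrapper name launch_shards and the example paths are hypothetical, and it passes env= per worker instead of mutating os.environ as webui.py does.

# Minimal sketch (assumptions noted above); shards a prepare_datasets script across GPUs.
import os
import sys
from subprocess import Popen


def launch_shards(gpu_numbers: str, script: str, base_config: dict) -> None:
    gpu_names = gpu_numbers.split("-")  # e.g. "0-1" -> two workers
    all_parts = len(gpu_names)
    procs = []
    for i_part, gpu in enumerate(gpu_names):
        env = dict(os.environ)
        env.update(base_config)
        env.update(
            {
                "i_part": str(i_part),         # which slice of the .list this worker handles
                "all_parts": str(all_parts),   # total number of slices
                "_CUDA_VISIBLE_DEVICES": gpu,  # pin the worker to one GPU
            }
        )
        # "-s" keeps the interpreter from adding the user site-packages directory
        cmd = '"%s" -s %s' % (sys.executable, script)
        procs.append(Popen(cmd, shell=True, env=env))
    for p in procs:
        p.wait()  # the caller then concatenates the per-part output files


if __name__ == "__main__":
    # Illustrative call: shard 1-get-text.py across GPUs 0 and 1.
    launch_shards(
        "0-1",
        "GPT_SoVITS/prepare_datasets/1-get-text.py",
        {"inp_text": "raw/xxx.list", "exp_name": "xxx"},
    )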
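The reformatted open/close handlers also share a UI pattern: each generator yields a three-element tuple (a status string plus two visibility updates) so Gradio can swap the paired "open" and "close" buttons while a job runs, which is what the long {"__type__": "update", "visible": ...} dictionaries above encode. Below is a standalone sketch of that pattern under stated assumptions: run_task, stop_task, and demo are illustrative names, gr.update is used in place of the raw dictionaries, and a sleep stands in for Popen(...).wait().

# Standalone sketch of the open/close toggle pattern; names are illustrative.
import time

import gradio as gr

running = False


def run_task():
    global running
    if running:
        # "occupy": a run is already in progress, keep the close button visible
        yield "occupied", gr.update(visible=False), gr.update(visible=True)
        return
    running = True
    # hide "open", show "close" while the job runs
    yield "running", gr.update(visible=False), gr.update(visible=True)
    time.sleep(2)  # stand-in for Popen(...).wait()
    running = False
    yield "finished", gr.update(visible=True), gr.update(visible=False)


def stop_task():
    global running
    running = False
    return "closed", gr.update(visible=True), gr.update(visible=False)


with gr.Blocks() as demo:
    info = gr.Textbox(label="info")
    open_btn = gr.Button("open", visible=True)
    close_btn = gr.Button("close", visible=False)
    open_btn.click(run_task, [], [info, open_btn, close_btn])
    close_btn.click(stop_task, [], [info, open_btn, close_btn])

if __name__ == "__main__":
    demo.launch()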