mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2026-06-04 05:01:27 +08:00
Merge 319a09fa2d54335ab4893190c860c77beed76a83 into 2d9193b0d3c0eae0c3a14d8c68a839f1bae157dc
This commit is contained in:
commit
60a25737d2
@ -1,6 +1,6 @@
|
|||||||
GPT_SoVITS/pretrained_models/*
|
GPT_SoVITS/pretrained_models/*
|
||||||
tools/asr/models/*
|
gsv_tools/asr/models/*
|
||||||
tools/uvr5/uvr5_weights/*
|
gsv_tools/uvr5/uvr5_weights/*
|
||||||
|
|
||||||
.git
|
.git
|
||||||
.DS_Store
|
.DS_Store
|
||||||
@ -21,6 +21,8 @@ cfg.json
|
|||||||
speakers.json
|
speakers.json
|
||||||
ref_audios
|
ref_audios
|
||||||
|
|
||||||
|
pylock.toml
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
**/__pycache__/
|
**/__pycache__/
|
||||||
|
|||||||
4
.github/build_windows_packages.ps1
vendored
4
.github/build_windows_packages.ps1
vendored
@ -92,13 +92,13 @@ Write-Host "[INFO] Download G2PWModel..."
|
|||||||
DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"
|
DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"
|
||||||
|
|
||||||
Write-Host "[INFO] Download UVR5 model..."
|
Write-Host "[INFO] Download UVR5 model..."
|
||||||
DownloadAndUnzip $UVR5_URL "tools\uvr5"
|
DownloadAndUnzip $UVR5_URL "gsv_tools\uvr5"
|
||||||
|
|
||||||
Write-Host "[INFO] Downloading funasr..."
|
Write-Host "[INFO] Downloading funasr..."
|
||||||
$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
|
$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
|
||||||
$funasrZip = "$tmpDir\funasr.zip"
|
$funasrZip = "$tmpDir\funasr.zip"
|
||||||
Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
|
Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
|
||||||
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force
|
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\gsv_tools\asr\models" -Force
|
||||||
Remove-Item $funasrZip
|
Remove-Item $funasrZip
|
||||||
|
|
||||||
Write-Host "[INFO] Download ffmpeg..."
|
Write-Host "[INFO] Download ffmpeg..."
|
||||||
|
|||||||
46
.gitignore
vendored
46
.gitignore
vendored
@ -1,24 +1,3 @@
|
|||||||
.DS_Store
|
|
||||||
.vscode
|
|
||||||
__pycache__
|
|
||||||
*.pyc
|
|
||||||
env
|
|
||||||
runtime
|
|
||||||
.idea
|
|
||||||
output
|
|
||||||
logs
|
|
||||||
SoVITS_weights*/
|
|
||||||
GPT_weights*/
|
|
||||||
TEMP
|
|
||||||
weight.json
|
|
||||||
ffmpeg*
|
|
||||||
ffprobe*
|
|
||||||
cfg.json
|
|
||||||
speakers.json
|
|
||||||
ref_audios
|
|
||||||
tools/AP_BWE_main/24kto48k/*
|
|
||||||
!tools/AP_BWE_main/24kto48k/readme.txt
|
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
@ -193,3 +172,28 @@ cython_debug/
|
|||||||
|
|
||||||
# PyPI configuration file
|
# PyPI configuration file
|
||||||
.pypirc
|
.pypirc
|
||||||
|
|
||||||
|
# GPT-SoVITS
|
||||||
|
|
||||||
|
.DS_Store
|
||||||
|
.vscode
|
||||||
|
__pycache__
|
||||||
|
*.pyc
|
||||||
|
pylock.toml
|
||||||
|
env
|
||||||
|
runtime
|
||||||
|
.idea
|
||||||
|
output
|
||||||
|
logs
|
||||||
|
SoVITS_weights*/
|
||||||
|
GPT_weights*/
|
||||||
|
TEMP
|
||||||
|
weight.json
|
||||||
|
ffmpeg*
|
||||||
|
ffprobe*
|
||||||
|
cfg.json
|
||||||
|
speakers.json
|
||||||
|
ref_audios
|
||||||
|
gsv_tools/AP_BWE_main/24kto48k/*
|
||||||
|
!gsv_tools/AP_BWE_main/24kto48k/readme.txt
|
||||||
|
!gsv_tools/uvr5/lib
|
||||||
@ -3,13 +3,18 @@ ci:
|
|||||||
|
|
||||||
repos:
|
repos:
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.11.7
|
rev: v0.15.1
|
||||||
hooks:
|
hooks:
|
||||||
# Run the linter.
|
# Run the linter.
|
||||||
- id: ruff
|
- id: ruff
|
||||||
types_or: [ python, pyi ]
|
types_or: [ python, pyi ]
|
||||||
args: [ --fix , "--exit-zero" ]
|
args: [ --fix , "--unsafe-fixes", "--exit-zero" ]
|
||||||
# Run the formatter.
|
# Run the formatter.
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
types_or: [ python, pyi ]
|
types_or: [ python, pyi ]
|
||||||
args: [ --line-length, "120", --target-version, "py311" ]
|
|
||||||
|
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||||
|
# uv version.
|
||||||
|
rev: 0.10.3
|
||||||
|
hooks:
|
||||||
|
- id: uv-lock
|
||||||
@ -53,7 +53,7 @@
|
|||||||
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
|
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
|
||||||
" :\n",
|
" :\n",
|
||||||
"else\n",
|
"else\n",
|
||||||
" conda create -n GPTSoVITS python=3.10 -y\n",
|
" conda create -n GPTSoVITS python=3.11 -y\n",
|
||||||
"fi\n",
|
"fi\n",
|
||||||
"\n",
|
"\n",
|
||||||
"source activate GPTSoVITS\n",
|
"source activate GPTSoVITS\n",
|
||||||
|
|||||||
@ -52,7 +52,7 @@
|
|||||||
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
|
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
|
||||||
" :\n",
|
" :\n",
|
||||||
"else\n",
|
"else\n",
|
||||||
" conda create -n GPTSoVITS python=3.10 -y\n",
|
" conda create -n GPTSoVITS python=3.11 -y\n",
|
||||||
"fi\n",
|
"fi\n",
|
||||||
"\n",
|
"\n",
|
||||||
"source activate GPTSoVITS\n",
|
"source activate GPTSoVITS\n",
|
||||||
|
|||||||
@ -57,21 +57,20 @@ source "$HOME/.bashrc"
|
|||||||
|
|
||||||
"$HOME/conda/bin/conda" install python=3.12 -y
|
"$HOME/conda/bin/conda" install python=3.12 -y
|
||||||
|
|
||||||
"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y
|
"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg uv cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y
|
||||||
|
|
||||||
|
cd workspace
|
||||||
|
|
||||||
if [ "$CUDA_VERSION" = "12.8" ]; then
|
if [ "$CUDA_VERSION" = "12.8" ]; then
|
||||||
"$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
|
"$HOME/conda/bin/uv" pip install ".[cu128]" --no-cache-dir --python "$HOME/conda/bin/python"
|
||||||
"$HOME/conda/bin/conda" install cuda-nvcc=12.8 -y
|
|
||||||
elif [ "$CUDA_VERSION" = "12.6" ]; then
|
elif [ "$CUDA_VERSION" = "12.6" ]; then
|
||||||
"$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
|
"$HOME/conda/bin/uv" pip install ".[cu126]" --no-cache-dir --python "$HOME/conda/bin/python"
|
||||||
"$HOME/conda/bin/conda" install cuda-nvcc=12.6 -y
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
export PATH="$HOME/conda/bin:$PATH"
|
export PATH="$HOME/conda/bin:$PATH"
|
||||||
|
|
||||||
"$HOME/conda/bin/pip" install psutil ninja packaging wheel "setuptools>=42" einops
|
"$HOME/conda/bin/uv" pip install ".[flash-attn]" --python "$HOME/conda/bin/python"
|
||||||
"$HOME/conda/bin/pip" install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
|
"$HOME/conda/bin/uv" cache clean
|
||||||
"$HOME/conda/bin/pip" cache purge
|
|
||||||
|
|
||||||
rm $LOG_PATH
|
rm $LOG_PATH
|
||||||
|
|
||||||
|
|||||||
24
Dockerfile
24
Dockerfile
@ -15,7 +15,7 @@ SHELL ["/bin/bash", "-c"]
|
|||||||
|
|
||||||
WORKDIR /workspace/GPT-SoVITS
|
WORKDIR /workspace/GPT-SoVITS
|
||||||
|
|
||||||
COPY Docker /workspace/GPT-SoVITS/Docker/
|
COPY . /workspace/GPT-SoVITS
|
||||||
|
|
||||||
ARG LITE=false
|
ARG LITE=false
|
||||||
ENV LITE=${LITE}
|
ENV LITE=${LITE}
|
||||||
@ -26,12 +26,6 @@ ENV WORKFLOW=${WORKFLOW}
|
|||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
ENV TARGETPLATFORM=${TARGETPLATFORM}
|
ENV TARGETPLATFORM=${TARGETPLATFORM}
|
||||||
|
|
||||||
COPY extra-req.txt /workspace/GPT-SoVITS/
|
|
||||||
|
|
||||||
COPY requirements.txt /workspace/GPT-SoVITS/
|
|
||||||
|
|
||||||
COPY install.sh /workspace/GPT-SoVITS/
|
|
||||||
|
|
||||||
RUN bash Docker/install_wrapper.sh
|
RUN bash Docker/install_wrapper.sh
|
||||||
|
|
||||||
EXPOSE 9871 9872 9873 9874 9880
|
EXPOSE 9871 9872 9873 9874 9880
|
||||||
@ -40,21 +34,13 @@ ENV PYTHONPATH="/workspace/GPT-SoVITS"
|
|||||||
|
|
||||||
RUN conda init bash && echo "conda activate base" >> ~/.bashrc
|
RUN conda init bash && echo "conda activate base" >> ~/.bashrc
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
RUN rm -rf /workspace/GPT-SoVITS
|
|
||||||
|
|
||||||
WORKDIR /workspace/GPT-SoVITS
|
|
||||||
|
|
||||||
COPY . /workspace/GPT-SoVITS
|
|
||||||
|
|
||||||
CMD ["/bin/bash", "-c", "\
|
CMD ["/bin/bash", "-c", "\
|
||||||
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
|
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
|
||||||
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
|
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
|
||||||
rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
|
rm -rf /workspace/GPT-SoVITS/gsv_tools/asr/models && \
|
||||||
rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
|
rm -rf /workspace/GPT-SoVITS/gsv_tools/uvr5/uvr5_weights && \
|
||||||
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
|
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
|
||||||
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
|
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
|
||||||
ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
|
ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/gsv_tools/asr/models && \
|
||||||
ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
|
ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/gsv_tools/uvr5/uvr5_weights && \
|
||||||
exec bash"]
|
exec bash"]
|
||||||
@ -9,6 +9,7 @@
|
|||||||
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
|
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
|
||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
- **Sep 2024 (v2.4):**
|
- **Sep 2024 (v2.4):**
|
||||||
- We have updated the pretrained checkpoints trained for 5M steps. This is the final release of the BigVGAN-v2 checkpoints.
|
- We have updated the pretrained checkpoints trained for 5M steps. This is the final release of the BigVGAN-v2 checkpoints.
|
||||||
|
|
||||||
@ -31,7 +32,7 @@
|
|||||||
The codebase has been tested on Python `3.10` and PyTorch `2.3.1` conda packages with either `pytorch-cuda=12.1` or `pytorch-cuda=11.8`. Below is an example command to create the conda environment:
|
The codebase has been tested on Python `3.10` and PyTorch `2.3.1` conda packages with either `pytorch-cuda=12.1` or `pytorch-cuda=11.8`. Below is an example command to create the conda environment:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
conda create -n bigvgan python=3.10 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
|
conda create -n bigvgan python=3.11 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
|
||||||
conda activate bigvgan
|
conda activate bigvgan
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -186,17 +187,17 @@ If you see `[Fail] test CUDA fused vs. plain torch BigVGAN inference`, it means
|
|||||||
We provide the [pretrained models on Hugging Face Collections](https://huggingface.co/collections/nvidia/bigvgan-66959df3d97fd7d98d97dc9a).
|
We provide the [pretrained models on Hugging Face Collections](https://huggingface.co/collections/nvidia/bigvgan-66959df3d97fd7d98d97dc9a).
|
||||||
One can download the checkpoints of the generator weight (named `bigvgan_generator.pt`) and its discriminator/optimizer states (named `bigvgan_discriminator_optimizer.pt`) within the listed model repositories.
|
One can download the checkpoints of the generator weight (named `bigvgan_generator.pt`) and its discriminator/optimizer states (named `bigvgan_discriminator_optimizer.pt`) within the listed model repositories.
|
||||||
|
|
||||||
| Model Name | Sampling Rate | Mel band | fmax | Upsampling Ratio | Params | Dataset | Steps | Fine-Tuned |
|
| Model Name | Sampling Rate | Mel band | fmax | Upsampling Ratio | Params | Dataset | Steps | Fine-Tuned |
|
||||||
|:--------------------------------------------------------------------------------------------------------:|:-------------:|:--------:|:-----:|:----------------:|:------:|:--------------------------:|:-----:|:----------:|
|
| :------------------------------------------------------------------------------------------------------: | :-----------: | :------: | :---: | :--------------: | :----: | :------------------------: | :---: | :--------: |
|
||||||
| [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x) | 44 kHz | 128 | 22050 | 512 | 122M | Large-scale Compilation | 5M | No |
|
| [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x) | 44 kHz | 128 | 22050 | 512 | 122M | Large-scale Compilation | 5M | No |
|
||||||
| [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x) | 44 kHz | 128 | 22050 | 256 | 112M | Large-scale Compilation | 5M | No |
|
| [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x) | 44 kHz | 128 | 22050 | 256 | 112M | Large-scale Compilation | 5M | No |
|
||||||
| [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x) | 24 kHz | 100 | 12000 | 256 | 112M | Large-scale Compilation | 5M | No |
|
| [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x) | 24 kHz | 100 | 12000 | 256 | 112M | Large-scale Compilation | 5M | No |
|
||||||
| [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x) | 22 kHz | 80 | 11025 | 256 | 112M | Large-scale Compilation | 5M | No |
|
| [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x) | 22 kHz | 80 | 11025 | 256 | 112M | Large-scale Compilation | 5M | No |
|
||||||
| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) | 22 kHz | 80 | 8000 | 256 | 112M | Large-scale Compilation | 5M | No |
|
| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) | 22 kHz | 80 | 8000 | 256 | 112M | Large-scale Compilation | 5M | No |
|
||||||
| [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 112M | LibriTTS | 5M | No |
|
| [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 112M | LibriTTS | 5M | No |
|
||||||
| [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 14M | LibriTTS | 5M | No |
|
| [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 14M | LibriTTS | 5M | No |
|
||||||
| [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 112M | LibriTTS + VCTK + LJSpeech | 5M | No |
|
| [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 112M | LibriTTS + VCTK + LJSpeech | 5M | No |
|
||||||
| [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 14M | LibriTTS + VCTK + LJSpeech | 5M | No |
|
| [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 14M | LibriTTS + VCTK + LJSpeech | 5M | No |
|
||||||
|
|
||||||
The paper results are based on the original 24kHz BigVGAN models (`bigvgan_24khz_100band` and `bigvgan_base_24khz_100band`) trained on the LibriTTS dataset.
|
The paper results are based on the original 24kHz BigVGAN models (`bigvgan_24khz_100band` and `bigvgan_base_24khz_100band`) trained on the LibriTTS dataset.
|
||||||
We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications.
|
We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications.
|
||||||
@ -219,37 +220,37 @@ When training BigVGAN-v2 from scratch with small batch size, it can potentially
|
|||||||
|
|
||||||
Below are the objective results of the 24kHz model (`bigvgan_v2_24khz_100band_256x`) obtained from the LibriTTS `dev` sets. BigVGAN-v2 shows noticeable improvements in the metrics. The model also exhibits reduced perceptual artifacts, especially for non-speech audio.
|
Below are the objective results of the 24kHz model (`bigvgan_v2_24khz_100band_256x`) obtained from the LibriTTS `dev` sets. BigVGAN-v2 shows noticeable improvements in the metrics. The model also exhibits reduced perceptual artifacts, especially for non-speech audio.
|
||||||
|
|
||||||
| Model | Dataset | Steps | PESQ(↑) | M-STFT(↓) | MCD(↓) | Periodicity(↓) | V/UV F1(↑) |
|
| Model | Dataset | Steps | PESQ(↑) | M-STFT(↓) | MCD(↓) | Periodicity(↓) | V/UV F1(↑) |
|
||||||
|:----------:|:-----------------------:|:-----:|:---------:|:----------:|:----------:|:--------------:|:----------:|
|
| :--------: | :---------------------: | :---: | :-------: | :--------: | :--------: | :------------: | :--------: |
|
||||||
| BigVGAN | LibriTTS | 1M | 4.027 | 0.7997 | 0.3745 | 0.1018 | 0.9598 |
|
| BigVGAN | LibriTTS | 1M | 4.027 | 0.7997 | 0.3745 | 0.1018 | 0.9598 |
|
||||||
| BigVGAN | LibriTTS | 5M | 4.256 | 0.7409 | 0.2988 | 0.0809 | 0.9698 |
|
| BigVGAN | LibriTTS | 5M | 4.256 | 0.7409 | 0.2988 | 0.0809 | 0.9698 |
|
||||||
| BigVGAN-v2 | Large-scale Compilation | 3M | 4.359 | 0.7134 | 0.3060 | 0.0621 | 0.9777 |
|
| BigVGAN-v2 | Large-scale Compilation | 3M | 4.359 | 0.7134 | 0.3060 | 0.0621 | 0.9777 |
|
||||||
| BigVGAN-v2 | Large-scale Compilation | 5M | **4.362** | **0.7026** | **0.2903** | **0.0593** | **0.9793** |
|
| BigVGAN-v2 | Large-scale Compilation | 5M | **4.362** | **0.7026** | **0.2903** | **0.0593** | **0.9793** |
|
||||||
|
|
||||||
## Speed Benchmark
|
## Speed Benchmark
|
||||||
|
|
||||||
Below are the speed and VRAM usage benchmark results of BigVGAN from `tests/test_cuda_vs_torch_model.py`, using `bigvgan_v2_24khz_100band_256x` as a reference model.
|
Below are the speed and VRAM usage benchmark results of BigVGAN from `tests/test_cuda_vs_torch_model.py`, using `bigvgan_v2_24khz_100band_256x` as a reference model.
|
||||||
|
|
||||||
| GPU | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
|
| GPU | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
|
||||||
|:--------------------------:|:-------------:|:---------------:|:-----------:|:----------------:|:---------:|
|
| :------------------------: | :-----------: | :-------------: | :---------: | :--------------: | :-------: |
|
||||||
| NVIDIA A100 | 256 | False | 1672.1 | 69.7x | 1.3 |
|
| NVIDIA A100 | 256 | False | 1672.1 | 69.7x | 1.3 |
|
||||||
| | | True | 3916.5 | 163.2x | 1.3 |
|
| | | True | 3916.5 | 163.2x | 1.3 |
|
||||||
| | 2048 | False | 1899.6 | 79.2x | 1.7 |
|
| | 2048 | False | 1899.6 | 79.2x | 1.7 |
|
||||||
| | | True | 5330.1 | 222.1x | 1.7 |
|
| | | True | 5330.1 | 222.1x | 1.7 |
|
||||||
| | 16384 | False | 1973.8 | 82.2x | 5.0 |
|
| | 16384 | False | 1973.8 | 82.2x | 5.0 |
|
||||||
| | | True | 5761.7 | 240.1x | 4.4 |
|
| | | True | 5761.7 | 240.1x | 4.4 |
|
||||||
| NVIDIA GeForce RTX 3080 | 256 | False | 841.1 | 35.0x | 1.3 |
|
| NVIDIA GeForce RTX 3080 | 256 | False | 841.1 | 35.0x | 1.3 |
|
||||||
| | | True | 1598.1 | 66.6x | 1.3 |
|
| | | True | 1598.1 | 66.6x | 1.3 |
|
||||||
| | 2048 | False | 929.9 | 38.7x | 1.7 |
|
| | 2048 | False | 929.9 | 38.7x | 1.7 |
|
||||||
| | | True | 1971.3 | 82.1x | 1.6 |
|
| | | True | 1971.3 | 82.1x | 1.6 |
|
||||||
| | 16384 | False | 943.4 | 39.3x | 5.0 |
|
| | 16384 | False | 943.4 | 39.3x | 5.0 |
|
||||||
| | | True | 2026.5 | 84.4x | 3.9 |
|
| | | True | 2026.5 | 84.4x | 3.9 |
|
||||||
| NVIDIA GeForce RTX 2080 Ti | 256 | False | 515.6 | 21.5x | 1.3 |
|
| NVIDIA GeForce RTX 2080 Ti | 256 | False | 515.6 | 21.5x | 1.3 |
|
||||||
| | | True | 811.3 | 33.8x | 1.3 |
|
| | | True | 811.3 | 33.8x | 1.3 |
|
||||||
| | 2048 | False | 576.5 | 24.0x | 1.7 |
|
| | 2048 | False | 576.5 | 24.0x | 1.7 |
|
||||||
| | | True | 1023.0 | 42.6x | 1.5 |
|
| | | True | 1023.0 | 42.6x | 1.5 |
|
||||||
| | 16384 | False | 589.4 | 24.6x | 5.0 |
|
| | 16384 | False | 589.4 | 24.6x | 5.0 |
|
||||||
| | | True | 1068.1 | 44.5x | 3.2 |
|
| | | True | 1068.1 | 44.5x | 3.2 |
|
||||||
|
|
||||||
## Acknowledgements
|
## Acknowledgements
|
||||||
|
|
||||||
|
|||||||
@ -10,10 +10,10 @@ from copy import deepcopy
|
|||||||
import torchaudio
|
import torchaudio
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
import os
|
import os
|
||||||
from typing import List, Tuple, Union
|
|
||||||
|
|
||||||
import ffmpeg
|
import ffmpeg
|
||||||
import librosa
|
import librosa
|
||||||
@ -25,23 +25,24 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
|||||||
from BigVGAN.bigvgan import BigVGAN
|
from BigVGAN.bigvgan import BigVGAN
|
||||||
from feature_extractor.cnhubert import CNHubert
|
from feature_extractor.cnhubert import CNHubert
|
||||||
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
|
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
|
||||||
from module.models import SynthesizerTrn, SynthesizerTrnV3, Generator
|
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
|
||||||
from peft import LoraConfig, get_peft_model
|
from peft import LoraConfig, get_peft_model
|
||||||
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
||||||
|
from sv import SV
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
|
|
||||||
from tools.audio_sr import AP_BWE
|
from gsv_tools.audio_sr import AP_BWE
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
from TTS_infer_pack.text_segmentation_method import splits
|
from TTS_infer_pack.text_segmentation_method import splits
|
||||||
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
|
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
|
||||||
from sv import SV
|
|
||||||
|
|
||||||
resample_transform_dict = {}
|
resample_transform_dict = {}
|
||||||
|
|
||||||
|
|
||||||
def resample(audio_tensor, sr0, sr1, device):
|
def resample(audio_tensor, sr0, sr1, device):
|
||||||
global resample_transform_dict
|
global resample_transform_dict
|
||||||
key = "%s-%s-%s" % (sr0, sr1, str(device))
|
key = f"{sr0}-{sr1}-{str(device)}"
|
||||||
if key not in resample_transform_dict:
|
if key not in resample_transform_dict:
|
||||||
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||||
return resample_transform_dict[key](audio_tensor)
|
return resample_transform_dict[key](audio_tensor)
|
||||||
@ -64,33 +65,36 @@ def denorm_spec(x):
|
|||||||
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
|
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
|
||||||
|
|
||||||
|
|
||||||
mel_fn = lambda x: mel_spectrogram_torch(
|
def mel_fn(x):
|
||||||
x,
|
return mel_spectrogram_torch(
|
||||||
**{
|
x,
|
||||||
"n_fft": 1024,
|
**{
|
||||||
"win_size": 1024,
|
"n_fft": 1024,
|
||||||
"hop_size": 256,
|
"win_size": 1024,
|
||||||
"num_mels": 100,
|
"hop_size": 256,
|
||||||
"sampling_rate": 24000,
|
"num_mels": 100,
|
||||||
"fmin": 0,
|
"sampling_rate": 24000,
|
||||||
"fmax": None,
|
"fmin": 0,
|
||||||
"center": False,
|
"fmax": None,
|
||||||
},
|
"center": False,
|
||||||
)
|
},
|
||||||
|
)
|
||||||
|
|
||||||
mel_fn_v4 = lambda x: mel_spectrogram_torch(
|
|
||||||
x,
|
def mel_fn_v4(x):
|
||||||
**{
|
return mel_spectrogram_torch(
|
||||||
"n_fft": 1280,
|
x,
|
||||||
"win_size": 1280,
|
**{
|
||||||
"hop_size": 320,
|
"n_fft": 1280,
|
||||||
"num_mels": 100,
|
"win_size": 1280,
|
||||||
"sampling_rate": 32000,
|
"hop_size": 320,
|
||||||
"fmin": 0,
|
"num_mels": 100,
|
||||||
"fmax": None,
|
"sampling_rate": 32000,
|
||||||
"center": False,
|
"fmin": 0,
|
||||||
},
|
"fmax": None,
|
||||||
)
|
"center": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def speed_change(input_audio: np.ndarray, speed: float, sr: int):
|
def speed_change(input_audio: np.ndarray, speed: float, sr: int):
|
||||||
@ -132,7 +136,7 @@ class DictToAttrRecursive(dict):
|
|||||||
def __setattr__(self, key, value):
|
def __setattr__(self, key, value):
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
value = DictToAttrRecursive(value)
|
value = DictToAttrRecursive(value)
|
||||||
super(DictToAttrRecursive, self).__setitem__(key, value)
|
super().__setitem__(key, value)
|
||||||
super().__setattr__(key, value)
|
super().__setattr__(key, value)
|
||||||
|
|
||||||
def __delattr__(self, item):
|
def __delattr__(self, item):
|
||||||
@ -276,12 +280,12 @@ class TTS_Config:
|
|||||||
v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"]
|
v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"]
|
||||||
languages: list = v2_languages
|
languages: list = v2_languages
|
||||||
mute_tokens: dict = {
|
mute_tokens: dict = {
|
||||||
"v1" : 486,
|
"v1": 486,
|
||||||
"v2" : 486,
|
"v2": 486,
|
||||||
"v2Pro": 486,
|
"v2Pro": 486,
|
||||||
"v2ProPlus": 486,
|
"v2ProPlus": 486,
|
||||||
"v3" : 486,
|
"v3": 486,
|
||||||
"v4" : 486,
|
"v4": 486,
|
||||||
}
|
}
|
||||||
mute_emb_sim_matrix: torch.Tensor = None
|
mute_emb_sim_matrix: torch.Tensor = None
|
||||||
# "all_zh",#全部按中文识别
|
# "all_zh",#全部按中文识别
|
||||||
@ -296,7 +300,7 @@ class TTS_Config:
|
|||||||
# "auto",#多语种启动切分识别语种
|
# "auto",#多语种启动切分识别语种
|
||||||
# "auto_yue",#多语种启动切分识别语种
|
# "auto_yue",#多语种启动切分识别语种
|
||||||
|
|
||||||
def __init__(self, configs: Union[dict, str] = None):
|
def __init__(self, configs: dict | str = None):
|
||||||
# 设置默认配置文件路径
|
# 设置默认配置文件路径
|
||||||
configs_base_path: str = "GPT_SoVITS/configs/"
|
configs_base_path: str = "GPT_SoVITS/configs/"
|
||||||
os.makedirs(configs_base_path, exist_ok=True)
|
os.makedirs(configs_base_path, exist_ok=True)
|
||||||
@ -325,7 +329,7 @@ class TTS_Config:
|
|||||||
|
|
||||||
self.is_half = self.configs.get("is_half", False)
|
self.is_half = self.configs.get("is_half", False)
|
||||||
if str(self.device) == "cpu" and self.is_half:
|
if str(self.device) == "cpu" and self.is_half:
|
||||||
print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
|
print("Warning: Half precision is not supported on CPU, set is_half to False.")
|
||||||
self.is_half = False
|
self.is_half = False
|
||||||
|
|
||||||
version = self.configs.get("version", None)
|
version = self.configs.get("version", None)
|
||||||
@ -369,7 +373,7 @@ class TTS_Config:
|
|||||||
else:
|
else:
|
||||||
print(i18n("路径不存在,使用默认配置"))
|
print(i18n("路径不存在,使用默认配置"))
|
||||||
self.save_configs(configs_path)
|
self.save_configs(configs_path)
|
||||||
with open(configs_path, "r", encoding="utf-8") as f:
|
with open(configs_path, encoding="utf-8") as f:
|
||||||
configs = yaml.load(f, Loader=yaml.FullLoader)
|
configs = yaml.load(f, Loader=yaml.FullLoader)
|
||||||
|
|
||||||
return configs
|
return configs
|
||||||
@ -419,14 +423,14 @@ class TTS_Config:
|
|||||||
|
|
||||||
|
|
||||||
class TTS:
|
class TTS:
|
||||||
def __init__(self, configs: Union[dict, str, TTS_Config]):
|
def __init__(self, configs: dict | str | TTS_Config):
|
||||||
if isinstance(configs, TTS_Config):
|
if isinstance(configs, TTS_Config):
|
||||||
self.configs = configs
|
self.configs = configs
|
||||||
else:
|
else:
|
||||||
self.configs: TTS_Config = TTS_Config(configs)
|
self.configs: TTS_Config = TTS_Config(configs)
|
||||||
|
|
||||||
self.t2s_model: Text2SemanticLightningModule = None
|
self.t2s_model: Text2SemanticLightningModule = None
|
||||||
self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None
|
self.vits_model: SynthesizerTrn | SynthesizerTrnV3 = None
|
||||||
self.bert_tokenizer: AutoTokenizer = None
|
self.bert_tokenizer: AutoTokenizer = None
|
||||||
self.bert_model: AutoModelForMaskedLM = None
|
self.bert_model: AutoModelForMaskedLM = None
|
||||||
self.cnhuhbert_model: CNHubert = None
|
self.cnhuhbert_model: CNHubert = None
|
||||||
@ -497,8 +501,8 @@ class TTS:
|
|||||||
self.init_sv_model()
|
self.init_sv_model()
|
||||||
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
|
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
|
||||||
|
|
||||||
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
|
if if_lora_v3 and not os.path.exists(path_sovits):
|
||||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
info = path_sovits + i18n(f"SoVITS {model_version} 底模缺失,无法加载相应 LoRA 权重")
|
||||||
raise FileExistsError(info)
|
raise FileExistsError(info)
|
||||||
|
|
||||||
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
||||||
@ -558,7 +562,7 @@ class TTS:
|
|||||||
|
|
||||||
self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
|
self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
|
||||||
|
|
||||||
if if_lora_v3 == False:
|
if not if_lora_v3:
|
||||||
print(
|
print(
|
||||||
f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
|
f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
|
||||||
)
|
)
|
||||||
@ -589,8 +593,6 @@ class TTS:
|
|||||||
|
|
||||||
self.configs.save_configs()
|
self.configs.save_configs()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def init_t2s_weights(self, weights_path: str):
|
def init_t2s_weights(self, weights_path: str):
|
||||||
print(f"Loading Text2Semantic weights from {weights_path}")
|
print(f"Loading Text2Semantic weights from {weights_path}")
|
||||||
self.configs.t2s_weights_path = weights_path
|
self.configs.t2s_weights_path = weights_path
|
||||||
@ -622,7 +624,7 @@ class TTS:
|
|||||||
self.empty_cache()
|
self.empty_cache()
|
||||||
|
|
||||||
self.vocoder = BigVGAN.from_pretrained(
|
self.vocoder = BigVGAN.from_pretrained(
|
||||||
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
|
f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
|
||||||
use_cuda_kernel=False,
|
use_cuda_kernel=False,
|
||||||
) # if True, RuntimeError: Ninja is required to load C++ extensions
|
) # if True, RuntimeError: Ninja is required to load C++ extensions
|
||||||
# remove weight norm in the model and set to eval mode
|
# remove weight norm in the model and set to eval mode
|
||||||
@ -655,7 +657,7 @@ class TTS:
|
|||||||
)
|
)
|
||||||
self.vocoder.remove_weight_norm()
|
self.vocoder.remove_weight_norm()
|
||||||
state_dict_g = torch.load(
|
state_dict_g = torch.load(
|
||||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
|
f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
|
||||||
map_location="cpu",
|
map_location="cpu",
|
||||||
weights_only=False,
|
weights_only=False,
|
||||||
)
|
)
|
||||||
@ -668,7 +670,7 @@ class TTS:
|
|||||||
self.vocoder_configs["overlapped_len"] = 12
|
self.vocoder_configs["overlapped_len"] = 12
|
||||||
|
|
||||||
self.vocoder = self.vocoder.eval()
|
self.vocoder = self.vocoder.eval()
|
||||||
if self.configs.is_half == True:
|
if self.configs.is_half:
|
||||||
self.vocoder = self.vocoder.half().to(self.configs.device)
|
self.vocoder = self.vocoder.half().to(self.configs.device)
|
||||||
else:
|
else:
|
||||||
self.vocoder = self.vocoder.to(self.configs.device)
|
self.vocoder = self.vocoder.to(self.configs.device)
|
||||||
@ -798,7 +800,7 @@ class TTS:
|
|||||||
)
|
)
|
||||||
if self.configs.is_half:
|
if self.configs.is_half:
|
||||||
spec = spec.half()
|
spec = spec.half()
|
||||||
if self.is_v2pro == True:
|
if self.is_v2pro:
|
||||||
audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
|
audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
|
||||||
if self.configs.is_half:
|
if self.configs.is_half:
|
||||||
audio = audio.half()
|
audio = audio.half()
|
||||||
@ -832,7 +834,7 @@ class TTS:
|
|||||||
prompt_semantic = codes[0, 0].to(self.configs.device)
|
prompt_semantic = codes[0, 0].to(self.configs.device)
|
||||||
self.prompt_cache["prompt_semantic"] = prompt_semantic
|
self.prompt_cache["prompt_semantic"] = prompt_semantic
|
||||||
|
|
||||||
def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
|
def batch_sequences(self, sequences: list[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
|
||||||
seq = sequences[0]
|
seq = sequences[0]
|
||||||
ndim = seq.dim()
|
ndim = seq.dim()
|
||||||
if axis < 0:
|
if axis < 0:
|
||||||
@ -846,7 +848,7 @@ class TTS:
|
|||||||
max_length = max(seq_lengths) if max_length < max(seq_lengths) else max_length
|
max_length = max(seq_lengths) if max_length < max(seq_lengths) else max_length
|
||||||
|
|
||||||
padded_sequences = []
|
padded_sequences = []
|
||||||
for seq, length in zip(sequences, seq_lengths):
|
for seq, length in zip(sequences, seq_lengths, strict=False):
|
||||||
padding = [0] * axis + [0, max_length - length] + [0] * (ndim - axis - 1)
|
padding = [0] * axis + [0, max_length - length] + [0] * (ndim - axis - 1)
|
||||||
padded_seq = torch.nn.functional.pad(seq, padding, value=pad_value)
|
padded_seq = torch.nn.functional.pad(seq, padding, value=pad_value)
|
||||||
padded_sequences.append(padded_seq)
|
padded_sequences.append(padded_seq)
|
||||||
@ -898,7 +900,7 @@ class TTS:
|
|||||||
batch_index_list.append([])
|
batch_index_list.append([])
|
||||||
batch_index_list[-1].append(i)
|
batch_index_list[-1].append(i)
|
||||||
|
|
||||||
for batch_idx, index_list in enumerate(batch_index_list):
|
for _batch_idx, index_list in enumerate(batch_index_list):
|
||||||
item_list = [data[idx] for idx in index_list]
|
item_list = [data[idx] for idx in index_list]
|
||||||
phones_list = []
|
phones_list = []
|
||||||
phones_len_list = []
|
phones_len_list = []
|
||||||
@ -1051,7 +1053,7 @@ class TTS:
|
|||||||
fragment_interval = inputs.get("fragment_interval", 0.3)
|
fragment_interval = inputs.get("fragment_interval", 0.3)
|
||||||
seed = inputs.get("seed", -1)
|
seed = inputs.get("seed", -1)
|
||||||
seed = -1 if seed in ["", None] else seed
|
seed = -1 if seed in ["", None] else seed
|
||||||
actual_seed = set_seed(seed)
|
set_seed(seed)
|
||||||
parallel_infer = inputs.get("parallel_infer", True)
|
parallel_infer = inputs.get("parallel_infer", True)
|
||||||
repetition_penalty = inputs.get("repetition_penalty", 1.35)
|
repetition_penalty = inputs.get("repetition_penalty", 1.35)
|
||||||
sample_steps = inputs.get("sample_steps", 32)
|
sample_steps = inputs.get("sample_steps", 32)
|
||||||
@ -1060,7 +1062,7 @@ class TTS:
|
|||||||
overlap_length = inputs.get("overlap_length", 2)
|
overlap_length = inputs.get("overlap_length", 2)
|
||||||
min_chunk_length = inputs.get("min_chunk_length", 16)
|
min_chunk_length = inputs.get("min_chunk_length", 16)
|
||||||
fixed_length_chunk = inputs.get("fixed_length_chunk", False)
|
fixed_length_chunk = inputs.get("fixed_length_chunk", False)
|
||||||
chunk_split_thershold = 0.0 # 该值代表语义token与mute token的余弦相似度阈值,若大于该阈值,则视为可切分点。
|
chunk_split_thershold = 0.0 # 该值代表语义token与mute token的余弦相似度阈值,若大于该阈值,则视为可切分点。
|
||||||
|
|
||||||
if parallel_infer and not streaming_mode:
|
if parallel_infer and not streaming_mode:
|
||||||
print(i18n("并行推理模式已开启"))
|
print(i18n("并行推理模式已开启"))
|
||||||
@ -1093,7 +1095,6 @@ class TTS:
|
|||||||
print(i18n("分段返回模式/流式推理模式不支持分桶处理,已自动关闭分桶处理"))
|
print(i18n("分段返回模式/流式推理模式不支持分桶处理,已自动关闭分桶处理"))
|
||||||
split_bucket = False
|
split_bucket = False
|
||||||
|
|
||||||
|
|
||||||
if split_bucket and speed_factor == 1.0 and not (self.configs.use_vocoder and parallel_infer):
|
if split_bucket and speed_factor == 1.0 and not (self.configs.use_vocoder and parallel_infer):
|
||||||
print(i18n("分桶处理模式已开启"))
|
print(i18n("分桶处理模式已开启"))
|
||||||
elif speed_factor != 1.0:
|
elif speed_factor != 1.0:
|
||||||
@ -1171,7 +1172,7 @@ class TTS:
|
|||||||
if not (return_fragment or streaming_mode):
|
if not (return_fragment or streaming_mode):
|
||||||
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
|
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
|
||||||
if len(data) == 0:
|
if len(data) == 0:
|
||||||
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
yield 16000, np.zeros(16000, dtype=np.int16)
|
||||||
return
|
return
|
||||||
|
|
||||||
batch_index_list: list = None
|
batch_index_list: list = None
|
||||||
@ -1237,9 +1238,9 @@ class TTS:
|
|||||||
if item is None:
|
if item is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
batch_phones: List[torch.LongTensor] = item["phones"]
|
batch_phones: list[torch.LongTensor] = item["phones"]
|
||||||
# batch_phones:torch.LongTensor = item["phones"]
|
# batch_phones:torch.LongTensor = item["phones"]
|
||||||
batch_phones_len: torch.LongTensor = item["phones_len"]
|
item["phones_len"]
|
||||||
all_phoneme_ids: torch.LongTensor = item["all_phones"]
|
all_phoneme_ids: torch.LongTensor = item["all_phones"]
|
||||||
all_phoneme_lens: torch.LongTensor = item["all_phones_len"]
|
all_phoneme_lens: torch.LongTensor = item["all_phones_len"]
|
||||||
all_bert_features: torch.LongTensor = item["all_bert_features"]
|
all_bert_features: torch.LongTensor = item["all_bert_features"]
|
||||||
@ -1281,7 +1282,6 @@ class TTS:
|
|||||||
t4 = time.perf_counter()
|
t4 = time.perf_counter()
|
||||||
t_34 += t4 - t3
|
t_34 += t4 - t3
|
||||||
|
|
||||||
|
|
||||||
batch_audio_fragment = []
|
batch_audio_fragment = []
|
||||||
|
|
||||||
# ## vits并行推理 method 1
|
# ## vits并行推理 method 1
|
||||||
@ -1301,7 +1301,9 @@ class TTS:
|
|||||||
if speed_factor == 1.0:
|
if speed_factor == 1.0:
|
||||||
print(f"{i18n('并行合成中')}...")
|
print(f"{i18n('并行合成中')}...")
|
||||||
# ## vits并行推理 method 2
|
# ## vits并行推理 method 2
|
||||||
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
|
pred_semantic_list = [
|
||||||
|
item[-idx:] for item, idx in zip(pred_semantic_list, idx_list, strict=False)
|
||||||
|
]
|
||||||
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
||||||
audio_frag_idx = [
|
audio_frag_idx = [
|
||||||
pred_semantic_list[i].shape[0] * 2 * upsample_rate
|
pred_semantic_list[i].shape[0] * 2 * upsample_rate
|
||||||
@ -1314,8 +1316,8 @@ class TTS:
|
|||||||
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
|
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
|
||||||
|
|
||||||
_batch_audio_fragment = self.vits_model.decode(
|
_batch_audio_fragment = self.vits_model.decode(
|
||||||
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
|
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
|
||||||
).detach()[0, 0, :]
|
).detach()[0, 0, :]
|
||||||
|
|
||||||
audio_frag_end_idx.insert(0, 0)
|
audio_frag_end_idx.insert(0, 0)
|
||||||
batch_audio_fragment = [
|
batch_audio_fragment = [
|
||||||
@ -1330,14 +1332,18 @@ class TTS:
|
|||||||
pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
|
pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
|
||||||
) # .unsqueeze(0)#mq要多unsqueeze一次
|
) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||||
audio_fragment = self.vits_model.decode(
|
audio_fragment = self.vits_model.decode(
|
||||||
_pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
|
_pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
|
||||||
).detach()[0, 0, :]
|
).detach()[0, 0, :]
|
||||||
batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分
|
batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分
|
||||||
else:
|
else:
|
||||||
if parallel_infer:
|
if parallel_infer:
|
||||||
print(f"{i18n('并行合成中')}...")
|
print(f"{i18n('并行合成中')}...")
|
||||||
audio_fragments = self.using_vocoder_synthesis_batched_infer(
|
audio_fragments = self.using_vocoder_synthesis_batched_infer(
|
||||||
idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps
|
idx_list,
|
||||||
|
pred_semantic_list,
|
||||||
|
batch_phones,
|
||||||
|
speed=speed_factor,
|
||||||
|
sample_steps=sample_steps,
|
||||||
)
|
)
|
||||||
batch_audio_fragment.extend(audio_fragments)
|
batch_audio_fragment.extend(audio_fragments)
|
||||||
else:
|
else:
|
||||||
@ -1356,7 +1362,7 @@ class TTS:
|
|||||||
# item.to(dtype=self.precision, device=self.configs.device)
|
# item.to(dtype=self.precision, device=self.configs.device)
|
||||||
# for item in self.prompt_cache["refer_spec"]
|
# for item in self.prompt_cache["refer_spec"]
|
||||||
# ]
|
# ]
|
||||||
semantic_token_generator =self.t2s_model.model.infer_panel(
|
semantic_token_generator = self.t2s_model.model.infer_panel(
|
||||||
all_phoneme_ids[0].unsqueeze(0),
|
all_phoneme_ids[0].unsqueeze(0),
|
||||||
all_phoneme_lens,
|
all_phoneme_lens,
|
||||||
prompt,
|
prompt,
|
||||||
@ -1381,30 +1387,34 @@ class TTS:
|
|||||||
# if speed_factor == 1.0:
|
# if speed_factor == 1.0:
|
||||||
# upsample_rate = math.prod(self.vits_model.upsample_rates)*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1)
|
# upsample_rate = math.prod(self.vits_model.upsample_rates)*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1)
|
||||||
# else:
|
# else:
|
||||||
upsample_rate = math.prod(self.vits_model.upsample_rates)*((2 if self.vits_model.semantic_frame_rate == "25hz" else 1)/speed_factor)
|
upsample_rate = math.prod(self.vits_model.upsample_rates) * (
|
||||||
|
(2 if self.vits_model.semantic_frame_rate == "25hz" else 1) / speed_factor
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# if speed_factor == 1.0:
|
# if speed_factor == 1.0:
|
||||||
# upsample_rate = self.vocoder_configs["upsample_rate"]*(3.875 if self.configs.version == "v3" else 4)
|
# upsample_rate = self.vocoder_configs["upsample_rate"]*(3.875 if self.configs.version == "v3" else 4)
|
||||||
# else:
|
# else:
|
||||||
upsample_rate = self.vocoder_configs["upsample_rate"]*((3.875 if self.configs.version == "v3" else 4)/speed_factor)
|
upsample_rate = self.vocoder_configs["upsample_rate"] * (
|
||||||
|
(3.875 if self.configs.version == "v3" else 4) / speed_factor
|
||||||
|
)
|
||||||
|
|
||||||
last_audio_chunk = None
|
last_audio_chunk = None
|
||||||
# last_tokens = None
|
# last_tokens = None
|
||||||
last_latent = None
|
last_latent = None
|
||||||
previous_tokens = []
|
previous_tokens = []
|
||||||
overlap_len = overlap_length
|
overlap_len = overlap_length
|
||||||
overlap_size = math.ceil(overlap_length*upsample_rate)
|
overlap_size = math.ceil(overlap_length * upsample_rate)
|
||||||
for semantic_tokens, is_final in semantic_token_generator:
|
for semantic_tokens, is_final in semantic_token_generator:
|
||||||
if semantic_tokens is None and last_audio_chunk is not None:
|
if semantic_tokens is None and last_audio_chunk is not None:
|
||||||
yield self.audio_postprocess(
|
yield self.audio_postprocess(
|
||||||
[[last_audio_chunk[-overlap_size:]]],
|
[[last_audio_chunk[-overlap_size:]]],
|
||||||
output_sr,
|
output_sr,
|
||||||
None,
|
None,
|
||||||
speed_factor,
|
speed_factor,
|
||||||
False,
|
False,
|
||||||
0.0,
|
0.0,
|
||||||
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
|
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
_semantic_tokens = semantic_tokens
|
_semantic_tokens = semantic_tokens
|
||||||
@ -1415,11 +1425,10 @@ class TTS:
|
|||||||
_semantic_tokens = torch.cat(previous_tokens, dim=-1)
|
_semantic_tokens = torch.cat(previous_tokens, dim=-1)
|
||||||
|
|
||||||
if not is_first_chunk and semantic_tokens.shape[-1] < 10:
|
if not is_first_chunk and semantic_tokens.shape[-1] < 10:
|
||||||
overlap_len = overlap_length+(10-semantic_tokens.shape[-1])
|
overlap_len = overlap_length + (10 - semantic_tokens.shape[-1])
|
||||||
else:
|
else:
|
||||||
overlap_len = overlap_length
|
overlap_len = overlap_length
|
||||||
|
|
||||||
|
|
||||||
if not self.configs.use_vocoder:
|
if not self.configs.use_vocoder:
|
||||||
token_padding_length = 0
|
token_padding_length = 0
|
||||||
# token_padding_length = int(phones.shape[-1]*2)-_semantic_tokens.shape[-1]
|
# token_padding_length = int(phones.shape[-1]*2)-_semantic_tokens.shape[-1]
|
||||||
@ -1429,21 +1438,27 @@ class TTS:
|
|||||||
# token_padding_length = 0
|
# token_padding_length = 0
|
||||||
|
|
||||||
audio_chunk, latent, latent_mask = self.vits_model.decode_streaming(
|
audio_chunk, latent, latent_mask = self.vits_model.decode_streaming(
|
||||||
_semantic_tokens.unsqueeze(0),
|
_semantic_tokens.unsqueeze(0),
|
||||||
phones, refer_audio_spec,
|
phones,
|
||||||
speed=speed_factor,
|
refer_audio_spec,
|
||||||
sv_emb=sv_emb,
|
speed=speed_factor,
|
||||||
result_length=semantic_tokens.shape[-1]+overlap_len if not is_first_chunk else None,
|
sv_emb=sv_emb,
|
||||||
overlap_frames=last_latent[:,:,-overlap_len*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1):] \
|
result_length=semantic_tokens.shape[-1] + overlap_len if not is_first_chunk else None,
|
||||||
if last_latent is not None else None,
|
overlap_frames=last_latent[
|
||||||
padding_length=token_padding_length
|
:, :, -overlap_len * (2 if self.vits_model.semantic_frame_rate == "25hz" else 1) :
|
||||||
)
|
]
|
||||||
audio_chunk=audio_chunk.detach()[0, 0, :]
|
if last_latent is not None
|
||||||
|
else None,
|
||||||
|
padding_length=token_padding_length,
|
||||||
|
)
|
||||||
|
audio_chunk = audio_chunk.detach()[0, 0, :]
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(i18n("SoVits V3/4模型不支持流式推理模式"))
|
raise RuntimeError(i18n("SoVits V3/4模型不支持流式推理模式"))
|
||||||
|
|
||||||
if overlap_len>overlap_length:
|
if overlap_len > overlap_length:
|
||||||
audio_chunk=audio_chunk[-int((overlap_length+semantic_tokens.shape[-1])*upsample_rate):]
|
audio_chunk = audio_chunk[
|
||||||
|
-int((overlap_length + semantic_tokens.shape[-1]) * upsample_rate) :
|
||||||
|
]
|
||||||
|
|
||||||
audio_chunk_ = audio_chunk
|
audio_chunk_ = audio_chunk
|
||||||
if is_first_chunk and not is_final:
|
if is_first_chunk and not is_final:
|
||||||
@ -1454,33 +1469,33 @@ class TTS:
|
|||||||
elif not is_first_chunk and not is_final:
|
elif not is_first_chunk and not is_final:
|
||||||
audio_chunk_ = self.sola_algorithm([last_audio_chunk, audio_chunk_], overlap_size)
|
audio_chunk_ = self.sola_algorithm([last_audio_chunk, audio_chunk_], overlap_size)
|
||||||
audio_chunk_ = (
|
audio_chunk_ = (
|
||||||
audio_chunk_[last_audio_chunk.shape[0]-overlap_size:-overlap_size] if not is_final \
|
audio_chunk_[last_audio_chunk.shape[0] - overlap_size : -overlap_size]
|
||||||
else audio_chunk_[last_audio_chunk.shape[0]-overlap_size:]
|
if not is_final
|
||||||
)
|
else audio_chunk_[last_audio_chunk.shape[0] - overlap_size :]
|
||||||
|
)
|
||||||
|
|
||||||
last_latent = latent
|
last_latent = latent
|
||||||
last_audio_chunk = audio_chunk
|
last_audio_chunk = audio_chunk
|
||||||
yield self.audio_postprocess(
|
yield self.audio_postprocess(
|
||||||
[[audio_chunk_]],
|
[[audio_chunk_]],
|
||||||
output_sr,
|
output_sr,
|
||||||
None,
|
None,
|
||||||
speed_factor,
|
speed_factor,
|
||||||
False,
|
False,
|
||||||
0.0,
|
0.0,
|
||||||
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
|
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
|
||||||
)
|
)
|
||||||
|
|
||||||
if is_first_package:
|
if is_first_package:
|
||||||
print(f"first_package_delay: {time.perf_counter()-t0:.3f}")
|
print(f"first_package_delay: {time.perf_counter() - t0:.3f}")
|
||||||
is_first_package = False
|
is_first_package = False
|
||||||
|
|
||||||
|
yield output_sr, np.zeros(int(output_sr * fragment_interval), dtype=np.int16)
|
||||||
yield output_sr, np.zeros(int(output_sr*fragment_interval), dtype=np.int16)
|
|
||||||
|
|
||||||
t5 = time.perf_counter()
|
t5 = time.perf_counter()
|
||||||
t_45 += t5 - t4
|
t_45 += t5 - t4
|
||||||
if return_fragment:
|
if return_fragment:
|
||||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
|
print(f"{t1 - t0:.3f}\t{t2 - t1:.3f}\t{t4 - t3:.3f}\t{t5 - t4:.3f}")
|
||||||
yield self.audio_postprocess(
|
yield self.audio_postprocess(
|
||||||
[batch_audio_fragment],
|
[batch_audio_fragment],
|
||||||
output_sr,
|
output_sr,
|
||||||
@ -1490,7 +1505,8 @@ class TTS:
|
|||||||
fragment_interval,
|
fragment_interval,
|
||||||
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
|
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
|
||||||
)
|
)
|
||||||
elif streaming_mode:...
|
elif streaming_mode:
|
||||||
|
...
|
||||||
else:
|
else:
|
||||||
audio.append(batch_audio_fragment)
|
audio.append(batch_audio_fragment)
|
||||||
|
|
||||||
@ -1499,7 +1515,7 @@ class TTS:
|
|||||||
return
|
return
|
||||||
|
|
||||||
if not (return_fragment or streaming_mode):
|
if not (return_fragment or streaming_mode):
|
||||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45))
|
print(f"{t1 - t0:.3f}\t{t2 - t1:.3f}\t{t_34:.3f}\t{t_45:.3f}")
|
||||||
if len(audio) == 0:
|
if len(audio) == 0:
|
||||||
yield output_sr, np.zeros(int(output_sr), dtype=np.int16)
|
yield output_sr, np.zeros(int(output_sr), dtype=np.int16)
|
||||||
return
|
return
|
||||||
@ -1516,7 +1532,7 @@ class TTS:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
# 必须返回一个空音频, 否则会导致显存不释放。
|
# 必须返回一个空音频, 否则会导致显存不释放。
|
||||||
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
yield 16000, np.zeros(16000, dtype=np.int16)
|
||||||
# 重置模型, 否则会导致显存释放不完全。
|
# 重置模型, 否则会导致显存释放不完全。
|
||||||
del self.t2s_model
|
del self.t2s_model
|
||||||
del self.vits_model
|
del self.vits_model
|
||||||
@ -1540,15 +1556,15 @@ class TTS:
|
|||||||
|
|
||||||
def audio_postprocess(
|
def audio_postprocess(
|
||||||
self,
|
self,
|
||||||
audio: List[torch.Tensor],
|
audio: list[torch.Tensor],
|
||||||
sr: int,
|
sr: int,
|
||||||
batch_index_list: list = None,
|
batch_index_list: list = None,
|
||||||
speed_factor: float = 1.0,
|
speed_factor: float = 1.0,
|
||||||
split_bucket: bool = True,
|
split_bucket: bool = True,
|
||||||
fragment_interval: float = 0.3,
|
fragment_interval: float = 0.3,
|
||||||
super_sampling: bool = False,
|
super_sampling: bool = False,
|
||||||
) -> Tuple[int, np.ndarray]:
|
) -> tuple[int, np.ndarray]:
|
||||||
if fragment_interval>0:
|
if fragment_interval > 0:
|
||||||
zero_wav = torch.zeros(
|
zero_wav = torch.zeros(
|
||||||
int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device
|
int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device
|
||||||
)
|
)
|
||||||
@ -1558,7 +1574,9 @@ class TTS:
|
|||||||
max_audio = torch.abs(audio_fragment).max() # 简单防止16bit爆音
|
max_audio = torch.abs(audio_fragment).max() # 简单防止16bit爆音
|
||||||
if max_audio > 1:
|
if max_audio > 1:
|
||||||
audio_fragment /= max_audio
|
audio_fragment /= max_audio
|
||||||
audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) if fragment_interval>0 else audio_fragment
|
audio_fragment: torch.Tensor = (
|
||||||
|
torch.cat([audio_fragment, zero_wav], dim=0) if fragment_interval > 0 else audio_fragment
|
||||||
|
)
|
||||||
audio[i][j] = audio_fragment
|
audio[i][j] = audio_fragment
|
||||||
|
|
||||||
if split_bucket:
|
if split_bucket:
|
||||||
@ -1589,7 +1607,6 @@ class TTS:
|
|||||||
|
|
||||||
audio = (audio * 32768).astype(np.int16)
|
audio = (audio * 32768).astype(np.int16)
|
||||||
|
|
||||||
|
|
||||||
# try:
|
# try:
|
||||||
# if speed_factor != 1.0:
|
# if speed_factor != 1.0:
|
||||||
# audio = speed_change(audio, speed=speed_factor, sr=int(sr))
|
# audio = speed_change(audio, speed=speed_factor, sr=int(sr))
|
||||||
@ -1665,12 +1682,12 @@ class TTS:
|
|||||||
|
|
||||||
def using_vocoder_synthesis_batched_infer(
|
def using_vocoder_synthesis_batched_infer(
|
||||||
self,
|
self,
|
||||||
idx_list: List[int],
|
idx_list: list[int],
|
||||||
semantic_tokens_list: List[torch.Tensor],
|
semantic_tokens_list: list[torch.Tensor],
|
||||||
batch_phones: List[torch.Tensor],
|
batch_phones: list[torch.Tensor],
|
||||||
speed: float = 1.0,
|
speed: float = 1.0,
|
||||||
sample_steps: int = 32,
|
sample_steps: int = 32,
|
||||||
) -> List[torch.Tensor]:
|
) -> list[torch.Tensor]:
|
||||||
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||||
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
|
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
|
||||||
raw_entry = self.prompt_cache["refer_spec"][0]
|
raw_entry = self.prompt_cache["refer_spec"][0]
|
||||||
@ -1778,12 +1795,7 @@ class TTS:
|
|||||||
|
|
||||||
return audio_fragments
|
return audio_fragments
|
||||||
|
|
||||||
def sola_algorithm(
|
def sola_algorithm(self, audio_fragments: list[torch.Tensor], overlap_len: int, search_len: int = 320):
|
||||||
self,
|
|
||||||
audio_fragments: List[torch.Tensor],
|
|
||||||
overlap_len: int,
|
|
||||||
search_len:int= 320
|
|
||||||
):
|
|
||||||
# overlap_len-=search_len
|
# overlap_len-=search_len
|
||||||
|
|
||||||
dtype = audio_fragments[0].dtype
|
dtype = audio_fragments[0].dtype
|
||||||
@ -1792,26 +1804,23 @@ class TTS:
|
|||||||
f1 = audio_fragments[i].float()
|
f1 = audio_fragments[i].float()
|
||||||
f2 = audio_fragments[i + 1].float()
|
f2 = audio_fragments[i + 1].float()
|
||||||
w1 = f1[-overlap_len:]
|
w1 = f1[-overlap_len:]
|
||||||
w2 = f2[:overlap_len+search_len]
|
w2 = f2[: overlap_len + search_len]
|
||||||
# w2 = w2[-w2.shape[-1]//2:]
|
# w2 = w2[-w2.shape[-1]//2:]
|
||||||
# assert w1.shape == w2.shape
|
# assert w1.shape == w2.shape
|
||||||
corr_norm = F.conv1d(w2.view(1, 1, -1), w1.view(1, 1, -1)).view(-1)
|
corr_norm = F.conv1d(w2.view(1, 1, -1), w1.view(1, 1, -1)).view(-1)
|
||||||
|
|
||||||
corr_den = F.conv1d(w2.view(1, 1, -1)**2, torch.ones_like(w1).view(1, 1, -1)).view(-1)+ 1e-8
|
corr_den = F.conv1d(w2.view(1, 1, -1) ** 2, torch.ones_like(w1).view(1, 1, -1)).view(-1) + 1e-8
|
||||||
idx = (corr_norm/corr_den.sqrt()).argmax()
|
idx = (corr_norm / corr_den.sqrt()).argmax()
|
||||||
|
|
||||||
print(f"seg_idx: {idx}")
|
print(f"seg_idx: {idx}")
|
||||||
|
|
||||||
# idx = corr.argmax()
|
# idx = corr.argmax()
|
||||||
f1_ = f1[: -overlap_len]
|
f1_ = f1[:-overlap_len]
|
||||||
audio_fragments[i] = f1_
|
audio_fragments[i] = f1_
|
||||||
|
|
||||||
f2_ = f2[idx:]
|
f2_ = f2[idx:]
|
||||||
window = torch.hann_window((overlap_len) * 2, device=f1.device, dtype=f1.dtype)
|
window = torch.hann_window((overlap_len) * 2, device=f1.device, dtype=f1.dtype)
|
||||||
f2_[: overlap_len] = (
|
f2_[:overlap_len] = window[:overlap_len] * f2_[:overlap_len] + window[overlap_len:] * f1[-overlap_len:]
|
||||||
window[: overlap_len] * f2_[: overlap_len]
|
|
||||||
+ window[overlap_len :] * f1[-overlap_len :]
|
|
||||||
)
|
|
||||||
|
|
||||||
# window = torch.sin(torch.arange((overlap_len - idx), device=f1.device) * np.pi / (overlap_len - idx))
|
# window = torch.sin(torch.arange((overlap_len - idx), device=f1.device) * np.pi / (overlap_len - idx))
|
||||||
# f2_[: (overlap_len - idx)] = (
|
# f2_[: (overlap_len - idx)] = (
|
||||||
|
|||||||
@ -4,20 +4,21 @@ import threading
|
|||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import torch
|
|
||||||
from text.LangSegmenter import LangSegmenter
|
|
||||||
from text import chinese
|
|
||||||
from typing import Dict, List, Tuple
|
|
||||||
from text.cleaner import clean_text
|
|
||||||
from text import cleaned_text_to_sequence
|
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|
||||||
from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method
|
|
||||||
|
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
import torch
|
||||||
|
from text import cleaned_text_to_sequence
|
||||||
|
from text.cleaner import clean_text
|
||||||
|
from text.LangSegmenter import LangSegmenter
|
||||||
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
|
|
||||||
|
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
|
from TTS_infer_pack.text_segmentation_method import get_method as get_seg_method, split_big_text, splits
|
||||||
|
|
||||||
|
|
||||||
language = os.environ.get("language", "Auto")
|
language = os.environ.get("language", "Auto")
|
||||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||||
@ -56,7 +57,7 @@ class TextPreprocessor:
|
|||||||
self.device = device
|
self.device = device
|
||||||
self.bert_lock = threading.RLock()
|
self.bert_lock = threading.RLock()
|
||||||
|
|
||||||
def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]:
|
def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> list[dict]:
|
||||||
print(f"############ {i18n('切分文本')} ############")
|
print(f"############ {i18n('切分文本')} ############")
|
||||||
text = self.replace_consecutive_punctuation(text)
|
text = self.replace_consecutive_punctuation(text)
|
||||||
texts = self.pre_seg_text(text, lang, text_split_method)
|
texts = self.pre_seg_text(text, lang, text_split_method)
|
||||||
@ -98,7 +99,7 @@ class TextPreprocessor:
|
|||||||
# 解决输入目标文本的空行导致报错的问题
|
# 解决输入目标文本的空行导致报错的问题
|
||||||
if len(text.strip()) == 0:
|
if len(text.strip()) == 0:
|
||||||
continue
|
continue
|
||||||
if not re.sub("\W+", "", text):
|
if not re.sub(r"\W+", "", text):
|
||||||
# 检测一下,如果是纯符号,就跳过。
|
# 检测一下,如果是纯符号,就跳过。
|
||||||
continue
|
continue
|
||||||
if text[-1] not in splits:
|
if text[-1] not in splits:
|
||||||
@ -116,30 +117,30 @@ class TextPreprocessor:
|
|||||||
|
|
||||||
def segment_and_extract_feature_for_text(
|
def segment_and_extract_feature_for_text(
|
||||||
self, text: str, language: str, version: str = "v1"
|
self, text: str, language: str, version: str = "v1"
|
||||||
) -> Tuple[list, torch.Tensor, str]:
|
) -> tuple[list, torch.Tensor, str]:
|
||||||
return self.get_phones_and_bert(text, language, version)
|
return self.get_phones_and_bert(text, language, version)
|
||||||
|
|
||||||
def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
|
def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
|
||||||
with self.bert_lock:
|
with self.bert_lock:
|
||||||
text = re.sub(r' {2,}', ' ', text)
|
text = re.sub(r" {2,}", " ", text)
|
||||||
textlist = []
|
textlist = []
|
||||||
langlist = []
|
langlist = []
|
||||||
if language == "all_zh":
|
if language == "all_zh":
|
||||||
for tmp in LangSegmenter.getTexts(text,"zh"):
|
for tmp in LangSegmenter.getTexts(text, "zh"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_yue":
|
elif language == "all_yue":
|
||||||
for tmp in LangSegmenter.getTexts(text,"zh"):
|
for tmp in LangSegmenter.getTexts(text, "zh"):
|
||||||
if tmp["lang"] == "zh":
|
if tmp["lang"] == "zh":
|
||||||
tmp["lang"] = "yue"
|
tmp["lang"] = "yue"
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_ja":
|
elif language == "all_ja":
|
||||||
for tmp in LangSegmenter.getTexts(text,"ja"):
|
for tmp in LangSegmenter.getTexts(text, "ja"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_ko":
|
elif language == "all_ko":
|
||||||
for tmp in LangSegmenter.getTexts(text,"ko"):
|
for tmp in LangSegmenter.getTexts(text, "ko"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "en":
|
elif language == "en":
|
||||||
@ -158,7 +159,9 @@ class TextPreprocessor:
|
|||||||
else:
|
else:
|
||||||
for tmp in LangSegmenter.getTexts(text):
|
for tmp in LangSegmenter.getTexts(text):
|
||||||
if langlist:
|
if langlist:
|
||||||
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
|
if (tmp["lang"] == "en" and langlist[-1] == "en") or (
|
||||||
|
tmp["lang"] != "en" and langlist[-1] != "en"
|
||||||
|
):
|
||||||
textlist[-1] += tmp["text"]
|
textlist[-1] += tmp["text"]
|
||||||
continue
|
continue
|
||||||
if tmp["lang"] == "en":
|
if tmp["lang"] == "en":
|
||||||
|
|||||||
@ -1,9 +1,11 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
from tools.i18n.i18n import I18nAuto
|
|
||||||
from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
|
from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
|
||||||
|
from gsv_tools.i18n.i18n import I18nAuto
|
||||||
|
|
||||||
|
|
||||||
i18n = I18nAuto()
|
i18n = I18nAuto()
|
||||||
|
|
||||||
@ -19,11 +21,11 @@ def synthesize(
|
|||||||
output_path,
|
output_path,
|
||||||
):
|
):
|
||||||
# Read reference text
|
# Read reference text
|
||||||
with open(ref_text_path, "r", encoding="utf-8") as file:
|
with open(ref_text_path, encoding="utf-8") as file:
|
||||||
ref_text = file.read()
|
ref_text = file.read()
|
||||||
|
|
||||||
# Read target text
|
# Read target text
|
||||||
with open(target_text_path, "r", encoding="utf-8") as file:
|
with open(target_text_path, encoding="utf-8") as file:
|
||||||
target_text = file.read()
|
target_text = file.read()
|
||||||
|
|
||||||
# Change model weights
|
# Change model weights
|
||||||
|
|||||||
@ -1,15 +1,29 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from PyQt5.QtCore import QEvent
|
|
||||||
from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit
|
|
||||||
from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox
|
|
||||||
import soundfile as sf
|
|
||||||
|
|
||||||
from tools.i18n.i18n import I18nAuto
|
import soundfile as sf
|
||||||
|
from PyQt5.QtCore import QEvent
|
||||||
|
from PyQt5.QtWidgets import (
|
||||||
|
QApplication,
|
||||||
|
QComboBox,
|
||||||
|
QFileDialog,
|
||||||
|
QGridLayout,
|
||||||
|
QLabel,
|
||||||
|
QLineEdit,
|
||||||
|
QMainWindow,
|
||||||
|
QPushButton,
|
||||||
|
QStatusBar,
|
||||||
|
QTextEdit,
|
||||||
|
QVBoxLayout,
|
||||||
|
QWidget,
|
||||||
|
)
|
||||||
|
|
||||||
|
from gsv_tools.i18n.i18n import I18nAuto
|
||||||
|
|
||||||
|
|
||||||
i18n = I18nAuto()
|
i18n = I18nAuto()
|
||||||
|
|
||||||
from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav
|
from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav, gpt_path, sovits_path
|
||||||
|
|
||||||
|
|
||||||
class GPTSoVITSGUI(QMainWindow):
|
class GPTSoVITSGUI(QMainWindow):
|
||||||
@ -239,14 +253,14 @@ class GPTSoVITSGUI(QMainWindow):
|
|||||||
def upload_ref_text(self):
|
def upload_ref_text(self):
|
||||||
file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
|
file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
|
||||||
if file_path:
|
if file_path:
|
||||||
with open(file_path, "r", encoding="utf-8") as file:
|
with open(file_path, encoding="utf-8") as file:
|
||||||
content = file.read()
|
content = file.read()
|
||||||
self.ref_text_input.setText(content)
|
self.ref_text_input.setText(content)
|
||||||
|
|
||||||
def upload_target_text(self):
|
def upload_target_text(self):
|
||||||
file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
|
file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
|
||||||
if file_path:
|
if file_path:
|
||||||
with open(file_path, "r", encoding="utf-8") as file:
|
with open(file_path, encoding="utf-8") as file:
|
||||||
content = file.read()
|
content = file.read()
|
||||||
self.target_text_input.setText(content)
|
self.target_text_input.setText(content)
|
||||||
|
|
||||||
|
|||||||
@ -6,23 +6,27 @@
|
|||||||
全部按英文识别
|
全部按英文识别
|
||||||
全部按日文识别
|
全部按日文识别
|
||||||
"""
|
"""
|
||||||
import psutil
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
|
||||||
def set_high_priority():
|
def set_high_priority():
|
||||||
"""把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
|
"""把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
|
||||||
if os.name != "nt":
|
if os.name != "nt":
|
||||||
return # 仅 Windows 有效
|
return # 仅 Windows 有效
|
||||||
p = psutil.Process(os.getpid())
|
p = psutil.Process(os.getpid())
|
||||||
try:
|
try:
|
||||||
p.nice(psutil.HIGH_PRIORITY_CLASS)
|
p.nice(psutil.HIGH_PRIORITY_CLASS)
|
||||||
print("已将进程优先级设为 High")
|
print("已将进程优先级设为 High")
|
||||||
except psutil.AccessDenied:
|
except psutil.AccessDenied:
|
||||||
print("权限不足,无法修改优先级(请用管理员运行)")
|
print("权限不足,无法修改优先级(请用管理员运行)")
|
||||||
|
|
||||||
|
|
||||||
set_high_priority()
|
set_high_priority()
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
@ -32,6 +36,7 @@ import torch
|
|||||||
import torchaudio
|
import torchaudio
|
||||||
from text.LangSegmenter import LangSegmenter
|
from text.LangSegmenter import LangSegmenter
|
||||||
|
|
||||||
|
|
||||||
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
||||||
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
||||||
logging.getLogger("httpcore").setLevel(logging.ERROR)
|
logging.getLogger("httpcore").setLevel(logging.ERROR)
|
||||||
@ -46,9 +51,11 @@ version = model_version = os.environ.get("version", "v2")
|
|||||||
|
|
||||||
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
|
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
|
||||||
|
|
||||||
|
|
||||||
SoVITS_names, GPT_names = get_weights_names()
|
SoVITS_names, GPT_names = get_weights_names()
|
||||||
from config import pretrained_sovits_name
|
from config import pretrained_sovits_name
|
||||||
|
|
||||||
|
|
||||||
path_sovits_v3 = pretrained_sovits_name["v3"]
|
path_sovits_v3 = pretrained_sovits_name["v3"]
|
||||||
path_sovits_v4 = pretrained_sovits_name["v4"]
|
path_sovits_v4 = pretrained_sovits_name["v4"]
|
||||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||||
@ -60,7 +67,7 @@ else:
|
|||||||
with open("./weight.json", "w", encoding="utf-8") as file:
|
with open("./weight.json", "w", encoding="utf-8") as file:
|
||||||
json.dump({"GPT": {}, "SoVITS": {}}, file)
|
json.dump({"GPT": {}, "SoVITS": {}}, file)
|
||||||
|
|
||||||
with open("./weight.json", "r", encoding="utf-8") as file:
|
with open("./weight.json", encoding="utf-8") as file:
|
||||||
weight_data = file.read()
|
weight_data = file.read()
|
||||||
weight_data = json.loads(weight_data)
|
weight_data = json.loads(weight_data)
|
||||||
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
||||||
@ -96,6 +103,7 @@ import numpy as np
|
|||||||
from feature_extractor import cnhubert
|
from feature_extractor import cnhubert
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
|
|
||||||
|
|
||||||
cnhubert.cnhubert_base_path = cnhubert_base_path
|
cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||||
|
|
||||||
import random
|
import random
|
||||||
@ -123,8 +131,9 @@ from peft import LoraConfig, get_peft_model
|
|||||||
from text import cleaned_text_to_sequence
|
from text import cleaned_text_to_sequence
|
||||||
from text.cleaner import clean_text
|
from text.cleaner import clean_text
|
||||||
|
|
||||||
from tools.assets import css, js, top_html
|
from gsv_tools.assets import css, js, top_html
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
|
|
||||||
|
|
||||||
language = os.environ.get("language", "Auto")
|
language = os.environ.get("language", "Auto")
|
||||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||||
@ -162,7 +171,7 @@ dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
|||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_path)
|
tokenizer = AutoTokenizer.from_pretrained(bert_path)
|
||||||
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
|
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
|
||||||
if is_half == True:
|
if is_half:
|
||||||
bert_model = bert_model.half().to(device)
|
bert_model = bert_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
bert_model = bert_model.to(device)
|
bert_model = bert_model.to(device)
|
||||||
@ -202,7 +211,7 @@ class DictToAttrRecursive(dict):
|
|||||||
def __setattr__(self, key, value):
|
def __setattr__(self, key, value):
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
value = DictToAttrRecursive(value)
|
value = DictToAttrRecursive(value)
|
||||||
super(DictToAttrRecursive, self).__setitem__(key, value)
|
super().__setitem__(key, value)
|
||||||
super().__setattr__(key, value)
|
super().__setattr__(key, value)
|
||||||
|
|
||||||
def __delattr__(self, item):
|
def __delattr__(self, item):
|
||||||
@ -213,7 +222,7 @@ class DictToAttrRecursive(dict):
|
|||||||
|
|
||||||
|
|
||||||
ssl_model = cnhubert.get_model()
|
ssl_model = cnhubert.get_model()
|
||||||
if is_half == True:
|
if is_half:
|
||||||
ssl_model = ssl_model.half().to(device)
|
ssl_model = ssl_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
ssl_model = ssl_model.to(device)
|
ssl_model = ssl_model.to(device)
|
||||||
@ -223,6 +232,7 @@ else:
|
|||||||
# symbol_version-model_version-if_lora_v3
|
# symbol_version-model_version-if_lora_v3
|
||||||
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
||||||
|
|
||||||
|
|
||||||
v3v4set = {"v3", "v4"}
|
v3v4set = {"v3", "v4"}
|
||||||
|
|
||||||
|
|
||||||
@ -234,8 +244,8 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
|||||||
print(sovits_path, version, model_version, if_lora_v3)
|
print(sovits_path, version, model_version, if_lora_v3)
|
||||||
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
||||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||||
if if_lora_v3 == True and is_exist == False:
|
if if_lora_v3 and not is_exist:
|
||||||
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重")
|
info = path_sovits + f"SoVITS {model_version}" + i18n("底模缺失,无法加载相应 LoRA 权重")
|
||||||
gr.Warning(info)
|
gr.Warning(info)
|
||||||
raise FileExistsError(info)
|
raise FileExistsError(info)
|
||||||
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
||||||
@ -314,17 +324,17 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
|||||||
del vq_model.enc_q
|
del vq_model.enc_q
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
if is_half == True:
|
if is_half:
|
||||||
vq_model = vq_model.half().to(device)
|
vq_model = vq_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
vq_model = vq_model.to(device)
|
vq_model = vq_model.to(device)
|
||||||
vq_model.eval()
|
vq_model.eval()
|
||||||
if if_lora_v3 == False:
|
if not if_lora_v3:
|
||||||
print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
print(f"loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
||||||
else:
|
else:
|
||||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||||
print(
|
print(
|
||||||
"loading sovits_%spretrained_G" % model_version,
|
f"loading sovits_{model_version}pretrained_G",
|
||||||
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
|
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
|
||||||
)
|
)
|
||||||
lora_rank = dict_s2["lora_rank"]
|
lora_rank = dict_s2["lora_rank"]
|
||||||
@ -335,7 +345,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
|||||||
init_lora_weights=True,
|
init_lora_weights=True,
|
||||||
)
|
)
|
||||||
vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
|
vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
|
||||||
print("loading sovits_%s_lora%s" % (model_version, lora_rank))
|
print(f"loading sovits_{model_version}_lora{lora_rank}")
|
||||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||||
vq_model.cfm = vq_model.cfm.merge_and_unload()
|
vq_model.cfm = vq_model.cfm.merge_and_unload()
|
||||||
# torch.save(vq_model.state_dict(),"merge_win.pth")
|
# torch.save(vq_model.state_dict(),"merge_win.pth")
|
||||||
@ -383,7 +393,7 @@ def change_gpt_weights(gpt_path):
|
|||||||
max_sec = config["data"]["max_sec"]
|
max_sec = config["data"]["max_sec"]
|
||||||
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
||||||
t2s_model.load_state_dict(dict_s1["weight"])
|
t2s_model.load_state_dict(dict_s1["weight"])
|
||||||
if is_half == True:
|
if is_half:
|
||||||
t2s_model = t2s_model.half()
|
t2s_model = t2s_model.half()
|
||||||
t2s_model = t2s_model.to(device)
|
t2s_model = t2s_model.to(device)
|
||||||
t2s_model.eval()
|
t2s_model.eval()
|
||||||
@ -401,6 +411,7 @@ change_gpt_weights(gpt_path)
|
|||||||
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
|
|
||||||
|
|
||||||
@ -442,7 +453,7 @@ def init_bigvgan():
|
|||||||
from BigVGAN import bigvgan
|
from BigVGAN import bigvgan
|
||||||
|
|
||||||
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
||||||
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
|
f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
|
||||||
use_cuda_kernel=False,
|
use_cuda_kernel=False,
|
||||||
) # if True, RuntimeError: Ninja is required to load C++ extensions
|
) # if True, RuntimeError: Ninja is required to load C++ extensions
|
||||||
# remove weight norm in the model and set to eval mode
|
# remove weight norm in the model and set to eval mode
|
||||||
@ -450,7 +461,7 @@ def init_bigvgan():
|
|||||||
bigvgan_model = bigvgan_model.eval()
|
bigvgan_model = bigvgan_model.eval()
|
||||||
clean_hifigan_model()
|
clean_hifigan_model()
|
||||||
clean_sv_cn_model()
|
clean_sv_cn_model()
|
||||||
if is_half == True:
|
if is_half:
|
||||||
bigvgan_model = bigvgan_model.half().to(device)
|
bigvgan_model = bigvgan_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
bigvgan_model = bigvgan_model.to(device)
|
bigvgan_model = bigvgan_model.to(device)
|
||||||
@ -472,14 +483,14 @@ def init_hifigan():
|
|||||||
hifigan_model.eval()
|
hifigan_model.eval()
|
||||||
hifigan_model.remove_weight_norm()
|
hifigan_model.remove_weight_norm()
|
||||||
state_dict_g = torch.load(
|
state_dict_g = torch.load(
|
||||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
|
f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
|
||||||
map_location="cpu",
|
map_location="cpu",
|
||||||
weights_only=False,
|
weights_only=False,
|
||||||
)
|
)
|
||||||
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
||||||
clean_bigvgan_model()
|
clean_bigvgan_model()
|
||||||
clean_sv_cn_model()
|
clean_sv_cn_model()
|
||||||
if is_half == True:
|
if is_half:
|
||||||
hifigan_model = hifigan_model.half().to(device)
|
hifigan_model = hifigan_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
hifigan_model = hifigan_model.to(device)
|
hifigan_model = hifigan_model.to(device)
|
||||||
@ -508,7 +519,7 @@ resample_transform_dict = {}
|
|||||||
|
|
||||||
def resample(audio_tensor, sr0, sr1, device):
|
def resample(audio_tensor, sr0, sr1, device):
|
||||||
global resample_transform_dict
|
global resample_transform_dict
|
||||||
key = "%s-%s-%s" % (sr0, sr1, str(device))
|
key = f"{sr0}-{sr1}-{str(device)}"
|
||||||
if key not in resample_transform_dict:
|
if key not in resample_transform_dict:
|
||||||
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||||
return resample_transform_dict[key](audio_tensor)
|
return resample_transform_dict[key](audio_tensor)
|
||||||
@ -544,7 +555,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
|||||||
center=False,
|
center=False,
|
||||||
)
|
)
|
||||||
spec = spec.to(dtype)
|
spec = spec.to(dtype)
|
||||||
if is_v2pro == True:
|
if is_v2pro:
|
||||||
audio = resample(audio, sr1, 16000, device).to(dtype)
|
audio = resample(audio, sr1, 16000, device).to(dtype)
|
||||||
return spec, audio
|
return spec, audio
|
||||||
|
|
||||||
@ -556,7 +567,7 @@ def clean_text_inf(text, language, version):
|
|||||||
return phones, word2ph, norm_text
|
return phones, word2ph, norm_text
|
||||||
|
|
||||||
|
|
||||||
dtype = torch.float16 if is_half == True else torch.float32
|
dtype = torch.float16 if is_half else torch.float32
|
||||||
|
|
||||||
|
|
||||||
def get_bert_inf(phones, word2ph, norm_text, language):
|
def get_bert_inf(phones, word2ph, norm_text, language):
|
||||||
@ -566,7 +577,7 @@ def get_bert_inf(phones, word2ph, norm_text, language):
|
|||||||
else:
|
else:
|
||||||
bert = torch.zeros(
|
bert = torch.zeros(
|
||||||
(1024, len(phones)),
|
(1024, len(phones)),
|
||||||
dtype=torch.float16 if is_half == True else torch.float32,
|
dtype=torch.float16 if is_half else torch.float32,
|
||||||
).to(device)
|
).to(device)
|
||||||
|
|
||||||
return bert
|
return bert
|
||||||
@ -595,29 +606,26 @@ def get_first(text):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
from text import chinese
|
|
||||||
|
|
||||||
|
|
||||||
def get_phones_and_bert(text, language, version, final=False):
|
def get_phones_and_bert(text, language, version, final=False):
|
||||||
text = re.sub(r' {2,}', ' ', text)
|
text = re.sub(r" {2,}", " ", text)
|
||||||
textlist = []
|
textlist = []
|
||||||
langlist = []
|
langlist = []
|
||||||
if language == "all_zh":
|
if language == "all_zh":
|
||||||
for tmp in LangSegmenter.getTexts(text,"zh"):
|
for tmp in LangSegmenter.getTexts(text, "zh"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_yue":
|
elif language == "all_yue":
|
||||||
for tmp in LangSegmenter.getTexts(text,"zh"):
|
for tmp in LangSegmenter.getTexts(text, "zh"):
|
||||||
if tmp["lang"] == "zh":
|
if tmp["lang"] == "zh":
|
||||||
tmp["lang"] = "yue"
|
tmp["lang"] = "yue"
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_ja":
|
elif language == "all_ja":
|
||||||
for tmp in LangSegmenter.getTexts(text,"ja"):
|
for tmp in LangSegmenter.getTexts(text, "ja"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_ko":
|
elif language == "all_ko":
|
||||||
for tmp in LangSegmenter.getTexts(text,"ko"):
|
for tmp in LangSegmenter.getTexts(text, "ko"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "en":
|
elif language == "en":
|
||||||
@ -669,6 +677,7 @@ def get_phones_and_bert(text, language, version, final=False):
|
|||||||
|
|
||||||
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
|
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
|
||||||
|
|
||||||
|
|
||||||
spec_min = -12
|
spec_min = -12
|
||||||
spec_max = 2
|
spec_max = 2
|
||||||
|
|
||||||
@ -681,32 +690,36 @@ def denorm_spec(x):
|
|||||||
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
|
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
|
||||||
|
|
||||||
|
|
||||||
mel_fn = lambda x: mel_spectrogram_torch(
|
def mel_fn(x):
|
||||||
x,
|
return mel_spectrogram_torch(
|
||||||
**{
|
x,
|
||||||
"n_fft": 1024,
|
**{
|
||||||
"win_size": 1024,
|
"n_fft": 1024,
|
||||||
"hop_size": 256,
|
"win_size": 1024,
|
||||||
"num_mels": 100,
|
"hop_size": 256,
|
||||||
"sampling_rate": 24000,
|
"num_mels": 100,
|
||||||
"fmin": 0,
|
"sampling_rate": 24000,
|
||||||
"fmax": None,
|
"fmin": 0,
|
||||||
"center": False,
|
"fmax": None,
|
||||||
},
|
"center": False,
|
||||||
)
|
},
|
||||||
mel_fn_v4 = lambda x: mel_spectrogram_torch(
|
)
|
||||||
x,
|
|
||||||
**{
|
|
||||||
"n_fft": 1280,
|
def mel_fn_v4(x):
|
||||||
"win_size": 1280,
|
return mel_spectrogram_torch(
|
||||||
"hop_size": 320,
|
x,
|
||||||
"num_mels": 100,
|
**{
|
||||||
"sampling_rate": 32000,
|
"n_fft": 1280,
|
||||||
"fmin": 0,
|
"win_size": 1280,
|
||||||
"fmax": None,
|
"hop_size": 320,
|
||||||
"center": False,
|
"num_mels": 100,
|
||||||
},
|
"sampling_rate": 32000,
|
||||||
)
|
"fmin": 0,
|
||||||
|
"fmax": None,
|
||||||
|
"center": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def merge_short_text_in_array(texts, threshold):
|
def merge_short_text_in_array(texts, threshold):
|
||||||
@ -732,8 +745,8 @@ sr_model = None
|
|||||||
|
|
||||||
def audio_sr(audio, sr):
|
def audio_sr(audio, sr):
|
||||||
global sr_model
|
global sr_model
|
||||||
if sr_model == None:
|
if sr_model is None:
|
||||||
from tools.audio_sr import AP_BWE
|
from gsv_tools.audio_sr import AP_BWE
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sr_model = AP_BWE(device, DictToAttrRecursive)
|
sr_model = AP_BWE(device, DictToAttrRecursive)
|
||||||
@ -801,10 +814,10 @@ def get_tts_wav(
|
|||||||
print(i18n("实际输入的目标文本:"), text)
|
print(i18n("实际输入的目标文本:"), text)
|
||||||
zero_wav = np.zeros(
|
zero_wav = np.zeros(
|
||||||
int(hps.data.sampling_rate * pause_second),
|
int(hps.data.sampling_rate * pause_second),
|
||||||
dtype=np.float16 if is_half == True else np.float32,
|
dtype=np.float16 if is_half else np.float32,
|
||||||
)
|
)
|
||||||
zero_wav_torch = torch.from_numpy(zero_wav)
|
zero_wav_torch = torch.from_numpy(zero_wav)
|
||||||
if is_half == True:
|
if is_half:
|
||||||
zero_wav_torch = zero_wav_torch.half().to(device)
|
zero_wav_torch = zero_wav_torch.half().to(device)
|
||||||
else:
|
else:
|
||||||
zero_wav_torch = zero_wav_torch.to(device)
|
zero_wav_torch = zero_wav_torch.to(device)
|
||||||
@ -815,7 +828,7 @@ def get_tts_wav(
|
|||||||
gr.Warning(i18n("参考音频在3~10秒范围外,请更换!"))
|
gr.Warning(i18n("参考音频在3~10秒范围外,请更换!"))
|
||||||
raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
|
raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
|
||||||
wav16k = torch.from_numpy(wav16k)
|
wav16k = torch.from_numpy(wav16k)
|
||||||
if is_half == True:
|
if is_half:
|
||||||
wav16k = wav16k.half().to(device)
|
wav16k = wav16k.half().to(device)
|
||||||
else:
|
else:
|
||||||
wav16k = wav16k.to(device)
|
wav16k = wav16k.to(device)
|
||||||
@ -871,7 +884,7 @@ def get_tts_wav(
|
|||||||
t2 = ttime()
|
t2 = ttime()
|
||||||
# cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature)
|
# cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature)
|
||||||
# print(cache.keys(),if_freeze)
|
# print(cache.keys(),if_freeze)
|
||||||
if i_text in cache and if_freeze == True:
|
if i_text in cache and if_freeze:
|
||||||
pred_semantic = cache[i_text]
|
pred_semantic = cache[i_text]
|
||||||
else:
|
else:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
@ -896,7 +909,7 @@ def get_tts_wav(
|
|||||||
refers = []
|
refers = []
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
sv_emb = []
|
sv_emb = []
|
||||||
if sv_cn_model == None:
|
if sv_cn_model is None:
|
||||||
init_sv_cn()
|
init_sv_cn()
|
||||||
if inp_refs:
|
if inp_refs:
|
||||||
for path in inp_refs:
|
for path in inp_refs:
|
||||||
@ -965,10 +978,10 @@ def get_tts_wav(
|
|||||||
cfm_res = torch.cat(cfm_resss, 2)
|
cfm_res = torch.cat(cfm_resss, 2)
|
||||||
cfm_res = denorm_spec(cfm_res)
|
cfm_res = denorm_spec(cfm_res)
|
||||||
if model_version == "v3":
|
if model_version == "v3":
|
||||||
if bigvgan_model == None:
|
if bigvgan_model is None:
|
||||||
init_bigvgan()
|
init_bigvgan()
|
||||||
else: # v4
|
else: # v4
|
||||||
if hifigan_model == None:
|
if hifigan_model is None:
|
||||||
init_hifigan()
|
init_hifigan()
|
||||||
vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model
|
vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
@ -982,7 +995,7 @@ def get_tts_wav(
|
|||||||
t4 = ttime()
|
t4 = ttime()
|
||||||
t.extend([t2 - t1, t3 - t2, t4 - t3])
|
t.extend([t2 - t1, t3 - t2, t4 - t3])
|
||||||
t1 = ttime()
|
t1 = ttime()
|
||||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
|
print(f"{t[0]:.3f}\t{sum(t[1::3]):.3f}\t{sum(t[2::3]):.3f}\t{sum(t[3::3]):.3f}")
|
||||||
audio_opt = torch.cat(audio_opt, 0) # np.concatenate
|
audio_opt = torch.cat(audio_opt, 0) # np.concatenate
|
||||||
if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
|
if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
|
||||||
opt_sr = 32000
|
opt_sr = 32000
|
||||||
@ -990,7 +1003,7 @@ def get_tts_wav(
|
|||||||
opt_sr = 24000
|
opt_sr = 24000
|
||||||
else:
|
else:
|
||||||
opt_sr = 48000 # v4
|
opt_sr = 48000 # v4
|
||||||
if if_sr == True and opt_sr == 24000:
|
if if_sr and opt_sr == 24000:
|
||||||
print(i18n("音频超分中"))
|
print(i18n("音频超分中"))
|
||||||
audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
|
audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
|
||||||
max_audio = np.abs(audio_opt).max()
|
max_audio = np.abs(audio_opt).max()
|
||||||
@ -1062,7 +1075,7 @@ def cut2(inp):
|
|||||||
|
|
||||||
def cut3(inp):
|
def cut3(inp):
|
||||||
inp = inp.strip("\n")
|
inp = inp.strip("\n")
|
||||||
opts = ["%s" % item for item in inp.strip("。").split("。")]
|
opts = [f"{item}" for item in inp.strip("。").split("。")]
|
||||||
opts = [item for item in opts if not set(item).issubset(punctuation)]
|
opts = [item for item in opts if not set(item).issubset(punctuation)]
|
||||||
return "\n".join(opts)
|
return "\n".join(opts)
|
||||||
|
|
||||||
@ -1077,7 +1090,7 @@ def cut4(inp):
|
|||||||
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
|
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
|
||||||
def cut5(inp):
|
def cut5(inp):
|
||||||
inp = inp.strip("\n")
|
inp = inp.strip("\n")
|
||||||
punds = {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}
|
punds = {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ":", "…"}
|
||||||
mergeitems = []
|
mergeitems = []
|
||||||
items = []
|
items = []
|
||||||
|
|
||||||
@ -1101,7 +1114,7 @@ def cut5(inp):
|
|||||||
|
|
||||||
def custom_sort_key(s):
|
def custom_sort_key(s):
|
||||||
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
||||||
parts = re.split("(\d+)", s)
|
parts = re.split(r"(\d+)", s)
|
||||||
# 将数字部分转换为整数,非数字部分保持不变
|
# 将数字部分转换为整数,非数字部分保持不变
|
||||||
parts = [int(part) if part.isdigit() else part for part in parts]
|
parts = [int(part) if part.isdigit() else part for part in parts]
|
||||||
return parts
|
return parts
|
||||||
|
|||||||
@ -6,32 +6,37 @@
|
|||||||
全部按英文识别
|
全部按英文识别
|
||||||
全部按日文识别
|
全部按日文识别
|
||||||
"""
|
"""
|
||||||
import psutil
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
|
||||||
def set_high_priority():
|
def set_high_priority():
|
||||||
"""把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
|
"""把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
|
||||||
if os.name != "nt":
|
if os.name != "nt":
|
||||||
return # 仅 Windows 有效
|
return # 仅 Windows 有效
|
||||||
p = psutil.Process(os.getpid())
|
p = psutil.Process(os.getpid())
|
||||||
try:
|
try:
|
||||||
p.nice(psutil.HIGH_PRIORITY_CLASS)
|
p.nice(psutil.HIGH_PRIORITY_CLASS)
|
||||||
print("已将进程优先级设为 High")
|
print("已将进程优先级设为 High")
|
||||||
except psutil.AccessDenied:
|
except psutil.AccessDenied:
|
||||||
print("权限不足,无法修改优先级(请用管理员运行)")
|
print("权限不足,无法修改优先级(请用管理员运行)")
|
||||||
|
|
||||||
|
|
||||||
set_high_priority()
|
set_high_priority()
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
sys.path.append(f"{now_dir}/GPT_SoVITS")
|
||||||
|
|
||||||
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
||||||
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
||||||
@ -60,8 +65,9 @@ import gradio as gr
|
|||||||
from TTS_infer_pack.text_segmentation_method import get_method
|
from TTS_infer_pack.text_segmentation_method import get_method
|
||||||
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
|
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
|
||||||
|
|
||||||
from tools.assets import css, js, top_html
|
from gsv_tools.assets import css, js, top_html
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
|
|
||||||
|
|
||||||
language = os.environ.get("language", "Auto")
|
language = os.environ.get("language", "Auto")
|
||||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||||
@ -114,9 +120,11 @@ cut_method = {
|
|||||||
|
|
||||||
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
|
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
|
||||||
|
|
||||||
|
|
||||||
SoVITS_names, GPT_names = get_weights_names()
|
SoVITS_names, GPT_names = get_weights_names()
|
||||||
from config import pretrained_sovits_name
|
from config import pretrained_sovits_name
|
||||||
|
|
||||||
|
|
||||||
path_sovits_v3 = pretrained_sovits_name["v3"]
|
path_sovits_v3 = pretrained_sovits_name["v3"]
|
||||||
path_sovits_v4 = pretrained_sovits_name["v4"]
|
path_sovits_v4 = pretrained_sovits_name["v4"]
|
||||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||||
@ -203,7 +211,7 @@ def inference(
|
|||||||
|
|
||||||
def custom_sort_key(s):
|
def custom_sort_key(s):
|
||||||
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
||||||
parts = re.split("(\d+)", s)
|
parts = re.split(r"(\d+)", s)
|
||||||
# 将数字部分转换为整数,非数字部分保持不变
|
# 将数字部分转换为整数,非数字部分保持不变
|
||||||
parts = [int(part) if part.isdigit() else part for part in parts]
|
parts = [int(part) if part.isdigit() else part for part in parts]
|
||||||
return parts
|
return parts
|
||||||
@ -215,7 +223,7 @@ else:
|
|||||||
with open("./weight.json", "w", encoding="utf-8") as file:
|
with open("./weight.json", "w", encoding="utf-8") as file:
|
||||||
json.dump({"GPT": {}, "SoVITS": {}}, file)
|
json.dump({"GPT": {}, "SoVITS": {}}, file)
|
||||||
|
|
||||||
with open("./weight.json", "r", encoding="utf-8") as file:
|
with open("./weight.json", encoding="utf-8") as file:
|
||||||
weight_data = file.read()
|
weight_data = file.read()
|
||||||
weight_data = json.loads(weight_data)
|
weight_data = json.loads(weight_data)
|
||||||
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
||||||
@ -227,6 +235,7 @@ with open("./weight.json", "r", encoding="utf-8") as file:
|
|||||||
|
|
||||||
from process_ckpt import get_sovits_version_from_path_fast
|
from process_ckpt import get_sovits_version_from_path_fast
|
||||||
|
|
||||||
|
|
||||||
v3v4set = {"v3", "v4"}
|
v3v4set = {"v3", "v4"}
|
||||||
|
|
||||||
|
|
||||||
@ -238,8 +247,8 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
|||||||
# print(sovits_path,version, model_version, if_lora_v3)
|
# print(sovits_path,version, model_version, if_lora_v3)
|
||||||
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
||||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||||
if if_lora_v3 == True and is_exist == False:
|
if if_lora_v3 and not is_exist:
|
||||||
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重")
|
info = path_sovits + f"SoVITS {model_version}" + i18n("底模缺失,无法加载相应 LoRA 权重")
|
||||||
gr.Warning(info)
|
gr.Warning(info)
|
||||||
raise FileExistsError(info)
|
raise FileExistsError(info)
|
||||||
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
||||||
|
|||||||
@ -1,14 +1,16 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
import torch.utils.data
|
import torch.utils.data
|
||||||
|
from text import cleaned_text_to_sequence
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from module.mel_processing import spectrogram_torch, spec_to_mel_torch
|
from gsv_tools.my_utils import load_audio
|
||||||
from text import cleaned_text_to_sequence
|
from module.mel_processing import spec_to_mel_torch, spectrogram_torch
|
||||||
import torch.nn.functional as F
|
|
||||||
from tools.my_utils import load_audio
|
|
||||||
|
|
||||||
version = os.environ.get("version", None)
|
version = os.environ.get("version", None)
|
||||||
|
|
||||||
@ -23,22 +25,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
|
|
||||||
def __init__(self, hparams, version=None, val=False):
|
def __init__(self, hparams, version=None, val=False):
|
||||||
exp_dir = hparams.exp_dir
|
exp_dir = hparams.exp_dir
|
||||||
self.path2 = "%s/2-name2text.txt" % exp_dir
|
self.path2 = f"{exp_dir}/2-name2text.txt"
|
||||||
self.path4 = "%s/4-cnhubert" % exp_dir
|
self.path4 = f"{exp_dir}/4-cnhubert"
|
||||||
self.path5 = "%s/5-wav32k" % exp_dir
|
self.path5 = f"{exp_dir}/5-wav32k"
|
||||||
assert os.path.exists(self.path2)
|
assert os.path.exists(self.path2)
|
||||||
assert os.path.exists(self.path4)
|
assert os.path.exists(self.path4)
|
||||||
assert os.path.exists(self.path5)
|
assert os.path.exists(self.path5)
|
||||||
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
|
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
self.path7 = "%s/7-sv_cn" % exp_dir
|
self.path7 = f"{exp_dir}/7-sv_cn"
|
||||||
assert os.path.exists(self.path7)
|
assert os.path.exists(self.path7)
|
||||||
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
||||||
names5 = set(os.listdir(self.path5))
|
names5 = set(os.listdir(self.path5))
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
names6 = set([name[:-3] for name in list(os.listdir(self.path7))]) # 去除.pt后缀
|
names6 = set([name[:-3] for name in list(os.listdir(self.path7))]) # 去除.pt后缀
|
||||||
self.phoneme_data = {}
|
self.phoneme_data = {}
|
||||||
with open(self.path2, "r", encoding="utf8") as f:
|
with open(self.path2, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -85,7 +87,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
skipped_phone += 1
|
skipped_phone += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
|
size = os.path.getsize(f"{self.path5}/{audiopath}")
|
||||||
duration = size / self.sampling_rate / 2
|
duration = size / self.sampling_rate / 2
|
||||||
|
|
||||||
if duration == 0:
|
if duration == 0:
|
||||||
@ -110,15 +112,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
audiopath, phoneme_ids = audiopath_sid_text
|
audiopath, phoneme_ids = audiopath_sid_text
|
||||||
text = torch.FloatTensor(phoneme_ids)
|
text = torch.FloatTensor(phoneme_ids)
|
||||||
try:
|
try:
|
||||||
spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
|
spec, wav = self.get_audio(f"{self.path5}/{audiopath}")
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
|
||||||
if ssl.shape[-1] != spec.shape[-1]:
|
if ssl.shape[-1] != spec.shape[-1]:
|
||||||
typee = ssl.dtype
|
typee = ssl.dtype
|
||||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||||
ssl.requires_grad = False
|
ssl.requires_grad = False
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
|
sv_emb = torch.load(f"{self.path7}/{audiopath}.pt", map_location="cpu")
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
spec = torch.zeros(1025, 100)
|
spec = torch.zeros(1025, 100)
|
||||||
@ -285,16 +287,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
|||||||
|
|
||||||
def __init__(self, hparams, val=False):
|
def __init__(self, hparams, val=False):
|
||||||
exp_dir = hparams.exp_dir
|
exp_dir = hparams.exp_dir
|
||||||
self.path2 = "%s/2-name2text.txt" % exp_dir
|
self.path2 = f"{exp_dir}/2-name2text.txt"
|
||||||
self.path4 = "%s/4-cnhubert" % exp_dir
|
self.path4 = f"{exp_dir}/4-cnhubert"
|
||||||
self.path5 = "%s/5-wav32k" % exp_dir
|
self.path5 = f"{exp_dir}/5-wav32k"
|
||||||
assert os.path.exists(self.path2)
|
assert os.path.exists(self.path2)
|
||||||
assert os.path.exists(self.path4)
|
assert os.path.exists(self.path4)
|
||||||
assert os.path.exists(self.path5)
|
assert os.path.exists(self.path5)
|
||||||
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
||||||
names5 = set(os.listdir(self.path5))
|
names5 = set(os.listdir(self.path5))
|
||||||
self.phoneme_data = {}
|
self.phoneme_data = {}
|
||||||
with open(self.path2, "r", encoding="utf8") as f:
|
with open(self.path2, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -339,7 +341,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
|||||||
skipped_phone += 1
|
skipped_phone += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
|
size = os.path.getsize(f"{self.path5}/{audiopath}")
|
||||||
duration = size / self.sampling_rate / 2
|
duration = size / self.sampling_rate / 2
|
||||||
|
|
||||||
if duration == 0:
|
if duration == 0:
|
||||||
@ -376,9 +378,9 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
|||||||
audiopath, phoneme_ids = audiopath_sid_text
|
audiopath, phoneme_ids = audiopath_sid_text
|
||||||
text = torch.FloatTensor(phoneme_ids)
|
text = torch.FloatTensor(phoneme_ids)
|
||||||
try:
|
try:
|
||||||
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
|
spec, mel = self.get_audio(f"{self.path5}/{audiopath}")
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
|
||||||
if ssl.shape[-1] != spec.shape[-1]:
|
if ssl.shape[-1] != spec.shape[-1]:
|
||||||
typee = ssl.dtype
|
typee = ssl.dtype
|
||||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||||
@ -523,16 +525,16 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
|
|||||||
|
|
||||||
def __init__(self, hparams, val=False):
|
def __init__(self, hparams, val=False):
|
||||||
exp_dir = hparams.exp_dir
|
exp_dir = hparams.exp_dir
|
||||||
self.path2 = "%s/2-name2text.txt" % exp_dir
|
self.path2 = f"{exp_dir}/2-name2text.txt"
|
||||||
self.path4 = "%s/4-cnhubert" % exp_dir
|
self.path4 = f"{exp_dir}/4-cnhubert"
|
||||||
self.path5 = "%s/5-wav32k" % exp_dir
|
self.path5 = f"{exp_dir}/5-wav32k"
|
||||||
assert os.path.exists(self.path2)
|
assert os.path.exists(self.path2)
|
||||||
assert os.path.exists(self.path4)
|
assert os.path.exists(self.path4)
|
||||||
assert os.path.exists(self.path5)
|
assert os.path.exists(self.path5)
|
||||||
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
||||||
names5 = set(os.listdir(self.path5))
|
names5 = set(os.listdir(self.path5))
|
||||||
self.phoneme_data = {}
|
self.phoneme_data = {}
|
||||||
with open(self.path2, "r", encoding="utf8") as f:
|
with open(self.path2, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -577,7 +579,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
|
|||||||
skipped_phone += 1
|
skipped_phone += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
|
size = os.path.getsize(f"{self.path5}/{audiopath}")
|
||||||
duration = size / self.sampling_rate / 2
|
duration = size / self.sampling_rate / 2
|
||||||
|
|
||||||
if duration == 0:
|
if duration == 0:
|
||||||
@ -614,9 +616,9 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
|
|||||||
audiopath, phoneme_ids = audiopath_sid_text
|
audiopath, phoneme_ids = audiopath_sid_text
|
||||||
text = torch.FloatTensor(phoneme_ids)
|
text = torch.FloatTensor(phoneme_ids)
|
||||||
try:
|
try:
|
||||||
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
|
spec, mel = self.get_audio(f"{self.path5}/{audiopath}")
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
|
||||||
if ssl.shape[-1] != spec.shape[-1]:
|
if ssl.shape[-1] != spec.shape[-1]:
|
||||||
typee = ssl.dtype
|
typee = ssl.dtype
|
||||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||||
@ -734,16 +736,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
|||||||
|
|
||||||
def __init__(self, hparams, val=False):
|
def __init__(self, hparams, val=False):
|
||||||
exp_dir = hparams.exp_dir
|
exp_dir = hparams.exp_dir
|
||||||
self.path2 = "%s/2-name2text.txt" % exp_dir
|
self.path2 = f"{exp_dir}/2-name2text.txt"
|
||||||
self.path4 = "%s/4-cnhubert" % exp_dir
|
self.path4 = f"{exp_dir}/4-cnhubert"
|
||||||
self.path5 = "%s/5-wav32k" % exp_dir
|
self.path5 = f"{exp_dir}/5-wav32k"
|
||||||
assert os.path.exists(self.path2)
|
assert os.path.exists(self.path2)
|
||||||
assert os.path.exists(self.path4)
|
assert os.path.exists(self.path4)
|
||||||
assert os.path.exists(self.path5)
|
assert os.path.exists(self.path5)
|
||||||
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
||||||
names5 = set(os.listdir(self.path5))
|
names5 = set(os.listdir(self.path5))
|
||||||
self.phoneme_data = {}
|
self.phoneme_data = {}
|
||||||
with open(self.path2, "r", encoding="utf8") as f:
|
with open(self.path2, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -788,7 +790,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
|||||||
skipped_phone += 1
|
skipped_phone += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
|
size = os.path.getsize(f"{self.path5}/{audiopath}")
|
||||||
duration = size / self.sampling_rate / 2
|
duration = size / self.sampling_rate / 2
|
||||||
|
|
||||||
if duration == 0:
|
if duration == 0:
|
||||||
@ -825,9 +827,9 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
|||||||
audiopath, phoneme_ids = audiopath_sid_text
|
audiopath, phoneme_ids = audiopath_sid_text
|
||||||
text = torch.FloatTensor(phoneme_ids)
|
text = torch.FloatTensor(phoneme_ids)
|
||||||
try:
|
try:
|
||||||
spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
|
spec, mel, wav = self.get_audio(f"{self.path5}/{audiopath}")
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
|
||||||
if ssl.shape[-1] != spec.shape[-1]:
|
if ssl.shape[-1] != spec.shape[-1]:
|
||||||
typee = ssl.dtype
|
typee = ssl.dtype
|
||||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
inp_text = os.environ.get("inp_text")
|
inp_text = os.environ.get("inp_text")
|
||||||
inp_wav_dir = os.environ.get("inp_wav_dir")
|
inp_wav_dir = os.environ.get("inp_wav_dir")
|
||||||
exp_name = os.environ.get("exp_name")
|
exp_name = os.environ.get("exp_name")
|
||||||
@ -13,13 +12,12 @@ opt_dir = os.environ.get("opt_dir")
|
|||||||
bert_pretrained_dir = os.environ.get("bert_pretrained_dir")
|
bert_pretrained_dir = os.environ.get("bert_pretrained_dir")
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||||
version = os.environ.get("version", None)
|
version = os.environ.get("version", None)
|
||||||
import traceback
|
|
||||||
import os.path
|
import os.path
|
||||||
from text.cleaner import clean_text
|
import shutil
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
import traceback
|
||||||
from tools.my_utils import clean_path
|
|
||||||
|
|
||||||
# inp_text=sys.argv[1]
|
# inp_text=sys.argv[1]
|
||||||
# inp_wav_dir=sys.argv[2]
|
# inp_wav_dir=sys.argv[2]
|
||||||
@ -29,23 +27,26 @@ from tools.my_utils import clean_path
|
|||||||
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu
|
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu
|
||||||
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
|
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
|
||||||
# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"
|
# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"
|
||||||
|
|
||||||
from time import time as ttime
|
from time import time as ttime
|
||||||
import shutil
|
|
||||||
|
from text.cleaner import clean_text
|
||||||
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
|
|
||||||
|
from gsv_tools.my_utils import clean_path
|
||||||
|
|
||||||
|
|
||||||
def my_save(fea, path):
    """Save ``fea`` to ``path`` by way of a temporary file in the working directory.

    Workaround for ``torch.save`` not supporting Chinese paths: the object is
    first written to a file whose name is built only from the current
    timestamp and ``i_part``, and that file is then moved to the destination.

    Args:
        fea: Object to serialize with ``torch.save``.
        path: Final destination path of the saved file.
    """
    target_dir = os.path.dirname(path)
    name = os.path.basename(path)
    # NOTE(review): the i_part suffix presumably keeps temp names distinct
    # between parallel workers -- confirm against the launcher.
    tmp_path = f"{ttime()}{i_part}.pth"
    torch.save(fea, tmp_path)
    # os.path.join keeps a bare file name relative; formatting "dir/name" by
    # hand produced "/name" (the filesystem root) when path had no directory.
    shutil.move(tmp_path, os.path.join(target_dir, name))
|
||||||
|
|
||||||
|
|
||||||
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
|
txt_path = f"{opt_dir}/2-name2text-{i_part}.txt"
|
||||||
if os.path.exists(txt_path) == False:
|
if not os.path.exists(txt_path):
|
||||||
bert_dir = "%s/3-bert" % (opt_dir)
|
bert_dir = f"{opt_dir}/3-bert"
|
||||||
os.makedirs(opt_dir, exist_ok=True)
|
os.makedirs(opt_dir, exist_ok=True)
|
||||||
os.makedirs(bert_dir, exist_ok=True)
|
os.makedirs(bert_dir, exist_ok=True)
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
@ -60,7 +61,7 @@ if os.path.exists(txt_path) == False:
|
|||||||
raise FileNotFoundError(bert_pretrained_dir)
|
raise FileNotFoundError(bert_pretrained_dir)
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
|
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
|
||||||
bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
|
bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
|
||||||
if is_half == True:
|
if is_half:
|
||||||
bert_model = bert_model.half().to(device)
|
bert_model = bert_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
bert_model = bert_model.to(device)
|
bert_model = bert_model.to(device)
|
||||||
@ -90,8 +91,8 @@ if os.path.exists(txt_path) == False:
|
|||||||
name = os.path.basename(name)
|
name = os.path.basename(name)
|
||||||
print(name)
|
print(name)
|
||||||
phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("¥", ","), lan, version)
|
phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("¥", ","), lan, version)
|
||||||
path_bert = "%s/%s.pt" % (bert_dir, name)
|
path_bert = f"{bert_dir}/{name}.pt"
|
||||||
if os.path.exists(path_bert) == False and lan == "zh":
|
if not os.path.exists(path_bert) and lan == "zh":
|
||||||
bert_feature = get_bert_feature(norm_text, word2ph)
|
bert_feature = get_bert_feature(norm_text, word2ph)
|
||||||
assert bert_feature.shape[-1] == len(phones)
|
assert bert_feature.shape[-1] == len(phones)
|
||||||
# torch.save(bert_feature, path_bert)
|
# torch.save(bert_feature, path_bert)
|
||||||
@ -104,7 +105,7 @@ if os.path.exists(txt_path) == False:
|
|||||||
|
|
||||||
todo = []
|
todo = []
|
||||||
res = []
|
res = []
|
||||||
with open(inp_text, "r", encoding="utf8") as f:
|
with open(inp_text, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
language_v1_to_language_v2 = {
|
language_v1_to_language_v2 = {
|
||||||
@ -138,6 +139,6 @@ if os.path.exists(txt_path) == False:
|
|||||||
process(todo, res)
|
process(todo, res)
|
||||||
opt = []
|
opt = []
|
||||||
for name, phones, word2ph, norm_text in res:
|
for name, phones, word2ph, norm_text in res:
|
||||||
opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text))
|
opt.append(f"{name}\t{phones}\t{word2ph}\t{norm_text}")
|
||||||
with open(txt_path, "w", encoding="utf8") as f:
|
with open(txt_path, "w", encoding="utf8") as f:
|
||||||
f.write("\n".join(opt) + "\n")
|
f.write("\n".join(opt) + "\n")
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
inp_text = os.environ.get("inp_text")
|
inp_text = os.environ.get("inp_text")
|
||||||
inp_wav_dir = os.environ.get("inp_wav_dir")
|
inp_wav_dir = os.environ.get("inp_wav_dir")
|
||||||
@ -12,20 +11,24 @@ if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
|||||||
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
||||||
from feature_extractor import cnhubert
|
from feature_extractor import cnhubert
|
||||||
|
|
||||||
|
|
||||||
opt_dir = os.environ.get("opt_dir")
|
opt_dir = os.environ.get("opt_dir")
|
||||||
cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir")
|
cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir")
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
|
import librosa
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.io import wavfile
|
from scipy.io import wavfile
|
||||||
import librosa
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
from tools.my_utils import load_audio, clean_path
|
import shutil
|
||||||
|
|
||||||
# from config import cnhubert_base_path
|
# from config import cnhubert_base_path
|
||||||
# cnhubert.cnhubert_base_path=cnhubert_base_path
|
# cnhubert.cnhubert_base_path=cnhubert_base_path
|
||||||
@ -37,22 +40,22 @@ from tools.my_utils import load_audio, clean_path
|
|||||||
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
|
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
|
||||||
# cnhubert.cnhubert_base_path=sys.argv[7]
|
# cnhubert.cnhubert_base_path=sys.argv[7]
|
||||||
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
|
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
|
||||||
|
|
||||||
from time import time as ttime
|
from time import time as ttime
|
||||||
import shutil
|
|
||||||
|
from gsv_tools.my_utils import clean_path, load_audio
|
||||||
|
|
||||||
|
|
||||||
def my_save(fea, path):
    """Save ``fea`` to ``path`` by way of a temporary file in the working directory.

    Workaround for ``torch.save`` not supporting Chinese paths: the object is
    first written to a file whose name is built only from the current
    timestamp and ``i_part``, and that file is then moved to the destination.

    Args:
        fea: Object to serialize with ``torch.save``.
        path: Final destination path of the saved file.
    """
    target_dir = os.path.dirname(path)
    name = os.path.basename(path)
    # NOTE(review): the i_part suffix presumably keeps temp names distinct
    # between parallel workers -- confirm against the launcher.
    tmp_path = f"{ttime()}{i_part}.pth"
    torch.save(fea, tmp_path)
    # os.path.join keeps a bare file name relative; formatting "dir/name" by
    # hand produced "/name" (the filesystem root) when path had no directory.
    shutil.move(tmp_path, os.path.join(target_dir, name))
|
||||||
|
|
||||||
|
|
||||||
hubert_dir = "%s/4-cnhubert" % (opt_dir)
|
hubert_dir = f"{opt_dir}/4-cnhubert"
|
||||||
wav32dir = "%s/5-wav32k" % (opt_dir)
|
wav32dir = f"{opt_dir}/5-wav32k"
|
||||||
os.makedirs(opt_dir, exist_ok=True)
|
os.makedirs(opt_dir, exist_ok=True)
|
||||||
os.makedirs(hubert_dir, exist_ok=True)
|
os.makedirs(hubert_dir, exist_ok=True)
|
||||||
os.makedirs(wav32dir, exist_ok=True)
|
os.makedirs(wav32dir, exist_ok=True)
|
||||||
@ -67,7 +70,7 @@ else:
|
|||||||
device = "cpu"
|
device = "cpu"
|
||||||
model = cnhubert.get_model()
|
model = cnhubert.get_model()
|
||||||
# is_half=False
|
# is_half=False
|
||||||
if is_half == True:
|
if is_half:
|
||||||
model = model.half().to(device)
|
model = model.half().to(device)
|
||||||
else:
|
else:
|
||||||
model = model.to(device)
|
model = model.to(device)
|
||||||
@ -76,36 +79,36 @@ nan_fails = []
|
|||||||
|
|
||||||
|
|
||||||
def name2go(wav_name, wav_path):
    """Extract and store the cnhubert feature and the 32k wav for one clip.

    Does nothing when ``{hubert_dir}/{wav_name}.pt`` already exists. Clips
    whose absolute peak exceeds 2.2 are reported and skipped. Clips whose
    feature contains NaN are recorded in ``nan_fails`` and skipped.

    Args:
        wav_name: File name used for both output files.
        wav_path: Path of the audio file handed to ``load_audio``.
    """
    hubert_path = f"{hubert_dir}/{wav_name}.pt"
    if os.path.exists(hubert_path):
        return

    audio = load_audio(wav_path, 32000)
    peak = np.abs(audio).max()
    if peak > 2.2:
        print(f"{wav_name}-filtered,{peak}")
        return

    # Two blends of the peak-normalised and the raw signal: one scaled by
    # 32768 for the int16 wav written below, one scaled by 1145.14 that is
    # fed to the feature model.
    audio_for_wav = (audio / peak * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * audio
    audio_for_model = (audio / peak * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * audio
    audio_16k = librosa.resample(audio_for_model, orig_sr=32000, target_sr=16000)  # not a resampling problem

    wav16 = torch.from_numpy(audio_16k)
    wav16 = wav16.half().to(device) if is_half else wav16.to(device)

    hidden = model.model(wav16.unsqueeze(0))["last_hidden_state"]
    ssl = hidden.transpose(1, 2).cpu()  # e.g. torch.Size([1, 768, 215])

    if np.isnan(ssl.detach().numpy()).sum() != 0:
        # Remember the clip; the caller retries failures later.
        nan_fails.append((wav_name, wav_path))
        print(f"nan filtered:{wav_name}")
        return

    wavfile.write(f"{wav32dir}/{wav_name}", 32000, audio_for_wav.astype("int16"))
    my_save(ssl, hubert_path)
|
||||||
|
|
||||||
|
|
||||||
with open(inp_text, "r", encoding="utf8") as f:
|
with open(inp_text, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
for line in lines[int(i_part) :: int(all_parts)]:
|
for line in lines[int(i_part) :: int(all_parts)]:
|
||||||
@ -113,9 +116,9 @@ for line in lines[int(i_part) :: int(all_parts)]:
|
|||||||
# wav_name,text=line.split("\t")
|
# wav_name,text=line.split("\t")
|
||||||
wav_name, spk_name, language, text = line.split("|")
|
wav_name, spk_name, language, text = line.split("|")
|
||||||
wav_name = clean_path(wav_name)
|
wav_name = clean_path(wav_name)
|
||||||
if inp_wav_dir != "" and inp_wav_dir != None:
|
if inp_wav_dir != "" and inp_wav_dir is not None:
|
||||||
wav_name = os.path.basename(wav_name)
|
wav_name = os.path.basename(wav_name)
|
||||||
wav_path = "%s/%s" % (inp_wav_dir, wav_name)
|
wav_path = f"{inp_wav_dir}/{wav_name}"
|
||||||
|
|
||||||
else:
|
else:
|
||||||
wav_path = wav_name
|
wav_path = wav_name
|
||||||
@ -124,7 +127,7 @@ for line in lines[int(i_part) :: int(all_parts)]:
|
|||||||
except:
|
except:
|
||||||
print(line, traceback.format_exc())
|
print(line, traceback.format_exc())
|
||||||
|
|
||||||
if len(nan_fails) > 0 and is_half == True:
|
if len(nan_fails) > 0 and is_half:
|
||||||
is_half = False
|
is_half = False
|
||||||
model = model.float()
|
model = model.float()
|
||||||
for wav in nan_fails:
|
for wav in nan_fails:
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
inp_text = os.environ.get("inp_text")
|
inp_text = os.environ.get("inp_text")
|
||||||
inp_wav_dir = os.environ.get("inp_wav_dir")
|
inp_wav_dir = os.environ.get("inp_wav_dir")
|
||||||
@ -15,32 +14,37 @@ opt_dir = os.environ.get("opt_dir")
|
|||||||
sv_path = os.environ.get("sv_path")
|
sv_path = os.environ.get("sv_path")
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import torchaudio
|
import torchaudio
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
|
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
|
||||||
from tools.my_utils import clean_path
|
|
||||||
from time import time as ttime
|
|
||||||
import shutil
|
import shutil
|
||||||
from ERes2NetV2 import ERes2NetV2
|
from time import time as ttime
|
||||||
|
|
||||||
import kaldi as Kaldi
|
import kaldi as Kaldi
|
||||||
|
from ERes2NetV2 import ERes2NetV2
|
||||||
|
|
||||||
|
from gsv_tools.my_utils import clean_path
|
||||||
|
|
||||||
|
|
||||||
def my_save(fea, path):
    """Save ``fea`` to ``path`` by way of a temporary file in the working directory.

    Workaround for ``torch.save`` not supporting Chinese paths: the object is
    first written to a file whose name is built only from the current
    timestamp and ``i_part``, and that file is then moved to the destination.

    Args:
        fea: Object to serialize with ``torch.save``.
        path: Final destination path of the saved file.
    """
    target_dir = os.path.dirname(path)
    name = os.path.basename(path)
    # NOTE(review): the i_part suffix presumably keeps temp names distinct
    # between parallel workers -- confirm against the launcher.
    tmp_path = f"{ttime()}{i_part}.pth"
    torch.save(fea, tmp_path)
    # os.path.join keeps a bare file name relative; formatting "dir/name" by
    # hand produced "/name" (the filesystem root) when path had no directory.
    shutil.move(tmp_path, os.path.join(target_dir, name))
|
||||||
|
|
||||||
|
|
||||||
sv_cn_dir = "%s/7-sv_cn" % (opt_dir)
|
sv_cn_dir = f"{opt_dir}/7-sv_cn"
|
||||||
wav32dir = "%s/5-wav32k" % (opt_dir)
|
wav32dir = f"{opt_dir}/5-wav32k"
|
||||||
os.makedirs(opt_dir, exist_ok=True)
|
os.makedirs(opt_dir, exist_ok=True)
|
||||||
os.makedirs(sv_cn_dir, exist_ok=True)
|
os.makedirs(sv_cn_dir, exist_ok=True)
|
||||||
os.makedirs(wav32dir, exist_ok=True)
|
os.makedirs(wav32dir, exist_ok=True)
|
||||||
@ -63,7 +67,7 @@ class SV:
|
|||||||
embedding_model.eval()
|
embedding_model.eval()
|
||||||
self.embedding_model = embedding_model
|
self.embedding_model = embedding_model
|
||||||
self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
|
self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
|
||||||
if is_half == False:
|
if not is_half:
|
||||||
self.embedding_model = self.embedding_model.to(device)
|
self.embedding_model = self.embedding_model.to(device)
|
||||||
else:
|
else:
|
||||||
self.embedding_model = self.embedding_model.half().to(device)
|
self.embedding_model = self.embedding_model.half().to(device)
|
||||||
@ -72,7 +76,7 @@ class SV:
|
|||||||
def compute_embedding3(self, wav): # (1,x)#-1~1
|
def compute_embedding3(self, wav): # (1,x)#-1~1
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
wav = self.res(wav)
|
wav = self.res(wav)
|
||||||
if self.is_half == True:
|
if self.is_half:
|
||||||
wav = wav.half()
|
wav = wav.half()
|
||||||
feat = torch.stack(
|
feat = torch.stack(
|
||||||
[Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
|
[Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
|
||||||
@ -85,10 +89,10 @@ sv = SV(device, is_half)
|
|||||||
|
|
||||||
|
|
||||||
def name2go(wav_name, wav_path):
|
def name2go(wav_name, wav_path):
|
||||||
sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
|
sv_cn_path = f"{sv_cn_dir}/{wav_name}.pt"
|
||||||
if os.path.exists(sv_cn_path):
|
if os.path.exists(sv_cn_path):
|
||||||
return
|
return
|
||||||
wav_path = "%s/%s" % (wav32dir, wav_name)
|
wav_path = f"{wav32dir}/{wav_name}"
|
||||||
wav32k, sr0 = torchaudio.load(wav_path)
|
wav32k, sr0 = torchaudio.load(wav_path)
|
||||||
assert sr0 == 32000
|
assert sr0 == 32000
|
||||||
wav32k = wav32k.to(device)
|
wav32k = wav32k.to(device)
|
||||||
@ -96,16 +100,16 @@ def name2go(wav_name, wav_path):
|
|||||||
my_save(emb, sv_cn_path)
|
my_save(emb, sv_cn_path)
|
||||||
|
|
||||||
|
|
||||||
with open(inp_text, "r", encoding="utf8") as f:
|
with open(inp_text, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
for line in lines[int(i_part) :: int(all_parts)]:
|
for line in lines[int(i_part) :: int(all_parts)]:
|
||||||
try:
|
try:
|
||||||
wav_name, spk_name, language, text = line.split("|")
|
wav_name, spk_name, language, text = line.split("|")
|
||||||
wav_name = clean_path(wav_name)
|
wav_name = clean_path(wav_name)
|
||||||
if inp_wav_dir != "" and inp_wav_dir != None:
|
if inp_wav_dir != "" and inp_wav_dir is not None:
|
||||||
wav_name = os.path.basename(wav_name)
|
wav_name = os.path.basename(wav_name)
|
||||||
wav_path = "%s/%s" % (inp_wav_dir, wav_name)
|
wav_path = f"{inp_wav_dir}/{wav_name}"
|
||||||
|
|
||||||
else:
|
else:
|
||||||
wav_path = wav_name
|
wav_path = wav_name
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
inp_text = os.environ.get("inp_text")
|
inp_text = os.environ.get("inp_text")
|
||||||
exp_name = os.environ.get("exp_name")
|
exp_name = os.environ.get("exp_name")
|
||||||
i_part = os.environ.get("i_part")
|
i_part = os.environ.get("i_part")
|
||||||
@ -28,20 +29,25 @@ else:
|
|||||||
version = "v3"
|
version = "v3"
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||||
import traceback
|
|
||||||
import sys
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
|
||||||
|
|
||||||
if version != "v3":
|
if version != "v3":
|
||||||
from module.models import SynthesizerTrn
|
from module.models import SynthesizerTrn
|
||||||
else:
|
else:
|
||||||
from module.models import SynthesizerTrnV3 as SynthesizerTrn
|
from module.models import SynthesizerTrnV3 as SynthesizerTrn
|
||||||
from tools.my_utils import clean_path
|
from gsv_tools.my_utils import clean_path
|
||||||
|
|
||||||
|
|
||||||
logging.getLogger("numba").setLevel(logging.WARNING)
|
logging.getLogger("numba").setLevel(logging.WARNING)
|
||||||
# from config import pretrained_s2G
|
# from config import pretrained_s2G
|
||||||
@ -54,9 +60,9 @@ logging.getLogger("numba").setLevel(logging.WARNING)
|
|||||||
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
|
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
|
||||||
|
|
||||||
|
|
||||||
hubert_dir = "%s/4-cnhubert" % (opt_dir)
|
hubert_dir = f"{opt_dir}/4-cnhubert"
|
||||||
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
|
semantic_path = f"{opt_dir}/6-name2semantic-{i_part}.tsv"
|
||||||
if os.path.exists(semantic_path) == False:
|
if not os.path.exists(semantic_path):
|
||||||
os.makedirs(opt_dir, exist_ok=True)
|
os.makedirs(opt_dir, exist_ok=True)
|
||||||
|
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
@ -73,7 +79,7 @@ if os.path.exists(semantic_path) == False:
|
|||||||
version=version,
|
version=version,
|
||||||
**hps.model,
|
**hps.model,
|
||||||
)
|
)
|
||||||
if is_half == True:
|
if is_half:
|
||||||
vq_model = vq_model.half().to(device)
|
vq_model = vq_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
vq_model = vq_model.to(device)
|
vq_model = vq_model.to(device)
|
||||||
@ -87,19 +93,19 @@ if os.path.exists(semantic_path) == False:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def name2go(wav_name, lines):
|
def name2go(wav_name, lines):
|
||||||
hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
|
hubert_path = f"{hubert_dir}/{wav_name}.pt"
|
||||||
if os.path.exists(hubert_path) == False:
|
if not os.path.exists(hubert_path):
|
||||||
return
|
return
|
||||||
ssl_content = torch.load(hubert_path, map_location="cpu")
|
ssl_content = torch.load(hubert_path, map_location="cpu")
|
||||||
if is_half == True:
|
if is_half:
|
||||||
ssl_content = ssl_content.half().to(device)
|
ssl_content = ssl_content.half().to(device)
|
||||||
else:
|
else:
|
||||||
ssl_content = ssl_content.to(device)
|
ssl_content = ssl_content.to(device)
|
||||||
codes = vq_model.extract_latent(ssl_content)
|
codes = vq_model.extract_latent(ssl_content)
|
||||||
semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
|
semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
|
||||||
lines.append("%s\t%s" % (wav_name, semantic))
|
lines.append(f"{wav_name}\t{semantic}")
|
||||||
|
|
||||||
with open(inp_text, "r", encoding="utf8") as f:
|
with open(inp_text, encoding="utf8") as f:
|
||||||
lines = f.read().strip("\n").split("\n")
|
lines = f.read().strip("\n").split("\n")
|
||||||
|
|
||||||
lines1 = []
|
lines1 = []
|
||||||
|
|||||||
@ -1,10 +1,13 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
import traceback
|
import traceback
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from time import time as ttime
|
from time import time as ttime
|
||||||
import shutil
|
|
||||||
import os
|
|
||||||
import torch
|
import torch
|
||||||
from tools.i18n.i18n import I18nAuto
|
|
||||||
|
from gsv_tools.i18n.i18n import I18nAuto
|
||||||
|
|
||||||
|
|
||||||
i18n = I18nAuto()
|
i18n = I18nAuto()
|
||||||
|
|
||||||
@ -12,13 +15,14 @@ i18n = I18nAuto()
|
|||||||
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
||||||
dir = os.path.dirname(path)
|
dir = os.path.dirname(path)
|
||||||
name = os.path.basename(path)
|
name = os.path.basename(path)
|
||||||
tmp_path = "%s.pth" % (ttime())
|
tmp_path = f"{ttime()}.pth"
|
||||||
torch.save(fea, tmp_path)
|
torch.save(fea, tmp_path)
|
||||||
shutil.move(tmp_path, "%s/%s" % (dir, name))
|
shutil.move(tmp_path, f"{dir}/{name}")
|
||||||
|
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
model_version2byte = {
|
model_version2byte = {
|
||||||
"v3": b"03",
|
"v3": b"03",
|
||||||
"v4": b"04",
|
"v4": b"04",
|
||||||
@ -47,14 +51,14 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
|
|||||||
continue
|
continue
|
||||||
opt["weight"][key] = ckpt[key].half()
|
opt["weight"][key] = ckpt[key].half()
|
||||||
opt["config"] = hps
|
opt["config"] = hps
|
||||||
opt["info"] = "%sepoch_%siteration" % (epoch, steps)
|
opt["info"] = f"{epoch}epoch_{steps}iteration"
|
||||||
if lora_rank:
|
if lora_rank:
|
||||||
opt["lora_rank"] = lora_rank
|
opt["lora_rank"] = lora_rank
|
||||||
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
|
my_save2(opt, f"{hps.save_weight_dir}/{name}.pth", model_version)
|
||||||
elif model_version != None and "Pro" in model_version:
|
elif model_version is not None and "Pro" in model_version:
|
||||||
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
|
my_save2(opt, f"{hps.save_weight_dir}/{name}.pth", model_version)
|
||||||
else:
|
else:
|
||||||
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
|
my_save(opt, f"{hps.save_weight_dir}/{name}.pth")
|
||||||
return "Success."
|
return "Success."
|
||||||
except:
|
except:
|
||||||
return traceback.format_exc()
|
return traceback.format_exc()
|
||||||
|
|||||||
287
README.md
287
README.md
@ -1,3 +1,5 @@
|
|||||||
|
#
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
<h1>GPT-SoVITS-WebUI</h1>
|
<h1>GPT-SoVITS-WebUI</h1>
|
||||||
@ -7,8 +9,6 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
|||||||
|
|
||||||
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
|
||||||
|
|
||||||
[](https://www.python.org)
|
[](https://www.python.org)
|
||||||
[](https://github.com/RVC-Boss/gpt-sovits/releases)
|
[](https://github.com/RVC-Boss/gpt-sovits/releases)
|
||||||
|
|
||||||
@ -27,7 +27,11 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Features:
|
<div align="center">
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
|
1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
|
||||||
|
|
||||||
@ -41,51 +45,56 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
|||||||
|
|
||||||
Unseen speakers few-shot fine-tuning demo:
|
Unseen speakers few-shot fine-tuning demo:
|
||||||
|
|
||||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
|
||||||
|
|
||||||
**RTF(inference speed) of GPT-SoVITS v2 ProPlus**:
|
**RTF(inference speed) of GPT-SoVITS v2 ProPlus**:
|
||||||
0.028 tested in 4060Ti, 0.014 tested in 4090 (1400words~=4min, inference time is 3.36s), 0.526 in M4 CPU. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference.
|
0.028 tested in 4060Ti, 0.014 tested in 4090 (1400words~=4min, inference time is 3.36s), 0.526 in M4 CPU. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference.
|
||||||
|
|
||||||
请不要尬黑GPT-SoVITS推理速度慢,谢谢!
|
|
||||||
|
|
||||||
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
|
For users in China, you can [Click Here to use](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) AutoDL Cloud Docker to experience the full functionality online.
|
||||||
|
|
||||||
### Tested Environments
|
### Tested Environments
|
||||||
|
|
||||||
| Python Version | PyTorch Version | Device |
|
| Python Version | PyTorch Version | Device |
|
||||||
| -------------- | ---------------- | ------------- |
|
| -------------- | --------------- | ------------- |
|
||||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
|
||||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
|
||||||
|
| Python 3.10 | PyTorch 2.9.0 | CPU |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
|
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main) and double-click on `go-webui.bat` to start GPT-SoVITS-WebUI.
|
||||||
|
|
||||||
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
|
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
|
||||||
|
|
||||||
Install the program by running the following commands:
|
Install the program by running the following commands:
|
||||||
|
|
||||||
```pwsh
|
```pwsh
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
|
pwsh -F install.ps1 --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
||||||
|
Install the program by running the following commands:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### macOS
|
### macOS
|
||||||
@ -95,54 +104,51 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
|
|||||||
Install the program by running the following commands:
|
Install the program by running the following commands:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### Install Manually
|
### Install Manually
|
||||||
|
|
||||||
#### Install Dependences
|
Install the program by running the following commands:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
|
conda install uv ffmpeg -c conda-forge
|
||||||
|
|
||||||
pip install -r extra-req.txt --no-deps
|
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
|
||||||
pip install -r requirements.txt
|
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
|
||||||
|
uv pip install ".[flash-attn]"
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Install FFmpeg
|
### Pretrained Models
|
||||||
|
|
||||||
##### Conda Users
|
**If `install.sh` runs successfully, you may skip steps 1, 2, and 3**
|
||||||
|
|
||||||
```bash
|
**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
|
||||||
conda activate GPTSoVits
|
|
||||||
conda install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Ubuntu/Debian Users
|
1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
|
||||||
|
|
||||||
```bash
|
2. Download G2PW models from [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. (Chinese TTS Only)
|
||||||
sudo apt install ffmpeg
|
|
||||||
sudo apt install libsox-dev
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Windows Users
|
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `gsv_tools/uvr5/uvr5_weights`.
|
||||||
|
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `gsv_tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
|
||||||
|
|
||||||
Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root
|
- It is recommended to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer` or `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair; `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
|
||||||
|
|
||||||
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe)
|
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `gsv_tools/asr/models`.
|
||||||
|
|
||||||
##### MacOS Users
|
5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `gsv_tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have a similar effect with a smaller disk footprint.
|
||||||
|
|
||||||
```bash
|
<div align="center">
|
||||||
brew install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running GPT-SoVITS with Docker
|
## Running GPT-SoVITS with Docker
|
||||||
|
|
||||||
#### Docker Image Selection
|
</div>
|
||||||
|
|
||||||
|
### Docker Image Selection
|
||||||
|
|
||||||
Due to rapid development in the codebase and a slower Docker image release cycle, please:
|
Due to rapid development in the codebase and a slower Docker image release cycle, please:
|
||||||
|
|
||||||
@ -153,15 +159,15 @@ Due to rapid development in the codebase and a slower Docker image release cycle
|
|||||||
- Docker Compose will mount **all files** in the current directory. Please switch to the project root directory and **pull the latest code** before using the Docker image
|
- Docker Compose will mount **all files** in the current directory. Please switch to the project root directory and **pull the latest code** before using the Docker image
|
||||||
- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
|
- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
|
||||||
|
|
||||||
#### Environment Variables
|
### Environment Variables
|
||||||
|
|
||||||
- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage.
|
- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage.
|
||||||
|
|
||||||
#### Shared Memory Configuration
|
### Shared Memory Configuration
|
||||||
|
|
||||||
On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory.
|
On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory.
|
||||||
|
|
||||||
#### Choosing a Service
|
### Choosing a Service
|
||||||
|
|
||||||
The `docker-compose.yaml` defines two services:
|
The `docker-compose.yaml` defines two services:
|
||||||
|
|
||||||
@ -174,7 +180,7 @@ To run a specific service with Docker Compose, use:
|
|||||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Building the Docker Image Locally
|
### Building the Docker Image Locally
|
||||||
|
|
||||||
If you want to build the image yourself, use:
|
If you want to build the image yourself, use:
|
||||||
|
|
||||||
@ -182,7 +188,7 @@ If you want to build the image yourself, use:
|
|||||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Accessing the Running Container (Bash Shell)
|
### Accessing the Running Container (Bash Shell)
|
||||||
|
|
||||||
Once the container is running in the background, you can access it using:
|
Once the container is running in the background, you can access it using:
|
||||||
|
|
||||||
@ -190,31 +196,15 @@ Once the container is running in the background, you can access it using:
|
|||||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||||
```
|
```
|
||||||
|
|
||||||
## Pretrained Models
|
<div align="center">
|
||||||
|
|
||||||
**If `install.sh` runs successfully, you may skip No.1,2,3**
|
|
||||||
|
|
||||||
**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
|
|
||||||
|
|
||||||
1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
|
|
||||||
|
|
||||||
2. Download G2PW models from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only)
|
|
||||||
|
|
||||||
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
|
|
||||||
|
|
||||||
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
|
|
||||||
|
|
||||||
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
|
|
||||||
|
|
||||||
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
|
|
||||||
|
|
||||||
5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint.
|
|
||||||
|
|
||||||
## Dataset Format
|
## Dataset Format
|
||||||
|
|
||||||
The TTS annotation .list file format:
|
</div>
|
||||||
|
|
||||||
```
|
The TTS annotation `.list` file format:
|
||||||
|
|
||||||
|
```text
|
||||||
|
|
||||||
vocal_path|speaker_name|language|text
|
vocal_path|speaker_name|language|text
|
||||||
|
|
||||||
@ -230,20 +220,23 @@ Language dictionary:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## Finetune and inference
|
## Finetune and inference
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Open WebUI
|
### Open WebUI
|
||||||
|
|
||||||
#### Integrated Package Users
|
#### Integrated Package Users
|
||||||
|
|
||||||
Double-click `go-webui.bat` or use `go-webui.ps1`
|
Double-click `go-webui.bat` or use `go-webui.ps1`
|
||||||
if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-v1.ps1`
|
|
||||||
|
|
||||||
#### Others
|
#### Others
|
||||||
|
|
||||||
@ -251,21 +244,13 @@ if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-
|
|||||||
python webui.py <language(optional)>
|
python webui.py <language(optional)>
|
||||||
```
|
```
|
||||||
|
|
||||||
if you want to switch to V1,then
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python webui.py v1 <language(optional)>
|
|
||||||
```
|
|
||||||
|
|
||||||
Or maunally switch version in WebUI
|
|
||||||
|
|
||||||
### Finetune
|
### Finetune
|
||||||
|
|
||||||
#### Path Auto-filling is now supported
|
#### Path Auto-filling is now supported
|
||||||
|
|
||||||
1. Fill in the audio path
|
1. Fill in the audio path
|
||||||
2. Slice the audio into small chunks
|
2. Slice the audio into small chunks
|
||||||
3. Denoise(optinal)
|
3. Denoise (optional)
|
||||||
4. ASR
|
4. ASR
|
||||||
5. Proofreading ASR transcriptions
|
5. Proofreading ASR transcriptions
|
||||||
6. Go to the next Tab, then finetune the model
|
6. Go to the next Tab, then finetune the model
|
||||||
@ -274,7 +259,7 @@ Or maunally switch version in WebUI
|
|||||||
|
|
||||||
#### Integrated Package Users
|
#### Integrated Package Users
|
||||||
|
|
||||||
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
Double-click `go-webui.bat` or use `go-webui.ps1`, then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||||
|
|
||||||
#### Others
|
#### Others
|
||||||
|
|
||||||
@ -290,8 +275,12 @@ python webui.py
|
|||||||
|
|
||||||
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## V2 Release Notes
|
## V2 Release Notes
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
New Features:
|
New Features:
|
||||||
|
|
||||||
1. Support Korean and Cantonese
|
1. Support Korean and Cantonese
|
||||||
@ -304,18 +293,12 @@ New Features:
|
|||||||
|
|
||||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
Use v2 from v1 environment:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` to update some packages
|
|
||||||
|
|
||||||
2. Clone the latest codes from github.
|
|
||||||
|
|
||||||
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`.
|
|
||||||
|
|
||||||
Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
|
|
||||||
|
|
||||||
## V3 Release Notes
|
## V3 Release Notes
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
New Features:
|
New Features:
|
||||||
|
|
||||||
1. The timbre similarity is higher, requiring less training data to approximate the target speaker (the timbre similarity is significantly improved using the base model directly without fine-tuning).
|
1. The timbre similarity is higher, requiring less training data to approximate the target speaker (the timbre similarity is significantly improved using the base model directly without fine-tuning).
|
||||||
@ -324,111 +307,59 @@ New Features:
|
|||||||
|
|
||||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
Use v3 from v2 environment:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` to update some packages
|
|
||||||
|
|
||||||
2. Clone the latest code from GitHub.
|
|
||||||
|
|
||||||
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
|
||||||
|
|
||||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
|
||||||
|
|
||||||
## V4 Release Notes
|
## V4 Release Notes
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
New Features:
|
New Features:
|
||||||
|
|
||||||
1. Version 4 fixes the issue of metallic artifacts in Version 3 caused by non-integer multiple upsampling, and natively outputs 48k audio to prevent muffled sound (whereas Version 3 only natively outputs 24k audio). The author considers Version 4 a direct replacement for Version 3, though further testing is still needed.
|
1. Version 4 fixes the issue of metallic artifacts in Version 3 caused by non-integer multiple upsampling, and natively outputs 48k audio to prevent muffled sound (whereas Version 3 only natively outputs 24k audio). The author considers Version 4 a direct replacement for Version 3, though further testing is still needed.
|
||||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
Use v4 from v1/v2/v3 environment:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` to update some packages
|
|
||||||
|
|
||||||
2. Clone the latest codes from github.
|
|
||||||
|
|
||||||
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
|
||||||
|
|
||||||
## V2Pro Release Notes
|
## V2Pro Release Notes
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
New Features:
|
New Features:
|
||||||
|
|
||||||
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
|
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
|
||||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
2.v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
|
2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
|
||||||
|
|
||||||
Use v2Pro from v1/v2/v3/v4 environment:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` to update some packages
|
|
||||||
|
|
||||||
2. Clone the latest codes from github.
|
|
||||||
|
|
||||||
3. Download v2Pro pretrained models (v2Pro/s2Dv2Pro.pth, v2Pro/s2Gv2Pro.pth, v2Pro/s2Dv2ProPlus.pth, v2Pro/s2Gv2ProPlus.pth, and sv/pretrained_eres2netv2w24s4ep4.ckpt) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
|
||||||
|
|
||||||
## Todo List
|
## Todo List
|
||||||
|
|
||||||
- [x] **High Priority:**
|
</div>
|
||||||
|
|
||||||
- [x] Localization in Japanese and English.
|
- [x] **High Priority:**
|
||||||
- [x] User guide.
|
- [x] Localization in Japanese and English
|
||||||
- [x] Japanese and English dataset fine tune training.
|
- [x] User guide
|
||||||
|
- [x] Japanese and English dataset fine tune training
|
||||||
|
|
||||||
- [ ] **Features:**
|
- [ ] **Features:**
|
||||||
- [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
|
- [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min)
|
||||||
- [x] TTS speaking speed control.
|
- [x] TTS speaking speed control
|
||||||
- [ ] ~~Enhanced TTS emotion control.~~ Maybe use pretrained finetuned preset GPT models for better emotion.
|
- [ ] ~~Enhanced TTS emotion control~~ Maybe use pretrained finetuned preset GPT models for better emotion
|
||||||
- [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent).
|
- [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent)
|
||||||
- [x] Improve English and Japanese text frontend.
|
- [x] Improve English and Japanese text frontend
|
||||||
- [ ] Develop tiny and larger-sized TTS models.
|
- [ ] Develop tiny and larger-sized TTS models
|
||||||
- [x] Colab scripts.
|
- [x] Colab scripts
|
||||||
- [x] Try expanding the training dataset (2k hours -> 10k hours).
|
- [x] Try expanding the training dataset (2k hours -> 10k hours)
|
||||||
- [x] better sovits base model (enhanced audio quality)
|
- [x] better sovits base model (enhanced audio quality)
|
||||||
- [ ] model mix
|
- [ ] model mix
|
||||||
|
|
||||||
## (Additional) Method for running from the command line
|
<div align="center">
|
||||||
|
|
||||||
Use the command line to open the WebUI for UVR5
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- If you can't open a browser, follow the format below for UVR processing. This uses mdxnet for audio processing
|
|
||||||
```
|
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
|
||||||
``` -->
|
|
||||||
|
|
||||||
This is how the audio segmentation of the dataset is done using the command line
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python audio_slicer.py \
|
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
|
||||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
|
||||||
--threshold <volume_threshold> \
|
|
||||||
--min_length <minimum_duration_of_each_subclip> \
|
|
||||||
--min_interval <shortest_time_gap_between_adjacent_subclips> \
|
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
|
||||||
```
|
|
||||||
|
|
||||||
This is how dataset ASR processing is done using the command line (Chinese only)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
|
||||||
```
|
|
||||||
|
|
||||||
ASR processing is performed through Faster_Whisper (ASR labeling for languages other than Chinese)
|
|
||||||
|
|
||||||
(No progress bars, GPU performance may cause time delays)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
|
||||||
```
|
|
||||||
|
|
||||||
A custom list save path is enabled
|
|
||||||
|
|
||||||
## Credits
|
## Credits
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
Special thanks to the following projects and contributors:
|
Special thanks to the following projects and contributors:
|
||||||
|
|
||||||
### Theoretical Research
|
### Theoretical Research
|
||||||
@ -439,7 +370,7 @@ Special thanks to the following projects and contributors:
|
|||||||
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
||||||
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
||||||
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
@ -471,8 +402,12 @@ Special thanks to the following projects and contributors:
|
|||||||
|
|
||||||
Thanks to @Naozumi520 for providing the Cantonese training set and for the guidance on Cantonese-related knowledge.
|
Thanks to @Naozumi520 for providing the Cantonese training set and for the guidance on Cantonese-related knowledge.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## Thanks to all contributors for their efforts
|
## Thanks to all contributors for their efforts
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
||||||
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
|
||||||
</a>
|
</a>
|
||||||
|
|||||||
163
api.py
163
api.py
@ -145,33 +145,36 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
sys.path.append(f"{now_dir}/GPT_SoVITS")
|
||||||
|
|
||||||
|
import logging
|
||||||
import signal
|
import signal
|
||||||
from text.LangSegmenter import LangSegmenter
|
import subprocess
|
||||||
|
from io import BytesIO
|
||||||
from time import time as ttime
|
from time import time as ttime
|
||||||
|
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
import librosa
|
|
||||||
import soundfile as sf
|
|
||||||
from fastapi import FastAPI, Request, Query
|
|
||||||
from fastapi.responses import StreamingResponse, JSONResponse
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||||
import numpy as np
|
from fastapi import FastAPI, Query, Request
|
||||||
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
from feature_extractor import cnhubert
|
from feature_extractor import cnhubert
|
||||||
from io import BytesIO
|
from module.mel_processing import spectrogram_torch
|
||||||
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
|
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
|
||||||
from peft import LoraConfig, get_peft_model
|
from peft import LoraConfig, get_peft_model
|
||||||
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
|
||||||
from text import cleaned_text_to_sequence
|
from text import cleaned_text_to_sequence
|
||||||
from text.cleaner import clean_text
|
from text.cleaner import clean_text
|
||||||
from module.mel_processing import spectrogram_torch
|
from text.LangSegmenter import LangSegmenter
|
||||||
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||||
|
|
||||||
import config as global_config
|
import config as global_config
|
||||||
import logging
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
|
|
||||||
class DefaultRefer:
|
class DefaultRefer:
|
||||||
@ -239,14 +242,14 @@ def init_bigvgan():
|
|||||||
from BigVGAN import bigvgan
|
from BigVGAN import bigvgan
|
||||||
|
|
||||||
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
||||||
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
|
f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
|
||||||
use_cuda_kernel=False,
|
use_cuda_kernel=False,
|
||||||
) # if True, RuntimeError: Ninja is required to load C++ extensions
|
) # if True, RuntimeError: Ninja is required to load C++ extensions
|
||||||
# remove weight norm in the model and set to eval mode
|
# remove weight norm in the model and set to eval mode
|
||||||
bigvgan_model.remove_weight_norm()
|
bigvgan_model.remove_weight_norm()
|
||||||
bigvgan_model = bigvgan_model.eval()
|
bigvgan_model = bigvgan_model.eval()
|
||||||
|
|
||||||
if is_half == True:
|
if is_half:
|
||||||
bigvgan_model = bigvgan_model.half().to(device)
|
bigvgan_model = bigvgan_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
bigvgan_model = bigvgan_model.to(device)
|
bigvgan_model = bigvgan_model.to(device)
|
||||||
@ -268,12 +271,12 @@ def init_hifigan():
|
|||||||
hifigan_model.eval()
|
hifigan_model.eval()
|
||||||
hifigan_model.remove_weight_norm()
|
hifigan_model.remove_weight_norm()
|
||||||
state_dict_g = torch.load(
|
state_dict_g = torch.load(
|
||||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
|
f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
|
||||||
map_location="cpu",
|
map_location="cpu",
|
||||||
weights_only=False,
|
weights_only=False,
|
||||||
)
|
)
|
||||||
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
||||||
if is_half == True:
|
if is_half:
|
||||||
hifigan_model = hifigan_model.half().to(device)
|
hifigan_model = hifigan_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
hifigan_model = hifigan_model.to(device)
|
hifigan_model = hifigan_model.to(device)
|
||||||
@ -292,7 +295,7 @@ resample_transform_dict = {}
|
|||||||
|
|
||||||
def resample(audio_tensor, sr0, sr1, device):
|
def resample(audio_tensor, sr0, sr1, device):
|
||||||
global resample_transform_dict
|
global resample_transform_dict
|
||||||
key = "%s-%s-%s" % (sr0, sr1, str(device))
|
key = f"{sr0}-{sr1}-{str(device)}"
|
||||||
if key not in resample_transform_dict:
|
if key not in resample_transform_dict:
|
||||||
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||||
return resample_transform_dict[key](audio_tensor)
|
return resample_transform_dict[key](audio_tensor)
|
||||||
@ -300,6 +303,7 @@ def resample(audio_tensor, sr0, sr1, device):
|
|||||||
|
|
||||||
from module.mel_processing import mel_spectrogram_torch
|
from module.mel_processing import mel_spectrogram_torch
|
||||||
|
|
||||||
|
|
||||||
spec_min = -12
|
spec_min = -12
|
||||||
spec_max = 2
|
spec_max = 2
|
||||||
|
|
||||||
@ -312,32 +316,36 @@ def denorm_spec(x):
|
|||||||
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
|
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
|
||||||
|
|
||||||
|
|
||||||
mel_fn = lambda x: mel_spectrogram_torch(
|
def mel_fn(x):
|
||||||
x,
|
return mel_spectrogram_torch(
|
||||||
**{
|
x,
|
||||||
"n_fft": 1024,
|
**{
|
||||||
"win_size": 1024,
|
"n_fft": 1024,
|
||||||
"hop_size": 256,
|
"win_size": 1024,
|
||||||
"num_mels": 100,
|
"hop_size": 256,
|
||||||
"sampling_rate": 24000,
|
"num_mels": 100,
|
||||||
"fmin": 0,
|
"sampling_rate": 24000,
|
||||||
"fmax": None,
|
"fmin": 0,
|
||||||
"center": False,
|
"fmax": None,
|
||||||
},
|
"center": False,
|
||||||
)
|
},
|
||||||
mel_fn_v4 = lambda x: mel_spectrogram_torch(
|
)
|
||||||
x,
|
|
||||||
**{
|
|
||||||
"n_fft": 1280,
|
def mel_fn_v4(x):
|
||||||
"win_size": 1280,
|
return mel_spectrogram_torch(
|
||||||
"hop_size": 320,
|
x,
|
||||||
"num_mels": 100,
|
**{
|
||||||
"sampling_rate": 32000,
|
"n_fft": 1280,
|
||||||
"fmin": 0,
|
"win_size": 1280,
|
||||||
"fmax": None,
|
"hop_size": 320,
|
||||||
"center": False,
|
"num_mels": 100,
|
||||||
},
|
"sampling_rate": 32000,
|
||||||
)
|
"fmin": 0,
|
||||||
|
"fmax": None,
|
||||||
|
"center": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
sr_model = None
|
sr_model = None
|
||||||
@ -345,8 +353,8 @@ sr_model = None
|
|||||||
|
|
||||||
def audio_sr(audio, sr):
|
def audio_sr(audio, sr):
|
||||||
global sr_model
|
global sr_model
|
||||||
if sr_model == None:
|
if sr_model is None:
|
||||||
from tools.audio_sr import AP_BWE
|
from gsv_tools.audio_sr import AP_BWE
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sr_model = AP_BWE(device, DictToAttrRecursive)
|
sr_model = AP_BWE(device, DictToAttrRecursive)
|
||||||
@ -390,8 +398,8 @@ def get_sovits_weights(sovits_path):
|
|||||||
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
||||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||||
|
|
||||||
if if_lora_v3 == True and is_exist == False:
|
if if_lora_v3 and not is_exist:
|
||||||
logger.info("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
logger.info(f"SoVITS {model_version} 底模缺失,无法加载相应 LoRA 权重")
|
||||||
|
|
||||||
dict_s2 = load_sovits_new(sovits_path)
|
dict_s2 = load_sovits_new(sovits_path)
|
||||||
hps = dict_s2["config"]
|
hps = dict_s2["config"]
|
||||||
@ -408,7 +416,7 @@ def get_sovits_weights(sovits_path):
|
|||||||
if model_version not in {"v3", "v4"}:
|
if model_version not in {"v3", "v4"}:
|
||||||
if "Pro" in model_version:
|
if "Pro" in model_version:
|
||||||
hps.model.version = model_version
|
hps.model.version = model_version
|
||||||
if sv_cn_model == None:
|
if sv_cn_model is None:
|
||||||
init_sv_cn()
|
init_sv_cn()
|
||||||
|
|
||||||
vq_model = SynthesizerTrn(
|
vq_model = SynthesizerTrn(
|
||||||
@ -437,12 +445,12 @@ def get_sovits_weights(sovits_path):
|
|||||||
del vq_model.enc_q
|
del vq_model.enc_q
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
if is_half == True:
|
if is_half:
|
||||||
vq_model = vq_model.half().to(device)
|
vq_model = vq_model.half().to(device)
|
||||||
else:
|
else:
|
||||||
vq_model = vq_model.to(device)
|
vq_model = vq_model.to(device)
|
||||||
vq_model.eval()
|
vq_model.eval()
|
||||||
if if_lora_v3 == False:
|
if not if_lora_v3:
|
||||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||||
else:
|
else:
|
||||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||||
@ -480,7 +488,7 @@ def get_gpt_weights(gpt_path):
|
|||||||
max_sec = config["data"]["max_sec"]
|
max_sec = config["data"]["max_sec"]
|
||||||
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
||||||
t2s_model.load_state_dict(dict_s1["weight"])
|
t2s_model.load_state_dict(dict_s1["weight"])
|
||||||
if is_half == True:
|
if is_half:
|
||||||
t2s_model = t2s_model.half()
|
t2s_model = t2s_model.half()
|
||||||
t2s_model = t2s_model.to(device)
|
t2s_model = t2s_model.to(device)
|
||||||
t2s_model.eval()
|
t2s_model.eval()
|
||||||
@ -533,35 +541,32 @@ def get_bert_inf(phones, word2ph, norm_text, language):
|
|||||||
else:
|
else:
|
||||||
bert = torch.zeros(
|
bert = torch.zeros(
|
||||||
(1024, len(phones)),
|
(1024, len(phones)),
|
||||||
dtype=torch.float16 if is_half == True else torch.float32,
|
dtype=torch.float16 if is_half else torch.float32,
|
||||||
).to(device)
|
).to(device)
|
||||||
|
|
||||||
return bert
|
return bert
|
||||||
|
|
||||||
|
|
||||||
from text import chinese
|
|
||||||
|
|
||||||
|
|
||||||
def get_phones_and_bert(text, language, version, final=False):
|
def get_phones_and_bert(text, language, version, final=False):
|
||||||
text = re.sub(r' {2,}', ' ', text)
|
text = re.sub(r" {2,}", " ", text)
|
||||||
textlist = []
|
textlist = []
|
||||||
langlist = []
|
langlist = []
|
||||||
if language == "all_zh":
|
if language == "all_zh":
|
||||||
for tmp in LangSegmenter.getTexts(text,"zh"):
|
for tmp in LangSegmenter.getTexts(text, "zh"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_yue":
|
elif language == "all_yue":
|
||||||
for tmp in LangSegmenter.getTexts(text,"zh"):
|
for tmp in LangSegmenter.getTexts(text, "zh"):
|
||||||
if tmp["lang"] == "zh":
|
if tmp["lang"] == "zh":
|
||||||
tmp["lang"] = "yue"
|
tmp["lang"] = "yue"
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_ja":
|
elif language == "all_ja":
|
||||||
for tmp in LangSegmenter.getTexts(text,"ja"):
|
for tmp in LangSegmenter.getTexts(text, "ja"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "all_ko":
|
elif language == "all_ko":
|
||||||
for tmp in LangSegmenter.getTexts(text,"ko"):
|
for tmp in LangSegmenter.getTexts(text, "ko"):
|
||||||
langlist.append(tmp["lang"])
|
langlist.append(tmp["lang"])
|
||||||
textlist.append(tmp["text"])
|
textlist.append(tmp["text"])
|
||||||
elif language == "en":
|
elif language == "en":
|
||||||
@ -606,7 +611,7 @@ def get_phones_and_bert(text, language, version, final=False):
|
|||||||
if not final and len(phones) < 6:
|
if not final and len(phones) < 6:
|
||||||
return get_phones_and_bert("." + text, language, version, final=True)
|
return get_phones_and_bert("." + text, language, version, final=True)
|
||||||
|
|
||||||
return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text
|
return phones, bert.to(torch.float16 if is_half else torch.float32), norm_text
|
||||||
|
|
||||||
|
|
||||||
class DictToAttrRecursive(dict):
|
class DictToAttrRecursive(dict):
|
||||||
@ -627,7 +632,7 @@ class DictToAttrRecursive(dict):
|
|||||||
def __setattr__(self, key, value):
|
def __setattr__(self, key, value):
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
value = DictToAttrRecursive(value)
|
value = DictToAttrRecursive(value)
|
||||||
super(DictToAttrRecursive, self).__setitem__(key, value)
|
super().__setitem__(key, value)
|
||||||
super().__setattr__(key, value)
|
super().__setattr__(key, value)
|
||||||
|
|
||||||
def __delattr__(self, item):
|
def __delattr__(self, item):
|
||||||
@ -662,7 +667,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
|||||||
center=False,
|
center=False,
|
||||||
)
|
)
|
||||||
spec = spec.to(dtype)
|
spec = spec.to(dtype)
|
||||||
if is_v2pro == True:
|
if is_v2pro:
|
||||||
audio = resample(audio, sr1, 16000, device).to(dtype)
|
audio = resample(audio, sr1, 16000, device).to(dtype)
|
||||||
return spec, audio
|
return spec, audio
|
||||||
|
|
||||||
@ -715,11 +720,11 @@ def pack_ogg(audio_bytes, data, rate):
|
|||||||
pack_ogg_thread.join()
|
pack_ogg_thread.join()
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
# If changing the thread stack size is unsupported, a RuntimeError is raised.
|
# If changing the thread stack size is unsupported, a RuntimeError is raised.
|
||||||
print("RuntimeError: {}".format(e))
|
print(f"RuntimeError: {e}")
|
||||||
print("Changing the thread stack size is unsupported.")
|
print("Changing the thread stack size is unsupported.")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
# If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
|
# If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
|
||||||
print("ValueError: {}".format(e))
|
print(f"ValueError: {e}")
|
||||||
print("The specified stack size is invalid.")
|
print("The specified stack size is invalid.")
|
||||||
|
|
||||||
return audio_bytes
|
return audio_bytes
|
||||||
@ -794,7 +799,7 @@ def cut_text(text, punc):
|
|||||||
punds = r"[" + "".join(punc_list) + r"]"
|
punds = r"[" + "".join(punc_list) + r"]"
|
||||||
text = text.strip("\n")
|
text = text.strip("\n")
|
||||||
items = re.split(f"({punds})", text)
|
items = re.split(f"({punds})", text)
|
||||||
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
|
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2], strict=False)]
|
||||||
# 在句子不存在符号或句尾无符号的时候保证文本完整
|
# 在句子不存在符号或句尾无符号的时候保证文本完整
|
||||||
if len(items) % 2 == 1:
|
if len(items) % 2 == 1:
|
||||||
mergeitems.append(items[-1])
|
mergeitems.append(items[-1])
|
||||||
@ -861,18 +866,18 @@ def get_tts_wav(
|
|||||||
if if_sr and version != "v3":
|
if if_sr and version != "v3":
|
||||||
if_sr = False
|
if_sr = False
|
||||||
|
|
||||||
t0 = ttime()
|
ttime()
|
||||||
prompt_text = prompt_text.strip("\n")
|
prompt_text = prompt_text.strip("\n")
|
||||||
if prompt_text[-1] not in splits:
|
if prompt_text[-1] not in splits:
|
||||||
prompt_text += "。" if prompt_language != "en" else "."
|
prompt_text += "。" if prompt_language != "en" else "."
|
||||||
prompt_language, text = prompt_language, text.strip("\n")
|
prompt_language, text = prompt_language, text.strip("\n")
|
||||||
dtype = torch.float16 if is_half == True else torch.float32
|
dtype = torch.float16 if is_half else torch.float32
|
||||||
zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
|
zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half else np.float32)
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
wav16k, sr = librosa.load(ref_wav_path, sr=16000)
|
wav16k, sr = librosa.load(ref_wav_path, sr=16000)
|
||||||
wav16k = torch.from_numpy(wav16k)
|
wav16k = torch.from_numpy(wav16k)
|
||||||
zero_wav_torch = torch.from_numpy(zero_wav)
|
zero_wav_torch = torch.from_numpy(zero_wav)
|
||||||
if is_half == True:
|
if is_half:
|
||||||
wav16k = wav16k.half().to(device)
|
wav16k = wav16k.half().to(device)
|
||||||
zero_wav_torch = zero_wav_torch.half().to(device)
|
zero_wav_torch = zero_wav_torch.half().to(device)
|
||||||
else:
|
else:
|
||||||
@ -889,7 +894,7 @@ def get_tts_wav(
|
|||||||
refers = []
|
refers = []
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
sv_emb = []
|
sv_emb = []
|
||||||
if sv_cn_model == None:
|
if sv_cn_model is None:
|
||||||
init_sv_cn()
|
init_sv_cn()
|
||||||
if inp_refs:
|
if inp_refs:
|
||||||
for path in inp_refs:
|
for path in inp_refs:
|
||||||
@ -908,7 +913,7 @@ def get_tts_wav(
|
|||||||
else:
|
else:
|
||||||
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
|
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
|
||||||
|
|
||||||
t1 = ttime()
|
ttime()
|
||||||
# os.environ['version'] = version
|
# os.environ['version'] = version
|
||||||
prompt_language = dict_language[prompt_language.lower()]
|
prompt_language = dict_language[prompt_language.lower()]
|
||||||
text_language = dict_language[text_language.lower()]
|
text_language = dict_language[text_language.lower()]
|
||||||
@ -930,7 +935,7 @@ def get_tts_wav(
|
|||||||
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
|
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
|
||||||
bert = bert.to(device).unsqueeze(0)
|
bert = bert.to(device).unsqueeze(0)
|
||||||
all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
|
all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
|
||||||
t2 = ttime()
|
ttime()
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
pred_semantic, idx = t2s_model.model.infer_panel(
|
pred_semantic, idx = t2s_model.model.infer_panel(
|
||||||
all_phoneme_ids,
|
all_phoneme_ids,
|
||||||
@ -944,7 +949,7 @@ def get_tts_wav(
|
|||||||
early_stop_num=hz * max_sec,
|
early_stop_num=hz * max_sec,
|
||||||
)
|
)
|
||||||
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
|
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
|
||||||
t3 = ttime()
|
ttime()
|
||||||
|
|
||||||
if version not in {"v3", "v4"}:
|
if version not in {"v3", "v4"}:
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
@ -1014,10 +1019,10 @@ def get_tts_wav(
|
|||||||
cfm_res = torch.cat(cfm_resss, 2)
|
cfm_res = torch.cat(cfm_resss, 2)
|
||||||
cfm_res = denorm_spec(cfm_res)
|
cfm_res = denorm_spec(cfm_res)
|
||||||
if version == "v3":
|
if version == "v3":
|
||||||
if bigvgan_model == None:
|
if bigvgan_model is None:
|
||||||
init_bigvgan()
|
init_bigvgan()
|
||||||
else: # v4
|
else: # v4
|
||||||
if hifigan_model == None:
|
if hifigan_model is None:
|
||||||
init_hifigan()
|
init_hifigan()
|
||||||
vocoder_model = bigvgan_model if version == "v3" else hifigan_model
|
vocoder_model = bigvgan_model if version == "v3" else hifigan_model
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
@ -1030,7 +1035,7 @@ def get_tts_wav(
|
|||||||
audio_opt.append(audio)
|
audio_opt.append(audio)
|
||||||
audio_opt.append(zero_wav)
|
audio_opt.append(zero_wav)
|
||||||
audio_opt = np.concatenate(audio_opt, 0)
|
audio_opt = np.concatenate(audio_opt, 0)
|
||||||
t4 = ttime()
|
ttime()
|
||||||
|
|
||||||
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
|
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
|
||||||
sr = 32000
|
sr = 32000
|
||||||
@ -1128,7 +1133,7 @@ def handle(
|
|||||||
if not default_refer.is_ready():
|
if not default_refer.is_ready():
|
||||||
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
|
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
|
||||||
|
|
||||||
if cut_punc == None:
|
if cut_punc is None:
|
||||||
text = cut_text(text, default_cut_punc)
|
text = cut_text(text, default_cut_punc)
|
||||||
else:
|
else:
|
||||||
text = cut_text(text, cut_punc)
|
text = cut_text(text, cut_punc)
|
||||||
|
|||||||
44
api_v2.py
44
api_v2.py
@ -104,27 +104,31 @@ RESP:
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
from typing import Generator, Union
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
sys.path.append(f"{now_dir}/GPT_SoVITS")
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import subprocess
|
|
||||||
import wave
|
|
||||||
import signal
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import threading
|
||||||
|
import wave
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
from fastapi import FastAPI, Response
|
|
||||||
from fastapi.responses import StreamingResponse, JSONResponse
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from io import BytesIO
|
from fastapi import FastAPI, Response
|
||||||
from tools.i18n.i18n import I18nAuto
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
|
|
||||||
from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
import threading
|
|
||||||
|
from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
|
||||||
|
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
|
||||||
|
from gsv_tools.i18n.i18n import I18nAuto
|
||||||
|
|
||||||
|
|
||||||
# print(sys.path)
|
# print(sys.path)
|
||||||
i18n = I18nAuto()
|
i18n = I18nAuto()
|
||||||
@ -169,7 +173,7 @@ class TTS_Request(BaseModel):
|
|||||||
fragment_interval: float = 0.3
|
fragment_interval: float = 0.3
|
||||||
seed: int = -1
|
seed: int = -1
|
||||||
media_type: str = "wav"
|
media_type: str = "wav"
|
||||||
streaming_mode: Union[bool, int] = False
|
streaming_mode: bool | int = False
|
||||||
parallel_infer: bool = True
|
parallel_infer: bool = True
|
||||||
repetition_penalty: float = 1.35
|
repetition_penalty: float = 1.35
|
||||||
sample_steps: int = 32
|
sample_steps: int = 32
|
||||||
@ -199,8 +203,6 @@ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
|
|||||||
with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
|
with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
|
||||||
audio_file.write(data)
|
audio_file.write(data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# See: https://docs.python.org/3/library/threading.html
|
# See: https://docs.python.org/3/library/threading.html
|
||||||
# The stack size of this thread is at least 32768
|
# The stack size of this thread is at least 32768
|
||||||
# If stack overflow error still occurs, just modify the `stack_size`.
|
# If stack overflow error still occurs, just modify the `stack_size`.
|
||||||
@ -214,11 +216,11 @@ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
|
|||||||
pack_ogg_thread.join()
|
pack_ogg_thread.join()
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
# If changing the thread stack size is unsupported, a RuntimeError is raised.
|
# If changing the thread stack size is unsupported, a RuntimeError is raised.
|
||||||
print("RuntimeError: {}".format(e))
|
print(f"RuntimeError: {e}")
|
||||||
print("Changing the thread stack size is unsupported.")
|
print("Changing the thread stack size is unsupported.")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
# If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
|
# If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
|
||||||
print("ValueError: {}".format(e))
|
print(f"ValueError: {e}")
|
||||||
print("The specified stack size is invalid.")
|
print("The specified stack size is invalid.")
|
||||||
|
|
||||||
return io_buffer
|
return io_buffer
|
||||||
@ -306,7 +308,7 @@ def check_params(req: dict):
|
|||||||
text: str = req.get("text", "")
|
text: str = req.get("text", "")
|
||||||
text_lang: str = req.get("text_lang", "")
|
text_lang: str = req.get("text_lang", "")
|
||||||
ref_audio_path: str = req.get("ref_audio_path", "")
|
ref_audio_path: str = req.get("ref_audio_path", "")
|
||||||
streaming_mode: bool = req.get("streaming_mode", False)
|
req.get("streaming_mode", False)
|
||||||
media_type: str = req.get("media_type", "wav")
|
media_type: str = req.get("media_type", "wav")
|
||||||
prompt_lang: str = req.get("prompt_lang", "")
|
prompt_lang: str = req.get("prompt_lang", "")
|
||||||
text_split_method: str = req.get("text_split_method", "cut5")
|
text_split_method: str = req.get("text_split_method", "cut5")
|
||||||
@ -403,7 +405,10 @@ async def tts_handle(req: dict):
|
|||||||
fixed_length_chunk = True
|
fixed_length_chunk = True
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return JSONResponse(status_code=400, content={"message": f"the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"})
|
return JSONResponse(
|
||||||
|
status_code=400,
|
||||||
|
content={"message": "the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"},
|
||||||
|
)
|
||||||
|
|
||||||
req["streaming_mode"] = streaming_mode
|
req["streaming_mode"] = streaming_mode
|
||||||
req["return_fragment"] = return_fragment
|
req["return_fragment"] = return_fragment
|
||||||
@ -413,7 +418,6 @@ async def tts_handle(req: dict):
|
|||||||
|
|
||||||
streaming_mode = streaming_mode or return_fragment
|
streaming_mode = streaming_mode or return_fragment
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tts_generator = tts_pipeline.run(req)
|
tts_generator = tts_pipeline.run(req)
|
||||||
|
|
||||||
@ -475,7 +479,7 @@ async def tts_get_endpoint(
|
|||||||
repetition_penalty: float = 1.35,
|
repetition_penalty: float = 1.35,
|
||||||
sample_steps: int = 32,
|
sample_steps: int = 32,
|
||||||
super_sampling: bool = False,
|
super_sampling: bool = False,
|
||||||
streaming_mode: Union[bool, int] = False,
|
streaming_mode: bool | int = False,
|
||||||
overlap_length: int = 2,
|
overlap_length: int = 2,
|
||||||
min_chunk_length: int = 16,
|
min_chunk_length: int = 16,
|
||||||
):
|
):
|
||||||
|
|||||||
11
config.py
11
config.py
@ -4,7 +4,8 @@ import sys
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tools.i18n.i18n import I18nAuto
|
from gsv_tools.i18n.i18n import I18nAuto
|
||||||
|
|
||||||
|
|
||||||
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
|
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
|
||||||
|
|
||||||
@ -77,7 +78,7 @@ GPT_weight_version2root = {
|
|||||||
|
|
||||||
def custom_sort_key(s):
|
def custom_sort_key(s):
|
||||||
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
||||||
parts = re.split("(\d+)", s)
|
parts = re.split(r"(\d+)", s)
|
||||||
# 将数字部分转换为整数,非数字部分保持不变
|
# 将数字部分转换为整数,非数字部分保持不变
|
||||||
parts = [int(part) if part.isdigit() else part for part in parts]
|
parts = [int(part) if part.isdigit() else part for part in parts]
|
||||||
return parts
|
return parts
|
||||||
@ -93,7 +94,7 @@ def get_weights_names():
|
|||||||
continue
|
continue
|
||||||
for name in os.listdir(path):
|
for name in os.listdir(path):
|
||||||
if name.endswith(".pth"):
|
if name.endswith(".pth"):
|
||||||
SoVITS_names.append("%s/%s" % (path, name))
|
SoVITS_names.append(f"{path}/{name}")
|
||||||
if not SoVITS_names:
|
if not SoVITS_names:
|
||||||
SoVITS_names = [""]
|
SoVITS_names = [""]
|
||||||
GPT_names = []
|
GPT_names = []
|
||||||
@ -105,7 +106,7 @@ def get_weights_names():
|
|||||||
continue
|
continue
|
||||||
for name in os.listdir(path):
|
for name in os.listdir(path):
|
||||||
if name.endswith(".ckpt"):
|
if name.endswith(".ckpt"):
|
||||||
GPT_names.append("%s/%s" % (path, name))
|
GPT_names.append(f"{path}/{name}")
|
||||||
SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
|
SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
|
||||||
GPT_names = sorted(GPT_names, key=custom_sort_key)
|
GPT_names = sorted(GPT_names, key=custom_sort_key)
|
||||||
if not GPT_names:
|
if not GPT_names:
|
||||||
@ -161,7 +162,7 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
|
|||||||
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
|
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
|
||||||
if mem_gb < 4 or sm_version < 5.3:
|
if mem_gb < 4 or sm_version < 5.3:
|
||||||
return cpu, torch.float32, 0.0, 0.0
|
return cpu, torch.float32, 0.0, 0.0
|
||||||
if sm_version == 6.1 or is_16_series == True:
|
if sm_version == 6.1 or is_16_series:
|
||||||
return cuda, torch.float32, sm_version, mem_gb
|
return cuda, torch.float32, sm_version, mem_gb
|
||||||
if sm_version > 6.1:
|
if sm_version > 6.1:
|
||||||
return cuda, torch.float16, sm_version, mem_gb
|
return cuda, torch.float16, sm_version, mem_gb
|
||||||
|
|||||||
@ -30,8 +30,8 @@ services:
|
|||||||
- "9880:9880"
|
- "9880:9880"
|
||||||
volumes:
|
volumes:
|
||||||
- .:/workspace/GPT-SoVITS
|
- .:/workspace/GPT-SoVITS
|
||||||
- tools/asr/models:/workspace/models/asr_models
|
- gsv_tools/asr/models:/workspace/models/asr_models
|
||||||
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
- gsv_tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
||||||
environment:
|
environment:
|
||||||
- is_half=true
|
- is_half=true
|
||||||
tty: true
|
tty: true
|
||||||
@ -68,8 +68,8 @@ services:
|
|||||||
- "9880:9880"
|
- "9880:9880"
|
||||||
volumes:
|
volumes:
|
||||||
- .:/workspace/GPT-SoVITS
|
- .:/workspace/GPT-SoVITS
|
||||||
- tools/asr/models:/workspace/models/asr_models
|
- gsv_tools/asr/models:/workspace/models/asr_models
|
||||||
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
- gsv_tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
||||||
environment:
|
environment:
|
||||||
- is_half=true
|
- is_half=true
|
||||||
tty: true
|
tty: true
|
||||||
|
|||||||
@ -26,7 +26,7 @@ print_help() {
|
|||||||
echo " -h, --help Show this help message and exit"
|
echo " -h, --help Show this help message and exit"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
|
echo " bash docker_build.sh --cuda 12.6 --lite"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Show help if no arguments provided
|
# Show help if no arguments provided
|
||||||
|
|||||||
@ -409,7 +409,7 @@
|
|||||||
|
|
||||||
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
|
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
|
||||||
- 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
|
- 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
|
||||||
- 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
|
- 类型: 新功能 (特性参阅 [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
|
||||||
- 提交: RVC-Boss
|
- 提交: RVC-Boss
|
||||||
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
|
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
|
||||||
- 内容: 更新项目多语言文档.
|
- 内容: 更新项目多语言文档.
|
||||||
@ -622,5 +622,3 @@
|
|||||||
- 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
|
- 内容: 提升推理进程优先级(修复win11下可能GPU利用率受限的问题)
|
||||||
- 类型: 修复
|
- 类型: 修复
|
||||||
- 提交: XianYue0125
|
- 提交: XianYue0125
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
#
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
<h1>GPT-SoVITS-WebUI</h1>
|
<h1>GPT-SoVITS-WebUI</h1>
|
||||||
@ -19,15 +21,18 @@
|
|||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
|
|
||||||
|
|
||||||
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 功能
|
## 功能
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
|
1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
|
||||||
|
|
||||||
2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
|
2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
|
||||||
@ -44,40 +49,49 @@
|
|||||||
|
|
||||||
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 安装
|
## 安装
|
||||||
|
|
||||||
中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验.
|
中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验.
|
||||||
|
|
||||||
### 测试通过的环境
|
### 测试通过的环境
|
||||||
|
|
||||||
| Python Version | PyTorch Version | Device |
|
| Python Version | PyTorch Version | Device |
|
||||||
| -------------- | ---------------- | ------------- |
|
| -------------- | --------------- | ------------- |
|
||||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
|
||||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
|
||||||
|
| Python 3.10 | PyTorch 2.9.0 | CPU |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
|
如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
|
||||||
|
|
||||||
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
|
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
|
||||||
|
|
||||||
|
运行以下的命令来安装本项目:
|
||||||
|
|
||||||
```pwsh
|
```pwsh
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
|
pwsh -F install.ps1 --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
||||||
|
运行以下的命令来安装本项目:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### macOS
|
### macOS
|
||||||
@ -87,54 +101,51 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
|
|||||||
运行以下的命令来安装本项目:
|
运行以下的命令来安装本项目:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### 手动安装
|
### 手动安装
|
||||||
|
|
||||||
#### 安装依赖
|
运行以下的命令来安装本项目:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
|
conda install uv ffmpeg -c conda-forge
|
||||||
|
|
||||||
pip install -r extra-req.txt --no-deps
|
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
|
||||||
pip install -r requirements.txt
|
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
|
||||||
|
uv pip install ".[flash-attn]"
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 安装 FFmpeg
|
### 预训练模型
|
||||||
|
|
||||||
##### Conda 用户
|
**若成功运行`install.sh`可跳过 No.1,2,3**
|
||||||
|
|
||||||
```bash
|
**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
|
||||||
conda activate GPTSoVits
|
|
||||||
conda install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Ubuntu/Debian 用户
|
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
|
||||||
|
|
||||||
```bash
|
2. 从 [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
|
||||||
sudo apt install ffmpeg
|
|
||||||
sudo apt install libsox-dev
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Windows 用户
|
3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `gsv_tools/uvr5/uvr5_weights` 目录中.
|
||||||
|
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `gsv_tools/uvr5/uvr5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
|
||||||
|
|
||||||
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下
|
- 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_band_roformer`、`bs_roformer`.如果未指定, 将从配置文件中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.
|
||||||
|
|
||||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境
|
4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `gsv_tools/asr/models` 目录中.
|
||||||
|
|
||||||
##### MacOS 用户
|
5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `gsv_tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
|
||||||
|
|
||||||
```bash
|
<div align="center">
|
||||||
brew install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
### 运行 GPT-SoVITS (使用 Docker)
|
## 运行 GPT-SoVITS (使用 Docker)
|
||||||
|
|
||||||
#### Docker 镜像选择
|
</div>
|
||||||
|
|
||||||
|
### Docker 镜像选择
|
||||||
|
|
||||||
由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意:
|
由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意:
|
||||||
|
|
||||||
@ -145,15 +156,15 @@ brew install ffmpeg
|
|||||||
- Docker Compose 将会挂载当前目录的**所有文件**, 请在使用 Docker 镜像前先切换到项目根目录并**拉取代码更新**
|
- Docker Compose 将会挂载当前目录的**所有文件**, 请在使用 Docker 镜像前先切换到项目根目录并**拉取代码更新**
|
||||||
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
|
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
|
||||||
|
|
||||||
#### 环境变量
|
### 环境变量
|
||||||
|
|
||||||
- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用
|
- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用
|
||||||
|
|
||||||
#### 共享内存配置
|
### 共享内存配置
|
||||||
|
|
||||||
在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`)
|
在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`)
|
||||||
|
|
||||||
#### 选择服务
|
### 选择服务
|
||||||
|
|
||||||
`docker-compose.yaml` 文件定义了两个主要服务类型:
|
`docker-compose.yaml` 文件定义了两个主要服务类型:
|
||||||
|
|
||||||
@ -166,7 +177,7 @@ brew install ffmpeg
|
|||||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 本地构建 Docker 镜像
|
### 本地构建 Docker 镜像
|
||||||
|
|
||||||
如果你希望自行构建镜像, 请使用以下命令:
|
如果你希望自行构建镜像, 请使用以下命令:
|
||||||
|
|
||||||
@ -174,7 +185,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
|
|||||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 访问运行中的容器 (Bash Shell)
|
### 访问运行中的容器 (Bash Shell)
|
||||||
|
|
||||||
当容器在后台运行时, 你可以通过以下命令进入容器:
|
当容器在后台运行时, 你可以通过以下命令进入容器:
|
||||||
|
|
||||||
@ -182,32 +193,18 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
|||||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||||
```
|
```
|
||||||
|
|
||||||
## 预训练模型
|
<div align="center">
|
||||||
|
|
||||||
**若成功运行`install.sh`可跳过 No.1,2,3**
|
|
||||||
|
|
||||||
**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
|
|
||||||
|
|
||||||
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
|
|
||||||
|
|
||||||
2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
|
|
||||||
|
|
||||||
3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中.
|
|
||||||
|
|
||||||
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
|
|
||||||
|
|
||||||
- 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_mand_roformer`、`bs_roformer`.如果未指定, 将从配置文中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.
|
|
||||||
|
|
||||||
4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中.
|
|
||||||
|
|
||||||
5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
|
|
||||||
|
|
||||||
## 数据集格式
|
## 数据集格式
|
||||||
|
|
||||||
文本到语音 (TTS) 注释 .list 文件格式:
|
</div>
|
||||||
|
|
||||||
|
文本到语音 (TTS) 注释 `.list` 文件格式:
|
||||||
|
|
||||||
|
```text
|
||||||
|
|
||||||
```
|
|
||||||
vocal_path|speaker_name|language|text
|
vocal_path|speaker_name|language|text
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
语言字典:
|
语言字典:
|
||||||
@ -220,18 +217,23 @@ vocal_path|speaker_name|language|text
|
|||||||
|
|
||||||
示例:
|
示例:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
|
D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 微调与推理
|
## 微调与推理
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### 打开 WebUI
|
### 打开 WebUI
|
||||||
|
|
||||||
#### 整合包用户
|
#### 整合包用户
|
||||||
|
|
||||||
双击`go-webui.bat`或者使用`go-webui.ps1`
|
双击`go-webui.bat`或者使用`go-webui.ps1`
|
||||||
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
|
|
||||||
|
|
||||||
#### 其他
|
#### 其他
|
||||||
|
|
||||||
@ -239,14 +241,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
|
|||||||
python webui.py <language(optional)>
|
python webui.py <language(optional)>
|
||||||
```
|
```
|
||||||
|
|
||||||
若想使用 V1,则
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python webui.py v1 <language(optional)>
|
|
||||||
```
|
|
||||||
|
|
||||||
或者在 webUI 内动态切换
|
|
||||||
|
|
||||||
### 微调
|
### 微调
|
||||||
|
|
||||||
#### 现已支持自动填充路径
|
#### 现已支持自动填充路径
|
||||||
@ -278,8 +272,12 @@ python webui.py
|
|||||||
|
|
||||||
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## V2 发布说明
|
## V2 发布说明
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新特性:
|
新特性:
|
||||||
|
|
||||||
1. 支持韩语及粤语
|
1. 支持韩语及粤语
|
||||||
@ -292,18 +290,12 @@ python webui.py
|
|||||||
|
|
||||||
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
从 v1 环境迁移至 v2
|
<div align="center">
|
||||||
|
|
||||||
1. 需要 pip 安装 requirements.txt 更新环境
|
|
||||||
|
|
||||||
2. 需要克隆 github 上的最新代码
|
|
||||||
|
|
||||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下
|
|
||||||
|
|
||||||
中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
|
||||||
|
|
||||||
## V3 更新说明
|
## V3 更新说明
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新模型特点:
|
新模型特点:
|
||||||
|
|
||||||
1. 音色相似度更像, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大)
|
1. 音色相似度更高, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似度提升更大)
|
||||||
@ -312,33 +304,23 @@ python webui.py
|
|||||||
|
|
||||||
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
从 v2 环境迁移至 v3
|
<div align="center">
|
||||||
|
|
||||||
1. 需要 pip 安装 requirements.txt 更新环境
|
|
||||||
|
|
||||||
2. 需要克隆 github 上的最新代码
|
|
||||||
|
|
||||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下
|
|
||||||
|
|
||||||
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
|
||||||
|
|
||||||
## V4 更新说明
|
## V4 更新说明
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新特性:
|
新特性:
|
||||||
|
|
||||||
1. **V4 版本修复了 V3 版本中由于非整数倍上采样导致的金属音问题, 并原生输出 48kHz 音频以避免声音闷糊 (而 V3 版本仅原生输出 24kHz 音频)**. 作者认为 V4 是对 V3 的直接替代, 但仍需进一步测试.
|
1. **V4 版本修复了 V3 版本中由于非整数倍上采样导致的金属音问题, 并原生输出 48kHz 音频以避免声音闷糊 (而 V3 版本仅原生输出 24kHz 音频)**. 作者认为 V4 是对 V3 的直接替代, 但仍需进一步测试.
|
||||||
[更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
从 V1/V2/V3 环境迁移至 V4:
|
<div align="center">
|
||||||
|
|
||||||
1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
|
|
||||||
|
|
||||||
2. 从 GitHub 克隆最新代码.
|
|
||||||
|
|
||||||
3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V4 预训练模型 (`gsv-v4-pretrained/s2v4.ckpt` 和 `gsv-v4-pretrained/vocoder.pth`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
|
|
||||||
|
|
||||||
## V2Pro 更新说明
|
## V2Pro 更新说明
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新特性:
|
新特性:
|
||||||
|
|
||||||
1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
|
1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
|
||||||
@ -346,77 +328,35 @@ python webui.py
|
|||||||
|
|
||||||
2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.
|
2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.
|
||||||
|
|
||||||
从 V1/V2/V3/V4 环境迁移至 V2Pro:
|
<div align="center">
|
||||||
|
|
||||||
1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
|
|
||||||
|
|
||||||
2. 从 GitHub 克隆最新代码.
|
|
||||||
|
|
||||||
3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V2Pro 预训练模型 (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 和 `sv/pretrained_eres2netv2w24s4ep4.ckpt`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
|
|
||||||
|
|
||||||
## 待办事项清单
|
## 待办事项清单
|
||||||
|
|
||||||
- [x] **高优先级:**
|
</div>
|
||||||
|
|
||||||
- [x] 日语和英语的本地化.
|
- [x] **高优先级:**
|
||||||
- [x] 用户指南.
|
- [x] 日语和英语的本地化
|
||||||
- [x] 日语和英语数据集微调训练.
|
- [x] 用户指南
|
||||||
|
- [x] 日语和英语数据集微调训练
|
||||||
|
|
||||||
- [ ] **功能:**
|
- [ ] **功能:**
|
||||||
- [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟).
|
- [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟).
|
||||||
- [x] TTS 语速控制.
|
- [x] TTS 语速控制
|
||||||
- [ ] ~~增强的 TTS 情感控制.~~
|
- [ ] ~~增强的 TTS 情感控制~~
|
||||||
- [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布.
|
- [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布
|
||||||
- [x] 改进英语和日语文本前端.
|
- [x] 改进英语和日语文本前端
|
||||||
- [ ] 开发体积小和更大的 TTS 模型.
|
- [ ] 开发体积更小和更大的 TTS 模型
|
||||||
- [x] Colab 脚本.
|
- [x] Colab 脚本
|
||||||
- [x] 扩展训练数据集 (从 2k 小时到 10k 小时).
|
- [x] 扩展训练数据集 (从 2k 小时到 10k 小时)
|
||||||
- [x] 更好的 sovits 基础模型 (增强的音频质量).
|
- [x] 更好的 sovits 基础模型 (增强的音频质量)
|
||||||
- [ ] 模型混合.
|
- [ ] 模型混合
|
||||||
|
|
||||||
## (附加) 命令行运行方式
|
<div align="center">
|
||||||
|
|
||||||
使用命令行打开 UVR5 的 WebUI
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- 如果打不开浏览器, 请按照下面的格式进行UVR处理, 这是使用mdxnet进行音频处理的方式
|
|
||||||
````
|
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
|
||||||
```` -->
|
|
||||||
|
|
||||||
这是使用命令行完成数据集的音频切分的方式
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python audio_slicer.py \
|
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
|
||||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
|
||||||
--threshold <volume_threshold> \
|
|
||||||
--min_length <minimum_duration_of_each_subclip> \
|
|
||||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
|
||||||
```
|
|
||||||
|
|
||||||
这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
|
||||||
```
|
|
||||||
|
|
||||||
通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记)
|
|
||||||
|
|
||||||
(没有进度条, GPU 性能可能会导致时间延迟)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
|
||||||
```
|
|
||||||
|
|
||||||
启用自定义列表保存路径
|
|
||||||
|
|
||||||
## 致谢
|
## 致谢
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
特别感谢以下项目和贡献者:
|
特别感谢以下项目和贡献者:
|
||||||
|
|
||||||
### 理论研究
|
### 理论研究
|
||||||
@ -427,7 +367,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
||||||
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
||||||
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
@ -459,8 +399,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
|
|
||||||
感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导.
|
感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 感谢所有贡献者的努力
|
## 感谢所有贡献者的努力
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
||||||
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
|
||||||
</a>
|
</a>
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
#
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
<h1>GPT-SoVITS-WebUI</h1>
|
<h1>GPT-SoVITS-WebUI</h1>
|
||||||
@ -19,14 +21,17 @@
|
|||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
|
|
||||||
|
|
||||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 機能:
|
<div align="center">
|
||||||
|
|
||||||
|
## 機能
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます.
|
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます.
|
||||||
|
|
||||||
@ -40,34 +45,49 @@
|
|||||||
|
|
||||||
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
|
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
|
||||||
|
|
||||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
|
||||||
|
|
||||||
**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## インストール
|
## インストール
|
||||||
|
|
||||||
### テスト済みの環境
|
### テスト済みの環境
|
||||||
|
|
||||||
| Python Version | PyTorch Version | Device |
|
| Python Version | PyTorch Version | Device |
|
||||||
| -------------- | ---------------- | ------------- |
|
| -------------- | --------------- | ------------- |
|
||||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
|
||||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
|
||||||
|
| Python 3.10 | PyTorch 2.9.0 | CPU |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
|
Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main)し、解凍後に `go-webui.bat` をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
|
||||||
|
|
||||||
|
以下のコマンドを実行してこのプロジェクトをインストールします:
|
||||||
|
|
||||||
|
```pwsh
|
||||||
|
conda create -n GPTSoVITS python=3.11
|
||||||
|
conda activate GPTSoVITS
|
||||||
|
pwsh -F install.ps1 --help
|
||||||
|
```
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
||||||
|
以下のコマンドを実行してこのプロジェクトをインストールします:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### macOS
|
### macOS
|
||||||
@ -77,54 +97,49 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
|
|||||||
以下のコマンドを実行してこのプロジェクトをインストールします:
|
以下のコマンドを実行してこのプロジェクトをインストールします:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### 手動インストール
|
### 手動インストール
|
||||||
|
|
||||||
#### 依存関係をインストールします
|
以下のコマンドを実行してこのプロジェクトをインストールします:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
|
conda install uv ffmpeg -c conda-forge
|
||||||
|
|
||||||
pip install -r extra-req.txt --no-deps
|
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
|
||||||
pip install -r requirements.txt
|
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
|
||||||
|
uv pip install ".[flash-attn]"
|
||||||
```
|
```
|
||||||
|
|
||||||
#### FFmpeg をインストールします
|
### 事前訓練済みモデル
|
||||||
|
|
||||||
##### Conda ユーザー
|
**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**
|
||||||
|
|
||||||
```bash
|
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.
|
||||||
conda activate GPTSoVits
|
|
||||||
conda install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Ubuntu/Debian ユーザー
|
2. [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)
|
||||||
|
|
||||||
```bash
|
3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`gsv_tools/uvr5/uvr5_weights` ディレクトリに配置してください.
|
||||||
sudo apt install ffmpeg
|
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`gsv_tools/uvr5/uvr5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には **「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.
|
||||||
sudo apt install libsox-dev
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Windows ユーザー
|
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_band_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです.
|
||||||
|
|
||||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます
|
4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`gsv_tools/asr/models` ディレクトリに配置してください.
|
||||||
|
|
||||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください
|
5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`gsv_tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.
|
||||||
|
|
||||||
##### MacOS ユーザー
|
<div align="center">
|
||||||
|
|
||||||
```bash
|
## GPT-SoVITS の実行 (Docker 使用)
|
||||||
brew install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
### GPT-SoVITS の実行 (Docker 使用)
|
</div>
|
||||||
|
|
||||||
#### Docker イメージの選択
|
### Docker イメージの選択
|
||||||
|
|
||||||
コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください:
|
コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください:
|
||||||
|
|
||||||
@ -135,15 +150,15 @@ brew install ffmpeg
|
|||||||
- Docker Compose は現在のディレクトリ内の**すべてのファイル**をマウントします. Docker イメージを使用する前に、プロジェクトのルートディレクトリに移動し、**コードを最新の状態に更新**してください
|
- Docker Compose は現在のディレクトリ内の**すべてのファイル**をマウントします. Docker イメージを使用する前に、プロジェクトのルートディレクトリに移動し、**コードを最新の状態に更新**してください
|
||||||
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
|
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
|
||||||
|
|
||||||
#### 環境変数
|
### 環境変数
|
||||||
|
|
||||||
- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます
|
- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます
|
||||||
|
|
||||||
#### 共有メモリの設定
|
### 共有メモリの設定
|
||||||
|
|
||||||
Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします
|
Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします
|
||||||
|
|
||||||
#### サービスの選択
|
### サービスの選択
|
||||||
|
|
||||||
`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています:
|
`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています:
|
||||||
|
|
||||||
@ -156,7 +171,7 @@ Windows (Docker Desktop) では、デフォルトの共有メモリサイズが
|
|||||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Docker イメージのローカルビルド
|
### Docker イメージのローカルビルド
|
||||||
|
|
||||||
自分でイメージをビルドするには、以下のコマンドを使ってください:
|
自分でイメージをビルドするには、以下のコマンドを使ってください:
|
||||||
|
|
||||||
@ -164,7 +179,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
|
|||||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 実行中のコンテナへアクセス (Bash Shell)
|
### 実行中のコンテナへアクセス (Bash Shell)
|
||||||
|
|
||||||
コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます:
|
コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます:
|
||||||
|
|
||||||
@ -172,30 +187,18 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
|||||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||||
```
|
```
|
||||||
|
|
||||||
## 事前訓練済みモデル
|
<div align="center">
|
||||||
|
|
||||||
**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**
|
|
||||||
|
|
||||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.
|
|
||||||
|
|
||||||
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)
|
|
||||||
|
|
||||||
3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください.
|
|
||||||
|
|
||||||
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/uvr5/uvr5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には **「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.
|
|
||||||
|
|
||||||
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_band_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです.
|
|
||||||
|
|
||||||
4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.
|
|
||||||
|
|
||||||
5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.
|
|
||||||
|
|
||||||
## データセット形式
|
## データセット形式
|
||||||
|
|
||||||
TTS アノテーション .list ファイル形式:
|
</div>
|
||||||
|
|
||||||
|
TTS アノテーション `.list` ファイル形式:
|
||||||
|
|
||||||
|
```text
|
||||||
|
|
||||||
```
|
|
||||||
vocal_path|speaker_name|language|text
|
vocal_path|speaker_name|language|text
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
言語辞書:
|
言語辞書:
|
||||||
@ -206,18 +209,23 @@ vocal_path|speaker_name|language|text
|
|||||||
|
|
||||||
例:
|
例:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 微調整と推論
|
## 微調整と推論
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### WebUI を開く
|
### WebUI を開く
|
||||||
|
|
||||||
#### 統合パッケージ利用者
|
#### 統合パッケージ利用者
|
||||||
|
|
||||||
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します.
|
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します.
|
||||||
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください.
|
|
||||||
|
|
||||||
#### その他
|
#### その他
|
||||||
|
|
||||||
@ -225,14 +233,6 @@ V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
|
|||||||
python webui.py <言語(オプション)>
|
python webui.py <言語(オプション)>
|
||||||
```
|
```
|
||||||
|
|
||||||
V1 に切り替えたい場合は
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python webui.py v1 <言語(オプション)>
|
|
||||||
```
|
|
||||||
|
|
||||||
または WebUI で手動でバージョンを切り替えてください.
|
|
||||||
|
|
||||||
### 微調整
|
### 微調整
|
||||||
|
|
||||||
#### パス自動補完のサポート
|
#### パス自動補完のサポート
|
||||||
@ -248,7 +248,7 @@ python webui.py v1 <言語(オプション)>
|
|||||||
|
|
||||||
#### 統合パッケージ利用者
|
#### 統合パッケージ利用者
|
||||||
|
|
||||||
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
|
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
|
||||||
|
|
||||||
#### その他
|
#### その他
|
||||||
|
|
||||||
@ -264,8 +264,12 @@ python webui.py
|
|||||||
|
|
||||||
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
|
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## V2 リリースノート
|
## V2 リリースノート
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新機能:
|
新機能:
|
||||||
|
|
||||||
1. 韓国語と広東語をサポート
|
1. 韓国語と広東語をサポート
|
||||||
@ -278,18 +282,12 @@ python webui.py
|
|||||||
|
|
||||||
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1 環境から V2 を使用するには:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
|
|
||||||
|
|
||||||
2. 最新のコードを github からクローン
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置
|
|
||||||
|
|
||||||
中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
|
||||||
|
|
||||||
## V3 リリースノート
|
## V3 リリースノート
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新機能:
|
新機能:
|
||||||
|
|
||||||
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます).
|
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます).
|
||||||
@ -298,33 +296,23 @@ V1 環境から V2 を使用するには:
|
|||||||
|
|
||||||
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
v2 環境から v3 を使用する方法:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します.
|
|
||||||
|
|
||||||
2. GitHub から最新のコードをクローンします.
|
|
||||||
|
|
||||||
3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します.
|
|
||||||
|
|
||||||
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください.
|
|
||||||
|
|
||||||
## V4 リリースノート
|
## V4 リリースノート
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新機能:
|
新機能:
|
||||||
|
|
||||||
1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力します(V3 はネイティブに 24kHz 音声のみ出力)**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
|
1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力します(V3 はネイティブに 24kHz 音声のみ出力)**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
|
||||||
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1/V2/V3 環境から V4 への移行方法:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
|
|
||||||
|
|
||||||
2. GitHub から最新のコードをクローンします.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V4 の事前学習済みモデル (`gsv-v4-pretrained/s2v4.ckpt` および `gsv-v4-pretrained/vocoder.pth`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
|
|
||||||
|
|
||||||
## V2Pro リリースノート
|
## V2Pro リリースノート
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
新機能:
|
新機能:
|
||||||
|
|
||||||
1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します.**
|
1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します.**
|
||||||
@ -332,18 +320,13 @@ V1/V2/V3 環境から V4 への移行方法:
|
|||||||
|
|
||||||
2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.
|
2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.
|
||||||
|
|
||||||
V1/V2/V3/V4 環境から V2Pro への移行方法:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
|
|
||||||
|
|
||||||
2. GitHub から最新のコードをクローンします.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V2Pro の事前学習済みモデル (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, および `sv/pretrained_eres2netv2w24s4ep4.ckpt`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
|
|
||||||
|
|
||||||
## Todo リスト
|
## Todo リスト
|
||||||
|
|
||||||
- [x] **優先度 高:**
|
</div>
|
||||||
|
|
||||||
|
- [x] **優先度 高:**
|
||||||
- [x] 日本語と英語でのローカライズ.
|
- [x] 日本語と英語でのローカライズ.
|
||||||
- [x] ユーザーガイド.
|
- [x] ユーザーガイド.
|
||||||
- [x] 日本語データセットと英語データセットのファインチューニングトレーニング.
|
- [x] 日本語データセットと英語データセットのファインチューニングトレーニング.
|
||||||
@ -356,53 +339,16 @@ V1/V2/V3/V4 環境から V2Pro への移行方法:
|
|||||||
- [x] 英語と日本語のテキストフロントエンドを改善.
|
- [x] 英語と日本語のテキストフロントエンドを改善.
|
||||||
- [ ] 小型と大型の TTS モデルを開発する.
|
- [ ] 小型と大型の TTS モデルを開発する.
|
||||||
- [x] Colab のスクリプト.
|
- [x] Colab のスクリプト.
|
||||||
- [ ] トレーニングデータセットを拡張する (2k→10k).
|
- [x] トレーニングデータセットを拡張する (2k→10k).
|
||||||
- [x] より良い sovits ベースモデル (音質向上)
|
- [x] より良い sovits ベースモデル (音質向上)
|
||||||
- [ ] モデルミックス
|
- [ ] モデルミックス
|
||||||
|
|
||||||
## (追加の) コマンドラインから実行する方法
|
<div align="center">
|
||||||
|
|
||||||
コマンド ラインを使用して UVR5 の WebUI を開きます
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください.これはオーディオ処理に mdxnet を使用しています.
|
|
||||||
```
|
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
|
||||||
``` -->
|
|
||||||
|
|
||||||
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python audio_slicer.py \
|
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
|
||||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
|
||||||
--threshold <volume_threshold> \
|
|
||||||
--min_length <minimum_duration_of_each_subclip> \
|
|
||||||
--min_interval <shortest_time_gap_between_adjacent_subclips> \
|
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
|
||||||
```
|
|
||||||
|
|
||||||
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
|
||||||
```
|
|
||||||
|
|
||||||
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
|
|
||||||
|
|
||||||
(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
|
||||||
```
|
|
||||||
|
|
||||||
カスタムリストの保存パスが有効になっています
|
|
||||||
|
|
||||||
## クレジット
|
## クレジット
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
特に以下のプロジェクトと貢献者に感謝します:
|
特に以下のプロジェクトと貢献者に感謝します:
|
||||||
|
|
||||||
### 理論研究
|
### 理論研究
|
||||||
@ -413,7 +359,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
||||||
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
||||||
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
@ -445,8 +391,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
|
|
||||||
@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます.
|
@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## すべてのコントリビューターに感謝します
|
## すべてのコントリビューターに感謝します
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
||||||
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
|
||||||
</a>
|
</a>
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
#
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
<h1>GPT-SoVITS-WebUI</h1>
|
<h1>GPT-SoVITS-WebUI</h1>
|
||||||
@ -19,14 +21,17 @@
|
|||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
|
|
||||||
|
|
||||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
|
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 기능:
|
<div align="center">
|
||||||
|
|
||||||
|
## 기능
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다.
|
1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다.
|
||||||
|
|
||||||
@ -40,97 +45,101 @@
|
|||||||
|
|
||||||
보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모:
|
보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모:
|
||||||
|
|
||||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
|
||||||
|
|
||||||
**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 설치
|
## 설치
|
||||||
|
|
||||||
### 테스트 통과 환경
|
### 테스트 통과 환경
|
||||||
|
|
||||||
| Python Version | PyTorch Version | Device |
|
| Python Version | PyTorch Version | Device |
|
||||||
| -------------- | ---------------- | ------------- |
|
| -------------- | --------------- | ------------- |
|
||||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
|
||||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
|
||||||
|
| Python 3.10 | PyTorch 2.9.0 | CPU |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
|
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main)한 후 압축을 풀고 `go-webui.bat` 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
|
||||||
|
|
||||||
|
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
|
||||||
|
|
||||||
```pwsh
|
```pwsh
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
|
pwsh -F install.ps1 --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
||||||
|
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### macOS
|
### macOS
|
||||||
|
|
||||||
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
|
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
|
||||||
|
|
||||||
다음 명령어를 실행하여 이 프로젝트를 설치하세요
|
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### 수동 설치
|
### 수동 설치
|
||||||
|
|
||||||
#### 의존성 설치
|
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
|
conda install uv ffmpeg -c conda-forge
|
||||||
|
|
||||||
pip install -r extra-req.txt --no-deps
|
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
|
||||||
pip install -r requirements.txt
|
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
|
||||||
|
uv pip install ".[flash-attn]"
|
||||||
```
|
```
|
||||||
|
|
||||||
#### FFmpeg 설치
|
### 사전 학습된 모델
|
||||||
|
|
||||||
##### Conda 사용자
|
**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**
|
||||||
|
|
||||||
```bash
|
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
|
||||||
conda activate GPTSoVits
|
|
||||||
conda install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Ubuntu/Debian 사용자
|
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
|
||||||
|
|
||||||
```bash
|
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `gsv_tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
|
||||||
sudo apt install ffmpeg
|
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `gsv_tools/uvr5/uvr5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 "**roformer**"가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
||||||
sudo apt install libsox-dev
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Windows 사용자
|
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_band_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
||||||
|
|
||||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다
|
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `gsv_tools/asr/models` 디렉토리에 배치하세요.
|
||||||
|
|
||||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치
|
5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `gsv_tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.
|
||||||
|
|
||||||
##### MacOS 사용자
|
<div align="center">
|
||||||
|
|
||||||
```bash
|
## GPT-SoVITS 실행하기 (Docker 사용)
|
||||||
brew install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
### GPT-SoVITS 실행하기 (Docker 사용)
|
</div>
|
||||||
|
|
||||||
#### Docker 이미지 선택
|
### Docker 이미지 선택
|
||||||
|
|
||||||
코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요:
|
코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요:
|
||||||
|
|
||||||
@ -141,15 +150,15 @@ brew install ffmpeg
|
|||||||
- Docker Compose는 현재 디렉터리의 **모든 파일**을 마운트합니다. Docker 이미지를 사용하기 전에 프로젝트 루트 디렉터리로 이동하여 코드를 **최신 상태로 업데이트**하세요
|
- Docker Compose는 현재 디렉터리의 **모든 파일**을 마운트합니다. Docker 이미지를 사용하기 전에 프로젝트 루트 디렉터리로 이동하여 코드를 **최신 상태로 업데이트**하세요
|
||||||
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
|
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
|
||||||
|
|
||||||
#### 환경 변수
|
### 환경 변수
|
||||||
|
|
||||||
- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다
|
- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다
|
||||||
|
|
||||||
#### 공유 메모리 설정
|
### 공유 메모리 설정
|
||||||
|
|
||||||
Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다
|
Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다
|
||||||
|
|
||||||
#### 서비스 선택
|
### 서비스 선택
|
||||||
|
|
||||||
`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다:
|
`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다:
|
||||||
|
|
||||||
@ -162,7 +171,7 @@ Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기
|
|||||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Docker 이미지 직접 빌드하기
|
### Docker 이미지 직접 빌드하기
|
||||||
|
|
||||||
직접 이미지를 빌드하려면 다음 명령어를 사용하세요:
|
직접 이미지를 빌드하려면 다음 명령어를 사용하세요:
|
||||||
|
|
||||||
@ -170,7 +179,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
|
|||||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 실행 중인 컨테이너 접속하기 (Bash Shell)
|
### 실행 중인 컨테이너 접속하기 (Bash Shell)
|
||||||
|
|
||||||
컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다:
|
컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다:
|
||||||
|
|
||||||
@ -178,30 +187,14 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
|||||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||||
```
|
```
|
||||||
|
|
||||||
## 사전 학습된 모델
|
|
||||||
|
|
||||||
**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**
|
|
||||||
|
|
||||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
|
|
||||||
|
|
||||||
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
|
|
||||||
|
|
||||||
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
|
|
||||||
|
|
||||||
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
|
||||||
|
|
||||||
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
|
||||||
|
|
||||||
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
|
|
||||||
|
|
||||||
5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.
|
|
||||||
|
|
||||||
## 데이터셋 형식
|
## 데이터셋 형식
|
||||||
|
|
||||||
텍스트 음성 합성(TTS) 주석 .list 파일 형식:
|
텍스트 음성 합성(TTS) 주석 .list 파일 형식:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
vocal_path|speaker_name|language|text
|
vocal_path|speaker_name|language|text
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
언어 사전:
|
언어 사전:
|
||||||
@ -209,21 +202,28 @@ vocal_path|speaker_name|language|text
|
|||||||
- 'zh': 중국어
|
- 'zh': 중국어
|
||||||
- 'ja': 일본어
|
- 'ja': 일본어
|
||||||
- 'en': 영어
|
- 'en': 영어
|
||||||
|
- 'ko': 한국어
|
||||||
|
- 'yue': 광둥어
|
||||||
|
|
||||||
예시:
|
예시:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## 미세 조정 및 추론
|
## 미세 조정 및 추론
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### WebUI 열기
|
### WebUI 열기
|
||||||
|
|
||||||
#### 통합 패키지 사용자
|
#### 통합 패키지 사용자
|
||||||
|
|
||||||
`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오.
|
`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오.
|
||||||
V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-v1.ps1`를 사용하십시오.
|
|
||||||
|
|
||||||
#### 기타
|
#### 기타
|
||||||
|
|
||||||
@ -231,21 +231,13 @@ V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-
|
|||||||
python webui.py <언어(옵션)>
|
python webui.py <언어(옵션)>
|
||||||
```
|
```
|
||||||
|
|
||||||
V1으로 전환하려면,
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python webui.py v1 <언어(옵션)>
|
|
||||||
```
|
|
||||||
|
|
||||||
또는 WebUI에서 수동으로 버전을 전환하십시오.
|
|
||||||
|
|
||||||
### 미세 조정
|
### 미세 조정
|
||||||
|
|
||||||
#### 경로 자동 채우기가 지원됩니다
|
#### 경로 자동 채우기가 지원됩니다
|
||||||
|
|
||||||
1. 오디오 경로를 입력하십시오.
|
1. 오디오 경로를 입력하십시오.
|
||||||
2. 오디오를 작은 청크로 분할하십시오.
|
2. 오디오를 작은 청크로 분할하십시오.
|
||||||
3. 노이즈 제거(옵션)
|
3. 노이즈 제거 (옵션)
|
||||||
4. ASR 수행
|
4. ASR 수행
|
||||||
5. ASR 전사를 교정하십시오.
|
5. ASR 전사를 교정하십시오.
|
||||||
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
|
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
|
||||||
@ -254,7 +246,7 @@ python webui.py v1 <언어(옵션)>
|
|||||||
|
|
||||||
#### 통합 패키지 사용자
|
#### 통합 패키지 사용자
|
||||||
|
|
||||||
`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
||||||
|
|
||||||
#### 기타
|
#### 기타
|
||||||
|
|
||||||
@ -270,8 +262,12 @@ python webui.py
|
|||||||
|
|
||||||
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## V2 릴리스 노트
|
## V2 릴리스 노트
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
새로운 기능:
|
새로운 기능:
|
||||||
|
|
||||||
1. 한국어 및 광둥어 지원
|
1. 한국어 및 광둥어 지원
|
||||||
@ -284,18 +280,12 @@ python webui.py
|
|||||||
|
|
||||||
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1 환경에서 V2를 사용하려면:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt`를 사용하여 일부 패키지 업데이트
|
|
||||||
|
|
||||||
2. github에서 최신 코드를 클론하십시오.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오.
|
|
||||||
|
|
||||||
중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
|
||||||
|
|
||||||
## V3 릴리스 노트
|
## V3 릴리스 노트
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
새로운 기능:
|
새로운 기능:
|
||||||
|
|
||||||
1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.)
|
1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.)
|
||||||
@ -304,33 +294,23 @@ V1 환경에서 V2를 사용하려면:
|
|||||||
|
|
||||||
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
v2 환경에서 v3 사용하기:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt`로 일부 패키지를 업데이트합니다.
|
|
||||||
|
|
||||||
2. 최신 코드를 github 에서 클론합니다.
|
|
||||||
|
|
||||||
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다.
|
|
||||||
|
|
||||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
|
||||||
|
|
||||||
## V4 릴리스 노트
|
## V4 릴리스 노트
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
신규 기능:
|
신규 기능:
|
||||||
|
|
||||||
1. **V4는 V3에서 발생하는 비정수 배율 업샘플링으로 인한 금속성 잡음 문제를 수정했으며, 소리가 먹먹해지는 것을 방지하기 위해 기본적으로 48kHz 오디오를 출력합니다 (V3는 기본적으로 24kHz만 지원)**. 개발자는 V4를 V3의 직접적인 대체 버전으로 보고 있지만 추가 테스트가 필요합니다.
|
1. **V4는 V3에서 발생하는 비정수 배율 업샘플링으로 인한 금속성 잡음 문제를 수정했으며, 소리가 먹먹해지는 것을 방지하기 위해 기본적으로 48kHz 오디오를 출력합니다 (V3는 기본적으로 24kHz만 지원)**. 개발자는 V4를 V3의 직접적인 대체 버전으로 보고 있지만 추가 테스트가 필요합니다.
|
||||||
[자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1/V2/V3 환경에서 V4로 전환 방법:
|
<div align="center">
|
||||||
|
|
||||||
1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
|
|
||||||
|
|
||||||
2. GitHub에서 최신 코드를 클론하세요.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V4 사전 학습 모델(`gsv-v4-pretrained/s2v4.ckpt` 및 `gsv-v4-pretrained/vocoder.pth`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
|
|
||||||
|
|
||||||
## V2Pro 릴리스 노트
|
## V2Pro 릴리스 노트
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
신규 기능:
|
신규 기능:
|
||||||
|
|
||||||
1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
|
1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
|
||||||
@ -338,24 +318,18 @@ V1/V2/V3 환경에서 V4로 전환 방법:
|
|||||||
|
|
||||||
2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.
|
2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.
|
||||||
|
|
||||||
V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
|
<div align="center">
|
||||||
|
|
||||||
1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
|
|
||||||
|
|
||||||
2. GitHub에서 최신 코드를 클론하세요.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V2Pro 사전 학습 모델(`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 및 `sv/pretrained_eres2netv2w24s4ep4.ckpt`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
|
|
||||||
|
|
||||||
## 할 일 목록
|
## 할 일 목록
|
||||||
|
|
||||||
- [x] **최우선순위:**
|
</div>
|
||||||
|
|
||||||
|
- [x] **최우선순위:**
|
||||||
- [x] 일본어 및 영어 지역화.
|
- [x] 일본어 및 영어 지역화.
|
||||||
- [x] 사용자 가이드.
|
- [x] 사용자 가이드.
|
||||||
- [x] 일본어 및 영어 데이터셋 미세 조정 훈련.
|
- [x] 일본어 및 영어 데이터셋 미세 조정 훈련.
|
||||||
|
|
||||||
- [ ] **기능:**
|
- [ ] **기능:**
|
||||||
|
|
||||||
- [x] 제로샷 음성 변환 (5초) / 소량의 음성 변환 (1분).
|
- [x] 제로샷 음성 변환 (5초) / 소량의 음성 변환 (1분).
|
||||||
- [x] TTS 속도 제어.
|
- [x] TTS 속도 제어.
|
||||||
- [ ] ~~향상된 TTS 감정 제어.~~
|
- [ ] ~~향상된 TTS 감정 제어.~~
|
||||||
@ -363,53 +337,16 @@ V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
|
|||||||
- [x] 영어 및 일본어 텍스트 프론트 엔드 개선.
|
- [x] 영어 및 일본어 텍스트 프론트 엔드 개선.
|
||||||
- [ ] 작은 크기와 큰 크기의 TTS 모델 개발.
|
- [ ] 작은 크기와 큰 크기의 TTS 모델 개발.
|
||||||
- [x] Colab 스크립트.
|
- [x] Colab 스크립트.
|
||||||
- [ ] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
|
- [x] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
|
||||||
- [x] 더 나은 sovits 기본 모델 (향상된 오디오 품질).
|
- [x] 더 나은 sovits 기본 모델 (향상된 오디오 품질).
|
||||||
- [ ] 모델 블렌딩.
|
- [ ] 모델 블렌딩.
|
||||||
|
|
||||||
## (추가적인) 명령줄에서 실행하는 방법
|
<div align="center">
|
||||||
|
|
||||||
명령줄을 사용하여 UVR5용 WebUI 열기
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
|
|
||||||
```
|
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
|
||||||
``` -->
|
|
||||||
|
|
||||||
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python audio_slicer.py \
|
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
|
||||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
|
||||||
--threshold <volume_threshold> \
|
|
||||||
--min_length <minimum_duration_of_each_subclip> \
|
|
||||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
|
||||||
```
|
|
||||||
|
|
||||||
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
|
||||||
```
|
|
||||||
|
|
||||||
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
|
||||||
|
|
||||||
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
|
||||||
```
|
|
||||||
|
|
||||||
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
|
||||||
|
|
||||||
## 감사의 말
|
## 감사의 말
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
||||||
|
|
||||||
### 이론 연구
|
### 이론 연구
|
||||||
@ -420,7 +357,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
||||||
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
||||||
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
@ -452,8 +389,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
|
|
||||||
@Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다.
|
@Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다.
|
||||||
|
|
||||||
## 모든 기여자들에게 감사드립니다 ;)
|
<div align="center">
|
||||||
|
|
||||||
|
## 참여해주신 모든 분들께 감사드립니다
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
||||||
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
|
||||||
</a>
|
</a>
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
#
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
<h1>GPT-SoVITS-WebUI</h1>
|
<h1>GPT-SoVITS-WebUI</h1>
|
||||||
@ -25,7 +27,7 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Özellikler:
|
## Özellikler
|
||||||
|
|
||||||
1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin.
|
1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin.
|
||||||
|
|
||||||
@ -39,40 +41,49 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
|
|||||||
|
|
||||||
Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:
|
Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:
|
||||||
|
|
||||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
|
||||||
|
|
||||||
**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## Kurulum
|
## Kurulum
|
||||||
|
|
||||||
### Test Edilmiş Ortamlar
|
### Test Edilmiş Ortamlar
|
||||||
|
|
||||||
| Python Version | PyTorch Version | Device |
|
| Python Version | PyTorch Version | Device |
|
||||||
| -------------- | ---------------- | ------------- |
|
| -------------- | --------------- | ------------- |
|
||||||
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
|
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
|
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
|
||||||
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
|
||||||
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
|
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
|
||||||
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
|
||||||
|
| Python 3.10 | PyTorch 2.9.0 | CPU |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
|
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main) ve `go-webui.bat` dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
|
||||||
|
|
||||||
|
Aşağıdaki komutları çalıştırarak programı yükleyin:
|
||||||
|
|
||||||
```pwsh
|
```pwsh
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
|
pwsh -F install.ps1 --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
||||||
|
Aşağıdaki komutları çalıştırarak programı yükleyin:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### macOS
|
### macOS
|
||||||
@ -82,54 +93,49 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
|
|||||||
Aşağıdaki komutları çalıştırarak programı yükleyin:
|
Aşağıdaki komutları çalıştırarak programı yükleyin:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
bash install.sh --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### El ile Yükleme
|
### El ile Yükleme
|
||||||
|
|
||||||
#### Bağımlılıkları Yükleme
|
Aşağıdaki komutları çalıştırarak programı yükleyin:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.10
|
conda create -n GPTSoVITS python=3.11
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVITS
|
||||||
|
conda install uv ffmpeg -c conda-forge
|
||||||
|
|
||||||
pip install -r extra-req.txt --no-deps
|
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
|
||||||
pip install -r requirements.txt
|
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
|
||||||
|
uv pip install ".[flash-attn]"
|
||||||
```
|
```
|
||||||
|
|
||||||
#### FFmpeg'i Yükleme
|
### Önceden Eğitilmiş Modeller
|
||||||
|
|
||||||
##### Conda Kullanıcıları
|
**Eğer `install.sh` başarıyla çalıştırılırsa, No.1,2,3 adımını atlayabilirsiniz.**
|
||||||
|
|
||||||
```bash
|
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
|
||||||
conda activate GPTSoVits
|
|
||||||
conda install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Ubuntu/Debian Kullanıcıları
|
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
|
||||||
|
|
||||||
```bash
|
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `gsv_tools/uvr5/uvr5_weights` dizinine yerleştirin.
|
||||||
sudo apt install ffmpeg
|
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `gsv_tools/uvr5/uvr5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
|
||||||
sudo apt install libsox-dev
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Windows Kullanıcıları
|
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_band_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
|
||||||
|
|
||||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin
|
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `gsv_tools/asr/models` dizinine yerleştirin.
|
||||||
|
|
||||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin
|
5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `gsv_tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir.
|
||||||
|
|
||||||
##### MacOS Kullanıcıları
|
<div align="center">
|
||||||
|
|
||||||
```bash
|
## GPT-SoVITS Çalıştırma (Docker Kullanarak)
|
||||||
brew install ffmpeg
|
|
||||||
```
|
|
||||||
|
|
||||||
### GPT-SoVITS Çalıştırma (Docker Kullanarak)
|
</div>
|
||||||
|
|
||||||
#### Docker İmajı Seçimi
|
### Docker İmajı Seçimi
|
||||||
|
|
||||||
Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin:
|
Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin:
|
||||||
|
|
||||||
@ -140,15 +146,15 @@ Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığ
|
|||||||
- Docker Compose, mevcut dizindeki **tüm dosyaları** bağlayacaktır. Docker imajını kullanmadan önce lütfen proje kök dizinine geçin ve **en son kodu çekin**
|
- Docker Compose, mevcut dizindeki **tüm dosyaları** bağlayacaktır. Docker imajını kullanmadan önce lütfen proje kök dizinine geçin ve **en son kodu çekin**
|
||||||
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
|
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
|
||||||
|
|
||||||
#### Ortam Değişkenleri
|
### Ortam Değişkenleri
|
||||||
|
|
||||||
- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU’nuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın.
|
- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU’nuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın.
|
||||||
|
|
||||||
#### Paylaşılan Bellek Yapılandırması
|
### Paylaşılan Bellek Yapılandırması
|
||||||
|
|
||||||
Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir.
|
Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir.
|
||||||
|
|
||||||
#### Servis Seçimi
|
### Servis Seçimi
|
||||||
|
|
||||||
`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır:
|
`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır:
|
||||||
|
|
||||||
@ -161,7 +167,7 @@ Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanı
|
|||||||
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Docker İmajını Yerel Olarak Oluşturma
|
### Docker İmajını Yerel Olarak Oluşturma
|
||||||
|
|
||||||
Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
|
Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
|
||||||
|
|
||||||
@ -169,7 +175,7 @@ Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
|
|||||||
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Çalışan Konteynere Erişim (Bash Shell)
|
### Çalışan Konteynere Erişim (Bash Shell)
|
||||||
|
|
||||||
Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz:
|
Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz:
|
||||||
|
|
||||||
@ -177,30 +183,18 @@ Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz
|
|||||||
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
|
||||||
```
|
```
|
||||||
|
|
||||||
## Önceden Eğitilmiş Modeller
|
<div align="center">
|
||||||
|
|
||||||
**Eğer `install.sh` başarıyla çalıştırılırsa, No.1,2,3 adımını atlayabilirsiniz.**
|
|
||||||
|
|
||||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
|
|
||||||
|
|
||||||
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
|
|
||||||
|
|
||||||
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
|
|
||||||
|
|
||||||
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
|
|
||||||
|
|
||||||
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
|
|
||||||
|
|
||||||
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
|
|
||||||
|
|
||||||
5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir.
|
|
||||||
|
|
||||||
## Veri Seti Formatı
|
## Veri Seti Formatı
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
TTS açıklama .list dosya formatı:
|
TTS açıklama .list dosya formatı:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
vocal_path|speaker_name|language|text
|
vocal_path|speaker_name|language|text
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Dil sözlüğü:
|
Dil sözlüğü:
|
||||||
@ -213,18 +207,23 @@ Dil sözlüğü:
|
|||||||
|
|
||||||
Örnek:
|
Örnek:
|
||||||
|
|
||||||
```
|
```text
|
||||||
|
|
||||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## İnce Ayar ve Çıkarım
|
## İnce Ayar ve Çıkarım
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### WebUI'yi Açın
|
### WebUI'yi Açın
|
||||||
|
|
||||||
#### Entegre Paket Kullanıcıları
|
#### Entegre Paket Kullanıcıları
|
||||||
|
|
||||||
`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın.
|
`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın.
|
||||||
V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `go-webui-v1.ps1` kullanın.
|
|
||||||
|
|
||||||
#### Diğerleri
|
#### Diğerleri
|
||||||
|
|
||||||
@ -232,14 +231,6 @@ V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `
|
|||||||
python webui.py <dil(isteğe bağlı)>
|
python webui.py <dil(isteğe bağlı)>
|
||||||
```
|
```
|
||||||
|
|
||||||
V1'e geçmek istiyorsanız,
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python webui.py v1 <dil(isteğe bağlı)>
|
|
||||||
```
|
|
||||||
|
|
||||||
veya WebUI'de manuel olarak sürüm değiştirin.
|
|
||||||
|
|
||||||
### İnce Ayar
|
### İnce Ayar
|
||||||
|
|
||||||
#### Yol Otomatik Doldurma artık destekleniyor
|
#### Yol Otomatik Doldurma artık destekleniyor
|
||||||
@ -255,7 +246,7 @@ veya WebUI'de manuel olarak sürüm değiştirin.
|
|||||||
|
|
||||||
#### Entegre Paket Kullanıcıları
|
#### Entegre Paket Kullanıcıları
|
||||||
|
|
||||||
`go-webui-v2.bat` dosyasına çift tıklayın veya `go-webui-v2.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
||||||
|
|
||||||
#### Diğerleri
|
#### Diğerleri
|
||||||
|
|
||||||
@ -271,8 +262,12 @@ python webui.py
|
|||||||
|
|
||||||
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## V2 Sürüm Notları
|
## V2 Sürüm Notları
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
Yeni Özellikler:
|
Yeni Özellikler:
|
||||||
|
|
||||||
1. Korece ve Kantonca destekler
|
1. Korece ve Kantonca destekler
|
||||||
@ -285,18 +280,12 @@ Yeni Özellikler:
|
|||||||
|
|
||||||
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1 ortamından V2'yi kullanmak için:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin
|
|
||||||
|
|
||||||
2. github'dan en son kodları klonlayın.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin.
|
|
||||||
|
|
||||||
Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
|
|
||||||
|
|
||||||
## V3 Sürüm Notları
|
## V3 Sürüm Notları
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
Yeni Özellikler:
|
Yeni Özellikler:
|
||||||
|
|
||||||
1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir).
|
1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir).
|
||||||
@ -305,33 +294,23 @@ Yeni Özellikler:
|
|||||||
|
|
||||||
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V2 ortamında V3 kullanımı:
|
<div align="center">
|
||||||
|
|
||||||
1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin.
|
|
||||||
|
|
||||||
2. GitHub'dan en son kodları klonlayın.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
|
|
||||||
|
|
||||||
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
|
|
||||||
|
|
||||||
## V4 Sürüm Notları
|
## V4 Sürüm Notları
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
Yeni Özellikler:
|
Yeni Özellikler:
|
||||||
|
|
||||||
1. **V4, V3'te görülen non-integer upsample işleminden kaynaklanan metalik ses sorununu düzeltti ve sesin boğuklaşmasını önlemek için doğrudan 48kHz ses çıktısı sunar (V3 sadece 24kHz destekler)**. Yazar, V4'ün V3'ün yerine geçebileceğini belirtmiştir ancak daha fazla test yapılması gerekmektedir.
|
1. **V4, V3'te görülen non-integer upsample işleminden kaynaklanan metalik ses sorununu düzeltti ve sesin boğuklaşmasını önlemek için doğrudan 48kHz ses çıktısı sunar (V3 sadece 24kHz destekler)**. Yazar, V4'ün V3'ün yerine geçebileceğini belirtmiştir ancak daha fazla test yapılması gerekmektedir.
|
||||||
[Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
[Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1/V2/V3 ortamından V4'e geçiş:
|
<div align="center">
|
||||||
|
|
||||||
1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
|
|
||||||
|
|
||||||
2. GitHub'dan en son kodları klonlayın.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V4 ön eğitilmiş modelleri indirin (`gsv-v4-pretrained/s2v4.ckpt` ve `gsv-v4-pretrained/vocoder.pth`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
|
|
||||||
|
|
||||||
## V2Pro Sürüm Notları
|
## V2Pro Sürüm Notları
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
Yeni Özellikler:
|
Yeni Özellikler:
|
||||||
|
|
||||||
1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
|
1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
|
||||||
@ -339,18 +318,13 @@ Yeni Özellikler:
|
|||||||
|
|
||||||
2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
|
2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
|
||||||
|
|
||||||
V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
|
<div align="center">
|
||||||
|
|
||||||
1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
|
|
||||||
|
|
||||||
2. GitHub'dan en son kodları klonlayın.
|
|
||||||
|
|
||||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V2Pro ön eğitilmiş modelleri indirin (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, ve `sv/pretrained_eres2netv2w24s4ep4.ckpt`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
|
|
||||||
|
|
||||||
## Yapılacaklar Listesi
|
## Yapılacaklar Listesi
|
||||||
|
|
||||||
- [x] **Yüksek Öncelikli:**
|
</div>
|
||||||
|
|
||||||
|
- [x] **Yüksek Öncelikli:**
|
||||||
- [x] Japonca ve İngilizceye yerelleştirme.
|
- [x] Japonca ve İngilizceye yerelleştirme.
|
||||||
- [x] Kullanıcı kılavuzu.
|
- [x] Kullanıcı kılavuzu.
|
||||||
- [x] Japonca ve İngilizce veri seti ince ayar eğitimi.
|
- [x] Japonca ve İngilizce veri seti ince ayar eğitimi.
|
||||||
@ -363,53 +337,16 @@ V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
|
|||||||
- [x] İngilizce ve Japonca metin ön ucunu iyileştirme.
|
- [x] İngilizce ve Japonca metin ön ucunu iyileştirme.
|
||||||
- [ ] Küçük ve büyük boyutlu metinden konuşmaya modelleri geliştirme.
|
- [ ] Küçük ve büyük boyutlu metinden konuşmaya modelleri geliştirme.
|
||||||
- [x] Colab betikleri.
|
- [x] Colab betikleri.
|
||||||
- [ ] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat).
|
- [x] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat).
|
||||||
- [x] daha iyi sovits temel modeli (geliştirilmiş ses kalitesi)
|
- [x] daha iyi sovits temel modeli (geliştirilmiş ses kalitesi)
|
||||||
- [ ] model karışımı
|
- [ ] model karışımı
|
||||||
|
|
||||||
## (Ekstra) Komut satırından çalıştırma yöntemi
|
<div align="center">
|
||||||
|
|
||||||
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
|
||||||
```
|
|
||||||
|
|
||||||
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
|
|
||||||
```
|
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
|
||||||
``` -->
|
|
||||||
|
|
||||||
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python audio_slicer.py \
|
|
||||||
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
|
|
||||||
--output_root "<alt_bölümlere_ayrılmış_ses_kliplerinin_kaydedileceği_dizin>" \
|
|
||||||
--threshold <ses_eşiği> \
|
|
||||||
--min_length <her_bir_alt_klibin_minimum_süresi> \
|
|
||||||
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı> \
|
|
||||||
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
|
|
||||||
```
|
|
||||||
|
|
||||||
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
|
|
||||||
```
|
|
||||||
|
|
||||||
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
|
|
||||||
|
|
||||||
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
|
||||||
```
|
|
||||||
|
|
||||||
Özel bir liste kaydetme yolu etkinleştirildi
|
|
||||||
|
|
||||||
## Katkı Verenler
|
## Katkı Verenler
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
|
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
|
||||||
|
|
||||||
### Teorik Araştırma
|
### Teorik Araştırma
|
||||||
@ -420,7 +357,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
|||||||
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
||||||
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
||||||
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
@ -452,8 +389,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
|||||||
|
|
||||||
@Naozumi520'ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım.
|
@Naozumi520'ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
## Tüm katkıda bulunanlara çabaları için teşekkürler
|
## Tüm katkıda bulunanlara çabaları için teşekkürler
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
||||||
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
|
||||||
</a>
|
</a>
|
||||||
|
|||||||
@ -1 +0,0 @@
|
|||||||
faster-whisper
|
|
||||||
@ -1,8 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
|
||||||
import torch.utils.data
|
import torch.utils.data
|
||||||
|
import torchaudio
|
||||||
import torchaudio.functional as aF
|
import torchaudio.functional as aF
|
||||||
|
|
||||||
|
|
||||||
@ -37,10 +38,10 @@ def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True):
|
|||||||
|
|
||||||
|
|
||||||
def get_dataset_filelist(a):
|
def get_dataset_filelist(a):
|
||||||
with open(a.input_training_file, "r", encoding="utf-8") as fi:
|
with open(a.input_training_file, encoding="utf-8") as fi:
|
||||||
training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
|
training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
|
||||||
|
|
||||||
with open(a.input_validation_file, "r", encoding="utf-8") as fi:
|
with open(a.input_validation_file, encoding="utf-8") as fi:
|
||||||
validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
|
validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
|
||||||
|
|
||||||
return training_indexes, validation_indexes
|
return training_indexes, validation_indexes
|
||||||
@ -1,7 +1,7 @@
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from torch.nn.utils import weight_norm, spectral_norm
|
import torch.nn.functional as F
|
||||||
|
from torch.nn.utils import spectral_norm, weight_norm
|
||||||
|
|
||||||
|
|
||||||
# from utils import init_weights, get_padding
|
# from utils import init_weights, get_padding
|
||||||
@ -16,7 +16,7 @@ def init_weights(m, mean=0.0, std=0.01):
|
|||||||
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from typing import Tuple, List
|
|
||||||
|
|
||||||
LRELU_SLOPE = 0.1
|
LRELU_SLOPE = 0.1
|
||||||
|
|
||||||
@ -75,7 +75,7 @@ class ConvNeXtBlock(nn.Module):
|
|||||||
|
|
||||||
class APNet_BWE_Model(torch.nn.Module):
|
class APNet_BWE_Model(torch.nn.Module):
|
||||||
def __init__(self, h):
|
def __init__(self, h):
|
||||||
super(APNet_BWE_Model, self).__init__()
|
super().__init__()
|
||||||
self.h = h
|
self.h = h
|
||||||
self.adanorm_num_embeddings = None
|
self.adanorm_num_embeddings = None
|
||||||
layer_scale_init_value = 1 / h.ConvNeXt_layers
|
layer_scale_init_value = 1 / h.ConvNeXt_layers
|
||||||
@ -125,7 +125,7 @@ class APNet_BWE_Model(torch.nn.Module):
|
|||||||
x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2)
|
x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2)
|
||||||
x_pha = self.norm_pre_pha(x_pha.transpose(1, 2)).transpose(1, 2)
|
x_pha = self.norm_pre_pha(x_pha.transpose(1, 2)).transpose(1, 2)
|
||||||
|
|
||||||
for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha):
|
for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha, strict=False):
|
||||||
x_mag = x_mag + x_pha
|
x_mag = x_mag + x_pha
|
||||||
x_pha = x_pha + x_mag
|
x_pha = x_pha + x_mag
|
||||||
x_mag = conv_block_mag(x_mag, cond_embedding_id=None)
|
x_mag = conv_block_mag(x_mag, cond_embedding_id=None)
|
||||||
@ -146,9 +146,9 @@ class APNet_BWE_Model(torch.nn.Module):
|
|||||||
|
|
||||||
class DiscriminatorP(torch.nn.Module):
|
class DiscriminatorP(torch.nn.Module):
|
||||||
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
||||||
super(DiscriminatorP, self).__init__()
|
super().__init__()
|
||||||
self.period = period
|
self.period = period
|
||||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
norm_f = weight_norm if not use_spectral_norm else spectral_norm
|
||||||
self.convs = nn.ModuleList(
|
self.convs = nn.ModuleList(
|
||||||
[
|
[
|
||||||
norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||||
@ -185,7 +185,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||||||
|
|
||||||
class MultiPeriodDiscriminator(torch.nn.Module):
|
class MultiPeriodDiscriminator(torch.nn.Module):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(MultiPeriodDiscriminator, self).__init__()
|
super().__init__()
|
||||||
self.discriminators = nn.ModuleList(
|
self.discriminators = nn.ModuleList(
|
||||||
[
|
[
|
||||||
DiscriminatorP(2),
|
DiscriminatorP(2),
|
||||||
@ -201,7 +201,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
|
|||||||
y_d_gs = []
|
y_d_gs = []
|
||||||
fmap_rs = []
|
fmap_rs = []
|
||||||
fmap_gs = []
|
fmap_gs = []
|
||||||
for i, d in enumerate(self.discriminators):
|
for _i, d in enumerate(self.discriminators):
|
||||||
y_d_r, fmap_r = d(y)
|
y_d_r, fmap_r = d(y)
|
||||||
y_d_g, fmap_g = d(y_hat)
|
y_d_g, fmap_g = d(y_hat)
|
||||||
y_d_rs.append(y_d_r)
|
y_d_rs.append(y_d_r)
|
||||||
@ -215,7 +215,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
|
|||||||
class MultiResolutionAmplitudeDiscriminator(nn.Module):
|
class MultiResolutionAmplitudeDiscriminator(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
|
resolutions: tuple[tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
|
||||||
num_embeddings: int = None,
|
num_embeddings: int = None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -225,7 +225,7 @@ class MultiResolutionAmplitudeDiscriminator(nn.Module):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
||||||
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
|
) -> tuple[list[torch.Tensor], list[torch.Tensor], list[list[torch.Tensor]], list[list[torch.Tensor]]]:
|
||||||
y_d_rs = []
|
y_d_rs = []
|
||||||
y_d_gs = []
|
y_d_gs = []
|
||||||
fmap_rs = []
|
fmap_rs = []
|
||||||
@ -245,7 +245,7 @@ class MultiResolutionAmplitudeDiscriminator(nn.Module):
|
|||||||
class DiscriminatorAR(nn.Module):
|
class DiscriminatorAR(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
resolution: Tuple[int, int, int],
|
resolution: tuple[int, int, int],
|
||||||
channels: int = 64,
|
channels: int = 64,
|
||||||
in_channels: int = 1,
|
in_channels: int = 1,
|
||||||
num_embeddings: int = None,
|
num_embeddings: int = None,
|
||||||
@ -269,7 +269,7 @@ class DiscriminatorAR(nn.Module):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
|
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
|
||||||
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
|
) -> tuple[torch.Tensor, list[torch.Tensor]]:
|
||||||
fmap = []
|
fmap = []
|
||||||
x = x.squeeze(1)
|
x = x.squeeze(1)
|
||||||
|
|
||||||
@ -309,7 +309,7 @@ class DiscriminatorAR(nn.Module):
|
|||||||
class MultiResolutionPhaseDiscriminator(nn.Module):
|
class MultiResolutionPhaseDiscriminator(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
|
resolutions: tuple[tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
|
||||||
num_embeddings: int = None,
|
num_embeddings: int = None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -319,7 +319,7 @@ class MultiResolutionPhaseDiscriminator(nn.Module):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
||||||
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
|
) -> tuple[list[torch.Tensor], list[torch.Tensor], list[list[torch.Tensor]], list[list[torch.Tensor]]]:
|
||||||
y_d_rs = []
|
y_d_rs = []
|
||||||
y_d_gs = []
|
y_d_gs = []
|
||||||
fmap_rs = []
|
fmap_rs = []
|
||||||
@ -339,7 +339,7 @@ class MultiResolutionPhaseDiscriminator(nn.Module):
|
|||||||
class DiscriminatorPR(nn.Module):
|
class DiscriminatorPR(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
resolution: Tuple[int, int, int],
|
resolution: tuple[int, int, int],
|
||||||
channels: int = 64,
|
channels: int = 64,
|
||||||
in_channels: int = 1,
|
in_channels: int = 1,
|
||||||
num_embeddings: int = None,
|
num_embeddings: int = None,
|
||||||
@ -363,7 +363,7 @@ class DiscriminatorPR(nn.Module):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
|
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
|
||||||
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
|
) -> tuple[torch.Tensor, list[torch.Tensor]]:
|
||||||
fmap = []
|
fmap = []
|
||||||
x = x.squeeze(1)
|
x = x.squeeze(1)
|
||||||
|
|
||||||
@ -402,8 +402,8 @@ class DiscriminatorPR(nn.Module):
|
|||||||
|
|
||||||
def feature_loss(fmap_r, fmap_g):
|
def feature_loss(fmap_r, fmap_g):
|
||||||
loss = 0
|
loss = 0
|
||||||
for dr, dg in zip(fmap_r, fmap_g):
|
for dr, dg in zip(fmap_r, fmap_g, strict=False):
|
||||||
for rl, gl in zip(dr, dg):
|
for rl, gl in zip(dr, dg, strict=False):
|
||||||
loss += torch.mean(torch.abs(rl - gl))
|
loss += torch.mean(torch.abs(rl - gl))
|
||||||
|
|
||||||
return loss
|
return loss
|
||||||
@ -413,7 +413,7 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
|
|||||||
loss = 0
|
loss = 0
|
||||||
r_losses = []
|
r_losses = []
|
||||||
g_losses = []
|
g_losses = []
|
||||||
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
for dr, dg in zip(disc_real_outputs, disc_generated_outputs, strict=False):
|
||||||
r_loss = torch.mean(torch.clamp(1 - dr, min=0))
|
r_loss = torch.mean(torch.clamp(1 - dr, min=0))
|
||||||
g_loss = torch.mean(torch.clamp(1 + dg, min=0))
|
g_loss = torch.mean(torch.clamp(1 + dg, min=0))
|
||||||
loss += r_loss + g_loss
|
loss += r_loss + g_loss
|
||||||
@ -5,9 +5,9 @@ def get_models():
|
|||||||
"large-v2",
|
"large-v2",
|
||||||
"large-v3",
|
"large-v3",
|
||||||
"large-v3-turbo",
|
"large-v3-turbo",
|
||||||
#"distil-large-v2",
|
# "distil-large-v2",
|
||||||
#"distil-large-v3",
|
# "distil-large-v3",
|
||||||
#"distil-large-v3.5",
|
# "distil-large-v3.5",
|
||||||
]
|
]
|
||||||
return model_size_list
|
return model_size_list
|
||||||
|
|
||||||
@ -9,9 +9,10 @@ from huggingface_hub import snapshot_download as snapshot_download_hf
|
|||||||
from modelscope import snapshot_download as snapshot_download_ms
|
from modelscope import snapshot_download as snapshot_download_ms
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from tools.asr.config import get_models
|
from gsv_tools.asr.config import get_models
|
||||||
from tools.asr.funasr_asr import only_asr
|
from gsv_tools.asr.funasr_asr import only_asr
|
||||||
from tools.my_utils import load_cudnn
|
from gsv_tools.my_utils import load_cudnn
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
language_code_list = [
|
language_code_list = [
|
||||||
@ -52,20 +53,20 @@ def download_model(model_size: str):
|
|||||||
if "distil" in model_size:
|
if "distil" in model_size:
|
||||||
if "3.5" in model_size:
|
if "3.5" in model_size:
|
||||||
repo_id = "distil-whisper/distil-large-v3.5-ct2"
|
repo_id = "distil-whisper/distil-large-v3.5-ct2"
|
||||||
model_path = "tools/asr/models/faster-distil-whisper-large-v3.5"
|
model_path = "gsv_tools/asr/models/faster-distil-whisper-large-v3.5"
|
||||||
else:
|
else:
|
||||||
repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
|
repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
|
||||||
elif model_size == "large-v3-turbo":
|
elif model_size == "large-v3-turbo":
|
||||||
repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
|
repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
|
||||||
model_path = "tools/asr/models/faster-whisper-large-v3-turbo"
|
model_path = "gsv_tools/asr/models/faster-whisper-large-v3-turbo"
|
||||||
else:
|
else:
|
||||||
repo_id = f"Systran/faster-whisper-{model_size}"
|
repo_id = f"Systran/faster-whisper-{model_size}"
|
||||||
model_path = (
|
model_path = (
|
||||||
model_path or f"tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}"
|
model_path or f"gsv_tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
repo_id = "XXXXRT/faster-whisper"
|
repo_id = "XXXXRT/faster-whisper"
|
||||||
model_path = "tools/asr/models"
|
model_path = "gsv_tools/asr/models"
|
||||||
|
|
||||||
files: list[str] = [
|
files: list[str] = [
|
||||||
"config.json",
|
"config.json",
|
||||||
@ -1,5 +1,3 @@
|
|||||||
# -*- coding:utf-8 -*-
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
@ -8,6 +6,7 @@ from funasr import AutoModel
|
|||||||
from modelscope import snapshot_download
|
from modelscope import snapshot_download
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
funasr_models = {} # 存储模型避免重复加载
|
funasr_models = {} # 存储模型避免重复加载
|
||||||
|
|
||||||
|
|
||||||
@ -23,27 +22,27 @@ def only_asr(input_file, language):
|
|||||||
|
|
||||||
def create_model(language="zh"):
|
def create_model(language="zh"):
|
||||||
if language == "zh":
|
if language == "zh":
|
||||||
path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
|
path_vad = "gsv_tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
|
||||||
path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
|
path_punc = "gsv_tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
|
||||||
path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
|
path_asr = "gsv_tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
"iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
"iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||||
local_dir="tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
local_dir="gsv_tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||||
)
|
)
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
"iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
|
"iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
|
||||||
local_dir="tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
|
local_dir="gsv_tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
|
||||||
)
|
)
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
"iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
"iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
||||||
local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
local_dir="gsv_tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
||||||
)
|
)
|
||||||
model_revision = "v2.0.4"
|
model_revision = "v2.0.4"
|
||||||
elif language == "yue":
|
elif language == "yue":
|
||||||
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
|
path_asr = "gsv_tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
"iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
|
"iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
|
||||||
local_dir="tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
|
local_dir="gsv_tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
|
||||||
)
|
)
|
||||||
path_vad = path_punc = None
|
path_vad = path_punc = None
|
||||||
vad_model_revision = punc_model_revision = ""
|
vad_model_revision = punc_model_revision = ""
|
||||||
@ -1,23 +1,24 @@
|
|||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
|
AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
|
||||||
sys.path.append(AP_BWE_main_dir_path)
|
sys.path.append(AP_BWE_main_dir_path)
|
||||||
import json
|
import json
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio.functional as aF
|
import torchaudio.functional as aF
|
||||||
# from attrdict import AttrDict####will be bug in py3.10
|
|
||||||
|
|
||||||
from datasets1.dataset import amp_pha_stft, amp_pha_istft
|
# from attrdict import AttrDict####will be bug in py3.10
|
||||||
|
from datasets1.dataset import amp_pha_istft, amp_pha_stft
|
||||||
from models.model import APNet_BWE_Model
|
from models.model import APNet_BWE_Model
|
||||||
|
|
||||||
|
|
||||||
class AP_BWE:
|
class AP_BWE:
|
||||||
def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
|
def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
|
||||||
if checkpoint_file == None:
|
if checkpoint_file is None:
|
||||||
checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path)
|
checkpoint_file = f"{AP_BWE_main_dir_path}/24kto48k/g_24kto48k.zip"
|
||||||
if os.path.exists(checkpoint_file) == False:
|
if not os.path.exists(checkpoint_file):
|
||||||
raise FileNotFoundError
|
raise FileNotFoundError
|
||||||
config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
|
config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
|
||||||
with open(config_file) as f:
|
with open(config_file) as f:
|
||||||
@ -1,12 +1,13 @@
|
|||||||
import os
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
from modelscope.pipelines import pipeline
|
from modelscope.pipelines import pipeline
|
||||||
from modelscope.utils.constant import Tasks
|
from modelscope.utils.constant import Tasks
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k"
|
|
||||||
|
path_denoise = "gsv_tools/denoise-model/speech_frcrn_ans_cirm_16k"
|
||||||
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
|
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
|
||||||
ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
|
ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
|
||||||
|
|
||||||
@ -17,7 +18,7 @@ def execute_denoise(input_folder, output_folder):
|
|||||||
# print(list(os.listdir(input_folder).sort()))
|
# print(list(os.listdir(input_folder).sort()))
|
||||||
for name in tqdm(os.listdir(input_folder)):
|
for name in tqdm(os.listdir(input_folder)):
|
||||||
try:
|
try:
|
||||||
ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name))
|
ans(f"{input_folder}/{name}", output_path=f"{output_folder}/{name}")
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
@ -2,11 +2,12 @@ import json
|
|||||||
import locale
|
import locale
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
|
I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
|
||||||
|
|
||||||
|
|
||||||
def load_language_list(language):
|
def load_language_list(language):
|
||||||
with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f:
|
with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), encoding="utf-8") as f:
|
||||||
language_list = json.load(f)
|
language_list = json.load(f)
|
||||||
return language_list
|
return language_list
|
||||||
|
|
||||||
@ -4,6 +4,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
|
||||||
I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
|
I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
|
||||||
DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言
|
DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言
|
||||||
TITLE_LEN: int = 60 # 标题显示长度
|
TITLE_LEN: int = 60 # 标题显示长度
|
||||||
@ -36,7 +37,7 @@ def scan_i18n_strings():
|
|||||||
print(" Scanning Files and Extracting i18n Strings ".center(TITLE_LEN, "="))
|
print(" Scanning Files and Extracting i18n Strings ".center(TITLE_LEN, "="))
|
||||||
for filename in glob.iglob("**/*.py", recursive=True):
|
for filename in glob.iglob("**/*.py", recursive=True):
|
||||||
try:
|
try:
|
||||||
with open(filename, "r", encoding="utf-8") as f:
|
with open(filename, encoding="utf-8") as f:
|
||||||
code = f.read()
|
code = f.read()
|
||||||
if "I18nAuto" in code:
|
if "I18nAuto" in code:
|
||||||
tree = ast.parse(code)
|
tree = ast.parse(code)
|
||||||
@ -57,7 +58,7 @@ def update_i18n_json(json_file, standard_keys):
|
|||||||
standard_keys = sorted(standard_keys)
|
standard_keys = sorted(standard_keys)
|
||||||
print(f" Process {json_file} ".center(TITLE_LEN, "="))
|
print(f" Process {json_file} ".center(TITLE_LEN, "="))
|
||||||
# 读取 JSON 文件
|
# 读取 JSON 文件
|
||||||
with open(json_file, "r", encoding="utf-8") as f:
|
with open(json_file, encoding="utf-8") as f:
|
||||||
json_data = json.load(f, object_pairs_hook=OrderedDict)
|
json_data = json.load(f, object_pairs_hook=OrderedDict)
|
||||||
# 打印处理前的 JSON 条目数
|
# 打印处理前的 JSON 条目数
|
||||||
len_before = len(json_data)
|
len_before = len(json_data)
|
||||||
@ -8,7 +8,8 @@ import gradio as gr
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from tools.i18n.i18n import I18nAuto
|
from gsv_tools.i18n.i18n import I18nAuto
|
||||||
|
|
||||||
|
|
||||||
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
|
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
|
||||||
|
|
||||||
@ -48,7 +49,7 @@ def clean_path(path_str: str):
|
|||||||
|
|
||||||
def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False):
|
def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False):
|
||||||
files_status = []
|
files_status = []
|
||||||
if is_train == True and file_list:
|
if is_train and file_list:
|
||||||
file_list.append(os.path.join(file_list[0], "2-name2text.txt"))
|
file_list.append(os.path.join(file_list[0], "2-name2text.txt"))
|
||||||
file_list.append(os.path.join(file_list[0], "3-bert"))
|
file_list.append(os.path.join(file_list[0], "3-bert"))
|
||||||
file_list.append(os.path.join(file_list[0], "4-cnhubert"))
|
file_list.append(os.path.join(file_list[0], "4-cnhubert"))
|
||||||
@ -61,7 +62,7 @@ def check_for_existance(file_list: list = None, is_train=False, is_dataset_proce
|
|||||||
files_status.append(False)
|
files_status.append(False)
|
||||||
if sum(files_status) != len(files_status):
|
if sum(files_status) != len(files_status):
|
||||||
if is_train:
|
if is_train:
|
||||||
for file, status in zip(file_list, files_status):
|
for file, status in zip(file_list, files_status, strict=False):
|
||||||
if status:
|
if status:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
@ -97,13 +98,13 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
|
|||||||
if not os.path.isdir(audio_path):
|
if not os.path.isdir(audio_path):
|
||||||
gr.Warning(i18n("请填入正确的音频文件夹路径"))
|
gr.Warning(i18n("请填入正确的音频文件夹路径"))
|
||||||
return
|
return
|
||||||
with open(list_path, "r", encoding="utf8") as f:
|
with open(list_path, encoding="utf8") as f:
|
||||||
line = f.readline().strip("\n").split("\n")
|
line = f.readline().strip("\n").split("\n")
|
||||||
wav_name, _, __, ___ = line[0].split("|")
|
wav_name, _, __, ___ = line[0].split("|")
|
||||||
wav_name = clean_path(wav_name)
|
wav_name = clean_path(wav_name)
|
||||||
if audio_path != "" and audio_path != None:
|
if audio_path != "" and audio_path is not None:
|
||||||
wav_name = os.path.basename(wav_name)
|
wav_name = os.path.basename(wav_name)
|
||||||
wav_path = "%s/%s" % (audio_path, wav_name)
|
wav_path = f"{audio_path}/{wav_name}"
|
||||||
else:
|
else:
|
||||||
wav_path = wav_name
|
wav_path = wav_name
|
||||||
if os.path.exists(wav_path):
|
if os.path.exists(wav_path):
|
||||||
@ -117,7 +118,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
|
|||||||
path_list.append(os.path.join(path_list[0], "5-wav32k"))
|
path_list.append(os.path.join(path_list[0], "5-wav32k"))
|
||||||
path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv"))
|
path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv"))
|
||||||
phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
|
phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
|
||||||
with open(phone_path, "r", encoding="utf-8") as f:
|
with open(phone_path, encoding="utf-8") as f:
|
||||||
if f.read(1):
|
if f.read(1):
|
||||||
...
|
...
|
||||||
else:
|
else:
|
||||||
@ -1,13 +1,14 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import numpy as np
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from scipy.io import wavfile
|
from scipy.io import wavfile
|
||||||
|
from slicer2 import Slicer
|
||||||
|
|
||||||
# parent_directory = os.path.dirname(os.path.abspath(__file__))
|
# parent_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
# sys.path.append(parent_directory)
|
# sys.path.append(parent_directory)
|
||||||
from tools.my_utils import load_audio
|
from gsv_tools.my_utils import load_audio
|
||||||
from slicer2 import Slicer
|
|
||||||
|
|
||||||
|
|
||||||
def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
|
def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
|
||||||
@ -1,5 +1,7 @@
|
|||||||
import sys
|
import sys
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
|
||||||
|
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
|
|
||||||
|
|
||||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
|
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
|
||||||
i18n = I18nAuto(language=language)
|
i18n = I18nAuto(language=language)
|
||||||
@ -9,6 +11,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import gradio.analytics as analytics
|
import gradio.analytics as analytics
|
||||||
|
|
||||||
@ -21,6 +24,7 @@ import librosa
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import soundfile
|
import soundfile
|
||||||
|
|
||||||
|
|
||||||
g_json_key_text = ""
|
g_json_key_text = ""
|
||||||
g_json_key_path = ""
|
g_json_key_path = ""
|
||||||
g_load_file = ""
|
g_load_file = ""
|
||||||
@ -113,7 +117,7 @@ def b_delete_audio(*checkbox_list):
|
|||||||
change = False
|
change = False
|
||||||
for i, checkbox in reversed(list(enumerate(checkbox_list))):
|
for i, checkbox in reversed(list(enumerate(checkbox_list))):
|
||||||
if g_index + i < len(g_data_json):
|
if g_index + i < len(g_data_json):
|
||||||
if checkbox == True:
|
if checkbox:
|
||||||
g_data_json.pop(g_index + i)
|
g_data_json.pop(g_index + i)
|
||||||
change = True
|
change = True
|
||||||
|
|
||||||
@ -150,7 +154,7 @@ def b_audio_split(audio_breakpoint, *checkbox_list):
|
|||||||
global g_data_json, g_max_json_index
|
global g_data_json, g_max_json_index
|
||||||
checked_index = []
|
checked_index = []
|
||||||
for i, checkbox in enumerate(checkbox_list):
|
for i, checkbox in enumerate(checkbox_list):
|
||||||
if checkbox == True and g_index + i < len(g_data_json):
|
if checkbox and g_index + i < len(g_data_json):
|
||||||
checked_index.append(g_index + i)
|
checked_index.append(g_index + i)
|
||||||
if len(checked_index) == 1:
|
if len(checked_index) == 1:
|
||||||
index = checked_index[0]
|
index = checked_index[0]
|
||||||
@ -182,7 +186,7 @@ def b_merge_audio(interval_r, *checkbox_list):
|
|||||||
audios_path = []
|
audios_path = []
|
||||||
audios_text = []
|
audios_text = []
|
||||||
for i, checkbox in enumerate(checkbox_list):
|
for i, checkbox in enumerate(checkbox_list):
|
||||||
if checkbox == True and g_index + i < len(g_data_json):
|
if checkbox and g_index + i < len(g_data_json):
|
||||||
checked_index.append(g_index + i)
|
checked_index.append(g_index + i)
|
||||||
|
|
||||||
if len(checked_index) > 1:
|
if len(checked_index) > 1:
|
||||||
@ -237,7 +241,7 @@ def b_save_list():
|
|||||||
|
|
||||||
def b_load_json():
|
def b_load_json():
|
||||||
global g_data_json, g_max_json_index
|
global g_data_json, g_max_json_index
|
||||||
with open(g_load_file, "r", encoding="utf-8") as file:
|
with open(g_load_file, encoding="utf-8") as file:
|
||||||
g_data_json = file.readlines()
|
g_data_json = file.readlines()
|
||||||
g_data_json = [json.loads(line) for line in g_data_json]
|
g_data_json = [json.loads(line) for line in g_data_json]
|
||||||
g_max_json_index = len(g_data_json) - 1
|
g_max_json_index = len(g_data_json) - 1
|
||||||
@ -245,7 +249,7 @@ def b_load_json():
|
|||||||
|
|
||||||
def b_load_list():
|
def b_load_list():
|
||||||
global g_data_json, g_max_json_index
|
global g_data_json, g_max_json_index
|
||||||
with open(g_load_file, "r", encoding="utf-8") as source:
|
with open(g_load_file, encoding="utf-8") as source:
|
||||||
data_list = source.readlines()
|
data_list = source.readlines()
|
||||||
for _ in data_list:
|
for _ in data_list:
|
||||||
data = _.split("|")
|
data = _.split("|")
|
||||||
@ -1,7 +1,7 @@
|
|||||||
from packaging import version
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn, einsum
|
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
from packaging import version
|
||||||
|
from torch import einsum, nn
|
||||||
|
|
||||||
|
|
||||||
def exists(val):
|
def exists(val):
|
||||||
@ -1,21 +1,20 @@
|
|||||||
|
from collections.abc import Callable
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
|
||||||
from torch.nn import Module, ModuleList
|
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
from einops import pack, rearrange, unpack
|
||||||
|
from einops.layers.torch import Rearrange
|
||||||
|
|
||||||
from bs_roformer.attend import Attend
|
|
||||||
from torch.utils.checkpoint import checkpoint
|
|
||||||
|
|
||||||
from typing import Tuple, Optional, Callable
|
|
||||||
# from beartype.typing import Tuple, Optional, List, Callable
|
# from beartype.typing import Tuple, Optional, List, Callable
|
||||||
# from beartype import beartype
|
# from beartype import beartype
|
||||||
|
|
||||||
from rotary_embedding_torch import RotaryEmbedding
|
from rotary_embedding_torch import RotaryEmbedding
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import Module, ModuleList
|
||||||
|
from torch.utils.checkpoint import checkpoint
|
||||||
|
|
||||||
|
from bs_roformer.attend import Attend
|
||||||
|
|
||||||
from einops import rearrange, pack, unpack
|
|
||||||
from einops.layers.torch import Rearrange
|
|
||||||
|
|
||||||
# helper functions
|
# helper functions
|
||||||
|
|
||||||
@ -192,7 +191,7 @@ class Transformer(Module):
|
|||||||
|
|
||||||
class BandSplit(Module):
|
class BandSplit(Module):
|
||||||
# @beartype
|
# @beartype
|
||||||
def __init__(self, dim, dim_inputs: Tuple[int, ...]):
|
def __init__(self, dim, dim_inputs: tuple[int, ...]):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.dim_inputs = dim_inputs
|
self.dim_inputs = dim_inputs
|
||||||
self.to_features = ModuleList([])
|
self.to_features = ModuleList([])
|
||||||
@ -206,7 +205,7 @@ class BandSplit(Module):
|
|||||||
x = x.split(self.dim_inputs, dim=-1)
|
x = x.split(self.dim_inputs, dim=-1)
|
||||||
|
|
||||||
outs = []
|
outs = []
|
||||||
for split_input, to_feature in zip(x, self.to_features):
|
for split_input, to_feature in zip(x, self.to_features, strict=False):
|
||||||
split_output = to_feature(split_input)
|
split_output = to_feature(split_input)
|
||||||
outs.append(split_output)
|
outs.append(split_output)
|
||||||
|
|
||||||
@ -219,7 +218,7 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
|
|||||||
net = []
|
net = []
|
||||||
dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
|
dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
|
||||||
|
|
||||||
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
|
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:], strict=False)):
|
||||||
is_last = ind == (len(dims) - 2)
|
is_last = ind == (len(dims) - 2)
|
||||||
|
|
||||||
net.append(nn.Linear(layer_dim_in, layer_dim_out))
|
net.append(nn.Linear(layer_dim_in, layer_dim_out))
|
||||||
@ -234,15 +233,13 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
|
|||||||
|
|
||||||
class MaskEstimator(Module):
|
class MaskEstimator(Module):
|
||||||
# @beartype
|
# @beartype
|
||||||
def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
|
def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.dim_inputs = dim_inputs
|
self.dim_inputs = dim_inputs
|
||||||
self.to_freqs = ModuleList([])
|
self.to_freqs = ModuleList([])
|
||||||
dim_hidden = dim * mlp_expansion_factor
|
dim_hidden = dim * mlp_expansion_factor
|
||||||
|
|
||||||
for dim_in in dim_inputs:
|
for dim_in in dim_inputs:
|
||||||
net = []
|
|
||||||
|
|
||||||
mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
|
mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
|
||||||
|
|
||||||
self.to_freqs.append(mlp)
|
self.to_freqs.append(mlp)
|
||||||
@ -252,7 +249,7 @@ class MaskEstimator(Module):
|
|||||||
|
|
||||||
outs = []
|
outs = []
|
||||||
|
|
||||||
for band_features, mlp in zip(x, self.to_freqs):
|
for band_features, mlp in zip(x, self.to_freqs, strict=False):
|
||||||
freq_out = mlp(band_features)
|
freq_out = mlp(band_features)
|
||||||
outs.append(freq_out)
|
outs.append(freq_out)
|
||||||
|
|
||||||
@ -339,7 +336,7 @@ class BSRoformer(Module):
|
|||||||
time_transformer_depth=2,
|
time_transformer_depth=2,
|
||||||
freq_transformer_depth=2,
|
freq_transformer_depth=2,
|
||||||
linear_transformer_depth=0,
|
linear_transformer_depth=0,
|
||||||
freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
|
freqs_per_bands: tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
|
||||||
# in the paper, they divide into ~60 bands, test with 1 for starters
|
# in the paper, they divide into ~60 bands, test with 1 for starters
|
||||||
dim_head=64,
|
dim_head=64,
|
||||||
heads=8,
|
heads=8,
|
||||||
@ -352,10 +349,10 @@ class BSRoformer(Module):
|
|||||||
# 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
|
# 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
|
||||||
stft_win_length=2048,
|
stft_win_length=2048,
|
||||||
stft_normalized=False,
|
stft_normalized=False,
|
||||||
stft_window_fn: Optional[Callable] = None,
|
stft_window_fn: Callable | None = None,
|
||||||
mask_estimator_depth=2,
|
mask_estimator_depth=2,
|
||||||
multi_stft_resolution_loss_weight=1.0,
|
multi_stft_resolution_loss_weight=1.0,
|
||||||
multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
|
multi_stft_resolutions_window_sizes: tuple[int, ...] = (4096, 2048, 1024, 512, 256),
|
||||||
multi_stft_hop_size=147,
|
multi_stft_hop_size=147,
|
||||||
multi_stft_normalized=False,
|
multi_stft_normalized=False,
|
||||||
multi_stft_window_fn: Callable = torch.hann_window,
|
multi_stft_window_fn: Callable = torch.hann_window,
|
||||||
@ -1,23 +1,20 @@
|
|||||||
|
from collections.abc import Callable
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
|
||||||
from torch.nn import Module, ModuleList
|
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
from einops import pack, rearrange, reduce, repeat, unpack
|
||||||
|
from einops.layers.torch import Rearrange
|
||||||
|
from librosa import filters
|
||||||
|
|
||||||
from bs_roformer.attend import Attend
|
|
||||||
from torch.utils.checkpoint import checkpoint
|
|
||||||
|
|
||||||
from typing import Tuple, Optional, Callable
|
|
||||||
# from beartype.typing import Tuple, Optional, List, Callable
|
# from beartype.typing import Tuple, Optional, List, Callable
|
||||||
# from beartype import beartype
|
# from beartype import beartype
|
||||||
|
|
||||||
from rotary_embedding_torch import RotaryEmbedding
|
from rotary_embedding_torch import RotaryEmbedding
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import Module, ModuleList
|
||||||
|
from torch.utils.checkpoint import checkpoint
|
||||||
|
|
||||||
from einops import rearrange, pack, unpack, reduce, repeat
|
from bs_roformer.attend import Attend
|
||||||
from einops.layers.torch import Rearrange
|
|
||||||
|
|
||||||
from librosa import filters
|
|
||||||
|
|
||||||
|
|
||||||
# helper functions
|
# helper functions
|
||||||
@ -201,7 +198,7 @@ class Transformer(Module):
|
|||||||
|
|
||||||
class BandSplit(Module):
|
class BandSplit(Module):
|
||||||
# @beartype
|
# @beartype
|
||||||
def __init__(self, dim, dim_inputs: Tuple[int, ...]):
|
def __init__(self, dim, dim_inputs: tuple[int, ...]):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.dim_inputs = dim_inputs
|
self.dim_inputs = dim_inputs
|
||||||
self.to_features = ModuleList([])
|
self.to_features = ModuleList([])
|
||||||
@ -215,7 +212,7 @@ class BandSplit(Module):
|
|||||||
x = x.split(self.dim_inputs, dim=-1)
|
x = x.split(self.dim_inputs, dim=-1)
|
||||||
|
|
||||||
outs = []
|
outs = []
|
||||||
for split_input, to_feature in zip(x, self.to_features):
|
for split_input, to_feature in zip(x, self.to_features, strict=False):
|
||||||
split_output = to_feature(split_input)
|
split_output = to_feature(split_input)
|
||||||
outs.append(split_output)
|
outs.append(split_output)
|
||||||
|
|
||||||
@ -228,7 +225,7 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
|
|||||||
net = []
|
net = []
|
||||||
dims = (dim_in, *((dim_hidden,) * depth), dim_out)
|
dims = (dim_in, *((dim_hidden,) * depth), dim_out)
|
||||||
|
|
||||||
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
|
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:], strict=False)):
|
||||||
is_last = ind == (len(dims) - 2)
|
is_last = ind == (len(dims) - 2)
|
||||||
|
|
||||||
net.append(nn.Linear(layer_dim_in, layer_dim_out))
|
net.append(nn.Linear(layer_dim_in, layer_dim_out))
|
||||||
@ -243,15 +240,13 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
|
|||||||
|
|
||||||
class MaskEstimator(Module):
|
class MaskEstimator(Module):
|
||||||
# @beartype
|
# @beartype
|
||||||
def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
|
def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.dim_inputs = dim_inputs
|
self.dim_inputs = dim_inputs
|
||||||
self.to_freqs = ModuleList([])
|
self.to_freqs = ModuleList([])
|
||||||
dim_hidden = dim * mlp_expansion_factor
|
dim_hidden = dim * mlp_expansion_factor
|
||||||
|
|
||||||
for dim_in in dim_inputs:
|
for dim_in in dim_inputs:
|
||||||
net = []
|
|
||||||
|
|
||||||
mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
|
mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
|
||||||
|
|
||||||
self.to_freqs.append(mlp)
|
self.to_freqs.append(mlp)
|
||||||
@ -261,7 +256,7 @@ class MaskEstimator(Module):
|
|||||||
|
|
||||||
outs = []
|
outs = []
|
||||||
|
|
||||||
for band_features, mlp in zip(x, self.to_freqs):
|
for band_features, mlp in zip(x, self.to_freqs, strict=False):
|
||||||
freq_out = mlp(band_features)
|
freq_out = mlp(band_features)
|
||||||
outs.append(freq_out)
|
outs.append(freq_out)
|
||||||
|
|
||||||
@ -296,10 +291,10 @@ class MelBandRoformer(Module):
|
|||||||
# 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
|
# 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
|
||||||
stft_win_length=2048,
|
stft_win_length=2048,
|
||||||
stft_normalized=False,
|
stft_normalized=False,
|
||||||
stft_window_fn: Optional[Callable] = None,
|
stft_window_fn: Callable | None = None,
|
||||||
mask_estimator_depth=1,
|
mask_estimator_depth=1,
|
||||||
multi_stft_resolution_loss_weight=1.0,
|
multi_stft_resolution_loss_weight=1.0,
|
||||||
multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
|
multi_stft_resolutions_window_sizes: tuple[int, ...] = (4096, 2048, 1024, 512, 256),
|
||||||
multi_stft_hop_size=147,
|
multi_stft_hop_size=147,
|
||||||
multi_stft_normalized=False,
|
multi_stft_normalized=False,
|
||||||
multi_stft_window_fn: Callable = torch.hann_window,
|
multi_stft_window_fn: Callable = torch.hann_window,
|
||||||
@ -10,12 +10,13 @@ import torch.nn as nn
|
|||||||
import yaml
|
import yaml
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
|
|
||||||
|
|
||||||
class Roformer_Loader:
|
class Roformer_Loader:
|
||||||
def get_config(self, config_path):
|
def get_config(self, config_path):
|
||||||
with open(config_path, "r", encoding="utf-8") as f:
|
with open(config_path, encoding="utf-8") as f:
|
||||||
# use fullloader to load tag !!python/tuple, code can be improved
|
# use fullloader to load tag !!python/tuple, code can be improved
|
||||||
config = yaml.load(f, Loader=yaml.FullLoader)
|
config = yaml.load(f, Loader=yaml.FullLoader)
|
||||||
return config
|
return config
|
||||||
@ -104,7 +105,7 @@ class Roformer_Loader:
|
|||||||
|
|
||||||
model = MelBandRoformer(**dict(self.config["model"]))
|
model = MelBandRoformer(**dict(self.config["model"]))
|
||||||
else:
|
else:
|
||||||
print("Error: Unknown model: {}".format(self.model_type))
|
print(f"Error: Unknown model: {self.model_type}")
|
||||||
model = None
|
model = None
|
||||||
return model
|
return model
|
||||||
|
|
||||||
@ -192,9 +193,11 @@ class Roformer_Loader:
|
|||||||
progress_bar.close()
|
progress_bar.close()
|
||||||
|
|
||||||
if self.config["training"]["target_instrument"] is None:
|
if self.config["training"]["target_instrument"] is None:
|
||||||
return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources)}
|
return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources, strict=False)}
|
||||||
else:
|
else:
|
||||||
return {k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources)}
|
return {
|
||||||
|
k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources, strict=False)
|
||||||
|
}
|
||||||
|
|
||||||
def run_folder(self, input, vocal_root, others_root, format):
|
def run_folder(self, input, vocal_root, others_root, format):
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
@ -210,8 +213,8 @@ class Roformer_Loader:
|
|||||||
try:
|
try:
|
||||||
mix, sr = librosa.load(path, sr=sample_rate, mono=False)
|
mix, sr = librosa.load(path, sr=sample_rate, mono=False)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Can read track: {}".format(path))
|
print(f"Can read track: {path}")
|
||||||
print("Error message: {}".format(str(e)))
|
print(f"Error message: {str(e)}")
|
||||||
return
|
return
|
||||||
|
|
||||||
# in case if model only supports mono tracks
|
# in case if model only supports mono tracks
|
||||||
@ -232,17 +235,17 @@ class Roformer_Loader:
|
|||||||
other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument]
|
other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument]
|
||||||
other = mix_orig - res[target_instrument] # caculate other instruments
|
other = mix_orig - res[target_instrument] # caculate other instruments
|
||||||
|
|
||||||
path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, target_instrument)
|
path_vocal = f"{vocal_root}/{file_base_name}_{target_instrument}.wav"
|
||||||
path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other_instruments[0])
|
path_other = f"{others_root}/{file_base_name}_{other_instruments[0]}.wav"
|
||||||
self.save_audio(path_vocal, res[target_instrument].T, sr, format)
|
self.save_audio(path_vocal, res[target_instrument].T, sr, format)
|
||||||
self.save_audio(path_other, other.T, sr, format)
|
self.save_audio(path_other, other.T, sr, format)
|
||||||
else:
|
else:
|
||||||
# if target instrument is not specified, save the first instrument as vocal and the rest as others
|
# if target instrument is not specified, save the first instrument as vocal and the rest as others
|
||||||
vocal_inst = self.config["training"]["instruments"][0]
|
vocal_inst = self.config["training"]["instruments"][0]
|
||||||
path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, vocal_inst)
|
path_vocal = f"{vocal_root}/{file_base_name}_{vocal_inst}.wav"
|
||||||
self.save_audio(path_vocal, res[vocal_inst].T, sr, format)
|
self.save_audio(path_vocal, res[vocal_inst].T, sr, format)
|
||||||
for other in self.config["training"]["instruments"][1:]: # save other instruments
|
for other in self.config["training"]["instruments"][1:]: # save other instruments
|
||||||
path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other)
|
path_other = f"{others_root}/{file_base_name}_{other}.wav"
|
||||||
self.save_audio(path_other, res[other].T, sr, format)
|
self.save_audio(path_other, res[other].T, sr, format)
|
||||||
|
|
||||||
def save_audio(self, path, data, sr, format):
|
def save_audio(self, path, data, sr, format):
|
||||||
@ -253,7 +256,7 @@ class Roformer_Loader:
|
|||||||
sf.write(path, data, sr)
|
sf.write(path, data, sr)
|
||||||
else:
|
else:
|
||||||
sf.write(path, data, sr)
|
sf.write(path, data, sr)
|
||||||
os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format))
|
os.system(f'ffmpeg -i "{path}" -vn "{path[:-3] + format}" -q:a 2 -y')
|
||||||
try:
|
try:
|
||||||
os.remove(path)
|
os.remove(path)
|
||||||
except:
|
except:
|
||||||
@ -275,7 +278,7 @@ class Roformer_Loader:
|
|||||||
if self.model_type is None:
|
if self.model_type is None:
|
||||||
# if model_type is still None, raise an error
|
# if model_type is still None, raise an error
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
|
"Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'gsv_tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
|
||||||
)
|
)
|
||||||
self.config = self.get_default_config()
|
self.config = self.get_default_config()
|
||||||
else:
|
else:
|
||||||
@ -290,12 +293,12 @@ class Roformer_Loader:
|
|||||||
# else it's a mel_band_roformer model
|
# else it's a mel_band_roformer model
|
||||||
self.model_type = "mel_band_roformer"
|
self.model_type = "mel_band_roformer"
|
||||||
|
|
||||||
print("Detected model type: {}".format(self.model_type))
|
print(f"Detected model type: {self.model_type}")
|
||||||
model = self.get_model_from_config()
|
model = self.get_model_from_config()
|
||||||
state_dict = torch.load(model_path, map_location="cpu")
|
state_dict = torch.load(model_path, map_location="cpu")
|
||||||
model.load_state_dict(state_dict)
|
model.load_state_dict(state_dict)
|
||||||
|
|
||||||
if is_half == False:
|
if not is_half:
|
||||||
self.model = model.to(device)
|
self.model = model.to(device)
|
||||||
else:
|
else:
|
||||||
self.model = model.half().to(device)
|
self.model = model.half().to(device)
|
||||||
@ -38,7 +38,7 @@ def make_pair(mix_dir, inst_dir):
|
|||||||
[os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts]
|
[os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts]
|
||||||
)
|
)
|
||||||
|
|
||||||
filelist = list(zip(X_list, y_list))
|
filelist = list(zip(X_list, y_list, strict=False))
|
||||||
|
|
||||||
return filelist
|
return filelist
|
||||||
|
|
||||||
@ -138,10 +138,10 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
|
|||||||
|
|
||||||
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
|
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
|
||||||
patch_list = []
|
patch_list = []
|
||||||
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset)
|
patch_dir = f"cs{cropsize}_sr{sr}_hl{hop_length}_nf{n_fft}_of{offset}"
|
||||||
os.makedirs(patch_dir, exist_ok=True)
|
os.makedirs(patch_dir, exist_ok=True)
|
||||||
|
|
||||||
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
|
for _i, (X_path, y_path) in enumerate(tqdm(filelist)):
|
||||||
basename = os.path.splitext(os.path.basename(X_path))[0]
|
basename = os.path.splitext(os.path.basename(X_path))[0]
|
||||||
|
|
||||||
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
|
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
|
||||||
@ -154,7 +154,7 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
|
|||||||
|
|
||||||
len_dataset = int(np.ceil(X.shape[2] / roi_size))
|
len_dataset = int(np.ceil(X.shape[2] / roi_size))
|
||||||
for j in range(len_dataset):
|
for j in range(len_dataset):
|
||||||
outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
|
outpath = os.path.join(patch_dir, f"{basename}_p{j}.npz")
|
||||||
start = j * roi_size
|
start = j * roi_size
|
||||||
if not os.path.exists(outpath):
|
if not os.path.exists(outpath):
|
||||||
np.savez(
|
np.savez(
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class SeperableConv2DBNActiv(nn.Module):
|
class SeperableConv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(SeperableConv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
|
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class SeperableConv2DBNActiv(nn.Module):
|
class SeperableConv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(SeperableConv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
|
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class SeperableConv2DBNActiv(nn.Module):
|
class SeperableConv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(SeperableConv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
|
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class SeperableConv2DBNActiv(nn.Module):
|
class SeperableConv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(SeperableConv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
|
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class SeperableConv2DBNActiv(nn.Module):
|
class SeperableConv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(SeperableConv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
|
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class SeperableConv2DBNActiv(nn.Module):
|
class SeperableConv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(SeperableConv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
|
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
|
||||||
@ -7,7 +7,7 @@ from . import spec_utils
|
|||||||
|
|
||||||
class Conv2DBNActiv(nn.Module):
|
class Conv2DBNActiv(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
|
||||||
super(Conv2DBNActiv, self).__init__()
|
super().__init__()
|
||||||
self.conv = nn.Sequential(
|
self.conv = nn.Sequential(
|
||||||
nn.Conv2d(
|
nn.Conv2d(
|
||||||
nin,
|
nin,
|
||||||
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
|
|||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
|
||||||
super(Encoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
|
||||||
self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
|
self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
|
||||||
|
|
||||||
@ -41,7 +41,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
class Decoder(nn.Module):
|
class Decoder(nn.Module):
|
||||||
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
|
||||||
super(Decoder, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
|
||||||
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
|
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
|
||||||
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
self.dropout = nn.Dropout2d(0.1) if dropout else None
|
||||||
@ -64,7 +64,7 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
class ASPPModule(nn.Module):
|
class ASPPModule(nn.Module):
|
||||||
def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
|
def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
|
||||||
super(ASPPModule, self).__init__()
|
super().__init__()
|
||||||
self.conv1 = nn.Sequential(
|
self.conv1 = nn.Sequential(
|
||||||
nn.AdaptiveAvgPool2d((1, None)),
|
nn.AdaptiveAvgPool2d((1, None)),
|
||||||
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
|
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
|
||||||
@ -94,7 +94,7 @@ class ASPPModule(nn.Module):
|
|||||||
|
|
||||||
class LSTMModule(nn.Module):
|
class LSTMModule(nn.Module):
|
||||||
def __init__(self, nin_conv, nin_lstm, nout_lstm):
|
def __init__(self, nin_conv, nin_lstm, nout_lstm):
|
||||||
super(LSTMModule, self).__init__()
|
super().__init__()
|
||||||
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
|
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
|
||||||
self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
|
self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
|
||||||
self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())
|
self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())
|
||||||
@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
|
|
||||||
default_param = {}
|
default_param = {}
|
||||||
default_param["bins"] = 768
|
default_param["bins"] = 768
|
||||||
default_param["unstable_bins"] = 9 # training only
|
default_param["unstable_bins"] = 9 # training only
|
||||||
@ -41,7 +42,7 @@ def int_keys(d):
|
|||||||
return r
|
return r
|
||||||
|
|
||||||
|
|
||||||
class ModelParameters(object):
|
class ModelParameters:
|
||||||
def __init__(self, config_path=""):
|
def __init__(self, config_path=""):
|
||||||
if ".pth" == pathlib.Path(config_path).suffix:
|
if ".pth" == pathlib.Path(config_path).suffix:
|
||||||
import zipfile
|
import zipfile
|
||||||
@ -49,7 +50,7 @@ class ModelParameters(object):
|
|||||||
with zipfile.ZipFile(config_path, "r") as zip:
|
with zipfile.ZipFile(config_path, "r") as zip:
|
||||||
self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
|
self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
|
||||||
elif ".json" == pathlib.Path(config_path).suffix:
|
elif ".json" == pathlib.Path(config_path).suffix:
|
||||||
with open(config_path, "r") as f:
|
with open(config_path) as f:
|
||||||
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
|
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
|
||||||
else:
|
else:
|
||||||
self.param = default_param
|
self.param = default_param
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user