Merge 319a09fa2d54335ab4893190c860c77beed76a83 into 2d9193b0d3c0eae0c3a14d8c68a839f1bae157dc

This commit is contained in:
XXXXRT666 2026-02-17 04:35:59 +00:00 committed by GitHub
commit 60a25737d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
131 changed files with 10190 additions and 1911 deletions

View File

@ -1,6 +1,6 @@
GPT_SoVITS/pretrained_models/*
tools/asr/models/*
tools/uvr5/uvr5_weights/*
gsv_tools/asr/models/*
gsv_tools/uvr5/uvr5_weights/*
.git
.DS_Store
@ -21,6 +21,8 @@ cfg.json
speakers.json
ref_audios
pylock.toml
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/

View File

@ -92,13 +92,13 @@ Write-Host "[INFO] Download G2PWModel..."
DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"
Write-Host "[INFO] Download UVR5 model..."
DownloadAndUnzip $UVR5_URL "tools\uvr5"
DownloadAndUnzip $UVR5_URL "gsv_tools\uvr5"
Write-Host "[INFO] Downloading funasr..."
$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
$funasrZip = "$tmpDir\funasr.zip"
Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\gsv_tools\asr\models" -Force
Remove-Item $funasrZip
Write-Host "[INFO] Download ffmpeg..."

46
.gitignore vendored
View File

@ -1,24 +1,3 @@
.DS_Store
.vscode
__pycache__
*.pyc
env
runtime
.idea
output
logs
SoVITS_weights*/
GPT_weights*/
TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios
tools/AP_BWE_main/24kto48k/*
!tools/AP_BWE_main/24kto48k/readme.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -193,3 +172,28 @@ cython_debug/
# PyPI configuration file
.pypirc
# GPT-SoVITS
.DS_Store
.vscode
__pycache__
*.pyc
pylock.toml
env
runtime
.idea
output
logs
SoVITS_weights*/
GPT_weights*/
TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios
gsv_tools/AP_BWE_main/24kto48k/*
!gsv_tools/AP_BWE_main/24kto48k/readme.txt
!gsv_tools/uvr5/lib

View File

@ -3,13 +3,18 @@ ci:
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
rev: v0.15.1
hooks:
# Run the linter.
- id: ruff
types_or: [ python, pyi ]
args: [ --fix , "--exit-zero" ]
args: [ --fix , "--unsafe-fixes", "--exit-zero" ]
# Run the formatter.
- id: ruff-format
types_or: [ python, pyi ]
args: [ --line-length, "120", --target-version, "py311" ]
- repo: https://github.com/astral-sh/uv-pre-commit
# uv version.
rev: 0.10.3
hooks:
- id: uv-lock

View File

@ -53,7 +53,7 @@
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
" :\n",
"else\n",
" conda create -n GPTSoVITS python=3.10 -y\n",
" conda create -n GPTSoVITS python=3.11 -y\n",
"fi\n",
"\n",
"source activate GPTSoVITS\n",

View File

@ -52,7 +52,7 @@
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
" :\n",
"else\n",
" conda create -n GPTSoVITS python=3.10 -y\n",
" conda create -n GPTSoVITS python=3.11 -y\n",
"fi\n",
"\n",
"source activate GPTSoVITS\n",

View File

@ -57,21 +57,20 @@ source "$HOME/.bashrc"
"$HOME/conda/bin/conda" install python=3.12 -y
"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y
"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg uv cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y
cd workspace
if [ "$CUDA_VERSION" = "12.8" ]; then
"$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
"$HOME/conda/bin/conda" install cuda-nvcc=12.8 -y
"$HOME/conda/bin/uv" pip install ".[cu128]" --no-cache-dir --python "$HOME/conda/bin/python"
elif [ "$CUDA_VERSION" = "12.6" ]; then
"$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
"$HOME/conda/bin/conda" install cuda-nvcc=12.6 -y
"$HOME/conda/bin/uv" pip install ".[cu126]" --no-cache-dir --python "$HOME/conda/bin/python"
fi
export PATH="$HOME/conda/bin:$PATH"
"$HOME/conda/bin/pip" install psutil ninja packaging wheel "setuptools>=42" einops
"$HOME/conda/bin/pip" install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
"$HOME/conda/bin/pip" cache purge
"$HOME/conda/bin/uv" pip install ".[flash-attn]" --python "$HOME/conda/bin/python"
"$HOME/conda/bin/uv" cache clean
rm $LOG_PATH

View File

@ -15,7 +15,7 @@ SHELL ["/bin/bash", "-c"]
WORKDIR /workspace/GPT-SoVITS
COPY Docker /workspace/GPT-SoVITS/Docker/
COPY . /workspace/GPT-SoVITS
ARG LITE=false
ENV LITE=${LITE}
@ -26,12 +26,6 @@ ENV WORKFLOW=${WORKFLOW}
ARG TARGETPLATFORM
ENV TARGETPLATFORM=${TARGETPLATFORM}
COPY extra-req.txt /workspace/GPT-SoVITS/
COPY requirements.txt /workspace/GPT-SoVITS/
COPY install.sh /workspace/GPT-SoVITS/
RUN bash Docker/install_wrapper.sh
EXPOSE 9871 9872 9873 9874 9880
@ -40,21 +34,13 @@ ENV PYTHONPATH="/workspace/GPT-SoVITS"
RUN conda init bash && echo "conda activate base" >> ~/.bashrc
WORKDIR /workspace
RUN rm -rf /workspace/GPT-SoVITS
WORKDIR /workspace/GPT-SoVITS
COPY . /workspace/GPT-SoVITS
CMD ["/bin/bash", "-c", "\
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
rm -rf /workspace/GPT-SoVITS/gsv_tools/asr/models && \
rm -rf /workspace/GPT-SoVITS/gsv_tools/uvr5/uvr5_weights && \
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/gsv_tools/asr/models && \
ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/gsv_tools/uvr5/uvr5_weights && \
exec bash"]

View File

@ -9,6 +9,7 @@
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
## News
- **Sep 2024 (v2.4):**
- We have updated the pretrained checkpoints trained for 5M steps. This is the final release of the BigVGAN-v2 checkpoints.
@ -31,7 +32,7 @@
The codebase has been tested on Python `3.10` and PyTorch `2.3.1` conda packages with either `pytorch-cuda=12.1` or `pytorch-cuda=11.8`. Below is an example command to create the conda environment:
```shell
conda create -n bigvgan python=3.10 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
conda create -n bigvgan python=3.11 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
conda activate bigvgan
```
@ -186,17 +187,17 @@ If you see `[Fail] test CUDA fused vs. plain torch BigVGAN inference`, it means
We provide the [pretrained models on Hugging Face Collections](https://huggingface.co/collections/nvidia/bigvgan-66959df3d97fd7d98d97dc9a).
One can download the checkpoints of the generator weight (named `bigvgan_generator.pt`) and its discriminator/optimizer states (named `bigvgan_discriminator_optimizer.pt`) within the listed model repositories.
| Model Name | Sampling Rate | Mel band | fmax | Upsampling Ratio | Params | Dataset | Steps | Fine-Tuned |
|:--------------------------------------------------------------------------------------------------------:|:-------------:|:--------:|:-----:|:----------------:|:------:|:--------------------------:|:-----:|:----------:|
| [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x) | 44 kHz | 128 | 22050 | 512 | 122M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x) | 44 kHz | 128 | 22050 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x) | 24 kHz | 100 | 12000 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x) | 22 kHz | 80 | 11025 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) | 22 kHz | 80 | 8000 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 112M | LibriTTS | 5M | No |
| [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 14M | LibriTTS | 5M | No |
| [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 112M | LibriTTS + VCTK + LJSpeech | 5M | No |
| [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 14M | LibriTTS + VCTK + LJSpeech | 5M | No |
| Model Name | Sampling Rate | Mel band | fmax | Upsampling Ratio | Params | Dataset | Steps | Fine-Tuned |
| :------------------------------------------------------------------------------------------------------: | :-----------: | :------: | :---: | :--------------: | :----: | :------------------------: | :---: | :--------: |
| [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x) | 44 kHz | 128 | 22050 | 512 | 122M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x) | 44 kHz | 128 | 22050 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x) | 24 kHz | 100 | 12000 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x) | 22 kHz | 80 | 11025 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) | 22 kHz | 80 | 8000 | 256 | 112M | Large-scale Compilation | 5M | No |
| [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 112M | LibriTTS | 5M | No |
| [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band) | 24 kHz | 100 | 12000 | 256 | 14M | LibriTTS | 5M | No |
| [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 112M | LibriTTS + VCTK + LJSpeech | 5M | No |
| [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band) | 22 kHz | 80 | 8000 | 256 | 14M | LibriTTS + VCTK + LJSpeech | 5M | No |
The paper results are based on the original 24kHz BigVGAN models (`bigvgan_24khz_100band` and `bigvgan_base_24khz_100band`) trained on the LibriTTS dataset.
We also provide 22kHz BigVGAN models with a band-limited setup (i.e., fmax=8000) for TTS applications.
@ -219,37 +220,37 @@ When training BigVGAN-v2 from scratch with small batch size, it can potentially
Below are the objective results of the 24kHz model (`bigvgan_v2_24khz_100band_256x`) obtained from the LibriTTS `dev` sets. BigVGAN-v2 shows noticeable improvements in the metrics. The model also exhibits reduced perceptual artifacts, especially for non-speech audio.
| Model | Dataset | Steps | PESQ(↑) | M-STFT(↓) | MCD(↓) | Periodicity(↓) | V/UV F1(↑) |
|:----------:|:-----------------------:|:-----:|:---------:|:----------:|:----------:|:--------------:|:----------:|
| BigVGAN | LibriTTS | 1M | 4.027 | 0.7997 | 0.3745 | 0.1018 | 0.9598 |
| BigVGAN | LibriTTS | 5M | 4.256 | 0.7409 | 0.2988 | 0.0809 | 0.9698 |
| BigVGAN-v2 | Large-scale Compilation | 3M | 4.359 | 0.7134 | 0.3060 | 0.0621 | 0.9777 |
| BigVGAN-v2 | Large-scale Compilation | 5M | **4.362** | **0.7026** | **0.2903** | **0.0593** | **0.9793** |
| Model | Dataset | Steps | PESQ(↑) | M-STFT(↓) | MCD(↓) | Periodicity(↓) | V/UV F1(↑) |
| :--------: | :---------------------: | :---: | :-------: | :--------: | :--------: | :------------: | :--------: |
| BigVGAN | LibriTTS | 1M | 4.027 | 0.7997 | 0.3745 | 0.1018 | 0.9598 |
| BigVGAN | LibriTTS | 5M | 4.256 | 0.7409 | 0.2988 | 0.0809 | 0.9698 |
| BigVGAN-v2 | Large-scale Compilation | 3M | 4.359 | 0.7134 | 0.3060 | 0.0621 | 0.9777 |
| BigVGAN-v2 | Large-scale Compilation | 5M | **4.362** | **0.7026** | **0.2903** | **0.0593** | **0.9793** |
## Speed Benchmark
Below are the speed and VRAM usage benchmark results of BigVGAN from `tests/test_cuda_vs_torch_model.py`, using `bigvgan_v2_24khz_100band_256x` as a reference model.
| GPU | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
|:--------------------------:|:-------------:|:---------------:|:-----------:|:----------------:|:---------:|
| NVIDIA A100 | 256 | False | 1672.1 | 69.7x | 1.3 |
| | | True | 3916.5 | 163.2x | 1.3 |
| | 2048 | False | 1899.6 | 79.2x | 1.7 |
| | | True | 5330.1 | 222.1x | 1.7 |
| | 16384 | False | 1973.8 | 82.2x | 5.0 |
| | | True | 5761.7 | 240.1x | 4.4 |
| NVIDIA GeForce RTX 3080 | 256 | False | 841.1 | 35.0x | 1.3 |
| | | True | 1598.1 | 66.6x | 1.3 |
| | 2048 | False | 929.9 | 38.7x | 1.7 |
| | | True | 1971.3 | 82.1x | 1.6 |
| | 16384 | False | 943.4 | 39.3x | 5.0 |
| | | True | 2026.5 | 84.4x | 3.9 |
| NVIDIA GeForce RTX 2080 Ti | 256 | False | 515.6 | 21.5x | 1.3 |
| | | True | 811.3 | 33.8x | 1.3 |
| | 2048 | False | 576.5 | 24.0x | 1.7 |
| | | True | 1023.0 | 42.6x | 1.5 |
| | 16384 | False | 589.4 | 24.6x | 5.0 |
| | | True | 1068.1 | 44.5x | 3.2 |
| GPU | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
| :------------------------: | :-----------: | :-------------: | :---------: | :--------------: | :-------: |
| NVIDIA A100 | 256 | False | 1672.1 | 69.7x | 1.3 |
| | | True | 3916.5 | 163.2x | 1.3 |
| | 2048 | False | 1899.6 | 79.2x | 1.7 |
| | | True | 5330.1 | 222.1x | 1.7 |
| | 16384 | False | 1973.8 | 82.2x | 5.0 |
| | | True | 5761.7 | 240.1x | 4.4 |
| NVIDIA GeForce RTX 3080 | 256 | False | 841.1 | 35.0x | 1.3 |
| | | True | 1598.1 | 66.6x | 1.3 |
| | 2048 | False | 929.9 | 38.7x | 1.7 |
| | | True | 1971.3 | 82.1x | 1.6 |
| | 16384 | False | 943.4 | 39.3x | 5.0 |
| | | True | 2026.5 | 84.4x | 3.9 |
| NVIDIA GeForce RTX 2080 Ti | 256 | False | 515.6 | 21.5x | 1.3 |
| | | True | 811.3 | 33.8x | 1.3 |
| | 2048 | False | 576.5 | 24.0x | 1.7 |
| | | True | 1023.0 | 42.6x | 1.5 |
| | 16384 | False | 589.4 | 24.6x | 5.0 |
| | | True | 1068.1 | 44.5x | 3.2 |
## Acknowledgements

View File

@ -10,10 +10,10 @@ from copy import deepcopy
import torchaudio
from tqdm import tqdm
now_dir = os.getcwd()
sys.path.append(now_dir)
import os
from typing import List, Tuple, Union
import ffmpeg
import librosa
@ -25,23 +25,24 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from BigVGAN.bigvgan import BigVGAN
from feature_extractor.cnhubert import CNHubert
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
from module.models import SynthesizerTrn, SynthesizerTrnV3, Generator
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
from sv import SV
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.audio_sr import AP_BWE
from tools.i18n.i18n import I18nAuto, scan_language_list
from gsv_tools.audio_sr import AP_BWE
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
from sv import SV
# Cache of torchaudio Resample transforms, keyed by "<sr0>-<sr1>-<device>".
resample_transform_dict = {}


def resample(audio_tensor, sr0, sr1, device):
    """Resample ``audio_tensor`` from rate ``sr0`` to rate ``sr1``.

    A ``torchaudio.transforms.Resample(sr0, sr1)`` transform is created once
    per (sr0, sr1, device) combination, moved to ``device`` and stored in the
    module-level ``resample_transform_dict``; later calls with the same
    combination reuse the stored transform.

    Args:
        audio_tensor: Input passed unchanged to the cached transform.
        sr0: Source sampling rate.
        sr1: Target sampling rate.
        device: Device the transform is moved to; its ``str()`` form is part
            of the cache key.

    Returns:
        Whatever the cached transform returns for ``audio_tensor``.
    """
    # The dict is only mutated, never rebound, so no ``global`` statement is needed.
    key = f"{sr0}-{sr1}-{str(device)}"
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
@ -64,33 +65,36 @@ def denorm_spec(x):
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
def mel_fn(x):
    """Return ``mel_spectrogram_torch(x, ...)`` using a fixed configuration.

    The settings are hard-coded: FFT size and window size 1024, hop size 256,
    100 mel bins, sampling rate 24000, ``fmin`` 0, ``fmax`` ``None`` and
    ``center`` ``False``.

    Args:
        x: Input forwarded as the first positional argument of
            ``mel_spectrogram_torch``.

    Returns:
        The value returned by ``mel_spectrogram_torch``.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1024,
        win_size=1024,
        hop_size=256,
        num_mels=100,
        sampling_rate=24000,
        fmin=0,
        fmax=None,
        center=False,
    )
def mel_fn_v4(x):
    """Return ``mel_spectrogram_torch(x, ...)`` using a fixed configuration.

    The settings are hard-coded: FFT size and window size 1280, hop size 320,
    100 mel bins, sampling rate 32000, ``fmin`` 0, ``fmax`` ``None`` and
    ``center`` ``False``.

    Args:
        x: Input forwarded as the first positional argument of
            ``mel_spectrogram_torch``.

    Returns:
        The value returned by ``mel_spectrogram_torch``.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1280,
        win_size=1280,
        hop_size=320,
        num_mels=100,
        sampling_rate=32000,
        fmin=0,
        fmax=None,
        center=False,
    )
def speed_change(input_audio: np.ndarray, speed: float, sr: int):
@ -132,7 +136,7 @@ class DictToAttrRecursive(dict):
def __setattr__(self, key, value):
    """Store ``value`` under ``key`` as both a mapping item and an attribute.

    Any ``dict`` value (as judged by ``isinstance``) is first converted with
    ``DictToAttrRecursive(value)``; the resulting object is then written
    through the parent class's ``__setitem__`` and ``__setattr__``.

    Args:
        key: Attribute / item name.
        value: Value to store.
    """
    if isinstance(value, dict):
        value = DictToAttrRecursive(value)
    # One item write is enough; a second call with the same arguments is redundant.
    super().__setitem__(key, value)
    super().__setattr__(key, value)
def __delattr__(self, item):
@ -276,12 +280,12 @@ class TTS_Config:
v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"]
languages: list = v2_languages
mute_tokens: dict = {
"v1" : 486,
"v2" : 486,
"v1": 486,
"v2": 486,
"v2Pro": 486,
"v2ProPlus": 486,
"v3" : 486,
"v4" : 486,
"v3": 486,
"v4": 486,
}
mute_emb_sim_matrix: torch.Tensor = None
# "all_zh",#全部按中文识别
@ -296,7 +300,7 @@ class TTS_Config:
# "auto",#多语种启动切分识别语种
# "auto_yue",#多语种启动切分识别语种
def __init__(self, configs: Union[dict, str] = None):
def __init__(self, configs: dict | str = None):
# 设置默认配置文件路径
configs_base_path: str = "GPT_SoVITS/configs/"
os.makedirs(configs_base_path, exist_ok=True)
@ -325,7 +329,7 @@ class TTS_Config:
self.is_half = self.configs.get("is_half", False)
if str(self.device) == "cpu" and self.is_half:
print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
print("Warning: Half precision is not supported on CPU, set is_half to False.")
self.is_half = False
version = self.configs.get("version", None)
@ -369,7 +373,7 @@ class TTS_Config:
else:
print(i18n("路径不存在,使用默认配置"))
self.save_configs(configs_path)
with open(configs_path, "r", encoding="utf-8") as f:
with open(configs_path, encoding="utf-8") as f:
configs = yaml.load(f, Loader=yaml.FullLoader)
return configs
@ -419,14 +423,14 @@ class TTS_Config:
class TTS:
def __init__(self, configs: Union[dict, str, TTS_Config]):
def __init__(self, configs: dict | str | TTS_Config):
if isinstance(configs, TTS_Config):
self.configs = configs
else:
self.configs: TTS_Config = TTS_Config(configs)
self.t2s_model: Text2SemanticLightningModule = None
self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None
self.vits_model: SynthesizerTrn | SynthesizerTrnV3 = None
self.bert_tokenizer: AutoTokenizer = None
self.bert_model: AutoModelForMaskedLM = None
self.cnhuhbert_model: CNHubert = None
@ -497,8 +501,8 @@ class TTS:
self.init_sv_model()
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
if if_lora_v3 and not os.path.exists(path_sovits):
info = path_sovits + i18n(f"SoVITS {model_version} 底模缺失,无法加载相应 LoRA 权重")
raise FileExistsError(info)
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
@ -558,7 +562,7 @@ class TTS:
self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
if if_lora_v3 == False:
if not if_lora_v3:
print(
f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
)
@ -589,8 +593,6 @@ class TTS:
self.configs.save_configs()
def init_t2s_weights(self, weights_path: str):
print(f"Loading Text2Semantic weights from {weights_path}")
self.configs.t2s_weights_path = weights_path
@ -622,7 +624,7 @@ class TTS:
self.empty_cache()
self.vocoder = BigVGAN.from_pretrained(
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
use_cuda_kernel=False,
) # if True, RuntimeError: Ninja is required to load C++ extensions
# remove weight norm in the model and set to eval mode
@ -655,7 +657,7 @@ class TTS:
)
self.vocoder.remove_weight_norm()
state_dict_g = torch.load(
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
map_location="cpu",
weights_only=False,
)
@ -668,7 +670,7 @@ class TTS:
self.vocoder_configs["overlapped_len"] = 12
self.vocoder = self.vocoder.eval()
if self.configs.is_half == True:
if self.configs.is_half:
self.vocoder = self.vocoder.half().to(self.configs.device)
else:
self.vocoder = self.vocoder.to(self.configs.device)
@ -798,7 +800,7 @@ class TTS:
)
if self.configs.is_half:
spec = spec.half()
if self.is_v2pro == True:
if self.is_v2pro:
audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
if self.configs.is_half:
audio = audio.half()
@ -832,7 +834,7 @@ class TTS:
prompt_semantic = codes[0, 0].to(self.configs.device)
self.prompt_cache["prompt_semantic"] = prompt_semantic
def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
def batch_sequences(self, sequences: list[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
seq = sequences[0]
ndim = seq.dim()
if axis < 0:
@ -846,7 +848,7 @@ class TTS:
max_length = max(seq_lengths) if max_length < max(seq_lengths) else max_length
padded_sequences = []
for seq, length in zip(sequences, seq_lengths):
for seq, length in zip(sequences, seq_lengths, strict=False):
padding = [0] * axis + [0, max_length - length] + [0] * (ndim - axis - 1)
padded_seq = torch.nn.functional.pad(seq, padding, value=pad_value)
padded_sequences.append(padded_seq)
@ -898,7 +900,7 @@ class TTS:
batch_index_list.append([])
batch_index_list[-1].append(i)
for batch_idx, index_list in enumerate(batch_index_list):
for _batch_idx, index_list in enumerate(batch_index_list):
item_list = [data[idx] for idx in index_list]
phones_list = []
phones_len_list = []
@ -1051,7 +1053,7 @@ class TTS:
fragment_interval = inputs.get("fragment_interval", 0.3)
seed = inputs.get("seed", -1)
seed = -1 if seed in ["", None] else seed
actual_seed = set_seed(seed)
set_seed(seed)
parallel_infer = inputs.get("parallel_infer", True)
repetition_penalty = inputs.get("repetition_penalty", 1.35)
sample_steps = inputs.get("sample_steps", 32)
@ -1060,7 +1062,7 @@ class TTS:
overlap_length = inputs.get("overlap_length", 2)
min_chunk_length = inputs.get("min_chunk_length", 16)
fixed_length_chunk = inputs.get("fixed_length_chunk", False)
chunk_split_thershold = 0.0 # 该值代表语义token与mute token的余弦相似度阈值若大于该阈值则视为可切分点。
chunk_split_thershold = 0.0 # 该值代表语义token与mute token的余弦相似度阈值若大于该阈值则视为可切分点。
if parallel_infer and not streaming_mode:
print(i18n("并行推理模式已开启"))
@ -1093,7 +1095,6 @@ class TTS:
print(i18n("分段返回模式/流式推理模式不支持分桶处理,已自动关闭分桶处理"))
split_bucket = False
if split_bucket and speed_factor == 1.0 and not (self.configs.use_vocoder and parallel_infer):
print(i18n("分桶处理模式已开启"))
elif speed_factor != 1.0:
@ -1171,7 +1172,7 @@ class TTS:
if not (return_fragment or streaming_mode):
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
if len(data) == 0:
yield 16000, np.zeros(int(16000), dtype=np.int16)
yield 16000, np.zeros(16000, dtype=np.int16)
return
batch_index_list: list = None
@ -1237,9 +1238,9 @@ class TTS:
if item is None:
continue
batch_phones: List[torch.LongTensor] = item["phones"]
batch_phones: list[torch.LongTensor] = item["phones"]
# batch_phones:torch.LongTensor = item["phones"]
batch_phones_len: torch.LongTensor = item["phones_len"]
item["phones_len"]
all_phoneme_ids: torch.LongTensor = item["all_phones"]
all_phoneme_lens: torch.LongTensor = item["all_phones_len"]
all_bert_features: torch.LongTensor = item["all_bert_features"]
@ -1255,7 +1256,7 @@ class TTS:
)
refer_audio_spec = []
sv_emb = [] if self.is_v2pro else None
for spec, audio_tensor in self.prompt_cache["refer_spec"]:
spec = spec.to(dtype=self.precision, device=self.configs.device)
@ -1281,7 +1282,6 @@ class TTS:
t4 = time.perf_counter()
t_34 += t4 - t3
batch_audio_fragment = []
# ## vits并行推理 method 1
@ -1301,7 +1301,9 @@ class TTS:
if speed_factor == 1.0:
print(f"{i18n('并行合成中')}...")
# ## vits并行推理 method 2
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
pred_semantic_list = [
item[-idx:] for item, idx in zip(pred_semantic_list, idx_list, strict=False)
]
upsample_rate = math.prod(self.vits_model.upsample_rates)
audio_frag_idx = [
pred_semantic_list[i].shape[0] * 2 * upsample_rate
@ -1314,8 +1316,8 @@ class TTS:
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
_batch_audio_fragment = self.vits_model.decode(
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
).detach()[0, 0, :]
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
).detach()[0, 0, :]
audio_frag_end_idx.insert(0, 0)
batch_audio_fragment = [
@ -1330,14 +1332,18 @@ class TTS:
pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
) # .unsqueeze(0)#mq要多unsqueeze一次
audio_fragment = self.vits_model.decode(
_pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
).detach()[0, 0, :]
_pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
).detach()[0, 0, :]
batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分
else:
if parallel_infer:
print(f"{i18n('并行合成中')}...")
audio_fragments = self.using_vocoder_synthesis_batched_infer(
idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps
idx_list,
pred_semantic_list,
batch_phones,
speed=speed_factor,
sample_steps=sample_steps,
)
batch_audio_fragment.extend(audio_fragments)
else:
@ -1356,7 +1362,7 @@ class TTS:
# item.to(dtype=self.precision, device=self.configs.device)
# for item in self.prompt_cache["refer_spec"]
# ]
semantic_token_generator =self.t2s_model.model.infer_panel(
semantic_token_generator = self.t2s_model.model.infer_panel(
all_phoneme_ids[0].unsqueeze(0),
all_phoneme_lens,
prompt,
@ -1381,30 +1387,34 @@ class TTS:
# if speed_factor == 1.0:
# upsample_rate = math.prod(self.vits_model.upsample_rates)*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1)
# else:
upsample_rate = math.prod(self.vits_model.upsample_rates)*((2 if self.vits_model.semantic_frame_rate == "25hz" else 1)/speed_factor)
upsample_rate = math.prod(self.vits_model.upsample_rates) * (
(2 if self.vits_model.semantic_frame_rate == "25hz" else 1) / speed_factor
)
else:
# if speed_factor == 1.0:
# upsample_rate = self.vocoder_configs["upsample_rate"]*(3.875 if self.configs.version == "v3" else 4)
# else:
upsample_rate = self.vocoder_configs["upsample_rate"]*((3.875 if self.configs.version == "v3" else 4)/speed_factor)
upsample_rate = self.vocoder_configs["upsample_rate"] * (
(3.875 if self.configs.version == "v3" else 4) / speed_factor
)
last_audio_chunk = None
# last_tokens = None
last_latent = None
previous_tokens = []
overlap_len = overlap_length
overlap_size = math.ceil(overlap_length*upsample_rate)
overlap_size = math.ceil(overlap_length * upsample_rate)
for semantic_tokens, is_final in semantic_token_generator:
if semantic_tokens is None and last_audio_chunk is not None:
yield self.audio_postprocess(
[[last_audio_chunk[-overlap_size:]]],
output_sr,
None,
speed_factor,
False,
0.0,
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
)
[[last_audio_chunk[-overlap_size:]]],
output_sr,
None,
speed_factor,
False,
0.0,
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
)
break
_semantic_tokens = semantic_tokens
@ -1415,11 +1425,10 @@ class TTS:
_semantic_tokens = torch.cat(previous_tokens, dim=-1)
if not is_first_chunk and semantic_tokens.shape[-1] < 10:
overlap_len = overlap_length+(10-semantic_tokens.shape[-1])
overlap_len = overlap_length + (10 - semantic_tokens.shape[-1])
else:
overlap_len = overlap_length
if not self.configs.use_vocoder:
token_padding_length = 0
# token_padding_length = int(phones.shape[-1]*2)-_semantic_tokens.shape[-1]
@ -1429,58 +1438,64 @@ class TTS:
# token_padding_length = 0
audio_chunk, latent, latent_mask = self.vits_model.decode_streaming(
_semantic_tokens.unsqueeze(0),
phones, refer_audio_spec,
speed=speed_factor,
sv_emb=sv_emb,
result_length=semantic_tokens.shape[-1]+overlap_len if not is_first_chunk else None,
overlap_frames=last_latent[:,:,-overlap_len*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1):] \
if last_latent is not None else None,
padding_length=token_padding_length
)
audio_chunk=audio_chunk.detach()[0, 0, :]
_semantic_tokens.unsqueeze(0),
phones,
refer_audio_spec,
speed=speed_factor,
sv_emb=sv_emb,
result_length=semantic_tokens.shape[-1] + overlap_len if not is_first_chunk else None,
overlap_frames=last_latent[
:, :, -overlap_len * (2 if self.vits_model.semantic_frame_rate == "25hz" else 1) :
]
if last_latent is not None
else None,
padding_length=token_padding_length,
)
audio_chunk = audio_chunk.detach()[0, 0, :]
else:
raise RuntimeError(i18n("SoVits V3/4模型不支持流式推理模式"))
if overlap_len>overlap_length:
audio_chunk=audio_chunk[-int((overlap_length+semantic_tokens.shape[-1])*upsample_rate):]
if overlap_len > overlap_length:
audio_chunk = audio_chunk[
-int((overlap_length + semantic_tokens.shape[-1]) * upsample_rate) :
]
audio_chunk_ = audio_chunk
if is_first_chunk and not is_final:
is_first_chunk = False
audio_chunk_ = audio_chunk_[:-overlap_size]
elif is_first_chunk and is_final:
elif is_first_chunk and is_final:
is_first_chunk = False
elif not is_first_chunk and not is_final:
audio_chunk_ = self.sola_algorithm([last_audio_chunk, audio_chunk_], overlap_size)
audio_chunk_ = (
audio_chunk_[last_audio_chunk.shape[0]-overlap_size:-overlap_size] if not is_final \
else audio_chunk_[last_audio_chunk.shape[0]-overlap_size:]
)
audio_chunk_[last_audio_chunk.shape[0] - overlap_size : -overlap_size]
if not is_final
else audio_chunk_[last_audio_chunk.shape[0] - overlap_size :]
)
last_latent = latent
last_audio_chunk = audio_chunk
yield self.audio_postprocess(
[[audio_chunk_]],
output_sr,
None,
speed_factor,
False,
0.0,
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
)
if is_first_package:
print(f"first_package_delay: {time.perf_counter()-t0:.3f}")
[[audio_chunk_]],
output_sr,
None,
speed_factor,
False,
0.0,
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
)
if is_first_package:
print(f"first_package_delay: {time.perf_counter() - t0:.3f}")
is_first_package = False
yield output_sr, np.zeros(int(output_sr*fragment_interval), dtype=np.int16)
yield output_sr, np.zeros(int(output_sr * fragment_interval), dtype=np.int16)
t5 = time.perf_counter()
t_45 += t5 - t4
if return_fragment:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
print(f"{t1 - t0:.3f}\t{t2 - t1:.3f}\t{t4 - t3:.3f}\t{t5 - t4:.3f}")
yield self.audio_postprocess(
[batch_audio_fragment],
output_sr,
@ -1490,7 +1505,8 @@ class TTS:
fragment_interval,
super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
)
elif streaming_mode:...
elif streaming_mode:
...
else:
audio.append(batch_audio_fragment)
@ -1499,7 +1515,7 @@ class TTS:
return
if not (return_fragment or streaming_mode):
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45))
print(f"{t1 - t0:.3f}\t{t2 - t1:.3f}\t{t_34:.3f}\t{t_45:.3f}")
if len(audio) == 0:
yield output_sr, np.zeros(int(output_sr), dtype=np.int16)
return
@ -1516,7 +1532,7 @@ class TTS:
except Exception as e:
traceback.print_exc()
# 必须返回一个空音频, 否则会导致显存不释放。
yield 16000, np.zeros(int(16000), dtype=np.int16)
yield 16000, np.zeros(16000, dtype=np.int16)
# 重置模型, 否则会导致显存释放不完全。
del self.t2s_model
del self.vits_model
@ -1540,15 +1556,15 @@ class TTS:
def audio_postprocess(
self,
audio: List[torch.Tensor],
audio: list[torch.Tensor],
sr: int,
batch_index_list: list = None,
speed_factor: float = 1.0,
split_bucket: bool = True,
fragment_interval: float = 0.3,
super_sampling: bool = False,
) -> Tuple[int, np.ndarray]:
if fragment_interval>0:
) -> tuple[int, np.ndarray]:
if fragment_interval > 0:
zero_wav = torch.zeros(
int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device
)
@ -1558,7 +1574,9 @@ class TTS:
max_audio = torch.abs(audio_fragment).max() # 简单防止16bit爆音
if max_audio > 1:
audio_fragment /= max_audio
audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) if fragment_interval>0 else audio_fragment
audio_fragment: torch.Tensor = (
torch.cat([audio_fragment, zero_wav], dim=0) if fragment_interval > 0 else audio_fragment
)
audio[i][j] = audio_fragment
if split_bucket:
@ -1589,7 +1607,6 @@ class TTS:
audio = (audio * 32768).astype(np.int16)
# try:
# if speed_factor != 1.0:
# audio = speed_change(audio, speed=speed_factor, sr=int(sr))
@ -1665,12 +1682,12 @@ class TTS:
def using_vocoder_synthesis_batched_infer(
self,
idx_list: List[int],
semantic_tokens_list: List[torch.Tensor],
batch_phones: List[torch.Tensor],
idx_list: list[int],
semantic_tokens_list: list[torch.Tensor],
batch_phones: list[torch.Tensor],
speed: float = 1.0,
sample_steps: int = 32,
) -> List[torch.Tensor]:
) -> list[torch.Tensor]:
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
raw_entry = self.prompt_cache["refer_spec"][0]
@ -1778,40 +1795,32 @@ class TTS:
return audio_fragments
def sola_algorithm(
self,
audio_fragments: List[torch.Tensor],
overlap_len: int,
search_len:int= 320
):
def sola_algorithm(self, audio_fragments: list[torch.Tensor], overlap_len: int, search_len: int = 320):
# overlap_len-=search_len
dtype = audio_fragments[0].dtype
for i in range(len(audio_fragments) - 1):
f1 = audio_fragments[i].float()
f2 = audio_fragments[i + 1].float()
w1 = f1[-overlap_len:]
w2 = f2[:overlap_len+search_len]
w2 = f2[: overlap_len + search_len]
# w2 = w2[-w2.shape[-1]//2:]
# assert w1.shape == w2.shape
corr_norm = F.conv1d(w2.view(1, 1, -1), w1.view(1, 1, -1)).view(-1)
corr_den = F.conv1d(w2.view(1, 1, -1)**2, torch.ones_like(w1).view(1, 1, -1)).view(-1)+ 1e-8
idx = (corr_norm/corr_den.sqrt()).argmax()
corr_den = F.conv1d(w2.view(1, 1, -1) ** 2, torch.ones_like(w1).view(1, 1, -1)).view(-1) + 1e-8
idx = (corr_norm / corr_den.sqrt()).argmax()
print(f"seg_idx: {idx}")
# idx = corr.argmax()
f1_ = f1[: -overlap_len]
f1_ = f1[:-overlap_len]
audio_fragments[i] = f1_
f2_ = f2[idx:]
window = torch.hann_window((overlap_len) * 2, device=f1.device, dtype=f1.dtype)
f2_[: overlap_len] = (
window[: overlap_len] * f2_[: overlap_len]
+ window[overlap_len :] * f1[-overlap_len :]
)
f2_[:overlap_len] = window[:overlap_len] * f2_[:overlap_len] + window[overlap_len:] * f1[-overlap_len:]
# window = torch.sin(torch.arange((overlap_len - idx), device=f1.device) * np.pi / (overlap_len - idx))
# f2_[: (overlap_len - idx)] = (

View File

@ -4,20 +4,21 @@ import threading
from tqdm import tqdm
now_dir = os.getcwd()
sys.path.append(now_dir)
import re
import torch
from text.LangSegmenter import LangSegmenter
from text import chinese
from typing import Dict, List, Tuple
from text.cleaner import clean_text
from text import cleaned_text_to_sequence
from transformers import AutoModelForMaskedLM, AutoTokenizer
from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method
from tools.i18n.i18n import I18nAuto, scan_language_list
import torch
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from text.LangSegmenter import LangSegmenter
from transformers import AutoModelForMaskedLM, AutoTokenizer
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
from TTS_infer_pack.text_segmentation_method import get_method as get_seg_method, split_big_text, splits
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -56,7 +57,7 @@ class TextPreprocessor:
self.device = device
self.bert_lock = threading.RLock()
def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]:
def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> list[dict]:
print(f"############ {i18n('切分文本')} ############")
text = self.replace_consecutive_punctuation(text)
texts = self.pre_seg_text(text, lang, text_split_method)
@ -98,7 +99,7 @@ class TextPreprocessor:
# 解决输入目标文本的空行导致报错的问题
if len(text.strip()) == 0:
continue
if not re.sub("\W+", "", text):
if not re.sub(r"\W+", "", text):
# 检测一下,如果是纯符号,就跳过。
continue
if text[-1] not in splits:
@ -116,30 +117,30 @@ class TextPreprocessor:
def segment_and_extract_feature_for_text(
    self, text: str, language: str, version: str = "v1"
) -> tuple[list, torch.Tensor, str]:
    """Return the result of ``get_phones_and_bert`` for one piece of text.

    This is a thin alias: the arguments are forwarded unchanged and the
    result of ``self.get_phones_and_bert(text, language, version)`` is
    returned as-is.

    Args:
        text: Text to process.
        language: Language code forwarded to ``get_phones_and_bert``.
        version: Version string forwarded to ``get_phones_and_bert``.

    Returns:
        Whatever ``get_phones_and_bert`` returns; annotated as a
        ``(list, torch.Tensor, str)`` tuple.
    """
    return self.get_phones_and_bert(text, language, version)
def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
with self.bert_lock:
text = re.sub(r' {2,}', ' ', text)
text = re.sub(r" {2,}", " ", text)
textlist = []
langlist = []
if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"):
for tmp in LangSegmenter.getTexts(text, "zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
for tmp in LangSegmenter.getTexts(text, "zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
for tmp in LangSegmenter.getTexts(text, "ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
for tmp in LangSegmenter.getTexts(text, "ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
@ -158,7 +159,9 @@ class TextPreprocessor:
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
if (tmp["lang"] == "en" and langlist[-1] == "en") or (
tmp["lang"] != "en" and langlist[-1] != "en"
):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
@ -236,4 +239,4 @@ class TextPreprocessor:
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
return result

View File

@ -1,9 +1,11 @@
import argparse
import os
import soundfile as sf
from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
from gsv_tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
@ -19,11 +21,11 @@ def synthesize(
output_path,
):
# Read reference text
with open(ref_text_path, "r", encoding="utf-8") as file:
with open(ref_text_path, encoding="utf-8") as file:
ref_text = file.read()
# Read target text
with open(target_text_path, "r", encoding="utf-8") as file:
with open(target_text_path, encoding="utf-8") as file:
target_text = file.read()
# Change model weights

View File

@ -1,15 +1,29 @@
import os
import sys
from PyQt5.QtCore import QEvent
from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit
from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox
import soundfile as sf
from tools.i18n.i18n import I18nAuto
import soundfile as sf
from PyQt5.QtCore import QEvent
from PyQt5.QtWidgets import (
QApplication,
QComboBox,
QFileDialog,
QGridLayout,
QLabel,
QLineEdit,
QMainWindow,
QPushButton,
QStatusBar,
QTextEdit,
QVBoxLayout,
QWidget,
)
from gsv_tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav
from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav, gpt_path, sovits_path
class GPTSoVITSGUI(QMainWindow):
@ -24,11 +38,11 @@ class GPTSoVITSGUI(QMainWindow):
self.setStyleSheet("""
QWidget {
background-color: #a3d3b1;
background-color: #a3d3b1;
}
QTabWidget::pane {
background-color: #a3d3b1;
background-color: #a3d3b1;
}
QTabWidget::tab-bar {
@ -36,29 +50,29 @@ class GPTSoVITSGUI(QMainWindow):
}
QTabBar::tab {
background: #8da4bf;
color: #ffffff;
background: #8da4bf;
color: #ffffff;
padding: 8px;
}
QTabBar::tab:selected {
background: #2a3f54;
background: #2a3f54;
}
QLabel {
color: #000000;
color: #000000;
}
QPushButton {
background-color: #4CAF50;
color: white;
background-color: #4CAF50;
color: white;
padding: 8px;
border: 1px solid #4CAF50;
border-radius: 4px;
}
QPushButton:hover {
background-color: #45a049;
background-color: #45a049;
border: 1px solid #45a049;
box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1);
}
@ -239,14 +253,14 @@ class GPTSoVITSGUI(QMainWindow):
def upload_ref_text(self):
    """Ask the user for a .txt file and put its contents into ``ref_text_input``.

    Nothing happens when the dialog returns an empty path.
    The file is read as UTF-8.
    """
    file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
    if file_path:
        with open(file_path, encoding="utf-8") as file:
            content = file.read()
        self.ref_text_input.setText(content)
def upload_target_text(self):
    """Ask the user for a .txt file and put its contents into ``target_text_input``.

    Nothing happens when the dialog returns an empty path.
    The file is read as UTF-8.
    """
    file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
    if file_path:
        with open(file_path, encoding="utf-8") as file:
            content = file.read()
        self.target_text_input.setText(content)

View File

@ -6,23 +6,27 @@
全部按英文识别
全部按日文识别
"""
import psutil
import os
import psutil
def set_high_priority():
    """Set the current Python process to HIGH_PRIORITY_CLASS.

    Returns immediately when ``os.name`` is not ``"nt"``. If psutil raises
    ``AccessDenied`` the priority is left unchanged and a hint is printed
    instead of propagating the error.
    """
    if os.name != "nt":
        return  # only effective on Windows
    p = psutil.Process(os.getpid())
    try:
        p.nice(psutil.HIGH_PRIORITY_CLASS)
        print("已将进程优先级设为 High")
    except psutil.AccessDenied:
        print("权限不足,无法修改优先级(请用管理员运行)")
set_high_priority()
import json
import logging
import os
import re
import sys
import traceback
@ -32,6 +36,7 @@ import torch
import torchaudio
from text.LangSegmenter import LangSegmenter
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
@ -46,9 +51,11 @@ version = model_version = os.environ.get("version", "v2")
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
SoVITS_names, GPT_names = get_weights_names()
from config import pretrained_sovits_name
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@ -60,7 +67,7 @@ else:
with open("./weight.json", "w", encoding="utf-8") as file:
json.dump({"GPT": {}, "SoVITS": {}}, file)
with open("./weight.json", "r", encoding="utf-8") as file:
with open("./weight.json", encoding="utf-8") as file:
weight_data = file.read()
weight_data = json.loads(weight_data)
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
@ -96,6 +103,7 @@ import numpy as np
from feature_extractor import cnhubert
from transformers import AutoModelForMaskedLM, AutoTokenizer
cnhubert.cnhubert_base_path = cnhubert_base_path
import random
@ -123,8 +131,9 @@ from peft import LoraConfig, get_peft_model
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from tools.assets import css, js, top_html
from tools.i18n.i18n import I18nAuto, scan_language_list
from gsv_tools.assets import css, js, top_html
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -162,7 +171,7 @@ dict_language = dict_language_v1 if version == "v1" else dict_language_v2
tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
if is_half == True:
if is_half:
bert_model = bert_model.half().to(device)
else:
bert_model = bert_model.to(device)
@ -202,7 +211,7 @@ class DictToAttrRecursive(dict):
def __setattr__(self, key, value):
    """Store *value* under *key* both as a dict item and as an attribute.

    A plain ``dict`` value is wrapped in ``DictToAttrRecursive`` before it
    is stored.
    """
    if isinstance(value, dict):
        value = DictToAttrRecursive(value)
    # Keep the mapping view and the attribute view in sync.
    super().__setitem__(key, value)
    super().__setattr__(key, value)
def __delattr__(self, item):
@ -213,7 +222,7 @@ class DictToAttrRecursive(dict):
ssl_model = cnhubert.get_model()
if is_half == True:
if is_half:
ssl_model = ssl_model.half().to(device)
else:
ssl_model = ssl_model.to(device)
@ -223,6 +232,7 @@ else:
# symbol_version-model_version-if_lora_v3
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
v3v4set = {"v3", "v4"}
@ -234,8 +244,8 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
print(sovits_path, version, model_version, if_lora_v3)
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
if if_lora_v3 == True and is_exist == False:
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重")
if if_lora_v3 and not is_exist:
info = path_sovits + f"SoVITS {model_version}" + i18n("底模缺失,无法加载相应 LoRA 权重")
gr.Warning(info)
raise FileExistsError(info)
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
@ -314,17 +324,17 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
del vq_model.enc_q
except:
pass
if is_half == True:
if is_half:
vq_model = vq_model.half().to(device)
else:
vq_model = vq_model.to(device)
vq_model.eval()
if if_lora_v3 == False:
print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False))
if not if_lora_v3:
print(f"loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
else:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
print(
"loading sovits_%spretrained_G" % model_version,
f"loading sovits_{model_version}pretrained_G",
vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
)
lora_rank = dict_s2["lora_rank"]
@ -335,7 +345,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
init_lora_weights=True,
)
vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
print("loading sovits_%s_lora%s" % (model_version, lora_rank))
print(f"loading sovits_{model_version}_lora{lora_rank}")
vq_model.load_state_dict(dict_s2["weight"], strict=False)
vq_model.cfm = vq_model.cfm.merge_and_unload()
# torch.save(vq_model.state_dict(),"merge_win.pth")
@ -383,7 +393,7 @@ def change_gpt_weights(gpt_path):
max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
t2s_model.load_state_dict(dict_s1["weight"])
if is_half == True:
if is_half:
t2s_model = t2s_model.half()
t2s_model = t2s_model.to(device)
t2s_model.eval()
@ -401,6 +411,7 @@ change_gpt_weights(gpt_path)
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import torch
now_dir = os.getcwd()
@ -442,7 +453,7 @@ def init_bigvgan():
from BigVGAN import bigvgan
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
use_cuda_kernel=False,
) # if True, RuntimeError: Ninja is required to load C++ extensions
# remove weight norm in the model and set to eval mode
@ -450,7 +461,7 @@ def init_bigvgan():
bigvgan_model = bigvgan_model.eval()
clean_hifigan_model()
clean_sv_cn_model()
if is_half == True:
if is_half:
bigvgan_model = bigvgan_model.half().to(device)
else:
bigvgan_model = bigvgan_model.to(device)
@ -472,14 +483,14 @@ def init_hifigan():
hifigan_model.eval()
hifigan_model.remove_weight_norm()
state_dict_g = torch.load(
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
map_location="cpu",
weights_only=False,
)
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
clean_bigvgan_model()
clean_sv_cn_model()
if is_half == True:
if is_half:
hifigan_model = hifigan_model.half().to(device)
else:
hifigan_model = hifigan_model.to(device)
@ -508,7 +519,7 @@ resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device):
    """Resample *audio_tensor* from rate *sr0* to rate *sr1* on *device*.

    A ``torchaudio.transforms.Resample`` module is created the first time a
    given (sr0, sr1, device) combination is seen and stored in the
    module-level ``resample_transform_dict``; later calls reuse it.

    Returns:
        The output of the cached transform applied to *audio_tensor*.
    """
    key = f"{sr0}-{sr1}-{device!s}"
    # The dict is only mutated, never rebound, so no ``global`` is needed.
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
@ -544,7 +555,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
center=False,
)
spec = spec.to(dtype)
if is_v2pro == True:
if is_v2pro:
audio = resample(audio, sr1, 16000, device).to(dtype)
return spec, audio
@ -556,7 +567,7 @@ def clean_text_inf(text, language, version):
return phones, word2ph, norm_text
dtype = torch.float16 if is_half == True else torch.float32
dtype = torch.float16 if is_half else torch.float32
def get_bert_inf(phones, word2ph, norm_text, language):
@ -566,7 +577,7 @@ def get_bert_inf(phones, word2ph, norm_text, language):
else:
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
dtype=torch.float16 if is_half else torch.float32,
).to(device)
return bert
@ -595,29 +606,26 @@ def get_first(text):
return text
from text import chinese
def get_phones_and_bert(text, language, version, final=False):
text = re.sub(r' {2,}', ' ', text)
text = re.sub(r" {2,}", " ", text)
textlist = []
langlist = []
if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"):
for tmp in LangSegmenter.getTexts(text, "zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
for tmp in LangSegmenter.getTexts(text, "zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
for tmp in LangSegmenter.getTexts(text, "ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
for tmp in LangSegmenter.getTexts(text, "ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
@ -669,6 +677,7 @@ def get_phones_and_bert(text, language, version, final=False):
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
spec_min = -12
spec_max = 2
@ -681,32 +690,36 @@ def denorm_spec(x):
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
def mel_fn(x):
    """Return ``mel_spectrogram_torch`` of *x* with the 24 kHz settings.

    Settings: 1024-point FFT and window, hop size 256, 100 mel bands,
    ``fmin=0``, ``fmax=None``, ``center=False``.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1024,
        win_size=1024,
        hop_size=256,
        num_mels=100,
        sampling_rate=24000,
        fmin=0,
        fmax=None,
        center=False,
    )


def mel_fn_v4(x):
    """Return ``mel_spectrogram_torch`` of *x* with the 32 kHz settings.

    Settings: 1280-point FFT and window, hop size 320, 100 mel bands,
    ``fmin=0``, ``fmax=None``, ``center=False``.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1280,
        win_size=1280,
        hop_size=320,
        num_mels=100,
        sampling_rate=32000,
        fmin=0,
        fmax=None,
        center=False,
    )
def merge_short_text_in_array(texts, threshold):
@ -732,8 +745,8 @@ sr_model = None
def audio_sr(audio, sr):
global sr_model
if sr_model == None:
from tools.audio_sr import AP_BWE
if sr_model is None:
from gsv_tools.audio_sr import AP_BWE
try:
sr_model = AP_BWE(device, DictToAttrRecursive)
@ -801,10 +814,10 @@ def get_tts_wav(
print(i18n("实际输入的目标文本:"), text)
zero_wav = np.zeros(
int(hps.data.sampling_rate * pause_second),
dtype=np.float16 if is_half == True else np.float32,
dtype=np.float16 if is_half else np.float32,
)
zero_wav_torch = torch.from_numpy(zero_wav)
if is_half == True:
if is_half:
zero_wav_torch = zero_wav_torch.half().to(device)
else:
zero_wav_torch = zero_wav_torch.to(device)
@ -815,7 +828,7 @@ def get_tts_wav(
gr.Warning(i18n("参考音频在3~10秒范围外请更换"))
raise OSError(i18n("参考音频在3~10秒范围外请更换"))
wav16k = torch.from_numpy(wav16k)
if is_half == True:
if is_half:
wav16k = wav16k.half().to(device)
else:
wav16k = wav16k.to(device)
@ -871,7 +884,7 @@ def get_tts_wav(
t2 = ttime()
# cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature)
# print(cache.keys(),if_freeze)
if i_text in cache and if_freeze == True:
if i_text in cache and if_freeze:
pred_semantic = cache[i_text]
else:
with torch.no_grad():
@ -896,7 +909,7 @@ def get_tts_wav(
refers = []
if is_v2pro:
sv_emb = []
if sv_cn_model == None:
if sv_cn_model is None:
init_sv_cn()
if inp_refs:
for path in inp_refs:
@ -965,10 +978,10 @@ def get_tts_wav(
cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res)
if model_version == "v3":
if bigvgan_model == None:
if bigvgan_model is None:
init_bigvgan()
else: # v4
if hifigan_model == None:
if hifigan_model is None:
init_hifigan()
vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model
with torch.inference_mode():
@ -982,7 +995,7 @@ def get_tts_wav(
t4 = ttime()
t.extend([t2 - t1, t3 - t2, t4 - t3])
t1 = ttime()
print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
print(f"{t[0]:.3f}\t{sum(t[1::3]):.3f}\t{sum(t[2::3]):.3f}\t{sum(t[3::3]):.3f}")
audio_opt = torch.cat(audio_opt, 0) # np.concatenate
if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
opt_sr = 32000
@ -990,7 +1003,7 @@ def get_tts_wav(
opt_sr = 24000
else:
opt_sr = 48000 # v4
if if_sr == True and opt_sr == 24000:
if if_sr and opt_sr == 24000:
print(i18n("音频超分中"))
audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
max_audio = np.abs(audio_opt).max()
@ -1062,7 +1075,7 @@ def cut2(inp):
def cut3(inp):
    """Split *inp* into sentences at full stops, one sentence per output line.

    Leading/trailing newlines are removed first. Pieces made up solely of
    characters in the module-level ``punctuation`` collection are dropped.

    Returns:
        The remaining pieces joined with ``"\n"``.
    """
    inp = inp.strip("\n")
    # NOTE(review): the separator appeared as an empty string in the text this
    # was recovered from, which str.split() rejects with ValueError. It is
    # restored here as the full-width full stop; confirm against the repo.
    pieces = inp.strip("。").split("。")
    kept = [item for item in pieces if not set(item).issubset(punctuation)]
    return "\n".join(kept)
@ -1077,7 +1090,7 @@ def cut4(inp):
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
def cut5(inp):
inp = inp.strip("\n")
punds = {",", ".", ";", "?", "!", "", "", "", "", "", ";", "", ""}
punds = {",", ".", ";", "?", "!", "", "", "", "", "", "", ""}
mergeitems = []
items = []
@ -1101,7 +1114,7 @@ def cut5(inp):
def custom_sort_key(s):
    """Split *s* into alternating text and integer parts for natural sorting.

    Digit runs become ``int`` so that, e.g., "e2" sorts before "e10".

    Returns:
        A list such as ``["e", 10, "_s", 2, ".ckpt"]``; text parts may be
        empty strings when *s* starts or ends with digits.
    """
    # With a single capturing group, re.split() yields text, digits, text, ...
    # so the odd positions are exactly the digit runs. Checking the position
    # instead of str.isdigit() avoids calling int() on characters such as "²"
    # that isdigit() accepts but the pattern never captures.
    parts = re.split(r"(\d+)", s)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]

View File

@ -6,32 +6,37 @@
全部按英文识别
全部按日文识别
"""
import psutil
import os
import psutil
def set_high_priority():
    """Set the current Python process to HIGH_PRIORITY_CLASS.

    Returns immediately when ``os.name`` is not ``"nt"``. If psutil raises
    ``AccessDenied`` the priority is left unchanged and a hint is printed
    instead of propagating the error.
    """
    if os.name != "nt":
        return  # only effective on Windows
    p = psutil.Process(os.getpid())
    try:
        p.nice(psutil.HIGH_PRIORITY_CLASS)
        print("已将进程优先级设为 High")
    except psutil.AccessDenied:
        print("权限不足,无法修改优先级(请用管理员运行)")
set_high_priority()
import json
import logging
import os
import random
import re
import sys
import torch
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir))
sys.path.append(f"{now_dir}/GPT_SoVITS")
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
@ -60,8 +65,9 @@ import gradio as gr
from TTS_infer_pack.text_segmentation_method import get_method
from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config
from tools.assets import css, js, top_html
from tools.i18n.i18n import I18nAuto, scan_language_list
from gsv_tools.assets import css, js, top_html
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -114,9 +120,11 @@ cut_method = {
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
SoVITS_names, GPT_names = get_weights_names()
from config import pretrained_sovits_name
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@ -203,7 +211,7 @@ def inference(
def custom_sort_key(s):
    """Split *s* into alternating text and integer parts for natural sorting.

    Digit runs become ``int`` so that, e.g., "e2" sorts before "e10".

    Returns:
        A list such as ``["e", 10, "_s", 2, ".ckpt"]``; text parts may be
        empty strings when *s* starts or ends with digits.
    """
    # With a single capturing group, re.split() yields text, digits, text, ...
    # so the odd positions are exactly the digit runs. Checking the position
    # instead of str.isdigit() avoids calling int() on characters such as "²"
    # that isdigit() accepts but the pattern never captures.
    parts = re.split(r"(\d+)", s)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]
@ -215,7 +223,7 @@ else:
with open("./weight.json", "w", encoding="utf-8") as file:
json.dump({"GPT": {}, "SoVITS": {}}, file)
with open("./weight.json", "r", encoding="utf-8") as file:
with open("./weight.json", encoding="utf-8") as file:
weight_data = file.read()
weight_data = json.loads(weight_data)
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
@ -227,6 +235,7 @@ with open("./weight.json", "r", encoding="utf-8") as file:
from process_ckpt import get_sovits_version_from_path_fast
v3v4set = {"v3", "v4"}
@ -238,8 +247,8 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
# print(sovits_path,version, model_version, if_lora_v3)
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
if if_lora_v3 == True and is_exist == False:
info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重")
if if_lora_v3 and not is_exist:
info = path_sovits + f"SoVITS {model_version}" + i18n("底模缺失,无法加载相应 LoRA 权重")
gr.Warning(info)
raise FileExistsError(info)
dict_language = dict_language_v1 if version == "v1" else dict_language_v2

View File

@ -1,14 +1,16 @@
import os
import random
import traceback
import torch
import torch.nn.functional as F
import torch.utils.data
from text import cleaned_text_to_sequence
from tqdm import tqdm
from module.mel_processing import spectrogram_torch, spec_to_mel_torch
from text import cleaned_text_to_sequence
import torch.nn.functional as F
from tools.my_utils import load_audio
from gsv_tools.my_utils import load_audio
from module.mel_processing import spec_to_mel_torch, spectrogram_torch
version = os.environ.get("version", None)
@ -23,22 +25,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
def __init__(self, hparams, version=None, val=False):
exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir
self.path5 = "%s/5-wav32k" % exp_dir
self.path2 = f"{exp_dir}/2-name2text.txt"
self.path4 = f"{exp_dir}/4-cnhubert"
self.path5 = f"{exp_dir}/5-wav32k"
assert os.path.exists(self.path2)
assert os.path.exists(self.path4)
assert os.path.exists(self.path5)
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
if self.is_v2Pro:
self.path7 = "%s/7-sv_cn" % exp_dir
self.path7 = f"{exp_dir}/7-sv_cn"
assert os.path.exists(self.path7)
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
names5 = set(os.listdir(self.path5))
if self.is_v2Pro:
names6 = set([name[:-3] for name in list(os.listdir(self.path7))]) # 去除.pt后缀
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
with open(self.path2, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines:
@ -85,7 +87,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
skipped_phone += 1
continue
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
size = os.path.getsize(f"{self.path5}/{audiopath}")
duration = size / self.sampling_rate / 2
if duration == 0:
@ -110,15 +112,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
spec, wav = self.get_audio(f"{self.path5}/{audiopath}")
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
if self.is_v2Pro:
sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
sv_emb = torch.load(f"{self.path7}/{audiopath}.pt", map_location="cpu")
except:
traceback.print_exc()
spec = torch.zeros(1025, 100)
@ -285,16 +287,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
def __init__(self, hparams, val=False):
exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir
self.path5 = "%s/5-wav32k" % exp_dir
self.path2 = f"{exp_dir}/2-name2text.txt"
self.path4 = f"{exp_dir}/4-cnhubert"
self.path5 = f"{exp_dir}/5-wav32k"
assert os.path.exists(self.path2)
assert os.path.exists(self.path4)
assert os.path.exists(self.path5)
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
names5 = set(os.listdir(self.path5))
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
with open(self.path2, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines:
@ -339,7 +341,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
skipped_phone += 1
continue
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
size = os.path.getsize(f"{self.path5}/{audiopath}")
duration = size / self.sampling_rate / 2
if duration == 0:
@ -376,9 +378,9 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
spec, mel = self.get_audio(f"{self.path5}/{audiopath}")
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
@ -523,16 +525,16 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
def __init__(self, hparams, val=False):
exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir
self.path5 = "%s/5-wav32k" % exp_dir
self.path2 = f"{exp_dir}/2-name2text.txt"
self.path4 = f"{exp_dir}/4-cnhubert"
self.path5 = f"{exp_dir}/5-wav32k"
assert os.path.exists(self.path2)
assert os.path.exists(self.path4)
assert os.path.exists(self.path5)
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
names5 = set(os.listdir(self.path5))
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
with open(self.path2, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines:
@ -577,7 +579,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
skipped_phone += 1
continue
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
size = os.path.getsize(f"{self.path5}/{audiopath}")
duration = size / self.sampling_rate / 2
if duration == 0:
@ -614,9 +616,9 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
spec, mel = self.get_audio(f"{self.path5}/{audiopath}")
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
@ -734,16 +736,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
def __init__(self, hparams, val=False):
exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir
self.path5 = "%s/5-wav32k" % exp_dir
self.path2 = f"{exp_dir}/2-name2text.txt"
self.path4 = f"{exp_dir}/4-cnhubert"
self.path5 = f"{exp_dir}/5-wav32k"
assert os.path.exists(self.path2)
assert os.path.exists(self.path4)
assert os.path.exists(self.path5)
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
names5 = set(os.listdir(self.path5))
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
with open(self.path2, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines:
@ -788,7 +790,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
skipped_phone += 1
continue
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
size = os.path.getsize(f"{self.path5}/{audiopath}")
duration = size / self.sampling_rate / 2
if duration == 0:
@ -825,9 +827,9 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
spec, mel, wav = self.get_audio(f"{self.path5}/{audiopath}")
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
import os
inp_text = os.environ.get("inp_text")
inp_wav_dir = os.environ.get("inp_wav_dir")
exp_name = os.environ.get("exp_name")
@ -13,13 +12,12 @@ opt_dir = os.environ.get("opt_dir")
bert_pretrained_dir = os.environ.get("bert_pretrained_dir")
import torch
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
version = os.environ.get("version", None)
import traceback
import os.path
from text.cleaner import clean_text
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.my_utils import clean_path
import shutil
import traceback
# inp_text=sys.argv[1]
# inp_wav_dir=sys.argv[2]
@ -29,23 +27,26 @@ from tools.my_utils import clean_path
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"
from time import time as ttime
import shutil
from text.cleaner import clean_text
from transformers import AutoModelForMaskedLM, AutoTokenizer
from gsv_tools.my_utils import clean_path
def my_save(fea, path):
    """Serialize ``fea`` with ``torch.save`` and place the result at ``path``.

    Works around ``torch.save`` not supporting Chinese (non-ASCII) paths:
    the object is first written to a temporary file in the current working
    directory whose name is built from the current timestamp and ``i_part``,
    and that file is then moved to the requested destination.

    Args:
        fea: Object to serialize (passed straight to ``torch.save``).
        path: Final destination of the saved file.
    """
    target_dir = os.path.dirname(path)
    target_name = os.path.basename(path)
    tmp_path = f"{ttime()}{i_part}.pth"
    try:
        torch.save(fea, tmp_path)
        # os.path.join keeps a bare file name relative ("" + name -> name);
        # plain "/" formatting would turn it into "/name" at the filesystem root.
        shutil.move(tmp_path, os.path.join(target_dir, target_name))
    finally:
        # After a successful move the temp file no longer exists; this only
        # cleans up a partial file left behind by a failed save or move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
if os.path.exists(txt_path) == False:
bert_dir = "%s/3-bert" % (opt_dir)
txt_path = f"{opt_dir}/2-name2text-{i_part}.txt"
if not os.path.exists(txt_path):
bert_dir = f"{opt_dir}/3-bert"
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(bert_dir, exist_ok=True)
if torch.cuda.is_available():
@ -60,7 +61,7 @@ if os.path.exists(txt_path) == False:
raise FileNotFoundError(bert_pretrained_dir)
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
if is_half == True:
if is_half:
bert_model = bert_model.half().to(device)
else:
bert_model = bert_model.to(device)
@ -90,8 +91,8 @@ if os.path.exists(txt_path) == False:
name = os.path.basename(name)
print(name)
phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("", ","), lan, version)
path_bert = "%s/%s.pt" % (bert_dir, name)
if os.path.exists(path_bert) == False and lan == "zh":
path_bert = f"{bert_dir}/{name}.pt"
if not os.path.exists(path_bert) and lan == "zh":
bert_feature = get_bert_feature(norm_text, word2ph)
assert bert_feature.shape[-1] == len(phones)
# torch.save(bert_feature, path_bert)
@ -104,7 +105,7 @@ if os.path.exists(txt_path) == False:
todo = []
res = []
with open(inp_text, "r", encoding="utf8") as f:
with open(inp_text, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
language_v1_to_language_v2 = {
@ -138,6 +139,6 @@ if os.path.exists(txt_path) == False:
process(todo, res)
opt = []
for name, phones, word2ph, norm_text in res:
opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text))
opt.append(f"{name}\t{phones}\t{word2ph}\t{norm_text}")
with open(txt_path, "w", encoding="utf8") as f:
f.write("\n".join(opt) + "\n")

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
import sys
import os
import sys
inp_text = os.environ.get("inp_text")
inp_wav_dir = os.environ.get("inp_wav_dir")
@ -12,20 +11,24 @@ if "_CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
from feature_extractor import cnhubert
opt_dir = os.environ.get("opt_dir")
cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir")
import torch
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
import traceback
import librosa
import numpy as np
from scipy.io import wavfile
import librosa
now_dir = os.getcwd()
sys.path.append(now_dir)
from tools.my_utils import load_audio, clean_path
import shutil
# from config import cnhubert_base_path
# cnhubert.cnhubert_base_path=cnhubert_base_path
@ -37,22 +40,22 @@ from tools.my_utils import load_audio, clean_path
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
# cnhubert.cnhubert_base_path=sys.argv[7]
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
from time import time as ttime
import shutil
from gsv_tools.my_utils import clean_path, load_audio
def my_save(fea, path):
    """Serialize ``fea`` with ``torch.save`` and place the result at ``path``.

    Works around ``torch.save`` not supporting Chinese (non-ASCII) paths:
    the object is first written to a temporary file in the current working
    directory whose name is built from the current timestamp and ``i_part``,
    and that file is then moved to the requested destination.

    Args:
        fea: Object to serialize (passed straight to ``torch.save``).
        path: Final destination of the saved file.
    """
    target_dir = os.path.dirname(path)
    target_name = os.path.basename(path)
    tmp_path = f"{ttime()}{i_part}.pth"
    try:
        torch.save(fea, tmp_path)
        # os.path.join keeps a bare file name relative ("" + name -> name);
        # plain "/" formatting would turn it into "/name" at the filesystem root.
        shutil.move(tmp_path, os.path.join(target_dir, target_name))
    finally:
        # After a successful move the temp file no longer exists; this only
        # cleans up a partial file left behind by a failed save or move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
hubert_dir = "%s/4-cnhubert" % (opt_dir)
wav32dir = "%s/5-wav32k" % (opt_dir)
hubert_dir = f"{opt_dir}/4-cnhubert"
wav32dir = f"{opt_dir}/5-wav32k"
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(hubert_dir, exist_ok=True)
os.makedirs(wav32dir, exist_ok=True)
@ -67,7 +70,7 @@ else:
device = "cpu"
model = cnhubert.get_model()
# is_half=False
if is_half == True:
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
@ -76,36 +79,36 @@ nan_fails = []
def name2go(wav_name, wav_path):
    """Extract and store the HuBERT feature and the 32 kHz wav for one clip.

    Does nothing when ``{hubert_dir}/{wav_name}.pt`` already exists. Clips
    whose peak absolute amplitude exceeds 2.2 are reported and skipped. When
    the model output contains NaN, the clip is recorded in ``nan_fails`` and
    nothing is written for it.

    Args:
        wav_name: File name used for both output files.
        wav_path: Path of the audio file handed to ``load_audio``.
    """
    hubert_path = f"{hubert_dir}/{wav_name}.pt"
    if os.path.exists(hubert_path):
        return

    tmp_audio = load_audio(wav_path, 32000)
    tmp_max = np.abs(tmp_audio).max()
    if tmp_max > 2.2:
        print(f"{wav_name}-filtered,{tmp_max}")
        return

    # Blend a peak-normalized copy (weight alpha) with the raw signal
    # (weight 1 - alpha). The 32768-scaled version is what gets written as
    # int16 audio; the 1145.14-scaled version is the one fed to the model.
    tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * tmp_audio
    tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * tmp_audio
    tmp_audio = librosa.resample(tmp_audio32b, orig_sr=32000, target_sr=16000)  # resampling is not the problem

    tensor_wav16 = torch.from_numpy(tmp_audio)
    if is_half:
        tensor_wav16 = tensor_wav16.half().to(device)
    else:
        tensor_wav16 = tensor_wav16.to(device)
    ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu()  # torch.Size([1, 768, 215])

    if np.isnan(ssl.detach().numpy()).sum() != 0:
        # Remember the clip so the caller can retry it later.
        nan_fails.append((wav_name, wav_path))
        print(f"nan filtered:{wav_name}")
        return

    wavfile.write(
        f"{wav32dir}/{wav_name}",
        32000,
        tmp_audio32.astype("int16"),
    )
    my_save(ssl, hubert_path)
with open(inp_text, "r", encoding="utf8") as f:
with open(inp_text, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines[int(i_part) :: int(all_parts)]:
@ -113,9 +116,9 @@ for line in lines[int(i_part) :: int(all_parts)]:
# wav_name,text=line.split("\t")
wav_name, spk_name, language, text = line.split("|")
wav_name = clean_path(wav_name)
if inp_wav_dir != "" and inp_wav_dir != None:
if inp_wav_dir != "" and inp_wav_dir is not None:
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s" % (inp_wav_dir, wav_name)
wav_path = f"{inp_wav_dir}/{wav_name}"
else:
wav_path = wav_name
@ -124,7 +127,7 @@ for line in lines[int(i_part) :: int(all_parts)]:
except:
print(line, traceback.format_exc())
if len(nan_fails) > 0 and is_half == True:
if len(nan_fails) > 0 and is_half:
is_half = False
model = model.float()
for wav in nan_fails:

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
import sys
import os
import sys
inp_text = os.environ.get("inp_text")
inp_wav_dir = os.environ.get("inp_wav_dir")
@ -15,32 +14,37 @@ opt_dir = os.environ.get("opt_dir")
sv_path = os.environ.get("sv_path")
import torch
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
import traceback
import torchaudio
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
from tools.my_utils import clean_path
from time import time as ttime
import shutil
from ERes2NetV2 import ERes2NetV2
from time import time as ttime
import kaldi as Kaldi
from ERes2NetV2 import ERes2NetV2
from gsv_tools.my_utils import clean_path
def my_save(fea, path):
    """Serialize ``fea`` with ``torch.save`` and place the result at ``path``.

    Works around ``torch.save`` not supporting Chinese (non-ASCII) paths:
    the object is first written to a temporary file in the current working
    directory whose name is built from the current timestamp and ``i_part``,
    and that file is then moved to the requested destination.

    Args:
        fea: Object to serialize (passed straight to ``torch.save``).
        path: Final destination of the saved file.
    """
    target_dir = os.path.dirname(path)
    target_name = os.path.basename(path)
    tmp_path = f"{ttime()}{i_part}.pth"
    try:
        torch.save(fea, tmp_path)
        # os.path.join keeps a bare file name relative ("" + name -> name);
        # plain "/" formatting would turn it into "/name" at the filesystem root.
        shutil.move(tmp_path, os.path.join(target_dir, target_name))
    finally:
        # After a successful move the temp file no longer exists; this only
        # cleans up a partial file left behind by a failed save or move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
sv_cn_dir = "%s/7-sv_cn" % (opt_dir)
wav32dir = "%s/5-wav32k" % (opt_dir)
sv_cn_dir = f"{opt_dir}/7-sv_cn"
wav32dir = f"{opt_dir}/5-wav32k"
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(sv_cn_dir, exist_ok=True)
os.makedirs(wav32dir, exist_ok=True)
@ -63,7 +67,7 @@ class SV:
embedding_model.eval()
self.embedding_model = embedding_model
self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
if is_half == False:
if not is_half:
self.embedding_model = self.embedding_model.to(device)
else:
self.embedding_model = self.embedding_model.half().to(device)
@ -72,7 +76,7 @@ class SV:
def compute_embedding3(self, wav): # (1,x)#-1~1
with torch.no_grad():
wav = self.res(wav)
if self.is_half == True:
if self.is_half:
wav = wav.half()
feat = torch.stack(
[Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
@ -85,10 +89,10 @@ sv = SV(device, is_half)
def name2go(wav_name, wav_path):
sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
sv_cn_path = f"{sv_cn_dir}/{wav_name}.pt"
if os.path.exists(sv_cn_path):
return
wav_path = "%s/%s" % (wav32dir, wav_name)
wav_path = f"{wav32dir}/{wav_name}"
wav32k, sr0 = torchaudio.load(wav_path)
assert sr0 == 32000
wav32k = wav32k.to(device)
@ -96,16 +100,16 @@ def name2go(wav_name, wav_path):
my_save(emb, sv_cn_path)
with open(inp_text, "r", encoding="utf8") as f:
with open(inp_text, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
for line in lines[int(i_part) :: int(all_parts)]:
try:
wav_name, spk_name, language, text = line.split("|")
wav_name = clean_path(wav_name)
if inp_wav_dir != "" and inp_wav_dir != None:
if inp_wav_dir != "" and inp_wav_dir is not None:
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s" % (inp_wav_dir, wav_name)
wav_path = f"{inp_wav_dir}/{wav_name}"
else:
wav_path = wav_name

View File

@ -1,5 +1,6 @@
import os
inp_text = os.environ.get("inp_text")
exp_name = os.environ.get("exp_name")
i_part = os.environ.get("i_part")
@ -28,20 +29,25 @@ else:
version = "v3"
import torch
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
import traceback
import sys
import traceback
now_dir = os.getcwd()
sys.path.append(now_dir)
import logging
import utils
if version != "v3":
from module.models import SynthesizerTrn
else:
from module.models import SynthesizerTrnV3 as SynthesizerTrn
from tools.my_utils import clean_path
from gsv_tools.my_utils import clean_path
logging.getLogger("numba").setLevel(logging.WARNING)
# from config import pretrained_s2G
@ -54,9 +60,9 @@ logging.getLogger("numba").setLevel(logging.WARNING)
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
hubert_dir = "%s/4-cnhubert" % (opt_dir)
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
if os.path.exists(semantic_path) == False:
hubert_dir = f"{opt_dir}/4-cnhubert"
semantic_path = f"{opt_dir}/6-name2semantic-{i_part}.tsv"
if not os.path.exists(semantic_path):
os.makedirs(opt_dir, exist_ok=True)
if torch.cuda.is_available():
@ -73,7 +79,7 @@ if os.path.exists(semantic_path) == False:
version=version,
**hps.model,
)
if is_half == True:
if is_half:
vq_model = vq_model.half().to(device)
else:
vq_model = vq_model.to(device)
@ -87,19 +93,19 @@ if os.path.exists(semantic_path) == False:
)
def name2go(wav_name, lines):
    """Append the semantic-token line for one clip to ``lines``.

    Loads the stored feature ``{hubert_dir}/{wav_name}.pt``, runs it through
    ``vq_model.extract_latent`` and appends ``"<wav_name>\\t<tokens>"`` to
    ``lines``, where the tokens are the space-separated values of
    ``codes[0, 0, :]``. Returns without appending anything when the feature
    file does not exist.

    Args:
        wav_name: Clip name; also the stem of the feature file to load.
        lines: List that receives the formatted output line (mutated in place).
    """
    hubert_path = f"{hubert_dir}/{wav_name}.pt"
    if not os.path.exists(hubert_path):
        return

    ssl_content = torch.load(hubert_path, map_location="cpu")
    if is_half:
        ssl_content = ssl_content.half().to(device)
    else:
        ssl_content = ssl_content.to(device)

    codes = vq_model.extract_latent(ssl_content)
    semantic = " ".join(str(token) for token in codes[0, 0, :].tolist())
    lines.append(f"{wav_name}\t{semantic}")
with open(inp_text, "r", encoding="utf8") as f:
with open(inp_text, encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
lines1 = []

View File

@ -1,10 +1,13 @@
import os
import shutil
import traceback
from collections import OrderedDict
from time import time as ttime
import shutil
import os
import torch
from tools.i18n.i18n import I18nAuto
from gsv_tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
@ -12,13 +15,14 @@ i18n = I18nAuto()
def my_save(fea, path):
    """Serialize ``fea`` with ``torch.save`` and place the result at ``path``.

    Works around ``torch.save`` not supporting Chinese (non-ASCII) paths:
    the object is first written to a timestamp-named temporary file in the
    current working directory and then moved to the requested destination.

    Args:
        fea: Object to serialize (passed straight to ``torch.save``).
        path: Final destination of the saved file.
    """
    target_dir = os.path.dirname(path)
    target_name = os.path.basename(path)
    tmp_path = f"{ttime()}.pth"
    try:
        torch.save(fea, tmp_path)
        # os.path.join keeps a bare file name relative ("" + name -> name);
        # plain "/" formatting would turn it into "/name" at the filesystem root.
        shutil.move(tmp_path, os.path.join(target_dir, target_name))
    finally:
        # After a successful move the temp file no longer exists; this only
        # cleans up a partial file left behind by a failed save or move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
from io import BytesIO
model_version2byte = {
"v3": b"03",
"v4": b"04",
@ -47,14 +51,14 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
continue
opt["weight"][key] = ckpt[key].half()
opt["config"] = hps
opt["info"] = "%sepoch_%siteration" % (epoch, steps)
opt["info"] = f"{epoch}epoch_{steps}iteration"
if lora_rank:
opt["lora_rank"] = lora_rank
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
elif model_version != None and "Pro" in model_version:
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
my_save2(opt, f"{hps.save_weight_dir}/{name}.pth", model_version)
elif model_version is not None and "Pro" in model_version:
my_save2(opt, f"{hps.save_weight_dir}/{name}.pth", model_version)
else:
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
my_save(opt, f"{hps.save_weight_dir}/{name}.pth")
return "Success."
except:
return traceback.format_exc()

287
README.md
View File

@ -1,3 +1,5 @@
#
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
@ -7,8 +9,6 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
[![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org)
[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
@ -27,7 +27,11 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
---
## Features:
<div align="center">
## Features
</div>
1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
@ -41,51 +45,56 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
Unseen speakers few-shot fine-tuning demo:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
**RTF(inference speed) of GPT-SoVITS v2 ProPlus**:
0.028 tested on a 4060Ti, 0.014 tested on a 4090 (1400 words ≈ 4 min, inference time is 3.36s), 0.526 on an M4 CPU. You can try our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference.
请不要尬黑GPT-SoVITS推理速度慢谢谢
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
<div align="center">
## Installation
For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
For users in China, you can [Click Here to use](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) AutoDL Cloud Docker to experience the full functionality online.
### Tested Environments
| Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- |
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU |
| Python Version | PyTorch Version | Device |
| -------------- | --------------- | ------------- |
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
| Python 3.10 | PyTorch 2.9.0 | CPU |
</div>
### Windows
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main) and double-click on `go-webui.bat` to start GPT-SoVITS-WebUI.
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
Install the program by running the following commands:
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
pwsh -F install.ps1 --help
```
### Linux
Install the program by running the following commands:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### macOS
@ -95,54 +104,51 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
Install the program by running the following commands:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### Install Manually
#### Install Dependences
Install the program by running the following commands:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
conda install uv ffmpeg -c conda-forge
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
uv pip install ".[flash-attn]"
```
#### Install FFmpeg
### Pretrained Models
##### Conda Users
**If `install.sh` runs successfully, you may skip No.1,2,3**
```bash
conda activate GPTSoVits
conda install ffmpeg
```
**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
##### Ubuntu/Debian Users
1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
```
2. Download G2PW models from [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. (Chinese TTS Only)
##### Windows Users
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `gsv_tools/uvr5/uvr5_weights`.
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `gsv_tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe)
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `gsv_tools/asr/models`.
##### MacOS Users
5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `gsv_tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have a similar effect with a smaller disk footprint.
```bash
brew install ffmpeg
```
<div align="center">
### Running GPT-SoVITS with Docker
## Running GPT-SoVITS with Docker
#### Docker Image Selection
</div>
### Docker Image Selection
Due to rapid development in the codebase and a slower Docker image release cycle, please:
@ -153,15 +159,15 @@ Due to rapid development in the codebase and a slower Docker image release cycle
- Docker Compose will mount **all files** in the current directory. Please switch to the project root directory and **pull the latest code** before using the Docker image
- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
#### Environment Variables
### Environment Variables
- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage.
#### Shared Memory Configuration
### Shared Memory Configuration
On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory.
#### Choosing a Service
### Choosing a Service
The `docker-compose.yaml` defines two services:
@ -174,7 +180,7 @@ To run a specific service with Docker Compose, use:
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
#### Building the Docker Image Locally
### Building the Docker Image Locally
If you want to build the image yourself, use:
@ -182,7 +188,7 @@ If you want to build the image yourself, use:
bash docker_build.sh --cuda <12.6|12.8> [--lite]
```
#### Accessing the Running Container (Bash Shell)
### Accessing the Running Container (Bash Shell)
Once the container is running in the background, you can access it using:
@ -190,31 +196,15 @@ Once the container is running in the background, you can access it using:
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
```
## Pretrained Models
**If `install.sh` runs successfully, you may skip No.1,2,3**
**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
2. Download G2PW models from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only)
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint.
<div align="center">
## Dataset Format
The TTS annotation .list file format:
</div>
```
The TTS annotation `.list` file format:
```text
vocal_path|speaker_name|language|text
@ -230,20 +220,23 @@ Language dictionary:
Example:
```
```text
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
<div align="center">
## Finetune and inference
</div>
### Open WebUI
#### Integrated Package Users
Double-click `go-webui.bat` or use `go-webui.ps1`
if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-v1.ps1`
#### Others
@ -251,21 +244,13 @@ if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-
python webui.py <language(optional)>
```
if you want to switch to V1,then
```bash
python webui.py v1 <language(optional)>
```
Or maunally switch version in WebUI
### Finetune
#### Path Auto-filling is now supported
1. Fill in the audio path
2. Slice the audio into small chunks
3. Denoise(optinal)
3. Denoise (optional)
4. ASR
5. Proofreading ASR transcriptions
6. Go to the next Tab, then finetune the model
@ -274,7 +259,7 @@ Or maunally switch version in WebUI
#### Integrated Package Users
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
Double-click `go-webui.bat` or use `go-webui.ps1`, then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
#### Others
@ -290,8 +275,12 @@ python webui.py
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
<div align="center">
## V2 Release Notes
</div>
New Features:
1. Support Korean and Cantonese
@ -304,18 +293,12 @@ New Features:
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v2 from v1 environment:
1. `pip install -r requirements.txt` to update some packages
2. Clone the latest codes from github.
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`.
Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
<div align="center">
## V3 Release Notes
</div>
New Features:
1. The timbre similarity is higher, requiring less training data to approximate the target speaker (the timbre similarity is significantly improved using the base model directly without fine-tuning).
@ -324,111 +307,59 @@ New Features:
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v3 from v2 environment:
1. `pip install -r requirements.txt` to update some packages
2. Clone the latest codes from github.
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
<div align="center">
## V4 Release Notes
</div>
New Features:
1. Version 4 fixes the issue of metallic artifacts in Version 3 caused by non-integer multiple upsampling, and natively outputs 48k audio to prevent muffled sound (whereas Version 3 only natively outputs 24k audio). The author considers Version 4 a direct replacement for Version 3, though further testing is still needed.
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v4 from v1/v2/v3 environment:
1. `pip install -r requirements.txt` to update some packages
2. Clone the latest codes from github.
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
<div align="center">
## V2Pro Release Notes
</div>
New Features:
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2.v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timebre of v3/v4 lean more toward the reference audio rather than the overall training set.
2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
Use v2Pro from v1/v2/v3/v4 environment:
1. `pip install -r requirements.txt` to update some packages
2. Clone the latest codes from github.
3. Download v2Pro pretrained models (v2Pro/s2Dv2Pro.pth, v2Pro/s2Gv2Pro.pth, v2Pro/s2Dv2ProPlus.pth, v2Pro/s2Gv2ProPlus.pth, and sv/pretrained_eres2netv2w24s4ep4.ckpt) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
<div align="center">
## Todo List
- [x] **High Priority:**
</div>
- [x] Localization in Japanese and English.
- [x] User guide.
- [x] Japanese and English dataset fine tune training.
- [x] **High Priority:**
- [x] Localization in Japanese and English
- [x] User guide
- [x] Japanese and English dataset fine tune training
- [ ] **Features:**
- [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
- [x] TTS speaking speed control.
- [ ] ~~Enhanced TTS emotion control.~~ Maybe use pretrained finetuned preset GPT models for better emotion.
- [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent).
- [x] Improve English and Japanese text frontend.
- [ ] Develop tiny and larger-sized TTS models.
- [x] Colab scripts.
- [x] Try expand training dataset (2k hours -> 10k hours).
- [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min)
- [x] TTS speaking speed control
- [ ] ~~Enhanced TTS emotion control~~ Maybe use pretrained finetuned preset GPT models for better emotion
- [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent)
- [x] Improve English and Japanese text frontend
- [ ] Develop tiny and larger-sized TTS models
- [x] Colab scripts
- [x] Try expand training dataset (2k hours -> 10k hours)
- [x] better sovits base model (enhanced audio quality)
- [ ] model mix
## (Additional) Method for running from the command line
Use the command line to open the WebUI for UVR5
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
This is how the audio segmentation of the dataset is done using the command line
```bash
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
--threshold <volume_threshold> \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
```
This is how dataset ASR processing is done using the command line(Only Chinese)
```bash
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
(No progress bars, GPU performance may cause time delays)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
A custom list save path is enabled
<div align="center">
## Credits
</div>
Special thanks to the following projects and contributors:
### Theoretical Research
@ -439,7 +370,7 @@ Special thanks to the following projects and contributors:
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
@ -471,8 +402,12 @@ Special thanks to the following projects and contributors:
Thankful to @Naozumi520 for providing the Cantonese training set and for the guidance on Cantonese-related knowledge.
<div align="center">
## Thanks to all contributors for their efforts
</div>
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
</a>

163
api.py
View File

@ -145,33 +145,36 @@ import os
import re
import sys
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir))
sys.path.append(f"{now_dir}/GPT_SoVITS")
import logging
import signal
from text.LangSegmenter import LangSegmenter
import subprocess
from io import BytesIO
from time import time as ttime
import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio
import librosa
import soundfile as sf
from fastapi import FastAPI, Request, Query
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from fastapi import FastAPI, Query, Request
from fastapi.responses import JSONResponse, StreamingResponse
from feature_extractor import cnhubert
from io import BytesIO
from module.mel_processing import spectrogram_torch
from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
from module.mel_processing import spectrogram_torch
from text.LangSegmenter import LangSegmenter
from transformers import AutoModelForMaskedLM, AutoTokenizer
import config as global_config
import logging
import subprocess
class DefaultRefer:
@ -239,14 +242,14 @@ def init_bigvgan():
from BigVGAN import bigvgan
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
"%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
use_cuda_kernel=False,
) # if True, RuntimeError: Ninja is required to load C++ extensions
# remove weight norm in the model and set to eval mode
bigvgan_model.remove_weight_norm()
bigvgan_model = bigvgan_model.eval()
if is_half == True:
if is_half:
bigvgan_model = bigvgan_model.half().to(device)
else:
bigvgan_model = bigvgan_model.to(device)
@ -268,12 +271,12 @@ def init_hifigan():
hifigan_model.eval()
hifigan_model.remove_weight_norm()
state_dict_g = torch.load(
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
map_location="cpu",
weights_only=False,
)
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
if is_half == True:
if is_half:
hifigan_model = hifigan_model.half().to(device)
else:
hifigan_model = hifigan_model.to(device)
@ -292,7 +295,7 @@ resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device):
global resample_transform_dict
key = "%s-%s-%s" % (sr0, sr1, str(device))
key = f"{sr0}-{sr1}-{str(device)}"
if key not in resample_transform_dict:
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
return resample_transform_dict[key](audio_tensor)
@ -300,6 +303,7 @@ def resample(audio_tensor, sr0, sr1, device):
from module.mel_processing import mel_spectrogram_torch
spec_min = -12
spec_max = 2
@ -312,32 +316,36 @@ def denorm_spec(x):
return (x + 1) / 2 * (spec_max - spec_min) + spec_min
mel_fn = lambda x: mel_spectrogram_torch(
x,
**{
"n_fft": 1024,
"win_size": 1024,
"hop_size": 256,
"num_mels": 100,
"sampling_rate": 24000,
"fmin": 0,
"fmax": None,
"center": False,
},
)
mel_fn_v4 = lambda x: mel_spectrogram_torch(
x,
**{
"n_fft": 1280,
"win_size": 1280,
"hop_size": 320,
"num_mels": 100,
"sampling_rate": 32000,
"fmin": 0,
"fmax": None,
"center": False,
},
)
def mel_fn(x):
    """Return ``mel_spectrogram_torch(x, ...)`` with a fixed parameter set.

    The parameters are hard-coded: 1024-point FFT and window, hop of 256,
    100 mel bins, 24000 sampling rate, fmin 0, no fmax limit, center=False.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1024,
        win_size=1024,
        hop_size=256,
        num_mels=100,
        sampling_rate=24000,
        fmin=0,
        fmax=None,
        center=False,
    )
def mel_fn_v4(x):
    """Return ``mel_spectrogram_torch(x, ...)`` with a fixed parameter set.

    The parameters are hard-coded: 1280-point FFT and window, hop of 320,
    100 mel bins, 32000 sampling rate, fmin 0, no fmax limit, center=False.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1280,
        win_size=1280,
        hop_size=320,
        num_mels=100,
        sampling_rate=32000,
        fmin=0,
        fmax=None,
        center=False,
    )
sr_model = None
@ -345,8 +353,8 @@ sr_model = None
def audio_sr(audio, sr):
global sr_model
if sr_model == None:
from tools.audio_sr import AP_BWE
if sr_model is None:
from gsv_tools.audio_sr import AP_BWE
try:
sr_model = AP_BWE(device, DictToAttrRecursive)
@ -390,8 +398,8 @@ def get_sovits_weights(sovits_path):
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
if if_lora_v3 == True and is_exist == False:
logger.info("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
if if_lora_v3 and not is_exist:
logger.info(f"SoVITS {model_version} 底模缺失,无法加载相应 LoRA 权重")
dict_s2 = load_sovits_new(sovits_path)
hps = dict_s2["config"]
@ -408,7 +416,7 @@ def get_sovits_weights(sovits_path):
if model_version not in {"v3", "v4"}:
if "Pro" in model_version:
hps.model.version = model_version
if sv_cn_model == None:
if sv_cn_model is None:
init_sv_cn()
vq_model = SynthesizerTrn(
@ -437,12 +445,12 @@ def get_sovits_weights(sovits_path):
del vq_model.enc_q
except:
pass
if is_half == True:
if is_half:
vq_model = vq_model.half().to(device)
else:
vq_model = vq_model.to(device)
vq_model.eval()
if if_lora_v3 == False:
if not if_lora_v3:
vq_model.load_state_dict(dict_s2["weight"], strict=False)
else:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
@ -480,7 +488,7 @@ def get_gpt_weights(gpt_path):
max_sec = config["data"]["max_sec"]
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
t2s_model.load_state_dict(dict_s1["weight"])
if is_half == True:
if is_half:
t2s_model = t2s_model.half()
t2s_model = t2s_model.to(device)
t2s_model.eval()
@ -533,35 +541,32 @@ def get_bert_inf(phones, word2ph, norm_text, language):
else:
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
dtype=torch.float16 if is_half else torch.float32,
).to(device)
return bert
from text import chinese
def get_phones_and_bert(text, language, version, final=False):
text = re.sub(r' {2,}', ' ', text)
text = re.sub(r" {2,}", " ", text)
textlist = []
langlist = []
if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"):
for tmp in LangSegmenter.getTexts(text, "zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
for tmp in LangSegmenter.getTexts(text, "zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
for tmp in LangSegmenter.getTexts(text, "ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
for tmp in LangSegmenter.getTexts(text, "ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
@ -606,7 +611,7 @@ def get_phones_and_bert(text, language, version, final=False):
if not final and len(phones) < 6:
return get_phones_and_bert("." + text, language, version, final=True)
return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text
return phones, bert.to(torch.float16 if is_half else torch.float32), norm_text
class DictToAttrRecursive(dict):
@ -627,7 +632,7 @@ class DictToAttrRecursive(dict):
def __setattr__(self, key, value):
if isinstance(value, dict):
value = DictToAttrRecursive(value)
super(DictToAttrRecursive, self).__setitem__(key, value)
super().__setitem__(key, value)
super().__setattr__(key, value)
def __delattr__(self, item):
@ -662,7 +667,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
center=False,
)
spec = spec.to(dtype)
if is_v2pro == True:
if is_v2pro:
audio = resample(audio, sr1, 16000, device).to(dtype)
return spec, audio
@ -715,11 +720,11 @@ def pack_ogg(audio_bytes, data, rate):
pack_ogg_thread.join()
except RuntimeError as e:
# If changing the thread stack size is unsupported, a RuntimeError is raised.
print("RuntimeError: {}".format(e))
print(f"RuntimeError: {e}")
print("Changing the thread stack size is unsupported.")
except ValueError as e:
# If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
print("ValueError: {}".format(e))
print(f"ValueError: {e}")
print("The specified stack size is invalid.")
return audio_bytes
@ -794,7 +799,7 @@ def cut_text(text, punc):
punds = r"[" + "".join(punc_list) + r"]"
text = text.strip("\n")
items = re.split(f"({punds})", text)
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2], strict=False)]
# 在句子不存在符号或句尾无符号的时候保证文本完整
if len(items) % 2 == 1:
mergeitems.append(items[-1])
@ -861,18 +866,18 @@ def get_tts_wav(
if if_sr and version != "v3":
if_sr = False
t0 = ttime()
ttime()
prompt_text = prompt_text.strip("\n")
if prompt_text[-1] not in splits:
prompt_text += "" if prompt_language != "en" else "."
prompt_language, text = prompt_language, text.strip("\n")
dtype = torch.float16 if is_half == True else torch.float32
zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
dtype = torch.float16 if is_half else torch.float32
zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half else np.float32)
with torch.no_grad():
wav16k, sr = librosa.load(ref_wav_path, sr=16000)
wav16k = torch.from_numpy(wav16k)
zero_wav_torch = torch.from_numpy(zero_wav)
if is_half == True:
if is_half:
wav16k = wav16k.half().to(device)
zero_wav_torch = zero_wav_torch.half().to(device)
else:
@ -889,7 +894,7 @@ def get_tts_wav(
refers = []
if is_v2pro:
sv_emb = []
if sv_cn_model == None:
if sv_cn_model is None:
init_sv_cn()
if inp_refs:
for path in inp_refs:
@ -908,7 +913,7 @@ def get_tts_wav(
else:
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
t1 = ttime()
ttime()
# os.environ['version'] = version
prompt_language = dict_language[prompt_language.lower()]
text_language = dict_language[text_language.lower()]
@ -930,7 +935,7 @@ def get_tts_wav(
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
bert = bert.to(device).unsqueeze(0)
all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
t2 = ttime()
ttime()
with torch.no_grad():
pred_semantic, idx = t2s_model.model.infer_panel(
all_phoneme_ids,
@ -944,7 +949,7 @@ def get_tts_wav(
early_stop_num=hz * max_sec,
)
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
t3 = ttime()
ttime()
if version not in {"v3", "v4"}:
if is_v2pro:
@ -1014,10 +1019,10 @@ def get_tts_wav(
cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res)
if version == "v3":
if bigvgan_model == None:
if bigvgan_model is None:
init_bigvgan()
else: # v4
if hifigan_model == None:
if hifigan_model is None:
init_hifigan()
vocoder_model = bigvgan_model if version == "v3" else hifigan_model
with torch.inference_mode():
@ -1030,7 +1035,7 @@ def get_tts_wav(
audio_opt.append(audio)
audio_opt.append(zero_wav)
audio_opt = np.concatenate(audio_opt, 0)
t4 = ttime()
ttime()
if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
sr = 32000
@ -1128,7 +1133,7 @@ def handle(
if not default_refer.is_ready():
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
if cut_punc == None:
if cut_punc is None:
text = cut_text(text, default_cut_punc)
else:
text = cut_text(text, cut_punc)

View File

@ -104,27 +104,31 @@ RESP:
import os
import sys
import traceback
from typing import Generator, Union
from collections.abc import Generator
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir))
sys.path.append(f"{now_dir}/GPT_SoVITS")
import argparse
import subprocess
import wave
import signal
import subprocess
import threading
import wave
from io import BytesIO
import numpy as np
import soundfile as sf
from fastapi import FastAPI, Response
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
from io import BytesIO
from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
from fastapi import FastAPI, Response
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel
import threading
from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
from gsv_tools.i18n.i18n import I18nAuto
# print(sys.path)
i18n = I18nAuto()
@ -169,7 +173,7 @@ class TTS_Request(BaseModel):
fragment_interval: float = 0.3
seed: int = -1
media_type: str = "wav"
streaming_mode: Union[bool, int] = False
streaming_mode: bool | int = False
parallel_infer: bool = True
repetition_penalty: float = 1.35
sample_steps: int = 32
@ -199,8 +203,6 @@ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
audio_file.write(data)
# See: https://docs.python.org/3/library/threading.html
# The stack size of this thread is at least 32768
# If stack overflow error still occurs, just modify the `stack_size`.
@ -214,11 +216,11 @@ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
pack_ogg_thread.join()
except RuntimeError as e:
# If changing the thread stack size is unsupported, a RuntimeError is raised.
print("RuntimeError: {}".format(e))
print(f"RuntimeError: {e}")
print("Changing the thread stack size is unsupported.")
except ValueError as e:
# If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
print("ValueError: {}".format(e))
print(f"ValueError: {e}")
print("The specified stack size is invalid.")
return io_buffer
@ -306,7 +308,7 @@ def check_params(req: dict):
text: str = req.get("text", "")
text_lang: str = req.get("text_lang", "")
ref_audio_path: str = req.get("ref_audio_path", "")
streaming_mode: bool = req.get("streaming_mode", False)
req.get("streaming_mode", False)
media_type: str = req.get("media_type", "wav")
prompt_lang: str = req.get("prompt_lang", "")
text_split_method: str = req.get("text_split_method", "cut5")
@ -384,7 +386,7 @@ async def tts_handle(req: dict):
check_res = check_params(req)
if check_res is not None:
return check_res
if streaming_mode == 0:
streaming_mode = False
return_fragment = False
@ -403,7 +405,10 @@ async def tts_handle(req: dict):
fixed_length_chunk = True
else:
return JSONResponse(status_code=400, content={"message": f"the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"})
return JSONResponse(
status_code=400,
content={"message": "the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"},
)
req["streaming_mode"] = streaming_mode
req["return_fragment"] = return_fragment
@ -413,7 +418,6 @@ async def tts_handle(req: dict):
streaming_mode = streaming_mode or return_fragment
try:
tts_generator = tts_pipeline.run(req)
@ -475,7 +479,7 @@ async def tts_get_endpoint(
repetition_penalty: float = 1.35,
sample_steps: int = 32,
super_sampling: bool = False,
streaming_mode: Union[bool, int] = False,
streaming_mode: bool | int = False,
overlap_length: int = 2,
min_chunk_length: int = 16,
):

View File

@ -4,7 +4,8 @@ import sys
import torch
from tools.i18n.i18n import I18nAuto
from gsv_tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
@ -77,7 +78,7 @@ GPT_weight_version2root = {
def custom_sort_key(s):
# 使用正则表达式提取字符串中的数字部分和非数字部分
parts = re.split("(\d+)", s)
parts = re.split(r"(\d+)", s)
# 将数字部分转换为整数,非数字部分保持不变
parts = [int(part) if part.isdigit() else part for part in parts]
return parts
@ -93,7 +94,7 @@ def get_weights_names():
continue
for name in os.listdir(path):
if name.endswith(".pth"):
SoVITS_names.append("%s/%s" % (path, name))
SoVITS_names.append(f"{path}/{name}")
if not SoVITS_names:
SoVITS_names = [""]
GPT_names = []
@ -105,7 +106,7 @@ def get_weights_names():
continue
for name in os.listdir(path):
if name.endswith(".ckpt"):
GPT_names.append("%s/%s" % (path, name))
GPT_names.append(f"{path}/{name}")
SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
GPT_names = sorted(GPT_names, key=custom_sort_key)
if not GPT_names:
@ -161,7 +162,7 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
if mem_gb < 4 or sm_version < 5.3:
return cpu, torch.float32, 0.0, 0.0
if sm_version == 6.1 or is_16_series == True:
if sm_version == 6.1 or is_16_series:
return cuda, torch.float32, sm_version, mem_gb
if sm_version > 6.1:
return cuda, torch.float16, sm_version, mem_gb

View File

@ -30,8 +30,8 @@ services:
- "9880:9880"
volumes:
- .:/workspace/GPT-SoVITS
- tools/asr/models:/workspace/models/asr_models
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
- gsv_tools/asr/models:/workspace/models/asr_models
- gsv_tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
environment:
- is_half=true
tty: true
@ -68,8 +68,8 @@ services:
- "9880:9880"
volumes:
- .:/workspace/GPT-SoVITS
- tools/asr/models:/workspace/models/asr_models
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
- gsv_tools/asr/models:/workspace/models/asr_models
- gsv_tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
environment:
- is_half=true
tty: true

View File

@ -26,7 +26,7 @@ print_help() {
echo " -h, --help Show this help message and exit"
echo ""
echo "Examples:"
echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
echo " bash docker_build.sh --cuda 12.6 --lite"
}
# Show help if no arguments provided

View File

@ -409,7 +409,7 @@
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
- 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
- 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
- 类型: 新功能 (特性参阅 [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
- 提交: RVC-Boss
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
- 内容: 更新项目多语言文档.
@ -622,5 +622,3 @@
- 内容: 提升推理进程优先级修复win11下可能GPU利用率受限的问题
- 类型: 修复
- 提交: XianYue0125

View File

@ -1,3 +1,5 @@
#
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
@ -19,15 +21,18 @@
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
</div>
---
<div align="center">
## 功能
</div>
1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
@ -44,40 +49,49 @@
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
<div align="center">
## 安装
中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验.
### 测试通过的环境
| Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- |
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU |
| Python Version | PyTorch Version | Device |
| -------------- | --------------- | ------------- |
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
| Python 3.10 | PyTorch 2.9.0 | CPU |
</div>
### Windows
如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
运行以下的命令来安装本项目:
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
pwsh -F install.ps1 --help
```
### Linux
运行以下的命令来安装本项目:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### macOS
@ -87,54 +101,51 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
运行以下的命令来安装本项目:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### 手动安装
#### 安装依赖
运行以下的命令来安装本项目:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
conda install uv ffmpeg -c conda-forge
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
uv pip install ".[flash-attn]"
```
#### 安装 FFmpeg
### 预训练模型
##### Conda 用户
**若成功运行`install.sh`可跳过 No.1,2,3**
```bash
conda activate GPTSoVits
conda install ffmpeg
```
**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
##### Ubuntu/Debian 用户
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
```
2. 从 [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
##### Windows 用户
3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `gsv_tools/uvr5/uvr5_weights` 目录中.
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer` 模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `gsv_tools/uvr5/uvr5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下
- 建议在模型名称和配置文件名中**直接指定模型类型**, 例如 `mel_band_roformer` 或 `bs_roformer`.如果未指定, 将从配置文件中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境
4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `gsv_tools/asr/models` 目录中.
##### MacOS 用户
5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `gsv_tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
```bash
brew install ffmpeg
```
<div align="center">
### 运行 GPT-SoVITS (使用 Docker)
## 运行 GPT-SoVITS (使用 Docker)
#### Docker 镜像选择
</div>
### Docker 镜像选择
由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意:
@ -145,15 +156,15 @@ brew install ffmpeg
- Docker Compose 将会挂载当前目录的**所有文件**, 请在使用 Docker 镜像前先切换到项目根目录并**拉取代码更新**
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
#### 环境变量
### 环境变量
- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用
#### 共享内存配置
### 共享内存配置
在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`)
#### 选择服务
### 选择服务
`docker-compose.yaml` 文件定义了两个主要服务类型:
@ -166,7 +177,7 @@ brew install ffmpeg
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
#### 本地构建 Docker 镜像
### 本地构建 Docker 镜像
如果你希望自行构建镜像, 请使用以下命令:
@ -174,7 +185,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
```
#### 访问运行中的容器 (Bash Shell)
### 访问运行中的容器 (Bash Shell)
当容器在后台运行时, 你可以通过以下命令进入容器:
@ -182,32 +193,18 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
```
## 预训练模型
**若成功运行`install.sh`可跳过 No.1,2,3**
**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中.
- 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
- 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_mand_roformer``bs_roformer`.如果未指定, 将从配置文中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml` 也是一对.
4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中.
5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
<div align="center">
## 数据集格式
文本到语音 (TTS) 注释 .list 文件格式:
</div>
文本到语音 (TTS) 注释 `.list` 文件格式:
```text
```
vocal_path|speaker_name|language|text
```
语言字典:
@ -220,18 +217,23 @@ vocal_path|speaker_name|language|text
示例:
```
```text
D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
```
<div align="center">
## 微调与推理
</div>
### 打开 WebUI
#### 整合包用户
双击`go-webui.bat`或者使用`go-webui.ps1`
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
#### 其他
@ -239,14 +241,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
python webui.py <language(optional)>
```
若想使用 V1,则
```bash
python webui.py v1 <language(optional)>
```
或者在 webUI 内动态切换
### 微调
#### 现已支持自动填充路径
@ -278,8 +272,12 @@ python webui.py
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
<div align="center">
## V2 发布说明
</div>
新特性:
1. 支持韩语及粤语
@ -292,18 +290,12 @@ python webui.py
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从 v1 环境迁移至 v2
1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆 github 上的最新代码
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下
中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
<div align="center">
## V3 更新说明
</div>
新模型特点:
1. 音色相似度更像, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大)
@ -312,33 +304,23 @@ python webui.py
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从 v2 环境迁移至 v3
1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆 github 上的最新代码
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
<div align="center">
## V4 更新说明
</div>
新特性:
1. **V4 版本修复了 V3 版本中由于非整数倍上采样导致的金属音问题, 并原生输出 48kHz 音频以避免声音闷糊 (而 V3 版本仅原生输出 24kHz 音频)**. 作者认为 V4 是对 V3 的直接替代, 但仍需进一步测试.
[更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从 V1/V2/V3 环境迁移至 V4
1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
2. 从 GitHub 克隆最新代码.
3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V4 预训练模型 (`gsv-v4-pretrained/s2v4.ckpt` 和 `gsv-v4-pretrained/vocoder.pth`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
<div align="center">
## V2Pro 更新说明
</div>
新特性:
1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
@ -346,77 +328,35 @@ python webui.py
2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.
从 V1/V2/V3/V4 环境迁移至 V2Pro
1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
2. 从 GitHub 克隆最新代码.
3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V2Pro 预训练模型 (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 和 `sv/pretrained_eres2netv2w24s4ep4.ckpt`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
<div align="center">
## 待办事项清单
- [x] **高优先级:**
</div>
- [x] 日语和英语的本地化.
- [x] 用户指南.
- [x] 日语和英语数据集微调训练.
- [x] **高优先级:**
- [x] 日语和英语的本地化
- [x] 用户指南
- [x] 日语和英语数据集微调训练
- [ ] **功能:**
- [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟).
- [x] TTS 语速控制.
- [ ] ~~增强的 TTS 情感控制.~~
- [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布.
- [x] 改进英语和日语文本前端.
- [ ] 开发体积小和更大的 TTS 模型.
- [x] Colab 脚本.
- [x] 扩展训练数据集 (从 2k 小时到 10k 小时).
- [x] 更好的 sovits 基础模型 (增强的音频质量).
- [ ] 模型混合.
- [x] TTS 语速控制
- [ ] ~~增强的 TTS 情感控制~~
- [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布
- [x] 改进英语和日语文本前端
- [ ] 开发体积更小和更大的 TTS 模型
- [x] Colab 脚本
- [x] 扩展训练数据集 (从 2k 小时到 10k 小时)
- [x] 更好的 sovits 基础模型 (增强的音频质量)
- [ ] 模型混合
## (附加) 命令行运行方式
使用命令行打开 UVR5 的 WebUI
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- 如果打不开浏览器, 请按照下面的格式进行UVR处理, 这是使用mdxnet进行音频处理的方式
````
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
```` -->
这是使用命令行完成数据集的音频切分的方式
```bash
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
--threshold <volume_threshold> \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
```
这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
```bash
python tools/asr/funasr_asr.py -i <input> -o <output>
```
通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记)
(没有进度条, GPU 性能可能会导致时间延迟)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
启用自定义列表保存路径
<div align="center">
## 致谢
</div>
特别感谢以下项目和贡献者:
### 理论研究
@ -427,7 +367,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
@ -459,8 +399,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导.
<div align="center">
## 感谢所有贡献者的努力
</div>
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
</a>

View File

@ -1,3 +1,5 @@
#
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
@ -19,14 +21,17 @@
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
</div>
---
## 機能:
<div align="center">
## 機能
</div>
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます.
@ -40,34 +45,49 @@
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
<div align="center">
## インストール
### テスト済みの環境
| Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- |
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU |
| Python Version | PyTorch Version | Device |
| -------------- | --------------- | ------------- |
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
| Python 3.10 | PyTorch 2.9.0 | CPU |
</div>
### Windows
Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main)し、解凍後に `go-webui.bat` をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
以下のコマンドを実行してこのプロジェクトをインストールします:
```pwsh
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
pwsh -F install.ps1 --help
```
### Linux
以下のコマンドを実行してこのプロジェクトをインストールします:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### macOS
@ -77,54 +97,49 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
以下のコマンドを実行してこのプロジェクトをインストールします:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### 手動インストール
#### 依存関係をインストールします
以下のコマンドを実行してこのプロジェクトをインストールします:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
conda install uv ffmpeg -c conda-forge
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
uv pip install ".[flash-attn]"
```
#### FFmpeg をインストールします
### 事前訓練済みモデル
##### Conda ユーザー
**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**
```bash
conda activate GPTSoVits
conda install ffmpeg
```
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.
##### Ubuntu/Debian ユーザー
2. [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
```
3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`gsv_tools/uvr5/uvr5_weights` ディレクトリに配置してください.
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`gsv_tools/uvr5/uvr5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には **「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.
##### Windows ユーザー
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_band_roformer、bs_roformer.指定しない場合、設定ファイルから特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt` と `kim_mel_band_roformer.yaml`もペアです.
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます
4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`gsv_tools/asr/models` ディレクトリに配置してください.
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください
5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`gsv_tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.
##### MacOS ユーザー
<div align="center">
```bash
brew install ffmpeg
```
## GPT-SoVITS の実行 (Docker 使用)
### GPT-SoVITS の実行 (Docker 使用)
</div>
#### Docker イメージの選択
### Docker イメージの選択
コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください:
@ -135,15 +150,15 @@ brew install ffmpeg
- Docker Compose は現在のディレクトリ内の**すべてのファイル**をマウントします. Docker イメージを使用する前に、プロジェクトのルートディレクトリに移動し、**コードを最新の状態に更新**してください
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
#### 環境変数
### 環境変数
- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます
#### 共有メモリの設定
### 共有メモリの設定
Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします
#### サービスの選択
### サービスの選択
`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています:
@ -156,7 +171,7 @@ Windows (Docker Desktop) では、デフォルトの共有メモリサイズが
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
#### Docker イメージのローカルビルド
### Docker イメージのローカルビルド
自分でイメージをビルドするには、以下のコマンドを使ってください:
@ -164,7 +179,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
```
#### 実行中のコンテナへアクセス (Bash Shell)
### 実行中のコンテナへアクセス (Bash Shell)
コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます:
@ -172,30 +187,18 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
```
## 事前訓練済みモデル
**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)
3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください.
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_mand_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`もペアです.
4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.
5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.
<div align="center">
## データセット形式
TTS アノテーション .list ファイル形式:
</div>
TTS アノテーション `.list` ファイル形式:
```text
```
vocal_path|speaker_name|language|text
```
言語辞書:
@ -206,18 +209,23 @@ vocal_path|speaker_name|language|text
例:
```
```text
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
<div align="center">
## 微調整と推論
</div>
### WebUI を開く
#### 統合パッケージ利用者
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します.
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください.
#### その他
@ -225,14 +233,6 @@ V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
python webui.py <言語(オプション)>
```
V1 に切り替えたい場合は
```bash
python webui.py v1 <言語(オプション)>
```
または WebUI で手動でバージョンを切り替えてください.
### 微調整
#### パス自動補完のサポート
@ -248,7 +248,7 @@ python webui.py v1 <言語(オプション)>
#### 統合パッケージ利用者
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
#### その他
@ -264,8 +264,12 @@ python webui.py
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
<div align="center">
## V2 リリースノート
</div>
新機能:
1. 韓国語と広東語をサポート
@ -278,18 +282,12 @@ python webui.py
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 環境から V2 を使用するには:
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
2. 最新のコードを github からクローン
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置
中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
<div align="center">
## V3 リリースノート
</div>
新機能:
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます).
@ -298,33 +296,23 @@ V1 環境から V2 を使用するには:
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 環境から v3 を使用する方法:
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します.
2. GitHub から最新のコードをクローンします.
3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します.
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください.
<div align="center">
## V4 リリースノート
</div>
新機能:
1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力します (V3 はネイティブに 24kHz 音声のみ出力)**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1/V2/V3 環境から V4 への移行方法:
1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
2. GitHub から最新のコードをクローンします.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V4 の事前学習済みモデル (`gsv-v4-pretrained/s2v4.ckpt` および `gsv-v4-pretrained/vocoder.pth`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
<div align="center">
## V2Pro リリースノート
</div>
新機能:
1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します.**
@ -332,18 +320,13 @@ V1/V2/V3 環境から V4 への移行方法:
2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.
V1/V2/V3/V4 環境から V2Pro への移行方法:
1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
2. GitHub から最新のコードをクローンします.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V2Pro の事前学習済みモデル (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, および `sv/pretrained_eres2netv2w24s4ep4.ckpt`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
<div align="center">
## Todo リスト
- [x] **優先度 高:**
</div>
- [x] **優先度 高:**
- [x] 日本語と英語でのローカライズ.
- [x] ユーザーガイド.
- [x] 日本語データセットと英語データセットのファインチューニングトレーニング.
@ -356,53 +339,16 @@ V1/V2/V3/V4 環境から V2Pro への移行方法:
- [x] 英語と日本語のテキストフロントエンドを改善.
- [ ] 小型と大型の TTS モデルを開発する.
- [x] Colab のスクリプト.
- [ ] トレーニングデータセットを拡張する (2k→10k).
- [x] トレーニングデータセットを拡張する (2k→10k).
- [x] より良い sovits ベースモデル (音質向上)
- [ ] モデルミックス
## (追加の) コマンドラインから実行する方法
コマンド ラインを使用して UVR5 の WebUI を開きます
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください.これはオーディオ処理に mdxnet を使用しています.
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
```bash
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
--threshold <volume_threshold> \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
```
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
```bash
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
カスタムリストの保存パスが有効になっています
<div align="center">
## クレジット
</div>
特に以下のプロジェクトと貢献者に感謝します:
### 理論研究
@ -413,7 +359,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
@ -445,8 +391,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます.
<div align="center">
## すべてのコントリビューターに感謝します
</div>
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
</a>

View File

@ -1,3 +1,5 @@
#
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
@ -19,14 +21,17 @@
[![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
</div>
---
## 기능:
<div align="center">
## 기능
</div>
1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다.
@ -40,97 +45,101 @@
보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
<div align="center">
## 설치
### 테스트 통과 환경
| Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- |
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU |
| Python Version | PyTorch Version | Device |
| -------------- | --------------- | ------------- |
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
| Python 3.10 | PyTorch 2.9.0 | CPU |
</div>
### Windows
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main)한 후 압축을 풀고 `go-webui.bat` 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
pwsh -F install.ps1 --help
```
### Linux
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### macOS
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
다음 명령어를 실행하여 이 프로젝트를 설치하세요
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### 수동 설치
#### 의존성 설치
다음 명령어를 실행하여 이 프로젝트를 설치하세요:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
conda install uv ffmpeg -c conda-forge
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
uv pip install ".[flash-attn]"
```
#### FFmpeg 설치
### 사전 학습된 모델
##### Conda 사용자
**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**
```bash
conda activate GPTSoVits
conda install ffmpeg
```
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
##### Ubuntu/Debian 사용자
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
```
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `gsv_tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `gsv_tools/uvr5/uvr5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 "**roformer**"가 포함되어야 roformer 클래스의 모델로 인식됩니다.
##### Windows 사용자
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_band_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `gsv_tools/asr/models` 디렉토리에 배치하세요.
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치
5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `gsv_tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.
##### MacOS 사용자
<div align="center">
```bash
brew install ffmpeg
```
## GPT-SoVITS 실행하기 (Docker 사용)
### GPT-SoVITS 실행하기 (Docker 사용)
</div>
#### Docker 이미지 선택
### Docker 이미지 선택
코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요:
@ -141,15 +150,15 @@ brew install ffmpeg
- Docker Compose는 현재 디렉터리의 **모든 파일**을 마운트합니다. Docker 이미지를 사용하기 전에 프로젝트 루트 디렉터리로 이동하여 코드를 **최신 상태로 업데이트**하세요
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
#### 환경 변수
### 환경 변수
- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다
#### 공유 메모리 설정
### 공유 메모리 설정
Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size` 값을 늘리는 것(예: `16g`)이 좋습니다
#### 서비스 선택
### 서비스 선택
`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다:
@ -162,7 +171,7 @@ Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
#### Docker 이미지 직접 빌드하기
### Docker 이미지 직접 빌드하기
직접 이미지를 빌드하려면 다음 명령어를 사용하세요:
@ -170,7 +179,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
bash docker_build.sh --cuda <12.6|12.8> [--lite]
```
#### 실행 중인 컨테이너 접속하기 (Bash Shell)
### 실행 중인 컨테이너 접속하기 (Bash Shell)
컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다:
@ -178,30 +187,14 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
```
## 사전 학습된 모델
**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`도 한 쌍입니다.
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.
## 데이터셋 형식
텍스트 음성 합성(TTS) 주석 .list 파일 형식:
```
```text
vocal_path|speaker_name|language|text
```
언어 사전:
@ -209,21 +202,28 @@ vocal_path|speaker_name|language|text
- 'zh': 중국어
- 'ja': 일본어
- 'en': 영어
- 'ko': 한국어
- 'yue': 광둥어
예시:
```
```text
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
<div align="center">
## 미세 조정 및 추론
</div>
### WebUI 열기
#### 통합 패키지 사용자
`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오.
V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-v1.ps1`를 사용하십시오.
#### 기타
@ -231,21 +231,13 @@ V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-
python webui.py <언어(옵션)>
```
V1으로 전환하려면,
```bash
python webui.py v1 <언어(옵션)>
```
또는 WebUI에서 수동으로 버전을 전환하십시오.
### 미세 조정
#### 경로 자동 채우기가 지원됩니다
1. 오디오 경로를 입력하십시오.
2. 오디오를 작은 청크로 분할하십시오.
3. 노이즈 제거(옵션)
3. 노이즈 제거 (옵션)
4. ASR 수행
5. ASR 전사를 교정하십시오.
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
@ -254,7 +246,7 @@ python webui.py v1 <언어(옵션)>
#### 통합 패키지 사용자
`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
#### 기타
@ -270,8 +262,12 @@ python webui.py
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
<div align="center">
## V2 릴리스 노트
</div>
새로운 기능:
1. 한국어 및 광둥어 지원
@ -284,18 +280,12 @@ python webui.py
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 환경에서 V2를 사용하려면:
1. `pip install -r requirements.txt`를 사용하여 일부 패키지 업데이트
2. github에서 최신 코드를 클론하십시오.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오.
중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
<div align="center">
## V3 릴리스 노트
</div>
새로운 기능:
1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.)
@ -304,33 +294,23 @@ V1 환경에서 V2를 사용하려면:
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 환경에서 v3 사용하기:
1. `pip install -r requirements.txt`로 일부 패키지를 업데이트합니다.
2. 최신 코드를 github 에서 클론합니다.
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다.
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
<div align="center">
## V4 릴리스 노트
</div>
신규 기능:
1. **V4는 V3에서 발생하는 비정수 배율 업샘플링으로 인한 금속성 잡음 문제를 수정했으며, 소리가 먹먹해지는 것을 방지하기 위해 기본적으로 48kHz 오디오를 출력합니다 (V3는 기본적으로 24kHz만 지원)**. 개발자는 V4를 V3의 직접적인 대체 버전으로 보고 있지만 추가 테스트가 필요합니다.
[자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1/V2/V3 환경에서 V4로 전환 방법:
1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
2. GitHub에서 최신 코드를 클론하세요.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V4 사전 학습 모델(`gsv-v4-pretrained/s2v4.ckpt``gsv-v4-pretrained/vocoder.pth`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
<div align="center">
## V2Pro 릴리스 노트
</div>
신규 기능:
1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
@ -338,24 +318,18 @@ V1/V2/V3 환경에서 V4로 전환 방법:
2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.
V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
2. GitHub에서 최신 코드를 클론하세요.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V2Pro 사전 학습 모델(`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 및 `sv/pretrained_eres2netv2w24s4ep4.ckpt`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
<div align="center">
## 할 일 목록
- [x] **최우선순위:**
</div>
- [x] **최우선순위:**
- [x] 일본어 및 영어 지역화.
- [x] 사용자 가이드.
- [x] 일본어 및 영어 데이터셋 미세 조정 훈련.
- [ ] **기능:**
- [x] 제로샷 음성 변환 (5초) / 소량의 음성 변환 (1분).
- [x] TTS 속도 제어.
- [ ] ~~향상된 TTS 감정 제어.~~
@ -363,53 +337,16 @@ V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
- [x] 영어 및 일본어 텍스트 프론트 엔드 개선.
- [ ] 작은 크기와 큰 크기의 TTS 모델 개발.
- [x] Colab 스크립트.
- [ ] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
- [x] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
- [x] 더 나은 sovits 기본 모델 (향상된 오디오 품질).
- [ ] 모델 블렌딩.
## (추가적인) 명령줄에서 실행하는 방법
명령줄을 사용하여 UVR5용 WebUI 열기
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
```bash
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
--threshold <volume_threshold> \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
```
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
```bash
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
사용자 정의 목록 저장 경로가 활성화되었습니다.
<div align="center">
## 감사의 말
</div>
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
### 이론 연구
@ -420,7 +357,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
@ -452,8 +389,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
@Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다.
## 모든 기여자들에게 감사드립니다 ;)
<div align="center">
## 참여해주신 모든 분들께 감사드립니다
</div>
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
</a>

View File

@ -1,3 +1,5 @@
#
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
@ -25,7 +27,7 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
---
## Özellikler:
## Özellikler
1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin.
@ -39,40 +41,49 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
<div align="center">
## Kurulum
### Test Edilmiş Ortamlar
| Python Version | PyTorch Version | Device |
| -------------- | ---------------- | ------------- |
| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 |
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
| Python 3.11 | PyTorch 2.7.0 | Apple silicon |
| Python 3.9 | PyTorch 2.2.2 | CPU |
| Python Version | PyTorch Version | Device |
| -------------- | --------------- | ------------- |
| Python 3.10 | PyTorch 2.8.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.9.0 | CUDA 12.6 |
| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 |
| Python 3.11 | PyTorch 2.10.0 | CUDA 12.8 |
| Python 3.10 | PyTorch 2.8.0 | Apple silicon |
| Python 3.11 | PyTorch 2.9.0 | Apple silicon |
| Python 3.12 | PyTorch 2.10.0 | Apple silicon |
| Python 3.10 | PyTorch 2.9.0 | CPU |
</div>
### Windows
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main) ve `go-webui.bat` dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
Aşağıdaki komutları çalıştırarak programı yükleyin:
```pwsh
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
pwsh -F install.ps1 --help
```
### Linux
Aşağıdaki komutları çalıştırarak programı yükleyin:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### macOS
@ -82,54 +93,49 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
Aşağıdaki komutları çalıştırarak programı yükleyin:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
bash install.sh --help
```
### El ile Yükleme
#### Bağımlılıkları Yükleme
Aşağıdaki komutları çalıştırarak programı yükleyin:
```bash
conda create -n GPTSoVits python=3.10
conda activate GPTSoVits
conda create -n GPTSoVITS python=3.11
conda activate GPTSoVITS
conda install uv ffmpeg -c conda-forge
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
uv pip install ".[flash-attn]"
```
#### FFmpeg'i Yükleme
### Önceden Eğitilmiş Modeller
##### Conda Kullanıcıları
**Eğer `install.sh` başarıyla çalıştırılırsa, 1, 2 ve 3 numaralı adımları atlayabilirsiniz.**
```bash
conda activate GPTSoVits
conda install ffmpeg
```
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
##### Ubuntu/Debian Kullanıcıları
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip zipten çıkarın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
```
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `gsv_tools/uvr5/uvr5_weights` dizinine yerleştirin.
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `gsv_tools/uvr5/uvr5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
##### Windows Kullanıcıları
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_band_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `gsv_tools/asr/models` dizinine yerleştirin.
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin
5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `gsv_tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir.
##### MacOS Kullanıcıları
<div align="center">
```bash
brew install ffmpeg
```
## GPT-SoVITS Çalıştırma (Docker Kullanarak)
### GPT-SoVITS Çalıştırma (Docker Kullanarak)
</div>
#### Docker İmajı Seçimi
### Docker İmajı Seçimi
Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin:
@ -140,15 +146,15 @@ Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığ
- Docker Compose, mevcut dizindeki **tüm dosyaları** bağlayacaktır. Docker imajını kullanmadan önce lütfen proje kök dizinine geçin ve **en son kodu çekin**
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
#### Ortam Değişkenleri
### Ortam Değişkenleri
- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU'nuz destekliyorsa, bellek kullanımını azaltmak için `true` olarak ayarlayın.
#### Paylaşılan Bellek Yapılandırması
### Paylaşılan Bellek Yapılandırması
Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir.
#### Servis Seçimi
### Servis Seçimi
`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır:
@ -161,7 +167,7 @@ Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanı
docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
```
#### Docker İmajını Yerel Olarak Oluşturma
### Docker İmajını Yerel Olarak Oluşturma
Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
@ -169,7 +175,7 @@ Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
bash docker_build.sh --cuda <12.6|12.8> [--lite]
```
#### Çalışan Konteynere Erişim (Bash Shell)
### Çalışan Konteynere Erişim (Bash Shell)
Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz:
@ -177,30 +183,18 @@ Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz
docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
```
## Önceden Eğitilmiş Modeller
**Eğer `install.sh` başarıyla çalıştırılırsa, No.1,2,3 adımını atlayabilirsiniz.**
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayıın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir.
<div align="center">
## Veri Seti Formatı
</div>
TTS açıklama .list dosya formatı:
```
```text
vocal_path|speaker_name|language|text
```
Dil sözlüğü:
@ -213,18 +207,23 @@ Dil sözlüğü:
Örnek:
```
```text
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
<div align="center">
## İnce Ayar ve Çıkarım
</div>
### WebUI'yi Açın
#### Entegre Paket Kullanıcıları
`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın.
V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `go-webui-v1.ps1` kullanın.
#### Diğerleri
@ -232,14 +231,6 @@ V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `
python webui.py <dil(isteğe bağlı)>
```
V1'e geçmek istiyorsanız,
```bash
python webui.py v1 <dil(isteğe bağlı)>
```
veya WebUI'de manuel olarak sürüm değiştirin.
### İnce Ayar
#### Yol Otomatik Doldurma artık destekleniyor
@ -255,7 +246,7 @@ veya WebUI'de manuel olarak sürüm değiştirin.
#### Entegre Paket Kullanıcıları
`go-webui-v2.bat` dosyasına çift tıklayın veya `go-webui-v2.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
#### Diğerleri
@ -271,8 +262,12 @@ python webui.py
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
<div align="center">
## V2 Sürüm Notları
</div>
Yeni Özellikler:
1. Korece ve Kantonca destekler
@ -285,18 +280,12 @@ Yeni Özellikler:
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 ortamından V2'yi kullanmak için:
1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin
2. github'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin.
Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
<div align="center">
## V3 Sürüm Notları
</div>
Yeni Özellikler:
1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir).
@ -305,33 +294,23 @@ Yeni Özellikler:
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V2 ortamında V3 kullanımı:
1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin.
2. GitHub'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
<div align="center">
## V4 Sürüm Notları
</div>
Yeni Özellikler:
1. **V4, V3'te görülen non-integer upsample işleminden kaynaklanan metalik ses sorununu düzeltti ve sesin boğuklaşmasını önlemek için doğrudan 48kHz ses çıktısı sunar (V3 sadece 24kHz destekler)**. Yazar, V4'ün V3'ün yerine geçebileceğini belirtmiştir ancak daha fazla test yapılması gerekmektedir.
[Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1/V2/V3 ortamından V4'e geçiş:
1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
2. GitHub'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V4 ön eğitilmiş modelleri indirin (`gsv-v4-pretrained/s2v4.ckpt` ve `gsv-v4-pretrained/vocoder.pth`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
<div align="center">
## V2Pro Sürüm Notları
</div>
Yeni Özellikler:
1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
@ -339,18 +318,13 @@ Yeni Özellikler:
2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4'ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
2. GitHub'dan en son kodları klonlayın.
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V2Pro ön eğitilmiş modelleri indirin (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, ve `sv/pretrained_eres2netv2w24s4ep4.ckpt`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
<div align="center">
## Yapılacaklar Listesi
- [x] **Yüksek Öncelikli:**
</div>
- [x] **Yüksek Öncelikli:**
- [x] Japonca ve İngilizceye yerelleştirme.
- [x] Kullanıcı kılavuzu.
- [x] Japonca ve İngilizce veri seti ince ayar eğitimi.
@ -363,53 +337,16 @@ V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
- [x] İngilizce ve Japonca metin ön ucunu iyileştirme.
- [ ] Küçük ve büyük boyutlu metinden konuşmaya modelleri geliştirme.
- [x] Colab betikleri.
- [ ] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat).
- [x] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat).
- [x] daha iyi sovits temel modeli (geliştirilmiş ses kalitesi)
- [ ] model karışımı
## (Ekstra) Komut satırından çalıştırma yöntemi
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
```bash
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
```bash
python audio_slicer.py \
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
--output_root "<alt_bölümlere_ayrılmış_ses_kliplerinin_kaydedileceği_dizin>" \
--threshold <ses_eşiği> \
--min_length <her_bir_alt_klibin_minimum_süresi> \
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
```
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
```bash
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
```
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
```bash
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
```
Özel bir liste kaydetme yolu etkinleştirildi
<div align="center">
## Katkı Verenler
</div>
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
### Teorik Araştırma
@ -420,7 +357,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
@ -452,8 +389,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
@Naozumi520'ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım.
<div align="center">
## Tüm katkıda bulunanlara çabaları için teşekkürler
</div>
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
</a>

View File

@ -1 +0,0 @@
faster-whisper

View File

@ -1,8 +1,9 @@
import os
import random
import torch
import torchaudio
import torch.utils.data
import torchaudio
import torchaudio.functional as aF
@ -37,10 +38,10 @@ def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True):
def get_dataset_filelist(a):
with open(a.input_training_file, "r", encoding="utf-8") as fi:
with open(a.input_training_file, encoding="utf-8") as fi:
training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
with open(a.input_validation_file, "r", encoding="utf-8") as fi:
with open(a.input_validation_file, encoding="utf-8") as fi:
validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
return training_indexes, validation_indexes

View File

@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.utils import weight_norm, spectral_norm
import torch.nn.functional as F
from torch.nn.utils import spectral_norm, weight_norm
# from utils import init_weights, get_padding
@ -16,7 +16,7 @@ def init_weights(m, mean=0.0, std=0.01):
import numpy as np
from typing import Tuple, List
LRELU_SLOPE = 0.1
@ -75,7 +75,7 @@ class ConvNeXtBlock(nn.Module):
class APNet_BWE_Model(torch.nn.Module):
def __init__(self, h):
super(APNet_BWE_Model, self).__init__()
super().__init__()
self.h = h
self.adanorm_num_embeddings = None
layer_scale_init_value = 1 / h.ConvNeXt_layers
@ -125,7 +125,7 @@ class APNet_BWE_Model(torch.nn.Module):
x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2)
x_pha = self.norm_pre_pha(x_pha.transpose(1, 2)).transpose(1, 2)
for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha):
for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha, strict=False):
x_mag = x_mag + x_pha
x_pha = x_pha + x_mag
x_mag = conv_block_mag(x_mag, cond_embedding_id=None)
@ -146,9 +146,9 @@ class APNet_BWE_Model(torch.nn.Module):
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
super().__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if not use_spectral_norm else spectral_norm
self.convs = nn.ModuleList(
[
norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@ -185,7 +185,7 @@ class DiscriminatorP(torch.nn.Module):
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self):
super(MultiPeriodDiscriminator, self).__init__()
super().__init__()
self.discriminators = nn.ModuleList(
[
DiscriminatorP(2),
@ -201,7 +201,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
for _i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
@ -215,7 +215,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
class MultiResolutionAmplitudeDiscriminator(nn.Module):
def __init__(
self,
resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
resolutions: tuple[tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
num_embeddings: int = None,
):
super().__init__()
@ -225,7 +225,7 @@ class MultiResolutionAmplitudeDiscriminator(nn.Module):
def forward(
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
) -> tuple[list[torch.Tensor], list[torch.Tensor], list[list[torch.Tensor]], list[list[torch.Tensor]]]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
@ -245,7 +245,7 @@ class MultiResolutionAmplitudeDiscriminator(nn.Module):
class DiscriminatorAR(nn.Module):
def __init__(
self,
resolution: Tuple[int, int, int],
resolution: tuple[int, int, int],
channels: int = 64,
in_channels: int = 1,
num_embeddings: int = None,
@ -269,7 +269,7 @@ class DiscriminatorAR(nn.Module):
def forward(
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
) -> tuple[torch.Tensor, list[torch.Tensor]]:
fmap = []
x = x.squeeze(1)
@ -309,7 +309,7 @@ class DiscriminatorAR(nn.Module):
class MultiResolutionPhaseDiscriminator(nn.Module):
def __init__(
self,
resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
resolutions: tuple[tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
num_embeddings: int = None,
):
super().__init__()
@ -319,7 +319,7 @@ class MultiResolutionPhaseDiscriminator(nn.Module):
def forward(
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
) -> tuple[list[torch.Tensor], list[torch.Tensor], list[list[torch.Tensor]], list[list[torch.Tensor]]]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
@ -339,7 +339,7 @@ class MultiResolutionPhaseDiscriminator(nn.Module):
class DiscriminatorPR(nn.Module):
def __init__(
self,
resolution: Tuple[int, int, int],
resolution: tuple[int, int, int],
channels: int = 64,
in_channels: int = 1,
num_embeddings: int = None,
@ -363,7 +363,7 @@ class DiscriminatorPR(nn.Module):
def forward(
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
) -> tuple[torch.Tensor, list[torch.Tensor]]:
fmap = []
x = x.squeeze(1)
@ -402,8 +402,8 @@ class DiscriminatorPR(nn.Module):
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
for dr, dg in zip(fmap_r, fmap_g, strict=False):
for rl, gl in zip(dr, dg, strict=False):
loss += torch.mean(torch.abs(rl - gl))
return loss
@ -413,7 +413,7 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
for dr, dg in zip(disc_real_outputs, disc_generated_outputs, strict=False):
r_loss = torch.mean(torch.clamp(1 - dr, min=0))
g_loss = torch.mean(torch.clamp(1 + dg, min=0))
loss += r_loss + g_loss

View File

@ -5,9 +5,9 @@ def get_models():
"large-v2",
"large-v3",
"large-v3-turbo",
#"distil-large-v2",
#"distil-large-v3",
#"distil-large-v3.5",
# "distil-large-v2",
# "distil-large-v3",
# "distil-large-v3.5",
]
return model_size_list

View File

@ -9,33 +9,34 @@ from huggingface_hub import snapshot_download as snapshot_download_hf
from modelscope import snapshot_download as snapshot_download_ms
from tqdm import tqdm
from tools.asr.config import get_models
from tools.asr.funasr_asr import only_asr
from tools.my_utils import load_cudnn
from gsv_tools.asr.config import get_models
from gsv_tools.asr.funasr_asr import only_asr
from gsv_tools.my_utils import load_cudnn
# fmt: off
language_code_list = [
"af", "am", "ar", "as", "az",
"ba", "be", "bg", "bn", "bo",
"br", "bs", "ca", "cs", "cy",
"da", "de", "el", "en", "es",
"et", "eu", "fa", "fi", "fo",
"fr", "gl", "gu", "ha", "haw",
"he", "hi", "hr", "ht", "hu",
"hy", "id", "is", "it", "ja",
"jw", "ka", "kk", "km", "kn",
"ko", "la", "lb", "ln", "lo",
"lt", "lv", "mg", "mi", "mk",
"ml", "mn", "mr", "ms", "mt",
"my", "ne", "nl", "nn", "no",
"oc", "pa", "pl", "ps", "pt",
"ro", "ru", "sa", "sd", "si",
"sk", "sl", "sn", "so", "sq",
"sr", "su", "sv", "sw", "ta",
"te", "tg", "th", "tk", "tl",
"tr", "tt", "uk", "ur", "uz",
"af", "am", "ar", "as", "az",
"ba", "be", "bg", "bn", "bo",
"br", "bs", "ca", "cs", "cy",
"da", "de", "el", "en", "es",
"et", "eu", "fa", "fi", "fo",
"fr", "gl", "gu", "ha", "haw",
"he", "hi", "hr", "ht", "hu",
"hy", "id", "is", "it", "ja",
"jw", "ka", "kk", "km", "kn",
"ko", "la", "lb", "ln", "lo",
"lt", "lv", "mg", "mi", "mk",
"ml", "mn", "mr", "ms", "mt",
"my", "ne", "nl", "nn", "no",
"oc", "pa", "pl", "ps", "pt",
"ro", "ru", "sa", "sd", "si",
"sk", "sl", "sn", "so", "sq",
"sr", "su", "sv", "sw", "ta",
"te", "tg", "th", "tk", "tl",
"tr", "tt", "uk", "ur", "uz",
"vi", "yi", "yo", "zh", "yue",
"auto"]
"auto"]
# fmt: on
@ -52,20 +53,20 @@ def download_model(model_size: str):
if "distil" in model_size:
if "3.5" in model_size:
repo_id = "distil-whisper/distil-large-v3.5-ct2"
model_path = "tools/asr/models/faster-distil-whisper-large-v3.5"
model_path = "gsv_tools/asr/models/faster-distil-whisper-large-v3.5"
else:
repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
elif model_size == "large-v3-turbo":
repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
model_path = "tools/asr/models/faster-whisper-large-v3-turbo"
model_path = "gsv_tools/asr/models/faster-whisper-large-v3-turbo"
else:
repo_id = f"Systran/faster-whisper-{model_size}"
model_path = (
model_path or f"tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}"
model_path or f"gsv_tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}"
)
else:
repo_id = "XXXXRT/faster-whisper"
model_path = "tools/asr/models"
model_path = "gsv_tools/asr/models"
files: list[str] = [
"config.json",

View File

@ -1,5 +1,3 @@
# -*- coding:utf-8 -*-
import argparse
import os
import traceback
@ -8,6 +6,7 @@ from funasr import AutoModel
from modelscope import snapshot_download
from tqdm import tqdm
funasr_models = {} # 存储模型避免重复加载
@ -23,27 +22,27 @@ def only_asr(input_file, language):
def create_model(language="zh"):
if language == "zh":
path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = "gsv_tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = "gsv_tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
path_asr = "gsv_tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
snapshot_download(
"iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
local_dir="tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
local_dir="gsv_tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
)
snapshot_download(
"iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
local_dir="tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
local_dir="gsv_tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
)
snapshot_download(
"iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
local_dir="gsv_tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
)
model_revision = "v2.0.4"
elif language == "yue":
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
path_asr = "gsv_tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
snapshot_download(
"iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
local_dir="tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
local_dir="gsv_tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
)
path_vad = path_punc = None
vad_model_revision = punc_model_revision = ""

View File

@ -1,23 +1,24 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import os
import sys
AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
sys.path.append(AP_BWE_main_dir_path)
import json
import torch
import torchaudio.functional as aF
# from attrdict import AttrDict####will be bug in py3.10
from datasets1.dataset import amp_pha_stft, amp_pha_istft
# from attrdict import AttrDict####will be bug in py3.10
from datasets1.dataset import amp_pha_istft, amp_pha_stft
from models.model import APNet_BWE_Model
class AP_BWE:
def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
if checkpoint_file == None:
checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path)
if os.path.exists(checkpoint_file) == False:
if checkpoint_file is None:
checkpoint_file = f"{AP_BWE_main_dir_path}/24kto48k/g_24kto48k.zip"
if not os.path.exists(checkpoint_file):
raise FileNotFoundError
config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
with open(config_file) as f:

View File

@ -1,12 +1,13 @@
import os
import argparse
import os
import traceback
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from tqdm import tqdm
path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k"
path_denoise = "gsv_tools/denoise-model/speech_frcrn_ans_cirm_16k"
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
@ -17,7 +18,7 @@ def execute_denoise(input_folder, output_folder):
# print(list(os.listdir(input_folder).sort()))
for name in tqdm(os.listdir(input_folder)):
try:
ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name))
ans(f"{input_folder}/{name}", output_path=f"{output_folder}/{name}")
except:
traceback.print_exc()

View File

@ -2,11 +2,12 @@ import json
import locale
import os
I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
def load_language_list(language):
with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f:
with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), encoding="utf-8") as f:
language_list = json.load(f)
return language_list

View File

@ -4,6 +4,7 @@ import json
import os
from collections import OrderedDict
I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言
TITLE_LEN: int = 60 # 标题显示长度
@ -36,7 +37,7 @@ def scan_i18n_strings():
print(" Scanning Files and Extracting i18n Strings ".center(TITLE_LEN, "="))
for filename in glob.iglob("**/*.py", recursive=True):
try:
with open(filename, "r", encoding="utf-8") as f:
with open(filename, encoding="utf-8") as f:
code = f.read()
if "I18nAuto" in code:
tree = ast.parse(code)
@ -57,7 +58,7 @@ def update_i18n_json(json_file, standard_keys):
standard_keys = sorted(standard_keys)
print(f" Process {json_file} ".center(TITLE_LEN, "="))
# 读取 JSON 文件
with open(json_file, "r", encoding="utf-8") as f:
with open(json_file, encoding="utf-8") as f:
json_data = json.load(f, object_pairs_hook=OrderedDict)
# 打印处理前的 JSON 条目数
len_before = len(json_data)

View File

@ -8,7 +8,8 @@ import gradio as gr
import numpy as np
import pandas as pd
from tools.i18n.i18n import I18nAuto
from gsv_tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
@ -48,7 +49,7 @@ def clean_path(path_str: str):
def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False):
files_status = []
if is_train == True and file_list:
if is_train and file_list:
file_list.append(os.path.join(file_list[0], "2-name2text.txt"))
file_list.append(os.path.join(file_list[0], "3-bert"))
file_list.append(os.path.join(file_list[0], "4-cnhubert"))
@ -61,7 +62,7 @@ def check_for_existance(file_list: list = None, is_train=False, is_dataset_proce
files_status.append(False)
if sum(files_status) != len(files_status):
if is_train:
for file, status in zip(file_list, files_status):
for file, status in zip(file_list, files_status, strict=False):
if status:
pass
else:
@ -97,13 +98,13 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
if not os.path.isdir(audio_path):
gr.Warning(i18n("请填入正确的音频文件夹路径"))
return
with open(list_path, "r", encoding="utf8") as f:
with open(list_path, encoding="utf8") as f:
line = f.readline().strip("\n").split("\n")
wav_name, _, __, ___ = line[0].split("|")
wav_name = clean_path(wav_name)
if audio_path != "" and audio_path != None:
if audio_path != "" and audio_path is not None:
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s" % (audio_path, wav_name)
wav_path = f"{audio_path}/{wav_name}"
else:
wav_path = wav_name
if os.path.exists(wav_path):
@ -117,7 +118,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
path_list.append(os.path.join(path_list[0], "5-wav32k"))
path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv"))
phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
with open(phone_path, "r", encoding="utf-8") as f:
with open(phone_path, encoding="utf-8") as f:
if f.read(1):
...
else:

View File

@ -1,13 +1,14 @@
import os
import sys
import numpy as np
import traceback
import numpy as np
from scipy.io import wavfile
from slicer2 import Slicer
# parent_directory = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(parent_directory)
from tools.my_utils import load_audio
from slicer2 import Slicer
from gsv_tools.my_utils import load_audio
def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):

View File

@ -1,5 +1,7 @@
import sys
from tools.i18n.i18n import I18nAuto, scan_language_list
from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
i18n = I18nAuto(language=language)
@ -9,6 +11,7 @@ import json
import os
import uuid
try:
import gradio.analytics as analytics
@ -21,6 +24,7 @@ import librosa
import numpy as np
import soundfile
g_json_key_text = ""
g_json_key_path = ""
g_load_file = ""
@ -113,7 +117,7 @@ def b_delete_audio(*checkbox_list):
change = False
for i, checkbox in reversed(list(enumerate(checkbox_list))):
if g_index + i < len(g_data_json):
if checkbox == True:
if checkbox:
g_data_json.pop(g_index + i)
change = True
@ -150,7 +154,7 @@ def b_audio_split(audio_breakpoint, *checkbox_list):
global g_data_json, g_max_json_index
checked_index = []
for i, checkbox in enumerate(checkbox_list):
if checkbox == True and g_index + i < len(g_data_json):
if checkbox and g_index + i < len(g_data_json):
checked_index.append(g_index + i)
if len(checked_index) == 1:
index = checked_index[0]
@ -182,7 +186,7 @@ def b_merge_audio(interval_r, *checkbox_list):
audios_path = []
audios_text = []
for i, checkbox in enumerate(checkbox_list):
if checkbox == True and g_index + i < len(g_data_json):
if checkbox and g_index + i < len(g_data_json):
checked_index.append(g_index + i)
if len(checked_index) > 1:
@ -237,7 +241,7 @@ def b_save_list():
def b_load_json():
global g_data_json, g_max_json_index
with open(g_load_file, "r", encoding="utf-8") as file:
with open(g_load_file, encoding="utf-8") as file:
g_data_json = file.readlines()
g_data_json = [json.loads(line) for line in g_data_json]
g_max_json_index = len(g_data_json) - 1
@ -245,7 +249,7 @@ def b_load_json():
def b_load_list():
global g_data_json, g_max_json_index
with open(g_load_file, "r", encoding="utf-8") as source:
with open(g_load_file, encoding="utf-8") as source:
data_list = source.readlines()
for _ in data_list:
data = _.split("|")

View File

@ -1,7 +1,7 @@
from packaging import version
import torch
from torch import nn, einsum
import torch.nn.functional as F
from packaging import version
from torch import einsum, nn
def exists(val):

View File

@ -1,21 +1,20 @@
from collections.abc import Callable
from functools import partial
import torch
from torch import nn
from torch.nn import Module, ModuleList
import torch.nn.functional as F
from einops import pack, rearrange, unpack
from einops.layers.torch import Rearrange
from bs_roformer.attend import Attend
from torch.utils.checkpoint import checkpoint
from typing import Tuple, Optional, Callable
# from beartype.typing import Tuple, Optional, List, Callable
# from beartype import beartype
from rotary_embedding_torch import RotaryEmbedding
from torch import nn
from torch.nn import Module, ModuleList
from torch.utils.checkpoint import checkpoint
from bs_roformer.attend import Attend
from einops import rearrange, pack, unpack
from einops.layers.torch import Rearrange
# helper functions
@ -192,7 +191,7 @@ class Transformer(Module):
class BandSplit(Module):
# @beartype
def __init__(self, dim, dim_inputs: Tuple[int, ...]):
def __init__(self, dim, dim_inputs: tuple[int, ...]):
super().__init__()
self.dim_inputs = dim_inputs
self.to_features = ModuleList([])
@ -206,7 +205,7 @@ class BandSplit(Module):
x = x.split(self.dim_inputs, dim=-1)
outs = []
for split_input, to_feature in zip(x, self.to_features):
for split_input, to_feature in zip(x, self.to_features, strict=False):
split_output = to_feature(split_input)
outs.append(split_output)
@ -219,7 +218,7 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
net = []
dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:], strict=False)):
is_last = ind == (len(dims) - 2)
net.append(nn.Linear(layer_dim_in, layer_dim_out))
@ -234,15 +233,13 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
class MaskEstimator(Module):
# @beartype
def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
super().__init__()
self.dim_inputs = dim_inputs
self.to_freqs = ModuleList([])
dim_hidden = dim * mlp_expansion_factor
for dim_in in dim_inputs:
net = []
mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
self.to_freqs.append(mlp)
@ -252,7 +249,7 @@ class MaskEstimator(Module):
outs = []
for band_features, mlp in zip(x, self.to_freqs):
for band_features, mlp in zip(x, self.to_freqs, strict=False):
freq_out = mlp(band_features)
outs.append(freq_out)
@ -339,7 +336,7 @@ class BSRoformer(Module):
time_transformer_depth=2,
freq_transformer_depth=2,
linear_transformer_depth=0,
freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
freqs_per_bands: tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
# in the paper, they divide into ~60 bands, test with 1 for starters
dim_head=64,
heads=8,
@ -352,10 +349,10 @@ class BSRoformer(Module):
# 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
stft_win_length=2048,
stft_normalized=False,
stft_window_fn: Optional[Callable] = None,
stft_window_fn: Callable | None = None,
mask_estimator_depth=2,
multi_stft_resolution_loss_weight=1.0,
multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
multi_stft_resolutions_window_sizes: tuple[int, ...] = (4096, 2048, 1024, 512, 256),
multi_stft_hop_size=147,
multi_stft_normalized=False,
multi_stft_window_fn: Callable = torch.hann_window,

View File

@ -1,23 +1,20 @@
from collections.abc import Callable
from functools import partial
import torch
from torch import nn
from torch.nn import Module, ModuleList
import torch.nn.functional as F
from einops import pack, rearrange, reduce, repeat, unpack
from einops.layers.torch import Rearrange
from librosa import filters
from bs_roformer.attend import Attend
from torch.utils.checkpoint import checkpoint
from typing import Tuple, Optional, Callable
# from beartype.typing import Tuple, Optional, List, Callable
# from beartype import beartype
from rotary_embedding_torch import RotaryEmbedding
from torch import nn
from torch.nn import Module, ModuleList
from torch.utils.checkpoint import checkpoint
from einops import rearrange, pack, unpack, reduce, repeat
from einops.layers.torch import Rearrange
from librosa import filters
from bs_roformer.attend import Attend
# helper functions
@ -201,7 +198,7 @@ class Transformer(Module):
class BandSplit(Module):
# @beartype
def __init__(self, dim, dim_inputs: Tuple[int, ...]):
def __init__(self, dim, dim_inputs: tuple[int, ...]):
super().__init__()
self.dim_inputs = dim_inputs
self.to_features = ModuleList([])
@ -215,7 +212,7 @@ class BandSplit(Module):
x = x.split(self.dim_inputs, dim=-1)
outs = []
for split_input, to_feature in zip(x, self.to_features):
for split_input, to_feature in zip(x, self.to_features, strict=False):
split_output = to_feature(split_input)
outs.append(split_output)
@ -228,7 +225,7 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
net = []
dims = (dim_in, *((dim_hidden,) * depth), dim_out)
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:], strict=False)):
is_last = ind == (len(dims) - 2)
net.append(nn.Linear(layer_dim_in, layer_dim_out))
@ -243,15 +240,13 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
class MaskEstimator(Module):
# @beartype
def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
super().__init__()
self.dim_inputs = dim_inputs
self.to_freqs = ModuleList([])
dim_hidden = dim * mlp_expansion_factor
for dim_in in dim_inputs:
net = []
mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
self.to_freqs.append(mlp)
@ -261,7 +256,7 @@ class MaskEstimator(Module):
outs = []
for band_features, mlp in zip(x, self.to_freqs):
for band_features, mlp in zip(x, self.to_freqs, strict=False):
freq_out = mlp(band_features)
outs.append(freq_out)
@ -296,10 +291,10 @@ class MelBandRoformer(Module):
# 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
stft_win_length=2048,
stft_normalized=False,
stft_window_fn: Optional[Callable] = None,
stft_window_fn: Callable | None = None,
mask_estimator_depth=1,
multi_stft_resolution_loss_weight=1.0,
multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
multi_stft_resolutions_window_sizes: tuple[int, ...] = (4096, 2048, 1024, 512, 256),
multi_stft_hop_size=147,
multi_stft_normalized=False,
multi_stft_window_fn: Callable = torch.hann_window,

View File

@ -10,12 +10,13 @@ import torch.nn as nn
import yaml
from tqdm import tqdm
warnings.filterwarnings("ignore")
class Roformer_Loader:
def get_config(self, config_path):
with open(config_path, "r", encoding="utf-8") as f:
with open(config_path, encoding="utf-8") as f:
# use fullloader to load tag !!python/tuple, code can be improved
config = yaml.load(f, Loader=yaml.FullLoader)
return config
@ -104,7 +105,7 @@ class Roformer_Loader:
model = MelBandRoformer(**dict(self.config["model"]))
else:
print("Error: Unknown model: {}".format(self.model_type))
print(f"Error: Unknown model: {self.model_type}")
model = None
return model
@ -192,9 +193,11 @@ class Roformer_Loader:
progress_bar.close()
if self.config["training"]["target_instrument"] is None:
return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources)}
return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources, strict=False)}
else:
return {k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources)}
return {
k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources, strict=False)
}
def run_folder(self, input, vocal_root, others_root, format):
self.model.eval()
@ -210,8 +213,8 @@ class Roformer_Loader:
try:
mix, sr = librosa.load(path, sr=sample_rate, mono=False)
except Exception as e:
print("Can read track: {}".format(path))
print("Error message: {}".format(str(e)))
print(f"Can read track: {path}")
print(f"Error message: {str(e)}")
return
# in case if model only supports mono tracks
@ -232,17 +235,17 @@ class Roformer_Loader:
other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument]
other = mix_orig - res[target_instrument] # caculate other instruments
path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, target_instrument)
path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other_instruments[0])
path_vocal = f"{vocal_root}/{file_base_name}_{target_instrument}.wav"
path_other = f"{others_root}/{file_base_name}_{other_instruments[0]}.wav"
self.save_audio(path_vocal, res[target_instrument].T, sr, format)
self.save_audio(path_other, other.T, sr, format)
else:
# if target instrument is not specified, save the first instrument as vocal and the rest as others
vocal_inst = self.config["training"]["instruments"][0]
path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, vocal_inst)
path_vocal = f"{vocal_root}/{file_base_name}_{vocal_inst}.wav"
self.save_audio(path_vocal, res[vocal_inst].T, sr, format)
for other in self.config["training"]["instruments"][1:]: # save other instruments
path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other)
path_other = f"{others_root}/{file_base_name}_{other}.wav"
self.save_audio(path_other, res[other].T, sr, format)
def save_audio(self, path, data, sr, format):
@ -253,7 +256,7 @@ class Roformer_Loader:
sf.write(path, data, sr)
else:
sf.write(path, data, sr)
os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format))
os.system(f'ffmpeg -i "{path}" -vn "{path[:-3] + format}" -q:a 2 -y')
try:
os.remove(path)
except:
@ -275,7 +278,7 @@ class Roformer_Loader:
if self.model_type is None:
# if model_type is still None, raise an error
raise ValueError(
"Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
"Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'gsv_tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
)
self.config = self.get_default_config()
else:
@ -290,12 +293,12 @@ class Roformer_Loader:
# else it's a mel_band_roformer model
self.model_type = "mel_band_roformer"
print("Detected model type: {}".format(self.model_type))
print(f"Detected model type: {self.model_type}")
model = self.get_model_from_config()
state_dict = torch.load(model_path, map_location="cpu")
model.load_state_dict(state_dict)
if is_half == False:
if not is_half:
self.model = model.to(device)
else:
self.model = model.half().to(device)

View File

@ -38,7 +38,7 @@ def make_pair(mix_dir, inst_dir):
[os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts]
)
filelist = list(zip(X_list, y_list))
filelist = list(zip(X_list, y_list, strict=False))
return filelist
@ -138,10 +138,10 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset)
patch_dir = f"cs{cropsize}_sr{sr}_hl{hop_length}_nf{n_fft}_of{offset}"
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
for _i, (X_path, y_path) in enumerate(tqdm(filelist)):
basename = os.path.splitext(os.path.basename(X_path))[0]
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
@ -154,7 +154,7 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
outpath = os.path.join(patch_dir, f"{basename}_p{j}.npz")
start = j * roi_size
if not os.path.exists(outpath):
np.savez(

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
@ -64,7 +64,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -83,7 +83,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
@ -64,7 +64,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -83,7 +83,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
@ -64,7 +64,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -83,7 +83,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
@ -64,7 +64,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -83,7 +83,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
@ -64,7 +64,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -83,7 +83,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
@ -64,7 +64,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -83,7 +83,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),

View File

@ -7,7 +7,7 @@ from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
super().__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
@ -41,7 +41,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
super().__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -64,7 +64,7 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
super(ASPPModule, self).__init__()
super().__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
@ -94,7 +94,7 @@ class ASPPModule(nn.Module):
class LSTMModule(nn.Module):
def __init__(self, nin_conv, nin_lstm, nout_lstm):
super(LSTMModule, self).__init__()
super().__init__()
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())

View File

@ -1,6 +1,7 @@
import json
import pathlib
default_param = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9 # training only
@ -41,7 +42,7 @@ def int_keys(d):
return r
class ModelParameters(object):
class ModelParameters:
def __init__(self, config_path=""):
if ".pth" == pathlib.Path(config_path).suffix:
import zipfile
@ -49,7 +50,7 @@ class ModelParameters(object):
with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
with open(config_path) as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param

Some files were not shown because too many files have changed in this diff Show More