update docs

XXXXRT666 2025-03-29 09:47:23 +00:00
parent 3d6ecf3d79
commit f400301e89
8 changed files with 314 additions and 203 deletions

View File

@@ -1,42 +1,37 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
       "cell_type": "markdown",
-      "source": [
-        "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
-      ],
       "metadata": {
         "id": "himHYZmra7ix"
-      }
+      },
+      "source": [
+        "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "e9b7iFV3dm1f"
       },
+      "outputs": [],
       "source": [
         "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
         "%cd GPT-SoVITS\n",
         "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
+        "!pip install -r extra-req.txt --no-deps\n",
         "!pip install -r requirements.txt"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "0NgxXg5sjv7z"
+      },
+      "outputs": [],
       "source": [
         "# @title Download pretrained models 下载预训练模型\n",
         "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@@ -53,16 +48,16 @@
         "!git clone https://huggingface.co/Delik/uvr5_weights\n",
         "!git config core.sparseCheckout true\n",
         "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-      ],
-      "metadata": {
-        "id": "0NgxXg5sjv7z",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "cPDEH-9czOJF"
+      },
+      "outputs": [],
       "source": [
         "#@title Create folder models 创建文件夹模型\n",
         "import os\n",
@@ -77,16 +72,16 @@
         "    print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
         "\n",
         "print(\"All folders have been created. (所有文件夹均已创建。)\")"
-      ],
-      "metadata": {
-        "cellView": "form",
-        "id": "cPDEH-9czOJF"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "vbZY-LnM0tzq"
+      },
+      "outputs": [],
      "source": [
        "import requests\n",
        "import zipfile\n",
@@ -124,29 +119,35 @@
        "    shutil.move(source_path, destination_path)\n",
        "\n",
        "print(f'Model downloaded. (模型已下载。)')"
-      ],
-      "metadata": {
-        "cellView": "form",
-        "id": "vbZY-LnM0tzq"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "4oRGUzkrk8C7"
+      },
+      "outputs": [],
      "source": [
        "# @title launch WebUI 启动WebUI\n",
        "!/usr/local/bin/pip install ipykernel\n",
        "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
        "%cd /content/GPT-SoVITS/\n",
        "!/usr/local/bin/python webui.py"
-      ],
-      "metadata": {
-        "id": "4oRGUzkrk8C7",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    }
  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
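
The change repeated across every file in this commit is this two-step dependency install. A minimal sketch of the sequence, run from the GPT-SoVITS repo root; the rationale for `--no-deps` is inferred from pip's documented behavior rather than stated anywhere in the commit:

```bash
# Two-step install introduced by this commit: --no-deps makes pip install
# only the packages listed in extra-req.txt, skipping their declared
# dependencies, so nothing in that file can drag in versions that conflict
# with the pins resolved from requirements.txt.
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```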

View File

@@ -1,6 +1,5 @@
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
 A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
@@ -77,7 +76,8 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### Install Manually
@@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
 Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
 ##### MacOS Users
 ```bash
 brew install ffmpeg
 ```
@@ -112,7 +113,8 @@ brew install ffmpeg
 #### Install Dependencies
 ```bash
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### Using Docker
@@ -200,6 +202,7 @@ if you want to switch to V1,then
 ```bash
 python webui.py v1 <language(optional)>
 ```
 Or manually switch version in WebUI
 ### Finetune
@@ -224,11 +227,13 @@ Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference
 ```bash
 python GPT_SoVITS/inference_webui.py <language(optional)>
 ```
 OR
 ```bash
 python webui.py
 ```
 then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
 ## V2 Release Notes
@@ -243,11 +248,11 @@ New Features:
 4. Improved synthesis quality for low-quality reference audio
-[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 Use v2 from v1 environment:
-1. `pip install -r requirements.txt -c constraints.txt` to update some packages
+1. `pip install -r requirements.txt` to update some packages
 2. Clone the latest codes from github.
@@ -263,11 +268,11 @@ New Features:
 2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
-[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 Use v3 from v2 environment:
-1. `pip install -r requirements.txt -c constraints.txt` to update some packages
+1. `pip install -r requirements.txt` to update some packages
 2. Clone the latest codes from github.
@@ -275,7 +280,6 @@ Use v3 from v2 environment:
 additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
 ## Todo List
 - [x] **High Priority:**
@@ -297,15 +301,20 @@ Use v3 from v2 environment:
 - [ ] model mix
 ## (Additional) Method for running from the command line
 Use the command line to open the WebUI for UVR5
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- If you can't open a browser, follow the format below for UVR processing. This uses mdxnet for audio processing
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 This is how the audio segmentation of the dataset is done using the command line
 ```
 python audio_slicer.py \
     --input_path "<path_to_original_audio_file_or_directory>" \
@@ -315,16 +324,21 @@ python audio_slicer.py \
     --min_interval <shortest_time_gap_between_adjacent_subclips> \
     --hop_size <step_size_for_computing_volume_curve>
 ```
 This is how dataset ASR processing is done using the command line (Only Chinese)
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
 ASR processing is performed through Faster_Whisper (ASR marking except Chinese)
 (No progress bars, GPU performance may cause time delays)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 A custom list save path is enabled
 ## Credits
@@ -332,6 +346,7 @@ A custom list save path is enabled
 Special thanks to the following projects and contributors:
 ### Theoretical Research
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### Pretrained Models
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### Text Frontend for Inference
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI Tools
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
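
The slicer placeholders above map onto an invocation like the following sketch. Every path and numeric value is illustrative only, not a project default, and the flag names used are the ones visible in this commit's copies of the block; anything the hunks elide is left out rather than guessed:

```bash
# Hypothetical audio_slicer.py run filling in the documented placeholders:
# slice one speaker's recording into subclips of at least 4000 ms, separated
# by silent gaps of at least 300 ms, with a 10 ms hop for the volume curve.
python audio_slicer.py \
    --input_path "/data/raw/speaker01.wav" \
    --output_root "/data/sliced/speaker01" \
    --min_length 4000 \
    --min_interval 300 \
    --hop_size 10
```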

View File

@@ -1,23 +1,10 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "include_colab_link": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "view-in-github"
       },
       "source": [
         "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@@ -25,18 +12,20 @@
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "环境配置 environment"
-      ],
       "metadata": {
         "id": "_o6a8GS2lWQM"
-      }
+      },
+      "source": [
+        "环境配置 environment"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "e9b7iFV3dm1f"
       },
+      "outputs": [],
       "source": [
         "!pip install -q condacolab\n",
         "# Setting up condacolab and installing packages\n",
@@ -47,13 +36,17 @@
         "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
         "%cd -q /content/GPT-SoVITS\n",
         "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
+        "!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
         "!/usr/local/bin/pip install -r requirements.txt"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0NgxXg5sjv7z"
+      },
+      "outputs": [],
       "source": [
         "# @title Download pretrained models 下载预训练模型\n",
         "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@@ -71,27 +64,35 @@
         "!git clone https://huggingface.co/Delik/uvr5_weights\n",
         "!git config core.sparseCheckout true\n",
         "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-      ],
-      "metadata": {
-        "id": "0NgxXg5sjv7z"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "4oRGUzkrk8C7"
+      },
+      "outputs": [],
       "source": [
         "# @title launch WebUI 启动WebUI\n",
         "!/usr/local/bin/pip install ipykernel\n",
         "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
         "%cd /content/GPT-SoVITS/\n",
         "!/usr/local/bin/python webui.py"
-      ],
-      "metadata": {
-        "id": "4oRGUzkrk8C7"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     }
   ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "include_colab_link": true,
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
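
Both notebooks gate the launch on the same one-line patch of config.py. A hedged reading of that sed invocation (that line 10 holds a boolean share flag such as `is_share` is an assumption; the notebooks reference only the line number):

```bash
# '10s/False/True/' is an address-limited substitution: s/False/True/ is
# applied only to line 10 of the file, and -i rewrites the file in place.
# Assuming line 10 reads something like `is_share = False` (hypothetical),
# this flips it before webui.py starts.
sed -i '10s/False/True/' /content/GPT-SoVITS/config.py
sed -n '10p' /content/GPT-SoVITS/config.py  # print line 10 to verify the flip
```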

View File

@@ -76,7 +76,8 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### 手动安装
@@ -104,6 +105,7 @@ conda install -c conda-forge 'ffmpeg<7'
 安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
 ##### MacOS 用户
 ```bash
 brew install ffmpeg
 ```
@@ -111,7 +113,8 @@ brew install ffmpeg
 #### 安装依赖
 ```bash
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### 在 Docker 中使用
@@ -155,7 +158,6 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
 - 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文件中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
 4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
 5. 对于英语或日语 ASR(额外功能),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将其放置在 `tools/asr/models` 目录中。此外,[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。
@@ -202,6 +204,7 @@ python webui.py <language(optional)>
 ```bash
 python webui.py v1 <language(optional)>
 ```
 或者在 webUI 内动态切换
 ### 微调
@@ -226,11 +229,13 @@ python webui.py v1 <language(optional)>
 ```bash
 python GPT_SoVITS/inference_webui.py <language(optional)>
 ```
 或者
 ```bash
 python webui.py
 ```
 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
 ## V2 发布说明
@@ -245,7 +250,7 @@ python webui.py
 4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
-详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 从 v1 环境迁移至 v2
@@ -265,7 +270,7 @@ python webui.py
 2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
-详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 从 v2 环境迁移至 v3
@@ -277,7 +282,6 @@ python webui.py
 如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
 ## 待办事项清单
 - [x] **高优先级:**
@@ -299,16 +303,21 @@ python webui.py
 - [ ] 模型混合。
 ## (附加)命令行运行方式
 使用命令行打开 UVR5 的 WebUI
-````
+```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-````
+```
 <!-- 如果打不开浏览器,请按照下面的格式进行 UVR 处理,这是使用 mdxnet 进行音频处理的方式
 ````
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ```` -->
 这是使用命令行完成数据集的音频切分的方式
-````
+```
 python audio_slicer.py \
     --input_path "<path_to_original_audio_file_or_directory>" \
     --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@@ -316,17 +325,22 @@ python audio_slicer.py \
     --min_length <minimum_duration_of_each_subclip> \
     --min_interval <shortest_time_gap_between_adjacent_subclips> \
     --hop_size <step_size_for_computing_volume_curve>
-````
+```
 这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
-````
+```
 python tools/asr/funasr_asr.py -i <input> -o <output>
-````
+```
 通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
 (没有进度条,GPU 性能可能会导致时间延迟)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 启用自定义列表保存路径
 ## 致谢
@@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 特别感谢以下项目和贡献者:
 ### 理论研究
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### 预训练模型
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### 推理用文本前端
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI 工具
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)

View File

@@ -70,8 +70,8 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### 手動インストール
@@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
 ##### MacOS ユーザー
 ```bash
 brew install ffmpeg
 ```
@@ -104,7 +105,8 @@ brew install ffmpeg
 #### 依存関係をインストールします
 ```bash
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### Docker の使用
@@ -169,6 +171,7 @@ vocal_path|speaker_name|language|text
 ```
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
 ## 微調整と推論
 ### WebUI を開く
@@ -189,6 +192,7 @@ V1に切り替えたい場合は
 ```bash
 python webui.py v1 <言語(オプション)>
 ```
 または WebUI で手動でバージョンを切り替えてください。
 ### 微調整
@@ -213,11 +217,13 @@ python webui.py v1 <言語(オプション)>
 ```bash
 python GPT_SoVITS/inference_webui.py <言語(オプション)>
 ```
 または
 ```bash
 python webui.py
 ```
 その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
 ## V2 リリースノート
@@ -232,11 +238,11 @@ python webui.py
 4. 低品質の参照音声に対する合成品質の向上
-[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 V1 環境から V2 を使用するには:
-1. `pip install -r requirements.txt -c constraints.txt`を使用していくつかのパッケージを更新
+1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
 2. 最新のコードを github からクローン
@@ -252,11 +258,11 @@ V1環境からV2を使用するには:
 2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
-[詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 v2 環境から v3 を使用する方法:
-1. `pip install -r requirements.txt -c constraints.txt` を実行して、いくつかのパッケージを更新します。
+1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
 2. GitHub から最新のコードをクローンします。
@@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
 - [ ] モデルミックス
 ## (追加の) コマンドラインから実行する方法
 コマンド ラインを使用して UVR5 の WebUI を開きます
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
 ```
 python audio_slicer.py \
     --input_path "<path_to_original_audio_file_or_directory>" \
@@ -303,16 +314,21 @@ python audio_slicer.py \
     --min_interval <shortest_time_gap_between_adjacent_subclips> \
     --hop_size <step_size_for_computing_volume_curve>
 ```
 コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
 ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
 (進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 カスタムリストの保存パスが有効になっています
 ## クレジット
@@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 特に以下のプロジェクトと貢献者に感謝します:
 ### 理論研究
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### 事前学習モデル
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### 推論用テキストフロントエンド
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI ツール
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)

View File

@@ -70,8 +70,8 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### 수동 설치
@@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
 ##### MacOS 사용자
 ```bash
 brew install ffmpeg
 ```
@@ -106,7 +107,8 @@ brew install ffmpeg
 #### 의존성 설치
 ```bash
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### Docker에서 사용
@@ -195,6 +197,7 @@ V1으로 전환하려면,
 ```bash
 python webui.py v1 <언어(옵션)>
 ```
 또는 WebUI에서 수동으로 버전을 전환하십시오.
 ### 미세 조정
@@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
 ```bash
 python GPT_SoVITS/inference_webui.py <언어(옵션)>
 ```
 또는
 ```bash
 python webui.py
 ```
 그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
 ## V2 릴리스 노트
@@ -238,11 +243,11 @@ python webui.py
 4. 저품질 참조 오디오에 대한 합성 품질 향상
-[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 V1 환경에서 V2를 사용하려면:
-1. `pip install -r requirements.txt -c constraints.txt`를 사용하여 일부 패키지 업데이트
+1. `pip install -r requirements.txt`를 사용하여 일부 패키지 업데이트
 2. github에서 최신 코드를 클론하십시오.
@@ -258,11 +263,11 @@ V1 환경에서 V2를 사용하려면:
 2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
-[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 v2 환경에서 v3 사용하기:
-1. `pip install -r requirements.txt -c constraints.txt`로 일부 패키지를 업데이트합니다.
+1. `pip install -r requirements.txt`로 일부 패키지를 업데이트합니다.
 2. 최신 코드를 github 에서 클론합니다.
@@ -270,7 +275,6 @@ v2 환경에서 v3 사용하기:
 추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
 ## 할 일 목록
 - [x] **최우선순위:**
@@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
 - [ ] 모델 블렌딩.
 ## (추가적인) 명령줄에서 실행하는 방법
 명령줄을 사용하여 UVR5용 WebUI 열기
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
 ```
 python audio_slicer.py \
     --input_path "<path_to_original_audio_file_or_directory>" \
@@ -311,16 +320,21 @@ python audio_slicer.py \
     --min_interval <shortest_time_gap_between_adjacent_subclips> \
     --hop_size <step_size_for_computing_volume_curve>
 ```
 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
 ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
 (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 사용자 정의 목록 저장 경로가 활성화되었습니다.
 ## 감사의 말
@@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 다음 프로젝트와 기여자들에게 특별히 감사드립니다:
 ### 이론 연구
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### 사전 학습 모델
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### 추론용 텍스트 프론트엔드
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI 도구
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)

View File

@@ -72,8 +72,8 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### El ile Yükleme
@@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
 ##### MacOS Kullanıcıları
 ```bash
 brew install ffmpeg
 ```
@@ -106,7 +107,8 @@ brew install ffmpeg
 #### Bağımlılıkları Yükleme
 ```bash
-pip install -r requirements.txt -c constraints.txt
+pip install -r extra-req.txt --no-deps
+pip install -r requirements.txt
 ```
 ### Docker Kullanarak
@@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
 ```bash
 python webui.py v1 <dil(isteğe bağlı)>
 ```
 veya WebUI'de manuel olarak sürüm değiştirin.
 ### İnce Ayar
@@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
 ```bash
 python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
 ```
 VEYA
 ```bash
 python webui.py
 ```
 ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
 ## V2 Sürüm Notları
@@ -235,11 +240,11 @@ Yeni Özellikler:
 4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
-[detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 V1 ortamından V2'yi kullanmak için:
-1. `pip install -r requirements.txt -c constraints.txt` ile bazı paketleri güncelleyin
+1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin
 2. github'dan en son kodları klonlayın.
@@ -255,11 +260,11 @@ V1 ortamından V2'yi kullanmak için:
 2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
-[daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 ### v2 ortamında v3 kullanımı:
-1. `pip install -r requirements.txt -c constraints.txt` ile bazı paketleri güncelleyin.
+1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin.
 2. GitHub'dan en son kodları klonlayın.
@@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
 - [ ] model karışımı
 ## (Ekstra) Komut satırından çalıştırma yöntemi
 UVR5 için Web Arayüzünü açmak için komut satırını kullanın
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin; bu, ses işleme için mdxnet kullanır
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
 ```
 python audio_slicer.py \
     --input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
@@ -306,16 +316,21 @@ python audio_slicer.py \
     --min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı> \
     --hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
 ```
 Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
 ```
 python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
 ```
 ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
 (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 ```
 Özel bir liste kaydetme yolu etkinleştirildi
 ## Katkı Verenler
@@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
 ### Teorik Araştırma
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### Önceden Eğitilmiş Modeller
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### Tahmin İçin Metin Ön Ucu
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI Araçları
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)

View File

@@ -27,7 +27,8 @@
         "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
         "%cd GPT-SoVITS\n",
         "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
-        "!pip install -r requirements.txt"
+        "!pip install -r requirements.txt\n",
+        "!pip install -r extra-req.txt --no-deps"
       ]
     },
     {