cleanup for package

Jarod Mica 2024-11-16 02:56:29 -08:00
parent 71b2fe69ea
commit 9db1d9fa43
15 changed files with 144 additions and 1968 deletions


@@ -1,8 +0,0 @@
docs
logs
output
reference
SoVITS_weights
GPT_weights
TEMP
.git


@@ -1,42 +0,0 @@
# Base CUDA image
FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
LABEL maintainer="breakstring@hotmail.com"
LABEL version="dev-20240209"
LABEL description="Docker image for GPT-SoVITS"
# Install 3rd party apps
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get update && \
apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
git lfs install && \
rm -rf /var/lib/apt/lists/*
# Copy only requirements.txt initially to leverage Docker cache
WORKDIR /workspace
COPY requirements.txt /workspace/
RUN pip install --no-cache-dir -r requirements.txt
# Define a build-time argument for image type
ARG IMAGE_TYPE=full
# Conditional logic based on the IMAGE_TYPE argument
# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite"
COPY ./Docker /workspace/Docker
# The elite image type does not include the extra models
RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
chmod +x /workspace/Docker/download.sh && \
/workspace/Docker/download.sh && \
python /workspace/Docker/download.py && \
python -m nltk.downloader averaged_perceptron_tagger cmudict; \
fi
# Copy the rest of the application
COPY . /workspace
EXPOSE 9871 9872 9873 9874 9880
CMD ["python", "webui.py"]


@@ -1,152 +0,0 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
],
"metadata": {
"id": "himHYZmra7ix"
}
},
{
"cell_type": "code",
"metadata": {
"id": "e9b7iFV3dm1f"
},
"source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n",
"!mkdir -p /content/GPT-SoVITS/tools/uvr5\n",
"%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
"%cd /content/GPT-SoVITS/tools/damo_asr/models\n",
"!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
"# @title UVR5 pretrains 安装uvr5模型\n",
"%cd /content/GPT-SoVITS/tools/uvr5\n",
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Create folder models 创建文件夹模型\n",
"import os\n",
"base_directory = \"/content/GPT-SoVITS\"\n",
"folder_names = [\"SoVITS_weights\", \"GPT_weights\"]\n",
"\n",
"for folder_name in folder_names:\n",
" if os.path.exists(os.path.join(base_directory, folder_name)):\n",
" print(f\"The folder '{folder_name}' already exists. (文件夹'{folder_name}'已经存在。)\")\n",
" else:\n",
" os.makedirs(os.path.join(base_directory, folder_name))\n",
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
"\n",
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
],
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import requests\n",
"import zipfile\n",
"import shutil\n",
"import os\n",
"\n",
"#@title Import model 导入模型 (HuggingFace)\n",
"hf_link = 'https://huggingface.co/modelloosrvcc/Nagisa_Shingetsu_GPT-SoVITS/resolve/main/Nagisa.zip' #@param {type: \"string\"}\n",
"\n",
"output_path = '/content/'\n",
"\n",
"response = requests.get(hf_link)\n",
"with open(output_path + 'file.zip', 'wb') as file:\n",
" file.write(response.content)\n",
"\n",
"with zipfile.ZipFile(output_path + 'file.zip', 'r') as zip_ref:\n",
" zip_ref.extractall(output_path)\n",
"\n",
"os.remove(output_path + \"file.zip\")\n",
"\n",
"source_directory = output_path\n",
"SoVITS_destination_directory = '/content/GPT-SoVITS/SoVITS_weights'\n",
"GPT_destination_directory = '/content/GPT-SoVITS/GPT_weights'\n",
"\n",
"for filename in os.listdir(source_directory):\n",
" if filename.endswith(\".pth\"):\n",
" source_path = os.path.join(source_directory, filename)\n",
" destination_path = os.path.join(SoVITS_destination_directory, filename)\n",
" shutil.move(source_path, destination_path)\n",
"\n",
"for filename in os.listdir(source_directory):\n",
" if filename.endswith(\".ckpt\"):\n",
" source_path = os.path.join(source_directory, filename)\n",
" destination_path = os.path.join(GPT_destination_directory, filename)\n",
" shutil.move(source_path, destination_path)\n",
"\n",
"print(f'Model downloaded. (模型已下载。)')"
],
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7",
"cellView": "form"
},
"execution_count": null,
"outputs": []
}
]
}

README.md (347 changed lines)

@@ -1,153 +1,25 @@
# Jarod's NOTE
Working on turning this into a package. Right now, the API *does in fact* work for making requests, and the package can be installed. Will get instructions out sometime in the future.
Working on turning this into a package. Right now, the API *does in fact* work for making requests, and the package can be installed.
<div align="center">
## Quick Install and Usage
Ideally, do this all inside of a venv for package isolation
1. Install by doing:
```
pip install git+https://github.com/JarodMica/GPT-SoVITS.git
```
2. Make sure torch is installed with CUDA enabled. I recommend running `pip uninstall torch` to remove any existing torch, then reinstalling with the following. I chose 2.4.0+cu121:
```
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
```
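To verify the CUDA build took, a quick sanity check (run inside the same venv) should report a `+cu121`-style version and print `True`:
```python
import torch

# Sanity check: a CUDA-enabled build reports a "+cuXXX" version suffix
# and can see the GPU.
print(torch.__version__)          # e.g. "2.4.0+cu121"
print(torch.cuda.is_available())  # should print True on a CUDA machine
```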
<h1>GPT-SoVITS-WebUI</h1>
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
</div>
---
## Features:
1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
2. **Few-shot TTS:** Fine-tune the model with just 1 minute of training data for improved voice similarity and realism.
3. **Cross-lingual Support:** Inference in languages different from the training dataset, currently supporting English, Japanese, Korean, Cantonese and Chinese.
4. **WebUI Tools:** Integrated tools include voice accompaniment separation, automatic training set segmentation, Chinese ASR, and text labeling, assisting beginners in creating training datasets and GPT/SoVITS models.
**Check out our [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw) here!**
Unseen speakers few-shot fine-tuning demo:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
## Installation
Users in China can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
### Tested Environments
- Python 3.9, PyTorch 2.0.1, CUDA 11
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
- Python 3.9, PyTorch 2.2.2, CPU devices
_Note: numba==0.56.4 requires py<3.11_
### Windows
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
### Linux
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
bash install.sh
```
So far I've only tested this through api_v2.py. Given that the install above went fine, you should now be able to run `gpt_sovits_api`.
### macOS
**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.**
1. Install Xcode command-line tools by running `xcode-select --install`.
2. Install FFmpeg by running `brew install ffmpeg`.
3. Install the program by running the following commands:
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r requirements.txt
```
### Install Manually
#### Install FFmpeg
##### Conda Users
```bash
conda install ffmpeg
```
##### Ubuntu/Debian Users
```bash
sudo apt install ffmpeg
sudo apt install libsox-dev
conda install -c conda-forge 'ffmpeg<7'
```
##### Windows Users
Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root.
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
##### MacOS Users
```bash
brew install ffmpeg
```
#### Install Dependencies
```bash
pip install -r requirements.txt
```
### Using Docker
#### docker-compose.yaml configuration
0. Regarding image tags: due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) for the latest packaged images and pick one that suits your situation, or build locally from the Dockerfile to fit your own needs.
1. Environment Variables
- is_half: Controls half-precision (fp16) vs. full-precision (fp32). This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Set it to True or False based on your actual situation.
2. Volumes configuration: The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
3. shm_size: The default shared memory available to Docker Desktop on Windows is too small, which can cause abnormal operation. Adjust it according to your own situation.
4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances.
#### Running with docker compose
```
docker compose -f "docker-compose.yaml" up -d
```
#### Running with docker command
As above, modify the corresponding parameters based on your actual situation, then run the following command:
```
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
```

```
gpt_sovits_api
```
This will boot up a local server that you can make requests to. Check out `test.py` and `test_streaming.py` to get an idea of how you might use the API.
## Pretrained Models
**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
You probably don't need to follow the instructions below; they're just kept here for reference for now.
1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
@@ -159,193 +31,8 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have a similar effect with a smaller disk footprint.
## Dataset Format
The TTS annotation .list file format:
```
vocal_path|speaker_name|language|text
```
Language dictionary:
- 'zh': Chinese
- 'ja': Japanese
- 'en': English
- 'ko': Korean
- 'yue': Cantonese
Example:
```
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
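For reference, here is a minimal sketch of reading this annotation format in Python (the filename `train.list` is just a hypothetical example):
```python
# Parse a TTS annotation .list file: each line is
# vocal_path|speaker_name|language|text, as described above.
with open("train.list", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        vocal_path, speaker_name, language, text = line.split("|", 3)
        print(vocal_path, speaker_name, language, text)
```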
## Finetune and inference
### Open WebUI
#### Integrated Package Users
Double-click `go-webui.bat` or use `go-webui.ps1`
If you want to switch to V1, then double-click `go-webui-v1.bat` or use `go-webui-v1.ps1`
#### Others
```bash
python webui.py <language(optional)>
```
If you want to switch to V1, then
```bash
python webui.py v1 <language(optional)>
```
Or manually switch the version in the WebUI
### Finetune
#### Path Auto-filling is now supported
1. Fill in the audio path
2. Slice the audio into small chunks
3. Denoise (optional)
4. ASR
5. Proofread the ASR transcriptions
6. Go to the next tab, then fine-tune the model
### Open Inference WebUI
#### Integrated Package Users
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1`, then open the inference WebUI at `1-GPT-SoVITS-TTS/1C-inference`
#### Others
```bash
python GPT_SoVITS/inference_webui.py <language(optional)>
```
OR
```bash
python webui.py
```
then open the inference WebUI at `1-GPT-SoVITS-TTS/1C-inference`
## V2 Release Notes
New Features:
1. Support Korean and Cantonese
2. An optimized text frontend
3. Pre-trained model extended from 2k hours to 5k hours
4. Improved synthesis quality for low-quality reference audio
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7) )
Use v2 from v1 environment:
1. `pip install -r requirements.txt` to update some packages
2. Clone the latest code from GitHub.
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
Chinese v2 additional: download the G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip the archive, rename the folder to `G2PWModel`, and place it in `GPT_SoVITS/text`.
## Todo List
- [x] **High Priority:**
   - [x] Localization in Japanese and English.
   - [x] User guide.
   - [x] Japanese and English dataset fine-tune training.
- [ ] **Features:**
   - [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
   - [x] TTS speaking speed control.
   - [ ] ~~Enhanced TTS emotion control.~~
   - [ ] Experiment with changing SoVITS token inputs to a probability distribution of GPT vocabs (transformer latent).
   - [x] Improve English and Japanese text frontend.
   - [ ] Develop tiny and larger-sized TTS models.
   - [x] Colab scripts.
   - [ ] Try expanding the training dataset (2k hours -> 10k hours).
   - [x] Better SoVITS base model (enhanced audio quality).
   - [ ] Model mixing.
## (Additional) Method for running from the command line
Use the command line to open the WebUI for UVR5
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- If you can't open a browser, follow the format below for UVR processing. This uses mdxnet for audio processing
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
This is how the audio segmentation of the dataset is done using the command line
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
--threshold <volume_threshold> \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips> \
--hop_size <step_size_for_computing_volume_curve>
```
This is how dataset ASR processing is done using the command line (Chinese only)
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR processing is performed through Faster_Whisper (ASR transcription for languages other than Chinese)
(No progress bars; GPU performance may cause time delays)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
A custom list save path is supported
## Credits
Special thanks to the following projects and contributors:
Special thanks to RVC-Boss for getting this wonderful tool up and running, as well as all of the other projects used to build it:
### Theoretical Research
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
### Pretrained Models
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [LangSegment](https://github.com/juntaosun/LangSegment)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Tools
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)
- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
- [gradio](https://github.com/gradio-app/gradio)
- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
Thanks to @Naozumi520 for providing the Cantonese training set and for guidance on Cantonese-related knowledge.
## Thanks to all contributors for their efforts
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
</a>
**Original Repo:** https://github.com/RVC-Boss/GPT-SoVITS


@@ -1,97 +0,0 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"环境配置 environment"
],
"metadata": {
"id": "_o6a8GS2lWQM"
}
},
{
"cell_type": "code",
"metadata": {
"id": "e9b7iFV3dm1f"
},
"source": [
"!pip install -q condacolab\n",
"# Setting up condacolab and installing packages\n",
"import condacolab\n",
"condacolab.install_from_url(\"https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh\")\n",
"%cd -q /content\n",
"!git clone https://github.com/RVC-Boss/GPT-SoVITS\n",
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
"%cd -q /content/GPT-SoVITS\n",
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
"!/usr/local/bin/pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n",
"!mkdir -p /content/GPT-SoVITS/tools/uvr5\n",
"%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
"%cd /content/GPT-SoVITS/tools/damo_asr/models\n",
"!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
"# @title UVR5 pretrains 安装uvr5模型\n",
"%cd /content/GPT-SoVITS/tools/uvr5\n",
"%rm -r uvr5_weights\n",
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7"
},
"execution_count": null,
"outputs": []
}
]
}


@@ -1,32 +0,0 @@
version: '3.8'
services:
  gpt-sovits:
    image: breakstring/gpt-sovits:latest # please change the image name and tag based on your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container.
    container_name: gpt-sovits-container
    environment:
      - is_half=False
      - is_share=False
    volumes:
      - ./output:/workspace/output
      - ./logs:/workspace/logs
      - ./SoVITS_weights:/workspace/SoVITS_weights
      - ./reference:/workspace/reference
    working_dir: /workspace
    ports:
      - "9880:9880"
      - "9871:9871"
      - "9872:9872"
      - "9873:9873"
      - "9874:9874"
    shm_size: 16G
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: "all"
              capabilities: [gpu]
    stdin_open: true
    tty: true
    restart: unless-stopped


@@ -1,21 +0,0 @@
#!/bin/bash
# Get the current date in YYYYMMDD format
DATE=$(date +%Y%m%d)
# Get the first 7 characters of the latest Git commit hash
COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7)
# Build the full image
docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest .
# Tag the same image with the date
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE
# Tag the same image with the current repository commit hash
docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH
# Build the elite image (no model download step; models must be downloaded manually and installed into the container)
docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite .
# Tag the same image with the date
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite
# Tag the same image with the current repository commit hash
docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite

View File

@@ -1,2 +0,0 @@
runtime\python.exe webui.py zh_CN
pause


@@ -1,4 +0,0 @@
$ErrorActionPreference = "SilentlyContinue"
chcp 65001
& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py zh_CN"
pause


@@ -1,218 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "45857cb2",
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
"execution": {
"iopub.execute_input": "2024-02-18T14:43:46.735480Z",
"iopub.status.busy": "2024-02-18T14:43:46.735183Z",
"iopub.status.idle": "2024-02-18T14:48:10.724175Z",
"shell.execute_reply": "2024-02-18T14:48:10.723059Z"
},
"papermill": {
"duration": 263.994935,
"end_time": "2024-02-18T14:48:10.726613",
"exception": false,
"start_time": "2024-02-18T14:43:46.731678",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9d346b4",
"metadata": {
"execution": {
"iopub.execute_input": "2024-02-18T14:48:10.815802Z",
"iopub.status.busy": "2024-02-18T14:48:10.814899Z",
"iopub.status.idle": "2024-02-18T14:50:31.253276Z",
"shell.execute_reply": "2024-02-18T14:50:31.252024Z"
},
"papermill": {
"duration": 140.484893,
"end_time": "2024-02-18T14:50:31.255720",
"exception": false,
"start_time": "2024-02-18T14:48:10.770827",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!mkdir -p /kaggle/working/GPT-SoVITS/tools/asr/models\n",
"!mkdir -p /kaggle/working/GPT-SoVITS/tools/uvr5\n",
"%cd /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
"!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
"%cd /kaggle/working/GPT-SoVITS/tools/asr/models\n",
"!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
"# # @title UVR5 pretrains 安装uvr5模型\n",
"%cd /kaggle/working/GPT-SoVITS/tools/uvr5\n",
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /kaggle/working/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea94d245",
"metadata": {
"execution": {
"iopub.execute_input": "2024-02-18T14:29:01.071549Z",
"iopub.status.busy": "2024-02-18T14:29:01.070592Z",
"iopub.status.idle": "2024-02-18T14:40:45.318368Z",
"shell.execute_reply": "2024-02-18T14:40:45.317130Z",
"shell.execute_reply.started": "2024-02-18T14:29:01.071512Z"
},
"papermill": {
"duration": null,
"end_time": null,
"exception": false,
"start_time": "2024-02-18T14:50:31.309013",
"status": "running"
},
"tags": []
},
"outputs": [],
"source": [
"# @title launch WebUI 启动WebUI\n",
"%cd /kaggle/working/GPT-SoVITS/\n",
"!npm install -g localtunnel\n",
"import subprocess\n",
"import threading\n",
"import time\n",
"import socket\n",
"import urllib.request\n",
"def iframe_thread(port):\n",
" while True:\n",
" time.sleep(0.5)\n",
" sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
" result = sock.connect_ex(('127.0.0.1', port))\n",
" if result == 0:\n",
" break\n",
" sock.close()\n",
"\n",
" from colorama import Fore, Style\n",
" print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. RESET_ALL)\n",
" p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n",
" for line in p.stdout:\n",
" print(line.decode(), end='')\n",
"threading.Thread (target=iframe_thread, daemon=True, args=(9874,)).start()\n",
"\n",
"!python webui.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dda88a6d",
"metadata": {
"execution": {
"iopub.execute_input": "2024-02-18T14:40:56.880608Z",
"iopub.status.busy": "2024-02-18T14:40:56.879879Z"
},
"papermill": {
"duration": null,
"end_time": null,
"exception": null,
"start_time": null,
"status": "pending"
},
"tags": []
},
"outputs": [],
"source": [
"# 开启推理页面\n",
"%cd /kaggle/working/GPT-SoVITS/\n",
"!npm install -g localtunnel\n",
"import subprocess\n",
"import threading\n",
"import time\n",
"import socket\n",
"import urllib.request\n",
"def iframe_thread(port):\n",
" while True:\n",
" time.sleep(0.5)\n",
" sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
" result = sock.connect_ex(('127.0.0.1', port))\n",
" if result == 0:\n",
" break\n",
" sock.close()\n",
"\n",
" from colorama import Fore, Style\n",
" print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. RESET_ALL)\n",
" p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n",
" for line in p.stdout:\n",
" print(line.decode(), end='')\n",
"threading.Thread (target=iframe_thread, daemon=True, args=(9872,)).start()\n",
"\n",
"!python ./GPT_SoVITS/inference_webui.py"
]
}
],
"metadata": {
"kaggle": {
"accelerator": "nvidiaTeslaT4",
"dataSources": [
{
"datasetId": 4459328,
"sourceId": 7649639,
"sourceType": "datasetVersion"
}
],
"dockerImageVersionId": 30646,
"isGpuEnabled": true,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
},
"papermill": {
"default_parameters": {},
"duration": null,
"end_time": null,
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2024-02-18T14:43:44.011910",
"version": "2.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -1,6 +0,0 @@
#!/bin/bash
conda install -c conda-forge gcc
conda install -c conda-forge gxx
conda install ffmpeg cmake
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt

View File

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "GPT_SoVITS"
dynamic = ["version"]
description = ""
description = "A packaged version of GPT_SoVITS designed to be installable"
readme = "README.md"
license = {text = "MIT License"}
classifiers = [
@@ -50,11 +50,12 @@ dependencies = [
"opencc==1.1.1; sys_platform == 'linux'",
"python_mecab_ko; sys_platform != 'win32'",
"fastapi<0.112.2",
"sounddevice"
]
[project.urls]
Homepage = "https://github.com/RVC-Boss/GPT-SoVITS"
Homepage = "https://github.com/JarodMica/GPT-SoVITS/tree/main"
[project.scripts]
"gpt_sovits_api" = "GPT_SoVITS.api_v2:main"

test.py (new file, 35 lines)

@@ -0,0 +1,35 @@
import requests

# API configuration
host = '127.0.0.1'
port = 9880
url = f'http://{host}:{port}/tts'

# Parameters for the request
params = {
    'text': 'ははは、それはいいよね!でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか',
    'text_lang': 'ja',
    'ref_audio_path': 'test.wav',
    'prompt_lang': 'ja',
    'prompt_text': 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか',
    'text_split_method': 'cut0',
    'batch_size': 1,
    'media_type': 'wav',
    'streaming_mode': False,
}

try:
    # Send the GET request
    response = requests.get(url, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        # Save the audio content to a file
        with open('output.wav', 'wb') as f:
            f.write(response.content)
        print('Audio saved to output.wav')
    else:
        print(f'Error: {response.status_code}')
        print(response.json())
except requests.exceptions.RequestException as e:
    print(f'An error occurred: {e}')

test_streaming.py (new file, 89 lines)

@@ -0,0 +1,89 @@
import requests
import wave
import io
import sounddevice as sd

# API configuration
host = '127.0.0.1'
port = 9880
url = f'http://{host}:{port}/tts'

# Parameters for the request
params = {
    'text': 'ふふふ、それはいいよね!でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか',
    'text_lang': 'ja',
    'ref_audio_path': 'test.wav',
    'prompt_lang': 'ja',
    'prompt_text': 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか',
    'text_split_method': 'cut0',
    'batch_size': 1,
    'media_type': 'wav',
    'streaming_mode': True,
}

while True:
    input("Waiting for enter")
    try:
        # Send the GET request with streaming enabled
        response = requests.get(url, params=params, stream=True)

        # Check if the request was successful
        if response.status_code == 200:
            buffer = b''  # Buffer to hold data until the header is processed
            header_size = 44  # Standard WAV header size
            header_parsed = False
            stream = None

            for chunk in response.iter_content(chunk_size=4096):
                if chunk:
                    if not header_parsed:
                        buffer += chunk
                        if len(buffer) >= header_size:
                            # Parse the WAV header
                            wav_header = buffer[:header_size]
                            wav_file = wave.open(io.BytesIO(wav_header), 'rb')
                            channels = wav_file.getnchannels()
                            sample_width = wav_file.getsampwidth()
                            sample_rate = wav_file.getframerate()
                            wav_file.close()

                            # Map sample_width to dtype
                            dtype_map = {1: 'int8', 2: 'int16', 3: 'int24', 4: 'int32'}
                            dtype = dtype_map.get(sample_width, 'int16')

                            # Open the audio output stream
                            stream = sd.RawOutputStream(
                                samplerate=sample_rate,
                                channels=channels,
                                dtype=dtype,
                                blocksize=0,  # Use default block size
                            )
                            stream.start()

                            # Write any remaining data after the header
                            data = buffer[header_size:]
                            if data:
                                stream.write(data)
                            header_parsed = True
                            buffer = b''  # Clear the buffer
                    else:
                        # Play audio data
                        if stream:
                            stream.write(chunk)

            # Clean up
            if stream:
                stream.stop()
                stream.close()
            print('Audio playback completed.')
        else:
            print(f'Error: {response.status_code}')
            # Print the error message from the API
            try:
                print(response.json())
            except ValueError:
                print(response.text)
    except requests.exceptions.RequestException as e:
        print(f'An error occurred: {e}')
    except Exception as e:
        print(f'An error occurred: {e}')

webui.py (1054 changed lines)

File diff suppressed because it is too large.