diff --git a/Docker/damo.sha256 b/Docker/damo.sha256 new file mode 100644 index 0000000..6e9804d --- /dev/null +++ b/Docker/damo.sha256 @@ -0,0 +1,3 @@ +5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb +b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb +a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb \ No newline at end of file diff --git a/Docker/download.sh b/Docker/download.sh new file mode 100644 index 0000000..447e018 --- /dev/null +++ b/Docker/download.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +echo "Downloading models..." + +aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue + +echo "Checking SHA256..." 
+ +parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c" diff --git a/Docker/links.sha256 b/Docker/links.sha256 new file mode 100644 index 0000000..cda6dc1 --- /dev/null +++ b/Docker/links.sha256 @@ -0,0 +1,12 @@ +b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt +fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth +020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth +24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin +e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin +39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth +45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth +5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth +8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth +01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth +56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth +233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx \ No newline at end of file diff --git a/Docker/links.txt b/Docker/links.txt new file mode 100644 index 0000000..e6603db --- /dev/null +++ b/Docker/links.txt @@ -0,0 +1,34 @@ +# GPT-SoVITS models 
+https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt + out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth + out=GPT_SoVITS/pretrained_models/s2D488k.pth +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth + out=GPT_SoVITS/pretrained_models/s2G488k.pth +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json + out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json + out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin + out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json + out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin + out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin +https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json + out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json +# UVR5 +https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth + out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth +https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth + out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth +https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth + out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth 
+https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth + out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth +https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth + out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth +https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth + out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth +https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx + out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cbf92cb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# Base CUDA image +FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 + +LABEL maintainer="breakstring@hotmail.com" +LABEL version="dev-20240123.03" +LABEL description="Docker image for GPT-SoVITS" + + +# Install 3rd party apps +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +RUN apt-get update && \ + apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ + rm -rf /var/lib/apt/lists/* && \ + git lfs install + +# Copy application +WORKDIR /workspace +COPY . 
/workspace + +# Download models +RUN chmod +x /workspace/Docker/download.sh && /workspace/Docker/download.sh + +# 本应该从 requirements.txt 里面安装package,但是由于funasr和modelscope的问题,暂时先在后面手工安装依赖包吧 +RUN pip install --no-cache-dir torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet transformers jieba psutil PyYAML +# 这里强制指定了modelscope和funasr的版本,后面damo_asr的模型让它们自己下载 +RUN pip install --no-cache-dir modelscope~=1.10.0 torchaudio sentencepiece funasr~=0.8.7 + +# 先屏蔽掉,让容器里自己下载 +# Clone damo_asr +#WORKDIR /workspace/tools/damo_asr/models +#RUN git clone --depth 1 https://www.modelscope.cn/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch && \ +# (cd speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch && git lfs pull) +#RUN git clone --depth 1 https://www.modelscope.cn/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch.git speech_fsmn_vad_zh-cn-16k-common-pytorch && \ +# (cd speech_fsmn_vad_zh-cn-16k-common-pytorch && git lfs pull) +#RUN git clone --depth 1 https://www.modelscope.cn/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git punc_ct-transformer_zh-cn-common-vocab272727-pytorch && \ +# (cd punc_ct-transformer_zh-cn-common-vocab272727-pytorch && git lfs pull) + +#RUN parallel --will-cite -a /workspace/Docker/damo.sha256 "echo -n {} | sha256sum -c" + +#WORKDIR /workspace + +EXPOSE 9870 +EXPOSE 9871 +EXPOSE 9872 +EXPOSE 9873 +EXPOSE 9874 + +VOLUME /workspace/output +VOLUME /workspace/logs +VOLUME /workspace/SoVITS_weights + +CMD ["python", "webui.py"] \ No newline at end of file diff --git a/README.md b/README.md index 7649d7b..59089ea 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,31 @@ For Chinese ASR (additionally), download models from [Damo ASR Model](https://mo For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), 
download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. +### Using Docker + +#### docker-compose.yaml configuration + +1. Environment Variables: + - is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation. + +2. Volumes Configuration: The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content. +3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation. +4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances. + + +#### Running with docker compose +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### Running with docker command + +As above, modify the corresponding parameters based on your actual situation, then run the following command: +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03 +``` + + ## Dataset Format The TTS annotation .list file format: diff --git a/config.py b/config.py index ec846b3..75db9bc 100644 --- a/config.py +++ b/config.py @@ -1,10 +1,10 @@ -import sys +import sys,os # 推理用的指定模型 sovits_path = "" gpt_path = "" -is_half = True +is_half = eval(os.environ.get("is_half","True")) is_share=False cnhubert_path = 
"GPT_SoVITS/pretrained_models/chinese-hubert-base" diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..a772c82 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,31 @@ +version: '3.8' + +services: + gpt-sovits: + image: breakstring/gpt-sovits:dev-20240123.03 + container_name: gpt-sovits-container + environment: + - is_half=False + volumes: + - G:/GPT-SoVITS-DockerTest/output:/workspace/output + - G:/GPT-SoVITS-DockerTest/logs:/workspace/logs + - G:/GPT-SoVITS-DockerTest/SoVITS_weights:/workspace/SoVITS_weights + - G:/GPT-SoVITS-DockerTest/reference:/workspace/reference + working_dir: /workspace + ports: + - "9870:9870" + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + shm_size: 16G + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] + stdin_open: true + tty: true + restart: unless-stopped diff --git a/docs/cn/README.md b/docs/cn/README.md index 27c5668..072dc0d 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -87,6 +87,32 @@ brew install ffmpeg 下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。 +### 在 Docker 中使用 + +#### docker-compose.yaml 设置 + +1. 环境变量: + - is_half: 半精度/双精度控制。在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时,一般都是它引起的,可以根据实际情况来调整为True或者False。 + +2. Volume设置,容器内的应用根目录设置为 /workspace。 默认的 docker-compose.yaml 中列出了一些实际的例子,便于上传/下载内容。 +3. shm_size:Windows下的Docker Desktop默认可用内存过小,会导致运行异常,根据自己情况酌情设置。 +4. 
deploy小节下的gpu相关内容,请根据您的系统和实际情况酌情设置。 + + + +#### 通过 docker compose运行 +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### 通过 docker 命令运行 + +同上,根据您自己的实际情况修改对应的参数,然后运行如下命令: +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03 +``` + + ### 预训练模型 diff --git a/docs/ja/README.md b/docs/ja/README.md index d0987a8..cadb68b 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -93,6 +93,30 @@ brew install ffmpeg [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートディレクトリに置きます。 +### Dockerの使用 + +#### docker-compose.yamlの設定 + +1. 環境変数: + - `is_half`:半精度/倍精度の制御。"SSL抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です。実際の状況に応じてTrueまたはFalseに調整してください。 + +2. ボリューム設定:コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます。デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています。 +3. `shm_size`:WindowsのDocker Desktopのデフォルトの利用可能メモリが小さすぎるため、異常な動作を引き起こす可能性があります。状況に応じて適宜設定してください。 +4. 
`deploy`セクションのGPUに関連する内容は、システムと実際の状況に応じて慎重に設定してください。 + +#### docker composeで実行する +``` +docker compose -f "docker-compose.yaml" up -d +``` + +#### dockerコマンドで実行する + +上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します: +``` +docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03 +``` + + ### 事前訓練済みモデル