Merge branch 'RVC-Boss:main' into main

zZxztxZz committed 2024-01-23 21:51:22 +08:00 (via GitHub)
commit 5eddbdd33a
19 changed files with 343 additions and 61 deletions

Docker/damo.sha256 (new file, +3)

@@ -0,0 +1,3 @@
5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb

Docker/download.sh (new file, +11)

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -Eeuo pipefail
echo "Downloading models..."
aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue
echo "Checking SHA256..."
parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
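Each line of links.sha256 is fed to `sha256sum -c` as its own one-line manifest, so the files are verified in parallel and any mismatch aborts the script (via `set -Eeuo pipefail`). For reference, a minimal Python sketch of the same check (illustrative only, not part of the image):

```python
import hashlib

def verify_manifest(manifest_path: str) -> bool:
    """Verify '<sha256>  <path>' lines the way `sha256sum -c` does."""
    ok = True
    with open(manifest_path, encoding="utf8") as manifest:
        for line in manifest:
            if not line.strip():
                continue
            digest, path = line.split(maxsplit=1)
            h = hashlib.sha256()
            with open(path.strip(), "rb") as blob:
                # Hash in 1 MiB chunks to keep memory flat for large models
                for chunk in iter(lambda: blob.read(1 << 20), b""):
                    h.update(chunk)
            if h.hexdigest() != digest:
                print(f"{path.strip()}: FAILED")
                ok = False
    return ok
```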

Docker/links.sha256 (new file, +12)

@@ -0,0 +1,12 @@
b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx

Docker/links.txt (new file, +34)

@@ -0,0 +1,34 @@
# GPT-SoVITS models
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
  out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
  out=GPT_SoVITS/pretrained_models/s2D488k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
  out=GPT_SoVITS/pretrained_models/s2G488k.pth
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
  out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
  out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
  out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
  out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
  out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
  out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
# UVR5
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
  out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
  out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
  out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
  out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
  out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
  out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
  out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx

Dockerfile (new file, +53)

@@ -0,0 +1,53 @@
# Base CUDA image
FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
LABEL maintainer="breakstring@hotmail.com"
LABEL version="dev-20240123.03"
LABEL description="Docker image for GPT-SoVITS"
# Install 3rd party apps
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get update && \
    apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
    rm -rf /var/lib/apt/lists/* && \
    git lfs install
# Copy application
WORKDIR /workspace
COPY . /workspace
# Download models
RUN chmod +x /workspace/Docker/download.sh && /workspace/Docker/download.sh
# Packages should come from requirements.txt, but due to issues with funasr and modelscope, install the dependencies manually below for now
RUN pip install --no-cache-dir torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet transformers jieba psutil PyYAML
# Pin the modelscope and funasr versions here; the damo_asr models are left for them to download on their own
RUN pip install --no-cache-dir modelscope~=1.10.0 torchaudio sentencepiece funasr~=0.8.7
# Disabled for now; let the container download these itself
# Clone damo_asr
#WORKDIR /workspace/tools/damo_asr/models
#RUN git clone --depth 1 https://www.modelscope.cn/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch && \
# (cd speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch && git lfs pull)
#RUN git clone --depth 1 https://www.modelscope.cn/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch.git speech_fsmn_vad_zh-cn-16k-common-pytorch && \
# (cd speech_fsmn_vad_zh-cn-16k-common-pytorch && git lfs pull)
#RUN git clone --depth 1 https://www.modelscope.cn/iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git punc_ct-transformer_zh-cn-common-vocab272727-pytorch && \
# (cd punc_ct-transformer_zh-cn-common-vocab272727-pytorch && git lfs pull)
#RUN parallel --will-cite -a /workspace/Docker/damo.sha256 "echo -n {} | sha256sum -c"
#WORKDIR /workspace
EXPOSE 9870
EXPOSE 9871
EXPOSE 9872
EXPOSE 9873
EXPOSE 9874
VOLUME /workspace/output
VOLUME /workspace/logs
VOLUME /workspace/SoVITS_weights
CMD ["python", "webui.py"]

(modified file)

@@ -1,4 +1,5 @@
-import os
+import os,re
import pdb

gpt_path = os.environ.get(
    "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
@@ -42,8 +43,6 @@ if is_half == True:
else:
    bert_model = bert_model.to(device)

-# bert_model=bert_model.to(device)
def get_bert_feature(text, word2ph):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
@@ -57,15 +56,8 @@ def get_bert_feature(text, word2ph):
        repeat_feature = res[i].repeat(word2ph[i], 1)
        phone_level_feature.append(repeat_feature)
    phone_level_feature = torch.cat(phone_level_feature, dim=0)
-    # if(is_half==True):phone_level_feature=phone_level_feature.half()
    return phone_level_feature.T

-n_semantic = 1024
-dict_s2=torch.load(sovits_path,map_location="cpu")
-hps=dict_s2["config"]
class DictToAttrRecursive(dict):
    def __init__(self, input_dict):
        super().__init__(input_dict)
@@ -94,40 +86,48 @@ class DictToAttrRecursive(dict):
            raise AttributeError(f"Attribute {item} not found")
-hps = DictToAttrRecursive(hps)
-hps.model.semantic_frame_rate = "25hz"
-dict_s1 = torch.load(gpt_path, map_location="cpu")
-config = dict_s1["config"]
ssl_model = cnhubert.get_model()
if is_half == True:
    ssl_model = ssl_model.half().to(device)
else:
    ssl_model = ssl_model.to(device)

-vq_model = SynthesizerTrn(
-    hps.data.filter_length // 2 + 1,
-    hps.train.segment_size // hps.data.hop_length,
-    n_speakers=hps.data.n_speakers,
-    **hps.model
-)
-if is_half == True:
-    vq_model = vq_model.half().to(device)
-else:
-    vq_model = vq_model.to(device)
-vq_model.eval()
-print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
-hz = 50
-max_sec = config["data"]["max_sec"]
-t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
-t2s_model.load_state_dict(dict_s1["weight"])
-if is_half == True:
-    t2s_model = t2s_model.half()
-t2s_model = t2s_model.to(device)
-t2s_model.eval()
-total = sum([param.nelement() for param in t2s_model.parameters()])
-print("Number of parameter: %.2fM" % (total / 1e6))
+def change_sovits_weights(sovits_path):
+    global vq_model,hps
+    dict_s2=torch.load(sovits_path,map_location="cpu")
+    hps=dict_s2["config"]
+    hps = DictToAttrRecursive(hps)
+    hps.model.semantic_frame_rate = "25hz"
+    vq_model = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        n_speakers=hps.data.n_speakers,
+        **hps.model
+    )
+    del vq_model.enc_q
+    if is_half == True:
+        vq_model = vq_model.half().to(device)
+    else:
+        vq_model = vq_model.to(device)
+    vq_model.eval()
+    print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
+change_sovits_weights(sovits_path)
+
+def change_gpt_weights(gpt_path):
+    global hz,max_sec,t2s_model,config
+    hz = 50
+    dict_s1 = torch.load(gpt_path, map_location="cpu")
+    config = dict_s1["config"]
+    max_sec = config["data"]["max_sec"]
+    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+    t2s_model.load_state_dict(dict_s1["weight"])
+    if is_half == True:
+        t2s_model = t2s_model.half()
+    t2s_model = t2s_model.to(device)
+    t2s_model.eval()
+    total = sum([param.nelement() for param in t2s_model.parameters()])
+    print("Number of parameter: %.2fM" % (total / 1e6))
+change_gpt_weights(gpt_path)
def get_spepc(hps, filename):
    audio = load_audio(filename, int(hps.data.sampling_rate))
@@ -325,14 +325,46 @@ def cut3(inp):
    inp = inp.strip("\n")
    return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
+def custom_sort_key(s):
+    # Use a regular expression to split the string into digit and non-digit parts
+    parts = re.split('(\d+)', s)
+    # Convert the digit parts to integers; keep the non-digit parts unchanged
+    parts = [int(part) if part.isdigit() else part for part in parts]
+    return parts
+
+def change_choices():
+    SoVITS_names, GPT_names = get_weights_names()
+    return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
+
+pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
+pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+SoVITS_weight_root="SoVITS_weights"
+GPT_weight_root="GPT_weights"
+os.makedirs(SoVITS_weight_root,exist_ok=True)
+os.makedirs(GPT_weight_root,exist_ok=True)
+
+def get_weights_names():
+    SoVITS_names = [pretrained_sovits_name]
+    for name in os.listdir(SoVITS_weight_root):
+        if name.endswith(".pth"): SoVITS_names.append("%s/%s"%(SoVITS_weight_root,name))
+    GPT_names = [pretrained_gpt_name]
+    for name in os.listdir(GPT_weight_root):
+        if name.endswith(".ckpt"): GPT_names.append("%s/%s"%(GPT_weight_root,name))
+    return SoVITS_names,GPT_names
+
+SoVITS_names,GPT_names = get_weights_names()
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
    gr.Markdown(
        value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
    )
    # with gr.Tabs():
    #     with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
+    with gr.Group():
+        gr.Markdown(value=i18n("模型切换"))
+        with gr.Row():
+            GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path,interactive=True)
+            SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path,interactive=True)
+            refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+            refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+            SoVITS_dropdown.change(change_sovits_weights,[SoVITS_dropdown],[])
+            GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[])
    gr.Markdown(value=i18n("*请上传并填写参考信息"))
    with gr.Row():
        inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath")
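The custom_sort_key added above makes the checkpoint dropdowns sort naturally, so an epoch-10 file lands after an epoch-2 file instead of before it. A standalone illustration with hypothetical weight names (not files from the repo):

```python
import re

def custom_sort_key(s):
    # Split into digit and non-digit runs; digit runs compare numerically
    parts = re.split(r"(\d+)", s)
    return [int(part) if part.isdigit() else part for part in parts]

names = ["mymodel_e10_s300.pth", "mymodel_e2_s60.pth", "mymodel_e2_s120.pth"]
print(sorted(names))                       # lexicographic: e10 sorts before e2
print(sorted(names, key=custom_sort_key))  # natural: e2_s60, e2_s120, e10_s300
```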

(modified file)

@@ -49,10 +49,13 @@ maxx=0.95
alpha=0.5
device="cuda:0"
model=cnhubert.get_model()
+# is_half=False
if(is_half==True):
    model=model.half().to(device)
else:
    model = model.to(device)

+nan_fails=[]
def name2go(wav_name):
    hubert_path="%s/%s.pt"%(hubert_dir,wav_name)
    if(os.path.exists(hubert_path)):return
@@ -60,25 +63,28 @@ def name2go(wav_name):
    tmp_audio = load_audio(wav_path, 32000)
    tmp_max = np.abs(tmp_audio).max()
    if tmp_max > 2.2:
-        print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
+        print("%s-filtered" % (wav_name, tmp_max))
        return
    tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio
+    tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio
    tmp_audio = librosa.resample(
-        tmp_audio32, orig_sr=32000, target_sr=16000
-    )
+        tmp_audio32b, orig_sr=32000, target_sr=16000
+    )  # not a resampling issue
    tensor_wav16 = torch.from_numpy(tmp_audio)
    if (is_half == True):
        tensor_wav16=tensor_wav16.half().to(device)
    else:
        tensor_wav16 = tensor_wav16.to(device)
    ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()  # torch.Size([1, 768, 215])
-    if np.isnan(ssl.detach().numpy()).sum()!= 0:return
+    if np.isnan(ssl.detach().numpy()).sum()!= 0:
+        nan_fails.append(wav_name)
+        print("nan filtered:%s"%wav_name)
+        return
    wavfile.write(
        "%s/%s"%(wav32dir,wav_name),
        32000,
        tmp_audio32.astype("int16"),
    )
    # torch.save(ssl,hubert_path)
    my_save(ssl,hubert_path)

with open(inp_text,"r",encoding="utf8")as f:
@@ -92,3 +98,12 @@ for line in lines[int(i_part)::int(all_parts)]:
        name2go(wav_name)
    except:
        print(line,traceback.format_exc())
+
+if(len(nan_fails)>0 and is_half==True):
+    is_half=False
+    model=model.float()
+    for wav_name in nan_fails:
+        try:
+            name2go(wav_name)
+        except:
+            print(wav_name,traceback.format_exc())
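The nan_fails list above implements a two-pass strategy: extract features in fp16 for speed, collect the clips whose SSL output contains NaN, then retry just those in fp32. A minimal self-contained sketch of the pattern (the `extract` helper here is hypothetical, not the project's API):

```python
import torch

def extract(model: torch.nn.Module, wav: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for the hubert feature extraction
    return model(wav.to(next(model.parameters()).dtype))

def extract_all(model: torch.nn.Module, wavs: dict) -> dict:
    feats, nan_fails = {}, []
    model = model.half()              # first pass in fp16
    for name, wav in wavs.items():
        out = extract(model, wav)
        if torch.isnan(out).any():
            nan_fails.append(name)    # defer: fp16 over/underflowed here
        else:
            feats[name] = out
    if nan_fails:
        model = model.float()         # second pass in fp32, failures only
        for name in nan_fails:
            feats[name] = extract(model, wavs[name])
    return feats
```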

(modified file)

@@ -18,7 +18,7 @@ pinyin_to_symbol_map = {
    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}

-import jieba.posseg as psg
+import jieba_fast.posseg as psg

rep_map = {

(modified file)

@@ -14,7 +14,7 @@
from typing import List
from typing import Tuple

-import jieba
+import jieba_fast as jieba
from pypinyin import lazy_pinyin
from pypinyin import Style

(modified file)

@@ -18,7 +18,7 @@ logging.getLogger("matplotlib").setLevel(logging.ERROR)
MATPLOTLIB_FLAG = False

-logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging

@@ -310,13 +310,13 @@ def check_git_hash(model_dir):
def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
-    logger.setLevel(logging.WARNING)
+    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
-    h.setLevel(logging.WARNING)
+    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger

(modified file)

@@ -61,7 +61,7 @@ sudo apt-get install python3.9-distutils
#### Pip Packages
```bash
-pip install torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet transformers jieba
+pip install torch numpy scipy tensorboard librosa==0.9.2 numba==0.56.4 pytorch-lightning gradio==3.14.0 ffmpeg-python onnxruntime tqdm cn2an pypinyin pyopenjtalk g2p_en chardet transformers jieba_fast
```
#### Additional Requirements
@@ -107,6 +107,31 @@ For Chinese ASR (additionally), download models from [Damo ASR Model](https://mo
For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
### Using Docker
#### docker-compose.yaml configuration
1. Environment variables:
   - is_half: controls whether half precision (fp16) or full precision (fp32) is used. If the content under the 4-cnhubert/5-wav32k directories is not generated correctly during the "SSL extracting" step, this is typically the cause; set it to True or False according to your actual situation.
2. Volumes: the application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
3. shm_size: the default shared-memory size of Docker Desktop on Windows is too small, which can cause abnormal operation; adjust it according to your own situation.
4. The GPU-related settings under the deploy section should be adjusted cautiously according to your system and actual circumstances.
#### Running with docker compose
```bash
docker compose -f "docker-compose.yaml" up -d
```
#### Running with docker command
As above, modify the corresponding parameters based on your actual situation, then run the following command:
```bash
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03
```
## Dataset Format
The TTS annotation .list file format:

(modified file)

@@ -1,10 +1,10 @@
-import sys
+import sys,os

# Model paths specified for inference
sovits_path = ""
gpt_path = ""
-is_half = True
+is_half = eval(os.environ.get("is_half","True"))
is_share=False
cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
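With this change, is_half is read from the is_half environment variable (docker-compose.yaml sets it to False), and `eval` turns the string "True"/"False" into a bool. A stricter parser would reject anything else instead of eval-ing it; a sketch of that alternative (illustrative, not the project's code):

```python
import os

def env_flag(name: str, default: bool = True) -> bool:
    # Accept only explicit boolean spellings rather than eval()-ing the value
    raw = os.environ.get(name)
    if raw is None:
        return default
    value = raw.strip().lower()
    if value in ("1", "true", "yes"):
        return True
    if value in ("0", "false", "no"):
        return False
    raise ValueError(f"{name} must be a boolean-like string, got {raw!r}")

is_half = env_flag("is_half", default=True)
```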

docker-compose.yaml (new file, +31)

@@ -0,0 +1,31 @@
version: '3.8'

services:
  gpt-sovits:
    image: breakstring/gpt-sovits:dev-20240123.03
    container_name: gpt-sovits-container
    environment:
      - is_half=False
    volumes:
      - G:/GPT-SoVITS-DockerTest/output:/workspace/output
      - G:/GPT-SoVITS-DockerTest/logs:/workspace/logs
      - G:/GPT-SoVITS-DockerTest/SoVITS_weights:/workspace/SoVITS_weights
      - G:/GPT-SoVITS-DockerTest/reference:/workspace/reference
    working_dir: /workspace
    ports:
      - "9870:9870"
      - "9871:9871"
      - "9872:9872"
      - "9873:9873"
      - "9874:9874"
    shm_size: 16G
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: "all"
              capabilities: [gpu]
    stdin_open: true
    tty: true
    restart: unless-stopped

(modified file)

@@ -10,7 +10,7 @@
5 - Clean up cached audio and other files in the TEMP folder

-6 - Leave 0.3 s of silence at the end of the reference audio to reduce the issue of the synthesized audio containing the end of the reference audio
+6 - Greatly reduce the issue of the synthesized audio containing the end of the reference audio

### 20240122 Update

@@ -20,5 +20,12 @@
3 - Audio path check: if a wrong input path is given, report that the path does not exist instead of an ffmpeg error.

To fix: hubert extraction produces NaN more often under half precision

+### 20240123 Update
+
+1 - Fix hubert extraction producing NaN, which caused ZeroDivisionError during SoVITS/GPT training
+2 - Support quick model switching in the inference UI
+3 - Improve the sorting logic for model files
+4 - Use jieba_fast instead of jieba for Chinese word segmentation

(modified file)

@@ -87,6 +87,32 @@ brew install ffmpeg
Download [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) and place them in the GPT-SoVITS root directory.

### Using Docker

#### docker-compose.yaml configuration

1. Environment variables:
   - is_half: controls half precision (fp16) vs. full precision (fp32). If the content under the 4-cnhubert/5-wav32k directories cannot be generated correctly during the "SSL extracting" step, this is usually the cause; set it to True or False according to your actual situation.
2. Volumes: the application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
3. shm_size: the default available memory of Docker Desktop on Windows is too small, which can cause abnormal operation; set it according to your own situation.
4. Set the GPU-related entries under the deploy section according to your system and actual circumstances.

#### Running with docker compose
```
docker compose -f "docker-compose.yaml" up -d
```

#### Running with the docker command
As above, modify the corresponding parameters according to your actual situation, then run:
```
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03
```

### Pretrained Models

(modified file)

@@ -93,6 +93,30 @@ brew install ffmpeg
Download [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) and place them in the GPT-SoVITS root directory.

### Using Docker

#### docker-compose.yaml configuration

1. Environment variables:
   - `is_half`: controls half precision (fp16) vs. full precision (fp32). If the content under the `4-cnhubert/5-wav32k` directories is not generated correctly during the "SSL extracting" step, this is usually the cause; set it to True or False according to your actual situation.
2. Volumes: the application's root directory inside the container is set to `/workspace`. The default `docker-compose.yaml` lists some practical examples for uploading/downloading content.
3. `shm_size`: the default available memory of Docker Desktop on Windows is too small, which can cause abnormal behavior; set it according to your situation.
4. Configure the GPU-related entries under the `deploy` section carefully, according to your system and actual circumstances.

#### Running with docker compose
```
docker compose -f "docker-compose.yaml" up -d
```

#### Running with the docker command
As above, modify the corresponding parameters according to your actual situation, then run:
```
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9870:9870 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:dev-20240123.03
```

### Pretrained Models

(modified file)

@@ -20,4 +20,4 @@ transformers
chardet
PyYAML
psutil
-jieba
+jieba_fast

(modified file)

@@ -110,6 +110,7 @@ def b_submit_change(*text_list):
def b_delete_audio(*checkbox_list):
    global g_data_json, g_index, g_max_json_index
    b_save_file()
+    change = False
    for i, checkbox in reversed(list(enumerate(checkbox_list))):
        if g_index + i < len(g_data_json):
@@ -121,8 +122,8 @@ def b_delete_audio(*checkbox_list):
    if g_index > g_max_json_index:
        g_index = g_max_json_index
    g_index = g_index if g_index >= 0 else 0
-    # if change:
-    #     b_save_file()
+    if change:
+        b_save_file()
    # return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch)
    return {"value":g_index,"__type__":"update","maximum":(g_max_json_index if g_max_json_index>=0 else 0)},*b_change_index(g_index, g_batch)

@@ -172,6 +173,7 @@ def b_audio_split(audio_breakpoint, *checkbox_list):
def b_merge_audio(interval_r, *checkbox_list):
    global g_data_json , g_max_json_index
    b_save_file()
+    checked_index = []
    audios_path = []
    audios_text = []

(modified file)

@@ -1,4 +1,4 @@
-import os,shutil,sys,pdb
+import os,shutil,sys,pdb,re
now_dir = os.getcwd()
sys.path.append(now_dir)
import json,yaml,warnings,torch
@@ -85,9 +85,16 @@ os.makedirs(SoVITS_weight_root,exist_ok=True)
os.makedirs(GPT_weight_root,exist_ok=True)
SoVITS_names,GPT_names = get_weights_names()

+def custom_sort_key(s):
+    # Use a regular expression to split the string into digit and non-digit parts
+    parts = re.split('(\d+)', s)
+    # Convert the digit parts to integers; keep the non-digit parts unchanged
+    parts = [int(part) if part.isdigit() else part for part in parts]
+    return parts
+
def change_choices():
    SoVITS_names, GPT_names = get_weights_names()
-    return {"choices": sorted(SoVITS_names), "__type__": "update"}, {"choices": sorted(GPT_names), "__type__": "update"}
+    return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}

p_label=None
p_uvr5=None
@@ -733,8 +740,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
        with gr.TabItem(i18n("1C-推理")):
            gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模体验5秒Zero Shot TTS用。"))
            with gr.Row():
-                GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names),value=pretrained_gpt_name)
-                SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names),value=pretrained_sovits_name)
+                GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
+                SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
                gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
                refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
                refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])