Merge branch 'RVC-Boss:main' into main

This commit is contained in:
梨梨梨 2024-01-22 02:03:46 +08:00 committed by GitHub
commit 032425857c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 115 additions and 110 deletions

View File

@ -12,6 +12,8 @@ bert_path = os.environ.get(
)
# Settings handed over by the launcher process through environment variables.
# Every value arrives as a string, so each one is converted explicitly.

# Port for the inference WebUI; defaults to 9872 when the variable is unset.
infer_ttswebui = int(os.environ.get("infer_ttswebui", 9872))

# NOTE(security): eval() executes whatever text the environment variable holds.
# The launcher is expected to write str(True) / str(False) here; if this
# variable can ever come from an untrusted source, replace eval() with an
# explicit comparison against "True".
is_share = eval(os.environ.get("is_share", "False"))

# The launcher passes the GPU selection under a private name; copy it to the
# variable name that CUDA actually reads.
if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]

# NOTE(security): same eval() caveat as for is_share above.
is_half = eval(os.environ.get("is_half", "True"))
@ -115,7 +117,6 @@ vq_model.eval()
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
hz = 50
max_sec = config["data"]["max_sec"]
# t2s_model = Text2SemanticLightningModule.load_from_checkpoint(checkpoint_path=gpt_path, config=config, map_location="cpu")#########todo
t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
t2s_model.load_state_dict(dict_s1["weight"])
if is_half == True:
@ -149,13 +150,21 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
t0 = ttime()
prompt_text = prompt_text.strip("\n")
prompt_language, text = prompt_language, text.strip("\n")
zero_wav = np.zeros(
int(hps.data.sampling_rate * 0.3),
dtype=np.float16 if is_half == True else np.float32,
)
with torch.no_grad():
wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙
wav16k, sr = librosa.load(ref_wav_path, sr=16000)
wav16k = torch.from_numpy(wav16k)
zero_wav_torch = torch.from_numpy(zero_wav)
if is_half == True:
wav16k = wav16k.half().to(device)
zero_wav_torch = zero_wav_torch.half().to(device)
else:
wav16k = wav16k.to(device)
zero_wav_torch = zero_wav_torch.to(device)
wav16k=torch.cat([wav16k,zero_wav_torch])
ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
"last_hidden_state"
].transpose(
@ -170,10 +179,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
phones1 = cleaned_text_to_sequence(phones1)
texts = text.split("\n")
audio_opt = []
zero_wav = np.zeros(
int(hps.data.sampling_rate * 0.3),
dtype=np.float16 if is_half == True else np.float32,
)
for text in texts:
# 解决输入目标文本的空行导致报错的问题
if (len(text.strip()) == 0):

View File

@ -1,6 +1,8 @@
import time, logging
import time
import logging
import os
import random, traceback
import random
import traceback
import numpy as np
import torch
import torch.utils.data
@ -12,15 +14,12 @@ from text import cleaned_text_to_sequence
from utils import load_wav_to_torch, load_filepaths_and_text
import torch.nn.functional as F
from functools import lru_cache
import torch
import requests
from scipy.io import wavfile
from io import BytesIO
# from config import exp_dir
from my_utils import load_audio
# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
@ -44,7 +43,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
if len(tmp) != 4:
if (len(tmp) != 4):
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@ -52,7 +51,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
if leng < min_num:
if (leng < min_num):
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@ -77,20 +76,28 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
phoneme = phoneme.split(" ")
phoneme = phoneme.split(' ')
phoneme_ids = cleaned_text_to_sequence(phoneme)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
skipped_phone += 1
continue
size = os.path.getsize("%s/%s" % (self.path5, audiopath))
duration = size / self.sampling_rate / 2
if duration == 0:
print(f"Zero duration for {audiopath}, skipping...")
skipped_dur += 1
continue
if 54 > duration > 0.6 or self.val:
audiopaths_sid_text_new.append([audiopath, phoneme_ids])
lengths.append(size // (2 * self.hop_length))
else:
skipped_dur += 1
continue
print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur)
print("total left: ", len(audiopaths_sid_text_new))
assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size这里todo
@ -103,10 +110,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
try:
spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load(
"%s/%s.pt" % (self.path4, audiopath), map_location="cpu"
)
if ssl.shape[-1] != spec.shape[-1]:
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
if (ssl.shape[-1] != spec.shape[-1]):
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@ -117,25 +122,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
ssl = torch.zeros(1, 768, 100)
text = text[-1:]
print("load audio or ssl error!!!!!!", audiopath)
# print(ssl.requires_grad,spec.requires_grad,wav.requires_grad,text.requires_grad)
return (ssl, spec, wav, text)
def get_audio(self, filename):
    """Load one audio file and compute its spectrogram.

    Args:
        filename: Path of the audio file, passed straight to ``load_audio``.

    Returns:
        A ``(spec, audio_norm)`` tuple: ``spec`` is the output of
        ``spectrogram_torch`` with its leading dimension squeezed away, and
        ``audio_norm`` is the loaded audio as a float tensor with one extra
        leading dimension.
    """
    # load_audio already returns samples normalised to the -1..1 range, so no
    # further division by 32768 is needed here.
    audio_array = load_audio(filename, self.sampling_rate)
    audio_norm = torch.FloatTensor(audio_array).unsqueeze(0)
    spec = spectrogram_torch(
        audio_norm,
        self.filter_length,
        self.sampling_rate,
        self.hop_length,
        self.win_length,
        center=False,
    )
    spec = torch.squeeze(spec, 0)
    return spec, audio_norm
@ -152,14 +147,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
def random_slice(self, ssl, wav, mel):
assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, (
"first",
ssl.shape,
wav.shape,
)
"first", ssl.shape, wav.shape)
len_mel = mel.shape[1]
if self.val:
reference_mel = mel[:, : len_mel // 3]
reference_mel = mel[:, :len_mel // 3]
return reference_mel, ssl, wav, mel
dir = random.randint(0, 1)
sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2))
@ -167,29 +159,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
if dir == 0:
reference_mel = mel[:, :sep_point]
ssl = ssl[:, :, sep_point:]
wav2 = wav[:, sep_point * self.hop_length :]
wav2 = wav[:, sep_point * self.hop_length:]
mel = mel[:, sep_point:]
else:
reference_mel = mel[:, sep_point:]
ssl = ssl[:, :, :sep_point]
wav2 = wav[:, : sep_point * self.hop_length]
wav2 = wav[:, :sep_point * self.hop_length]
mel = mel[:, :sep_point]
assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
ssl.shape,
wav.shape,
wav2.shape,
mel.shape,
sep_point,
self.hop_length,
sep_point * self.hop_length,
dir,
)
ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir)
return reference_mel, ssl, wav2, mel
class TextAudioSpeakerCollate:
"""Zero-pads model inputs and targets"""
class TextAudioSpeakerCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@ -202,8 +187,8 @@ class TextAudioSpeakerCollate:
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True
)
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
@ -231,31 +216,22 @@ class TextAudioSpeakerCollate:
row = batch[ids_sorted_decreasing[i]]
ssl = row[0]
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
spec_padded[i, :, : spec.size(1)] = spec
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, : wav.size(1)] = wav
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
text = row[3]
text_padded[i, : text.size(0)] = text
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
return (
ssl_padded,
ssl_lengths,
spec_padded,
spec_lengths,
wav_padded,
wav_lengths,
text_padded,
text_lengths,
)
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
@ -268,18 +244,9 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
"""
def __init__(
self,
dataset,
batch_size,
boundaries,
num_replicas=None,
rank=None,
shuffle=True,
):
def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
# print(233333333333333,self.lengths,dir(dataset))
self.batch_size = batch_size
self.boundaries = boundaries
@ -295,24 +262,22 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, 0, -1):
# for i in range(len(buckets) - 1, -1, -1):
i = len(buckets) - 1
while i >= 0:
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i + 1)
i -= 1
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (
total_batch_size - (len_bucket % total_batch_size)
) % total_batch_size
rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
@ -331,25 +296,13 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = (
ids_bucket
+ ids_bucket * (rem // len_bucket)
+ ids_bucket[: (rem % len_bucket)]
)
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
# subsample
ids_bucket = ids_bucket[self.rank :: self.num_replicas]
ids_bucket = ids_bucket[self.rank::self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [
bucket[idx]
for idx in ids_bucket[
j * self.batch_size : (j + 1) * self.batch_size
]
]
batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
batches.append(batch)
if self.shuffle:
@ -376,4 +329,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
return -1
def __len__(self):
return self.num_samples // self.batch_size
return self.num_samples // self.batch_size

View File

@ -5,6 +5,7 @@ import sys
sovits_path = ""
gpt_path = ""
is_half = True
is_share=False
cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"

25
docs/cn/Changelog_CN.md Normal file
View File

@ -0,0 +1,25 @@
### 20240121更新
1-config添加is_share，诸如colab等场景可以将此改为True，来使得webui映射到公网
2-WebUI添加英文系统英文翻译适配
3-cmd-asr自动判断是否已自带damo模型，如不在默认目录上将从modelscope自动下载
4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复，过滤长度0的样本等
5-清理TEMP文件夹缓存音频等文件
6-在参考音频结尾留空0.3s,削弱合成音频包含参考音频结尾的问题
待修复:
1-过短输出文件返回重复参考音频的问题
2-batch size超过条数导致微调有问题
3-hubert提取在half下出现nan概率更高的问题
高优:
支持英文日文训练

View File

@ -1,7 +1,7 @@
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
少样本强大的声音转换与文本到语音网络界面。<br><br>
强大的少样本语音转换与语音合成Web用户界面。<br><br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange
)](https://github.com/RVC-Boss/GPT-SoVITS)

View File

@ -20,3 +20,4 @@ transformers
chardet
PyYAML
psutil
jieba

View File

@ -6,11 +6,18 @@ import sys,os,traceback
# Directory of audio files to transcribe, given as the first CLI argument.
dir=sys.argv[1]
opt_name=os.path.basename(dir)


def _local_or_hub(local_dir, hub_id):
    """Return ``local_dir`` when it exists on disk, otherwise ``hub_id``."""
    return local_dir if os.path.exists(local_dir) else hub_id


# Prefer the model copies shipped under tools/damo_asr/models. When a local
# directory is missing, pass the "damo/..." model id instead so the pipeline
# resolves the model by name (per the changelog, fetched from modelscope).
path_asr = _local_or_hub(
    'tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
)
path_vad = _local_or_hub(
    'tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
)
path_punc = _local_or_hub(
    'tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
)

# Each keyword is passed exactly once; repeating model=/vad_model=/punc_model=
# in the same call is a syntax error.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model=path_asr,
    vad_model=path_vad,
    punc_model=path_punc,
)

opt=[]

View File

@ -79,6 +79,7 @@ def b_change_index(index, batch):
def b_next_index(index, batch):
b_save_file()
if (index + batch) <= g_max_json_index:
return index + batch , *b_change_index(index + batch, batch)
else:
@ -86,6 +87,7 @@ def b_next_index(index, batch):
def b_previous_index(index, batch):
b_save_file()
if (index - batch) >= 0:
return index - batch , *b_change_index(index - batch, batch)
else:
@ -294,6 +296,7 @@ def set_global(load_json, load_list, json_key_text, json_key_path, batch):
if __name__ == "__main__":
# Command-line options of the labelling WebUI. All values are kept as the
# strings/ints given here; conversion (e.g. of is_share and the port) happens
# where the values are consumed.
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--load_json', default="None", help='source file, like demo.json')
parser.add_argument('--is_share', default="False", help='whether webui is_share=True')
parser.add_argument('--load_list', default="None", help='source file, like demo.list')
# The help text used to be a copy of the --load_list one; this option is the port.
parser.add_argument('--webui_port_subfix', default=9871, help='port the webui listens on, Default: 9871')
parser.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text')
@ -488,5 +491,6 @@ if __name__ == "__main__":
server_name="0.0.0.0",
inbrowser=True,
quiet=True,
share=eval(args.is_share),
server_port=int(args.webui_port_subfix)
)

View File

@ -19,7 +19,8 @@ for name in os.listdir(weight_uvr5_root):
device=sys.argv[1]
is_half=sys.argv[2]
webui_port_uvr5=int(sys.argv[3])
is_share=eval(sys.argv[4])
def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
infos = []
@ -179,6 +180,7 @@ with gr.Blocks(title="RVC WebUI") as app:
app.queue(concurrency_count=511, max_size=1022).launch(
server_name="0.0.0.0",
inbrowser=True,
server_port=9873,
share=is_share,
server_port=webui_port_uvr5,
quiet=True,
)

View File

@ -1,7 +1,7 @@
import json,yaml,warnings,torch
import platform
import psutil
import os
import os,shutil
import signal
from tools import my_utils
@ -12,6 +12,12 @@ now_dir = os.getcwd()
# Route temporary files into a TEMP folder under the working directory and
# clear out whatever an earlier run left behind.
tmp = os.path.join(now_dir, "TEMP")
os.makedirs(tmp, exist_ok=True)
os.environ["TEMP"] = tmp
for name in os.listdir(tmp):
    # jieba.cache is deliberately kept (presumably jieba's dictionary cache,
    # which is costly to rebuild -- confirm).
    if name == "jieba.cache":
        continue
    path = os.path.join(tmp, name)
    try:
        # Only real directories go through rmtree; shutil.rmtree refuses
        # symbolic links, so links are removed like plain files.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)
    except OSError as e:
        # Best-effort cleanup: a locked or vanished entry must not prevent
        # the WebUI from starting.
        print("failed to remove %s: %s" % (path, e))
import site
site_packages_roots = []
for path in site.getsitepackages():
@ -34,7 +40,7 @@ import pdb
import gradio as gr
from subprocess import Popen
import signal
from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix
from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
from scipy.io import wavfile
@ -120,7 +126,7 @@ def kill_process(pid):
def change_label(if_label,path_list):
global p_label
if(if_label==True and p_label==None):
cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s'%(python_exec,path_list,webui_port_subfix)
cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
yield i18n("打标工具WebUI已开启")
print(cmd)
p_label = Popen(cmd, shell=True)
@ -132,7 +138,7 @@ def change_label(if_label,path_list):
def change_uvr5(if_uvr5):
global p_uvr5
if(if_uvr5==True and p_uvr5==None):
cmd = '"%s" tools/uvr5/webui.py "%s" %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5)
cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
yield i18n("UVR5已开启")
print(cmd)
p_uvr5 = Popen(cmd, shell=True)
@ -151,6 +157,7 @@ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path
os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
os.environ["is_half"]=str(is_half)
os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
os.environ["is_share"]=str(is_share)
cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
yield i18n("TTS推理进程已开启")
print(cmd)
@ -659,7 +666,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
label=i18n("*训练集音频文件目录"),
# value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",
interactive=True,
placeholder=i18n("训练集音频文件目录 拼接 list文件里波形对应的文件名")
placeholder=i18n("训练集音频文件目录-拼接-list文件里波形对应的文件名不是全路径")
)
gr.Markdown(value=i18n("1Aa-文本内容"))
with gr.Row():
@ -740,7 +747,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
app.queue(concurrency_count=511, max_size=1022).launch(
server_name="0.0.0.0",
inbrowser=True,
share=True,
share=is_share,
server_port=webui_port_main,
quiet=True,
)