diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index e5e604f..dbc7eb3 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -12,6 +12,8 @@ bert_path = os.environ.get( ) infer_ttswebui = os.environ.get("infer_ttswebui", 9872) infer_ttswebui = int(infer_ttswebui) +is_share = os.environ.get("is_share", "False") +is_share=eval(is_share) if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] is_half = eval(os.environ.get("is_half", "True")) @@ -115,7 +117,6 @@ vq_model.eval() print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) hz = 50 max_sec = config["data"]["max_sec"] -# t2s_model = Text2SemanticLightningModule.load_from_checkpoint(checkpoint_path=gpt_path, config=config, map_location="cpu")#########todo t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False) t2s_model.load_state_dict(dict_s1["weight"]) if is_half == True: @@ -149,13 +150,21 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) t0 = ttime() prompt_text = prompt_text.strip("\n") prompt_language, text = prompt_language, text.strip("\n") + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) with torch.no_grad(): - wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙 + wav16k, sr = librosa.load(ref_wav_path, sr=16000) wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) if is_half == True: wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) else: wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k=torch.cat([wav16k,zero_wav_torch]) ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ "last_hidden_state" ].transpose( @@ -170,10 +179,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones1 = cleaned_text_to_sequence(phones1) texts = text.split("\n") audio_opt = [] - zero_wav = np.zeros( - int(hps.data.sampling_rate * 0.3), - dtype=np.float16 if is_half == True else np.float32, - ) for text in texts: # 解决输入目标文本的空行导致报错的问题 if (len(text.strip()) == 0): diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 15f401d..ff4c4f4 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -1,6 +1,8 @@ -import time, logging +import time +import logging import os -import random, traceback +import random +import traceback import numpy as np import torch import torch.utils.data @@ -12,15 +14,12 @@ from text import cleaned_text_to_sequence from utils import load_wav_to_torch, load_filepaths_and_text import torch.nn.functional as F from functools import lru_cache -import torch import requests from scipy.io import wavfile from io import BytesIO - -# from config import exp_dir from my_utils import load_audio - +# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -44,7 +43,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if len(tmp) != 4: + if (len(tmp) != 4): continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -52,7 +51,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if leng < min_num: + if (leng < min_num): self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -77,20 +76,28 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(" ") + phoneme = phoneme.split(' ') phoneme_ids = cleaned_text_to_sequence(phoneme) except Exception: print(f"{audiopath} not in self.phoneme_data !") skipped_phone += 1 continue + size = os.path.getsize("%s/%s" % (self.path5, audiopath)) duration = size / self.sampling_rate / 2 + + if duration == 0: + print(f"Zero duration for {audiopath}, skipping...") + skipped_dur += 1 + continue + if 54 > duration > 0.6 or self.val: audiopaths_sid_text_new.append([audiopath, phoneme_ids]) lengths.append(size // (2 * self.hop_length)) else: skipped_dur += 1 continue + print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur) print("total left: ", len(audiopaths_sid_text_new)) assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo @@ -103,10 +110,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): try: spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): - ssl = torch.load( - "%s/%s.pt" % (self.path4, audiopath), map_location="cpu" - ) - if ssl.shape[-1] != spec.shape[-1]: + ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") + if (ssl.shape[-1] != spec.shape[-1]): typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -117,25 +122,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): ssl = torch.zeros(1, 768, 100) text = text[-1:] print("load audio or ssl error!!!!!!", audiopath) - # print(ssl.requires_grad,spec.requires_grad,wav.requires_grad,text.requires_grad) return (ssl, spec, wav, text) def get_audio(self, filename): - audio_array = load_audio( - filename, self.sampling_rate - ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 - # print(filename,audio_array.max(),audio_array.min(),audio_array.mean()) + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) + spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, + center=False) spec = torch.squeeze(spec, 0) return spec, audio_norm @@ -152,14 +147,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): def random_slice(self, ssl, wav, mel): assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ( - "first", - ssl.shape, - wav.shape, - ) + "first", ssl.shape, wav.shape) len_mel = mel.shape[1] if self.val: - reference_mel = mel[:, : len_mel // 3] + reference_mel = mel[:, :len_mel // 3] return reference_mel, ssl, wav, mel dir = random.randint(0, 1) sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2)) @@ -167,29 +159,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): if dir == 0: reference_mel = mel[:, :sep_point] ssl = ssl[:, :, sep_point:] - wav2 = wav[:, sep_point * self.hop_length :] + wav2 = wav[:, sep_point * self.hop_length:] mel = mel[:, sep_point:] else: reference_mel = mel[:, sep_point:] ssl = ssl[:, :, :sep_point] - wav2 = wav[:, : sep_point * self.hop_length] + wav2 = wav[:, :sep_point * self.hop_length] mel = mel[:, :sep_point] assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, ( - ssl.shape, - wav.shape, - wav2.shape, - mel.shape, - sep_point, - self.hop_length, - sep_point * self.hop_length, - dir, - ) + ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir) return reference_mel, ssl, wav2, mel -class TextAudioSpeakerCollate: - """Zero-pads model inputs and targets""" +class TextAudioSpeakerCollate(): + """ Zero-pads model inputs and targets + """ def __init__(self, return_ids=False): self.return_ids = return_ids @@ -202,8 +187,8 @@ class TextAudioSpeakerCollate: """ # Right zero-pad all one-hot text sequences to max input length _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True - ) + torch.LongTensor([x[1].size(1) for x in batch]), + dim=0, descending=True) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) @@ -231,31 +216,22 @@ class TextAudioSpeakerCollate: row = batch[ids_sorted_decreasing[i]] ssl = row[0] - ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, : spec.size(1)] = spec + spec_padded[i, :, :spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, : wav.size(1)] = wav + wav_padded[i, :, :wav.size(1)] = wav wav_lengths[i] = wav.size(1) text = row[3] - text_padded[i, : text.size(0)] = text + text_padded[i, :text.size(0)] = text text_lengths[i] = text.size(0) - return ( - ssl_padded, - ssl_lengths, - spec_padded, - spec_lengths, - wav_padded, - wav_lengths, - text_padded, - text_lengths, - ) + return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): @@ -268,18 +244,9 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. """ - def __init__( - self, - dataset, - batch_size, - boundaries, - num_replicas=None, - rank=None, - shuffle=True, - ): + def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) self.lengths = dataset.lengths - # print(233333333333333,self.lengths,dir(dataset)) self.batch_size = batch_size self.boundaries = boundaries @@ -295,24 +262,22 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): if idx_bucket != -1: buckets[idx_bucket].append(i) - for i in range(len(buckets) - 1, 0, -1): - # for i in range(len(buckets) - 1, -1, -1): + i = len(buckets) - 1 + while i >= 0: if len(buckets[i]) == 0: buckets.pop(i) self.boundaries.pop(i + 1) + i -= 1 num_samples_per_bucket = [] for i in range(len(buckets)): len_bucket = len(buckets[i]) total_batch_size = self.num_replicas * self.batch_size - rem = ( - total_batch_size - (len_bucket % total_batch_size) - ) % total_batch_size + rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size num_samples_per_bucket.append(len_bucket + rem) return buckets, num_samples_per_bucket def __iter__(self): - # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch) @@ -331,25 +296,13 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): ids_bucket = indices[i] num_samples_bucket = self.num_samples_per_bucket[i] - # add extra samples to make it evenly divisible rem = num_samples_bucket - len_bucket - ids_bucket = ( - ids_bucket - + ids_bucket * (rem // len_bucket) - + ids_bucket[: (rem % len_bucket)] - ) + ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] - # subsample - ids_bucket = ids_bucket[self.rank :: self.num_replicas] + ids_bucket = ids_bucket[self.rank::self.num_replicas] - # batching for j in range(len(ids_bucket) // self.batch_size): - batch = [ - bucket[idx] - for idx in ids_bucket[ - j * self.batch_size : (j + 1) * self.batch_size - ] - ] + batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] batches.append(batch) if self.shuffle: @@ -376,4 +329,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): return -1 def __len__(self): - return self.num_samples // self.batch_size + return self.num_samples // self.batch_size \ No newline at end of file diff --git a/config.py b/config.py index 504ca62..ec846b3 100644 --- a/config.py +++ b/config.py @@ -5,6 +5,7 @@ import sys sovits_path = "" gpt_path = "" is_half = True +is_share=False cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md new file mode 100644 index 0000000..71d3c72 --- /dev/null +++ b/docs/cn/Changelog_CN.md @@ -0,0 +1,25 @@ +### 20240121更新 + +1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网 + +2-WebUI添加英文系统英文翻译适配 + +3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载 + +4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) + +5-清理TEMP文件夹缓存音频等文件 + +6-在参考音频结尾留空0.3s,削弱合成音频包含参考音频结尾的问题 + +待修复: + +1-过短输出文件返回重复参考音频的问题 + +2-batch size超过条数导致微调有问题 + +3-hubert提取在half下出现nan概率更高的问题 + +高优: + +支持英文日文训练 diff --git a/docs/cn/README.md b/docs/cn/README.md index 5993b08..4c7f530 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -1,7 +1,7 @@

GPT-SoVITS-WebUI

-少样本强大的声音转换与文本到语音网络界面。

+强大的少样本语音转换与语音合成Web用户界面。

[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange )](https://github.com/RVC-Boss/GPT-SoVITS) diff --git a/requirements.txt b/requirements.txt index 2e64033..fedce8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ transformers chardet PyYAML psutil +jieba diff --git a/tools/damo_asr/cmd-asr.py b/tools/damo_asr/cmd-asr.py index 9d1c4b9..2775562 100644 --- a/tools/damo_asr/cmd-asr.py +++ b/tools/damo_asr/cmd-asr.py @@ -6,11 +6,18 @@ import sys,os,traceback dir=sys.argv[1] # opt_name=dir.split("\\")[-1].split("/")[-1] opt_name=os.path.basename(dir) + +path_asr='tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' +path_vad='tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' +path_punc='tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' +path_asr=path_asr if os.path.exists(path_asr)else "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" +path_vad=path_vad if os.path.exists(path_vad)else "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" +path_punc=path_punc if os.path.exists(path_punc)else "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, - model='tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', - vad_model='tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch', - punc_model='tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch', + model=path_asr, + vad_model=path_vad, + punc_model=path_punc, ) opt=[] diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py index 0e6585e..ad4907b 100644 --- a/tools/subfix_webui.py +++ b/tools/subfix_webui.py @@ -79,6 +79,7 @@ def b_change_index(index, batch): def b_next_index(index, batch): + b_save_file() if (index + batch) <= g_max_json_index: return index + batch , *b_change_index(index + batch, batch) else: @@ -86,6 +87,7 @@ def b_next_index(index, batch): def b_previous_index(index, batch): + b_save_file() if (index - batch) >= 0: return index - batch , *b_change_index(index - batch, batch) else: @@ -294,6 +296,7 @@ def set_global(load_json, load_list, json_key_text, json_key_path, batch): if __name__ == "__main__": parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--load_json', default="None", help='source file, like demo.json') + parser.add_argument('--is_share', default="False", help='whether webui is_share=True') parser.add_argument('--load_list', default="None", help='source file, like demo.list') parser.add_argument('--webui_port_subfix', default=9871, help='source file, like demo.list') parser.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text') @@ -488,5 +491,6 @@ if __name__ == "__main__": server_name="0.0.0.0", inbrowser=True, quiet=True, + share=eval(args.is_share), server_port=int(args.webui_port_subfix) ) \ No newline at end of file diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index 11b39f5..59d2d2e 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -19,7 +19,8 @@ for name in os.listdir(weight_uvr5_root): device=sys.argv[1] is_half=sys.argv[2] - +webui_port_uvr5=int(sys.argv[3]) +is_share=eval(sys.argv[4]) def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] @@ -179,6 +180,7 @@ with gr.Blocks(title="RVC WebUI") as app: app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, - server_port=9873, + share=is_share, + server_port=webui_port_uvr5, quiet=True, ) \ No newline at end of file diff --git a/webui.py b/webui.py index 1d82e25..e783094 100644 --- a/webui.py +++ b/webui.py @@ -1,7 +1,7 @@ import json,yaml,warnings,torch import platform import psutil -import os +import os,shutil import signal from tools import my_utils @@ -12,6 +12,12 @@ now_dir = os.getcwd() tmp = os.path.join(now_dir, "TEMP") os.makedirs(tmp, exist_ok=True) os.environ["TEMP"] = tmp +if(os.path.exists(tmp)): + for name in os.listdir(tmp): + if(name=="jieba.cache"):continue + path="%s/%s"%(tmp,name) + delete=os.remove if os.path.isfile(path) else shutil.rmtree + delete(path) import site site_packages_roots = [] for path in site.getsitepackages(): @@ -34,7 +40,7 @@ import pdb import gradio as gr from subprocess import Popen import signal -from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix +from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share from tools.i18n.i18n import I18nAuto i18n = I18nAuto() from scipy.io import wavfile @@ -120,7 +126,7 @@ def kill_process(pid): def change_label(if_label,path_list): global p_label if(if_label==True and p_label==None): - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s'%(python_exec,path_list,webui_port_subfix) + cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share) yield i18n("打标工具WebUI已开启") print(cmd) p_label = Popen(cmd, shell=True) @@ -132,7 +138,7 @@ def change_label(if_label,path_list): def change_uvr5(if_uvr5): global p_uvr5 if(if_uvr5==True and p_uvr5==None): - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5) + cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share) yield i18n("UVR5已开启") print(cmd) p_uvr5 = Popen(cmd, shell=True) @@ -151,6 +157,7 @@ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number os.environ["is_half"]=str(is_half) os.environ["infer_ttswebui"]=str(webui_port_infer_tts) + os.environ["is_share"]=str(is_share) cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec) yield i18n("TTS推理进程已开启") print(cmd) @@ -659,7 +666,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: label=i18n("*训练集音频文件目录"), # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", interactive=True, - placeholder=i18n("训练集音频文件目录 拼接 list文件里波形对应的文件名。") + placeholder=i18n("训练集音频文件目录-拼接-list文件里波形对应的文件名(不是全路径)。") ) gr.Markdown(value=i18n("1Aa-文本内容")) with gr.Row(): @@ -740,7 +747,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, - share=True, + share=is_share, server_port=webui_port_main, quiet=True, )