diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index e5e604f..dbc7eb3 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -12,6 +12,8 @@ bert_path = os.environ.get( ) infer_ttswebui = os.environ.get("infer_ttswebui", 9872) infer_ttswebui = int(infer_ttswebui) +is_share = os.environ.get("is_share", "False") +is_share=eval(is_share) if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] is_half = eval(os.environ.get("is_half", "True")) @@ -115,7 +117,6 @@ vq_model.eval() print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) hz = 50 max_sec = config["data"]["max_sec"] -# t2s_model = Text2SemanticLightningModule.load_from_checkpoint(checkpoint_path=gpt_path, config=config, map_location="cpu")#########todo t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False) t2s_model.load_state_dict(dict_s1["weight"]) if is_half == True: @@ -149,13 +150,21 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) t0 = ttime() prompt_text = prompt_text.strip("\n") prompt_language, text = prompt_language, text.strip("\n") + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) with torch.no_grad(): - wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙 + wav16k, sr = librosa.load(ref_wav_path, sr=16000) wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) if is_half == True: wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) else: wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k=torch.cat([wav16k,zero_wav_torch]) ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ "last_hidden_state" ].transpose( @@ -170,10 +179,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language) phones1 = cleaned_text_to_sequence(phones1) texts = text.split("\n") audio_opt = [] - zero_wav = np.zeros( - int(hps.data.sampling_rate * 0.3), - dtype=np.float16 if is_half == True else np.float32, - ) for text in texts: # 解决输入目标文本的空行导致报错的问题 if (len(text.strip()) == 0): diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 15f401d..ff4c4f4 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -1,6 +1,8 @@ -import time, logging +import time +import logging import os -import random, traceback +import random +import traceback import numpy as np import torch import torch.utils.data @@ -12,15 +14,12 @@ from text import cleaned_text_to_sequence from utils import load_wav_to_torch, load_filepaths_and_text import torch.nn.functional as F from functools import lru_cache -import torch import requests from scipy.io import wavfile from io import BytesIO - -# from config import exp_dir from my_utils import load_audio - +# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -44,7 +43,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if len(tmp) != 4: + if (len(tmp) != 4): continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -52,7 +51,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if leng < min_num: + if (leng < min_num): self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -77,20 +76,28 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(" ") + phoneme = phoneme.split(' ') phoneme_ids = cleaned_text_to_sequence(phoneme) except Exception: print(f"{audiopath} not in self.phoneme_data !") skipped_phone += 1 continue + size = os.path.getsize("%s/%s" % (self.path5, audiopath)) duration = size / self.sampling_rate / 2 + + if duration == 0: + print(f"Zero duration for {audiopath}, skipping...") + skipped_dur += 1 + continue + if 54 > duration > 0.6 or self.val: audiopaths_sid_text_new.append([audiopath, phoneme_ids]) lengths.append(size // (2 * self.hop_length)) else: skipped_dur += 1 continue + print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur) print("total left: ", len(audiopaths_sid_text_new)) assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo @@ -103,10 +110,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): try: spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): - ssl = torch.load( - "%s/%s.pt" % (self.path4, audiopath), map_location="cpu" - ) - if ssl.shape[-1] != spec.shape[-1]: + ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") + if (ssl.shape[-1] != spec.shape[-1]): typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -117,25 +122,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): ssl = torch.zeros(1, 768, 100) text = text[-1:] print("load audio or ssl error!!!!!!", audiopath) - # print(ssl.requires_grad,spec.requires_grad,wav.requires_grad,text.requires_grad) return (ssl, spec, wav, text) def get_audio(self, filename): - audio_array = load_audio( - filename, self.sampling_rate - ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 - # print(filename,audio_array.max(),audio_array.min(),audio_array.mean()) + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) + spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, + center=False) spec = torch.squeeze(spec, 0) return spec, audio_norm @@ -152,14 +147,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): def random_slice(self, ssl, wav, mel): assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ( - "first", - ssl.shape, - wav.shape, - ) + "first", ssl.shape, wav.shape) len_mel = mel.shape[1] if self.val: - reference_mel = mel[:, : len_mel // 3] + reference_mel = mel[:, :len_mel // 3] return reference_mel, ssl, wav, mel dir = random.randint(0, 1) sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2)) @@ -167,29 +159,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): if dir == 0: reference_mel = mel[:, :sep_point] ssl = ssl[:, :, sep_point:] - wav2 = wav[:, sep_point * self.hop_length :] + wav2 = wav[:, sep_point * self.hop_length:] mel = mel[:, sep_point:] else: reference_mel = mel[:, sep_point:] ssl = ssl[:, :, :sep_point] - wav2 = wav[:, : sep_point * self.hop_length] + wav2 = wav[:, :sep_point * self.hop_length] mel = mel[:, :sep_point] assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, ( - ssl.shape, - wav.shape, - wav2.shape, - mel.shape, - sep_point, - self.hop_length, - sep_point * self.hop_length, - dir, - ) + ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir) return reference_mel, ssl, wav2, mel -class TextAudioSpeakerCollate: - """Zero-pads model inputs and targets""" +class TextAudioSpeakerCollate(): + """ Zero-pads model inputs and targets + """ def __init__(self, return_ids=False): self.return_ids = return_ids @@ -202,8 +187,8 @@ class TextAudioSpeakerCollate: """ # Right zero-pad all one-hot text sequences to max input length _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True - ) + torch.LongTensor([x[1].size(1) for x in batch]), + dim=0, descending=True) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) @@ -231,31 +216,22 @@ class TextAudioSpeakerCollate: row = batch[ids_sorted_decreasing[i]] ssl = row[0] - ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, : spec.size(1)] = spec + spec_padded[i, :, :spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, : wav.size(1)] = wav + wav_padded[i, :, :wav.size(1)] = wav wav_lengths[i] = wav.size(1) text = row[3] - text_padded[i, : text.size(0)] = text + text_padded[i, :text.size(0)] = text text_lengths[i] = text.size(0) - return ( - ssl_padded, - ssl_lengths, - spec_padded, - spec_lengths, - wav_padded, - wav_lengths, - text_padded, - text_lengths, - ) + return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): @@ -268,18 +244,9 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. """ - def __init__( - self, - dataset, - batch_size, - boundaries, - num_replicas=None, - rank=None, - shuffle=True, - ): + def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) self.lengths = dataset.lengths - # print(233333333333333,self.lengths,dir(dataset)) self.batch_size = batch_size self.boundaries = boundaries @@ -295,24 +262,22 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): if idx_bucket != -1: buckets[idx_bucket].append(i) - for i in range(len(buckets) - 1, 0, -1): - # for i in range(len(buckets) - 1, -1, -1): + i = len(buckets) - 1 + while i >= 0: if len(buckets[i]) == 0: buckets.pop(i) self.boundaries.pop(i + 1) + i -= 1 num_samples_per_bucket = [] for i in range(len(buckets)): len_bucket = len(buckets[i]) total_batch_size = self.num_replicas * self.batch_size - rem = ( - total_batch_size - (len_bucket % total_batch_size) - ) % total_batch_size + rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size num_samples_per_bucket.append(len_bucket + rem) return buckets, num_samples_per_bucket def __iter__(self): - # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch) @@ -331,25 +296,13 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): ids_bucket = indices[i] num_samples_bucket = self.num_samples_per_bucket[i] - # add extra samples to make it evenly divisible rem = num_samples_bucket - len_bucket - ids_bucket = ( - ids_bucket - + ids_bucket * (rem // len_bucket) - + ids_bucket[: (rem % len_bucket)] - ) + ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] - # subsample - ids_bucket = ids_bucket[self.rank :: self.num_replicas] + ids_bucket = ids_bucket[self.rank::self.num_replicas] - # batching for j in range(len(ids_bucket) // self.batch_size): - batch = [ - bucket[idx] - for idx in ids_bucket[ - j * self.batch_size : (j + 1) * self.batch_size - ] - ] + batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] batches.append(batch) if self.shuffle: @@ -376,4 +329,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): return -1 def __len__(self): - return self.num_samples // self.batch_size + return self.num_samples // self.batch_size \ No newline at end of file diff --git a/config.py b/config.py index 504ca62..ec846b3 100644 --- a/config.py +++ b/config.py @@ -5,6 +5,7 @@ import sys sovits_path = "" gpt_path = "" is_half = True +is_share=False cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md new file mode 100644 index 0000000..71d3c72 --- /dev/null +++ b/docs/cn/Changelog_CN.md @@ -0,0 +1,25 @@ +### 20240121更新 + +1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网 + +2-WebUI添加英文系统英文翻译适配 + +3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载 + +4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) + +5-清理TEMP文件夹缓存音频等文件 + +6-在参考音频结尾留空0.3s,削弱合成音频包含参考音频结尾的问题 + +待修复: + +1-过短输出文件返回重复参考音频的问题 + +2-batch size超过条数导致微调有问题 + +3-hubert提取在half下出现nan概率更高的问题 + +高优: + +支持英文日文训练 diff --git a/docs/cn/README.md b/docs/cn/README.md index 5993b08..4c7f530 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -1,7 +1,7 @@