diff --git a/Docker/download.py b/Docker/download.py index 234fd06..952423d 100644 --- a/Docker/download.py +++ b/Docker/download.py @@ -1,5 +1,8 @@ # Download moda ASR related models from modelscope import snapshot_download -model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4") -model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4") -model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4") + +model_dir = snapshot_download( + "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", revision="v2.0.4" +) +model_dir = snapshot_download("damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", revision="v2.0.4") +model_dir = snapshot_download("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", revision="v2.0.4") diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py index 45f91d8..d845733 100644 --- a/GPT_SoVITS/AR/data/bucket_sampler.py +++ b/GPT_SoVITS/AR/data/bucket_sampler.py @@ -4,14 +4,11 @@ import itertools import math import random from random import shuffle -from typing import Iterator -from typing import Optional -from typing import TypeVar +from typing import Iterator, Optional, TypeVar import torch import torch.distributed as dist -from torch.utils.data import Dataset -from torch.utils.data import Sampler +from torch.utils.data import Dataset, Sampler __all__ = [ "DistributedBucketSampler", @@ -50,10 +47,7 @@ class DistributedBucketSampler(Sampler[T_co]): if torch.cuda.is_available(): torch.cuda.set_device(rank) if rank >= num_replicas or rank < 0: - raise ValueError( - "Invalid rank {}, rank should be in the interval" - " [0, {}]".format(rank, num_replicas - 1) - ) + raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1)) self.dataset = dataset self.num_replicas = num_replicas self.rank = rank @@ -61,19 +55,16 @@ class DistributedBucketSampler(Sampler[T_co]): self.drop_last = drop_last # If the dataset length is evenly divisible by # of replicas, then there # is no need to drop any data, since the dataset will be split equally. - if ( - self.drop_last and len(self.dataset) % self.num_replicas != 0 - ): # type: ignore[arg-type] + if self.drop_last and len(self.dataset) % self.num_replicas != 0: # type: ignore[arg-type] # Split to nearest available length that is evenly divisible. # This is to ensure each rank receives the same amount of data when # using this Sampler. self.num_samples = math.ceil( - (len(self.dataset) - self.num_replicas) - / self.num_replicas # type: ignore[arg-type] + (len(self.dataset) - self.num_replicas) / self.num_replicas, # type: ignore[arg-type] ) else: self.num_samples = math.ceil( - len(self.dataset) / self.num_replicas + len(self.dataset) / self.num_replicas, ) # type: ignore[arg-type] self.total_size = self.num_samples * self.num_replicas self.shuffle = shuffle @@ -118,10 +109,7 @@ class DistributedBucketSampler(Sampler[T_co]): grouped_batch_size = self.batch_size * self.num_replicas shuffled_bucket = list(itertools.chain(*shuffled_bucket)) n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) - batches = [ - shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] - for b in range(n_batch) - ] + batches = [shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] for b in range(n_batch)] shuffle(batches) indices = list(itertools.chain(*batches)) else: @@ -134,9 +122,7 @@ class DistributedBucketSampler(Sampler[T_co]): if padding_size <= len(indices): indices += indices[:padding_size] else: - indices += (indices * math.ceil(padding_size / len(indices)))[ - :padding_size - ] + indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size] else: # remove tail of data to make it evenly divisible. indices = indices[: self.total_size] diff --git a/GPT_SoVITS/AR/data/data_module.py b/GPT_SoVITS/AR/data/data_module.py index cb94795..f360503 100644 --- a/GPT_SoVITS/AR/data/data_module.py +++ b/GPT_SoVITS/AR/data/data_module.py @@ -1,9 +1,10 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py # reference: https://github.com/lifeiteng/vall-e from pytorch_lightning import LightningDataModule +from torch.utils.data import DataLoader + from AR.data.bucket_sampler import DistributedBucketSampler from AR.data.dataset import Text2SemanticDataset -from torch.utils.data import DataLoader class Text2SemanticDataModule(LightningDataModule): @@ -42,8 +43,12 @@ class Text2SemanticDataModule(LightningDataModule): # pad_val=self.config['data']['pad_val']) def train_dataloader(self): - batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] - batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 + batch_size = ( + self.config["train"]["batch_size"] // 2 + if self.config["train"].get("if_dpo", False) is True + else self.config["train"]["batch_size"] + ) + batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # 防止不保存 sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) return DataLoader( self._train_dataset, diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py index 9d2dfe8..402483d 100644 --- a/GPT_SoVITS/AR/data/dataset.py +++ b/GPT_SoVITS/AR/data/dataset.py @@ -1,21 +1,17 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py # reference: https://github.com/lifeiteng/vall-e -import pdb -import sys # sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert") -import traceback, os -from typing import Dict -from typing import List +import os +import traceback +from typing import Dict, List import numpy as np import pandas as pd -import torch, json -from torch.utils.data import DataLoader -from torch.utils.data import Dataset -from transformers import AutoTokenizer +import torch +from torch.utils.data import DataLoader, Dataset -version = os.environ.get('version',None) +version = os.environ.get("version", None) from text import cleaned_text_to_sequence @@ -34,9 +30,7 @@ def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0 padded_sequences = [] for seq, length in zip(sequences, seq_lengths): - padding = ( - [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) - ) + padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value) padded_sequences.append(padded_seq) batch = np.stack(padded_sequences) @@ -61,12 +55,16 @@ class Text2SemanticDataset(Dataset): super().__init__() self.semantic_data = pd.read_csv( - semantic_path, delimiter="\t", encoding="utf-8" + semantic_path, + delimiter="\t", + encoding="utf-8", ) # get dict self.path2 = phoneme_path # "%s/2-name2text.txt"%exp_dir#phoneme_path self.path3 = "%s/3-bert" % ( - os.path.dirname(phoneme_path) + os.path.dirname( + phoneme_path, + ) ) # "%s/3-bert"%exp_dir#bert_dir self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path assert os.path.exists(self.path2) @@ -127,7 +125,7 @@ class Text2SemanticDataset(Dataset): for i in range(semantic_data_len): # 先依次遍历 # get str - item_name = self.semantic_data.iloc[i,0] + item_name = self.semantic_data.iloc[i, 0] # print(self.phoneme_data) try: phoneme, word2ph, text = self.phoneme_data[item_name] @@ -137,7 +135,7 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue - semantic_str = self.semantic_data.iloc[i,1] + semantic_str = self.semantic_data.iloc[i, 1] # get token list semantic_ids = [int(idx) for idx in semantic_str.split(" ")] # (T), 是否需要变成 (1, T) -> 不需要,因为需要求 len @@ -158,9 +156,7 @@ class Text2SemanticDataset(Dataset): num_not_in += 1 continue # if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行 - if ( - len(phoneme_ids) > self.max_sec * self.hz / 2.5 - ): ###########2:改为恒定限制为semantic/2.5就行 + if len(phoneme_ids) > self.max_sec * self.hz / 2.5: ###########2:改为恒定限制为semantic/2.5就行 num_deleted_ps += 1 continue # if len(semantic_ids) > 1000:###########3 @@ -169,9 +165,7 @@ class Text2SemanticDataset(Dataset): ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz) - if ( - ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio - ): ##########4#3~25#每秒多少个phone + if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio: ##########4#3~25#每秒多少个phone num_deleted_ps += 1 # print(item_name) continue @@ -194,12 +188,12 @@ class Text2SemanticDataset(Dataset): print(f"there are {num_not_in} semantic datas not in phoneme datas") if num_deleted_bigger > 0: print( - f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds" + f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds", ) if num_deleted_ps > 0: # 4702 for LibriTTS, LirbriTTS 是标注数据, 是否需要筛?=> 需要,有值为 100 的极端值 print( - f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}" + f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}", ) """ there are 31 semantic datas not in phoneme datas @@ -306,7 +300,10 @@ if __name__ == "__main__": batch_size = 12 dataloader = DataLoader( - dataset, batch_size=batch_size, collate_fn=dataset.collate, shuffle=False + dataset, + batch_size=batch_size, + collate_fn=dataset.collate, + shuffle=False, ) for i, batch in enumerate(dataloader): if i % 1000 == 0: diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py index 2dd3f39..0696c35 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py @@ -1,6 +1,7 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py # reference: https://github.com/lifeiteng/vall-e -import os, sys +import os +import sys now_dir = os.getcwd() sys.path.append(now_dir) @@ -8,10 +9,12 @@ from typing import Dict import torch from pytorch_lightning import LightningModule + from AR.models.t2s_model import Text2SemanticDecoder from AR.modules.lr_schedulers import WarmupCosineLRSchedule from AR.modules.optim import ScaledAdam + class Text2SemanticLightningModule(LightningModule): def __init__(self, config, output_dir, is_train=True): super().__init__() @@ -23,7 +26,10 @@ class Text2SemanticLightningModule(LightningModule): # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) print( self.load_state_dict( - torch.load(pretrained_s1, map_location="cpu")["weight"] + torch.load( + pretrained_s1, + map_location="cpu", + )["weight"], ) ) if is_train: @@ -35,7 +41,7 @@ class Text2SemanticLightningModule(LightningModule): def training_step(self, batch: Dict, batch_idx: int): opt = self.optimizers() scheduler = self.lr_schedulers() - forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old + forward = self.model.forward if self.config["train"].get("if_dpo", False) == True else self.model.forward_old loss, acc = forward( batch["phoneme_ids"], batch["phoneme_ids_len"], @@ -113,9 +119,7 @@ class Text2SemanticLightningModule(LightningModule): def configure_optimizers(self): model_parameters = self.model.parameters() parameters_names = [] - parameters_names.append( - [name_param_pair[0] for name_param_pair in self.model.named_parameters()] - ) + parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) lm_opt = ScaledAdam( model_parameters, lr=0.01, diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py index 487edb0..b0ab59c 100644 --- a/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py @@ -1,6 +1,7 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py # reference: https://github.com/lifeiteng/vall-e -import os, sys +import os +import sys now_dir = os.getcwd() sys.path.append(now_dir) @@ -8,6 +9,7 @@ from typing import Dict import torch from pytorch_lightning import LightningModule + from AR.models.t2s_model_onnx import Text2SemanticDecoder from AR.modules.lr_schedulers import WarmupCosineLRSchedule from AR.modules.optim import ScaledAdam @@ -24,8 +26,11 @@ class Text2SemanticLightningModule(LightningModule): # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) print( self.load_state_dict( - torch.load(pretrained_s1, map_location="cpu")["weight"] - ) + torch.load( + pretrained_s1, + map_location="cpu", + )["weight"], + ), ) if is_train: self.automatic_optimization = False @@ -79,9 +84,7 @@ class Text2SemanticLightningModule(LightningModule): def configure_optimizers(self): model_parameters = self.model.parameters() parameters_names = [] - parameters_names.append( - [name_param_pair[0] for name_param_pair in self.model.named_parameters()] - ) + parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) lm_opt = ScaledAdam( model_parameters, lr=0.01, diff --git a/GPT_SoVITS/AR/models/t2s_model.py b/GPT_SoVITS/AR/models/t2s_model.py index 8a32d0d..4725b7a 100644 --- a/GPT_SoVITS/AR/models/t2s_model.py +++ b/GPT_SoVITS/AR/models/t2s_model.py @@ -2,27 +2,24 @@ # reference: https://github.com/lifeiteng/vall-e import math from typing import List, Optional -import torch -from tqdm import tqdm -from AR.models.utils import make_pad_mask, make_pad_mask_left -from AR.models.utils import ( - topk_sampling, - sample, - logits_to_probs, - multinomial_sample_one_no_sync, - dpo_loss, - make_reject_y, - get_batch_logps -) -from AR.modules.embedding import SinePositionalEmbedding -from AR.modules.embedding import TokenEmbedding -from AR.modules.transformer import LayerNorm -from AR.modules.transformer import TransformerEncoder -from AR.modules.transformer import TransformerEncoderLayer +import torch from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy +from tqdm import tqdm + +from AR.models.utils import ( + dpo_loss, + get_batch_logps, + make_pad_mask, + make_pad_mask_left, + make_reject_y, + sample, + topk_sampling, +) +from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding +from AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer default_config = { "embedding_dim": 512, @@ -36,10 +33,17 @@ default_config = { "EOS": 1024, } + # @torch.jit.script ## 使用的话首次推理会非常慢,而且推理速度不稳定 # Efficient implementation equivalent to the following: -def scaled_dot_product_attention(query:torch.Tensor, key:torch.Tensor, value:torch.Tensor, attn_mask:Optional[torch.Tensor]=None, scale:Optional[torch.Tensor]=None) -> torch.Tensor: - B, H, L, S =query.size(0), query.size(1), query.size(-2), key.size(-2) +def scaled_dot_product_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + B, H, L, S = query.size(0), query.size(1), query.size(-2), key.size(-2) if scale is None: scale_factor = torch.tensor(1 / math.sqrt(query.size(-1))) else: @@ -59,12 +63,13 @@ def scaled_dot_product_attention(query:torch.Tensor, key:torch.Tensor, value:tor if attn_mask.dtype == torch.bool: attn_weight.masked_fill_(attn_mask, 0) else: - attn_mask[attn_mask!=float("-inf")] =0 - attn_mask[attn_mask==float("-inf")] =1 + attn_mask[attn_mask != float("-inf")] = 0 + attn_mask[attn_mask == float("-inf")] = 1 attn_weight.masked_fill_(attn_mask, 0) return attn_weight @ value + @torch.jit.script class T2SMLP: def __init__(self, w1, b1, w2, b2): @@ -82,20 +87,20 @@ class T2SMLP: @torch.jit.script class T2SBlock: def __init__( - self, - num_heads, - hidden_dim: int, - mlp: T2SMLP, - qkv_w, - qkv_b, - out_w, - out_b, - norm_w1, - norm_b1, - norm_eps1, - norm_w2, - norm_b2, - norm_eps2, + self, + num_heads, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1, + norm_w2, + norm_b2, + norm_eps2, ): self.num_heads = num_heads self.mlp = mlp @@ -114,24 +119,32 @@ class T2SBlock: self.false = torch.tensor(False, dtype=torch.bool) @torch.jit.ignore - def to_mask(self, x:torch.Tensor, padding_mask:Optional[torch.Tensor]): + def to_mask( + self, + x: torch.Tensor, + padding_mask: Optional[torch.Tensor], + ): if padding_mask is None: return x - + if padding_mask.dtype == torch.bool: return x.masked_fill(padding_mask, 0) else: return x * padding_mask - - def process_prompt(self, x:torch.Tensor, attn_mask : torch.Tensor, padding_mask:Optional[torch.Tensor]=None, torch_sdpa:bool=True): - + def process_prompt( + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + torch_sdpa: bool = True, + ): q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1) batch_size = q.shape[0] q_len = q.shape[1] kv_len = k.shape[1] - + q = self.to_mask(q, padding_mask) k_cache = self.to_mask(k, padding_mask) v_cache = self.to_mask(v, padding_mask) @@ -149,9 +162,7 @@ class T2SBlock: attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b) x = x + attn - x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x = x + self.mlp.forward(x) x = F.layer_norm( x, @@ -161,13 +172,20 @@ class T2SBlock: self.norm_eps2, ) return x, k_cache, v_cache - - def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor, attn_mask:torch.Tensor=None, torch_sdpa:bool=True): + + def decode_next_token( + self, + x: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_mask: torch.Tensor = None, + torch_sdpa: bool = True, + ): q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) k_cache = torch.cat([k_cache, k], dim=1) v_cache = torch.cat([v_cache, v], dim=1) - + batch_size = q.shape[0] q_len = q.shape[1] kv_len = k_cache.shape[1] @@ -176,7 +194,6 @@ class T2SBlock: k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) - if torch_sdpa: attn = F.scaled_dot_product_attention(q, k, v, (~attn_mask) if attn_mask is not None else None) else: @@ -187,7 +204,11 @@ class T2SBlock: x = x + attn x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 + x, + [self.hidden_dim], + self.norm_w1, + self.norm_b1, + self.norm_eps1, ) x = x + self.mlp.forward(x) x = F.layer_norm( @@ -202,17 +223,19 @@ class T2SBlock: @torch.jit.script class T2STransformer: - def __init__(self, num_blocks : int, blocks: List[T2SBlock]): - self.num_blocks : int = num_blocks + def __init__(self, num_blocks: int, blocks: List[T2SBlock]): + self.num_blocks: int = num_blocks self.blocks = blocks def process_prompt( - self, x:torch.Tensor, attn_mask : torch.Tensor, - padding_mask : Optional[torch.Tensor]=None, - torch_sdpa:bool=True - ): - k_cache : List[torch.Tensor] = [] - v_cache : List[torch.Tensor] = [] + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + torch_sdpa: bool = True, + ): + k_cache: List[torch.Tensor] = [] + v_cache: List[torch.Tensor] = [] for i in range(self.num_blocks): x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask, torch_sdpa) k_cache.append(k_cache_) @@ -220,14 +243,17 @@ class T2STransformer: return x, k_cache, v_cache def decode_next_token( - self, x:torch.Tensor, - k_cache: List[torch.Tensor], - v_cache: List[torch.Tensor], - attn_mask : torch.Tensor=None, - torch_sdpa:bool=True + self, + x: torch.Tensor, + k_cache: List[torch.Tensor], + v_cache: List[torch.Tensor], + attn_mask: torch.Tensor = None, + torch_sdpa: bool = True, ): for i in range(self.num_blocks): - x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i], attn_mask, torch_sdpa) + x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token( + x, k_cache[i], v_cache[i], attn_mask, torch_sdpa + ) return x, k_cache, v_cache @@ -249,16 +275,26 @@ class Text2SemanticDecoder(nn.Module): # assert self.EOS == 1024 self.bert_proj = nn.Linear(1024, self.embedding_dim) self.ar_text_embedding = TokenEmbedding( - self.embedding_dim, self.phoneme_vocab_size, self.p_dropout + self.embedding_dim, + self.phoneme_vocab_size, + self.p_dropout, ) self.ar_text_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True + self.embedding_dim, + dropout=0.1, + scale=False, + alpha=True, ) self.ar_audio_embedding = TokenEmbedding( - self.embedding_dim, self.vocab_size, self.p_dropout + self.embedding_dim, + self.vocab_size, + self.p_dropout, ) self.ar_audio_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True + self.embedding_dim, + dropout=0.1, + scale=False, + alpha=True, ) self.h = TransformerEncoder( @@ -293,7 +329,7 @@ class Text2SemanticDecoder(nn.Module): layer.linear1.weight, layer.linear1.bias, layer.linear2.weight, - layer.linear2.bias + layer.linear2.bias, ) block = T2SBlock( @@ -309,11 +345,11 @@ class Text2SemanticDecoder(nn.Module): layer.norm1.eps, layer.norm2.weight, layer.norm2.bias, - layer.norm2.eps + layer.norm2.eps, ) blocks.append(block) - + self.t2s_transformer = T2STransformer(self.num_layers, blocks) def make_input_data(self, x, x_lens, y, y_lens, bert_feature): @@ -387,7 +423,9 @@ class Text2SemanticDecoder(nn.Module): logits = self.ar_predict_layer(xy_dec[:, x_len:]) ###### DPO ############# - reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(x, x_lens, reject_y, reject_y_lens, bert_feature) + reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data( + x, x_lens, reject_y, reject_y_lens, bert_feature + ) reject_xy_dec, _ = self.h( (reject_xy_pos, None), @@ -404,7 +442,7 @@ class Text2SemanticDecoder(nn.Module): A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets) loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True) - + loss = loss_1 + loss_2 return loss, acc @@ -473,14 +511,14 @@ class Text2SemanticDecoder(nn.Module): # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么 def infer( - self, - x, - x_lens, - prompts, - bert_feature, - top_k: int = -100, - early_stop_num: int = -1, - temperature: float = 1.0, + self, + x, + x_lens, + prompts, + bert_feature, + top_k: int = -100, + early_stop_num: int = -1, + temperature: float = 1.0, ): x = self.ar_text_embedding(x) x = x + self.bert_proj(bert_feature.transpose(1, 2)) @@ -508,18 +546,14 @@ class Text2SemanticDecoder(nn.Module): (x_len, 0), value=False, ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( - y.device - ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(y.device) xy_dec, _ = self.h( (xy_pos, None), mask=xy_attn_mask, ) logits = self.ar_predict_layer(xy_dec[:, -1]) - samples = topk_sampling( - logits, top_k=top_k, top_p=1.0, temperature=temperature - ) + samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature) if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: print("use early stop num:", early_stop_num) @@ -542,18 +576,16 @@ class Text2SemanticDecoder(nn.Module): return y def pad_y_eos(self, y, y_mask_int, eos_id): - targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad( - y_mask_int, (0, 1), value=1 - ) + targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(y_mask_int, (0, 1), value=1) # 错位 return targets[:, :-1], targets[:, 1:] def infer_panel_batch_infer( self, - x:List[torch.LongTensor], #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:List[torch.LongTensor], + x: List[torch.LongTensor], #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: List[torch.LongTensor], top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, @@ -563,10 +595,19 @@ class Text2SemanticDecoder(nn.Module): ): if prompts is None: print("Warning: Prompt free is not supported batch_infer! switch to naive_infer") - return self.infer_panel_naive_batched(x, x_lens, prompts, bert_feature, top_k=top_k, top_p=top_p, early_stop_num=early_stop_num, temperature=temperature, **kwargs) + return self.infer_panel_naive_batched( + x, + x_lens, + prompts, + bert_feature, + top_k=top_k, + top_p=top_p, + early_stop_num=early_stop_num, + temperature=temperature, + **kwargs, + ) - - max_len = kwargs.get("max_len",x_lens.max()) + max_len = kwargs.get("max_len", x_lens.max()) x_list = [] for x_item, bert_item in zip(x, bert_feature): # max_len = max(max_len, x_item.shape[0], bert_item.shape[1]) @@ -574,14 +615,15 @@ class Text2SemanticDecoder(nn.Module): x_item = x_item + self.bert_proj(bert_item.transpose(0, 1).unsqueeze(0)) x_item = self.ar_text_position(x_item).squeeze(0) # x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0] early_stop_num) or idx==1499: + + if (early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num) or idx == 1499: print("use early stop num:", early_stop_num) stop = True for i, batch_index in enumerate(batch_idx_map): batch_index = batch_idx_map[i] idx_list[batch_index] = idx y_list[batch_index] = y[i, :-1] - - if not (None in idx_list): + + if None not in idx_list: stop = True - + if stop: - if y.shape[1]==0: + if y.shape[1] == 0: y = torch.concat([y, torch.zeros_like(samples)], dim=1) print("bad zero prediction") print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") @@ -730,60 +764,65 @@ class Text2SemanticDecoder(nn.Module): ####################### update next step ################################### y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) - if (None in idx_list): + if None in idx_list: for i in range(x.shape[0]): if idx_list[i] is None: - idx_list[i] = 1500-1 ###如果没有生成到EOS,就用最大长度代替 - + idx_list[i] = 1500 - 1 ###如果没有生成到EOS,就用最大长度代替 + if ref_free: - return y_list, [0]*x.shape[0] + return y_list, [0] * x.shape[0] # print(idx_list) return y_list, idx_list - - def infer_panel_naive_batched(self, - x:List[torch.LongTensor], #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:List[torch.LongTensor], + + def infer_panel_naive_batched( + self, + x: List[torch.LongTensor], #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: List[torch.LongTensor], top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, temperature: float = 1.0, repetition_penalty: float = 1.35, - **kwargs - ): + **kwargs, + ): y_list = [] idx_list = [] for i in range(len(x)): - y, idx = self.infer_panel_naive(x[i].unsqueeze(0), - x_lens[i], - prompts[i].unsqueeze(0) if prompts is not None else None, - bert_feature[i].unsqueeze(0), - top_k, - top_p, - early_stop_num, - temperature, - repetition_penalty, - **kwargs) + y, idx = self.infer_panel_naive( + x[i].unsqueeze(0), + x_lens[i], + prompts[i].unsqueeze(0) if prompts is not None else None, + bert_feature[i].unsqueeze(0), + top_k, + top_p, + early_stop_num, + temperature, + repetition_penalty, + **kwargs, + ) y_list.append(y[0]) idx_list.append(idx) - + return y_list, idx_list - + def infer_panel_naive( self, - x:torch.LongTensor, #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:torch.LongTensor, + x: torch.LongTensor, #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: torch.LongTensor, top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, temperature: float = 1.0, repetition_penalty: float = 1.35, - **kwargs + **kwargs, ): x = self.ar_text_embedding(x) x = x + self.bert_proj(bert_feature.transpose(1, 2)) @@ -828,11 +867,13 @@ class Text2SemanticDecoder(nn.Module): (x_len, 0), value=False, ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)\ - .unsqueeze(0)\ - .expand(bsz*self.num_head, -1, -1)\ - .view(bsz, self.num_head, src_len, src_len)\ - .to(device=x.device, dtype=torch.bool) + xy_attn_mask = ( + torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + .unsqueeze(0) + .expand(bsz * self.num_head, -1, -1) + .view(bsz, self.num_head, src_len, src_len) + .to(device=x.device, dtype=torch.bool) + ) for idx in tqdm(range(1500)): if xy_attn_mask is not None: @@ -840,13 +881,11 @@ class Text2SemanticDecoder(nn.Module): else: xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) - logits = self.ar_predict_layer( - xy_dec[:, -1] - ) + logits = self.ar_predict_layer(xy_dec[:, -1]) if idx == 0: xy_attn_mask = None - if(idx<11):###至少预测出10个token不然不给停止(0.4s) + if idx < 11: ###至少预测出10个token不然不给停止(0.4s) logits = logits[:, :-1] samples = sample( @@ -870,24 +909,27 @@ class Text2SemanticDecoder(nn.Module): ####################### update next step ################################### y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) if ref_free: return y[:, :-1], 0 return y[:, :-1], idx - - + def infer_panel( self, - x:torch.LongTensor, #####全部文本token - x_lens:torch.LongTensor, - prompts:torch.LongTensor, ####参考音频token - bert_feature:torch.LongTensor, + x: torch.LongTensor, #####全部文本token + x_lens: torch.LongTensor, + prompts: torch.LongTensor, ####参考音频token + bert_feature: torch.LongTensor, top_k: int = -100, top_p: int = 100, early_stop_num: int = -1, temperature: float = 1.0, repetition_penalty: float = 1.35, - **kwargs + **kwargs, ): - return self.infer_panel_naive(x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs) + return self.infer_panel_naive( + x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs + ) diff --git a/GPT_SoVITS/AR/models/t2s_model_onnx.py b/GPT_SoVITS/AR/models/t2s_model_onnx.py index 7834297..4f7b50a 100644 --- a/GPT_SoVITS/AR/models/t2s_model_onnx.py +++ b/GPT_SoVITS/AR/models/t2s_model_onnx.py @@ -1,17 +1,13 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py # reference: https://github.com/lifeiteng/vall-e import torch -from tqdm import tqdm - -from AR.modules.embedding_onnx import SinePositionalEmbedding -from AR.modules.embedding_onnx import TokenEmbedding -from AR.modules.transformer_onnx import LayerNorm -from AR.modules.transformer_onnx import TransformerEncoder -from AR.modules.transformer_onnx import TransformerEncoderLayer from torch import nn from torch.nn import functional as F from torchmetrics.classification import MulticlassAccuracy +from AR.modules.embedding_onnx import SinePositionalEmbedding, TokenEmbedding +from AR.modules.transformer_onnx import LayerNorm, TransformerEncoder, TransformerEncoderLayer + default_config = { "embedding_dim": 512, "hidden_dim": 512, @@ -26,12 +22,13 @@ default_config = { inf_tensor_value = torch.FloatTensor([-float("Inf")]).float() + def logits_to_probs( logits, - previous_tokens = None, + previous_tokens=None, temperature: float = 1.0, - top_k = None, - top_p = None, + top_k=None, + top_p=None, repetition_penalty: float = 1.0, ): previous_tokens = previous_tokens.squeeze() @@ -39,19 +36,27 @@ def logits_to_probs( previous_tokens = previous_tokens.long() score = torch.gather(logits, dim=0, index=previous_tokens) score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty + score < 0, + score * repetition_penalty, + score / repetition_penalty, ) logits.scatter_(dim=0, index=previous_tokens, src=score) if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 + torch.nn.functional.softmax( + sorted_logits, + dim=-1, + ), + dim=-1, ) sorted_indices_to_remove = cum_probs > top_p sorted_indices_to_remove[0] = False # keep at least one option indices_to_remove = sorted_indices_to_remove.scatter( - dim=0, index=sorted_indices, src=sorted_indices_to_remove + dim=0, + index=sorted_indices, + src=sorted_indices_to_remove, ) logits = logits.masked_fill(indices_to_remove, -float("Inf")) @@ -67,7 +72,7 @@ def logits_to_probs( def multinomial_sample_one_no_sync( - probs_sort + probs_sort, ): # Does multinomial sampling without a cuda synchronization q = torch.randn_like(probs_sort) return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) @@ -79,7 +84,9 @@ def sample( **sampling_kwargs, ): probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, **sampling_kwargs + logits=logits, + previous_tokens=previous_tokens, + **sampling_kwargs, ) idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs @@ -91,7 +98,7 @@ class OnnxEncoder(nn.Module): self.ar_text_embedding = ar_text_embedding self.bert_proj = bert_proj self.ar_text_position = ar_text_position - + def forward(self, x, bert_feature): x = self.ar_text_embedding(x) x = x + self.bert_proj(bert_feature.transpose(1, 2)) @@ -99,8 +106,18 @@ class OnnxEncoder(nn.Module): class T2SFirstStageDecoder(nn.Module): - def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, - top_k, early_stop_num, num_layers): + def __init__( + self, + ar_audio_embedding, + ar_audio_position, + h, + ar_predict_layer, + loss_fct, + ar_accuracy_metric, + top_k, + early_stop_num, + num_layers, + ): super().__init__() self.ar_audio_embedding = ar_audio_embedding self.ar_audio_position = ar_audio_position @@ -111,11 +128,11 @@ class T2SFirstStageDecoder(nn.Module): self.top_k = top_k self.early_stop_num = early_stop_num self.num_layers = num_layers - + def forward(self, x, prompt): y = prompt - x_example = x[:,:,0] * 0.0 - #N, 1, 512 + x_example = x[:, :, 0] * 0.0 + # N, 1, 512 cache = { "all_stage": self.num_layers, "k": None, @@ -132,11 +149,15 @@ class T2SFirstStageDecoder(nn.Module): xy_pos = torch.concat([x, y_pos], dim=1) - y_example = y_pos[:,:,0] * 0.0 - x_attn_mask = torch.matmul(x_example.transpose(0, 1) , x_example).bool() + y_example = y_pos[:, :, 0] * 0.0 + x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example).bool() y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64) y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum( - torch.ones_like(y_example.transpose(0, 1), dtype=torch.int64), dim=0 + torch.ones_like( + y_example.transpose(0, 1), + dtype=torch.int64, + ), + dim=0, ) y_attn_mask = y_attn_mask > 0 @@ -145,10 +166,16 @@ class T2SFirstStageDecoder(nn.Module): x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1) y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1) xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) - cache["k"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ - .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) - cache["v"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ - .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) + cache["k"] = ( + torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512))) + .unsqueeze(1) + .repeat(self.num_layers, 1, 1, 1) + ) + cache["v"] = ( + torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512))) + .unsqueeze(1) + .repeat(self.num_layers, 1, 1, 1) + ) xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) logits = self.ar_predict_layer(xy_dec[:, -1]) @@ -160,8 +187,18 @@ class T2SFirstStageDecoder(nn.Module): class T2SStageDecoder(nn.Module): - def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, - top_k, early_stop_num, num_layers): + def __init__( + self, + ar_audio_embedding, + ar_audio_position, + h, + ar_predict_layer, + loss_fct, + ar_accuracy_metric, + top_k, + early_stop_num, + num_layers, + ): super().__init__() self.ar_audio_embedding = ar_audio_embedding self.ar_audio_position = ar_audio_position @@ -184,14 +221,18 @@ class T2SStageDecoder(nn.Module): } y_emb = torch.cat( - [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 + [ + cache["y_emb"], + self.ar_audio_embedding(y[:, -1:]), + ], + 1, ) cache["y_emb"] = y_emb y_pos = self.ar_audio_position(y_emb) xy_pos = y_pos[:, -1:] - - y_example = y_pos[:,:,0] * 0.0 + + y_example = y_pos[:, :, 0] * 0.0 xy_attn_mask = torch.cat([x_example, y_example], dim=1) xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool) @@ -250,12 +291,28 @@ class Text2SemanticDecoder(nn.Module): def init_onnx(self): self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position) - self.first_stage_decoder = T2SFirstStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, - self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, - self.num_layers) - self.stage_decoder = T2SStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, - self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, - self.num_layers) + self.first_stage_decoder = T2SFirstStageDecoder( + self.ar_audio_embedding, + self.ar_audio_position, + self.h, + self.ar_predict_layer, + self.loss_fct, + self.ar_accuracy_metric, + self.top_k, + self.early_stop_num, + self.num_layers, + ) + self.stage_decoder = T2SStageDecoder( + self.ar_audio_embedding, + self.ar_audio_position, + self.h, + self.ar_predict_layer, + self.loss_fct, + self.ar_accuracy_metric, + self.top_k, + self.early_stop_num, + self.num_layers, + ) def forward(self, x, prompts, bert_feature): early_stop_num = self.early_stop_num @@ -286,7 +343,7 @@ class Text2SemanticDecoder(nn.Module): y = prompts prefix_len = y.shape[1] x_len = x.shape[1] - x_example = x[:,:,0] * 0.0 + x_example = x[:, :, 0] * 0.0 x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example) x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool) @@ -303,9 +360,7 @@ class Text2SemanticDecoder(nn.Module): if cache["first_infer"] == 1: y_emb = self.ar_audio_embedding(y) else: - y_emb = torch.cat( - [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 - ) + y_emb = torch.cat([cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1) cache["y_emb"] = y_emb y_pos = self.ar_audio_position(y_emb) if cache["first_infer"] == 1: @@ -317,7 +372,8 @@ class Text2SemanticDecoder(nn.Module): x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True) y_attn_mask = F.pad( torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), - (x_len, 0), value=False + (x_len, 0), + value=False, ) xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) else: @@ -335,4 +391,4 @@ class Text2SemanticDecoder(nn.Module): break y = torch.concat([y, samples], dim=1) cache["first_infer"] = 0 - return y, idx \ No newline at end of file + return y, idx diff --git a/GPT_SoVITS/AR/models/utils.py b/GPT_SoVITS/AR/models/utils.py index d2ae26d..cc4f24d 100644 --- a/GPT_SoVITS/AR/models/utils.py +++ b/GPT_SoVITS/AR/models/utils.py @@ -1,8 +1,10 @@ # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py # reference: https://github.com/lifeiteng/vall-e +from typing import Tuple + import torch import torch.nn.functional as F -from typing import Tuple + def sequence_mask(length, max_length=None): if max_length is None: @@ -67,14 +69,18 @@ def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: n = lengths.size(0) seq_range = torch.arange(0, max_len, device=lengths.device) expaned_lengths = seq_range.unsqueeze(0).repeat(n, 1) - expaned_lengths -= (max_len-lengths).unsqueeze(-1) + expaned_lengths -= (max_len - lengths).unsqueeze(-1) - return expaned_lengths<0 + return expaned_lengths < 0 # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py def top_k_top_p_filtering( - logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 + logits, + top_k=0, + top_p=1.0, + filter_value=-float("Inf"), + min_tokens_to_keep=1, ): """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: @@ -105,9 +111,7 @@ def top_k_top_p_filtering( sorted_indices_to_remove[..., 0] = 0 # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove - ) + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) logits[indices_to_remove] = filter_value return logits @@ -130,7 +134,7 @@ def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): return token -from typing import Optional, Tuple +from typing import Optional def multinomial_sample_one_no_sync( @@ -156,19 +160,21 @@ def logits_to_probs( previous_tokens = previous_tokens.long() score = torch.gather(logits, dim=1, index=previous_tokens) score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty + score < 0, + score * repetition_penalty, + score / repetition_penalty, ) logits.scatter_(dim=1, index=previous_tokens, src=score) if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 - ) + cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1) sorted_indices_to_remove = cum_probs > top_p sorted_indices_to_remove[:, 0] = False # keep at least one option indices_to_remove = sorted_indices_to_remove.scatter( - dim=1, index=sorted_indices, src=sorted_indices_to_remove + dim=1, + index=sorted_indices, + src=sorted_indices_to_remove, ) logits = logits.masked_fill(indices_to_remove, -float("Inf")) @@ -176,7 +182,7 @@ def logits_to_probs( if top_k is not None: v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - pivot = v[: , -1].unsqueeze(-1) + pivot = v[:, -1].unsqueeze(-1) logits = torch.where(logits < pivot, -float("Inf"), logits) probs = torch.nn.functional.softmax(logits, dim=-1) @@ -188,18 +194,19 @@ def sample( previous_tokens: Optional[torch.Tensor] = None, **sampling_kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: - probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, **sampling_kwargs - ) + probs = logits_to_probs(logits=logits, previous_tokens=previous_tokens, **sampling_kwargs) idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs -def dpo_loss(policy_chosen_logps: torch.FloatTensor, - policy_rejected_logps: torch.FloatTensor, - reference_chosen_logps: torch.FloatTensor, - reference_rejected_logps: torch.FloatTensor, - beta: float, - reference_free: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + +def dpo_loss( + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + beta: float, + reference_free: bool = False, +) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: pi_logratios = policy_chosen_logps - policy_rejected_logps ref_logratios = reference_chosen_logps - reference_rejected_logps @@ -214,40 +221,53 @@ def dpo_loss(policy_chosen_logps: torch.FloatTensor, return losses.mean(), chosen_rewards, rejected_rewards -def get_batch_logps(logits_target: torch.FloatTensor, logits_reject: torch.FloatTensor, labels_target: torch.LongTensor, labels_reject: torch.LongTensor, average_log_prob: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor]: +def get_batch_logps( + logits_target: torch.FloatTensor, + logits_reject: torch.FloatTensor, + labels_target: torch.LongTensor, + labels_reject: torch.LongTensor, + average_log_prob: bool = False, +) -> Tuple[torch.FloatTensor, torch.FloatTensor]: # dummy token; we'll ignore the losses on these tokens later - per_token_logps_target = torch.gather(logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2)).squeeze(2) - per_token_logps_reject = torch.gather(logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2)).squeeze(2) + per_token_logps_target = torch.gather( + logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2) + ).squeeze(2) + per_token_logps_reject = torch.gather( + logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2) + ).squeeze(2) return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1) + def make_reject_y(y_o, y_lens): def repeat_P(y): range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() - pre = y[:range_idx[0]] - shf = y[range_idx[1]:] - range_text = y[range_idx[0]:range_idx[1]] + pre = y[: range_idx[0]] + shf = y[range_idx[1] :] + range_text = y[range_idx[0] : range_idx[1]] new_y = torch.cat([pre, range_text, range_text, shf]) return new_y + def lost_P(y): range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() - pre = y[:range_idx[0]] - shf = y[range_idx[1]:] - range_text = y[range_idx[0]:range_idx[1]] + pre = y[: range_idx[0]] + shf = y[range_idx[1] :] + range_text = y[range_idx[0] : range_idx[1]] new_y = torch.cat([pre, shf]) return new_y + bs = len(y_lens) reject_y = [] reject_y_lens = [] for b in range(bs): - process_item_idx = torch.randint(0, 1, size=(1, ))[0] + process_item_idx = torch.randint(0, 1, size=(1,))[0] if process_item_idx == 0: new_y = repeat_P(y_o[b]) reject_y.append(new_y) reject_y_lens.append(len(new_y)) - elif process_item_idx==1: + elif process_item_idx == 1: new_y = lost_P(y_o[b]) reject_y.append(new_y) reject_y_lens.append(len(new_y)) @@ -256,7 +276,7 @@ def make_reject_y(y_o, y_lens): pad_length = max_length - reject_y_lens[b] reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0) - reject_y = torch.stack(reject_y, dim = 0) + reject_y = torch.stack(reject_y, dim=0) reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device) return reject_y, reject_y_lens diff --git a/GPT_SoVITS/AR/modules/activation.py b/GPT_SoVITS/AR/modules/activation.py index 5ca888b..936f9c3 100644 --- a/GPT_SoVITS/AR/modules/activation.py +++ b/GPT_SoVITS/AR/modules/activation.py @@ -1,17 +1,14 @@ # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py -from typing import Optional -from typing import Tuple +from typing import Optional, Tuple + import torch from torch import Tensor -from torch.nn import Linear -from torch.nn import Module -from torch.nn.init import constant_ -from torch.nn.init import xavier_normal_ -from torch.nn.init import xavier_uniform_ +from torch.nn import Linear, Module +from torch.nn import functional as F +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from torch.nn import functional as F from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched F.multi_head_attention_forward = multi_head_attention_forward_patched @@ -73,6 +70,7 @@ class MultiheadAttention(Module): >>> attn_output, attn_output_weights = multihead_attn(query, key, value) """ + __constants__ = ["batch_first"] bias_k: Optional[torch.Tensor] bias_v: Optional[torch.Tensor] @@ -104,9 +102,7 @@ class MultiheadAttention(Module): self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if add_bias_kv: self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) @@ -117,31 +113,32 @@ class MultiheadAttention(Module): if linear1_cls == Linear: if not self._qkv_same_embed_dim: self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs) + torch.empty((embed_dim, embed_dim), **factory_kwargs), ) self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs) + torch.empty((embed_dim, self.kdim), **factory_kwargs), ) self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs) + torch.empty((embed_dim, self.vdim), **factory_kwargs), ) self.register_parameter("in_proj_weight", None) else: self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs), ) self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs) - ) + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) else: self.register_parameter("in_proj_bias", None) self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, ) self._reset_parameters() @@ -150,7 +147,10 @@ class MultiheadAttention(Module): raise NotImplementedError else: self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + embed_dim, + 3 * embed_dim, + bias=bias, + **factory_kwargs, ) self.in_proj_weight = self.in_proj_linear.weight @@ -164,7 +164,10 @@ class MultiheadAttention(Module): self.register_parameter("in_proj_bias", None) self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, ) if self.bias_k is not None: @@ -261,28 +264,26 @@ class MultiheadAttention(Module): if key_padding_mask is not None: _kpm_dtype = key_padding_mask.dtype if _kpm_dtype != torch.bool and not torch.is_floating_point( - key_padding_mask + key_padding_mask, ): - raise AssertionError( - "only bool and floating types of key_padding_mask are supported" - ) + raise AssertionError("only bool and floating types of key_padding_mask are supported") why_not_fast_path = "" if not is_batched: - why_not_fast_path = ( - f"input not batched; expected query.dim() of 3 but got {query.dim()}" - ) + why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}" elif query is not key or key is not value: # When lifting this restriction, don't forget to either # enforce that the dtypes all match or test cases where # they don't! why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: - why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" - elif ( - self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype - ): + why_not_fast_path = ( + f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + ) + elif self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype: # this case will fail anyway, but at least they'll get a useful error message. - why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + why_not_fast_path = ( + f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + ) elif self.training: why_not_fast_path = "training is enabled" elif not self.batch_first: @@ -300,9 +301,7 @@ class MultiheadAttention(Module): elif attn_mask is not None: why_not_fast_path = "attn_mask was not None" elif query.is_nested and key_padding_mask is not None: - why_not_fast_path = ( - "key_padding_mask is not supported with NestedTensor input" - ) + why_not_fast_path = "key_padding_mask is not supported with NestedTensor input" elif self.num_heads % 2 == 1: why_not_fast_path = "num_heads is odd" elif torch.is_autocast_enabled(): @@ -322,20 +321,10 @@ class MultiheadAttention(Module): # generator expressions. if torch.overrides.has_torch_function(tensor_args): why_not_fast_path = "some Tensor argument has_torch_function" - elif not all( - [ - (x is None or x.is_cuda or "cpu" in str(x.device)) - for x in tensor_args - ] - ): + elif not all([(x is None or x.is_cuda or "cpu" in str(x.device)) for x in tensor_args]): why_not_fast_path = "some Tensor argument is neither CUDA nor CPU" - elif torch.is_grad_enabled() and any( - [x is not None and x.requires_grad for x in tensor_args] - ): - why_not_fast_path = ( - "grad is enabled and at least one of query or the " - "input/output projection weights or biases requires_grad" - ) + elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]): + why_not_fast_path = "grad is enabled and at least one of query or the input/output projection weights or biases requires_grad" if not why_not_fast_path: return torch._native_multi_head_attention( query, @@ -350,11 +339,7 @@ class MultiheadAttention(Module): key_padding_mask if key_padding_mask is not None else attn_mask, need_weights, average_attn_weights, - 1 - if key_padding_mask is not None - else 0 - if attn_mask is not None - else None, + 1 if key_padding_mask is not None else 0 if attn_mask is not None else None, ) any_nested = query.is_nested or key.is_nested or value.is_nested diff --git a/GPT_SoVITS/AR/modules/activation_onnx.py b/GPT_SoVITS/AR/modules/activation_onnx.py index b54acd9..c14ce40 100644 --- a/GPT_SoVITS/AR/modules/activation_onnx.py +++ b/GPT_SoVITS/AR/modules/activation_onnx.py @@ -1,17 +1,13 @@ # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py -from typing import Optional -from typing import Tuple +from typing import Optional, Tuple + import torch from torch import Tensor -from torch.nn import Linear -from torch.nn import Module -from torch.nn.init import constant_ -from torch.nn.init import xavier_normal_ -from torch.nn.init import xavier_uniform_ +from torch.nn import Linear, Module +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear from torch.nn.parameter import Parameter -from torch.nn import functional as F from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched @@ -47,9 +43,7 @@ class MultiheadAttention(Module): self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if add_bias_kv: self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) @@ -60,18 +54,30 @@ class MultiheadAttention(Module): if linear1_cls == Linear: if not self._qkv_same_embed_dim: self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs) + torch.empty( + (embed_dim, embed_dim), + **factory_kwargs, + ) ) self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs) + torch.empty( + (embed_dim, self.kdim), + **factory_kwargs, + ) ) self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs) + torch.empty( + (embed_dim, self.vdim), + **factory_kwargs, + ) ) self.register_parameter("in_proj_weight", None) else: self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + torch.empty( + (3 * embed_dim, embed_dim), + **factory_kwargs, + ) ) self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) @@ -79,13 +85,11 @@ class MultiheadAttention(Module): if bias: self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs) + torch.empty(3 * embed_dim, **factory_kwargs), ) else: self.register_parameter("in_proj_bias", None) - self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs - ) + self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs) self._reset_parameters() else: @@ -93,7 +97,10 @@ class MultiheadAttention(Module): raise NotImplementedError else: self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + embed_dim, + 3 * embed_dim, + bias=bias, + **factory_kwargs, ) self.in_proj_weight = self.in_proj_linear.weight @@ -107,7 +114,10 @@ class MultiheadAttention(Module): self.register_parameter("in_proj_bias", None) self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs + embed_dim, + embed_dim, + bias=bias, + **factory_kwargs, ) if self.bias_k is not None: diff --git a/GPT_SoVITS/AR/modules/embedding.py b/GPT_SoVITS/AR/modules/embedding.py index 3a382f9..39da560 100644 --- a/GPT_SoVITS/AR/modules/embedding.py +++ b/GPT_SoVITS/AR/modules/embedding.py @@ -60,14 +60,11 @@ class SinePositionalEmbedding(nn.Module): return pe = torch.zeros(x.size(1), self.embedding_dim) if self.reverse: - position = torch.arange( - x.size(1) - 1, -1, -1.0, dtype=torch.float32 - ).unsqueeze(1) + position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) else: position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) div_term = torch.exp( - torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) - * -(math.log(10000.0) / self.embedding_dim) + torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) ) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) diff --git a/GPT_SoVITS/AR/modules/embedding_onnx.py b/GPT_SoVITS/AR/modules/embedding_onnx.py index b93405b..c870013 100644 --- a/GPT_SoVITS/AR/modules/embedding_onnx.py +++ b/GPT_SoVITS/AR/modules/embedding_onnx.py @@ -50,7 +50,7 @@ class SinePositionalEmbedding(nn.Module): self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) def extend_pe(self, x): - position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) + position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) scpe = (position * self.div_term).unsqueeze(0) pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) pe = pe.contiguous().view(1, -1, self.embedding_dim) diff --git a/GPT_SoVITS/AR/modules/lr_schedulers.py b/GPT_SoVITS/AR/modules/lr_schedulers.py index b886746..707a911 100644 --- a/GPT_SoVITS/AR/modules/lr_schedulers.py +++ b/GPT_SoVITS/AR/modules/lr_schedulers.py @@ -49,13 +49,9 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): lr = self.end_lr else: - decay_ratio = (self._current_step - self.warmup_steps) / ( - self.total_steps - self.warmup_steps - ) + decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) if decay_ratio < 0.0 or decay_ratio > 1.0: - raise RuntimeError( - "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." - ) + raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) @@ -70,7 +66,13 @@ if __name__ == "__main__": m = nn.Linear(10, 10) opt = Adam(m.parameters(), lr=1e-4) s = WarmupCosineLRSchedule( - opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 + opt, + 1e-6, + 2e-4, + 1e-6, + warmup_steps=2000, + total_steps=20000, + current_step=0, ) lrs = [] for i in range(25000): diff --git a/GPT_SoVITS/AR/modules/optim.py b/GPT_SoVITS/AR/modules/optim.py index 98785f0..aeebbee 100644 --- a/GPT_SoVITS/AR/modules/optim.py +++ b/GPT_SoVITS/AR/modules/optim.py @@ -16,8 +16,7 @@ import contextlib import logging from collections import defaultdict -from typing import List -from typing import Tuple +from typing import List, Tuple import torch from torch import Tensor @@ -71,12 +70,8 @@ class BatchedOptimizer(Optimizer): group_params_names: name for each parameter in group, which is List[str]. """ - batches = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter - batches_names = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str + batches = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter + batches_names = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of str assert len(param_group) == len(group_params_names) for p, named_p in zip(param_group, group_params_names): @@ -85,11 +80,8 @@ class BatchedOptimizer(Optimizer): batches_names[key].append(named_p) batches_names_keys = list(batches_names.keys()) - sorted_idx = sorted( - range(len(batches_names)), key=lambda i: batches_names_keys[i]) - batches_names = [ - batches_names[batches_names_keys[idx]] for idx in sorted_idx - ] + sorted_idx = sorted(range(len(batches_names)), key=lambda i: batches_names_keys[i]) + batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx] batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] stacked_params_dict = dict() @@ -106,16 +98,14 @@ class BatchedOptimizer(Optimizer): # group. class Optimizer will take care of saving/loading state. state = self.state[p] p_stacked = torch.stack(batch) - grad = torch.stack([ - torch.zeros_like(p) if p.grad is None else p.grad for p in batch - ]) + grad = torch.stack([torch.zeros_like(p) if p.grad is None else p.grad for p in batch]) p_stacked.grad = grad stacked_params_dict[key] = p_stacked tuples.append((p_stacked, state, batch_names)) yield tuples # <-- calling code will do the actual optimization here! - for ((stacked_params, _state, _names), batch) in zip(tuples, batches): + for (stacked_params, _state, _names), batch in zip(tuples, batches): for i, p in enumerate(batch): # batch is list of Parameter p.copy_(stacked_params[i]) @@ -164,25 +154,24 @@ class ScaledAdam(BatchedOptimizer): """ def __init__( - self, - params, - lr=3e-02, - clipping_scale=None, - betas=(0.9, 0.98), - scalar_lr_scale=0.1, - eps=1.0e-08, - param_min_rms=1.0e-05, - param_max_rms=3.0, - scalar_max=10.0, - size_update_period=4, - clipping_update_period=100, - parameters_names=None, - show_dominant_parameters=True, ): - + self, + params, + lr=3e-02, + clipping_scale=None, + betas=(0.9, 0.98), + scalar_lr_scale=0.1, + eps=1.0e-08, + param_min_rms=1.0e-05, + param_max_rms=3.0, + scalar_max=10.0, + size_update_period=4, + clipping_update_period=100, + parameters_names=None, + show_dominant_parameters=True, + ): assert parameters_names is not None, ( - "Please prepare parameters_names," - "which is a List[List[str]]. Each List[str] is for a group" - "and each str is for a parameter") + "Please prepare parameters_names,which is a List[List[str]]. Each List[str] is for a groupand each str is for a parameter" + ) defaults = dict( lr=lr, clipping_scale=clipping_scale, @@ -193,7 +182,8 @@ class ScaledAdam(BatchedOptimizer): param_max_rms=param_max_rms, scalar_max=scalar_max, size_update_period=size_update_period, - clipping_update_period=clipping_update_period, ) + clipping_update_period=clipping_update_period, + ) super(ScaledAdam, self).__init__(params, defaults) assert len(self.param_groups) == len(parameters_names) @@ -218,18 +208,13 @@ class ScaledAdam(BatchedOptimizer): batch = True - for group, group_params_names in zip(self.param_groups, - self.parameters_names): - - with self.batched_params(group["params"], - group_params_names) as batches: - + for group, group_params_names in zip(self.param_groups, self.parameters_names): + with self.batched_params(group["params"], group_params_names) as batches: # batches is list of pairs (stacked_param, state). stacked_param is like # a regular parameter, and will have a .grad, but the 1st dim corresponds to # a stacking dim, it is not a real dim. - if (len(batches[0][1]) == - 0): # if len(first state) == 0: not yet initialized + if len(batches[0][1]) == 0: # if len(first state) == 0: not yet initialized clipping_scale = 1 else: clipping_scale = self._get_clipping_scale(group, batches) @@ -239,9 +224,7 @@ class ScaledAdam(BatchedOptimizer): # grad is not going to be None, we handled that when creating the batches. grad = p.grad if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients" - ) + raise RuntimeError("ScaledAdam optimizer does not support sparse gradients") # State initialization if len(state) == 0: self._init_state(group, p, state) @@ -274,8 +257,7 @@ class ScaledAdam(BatchedOptimizer): # parameter-change "delta", which combines all forms of # update. this is equivalent to how it's done in Adam, # except for the first few steps. - state["delta"] = torch.zeros_like( - p, memory_format=torch.preserve_format) + state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format) batch_size = p.shape[0] numel = p.numel() // batch_size @@ -285,22 +267,16 @@ class ScaledAdam(BatchedOptimizer): # "param_rms" just periodically records the scalar root-mean-square value of # the parameter tensor. # it has a shape like (batch_size, 1, 1, 1, 1) - param_rms = ( - (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) + param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() state["param_rms"] = param_rms state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) - state["scale_grads"] = torch.zeros(size_update_period, - *param_rms.shape, **kwargs) + state["scale_grads"] = torch.zeros(size_update_period, *param_rms.shape, **kwargs) # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format) + state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) - def _get_clipping_scale(self, - group: dict, - tuples: List[Tuple[Tensor, dict, List[str]]] - ) -> float: + def _get_clipping_scale(self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]) -> float: """ Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients by this amount before applying the rest of the update. @@ -325,20 +301,18 @@ class ScaledAdam(BatchedOptimizer): clipping_update_period = group["clipping_update_period"] tot_sumsq = torch.tensor(0.0, device=first_p.device) - for (p, state, param_names) in tuples: + for p, state, param_names in tuples: grad = p.grad if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients") + raise RuntimeError("ScaledAdam optimizer does not support sparse gradients") if p.numel() == p.shape[0]: # a batch of scalars tot_sumsq += (grad**2).sum() # sum() to change shape [1] to [] else: - tot_sumsq += ((grad * state["param_rms"])**2).sum() + tot_sumsq += ((grad * state["param_rms"]) ** 2).sum() tot_norm = tot_sumsq.sqrt() if "model_norms" not in first_state: - first_state["model_norms"] = torch.zeros( - clipping_update_period, device=p.device) + first_state["model_norms"] = torch.zeros(clipping_update_period, device=p.device) first_state["model_norms"][step % clipping_update_period] = tot_norm if step % clipping_update_period == 0: @@ -350,20 +324,20 @@ class ScaledAdam(BatchedOptimizer): for n in range(0, 5): index = min( clipping_update_period - 1, - (clipping_update_period // 4) * n, ) + (clipping_update_period // 4) * n, + ) quartiles.append(sorted_norms[index].item()) median = quartiles[2] threshold = clipping_scale * median first_state["model_norm_threshold"] = threshold - percent_clipped = (first_state["num_clipped"] * 100.0 / - clipping_update_period - if "num_clipped" in first_state else 0.0) + percent_clipped = ( + first_state["num_clipped"] * 100.0 / clipping_update_period if "num_clipped" in first_state else 0.0 + ) first_state["num_clipped"] = 0 quartiles = " ".join(["%.3e" % x for x in quartiles]) logging.info( - f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " - f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" + f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" ) if step < clipping_update_period: @@ -373,25 +347,20 @@ class ScaledAdam(BatchedOptimizer): model_norm_threshold = first_state["model_norm_threshold"] except KeyError: logging.info( - "Warning: model_norm_threshold not in state: possibly " - "you changed config when restarting, adding clipping_scale option?" + "Warning: model_norm_threshold not in state: possibly you changed config when restarting, adding clipping_scale option?" ) return 1.0 ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) if ans < 1.0: first_state["num_clipped"] += 1 if ans < 0.1: - logging.warn( - f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" - ) + logging.warn(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}") if self.show_dominant_parameters: assert p.shape[0] == len(param_names) self._show_gradient_dominating_parameter(tuples, tot_sumsq) return ans - def _show_gradient_dominating_parameter( - self, tuples: List[Tuple[Tensor, dict, List[str]]], - tot_sumsq: Tensor): + def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor): """ Show information of parameter wihch dominanting tot_sumsq. @@ -406,7 +375,7 @@ class ScaledAdam(BatchedOptimizer): from tuples, we still pass it to save some time. """ all_sumsq_orig = {} - for (p, state, batch_param_names) in tuples: + for p, state, batch_param_names in tuples: # p is a stacked batch parameters. batch_grad = p.grad if p.numel() == p.shape[0]: # a batch of scalars @@ -415,41 +384,46 @@ class ScaledAdam(BatchedOptimizer): batch_rms_orig = torch.ones(p.shape[0]) else: batch_rms_orig = state["param_rms"] - batch_sumsq_orig = ((batch_grad * batch_rms_orig)**2).sum( - dim=list(range(1, batch_grad.ndim))) - - for name, sumsq_orig, rms, grad in zip(batch_param_names, - batch_sumsq_orig, - batch_rms_orig, batch_grad): + batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(dim=list(range(1, batch_grad.ndim))) + for name, sumsq_orig, rms, grad in zip( + batch_param_names, + batch_sumsq_orig, + batch_rms_orig, + batch_grad, + ): proportion_orig = sumsq_orig / tot_sumsq all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) assert torch.isclose( sum([value[0] for value in all_sumsq_orig.values()]).cpu(), - torch.tensor(1.0), ) + torch.tensor(1.0), + ) sorted_by_proportion = { k: v for k, v in sorted( all_sumsq_orig.items(), key=lambda item: item[1][0], - reverse=True, ) + reverse=True, + ) } dominant_param_name = next(iter(sorted_by_proportion)) - (dominant_proportion, dominant_sumsq, dominant_rms, - dominant_grad, ) = sorted_by_proportion[dominant_param_name] - logging.info(f"Parameter Dominanting tot_sumsq {dominant_param_name}" - f" with proportion {dominant_proportion:.2f}," - f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" - f"={dominant_sumsq:.3e}," - f" grad_sumsq = {(dominant_grad**2).sum():.3e}," - f" orig_rms_sq={(dominant_rms**2).item():.3e}") + ( + dominant_proportion, + dominant_sumsq, + dominant_rms, + dominant_grad, + ) = sorted_by_proportion[dominant_param_name] + logging.info( + f"Parameter Dominanting tot_sumsq {dominant_param_name}" + f" with proportion {dominant_proportion:.2f}," + f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" + f"={dominant_sumsq:.3e}," + f" grad_sumsq = {(dominant_grad**2).sum():.3e}," + f" orig_rms_sq={(dominant_rms**2).item():.3e}" + ) - def _step_one_batch(self, - group: dict, - p: Tensor, - state: dict, - clipping_scale: float): + def _step_one_batch(self, group: dict, p: Tensor, state: dict, clipping_scale: float): """ Do the step for one parameter, which is actually going to be a batch of `real` parameters, with dim 0 as the batch dim. @@ -475,13 +449,10 @@ class ScaledAdam(BatchedOptimizer): if numel > 1: # Update the size/scale of p, and set param_rms scale_grads = state["scale_grads"] - scale_grads[step % size_update_period] = (p * grad).sum( - dim=list(range(1, p.ndim)), keepdim=True) + scale_grads[step % size_update_period] = (p * grad).sum(dim=list(range(1, p.ndim)), keepdim=True) if step % size_update_period == size_update_period - 1: param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) - param_rms.copy_((p**2) - .mean(dim=list(range(1, p.ndim)), keepdim=True) - .sqrt()) + param_rms.copy_((p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) if step > 0: # self._size_update() learns the overall scale on the # parameter, by shrinking or expanding it. @@ -496,11 +467,13 @@ class ScaledAdam(BatchedOptimizer): state["step"] = step + 1 - def _size_update(self, - group: dict, - scale_grads: Tensor, - p: Tensor, - state: dict) -> None: + def _size_update( + self, + group: dict, + scale_grads: Tensor, + p: Tensor, + state: dict, + ) -> None: """ Called only where p.numel() > 1, this updates the scale of the parameter. If we imagine: p = underlying_param * scale.exp(), and we are doing @@ -529,11 +502,11 @@ class ScaledAdam(BatchedOptimizer): # faster decay at this level. beta2_corr = beta2**size_update_period - scale_exp_avg_sq = state[ - "scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) + scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) scale_exp_avg_sq.mul_(beta2_corr).add_( (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` - alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...) + alpha=1 - beta2_corr, + ) # shape is (batch_size, 1, 1, ...) # The 1st time we reach here is when size_step == 1. size_step = (step + 1) // size_update_period @@ -543,8 +516,7 @@ class ScaledAdam(BatchedOptimizer): denom = scale_exp_avg_sq.sqrt() + eps - scale_step = (-size_lr * (bias_correction2**0.5) * - scale_grads.sum(dim=0) / denom) + scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom is_too_small = param_rms < param_min_rms is_too_large = param_rms > param_max_rms @@ -580,9 +552,8 @@ class ScaledAdam(BatchedOptimizer): exp_avg_sq = state["exp_avg_sq"] exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) - this_step = state["step"] - (state["zero_step"] - if "zero_step" in state else 0) - bias_correction2 = 1 - beta2**(this_step + 1) + this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0) + bias_correction2 = 1 - beta2 ** (this_step + 1) if bias_correction2 < 0.99: # note: not in-place. exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) @@ -613,7 +584,7 @@ class ScaledAdam(BatchedOptimizer): # bias_correction2 is like in Adam. Don't bother with bias_correction1; # slower update at the start will help stability anyway. - bias_correction2 = 1 - beta2**(state["step"] + 1) + bias_correction2 = 1 - beta2 ** (state["step"] + 1) denom = (exp_avg_sq / bias_correction2).sqrt() + eps delta = state["delta"] diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py index cab6afe..5bffcea 100644 --- a/GPT_SoVITS/AR/modules/patched_mha_with_cache.py +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache.py @@ -5,7 +5,6 @@ from torch.nn.functional import ( _none_or_dtype, _in_projection_packed, ) -from torch.nn import functional as F import torch # Tensor = torch.Tensor # from typing import Callable, List, Optional, Tuple, Union @@ -25,18 +24,18 @@ def multi_head_attention_forward_patched( dropout_p: float, out_proj_weight, out_proj_bias, - training = True, - key_padding_mask = None, - need_weights = True, - attn_mask = None, - use_separate_proj_weight = False, - q_proj_weight = None, - k_proj_weight = None, - v_proj_weight = None, - static_k = None, - static_v = None, - average_attn_weights = True, - is_causal = False, + training=True, + key_padding_mask=None, + need_weights=True, + attn_mask=None, + use_separate_proj_weight=False, + q_proj_weight=None, + k_proj_weight=None, + v_proj_weight=None, + static_k=None, + static_v=None, + average_attn_weights=True, + is_causal=False, cache=None, ): r""" @@ -156,9 +155,7 @@ def multi_head_attention_forward_patched( cache=cache, ) - is_batched = _mha_shape_check( - query, key, value, key_padding_mask, attn_mask, num_heads - ) + is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads) # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input # is batched, run the computation and before returning squeeze the @@ -211,45 +208,33 @@ def multi_head_attention_forward_patched( # longer causal. is_causal = False - assert ( - embed_dim == embed_dim_to_check - ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + assert embed_dim == embed_dim_to_check, ( + f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + ) if isinstance(embed_dim, torch.Tensor): # embed_dim can be a tensor when JIT tracing head_dim = embed_dim.div(num_heads, rounding_mode="trunc") else: head_dim = embed_dim // num_heads - assert ( - head_dim * num_heads == embed_dim - ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" + assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" if use_separate_proj_weight: # allow MHA to have different embedding dimensions when separate projection weights are used - assert ( - key.shape[:2] == value.shape[:2] - ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + assert key.shape[:2] == value.shape[:2], ( + f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + ) else: - assert ( - key.shape == value.shape - ), f"key shape {key.shape} does not match value shape {value.shape}" + assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}" # # compute in-projection # if not use_separate_proj_weight: - assert ( - in_proj_weight is not None - ), "use_separate_proj_weight is False but in_proj_weight is None" + assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None" q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) else: - assert ( - q_proj_weight is not None - ), "use_separate_proj_weight is True but q_proj_weight is None" - assert ( - k_proj_weight is not None - ), "use_separate_proj_weight is True but k_proj_weight is None" - assert ( - v_proj_weight is not None - ), "use_separate_proj_weight is True but v_proj_weight is None" + assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None" + assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None" + assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None" if in_proj_bias is None: b_q = b_k = b_v = None else: @@ -312,9 +297,7 @@ def multi_head_attention_forward_patched( f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}." ) else: - raise RuntimeError( - f"attn_mask's dimension {attn_mask.dim()} is not supported" - ) + raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported") # add bias along batch dimension (currently second) if bias_k is not None and bias_v is not None: @@ -338,34 +321,26 @@ def multi_head_attention_forward_patched( k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1) else: # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert ( - static_k.size(0) == bsz * num_heads - ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" - assert ( - static_k.size(2) == head_dim - ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" + assert static_k.size(0) == bsz * num_heads, ( + f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" + ) + assert static_k.size(2) == head_dim, f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" k = static_k if static_v is None: v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) else: # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert ( - static_v.size(0) == bsz * num_heads - ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" - assert ( - static_v.size(2) == head_dim - ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" + assert static_v.size(0) == bsz * num_heads, ( + f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" + ) + assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" v = static_v # add zero attention along batch dimension (now first) if add_zero_attn: zero_attn_shape = (bsz * num_heads, 1, head_dim) - k = torch.cat( - [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1 - ) - v = torch.cat( - [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1 - ) + k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1) + v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1) if attn_mask is not None: attn_mask = pad(attn_mask, (0, 1)) if key_padding_mask is not None: @@ -381,9 +356,7 @@ def multi_head_attention_forward_patched( src_len, ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" key_padding_mask = ( - key_padding_mask.view(bsz, 1, 1, src_len) - .expand(-1, num_heads, -1, -1) - .reshape(bsz * num_heads, 1, src_len) + key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) ) if attn_mask is None: attn_mask = key_padding_mask @@ -402,14 +375,10 @@ def multi_head_attention_forward_patched( B, Nt, E = q.shape q_scaled = q / math.sqrt(E) - assert not ( - is_causal and attn_mask is None - ), "FIXME: is_causal not implemented for need_weights" + assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights" if attn_mask is not None: - attn_output_weights = torch.baddbmm( - attn_mask, q_scaled, k.transpose(-2, -1) - ) + attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1)) else: attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) attn_output_weights = softmax(attn_output_weights, dim=-1) @@ -418,9 +387,7 @@ def multi_head_attention_forward_patched( attn_output = torch.bmm(attn_output_weights, v) - attn_output = ( - attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) - ) + attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) @@ -449,13 +416,9 @@ def multi_head_attention_forward_patched( v = v.view(bsz, num_heads, src_len, head_dim) # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - attn_output = scaled_dot_product_attention( - q, k, v, attn_mask, dropout_p, is_causal - ) + attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) - attn_output = ( - attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) - ) + attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) diff --git a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py index 14bdb55..8144c9c 100644 --- a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py +++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py @@ -1,11 +1,9 @@ from torch.nn.functional import * from torch.nn.functional import ( - _mha_shape_check, _canonical_mask, - _none_or_dtype, - _in_projection_packed, ) + def multi_head_attention_forward_patched( query, key, @@ -34,7 +32,6 @@ def multi_head_attention_forward_patched( is_causal: bool = False, cache=None, ) -> Tuple[Tensor, Optional[Tensor]]: - # set up shape vars _, _, embed_dim = query.shape attn_mask = _canonical_mask( @@ -80,12 +77,8 @@ def multi_head_attention_forward_patched( q = q.view(num_heads, -1, head_dim).unsqueeze(0) k = k.view(num_heads, -1, head_dim).unsqueeze(0) v = v.view(num_heads, -1, head_dim).unsqueeze(0) - attn_output = scaled_dot_product_attention( - q, k, v, attn_mask, dropout_p, is_causal - ) - attn_output = ( - attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) - ) + attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) + attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) attn_output = attn_output.view(-1, 1, attn_output.size(1)) diff --git a/GPT_SoVITS/AR/modules/scaling.py b/GPT_SoVITS/AR/modules/scaling.py index 9256a8c..aae1453 100644 --- a/GPT_SoVITS/AR/modules/scaling.py +++ b/GPT_SoVITS/AR/modules/scaling.py @@ -13,12 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging -import math import random from typing import Optional from typing import Tuple -from typing import Union import torch import torch.nn as nn @@ -61,9 +58,7 @@ class DoubleSwishFunction(torch.autograd.Function): # floors), should be expectation-preserving. floor = -0.043637 ceil = 1.2 - d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( - deriv - ) + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(deriv) if __name__ == "__main__": # for self-testing only. assert d_scaled.min() >= 0.0 @@ -153,13 +148,9 @@ def _compute_scale_factor( else: # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if # x_abs)_mean , min_abs. - below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp( - min=0, max=max_factor - ) + below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(min=0, max=max_factor) - above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp( - min=0, max=max_factor - ) + above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(min=0, max=max_factor) return below_threshold - above_threshold @@ -181,18 +172,16 @@ def _compute_sign_factor( else: # 0 if proportion_positive >= min_positive, else can be # as large as max_factor. - factor1 = ( - (min_positive - proportion_positive) * (gain_factor / min_positive) - ).clamp_(min=0, max=max_factor) + factor1 = ((min_positive - proportion_positive) * (gain_factor / min_positive)).clamp_(min=0, max=max_factor) if max_positive == 1.0: factor2 = 0.0 else: # 0 if self.proportion_positive <= max_positive, else can be # as large as -max_factor. - factor2 = ( - (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive)) - ).clamp_(min=0, max=max_factor) + factor2 = ((proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive))).clamp_( + min=0, max=max_factor + ) sign_factor = factor1 - factor2 # require min_positive != 0 or max_positive != 1: assert not isinstance(sign_factor, float) @@ -320,15 +309,11 @@ class ActivationBalancer(torch.nn.Module): return _no_op(x) -def BalancedDoubleSwish( - d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25 -) -> nn.Sequential: +def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25) -> nn.Sequential: """ ActivationBalancer -> DoubleSwish """ - balancer = ActivationBalancer( - d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob - ) + balancer = ActivationBalancer(d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob) return nn.Sequential( balancer, DoubleSwish(), diff --git a/GPT_SoVITS/AR/modules/transformer.py b/GPT_SoVITS/AR/modules/transformer.py index 7921f48..1bf21cd 100644 --- a/GPT_SoVITS/AR/modules/transformer.py +++ b/GPT_SoVITS/AR/modules/transformer.py @@ -42,12 +42,8 @@ class LayerNorm(nn.Module): self.eps = eps self.elementwise_affine = elementwise_affine if self.elementwise_affine: - self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) + self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) else: self.register_parameter("weight", None) self.register_parameter("bias", None) @@ -74,15 +70,10 @@ class LayerNorm(nn.Module): ) assert embedding is None - return F.layer_norm( - input, self.normalized_shape, self.weight, self.bias, self.eps - ) + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) def extra_repr(self) -> str: - return ( - "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__) - ) + return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__) class IdentityNorm(nn.Module): @@ -121,6 +112,7 @@ class TransformerEncoder(nn.Module): >>> src = torch.rand(10, 32, 512) >>> out = transformer_encoder(src) """ + __constants__ = ["norm"] def __init__(self, encoder_layer, num_layers, norm=None): @@ -218,13 +210,9 @@ class TransformerEncoderLayer(nn.Module): ) # Implementation of Feedforward model - self.linear1 = linear1_feedforward_cls( - d_model, dim_feedforward, **factory_kwargs - ) + self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs) self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls( - dim_feedforward, d_model, **factory_kwargs - ) + self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs) self.norm_first = norm_first self.dropout1 = nn.Dropout(dropout) @@ -291,12 +279,8 @@ class TransformerEncoderLayer(nn.Module): if src_key_padding_mask is not None: _skpm_dtype = src_key_padding_mask.dtype - if _skpm_dtype != torch.bool and not torch.is_floating_point( - src_key_padding_mask - ): - raise AssertionError( - "only bool and floating types of key_padding_mask are supported" - ) + if _skpm_dtype != torch.bool and not torch.is_floating_point(src_key_padding_mask): + raise AssertionError("only bool and floating types of key_padding_mask are supported") if self.norm_first: x = x + self._sa_block( diff --git a/GPT_SoVITS/AR/modules/transformer_onnx.py b/GPT_SoVITS/AR/modules/transformer_onnx.py index a3f68b4..fa17025 100644 --- a/GPT_SoVITS/AR/modules/transformer_onnx.py +++ b/GPT_SoVITS/AR/modules/transformer_onnx.py @@ -42,12 +42,8 @@ class LayerNorm(nn.Module): self.eps = eps self.elementwise_affine = elementwise_affine if self.elementwise_affine: - self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) + self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) else: self.register_parameter("weight", None) self.register_parameter("bias", None) @@ -74,15 +70,10 @@ class LayerNorm(nn.Module): ) assert embedding is None - return F.layer_norm( - input, self.normalized_shape, self.weight, self.bias, self.eps - ) + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) def extra_repr(self) -> str: - return ( - "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__) - ) + return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__) class IdentityNorm(nn.Module): @@ -121,6 +112,7 @@ class TransformerEncoder(nn.Module): >>> src = torch.rand(10, 32, 512) >>> out = transformer_encoder(src) """ + __constants__ = ["norm"] def __init__(self, encoder_layer, num_layers, norm=None): @@ -154,6 +146,7 @@ class TransformerEncoder(nn.Module): class TransformerEncoderLayer(nn.Module): __constants__ = ["batch_first", "norm_first"] + def __init__( self, d_model: int, @@ -184,13 +177,9 @@ class TransformerEncoderLayer(nn.Module): linear2_cls=linear2_self_attention_cls, **factory_kwargs, ) - self.linear1 = linear1_feedforward_cls( - d_model, dim_feedforward, **factory_kwargs - ) + self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs) self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls( - dim_feedforward, d_model, **factory_kwargs - ) + self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs) self.norm_first = norm_first self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) diff --git a/GPT_SoVITS/AR/text_processing/phonemizer.py b/GPT_SoVITS/AR/text_processing/phonemizer.py index 9c5f58f..1003040 100644 --- a/GPT_SoVITS/AR/text_processing/phonemizer.py +++ b/GPT_SoVITS/AR/text_processing/phonemizer.py @@ -30,9 +30,7 @@ class GruutPhonemizer: "«": "«", "»": "»", } - self._punctuation_regexp: str = ( - rf"([{''.join(self._special_cases_dict.keys())}])" - ) + self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" def _normalize_punctuation(self, text: str) -> str: text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) @@ -53,13 +51,8 @@ class GruutPhonemizer: def phonemize(self, text: str, espeak: bool = False) -> str: text_to_phonemize: str = self._normalize_punctuation(text) - sents: List[Sentence] = [ - sent - for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) - ] - words: List[str] = [ - self._convert_punctuation(word) for word in itertools.chain(*sents) - ] + sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] + words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] return " ".join(words) def transform(self, phonemes): diff --git a/GPT_SoVITS/AR/text_processing/symbols.py b/GPT_SoVITS/AR/text_processing/symbols.py index 7d754a7..f7ef57f 100644 --- a/GPT_SoVITS/AR/text_processing/symbols.py +++ b/GPT_SoVITS/AR/text_processing/symbols.py @@ -3,7 +3,9 @@ PAD = "_" PUNCTUATION = ';:,.!?¡¿—…"«»“” ' LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" -IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +IPA_LETTERS = ( + "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +) SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) SPACE_ID = SYMBOLS.index(" ") SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} diff --git a/GPT_SoVITS/AR/utils/__init__.py b/GPT_SoVITS/AR/utils/__init__.py index c2eaf61..4a9cb4d 100644 --- a/GPT_SoVITS/AR/utils/__init__.py +++ b/GPT_SoVITS/AR/utils/__init__.py @@ -2,12 +2,12 @@ import re def str2bool(str): - return True if str.lower() == 'true' else False + return True if str.lower() == "true" else False def get_newest_ckpt(string_list): # 定义一个正则表达式模式,用于匹配字符串中的数字 - pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' + pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 extracted_info = [] @@ -18,8 +18,7 @@ def get_newest_ckpt(string_list): step = int(match.group(2)) extracted_info.append((epoch, step, string)) # 按照 epoch 后面的数字和 step 后面的数字进行排序 - sorted_info = sorted( - extracted_info, key=lambda x: (x[0], x[1]), reverse=True) + sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) # 获取最新的 ckpt 文件名 newest_ckpt = sorted_info[0][2] return newest_ckpt @@ -28,9 +27,9 @@ def get_newest_ckpt(string_list): # 文本存在且不为空时 return True def check_txt_file(file_path): try: - with open(file_path, 'r') as file: + with open(file_path, "r") as file: text = file.readline().strip() - assert text.strip() != '' + assert text.strip() != "" return text except Exception: return False diff --git a/GPT_SoVITS/AR/utils/initialize.py b/GPT_SoVITS/AR/utils/initialize.py index 17ff9f9..ee7c713 100644 --- a/GPT_SoVITS/AR/utils/initialize.py +++ b/GPT_SoVITS/AR/utils/initialize.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """Initialize modules for espnet2 neural networks.""" + import torch from typeguard import check_argument_types diff --git a/GPT_SoVITS/AR/utils/io.py b/GPT_SoVITS/AR/utils/io.py index 52f1f3c..a6475cb 100644 --- a/GPT_SoVITS/AR/utils/io.py +++ b/GPT_SoVITS/AR/utils/io.py @@ -18,14 +18,10 @@ def save_config_to_yaml(config, path): def write_args(args, path): - args_dict = dict( - (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") - ) + args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) with open(path, "a") as args_file: args_file.write("==> torch version: {}\n".format(torch.__version__)) - args_file.write( - "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) - ) + args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) args_file.write("==> Cmd:\n") args_file.write(str(sys.argv)) args_file.write("\n==> args:\n") diff --git a/GPT_SoVITS/BigVGAN/activations.py b/GPT_SoVITS/BigVGAN/activations.py index 4f08dda..abe3ad9 100644 --- a/GPT_SoVITS/BigVGAN/activations.py +++ b/GPT_SoVITS/BigVGAN/activations.py @@ -23,9 +23,7 @@ class Snake(nn.Module): >>> x = a1(x) """ - def __init__( - self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False - ): + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): """ Initialization. INPUT: @@ -80,9 +78,7 @@ class SnakeBeta(nn.Module): >>> x = a1(x) """ - def __init__( - self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False - ): + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): """ Initialization. INPUT: diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py index fbc0fd8..ea333cf 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py @@ -20,9 +20,7 @@ class FusedAntiAliasActivation(torch.autograd.Function): @staticmethod def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): - activation_results = anti_alias_activation_cuda.forward( - inputs, up_ftr, down_ftr, alpha, beta - ) + activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta) return activation_results @@ -61,17 +59,11 @@ class Activation1d(nn.Module): if self.act.__class__.__name__ == "Snake": beta = self.act.alpha.data # Snake uses same params for alpha and beta else: - beta = ( - self.act.beta.data - ) # Snakebeta uses different params for alpha and beta + beta = self.act.beta.data # Snakebeta uses different params for alpha and beta alpha = self.act.alpha.data - if ( - not self.act.alpha_logscale - ): # Exp baked into cuda kernel, cancel it out with a log + if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log alpha = torch.log(alpha) beta = torch.log(beta) - x = FusedAntiAliasActivation.apply( - x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta - ) + x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta) return x diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py index ca5d01d..14fbf05 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py @@ -58,17 +58,13 @@ def load(): srcpath / "anti_alias_activation.cpp", srcpath / "anti_alias_activation_cuda.cu", ] - anti_alias_activation_cuda = _cpp_extention_load_helper( - "anti_alias_activation_cuda", sources, extra_cuda_flags - ) + anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags) return anti_alias_activation_cuda def _get_cuda_bare_metal_version(cuda_dir): - raw_output = subprocess.check_output( - [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True - ) + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) output = raw_output.split() release_idx = output.index("release") + 1 release = output[release_idx].split(".") diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py index 0fa35b0..dc905b2 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py @@ -27,9 +27,7 @@ else: # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License # https://adefossez.github.io/julius/julius/lowpass.html # LICENSE is in incl_licenses directory. -def kaiser_sinc_filter1d( - cutoff, half_width, kernel_size -): # return filter [1,1,kernel_size] +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] even = kernel_size % 2 == 0 half_size = kernel_size // 2 diff --git a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py index a35380f..e7928fa 100644 --- a/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py +++ b/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py @@ -11,18 +11,12 @@ class UpSample1d(nn.Module): def __init__(self, ratio=2, kernel_size=None): super().__init__() self.ratio = ratio - self.kernel_size = ( - int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size - ) + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size self.stride = ratio self.pad = self.kernel_size // ratio - 1 self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 - self.pad_right = ( - self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 - ) - filter = kaiser_sinc_filter1d( - cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size - ) + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size) self.register_buffer("filter", filter) # x: [B, C, T] @@ -30,9 +24,7 @@ class UpSample1d(nn.Module): _, C, _ = x.shape x = F.pad(x, (self.pad, self.pad), mode="replicate") - x = self.ratio * F.conv_transpose1d( - x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C - ) + x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) x = x[..., self.pad_left : -self.pad_right] return x @@ -42,9 +34,7 @@ class DownSample1d(nn.Module): def __init__(self, ratio=2, kernel_size=None): super().__init__() self.ratio = ratio - self.kernel_size = ( - int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size - ) + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size self.lowpass = LowPassFilter1d( cutoff=0.5 / ratio, half_width=0.6 / ratio, diff --git a/GPT_SoVITS/BigVGAN/bigvgan.py b/GPT_SoVITS/BigVGAN/bigvgan.py index 6c4a223..febdf16 100644 --- a/GPT_SoVITS/BigVGAN/bigvgan.py +++ b/GPT_SoVITS/BigVGAN/bigvgan.py @@ -50,7 +50,7 @@ class AMPBlock1(torch.nn.Module): activation: str = None, ): super().__init__() - + self.h = h self.convs1 = nn.ModuleList( @@ -87,9 +87,7 @@ class AMPBlock1(torch.nn.Module): ) self.convs2.apply(init_weights) - self.num_layers = len(self.convs1) + len( - self.convs2 - ) # Total number of conv layers + self.num_layers = len(self.convs1) + len(self.convs2) # Total number of conv layers # Select which Activation1d, lazy-load cuda version to ensure backward compatibility if self.h.get("use_cuda_kernel", False): @@ -105,22 +103,14 @@ class AMPBlock1(torch.nn.Module): if activation == "snake": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.Snake( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) elif activation == "snakebeta": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.SnakeBeta( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) @@ -169,7 +159,7 @@ class AMPBlock2(torch.nn.Module): activation: str = None, ): super().__init__() - + self.h = h self.convs = nn.ModuleList( @@ -205,22 +195,14 @@ class AMPBlock2(torch.nn.Module): if activation == "snake": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.Snake( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) elif activation == "snakebeta": self.activations = nn.ModuleList( [ - Activation1d( - activation=activations.SnakeBeta( - channels, alpha_logscale=h.snake_logscale - ) - ) + Activation1d(activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) for _ in range(self.num_layers) ] ) @@ -283,9 +265,7 @@ class BigVGAN( self.num_upsamples = len(h.upsample_rates) # Pre-conv - self.conv_pre = weight_norm( - Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) - ) + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) # Define which AMPBlock to use. BigVGAN uses AMPBlock1 as default if h.resblock == "1": @@ -293,9 +273,7 @@ class BigVGAN( elif h.resblock == "2": resblock_class = AMPBlock2 else: - raise ValueError( - f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}" - ) + raise ValueError(f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}") # Transposed conv-based upsamplers. does not apply anti-aliasing self.ups = nn.ModuleList() @@ -320,22 +298,14 @@ class BigVGAN( self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = h.upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) - ): - self.resblocks.append( - resblock_class(h, ch, k, d, activation=h.activation) - ) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock_class(h, ch, k, d, activation=h.activation)) # Post-conv activation_post = ( activations.Snake(ch, alpha_logscale=h.snake_logscale) if h.activation == "snake" - else ( - activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) - if h.activation == "snakebeta" - else None - ) + else (activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) if h.activation == "snakebeta" else None) ) if activation_post is None: raise NotImplementedError( @@ -346,9 +316,7 @@ class BigVGAN( # Whether to use bias for the final conv_post. Default to True for backward compatibility self.use_bias_at_final = h.get("use_bias_at_final", True) - self.conv_post = weight_norm( - Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final) - ) + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final)) # Weight initialization for i in range(len(self.ups)): @@ -451,13 +419,13 @@ class BigVGAN( # instantiate BigVGAN using h if use_cuda_kernel: print( - f"[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!" + "[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!" ) print( - f"[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!" + "[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!" ) print( - f"[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis" + "[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis" ) model = cls(h, use_cuda_kernel=use_cuda_kernel) @@ -485,7 +453,7 @@ class BigVGAN( model.load_state_dict(checkpoint_dict["generator"]) except RuntimeError: print( - f"[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!" + "[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!" ) model.remove_weight_norm() model.load_state_dict(checkpoint_dict["generator"]) diff --git a/GPT_SoVITS/BigVGAN/discriminators.py b/GPT_SoVITS/BigVGAN/discriminators.py index ffdf327..2d44c79 100644 --- a/GPT_SoVITS/BigVGAN/discriminators.py +++ b/GPT_SoVITS/BigVGAN/discriminators.py @@ -15,7 +15,7 @@ from torchaudio.transforms import Spectrogram, Resample from env import AttrDict from utils import get_padding import typing -from typing import Optional, List, Union, Dict, Tuple +from typing import List, Tuple class DiscriminatorP(torch.nn.Module): @@ -81,9 +81,7 @@ class DiscriminatorP(torch.nn.Module): ), ] ) - self.conv_post = norm_f( - Conv2d(int(1024 * self.d_mult), 1, (3, 1), 1, padding=(1, 0)) - ) + self.conv_post = norm_f(Conv2d(int(1024 * self.d_mult), 1, (3, 1), 1, padding=(1, 0))) def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] @@ -113,13 +111,12 @@ class MultiPeriodDiscriminator(torch.nn.Module): self.mpd_reshapes = h.mpd_reshapes print(f"mpd_reshapes: {self.mpd_reshapes}") self.discriminators = nn.ModuleList( - [ - DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) - for rs in self.mpd_reshapes - ] + [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] ) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], @@ -145,19 +142,13 @@ class DiscriminatorR(nn.Module): super().__init__() self.resolution = resolution - assert ( - len(self.resolution) == 3 - ), f"MRD layer requires list with len=3, got {self.resolution}" + assert len(self.resolution) == 3, f"MRD layer requires list with len=3, got {self.resolution}" self.lrelu_slope = 0.1 norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm if hasattr(cfg, "mrd_use_spectral_norm"): - print( - f"[INFO] overriding MRD use_spectral_norm as {cfg.mrd_use_spectral_norm}" - ) - norm_f = ( - weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm - ) + print(f"[INFO] overriding MRD use_spectral_norm as {cfg.mrd_use_spectral_norm}") + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm self.d_mult = cfg.discriminator_channel_mult if hasattr(cfg, "mrd_channel_mult"): print(f"[INFO] overriding mrd channel multiplier as {cfg.mrd_channel_mult}") @@ -203,9 +194,7 @@ class DiscriminatorR(nn.Module): ), ] ) - self.conv_post = norm_f( - nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1)) - ) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] @@ -248,14 +237,14 @@ class MultiResolutionDiscriminator(nn.Module): def __init__(self, cfg, debug=False): super().__init__() self.resolutions = cfg.resolutions - assert ( - len(self.resolutions) == 3 - ), f"MRD requires list of list with len=3, each element having a list with len=3. Got {self.resolutions}" - self.discriminators = nn.ModuleList( - [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + assert len(self.resolutions) == 3, ( + f"MRD requires list of list with len=3, each element having a list with len=3. Got {self.resolutions}" ) + self.discriminators = nn.ModuleList([DiscriminatorR(cfg, resolution) for resolution in self.resolutions]) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], @@ -309,25 +298,15 @@ class DiscriminatorB(nn.Module): convs = lambda: nn.ModuleList( [ weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))), - weight_norm( - nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4)) - ), - weight_norm( - nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4)) - ), - weight_norm( - nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4)) - ), - weight_norm( - nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1)) - ), + weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))), + weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))), + weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))), + weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))), ] ) self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))]) - self.conv_post = weight_norm( - nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)) - ) + self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))) def spectrogram(self, x: torch.Tensor) -> List[torch.Tensor]: # Remove DC offset @@ -376,17 +355,16 @@ class MultiBandDiscriminator(nn.Module): super().__init__() # fft_sizes (list[int]): Tuple of window lengths for FFT. Defaults to [2048, 1024, 512] if not set in h. self.fft_sizes = h.get("mbd_fft_sizes", [2048, 1024, 512]) - self.discriminators = nn.ModuleList( - [DiscriminatorB(window_length=w) for w in self.fft_sizes] - ) + self.discriminators = nn.ModuleList([DiscriminatorB(window_length=w) for w in self.fft_sizes]) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]], ]: - y_d_rs = [] y_d_gs = [] fmap_rs = [] @@ -406,7 +384,7 @@ class MultiBandDiscriminator(nn.Module): # Adapted from https://github.com/open-mmlab/Amphion/blob/main/models/vocoders/gan/discriminator/mssbcqtd.py under the MIT license. # LICENSE is in incl_licenses directory. class DiscriminatorCQT(nn.Module): - def __init__(self, cfg: AttrDict, hop_length: int, n_octaves:int, bins_per_octave: int): + def __init__(self, cfg: AttrDict, hop_length: int, n_octaves: int, bins_per_octave: int): super().__init__() self.cfg = cfg @@ -460,9 +438,7 @@ class DiscriminatorCQT(nn.Module): in_chs = min(self.filters_scale * self.filters, self.max_filters) for i, dilation in enumerate(self.dilations): - out_chs = min( - (self.filters_scale ** (i + 1)) * self.filters, self.max_filters - ) + out_chs = min((self.filters_scale ** (i + 1)) * self.filters, self.max_filters) self.convs.append( weight_norm( nn.Conv2d( @@ -486,9 +462,7 @@ class DiscriminatorCQT(nn.Module): in_chs, out_chs, kernel_size=(self.kernel_size[0], self.kernel_size[0]), - padding=self.get_2d_padding( - (self.kernel_size[0], self.kernel_size[0]) - ), + padding=self.get_2d_padding((self.kernel_size[0], self.kernel_size[0])), ) ) ) @@ -508,7 +482,7 @@ class DiscriminatorCQT(nn.Module): self.cqtd_normalize_volume = self.cfg.get("cqtd_normalize_volume", False) if self.cqtd_normalize_volume: print( - f"[INFO] cqtd_normalize_volume set to True. Will apply DC offset removal & peak volume normalization in CQTD!" + "[INFO] cqtd_normalize_volume set to True. Will apply DC offset removal & peak volume normalization in CQTD!" ) def get_2d_padding( @@ -580,9 +554,7 @@ class MultiScaleSubbandCQTDiscriminator(nn.Module): # Multi-scale params to loop over self.cfg["cqtd_hop_lengths"] = self.cfg.get("cqtd_hop_lengths", [512, 256, 256]) self.cfg["cqtd_n_octaves"] = self.cfg.get("cqtd_n_octaves", [9, 9, 9]) - self.cfg["cqtd_bins_per_octaves"] = self.cfg.get( - "cqtd_bins_per_octaves", [24, 36, 48] - ) + self.cfg["cqtd_bins_per_octaves"] = self.cfg.get("cqtd_bins_per_octaves", [24, 36, 48]) self.discriminators = nn.ModuleList( [ @@ -596,13 +568,14 @@ class MultiScaleSubbandCQTDiscriminator(nn.Module): ] ) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]], ]: - y_d_rs = [] y_d_gs = [] fmap_rs = [] @@ -629,13 +602,14 @@ class CombinedDiscriminator(nn.Module): super().__init__() self.discrimiantor = nn.ModuleList(list_discriminator) - def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[ + def forward( + self, y: torch.Tensor, y_hat: torch.Tensor + ) -> Tuple[ List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]], ]: - y_d_rs = [] y_d_gs = [] fmap_rs = [] diff --git a/GPT_SoVITS/BigVGAN/inference.py b/GPT_SoVITS/BigVGAN/inference.py index a213f31..5f892a3 100644 --- a/GPT_SoVITS/BigVGAN/inference.py +++ b/GPT_SoVITS/BigVGAN/inference.py @@ -35,9 +35,7 @@ def inference(a, h): with torch.no_grad(): for i, filname in enumerate(filelist): # Load the ground truth audio and resample if necessary - wav, sr = librosa.load( - os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True - ) + wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True) wav = torch.FloatTensor(wav).to(device) # Compute mel spectrogram from the ground truth audio x = get_mel_spectrogram(wav.unsqueeze(0), generator.h) @@ -48,9 +46,7 @@ def inference(a, h): audio = audio * MAX_WAV_VALUE audio = audio.cpu().numpy().astype("int16") - output_file = os.path.join( - a.output_dir, os.path.splitext(filname)[0] + "_generated.wav" - ) + output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated.wav") write(output_file, h.sampling_rate, audio) print(output_file) diff --git a/GPT_SoVITS/BigVGAN/inference_e2e.py b/GPT_SoVITS/BigVGAN/inference_e2e.py index a39dc67..9c0df77 100644 --- a/GPT_SoVITS/BigVGAN/inference_e2e.py +++ b/GPT_SoVITS/BigVGAN/inference_e2e.py @@ -61,9 +61,7 @@ def inference(a, h): audio = audio * MAX_WAV_VALUE audio = audio.cpu().numpy().astype("int16") - output_file = os.path.join( - a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav" - ) + output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav") write(output_file, h.sampling_rate, audio) print(output_file) diff --git a/GPT_SoVITS/BigVGAN/loss.py b/GPT_SoVITS/BigVGAN/loss.py index d581151..c295a14 100644 --- a/GPT_SoVITS/BigVGAN/loss.py +++ b/GPT_SoVITS/BigVGAN/loss.py @@ -6,13 +6,12 @@ import torch -import torch.nn.functional as F import torch.nn as nn from librosa.filters import mel as librosa_mel_fn from scipy import signal import typing -from typing import Optional, List, Union, Dict, Tuple +from typing import List, Tuple from collections import namedtuple import math import functools @@ -117,15 +116,13 @@ class MultiScaleMelSpectrogramLoss(nn.Module): window_type, ): """ - Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from: + Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from: https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py """ B, C, T = wav.shape if match_stride: - assert ( - hop_length == window_length // 4 - ), "For match_stride, hop must equal n_fft // 4" + assert hop_length == window_length // 4, "For match_stride, hop must equal n_fft // 4" right_pad = math.ceil(T / hop_length) * hop_length - T pad = (window_length - hop_length) // 2 else: @@ -155,9 +152,7 @@ class MultiScaleMelSpectrogramLoss(nn.Module): magnitude = torch.abs(stft) nf = magnitude.shape[2] - mel_basis = self.get_mel_filters( - self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax - ) + mel_basis = self.get_mel_filters(self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax) mel_basis = torch.from_numpy(mel_basis).to(wav.device) mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T mel_spectrogram = mel_spectrogram.transpose(-1, 2) @@ -182,9 +177,7 @@ class MultiScaleMelSpectrogramLoss(nn.Module): """ loss = 0.0 - for n_mels, fmin, fmax, s in zip( - self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params - ): + for n_mels, fmin, fmax, s in zip(self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params): kwargs = { "n_mels": n_mels, "fmin": fmin, @@ -197,12 +190,8 @@ class MultiScaleMelSpectrogramLoss(nn.Module): x_mels = self.mel_spectrogram(x, **kwargs) y_mels = self.mel_spectrogram(y, **kwargs) - x_logmels = torch.log( - x_mels.clamp(min=self.clamp_eps).pow(self.pow) - ) / torch.log(torch.tensor(10.0)) - y_logmels = torch.log( - y_mels.clamp(min=self.clamp_eps).pow(self.pow) - ) / torch.log(torch.tensor(10.0)) + x_logmels = torch.log(x_mels.clamp(min=self.clamp_eps).pow(self.pow)) / torch.log(torch.tensor(10.0)) + y_logmels = torch.log(y_mels.clamp(min=self.clamp_eps).pow(self.pow)) / torch.log(torch.tensor(10.0)) loss += self.log_weight * self.loss_fn(x_logmels, y_logmels) loss += self.mag_weight * self.loss_fn(x_logmels, y_logmels) @@ -211,10 +200,7 @@ class MultiScaleMelSpectrogramLoss(nn.Module): # Loss functions -def feature_loss( - fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]] -) -> torch.Tensor: - +def feature_loss(fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]) -> torch.Tensor: loss = 0 for dr, dg in zip(fmap_r, fmap_g): for rl, gl in zip(dr, dg): @@ -226,7 +212,6 @@ def feature_loss( def discriminator_loss( disc_real_outputs: List[torch.Tensor], disc_generated_outputs: List[torch.Tensor] ) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]: - loss = 0 r_losses = [] g_losses = [] @@ -243,7 +228,6 @@ def discriminator_loss( def generator_loss( disc_outputs: List[torch.Tensor], ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - loss = 0 gen_losses = [] for dg in disc_outputs: diff --git a/GPT_SoVITS/BigVGAN/meldataset.py b/GPT_SoVITS/BigVGAN/meldataset.py index a5859b9..dc12c98 100644 --- a/GPT_SoVITS/BigVGAN/meldataset.py +++ b/GPT_SoVITS/BigVGAN/meldataset.py @@ -86,9 +86,7 @@ def mel_spectrogram( key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}" if key not in mel_basis_cache: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) hann_window_cache[key] = torch.hann_window(win_size).to(device) @@ -96,9 +94,7 @@ def mel_spectrogram( hann_window = hann_window_cache[key] padding = (n_fft - hop_size) // 2 - y = torch.nn.functional.pad( - y.unsqueeze(1), (padding, padding), mode="reflect" - ).squeeze(1) + y = torch.nn.functional.pad(y.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1) spec = torch.stft( y, @@ -150,17 +146,13 @@ def get_dataset_filelist(a): with open(a.input_training_file, "r", encoding="utf-8") as fi: training_files = [ - os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") - for x in fi.read().split("\n") - if len(x) > 0 + os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 ] print(f"first training file: {training_files[0]}") with open(a.input_validation_file, "r", encoding="utf-8") as fi: validation_files = [ - os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") - for x in fi.read().split("\n") - if len(x) > 0 + os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 ] print(f"first validation file: {validation_files[0]}") @@ -171,9 +163,7 @@ def get_dataset_filelist(a): for x in fi.read().split("\n") if len(x) > 0 ] - print( - f"first unseen {i}th validation fileset: {unseen_validation_files[0]}" - ) + print(f"first unseen {i}th validation fileset: {unseen_validation_files[0]}") list_unseen_validation_files.append(unseen_validation_files) return training_files, validation_files, list_unseen_validation_files @@ -227,13 +217,9 @@ class MelDataset(torch.utils.data.Dataset): print("[INFO] checking dataset integrity...") for i in tqdm(range(len(self.audio_files))): - assert os.path.exists( - self.audio_files[i] - ), f"{self.audio_files[i]} not found" + assert os.path.exists(self.audio_files[i]), f"{self.audio_files[i]} not found" - def __getitem__( - self, index: int - ) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]: + def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]: try: filename = self.audio_files[index] @@ -248,17 +234,12 @@ class MelDataset(torch.utils.data.Dataset): # Obtain randomized audio chunk if source_sampling_rate != self.sampling_rate: # Adjust segment size to crop if the source sr is different - target_segment_size = math.ceil( - self.segment_size - * (source_sampling_rate / self.sampling_rate) - ) + target_segment_size = math.ceil(self.segment_size * (source_sampling_rate / self.sampling_rate)) else: target_segment_size = self.segment_size # Compute upper bound index for the random chunk - random_chunk_upper_bound = max( - 0, audio.shape[0] - target_segment_size - ) + random_chunk_upper_bound = max(0, audio.shape[0] - target_segment_size) # Crop or pad audio to obtain random chunk with target_segment_size if audio.shape[0] >= target_segment_size: @@ -318,9 +299,9 @@ class MelDataset(torch.utils.data.Dataset): else: # For fine-tuning, assert that the waveform is in the defined sampling_rate # Fine-tuning won't support on-the-fly resampling to be fool-proof (the dataset should have been prepared properly) - assert ( - source_sampling_rate == self.sampling_rate - ), f"For fine_tuning, waveform must be in the spcified sampling rate {self.sampling_rate}, got {source_sampling_rate}" + assert source_sampling_rate == self.sampling_rate, ( + f"For fine_tuning, waveform must be in the spcified sampling rate {self.sampling_rate}, got {source_sampling_rate}" + ) # Cast ndarray to torch tensor audio = torch.FloatTensor(audio) @@ -346,20 +327,14 @@ class MelDataset(torch.utils.data.Dataset): mel = mel[:, :, mel_start : mel_start + frames_per_seg] audio = audio[ :, - mel_start - * self.hop_size : (mel_start + frames_per_seg) - * self.hop_size, + mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size, ] # Pad pre-computed mel and audio to match length to ensuring fine-tuning without error. # NOTE: this may introduce a single-frame misalignment of the # To remove possible misalignment, it is recommended to prepare the pair where the audio length is the integer multiple of self.hop_size - mel = torch.nn.functional.pad( - mel, (0, frames_per_seg - mel.size(2)), "constant" - ) - audio = torch.nn.functional.pad( - audio, (0, self.segment_size - audio.size(1)), "constant" - ) + mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") + audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") # Compute mel_loss used by spectral regression objective. Uses self.fmax_loss instead (usually None) mel_loss = mel_spectrogram( @@ -376,9 +351,10 @@ class MelDataset(torch.utils.data.Dataset): # Shape sanity checks assert ( - audio.shape[1] == mel.shape[2] * self.hop_size - and audio.shape[1] == mel_loss.shape[2] * self.hop_size - ), f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}" + audio.shape[1] == mel.shape[2] * self.hop_size and audio.shape[1] == mel_loss.shape[2] * self.hop_size + ), ( + f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}" + ) return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) @@ -387,9 +363,7 @@ class MelDataset(torch.utils.data.Dataset): if self.fine_tuning: raise e # Terminate training if it is fine-tuning. The dataset should have been prepared properly. else: - print( - f"[WARNING] Failed to load waveform, skipping! filename: {filename} Error: {e}" - ) + print(f"[WARNING] Failed to load waveform, skipping! filename: {filename} Error: {e}") return self[random.randrange(len(self))] def __len__(self): diff --git a/GPT_SoVITS/BigVGAN/tests/test_activation.py b/GPT_SoVITS/BigVGAN/tests/test_activation.py index 146600e..4134883 100644 --- a/GPT_SoVITS/BigVGAN/tests/test_activation.py +++ b/GPT_SoVITS/BigVGAN/tests/test_activation.py @@ -3,6 +3,7 @@ import os import sys + # to import modules from parent_dir parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(parent_dir) @@ -24,14 +25,10 @@ def test_anti_alias_activation(): data = torch.rand((10, 10, 200), device="cuda") # Check activations.Snake cuda vs. torch - fused_anti_alias_activation = activation1d.Activation1d( - activation=Snake(10), fused=True - ).cuda() + fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda() fused_activation_output = fused_anti_alias_activation(data) - torch_anti_alias_activation = activation1d.Activation1d( - activation=Snake(10), fused=False - ).cuda() + torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda() torch_activation_output = torch_anti_alias_activation(data) test_result = (fused_activation_output - torch_activation_output).abs() diff --git a/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py b/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py index 3e65385..4cc46b9 100644 --- a/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py +++ b/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py @@ -3,6 +3,7 @@ import os import sys + # to import modules from parent_dir parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(parent_dir) @@ -24,14 +25,10 @@ def test_anti_alias_activation(): data = torch.rand((10, 10, 200), device="cuda") # Check activations, Snake CUDA vs. Torch - fused_anti_alias_activation = activation1d.Activation1d( - activation=SnakeBeta(10), fused=True - ).cuda() + fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda() fused_activation_output = fused_anti_alias_activation(data) - torch_anti_alias_activation = activation1d.Activation1d( - activation=SnakeBeta(10), fused=False - ).cuda() + torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda() torch_activation_output = torch_anti_alias_activation(data) test_result = (fused_activation_output - torch_activation_output).abs() @@ -57,7 +54,6 @@ def test_anti_alias_activation(): ) - if __name__ == "__main__": from alias_free_activation.cuda import load diff --git a/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py b/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py index 86ad051..8ddb29e 100644 --- a/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py +++ b/GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py @@ -42,9 +42,7 @@ def generate_soundwave(duration=5.0, sr=24000): def get_mel(x, h): - return mel_spectrogram( - x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax - ) + return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax) def load_checkpoint(filepath, device): @@ -56,9 +54,7 @@ def load_checkpoint(filepath, device): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Test script to check CUDA kernel correctness." - ) + parser = argparse.ArgumentParser(description="Test script to check CUDA kernel correctness.") parser.add_argument( "--checkpoint_file", type=str, @@ -91,27 +87,25 @@ if __name__ == "__main__": # define number of samples and length of mel frame to benchmark num_sample = 10 num_mel_frame = 16384 - + # CUDA kernel correctness check diff = 0.0 for i in tqdm(range(num_sample)): # Random mel data = torch.rand((1, h.num_mels, num_mel_frame), device="cuda") - + with torch.inference_mode(): audio_original = generator_original(data) - + with torch.inference_mode(): audio_cuda_kernel = generator_cuda_kernel(data) # Both outputs should be (almost) the same test_result = (audio_original - audio_cuda_kernel).abs() diff += test_result.mean(dim=-1).item() - + diff /= num_sample - if ( - diff <= 2e-3 - ): # We can expect a small difference (~1e-3) which does not affect perceptual quality + if diff <= 2e-3: # We can expect a small difference (~1e-3) which does not affect perceptual quality print( f"\n[Success] test CUDA fused vs. plain torch BigVGAN inference" f"\n > mean_difference={diff}" @@ -125,9 +119,9 @@ if __name__ == "__main__": f"\n > fused_values={audio_cuda_kernel[-1][-1][-30:].tolist()}, " f"\n > torch_values={audio_original[-1][-1][-30:].tolist()}" ) - + del data, audio_original, audio_cuda_kernel - + # Variables for tracking total time and VRAM usage toc_total_original = 0 toc_total_cuda_kernel = 0 @@ -145,10 +139,10 @@ if __name__ == "__main__": audio_original = generator_original(data) torch.cuda.synchronize() toc = time() - tic - toc_total_original += toc + toc_total_original += toc vram_used_original_total += torch.cuda.max_memory_allocated(device="cuda") - + del data, audio_original torch.cuda.empty_cache() @@ -163,11 +157,11 @@ if __name__ == "__main__": torch.cuda.synchronize() toc = time() - tic toc_total_cuda_kernel += toc - + audio_length_total += audio_cuda_kernel.shape[-1] - + vram_used_cuda_kernel_total += torch.cuda.max_memory_allocated(device="cuda") - + del data, audio_cuda_kernel torch.cuda.empty_cache() @@ -175,8 +169,8 @@ if __name__ == "__main__": audio_second = audio_length_total / h.sampling_rate khz_original = audio_length_total / toc_total_original / 1000 khz_cuda_kernel = audio_length_total / toc_total_cuda_kernel / 1000 - vram_used_original_gb = vram_used_original_total / num_sample / (1024 ** 3) - vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024 ** 3) + vram_used_original_gb = vram_used_original_total / num_sample / (1024**3) + vram_used_cuda_kernel_gb = vram_used_cuda_kernel_total / num_sample / (1024**3) # Print results print( diff --git a/GPT_SoVITS/BigVGAN/train.py b/GPT_SoVITS/BigVGAN/train.py index 01eeb09..39718cd 100644 --- a/GPT_SoVITS/BigVGAN/train.py +++ b/GPT_SoVITS/BigVGAN/train.py @@ -77,24 +77,18 @@ def train(rank, a, h): # Define additional discriminators. BigVGAN-v1 uses UnivNet's MRD as default # New in BigVGAN-v2: option to switch to new discriminators: MultiBandDiscriminator / MultiScaleSubbandCQTDiscriminator if h.get("use_mbd_instead_of_mrd", False): # Switch to MBD - print( - "[INFO] using MultiBandDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator" - ) + print("[INFO] using MultiBandDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator") # Variable name is kept as "mrd" for backward compatibility & minimal code change mrd = MultiBandDiscriminator(h).to(device) elif h.get("use_cqtd_instead_of_mrd", False): # Switch to CQTD - print( - "[INFO] using MultiScaleSubbandCQTDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator" - ) + print("[INFO] using MultiScaleSubbandCQTDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator") mrd = MultiScaleSubbandCQTDiscriminator(h).to(device) else: # Fallback to original MRD in BigVGAN-v1 mrd = MultiResolutionDiscriminator(h).to(device) # New in BigVGAN-v2: option to switch to multi-scale L1 mel loss if h.get("use_multiscale_melloss", False): - print( - "[INFO] using multi-scale Mel l1 loss of BigVGAN-v2 instead of the original single-scale loss" - ) + print("[INFO] using multi-scale Mel l1 loss of BigVGAN-v2 instead of the original single-scale loss") fn_mel_loss_multiscale = MultiScaleMelSpectrogramLoss( sampling_rate=h.sampling_rate ) # NOTE: accepts waveform as input @@ -114,9 +108,7 @@ def train(rank, a, h): if os.path.isdir(a.checkpoint_path): # New in v2.1: If the step prefix pattern-based checkpoints are not found, also check for renamed files in Hugging Face Hub to resume training - cp_g = scan_checkpoint( - a.checkpoint_path, prefix="g_", renamed_file="bigvgan_generator.pt" - ) + cp_g = scan_checkpoint(a.checkpoint_path, prefix="g_", renamed_file="bigvgan_generator.pt") cp_do = scan_checkpoint( a.checkpoint_path, prefix="do_", @@ -143,9 +135,7 @@ def train(rank, a, h): mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device) mrd = DistributedDataParallel(mrd, device_ids=[rank]).to(device) - optim_g = torch.optim.AdamW( - generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2] - ) + optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]) optim_d = torch.optim.AdamW( itertools.chain(mrd.parameters(), mpd.parameters()), h.learning_rate, @@ -156,12 +146,8 @@ def train(rank, a, h): optim_g.load_state_dict(state_dict_do["optim_g"]) optim_d.load_state_dict(state_dict_do["optim_d"]) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=h.lr_decay, last_epoch=last_epoch - ) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR( - optim_d, gamma=h.lr_decay, last_epoch=last_epoch - ) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch) # Define training and validation datasets @@ -169,9 +155,7 @@ def train(rank, a, h): unseen_validation_filelist will contain sample filepaths outside the seen training & validation dataset Example: trained on LibriTTS, validate on VCTK """ - training_filelist, validation_filelist, list_unseen_validation_filelist = ( - get_dataset_filelist(a) - ) + training_filelist, validation_filelist, list_unseen_validation_filelist = get_dataset_filelist(a) trainset = MelDataset( training_filelist, @@ -324,33 +308,26 @@ def train(rank, a, h): h.fmax_for_loss, ) min_t = min(y_mel.size(-1), y_g_hat_mel.size(-1)) - val_err_tot += F.l1_loss(y_mel[...,:min_t], y_g_hat_mel[...,:min_t]).item() + val_err_tot += F.l1_loss(y_mel[..., :min_t], y_g_hat_mel[..., :min_t]).item() # PESQ calculation. only evaluate PESQ if it's speech signal (nonspeech PESQ will error out) - if ( - not "nonspeech" in mode - ): # Skips if the name of dataset (in mode string) contains "nonspeech" - + if "nonspeech" not in mode: # Skips if the name of dataset (in mode string) contains "nonspeech" # Resample to 16000 for pesq y_16k = pesq_resampler(y) y_g_hat_16k = pesq_resampler(y_g_hat.squeeze(1)) y_int_16k = (y_16k[0] * MAX_WAV_VALUE).short().cpu().numpy() - y_g_hat_int_16k = ( - (y_g_hat_16k[0] * MAX_WAV_VALUE).short().cpu().numpy() - ) + y_g_hat_int_16k = (y_g_hat_16k[0] * MAX_WAV_VALUE).short().cpu().numpy() val_pesq_tot += pesq(16000, y_int_16k, y_g_hat_int_16k, "wb") # MRSTFT calculation min_t = min(y.size(-1), y_g_hat.size(-1)) - val_mrstft_tot += loss_mrstft(y_g_hat[...,:min_t], y[...,:min_t]).item() + val_mrstft_tot += loss_mrstft(y_g_hat[..., :min_t], y[..., :min_t]).item() # Log audio and figures to Tensorboard if j % a.eval_subsample == 0: # Subsample every nth from validation set if steps >= 0: sw.add_audio(f"gt_{mode}/y_{j}", y[0], steps, h.sampling_rate) - if ( - a.save_audio - ): # Also save audio to disk if --save_audio is set to True + if a.save_audio: # Also save audio to disk if --save_audio is set to True save_audio( y[0], os.path.join( @@ -373,9 +350,7 @@ def train(rank, a, h): steps, h.sampling_rate, ) - if ( - a.save_audio - ): # Also save audio to disk if --save_audio is set to True + if a.save_audio: # Also save audio to disk if --save_audio is set to True save_audio( y_g_hat[0, 0], os.path.join( @@ -487,15 +462,11 @@ def train(rank, a, h): # MPD y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach()) - loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss( - y_df_hat_r, y_df_hat_g - ) + loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g) # MRD y_ds_hat_r, y_ds_hat_g, _, _ = mrd(y, y_g_hat.detach()) - loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss( - y_ds_hat_r, y_ds_hat_g - ) + loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g) loss_disc_all = loss_disc_s + loss_disc_f @@ -505,17 +476,11 @@ def train(rank, a, h): # Whether to freeze D for initial training steps if steps >= a.freeze_step: loss_disc_all.backward() - grad_norm_mpd = torch.nn.utils.clip_grad_norm_( - mpd.parameters(), clip_grad_norm - ) - grad_norm_mrd = torch.nn.utils.clip_grad_norm_( - mrd.parameters(), clip_grad_norm - ) + grad_norm_mpd = torch.nn.utils.clip_grad_norm_(mpd.parameters(), clip_grad_norm) + grad_norm_mrd = torch.nn.utils.clip_grad_norm_(mrd.parameters(), clip_grad_norm) optim_d.step() else: - print( - f"[WARNING] skipping D training for the first {a.freeze_step} steps" - ) + print(f"[WARNING] skipping D training for the first {a.freeze_step} steps") grad_norm_mpd = 0.0 grad_norm_mrd = 0.0 @@ -523,9 +488,7 @@ def train(rank, a, h): optim_g.zero_grad() # L1 Mel-Spectrogram Loss - lambda_melloss = h.get( - "lambda_melloss", 45.0 - ) # Defaults to 45 in BigVGAN-v1 if not set + lambda_melloss = h.get("lambda_melloss", 45.0) # Defaults to 45 in BigVGAN-v1 if not set if h.get("use_multiscale_melloss", False): # uses wav for loss loss_mel = fn_mel_loss_multiscale(y, y_g_hat) * lambda_melloss else: # Uses mel for loss @@ -542,27 +505,19 @@ def train(rank, a, h): loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g) if steps >= a.freeze_step: - loss_gen_all = ( - loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel - ) + loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel else: - print( - f"[WARNING] using regression loss only for G for the first {a.freeze_step} steps" - ) + print(f"[WARNING] using regression loss only for G for the first {a.freeze_step} steps") loss_gen_all = loss_mel loss_gen_all.backward() - grad_norm_g = torch.nn.utils.clip_grad_norm_( - generator.parameters(), clip_grad_norm - ) + grad_norm_g = torch.nn.utils.clip_grad_norm_(generator.parameters(), clip_grad_norm) optim_g.step() if rank == 0: # STDOUT logging if steps % a.stdout_interval == 0: - mel_error = ( - loss_mel.item() / lambda_melloss - ) # Log training mel regression loss to stdout + mel_error = loss_mel.item() / lambda_melloss # Log training mel regression loss to stdout print( f"Steps: {steps:d}, " f"Gen Loss Total: {loss_gen_all:4.3f}, " @@ -577,11 +532,7 @@ def train(rank, a, h): checkpoint_path = f"{a.checkpoint_path}/g_{steps:08d}" save_checkpoint( checkpoint_path, - { - "generator": ( - generator.module if h.num_gpus > 1 else generator - ).state_dict() - }, + {"generator": (generator.module if h.num_gpus > 1 else generator).state_dict()}, ) checkpoint_path = f"{a.checkpoint_path}/do_{steps:08d}" save_checkpoint( @@ -598,9 +549,7 @@ def train(rank, a, h): # Tensorboard summary logging if steps % a.summary_interval == 0: - mel_error = ( - loss_mel.item() / lambda_melloss - ) # Log training mel regression loss to tensorboard + mel_error = loss_mel.item() / lambda_melloss # Log training mel regression loss to tensorboard sw.add_scalar("training/gen_loss_total", loss_gen_all.item(), steps) sw.add_scalar("training/mel_spec_error", mel_error, steps) sw.add_scalar("training/fm_loss_mpd", loss_fm_f.item(), steps) @@ -612,12 +561,8 @@ def train(rank, a, h): sw.add_scalar("training/disc_loss_mrd", loss_disc_s.item(), steps) sw.add_scalar("training/grad_norm_mrd", grad_norm_mrd, steps) sw.add_scalar("training/grad_norm_g", grad_norm_g, steps) - sw.add_scalar( - "training/learning_rate_d", scheduler_d.get_last_lr()[0], steps - ) - sw.add_scalar( - "training/learning_rate_g", scheduler_g.get_last_lr()[0], steps - ) + sw.add_scalar("training/learning_rate_d", scheduler_d.get_last_lr()[0], steps) + sw.add_scalar("training/learning_rate_g", scheduler_g.get_last_lr()[0], steps) sw.add_scalar("training/epoch", epoch + 1, steps) # Validation @@ -660,9 +605,7 @@ def train(rank, a, h): scheduler_d.step() if rank == 0: - print( - f"Time taken for epoch {epoch + 1} is {int(time.time() - start)} sec\n" - ) + print(f"Time taken for epoch {epoch + 1} is {int(time.time() - start)} sec\n") def main(): @@ -674,12 +617,8 @@ def main(): parser.add_argument("--input_wavs_dir", default="LibriTTS") parser.add_argument("--input_mels_dir", default="ft_dataset") - parser.add_argument( - "--input_training_file", default="tests/LibriTTS/train-full.txt" - ) - parser.add_argument( - "--input_validation_file", default="tests/LibriTTS/val-full.txt" - ) + parser.add_argument("--input_training_file", default="tests/LibriTTS/train-full.txt") + parser.add_argument("--input_validation_file", default="tests/LibriTTS/val-full.txt") parser.add_argument( "--list_input_unseen_wavs_dir", diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index 1571ef7..1b7ad11 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -1,74 +1,86 @@ -from copy import deepcopy +import gc import math -import os, sys, gc +import os import random -import traceback +import sys import time +import traceback +from copy import deepcopy + import torchaudio from tqdm import tqdm + now_dir = os.getcwd() sys.path.append(now_dir) -import ffmpeg import os -from typing import Generator, List, Tuple, Union +from typing import List, Tuple, Union + +import ffmpeg +import librosa import numpy as np import torch import torch.nn.functional as F import yaml -from transformers import AutoModelForMaskedLM, AutoTokenizer -from tools.audio_sr import AP_BWE from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from BigVGAN.bigvgan import BigVGAN from feature_extractor.cnhubert import CNHubert +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch from module.models import SynthesizerTrn, SynthesizerTrnV3 from peft import LoraConfig, get_peft_model -import librosa -from time import time as ttime +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new +from transformers import AutoModelForMaskedLM, AutoTokenizer + +from tools.audio_sr import AP_BWE from tools.i18n.i18n import I18nAuto, scan_language_list from tools.my_utils import load_audio -from module.mel_processing import spectrogram_torch from TTS_infer_pack.text_segmentation_method import splits from TTS_infer_pack.TextPreprocessor import TextPreprocessor -from BigVGAN.bigvgan import BigVGAN -from module.mel_processing import spectrogram_torch,mel_spectrogram_torch -from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language -i18n = I18nAuto(language=language) +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +i18n = I18nAuto(language=language) spec_min = -12 spec_max = 2 + + def norm_spec(x): return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + def denorm_spec(x): return (x + 1) / 2 * (spec_max - spec_min) + spec_min -mel_fn=lambda x: mel_spectrogram_torch(x, **{ - "n_fft": 1024, - "win_size": 1024, - "hop_size": 256, - "num_mels": 100, - "sampling_rate": 24000, - "fmin": 0, - "fmax": None, - "center": False -}) -def speed_change(input_audio:np.ndarray, speed:float, sr:int): +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + + +def speed_change(input_audio: np.ndarray, speed: float, sr: int): # 将 NumPy 数组转换为原始 PCM 流 raw_audio = input_audio.astype(np.int16).tobytes() # 设置 ffmpeg 输入流 - input_stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1) + input_stream = ffmpeg.input("pipe:", format="s16le", acodec="pcm_s16le", ar=str(sr), ac=1) # 变速处理 - output_stream = input_stream.filter('atempo', speed) + output_stream = input_stream.filter("atempo", speed) # 输出流到管道 - out, _ = ( - output_stream.output('pipe:', format='s16le', acodec='pcm_s16le') - .run(input=raw_audio, capture_stdout=True, capture_stderr=True) + out, _ = output_stream.output("pipe:", format="s16le", acodec="pcm_s16le").run( + input=raw_audio, capture_stdout=True, capture_stderr=True ) # 将管道输出解码为 NumPy 数组 @@ -77,14 +89,13 @@ def speed_change(input_audio:np.ndarray, speed:float, sr:int): return processed_audio +resample_transform_dict = {} + -resample_transform_dict={} def resample(audio_tensor, sr0, device): global resample_transform_dict if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample( - sr0, 24000 - ).to(device) + resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) return resample_transform_dict[sr0](audio_tensor) @@ -156,11 +167,12 @@ default_v3: version: v3 """ -def set_seed(seed:int): + +def set_seed(seed: int): seed = int(seed) seed = seed if seed != -1 else random.randint(0, 2**32 - 1) print(f"Set seed to {seed}") - os.environ['PYTHONHASHSEED'] = str(seed) + os.environ["PYTHONHASHSEED"] = str(seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -178,40 +190,41 @@ def set_seed(seed:int): pass return seed + class TTS_Config: - default_configs={ - "v1":{ - "device": "cpu", - "is_half": False, - "version": "v1", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - }, - "v2":{ - "device": "cpu", - "is_half": False, - "version": "v2", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - }, - "v3":{ - "device": "cpu", - "is_half": False, - "version": "v3", - "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt", - "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth", - "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", - "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - }, + default_configs = { + "v1": { + "device": "cpu", + "is_half": False, + "version": "v1", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, + "v2": { + "device": "cpu", + "is_half": False, + "version": "v2", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, + "v3": { + "device": "cpu", + "is_half": False, + "version": "v3", + "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", + "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + }, } - configs:dict = None - v1_languages:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] - v2_languages:list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"] - languages:list = v2_languages + configs: dict = None + v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] + v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"] + languages: list = v2_languages # "all_zh",#全部按中文识别 # "en",#全部按英文识别#######不变 # "all_ja",#全部按日文识别 @@ -224,33 +237,31 @@ class TTS_Config: # "auto",#多语种启动切分识别语种 # "auto_yue",#多语种启动切分识别语种 - def __init__(self, configs: Union[dict, str]=None): - + def __init__(self, configs: Union[dict, str] = None): # 设置默认配置文件路径 - configs_base_path:str = "GPT_SoVITS/configs/" + configs_base_path: str = "GPT_SoVITS/configs/" os.makedirs(configs_base_path, exist_ok=True) - self.configs_path:str = os.path.join(configs_base_path, "tts_infer.yaml") + self.configs_path: str = os.path.join(configs_base_path, "tts_infer.yaml") if configs in ["", None]: if not os.path.exists(self.configs_path): self.save_configs() print(f"Create default config file at {self.configs_path}") - configs:dict = deepcopy(self.default_configs) + configs: dict = deepcopy(self.default_configs) if isinstance(configs, str): self.configs_path = configs - configs:dict = self._load_configs(self.configs_path) + configs: dict = self._load_configs(self.configs_path) assert isinstance(configs, dict) version = configs.get("version", "v2").lower() assert version in ["v1", "v2", "v3"] self.default_configs[version] = configs.get(version, self.default_configs[version]) - self.configs:dict = configs.get("custom", deepcopy(self.default_configs[version])) - + self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version])) self.device = self.configs.get("device", torch.device("cpu")) if "cuda" in str(self.device) and not torch.cuda.is_available(): - print(f"Warning: CUDA is not available, set device to CPU.") + print("Warning: CUDA is not available, set device to CPU.") self.device = torch.device("cpu") self.is_half = self.configs.get("is_half", False) @@ -263,81 +274,77 @@ class TTS_Config: self.vits_weights_path = self.configs.get("vits_weights_path", None) self.bert_base_path = self.configs.get("bert_base_path", None) self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None) - self.languages = self.v1_languages if self.version=="v1" else self.v2_languages - - self.is_v3_synthesizer:bool = False + self.languages = self.v1_languages if self.version == "v1" else self.v2_languages + self.is_v3_synthesizer: bool = False if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)): - self.t2s_weights_path = self.default_configs[version]['t2s_weights_path'] + self.t2s_weights_path = self.default_configs[version]["t2s_weights_path"] print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}") if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)): - self.vits_weights_path = self.default_configs[version]['vits_weights_path'] + self.vits_weights_path = self.default_configs[version]["vits_weights_path"] print(f"fall back to default vits_weights_path: {self.vits_weights_path}") if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)): - self.bert_base_path = self.default_configs[version]['bert_base_path'] + self.bert_base_path = self.default_configs[version]["bert_base_path"] print(f"fall back to default bert_base_path: {self.bert_base_path}") if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)): - self.cnhuhbert_base_path = self.default_configs[version]['cnhuhbert_base_path'] + self.cnhuhbert_base_path = self.default_configs[version]["cnhuhbert_base_path"] print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}") self.update_configs() - self.max_sec = None - self.hz:int = 50 - self.semantic_frame_rate:str = "25hz" - self.segment_size:int = 20480 - self.filter_length:int = 2048 - self.sampling_rate:int = 32000 - self.hop_length:int = 640 - self.win_length:int = 2048 - self.n_speakers:int = 300 + self.hz: int = 50 + self.semantic_frame_rate: str = "25hz" + self.segment_size: int = 20480 + self.filter_length: int = 2048 + self.sampling_rate: int = 32000 + self.hop_length: int = 640 + self.win_length: int = 2048 + self.n_speakers: int = 300 - - - def _load_configs(self, configs_path: str)->dict: + def _load_configs(self, configs_path: str) -> dict: if os.path.exists(configs_path): ... else: print(i18n("路径不存在,使用默认配置")) self.save_configs(configs_path) - with open(configs_path, 'r', encoding='utf-8') as f: + with open(configs_path, "r", encoding="utf-8") as f: configs = yaml.load(f, Loader=yaml.FullLoader) return configs - def save_configs(self, configs_path:str=None)->None: - configs=deepcopy(self.default_configs) + def save_configs(self, configs_path: str = None) -> None: + configs = deepcopy(self.default_configs) if self.configs is not None: configs["custom"] = self.update_configs() if configs_path is None: configs_path = self.configs_path - with open(configs_path, 'w') as f: + with open(configs_path, "w") as f: yaml.dump(configs, f) def update_configs(self): self.config = { - "device" : str(self.device), - "is_half" : self.is_half, - "version" : self.version, - "t2s_weights_path" : self.t2s_weights_path, - "vits_weights_path" : self.vits_weights_path, - "bert_base_path" : self.bert_base_path, + "device": str(self.device), + "is_half": self.is_half, + "version": self.version, + "t2s_weights_path": self.t2s_weights_path, + "vits_weights_path": self.vits_weights_path, + "bert_base_path": self.bert_base_path, "cnhuhbert_base_path": self.cnhuhbert_base_path, } return self.config - def update_version(self, version:str)->None: + def update_version(self, version: str) -> None: self.version = version - self.languages = self.v1_languages if self.version=="v1" else self.v2_languages + self.languages = self.v1_languages if self.version == "v1" else self.v2_languages def __str__(self): self.configs = self.update_configs() - string = "TTS Config".center(100, '-') + '\n' + string = "TTS Config".center(100, "-") + "\n" for k, v in self.configs.items(): string += f"{str(k).ljust(20)}: {str(v)}\n" - string += "-" * 100 + '\n' + string += "-" * 100 + "\n" return string def __repr__(self): @@ -355,77 +362,71 @@ class TTS: if isinstance(configs, TTS_Config): self.configs = configs else: - self.configs:TTS_Config = TTS_Config(configs) + self.configs: TTS_Config = TTS_Config(configs) - self.t2s_model:Text2SemanticLightningModule = None - self.vits_model:Union[SynthesizerTrn, SynthesizerTrnV3] = None - self.bert_tokenizer:AutoTokenizer = None - self.bert_model:AutoModelForMaskedLM = None - self.cnhuhbert_model:CNHubert = None - self.bigvgan_model:BigVGAN = None - self.sr_model:AP_BWE = None - self.sr_model_not_exist:bool = False + self.t2s_model: Text2SemanticLightningModule = None + self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None + self.bert_tokenizer: AutoTokenizer = None + self.bert_model: AutoModelForMaskedLM = None + self.cnhuhbert_model: CNHubert = None + self.bigvgan_model: BigVGAN = None + self.sr_model: AP_BWE = None + self.sr_model_not_exist: bool = False self._init_models() - self.text_preprocessor:TextPreprocessor = \ - TextPreprocessor(self.bert_model, - self.bert_tokenizer, - self.configs.device) + self.text_preprocessor: TextPreprocessor = TextPreprocessor( + self.bert_model, self.bert_tokenizer, self.configs.device + ) - - self.prompt_cache:dict = { - "ref_audio_path" : None, + self.prompt_cache: dict = { + "ref_audio_path": None, "prompt_semantic": None, - "refer_spec" : [], - "prompt_text" : None, - "prompt_lang" : None, - "phones" : None, - "bert_features" : None, - "norm_text" : None, + "refer_spec": [], + "prompt_text": None, + "prompt_lang": None, + "phones": None, + "bert_features": None, + "norm_text": None, "aux_ref_audio_paths": [], } + self.stop_flag: bool = False + self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32 - self.stop_flag:bool = False - self.precision:torch.dtype = torch.float16 if self.configs.is_half else torch.float32 - - def _init_models(self,): + def _init_models( + self, + ): self.init_t2s_weights(self.configs.t2s_weights_path) self.init_vits_weights(self.configs.vits_weights_path) self.init_bert_weights(self.configs.bert_base_path) self.init_cnhuhbert_weights(self.configs.cnhuhbert_base_path) # self.enable_half_precision(self.configs.is_half) - - def init_cnhuhbert_weights(self, base_path: str): print(f"Loading CNHuBERT weights from {base_path}") self.cnhuhbert_model = CNHubert(base_path) - self.cnhuhbert_model=self.cnhuhbert_model.eval() + self.cnhuhbert_model = self.cnhuhbert_model.eval() self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device) - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.cnhuhbert_model = self.cnhuhbert_model.half() - - def init_bert_weights(self, base_path: str): print(f"Loading BERT weights from {base_path}") self.bert_tokenizer = AutoTokenizer.from_pretrained(base_path) self.bert_model = AutoModelForMaskedLM.from_pretrained(base_path) - self.bert_model=self.bert_model.eval() + self.bert_model = self.bert_model.eval() self.bert_model = self.bert_model.to(self.configs.device) - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.bert_model = self.bert_model.half() def init_vits_weights(self, weights_path: str): - self.configs.vits_weights_path = weights_path - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(weights_path) - path_sovits_v3=self.configs.default_configs["v3"]["vits_weights_path"] + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path) + path_sovits_v3 = self.configs.default_configs["v3"]["vits_weights_path"] - if if_lora_v3==True and os.path.exists(path_sovits_v3)==False: - info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + if if_lora_v3 == True and os.path.exists(path_sovits_v3) == False: + info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") raise FileExistsError(info) # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False) @@ -433,9 +434,9 @@ class TTS: hps = dict_s2["config"] hps["model"]["semantic_frame_rate"] = "25hz" - if 'enc_p.text_embedding.weight'not in dict_s2['weight']: - hps["model"]["version"] = "v2"#v3model,v2sybomls - elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps["model"]["version"] = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: hps["model"]["version"] = "v1" else: hps["model"]["version"] = "v2" @@ -455,12 +456,12 @@ class TTS: # print(f"model_version:{model_version}") # print(f'hps["model"]["version"]:{hps["model"]["version"]}') - if model_version!="v3": + if model_version != "v3": vits_model = SynthesizerTrn( self.configs.filter_length // 2 + 1, self.configs.segment_size // self.configs.hop_length, n_speakers=self.configs.n_speakers, - **kwargs + **kwargs, ) self.configs.is_v3_synthesizer = False else: @@ -468,18 +469,22 @@ class TTS: self.configs.filter_length // 2 + 1, self.configs.segment_size // self.configs.hop_length, n_speakers=self.configs.n_speakers, - **kwargs + **kwargs, ) self.configs.is_v3_synthesizer = True self.init_bigvgan() if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"): del vits_model.enc_q - if if_lora_v3==False: - print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") + if if_lora_v3 == False: + print( + f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}" + ) else: - print(f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}") - lora_rank=dict_s2["lora_rank"] + print( + f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}" + ) + lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, @@ -487,19 +492,19 @@ class TTS: init_lora_weights=True, ) vits_model.cfm = get_peft_model(vits_model.cfm, lora_config) - print(f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") - - vits_model.cfm = vits_model.cfm.merge_and_unload() + print( + f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}" + ) + vits_model.cfm = vits_model.cfm.merge_and_unload() vits_model = vits_model.to(self.configs.device) vits_model = vits_model.eval() self.vits_model = vits_model - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.vits_model = self.vits_model.half() - def init_t2s_weights(self, weights_path: str): print(f"Loading Text2Semantic weights from {weights_path}") self.configs.t2s_weights_path = weights_path @@ -513,14 +518,16 @@ class TTS: t2s_model = t2s_model.to(self.configs.device) t2s_model = t2s_model.eval() self.t2s_model = t2s_model - if self.configs.is_half and str(self.configs.device)!="cpu": + if self.configs.is_half and str(self.configs.device) != "cpu": self.t2s_model = self.t2s_model.half() - def init_bigvgan(self): if self.bigvgan_model is not None: return - self.bigvgan_model = BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions + self.bigvgan_model = BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode self.bigvgan_model.remove_weight_norm() self.bigvgan_model = self.bigvgan_model.eval() @@ -533,20 +540,19 @@ class TTS: if self.sr_model is not None: return try: - self.sr_model:AP_BWE=AP_BWE(self.configs.device,DictToAttrRecursive) + self.sr_model: AP_BWE = AP_BWE(self.configs.device, DictToAttrRecursive) self.sr_model_not_exist = False except FileNotFoundError: print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好")) self.sr_model_not_exist = True - def enable_half_precision(self, enable: bool = True, save: bool = True): - ''' - To enable half precision for the TTS model. - Args: - enable: bool, whether to enable half precision. + """ + To enable half precision for the TTS model. + Args: + enable: bool, whether to enable half precision. - ''' + """ if str(self.configs.device) == "cpu" and enable: print("Half precision is not supported on CPU.") return @@ -557,11 +563,11 @@ class TTS: self.configs.save_configs() if enable: if self.t2s_model is not None: - self.t2s_model =self.t2s_model.half() + self.t2s_model = self.t2s_model.half() if self.vits_model is not None: self.vits_model = self.vits_model.half() if self.bert_model is not None: - self.bert_model =self.bert_model.half() + self.bert_model = self.bert_model.half() if self.cnhuhbert_model is not None: self.cnhuhbert_model = self.cnhuhbert_model.half() if self.bigvgan_model is not None: @@ -579,11 +585,11 @@ class TTS: self.bigvgan_model = self.bigvgan_model.float() def set_device(self, device: torch.device, save: bool = True): - ''' - To set the device for all models. - Args: - device: torch.device, the device to use for all models. - ''' + """ + To set the device for all models. + Args: + device: torch.device, the device to use for all models. + """ self.configs.device = device if save: self.configs.save_configs() @@ -599,15 +605,14 @@ class TTS: self.bigvgan_model = self.bigvgan_model.to(device) if self.sr_model is not None: self.sr_model = self.sr_model.to(device) - - def set_ref_audio(self, ref_audio_path:str): - ''' - To set the reference audio for the TTS model, - including the prompt_semantic and refer_spepc. - Args: - ref_audio_path: str, the path of the reference audio. - ''' + def set_ref_audio(self, ref_audio_path: str): + """ + To set the reference audio for the TTS model, + including the prompt_semantic and refer_spepc. + Args: + ref_audio_path: str, the path of the reference audio. + """ self._set_prompt_semantic(ref_audio_path) self._set_ref_spec(ref_audio_path) self._set_ref_audio_path(ref_audio_path) @@ -617,21 +622,22 @@ class TTS: def _set_ref_spec(self, ref_audio_path): spec = self._get_ref_spec(ref_audio_path) - if self.prompt_cache["refer_spec"] in [[],None]: - self.prompt_cache["refer_spec"]=[spec] + if self.prompt_cache["refer_spec"] in [[], None]: + self.prompt_cache["refer_spec"] = [spec] else: self.prompt_cache["refer_spec"][0] = spec def _get_ref_spec(self, ref_audio_path): raw_audio, raw_sr = torchaudio.load(ref_audio_path) - raw_audio=raw_audio.to(self.configs.device).float() + raw_audio = raw_audio.to(self.configs.device).float() self.prompt_cache["raw_audio"] = raw_audio self.prompt_cache["raw_sr"] = raw_sr audio = load_audio(ref_audio_path, int(self.configs.sampling_rate)) audio = torch.FloatTensor(audio) - maxx=audio.abs().max() - if(maxx>1):audio/=min(2,maxx) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) spec = spectrogram_torch( @@ -647,14 +653,14 @@ class TTS: spec = spec.half() return spec - def _set_prompt_semantic(self, ref_wav_path:str): + def _set_prompt_semantic(self, ref_wav_path: str): zero_wav = np.zeros( int(self.configs.sampling_rate * 0.3), dtype=np.float16 if self.configs.is_half else np.float32, ) with torch.no_grad(): wav16k, sr = librosa.load(ref_wav_path, sr=16000) - if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000): + if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000: raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) wav16k = torch.from_numpy(wav16k) zero_wav_torch = torch.from_numpy(zero_wav) @@ -665,9 +671,7 @@ class TTS: zero_wav_torch = zero_wav_torch.half() wav16k = torch.cat([wav16k, zero_wav_torch]) - hubert_feature = self.cnhuhbert_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( + hubert_feature = self.cnhuhbert_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose( 1, 2 ) # .float() codes = self.vits_model.extract_latent(hubert_feature) @@ -675,12 +679,12 @@ class TTS: prompt_semantic = codes[0, 0].to(self.configs.device) self.prompt_cache["prompt_semantic"] = prompt_semantic - def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length:int=None): + def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None): seq = sequences[0] ndim = seq.dim() if axis < 0: axis += ndim - dtype:torch.dtype = seq.dtype + dtype: torch.dtype = seq.dtype pad_value = torch.tensor(pad_value, dtype=dtype) seq_lengths = [seq.shape[axis] for seq in sequences] if max_length is None: @@ -696,15 +700,17 @@ class TTS: batch = torch.stack(padded_sequences) return batch - def to_batch(self, data:list, - prompt_data:dict=None, - batch_size:int=5, - threshold:float=0.75, - split_bucket:bool=True, - device:torch.device=torch.device("cpu"), - precision:torch.dtype=torch.float32, - ): - _data:list = [] + def to_batch( + self, + data: list, + prompt_data: dict = None, + batch_size: int = 5, + threshold: float = 0.75, + split_bucket: bool = True, + device: torch.device = torch.device("cpu"), + precision: torch.dtype = torch.float32, + ): + _data: list = [] index_and_len_list = [] for idx, item in enumerate(data): norm_text_len = len(item["norm_text"]) @@ -717,29 +723,28 @@ class TTS: batch_index_list_len = 0 pos = 0 - while pos =threshold) or (pos_end-pos==1): - batch_index=index_and_len_list[pos:pos_end, 0].tolist() + batch = index_and_len_list[pos:pos_end, 1].astype(np.float32) + score = batch[(pos_end - pos) // 2] / (batch.mean() + 1e-8) + if (score >= threshold) or (pos_end - pos == 1): + batch_index = index_and_len_list[pos:pos_end, 0].tolist() batch_index_list_len += len(batch_index) batch_index_list.append(batch_index) pos = pos_end break - pos_end=pos_end-1 + pos_end = pos_end - 1 assert batch_index_list_len == len(data) else: for i in range(len(data)): - if i%batch_size == 0: + if i % batch_size == 0: batch_index_list.append([]) batch_index_list[-1].append(i) - for batch_idx, index_list in enumerate(batch_index_list): item_list = [data[idx] for idx in index_list] phones_list = [] @@ -753,14 +758,14 @@ class TTS: all_phones_max_len = 0 for item in item_list: if prompt_data is not None: - all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\ - .to(dtype=precision, device=device) - all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"]).to(device) + all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1).to( + dtype=precision, device=device + ) + all_phones = torch.LongTensor(prompt_data["phones"] + item["phones"]).to(device) phones = torch.LongTensor(item["phones"]).to(device) # norm_text = prompt_data["norm_text"]+item["norm_text"] else: - all_bert_features = item["bert_features"]\ - .to(dtype=precision, device=device) + all_bert_features = item["bert_features"].to(dtype=precision, device=device) phones = torch.LongTensor(item["phones"]).to(device) all_phones = phones # norm_text = item["norm_text"] @@ -779,7 +784,6 @@ class TTS: all_phones_batch = all_phones_list all_bert_features_batch = all_bert_features_list - max_len = max(all_bert_max_len, all_phones_max_len) # phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len) #### 直接对phones和bert_features进行pad。(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略) @@ -811,8 +815,8 @@ class TTS: return _data, batch_index_list - def recovery_order(self, data:list, batch_index_list:list)->list: - ''' + def recovery_order(self, data: list, batch_index_list: list) -> list: + """ Recovery the order of the audio according to the batch_index_list. Args: @@ -821,22 +825,24 @@ class TTS: Returns: list (List[torch.Tensor]): the data in the original order. - ''' + """ length = len(sum(batch_index_list, [])) - _data = [None]*length + _data = [None] * length for i, index_list in enumerate(batch_index_list): for j, index in enumerate(index_list): _data[index] = data[i][j] return _data - def stop(self,): - ''' + def stop( + self, + ): + """ Stop the inference process. - ''' + """ self.stop_flag = True @torch.no_grad() - def run(self, inputs:dict): + def run(self, inputs: dict): """ Text to speech inference. @@ -869,17 +875,17 @@ class TTS: Tuple[int, np.ndarray]: sampling rate and audio data. """ ########## variables initialization ########### - self.stop_flag:bool = False - text:str = inputs.get("text", "") - text_lang:str = inputs.get("text_lang", "") - ref_audio_path:str = inputs.get("ref_audio_path", "") - aux_ref_audio_paths:list = inputs.get("aux_ref_audio_paths", []) - prompt_text:str = inputs.get("prompt_text", "") - prompt_lang:str = inputs.get("prompt_lang", "") - top_k:int = inputs.get("top_k", 5) - top_p:float = inputs.get("top_p", 1) - temperature:float = inputs.get("temperature", 1) - text_split_method:str = inputs.get("text_split_method", "cut0") + self.stop_flag: bool = False + text: str = inputs.get("text", "") + text_lang: str = inputs.get("text_lang", "") + ref_audio_path: str = inputs.get("ref_audio_path", "") + aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", []) + prompt_text: str = inputs.get("prompt_text", "") + prompt_lang: str = inputs.get("prompt_lang", "") + top_k: int = inputs.get("top_k", 5) + top_p: float = inputs.get("top_p", 1) + temperature: float = inputs.get("temperature", 1) + text_split_method: str = inputs.get("text_split_method", "cut0") batch_size = inputs.get("batch_size", 1) batch_threshold = inputs.get("batch_threshold", 0.75) speed_factor = inputs.get("speed_factor", 1.0) @@ -907,9 +913,9 @@ class TTS: split_bucket = False print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理")) - if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer): + if split_bucket and speed_factor == 1.0 and not (self.configs.is_v3_synthesizer and parallel_infer): print(i18n("分桶处理模式已开启")) - elif speed_factor!=1.0: + elif speed_factor != 1.0: print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理")) split_bucket = False elif self.configs.is_v3_synthesizer and parallel_infer: @@ -918,7 +924,7 @@ class TTS: else: print(i18n("分桶处理模式已关闭")) - if fragment_interval<0.01: + if fragment_interval < 0.01: fragment_interval = 0.01 print(i18n("分段间隔过小,已自动设置为0.01")) @@ -933,9 +939,12 @@ class TTS: if no_prompt_text and self.configs.is_v3_synthesizer: raise NO_PROMPT_ERROR("prompt_text cannot be empty when using SoVITS_V3") - if ref_audio_path in [None, ""] and \ - ((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []])): - raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()") + if ref_audio_path in [None, ""] and ( + (self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []]) + ): + raise ValueError( + "ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()" + ) ###### setting reference audio and prompt text preprocessing ######## t0 = time.perf_counter() @@ -945,7 +954,7 @@ class TTS: self.set_ref_audio(ref_audio_path) aux_ref_audio_paths = aux_ref_audio_paths if aux_ref_audio_paths is not None else [] - paths = set(aux_ref_audio_paths)&set(self.prompt_cache["aux_ref_audio_paths"]) + paths = set(aux_ref_audio_paths) & set(self.prompt_cache["aux_ref_audio_paths"]) if not (len(list(paths)) == len(aux_ref_audio_paths) == len(self.prompt_cache["aux_ref_audio_paths"])): self.prompt_cache["aux_ref_audio_paths"] = aux_ref_audio_paths self.prompt_cache["refer_spec"] = [self.prompt_cache["refer_spec"][0]] @@ -959,58 +968,57 @@ class TTS: if not no_prompt_text: prompt_text = prompt_text.strip("\n") - if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_lang != "en" else "." + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_lang != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) if self.prompt_cache["prompt_text"] != prompt_text: - phones, bert_features, norm_text = \ - self.text_preprocessor.segment_and_extract_feature_for_text( - prompt_text, - prompt_lang, - self.configs.version) + phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text( + prompt_text, prompt_lang, self.configs.version + ) self.prompt_cache["prompt_text"] = prompt_text self.prompt_cache["prompt_lang"] = prompt_lang self.prompt_cache["phones"] = phones self.prompt_cache["bert_features"] = bert_features self.prompt_cache["norm_text"] = norm_text - - - ###### text preprocessing ######## t1 = time.perf_counter() - data:list = None + data: list = None if not return_fragment: data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version) if len(data) == 0: yield 16000, np.zeros(int(16000), dtype=np.int16) return - batch_index_list:list = None - data, batch_index_list = self.to_batch(data, - prompt_data=self.prompt_cache if not no_prompt_text else None, - batch_size=batch_size, - threshold=batch_threshold, - split_bucket=split_bucket, - device=self.configs.device, - precision=self.precision - ) + batch_index_list: list = None + data, batch_index_list = self.to_batch( + data, + prompt_data=self.prompt_cache if not no_prompt_text else None, + batch_size=batch_size, + threshold=batch_threshold, + split_bucket=split_bucket, + device=self.configs.device, + precision=self.precision, + ) else: - print(f'############ {i18n("切分文本")} ############') + print(f"############ {i18n('切分文本')} ############") texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method) data = [] for i in range(len(texts)): - if i%batch_size == 0: + if i % batch_size == 0: data.append([]) data[-1].append(texts[i]) def make_batch(batch_texts): batch_data = [] - print(f'############ {i18n("提取文本Bert特征")} ############') + print(f"############ {i18n('提取文本Bert特征')} ############") for text in tqdm(batch_texts): - phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(text, text_lang, self.configs.version) + phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text( + text, text_lang, self.configs.version + ) if phones is None: continue - res={ + res = { "phones": phones, "bert_features": bert_features, "norm_text": norm_text, @@ -1018,17 +1026,17 @@ class TTS: batch_data.append(res) if len(batch_data) == 0: return None - batch, _ = self.to_batch(batch_data, - prompt_data=self.prompt_cache if not no_prompt_text else None, - batch_size=batch_size, - threshold=batch_threshold, - split_bucket=False, - device=self.configs.device, - precision=self.precision - ) + batch, _ = self.to_batch( + batch_data, + prompt_data=self.prompt_cache if not no_prompt_text else None, + batch_size=batch_size, + threshold=batch_threshold, + split_bucket=False, + device=self.configs.device, + precision=self.precision, + ) return batch[0] - t2 = time.perf_counter() try: print("############ 推理 ############") @@ -1044,20 +1052,22 @@ class TTS: if item is None: continue - batch_phones:List[torch.LongTensor] = item["phones"] + batch_phones: List[torch.LongTensor] = item["phones"] # batch_phones:torch.LongTensor = item["phones"] - batch_phones_len:torch.LongTensor = item["phones_len"] - all_phoneme_ids:torch.LongTensor = item["all_phones"] - all_phoneme_lens:torch.LongTensor = item["all_phones_len"] - all_bert_features:torch.LongTensor = item["all_bert_features"] - norm_text:str = item["norm_text"] + batch_phones_len: torch.LongTensor = item["phones_len"] + all_phoneme_ids: torch.LongTensor = item["all_phones"] + all_phoneme_lens: torch.LongTensor = item["all_phones_len"] + all_bert_features: torch.LongTensor = item["all_bert_features"] + norm_text: str = item["norm_text"] max_len = item["max_len"] print(i18n("前端处理后的文本(每句):"), norm_text) - if no_prompt_text : + if no_prompt_text: prompt = None else: - prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device) + prompt = ( + self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device) + ) print(f"############ {i18n('预测语义Token')} ############") pred_semantic_list, idx_list = self.t2s_model.model.infer_panel( @@ -1076,8 +1086,10 @@ class TTS: t4 = time.perf_counter() t_34 += t4 - t3 - refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]] - + refer_audio_spec: torch.Tensor = [ + item.to(dtype=self.precision, device=self.configs.device) + for item in self.prompt_cache["refer_spec"] + ] batch_audio_fragment = [] @@ -1100,60 +1112,65 @@ class TTS: # ## vits并行推理 method 2 pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] upsample_rate = math.prod(self.vits_model.upsample_rates) - audio_frag_idx = [pred_semantic_list[i].shape[0]*2*upsample_rate for i in range(0, len(pred_semantic_list))] - audio_frag_end_idx = [ sum(audio_frag_idx[:i+1]) for i in range(0, len(audio_frag_idx))] - all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) + audio_frag_idx = [ + pred_semantic_list[i].shape[0] * 2 * upsample_rate + for i in range(0, len(pred_semantic_list)) + ] + audio_frag_end_idx = [sum(audio_frag_idx[: i + 1]) for i in range(0, len(audio_frag_idx))] + all_pred_semantic = ( + torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device) + ) _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device) - _batch_audio_fragment = (self.vits_model.decode( - all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor - ).detach()[0, 0, :]) + _batch_audio_fragment = self.vits_model.decode( + all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] audio_frag_end_idx.insert(0, 0) - batch_audio_fragment= [_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))] + batch_audio_fragment = [ + _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]] + for i in range(1, len(audio_frag_end_idx)) + ] else: - # ## vits串行推理 + # ## vits串行推理 for i, idx in enumerate(tqdm(idx_list)): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) - _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 - audio_fragment =(self.vits_model.decode( - _pred_semantic, phones, refer_audio_spec, speed=speed_factor - ).detach()[0, 0, :]) - batch_audio_fragment.append( - audio_fragment - ) ###试试重建不带上prompt部分 + _pred_semantic = ( + pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0) + ) # .unsqueeze(0)#mq要多unsqueeze一次 + audio_fragment = self.vits_model.decode( + _pred_semantic, phones, refer_audio_spec, speed=speed_factor + ).detach()[0, 0, :] + batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分 else: if parallel_infer: print(f"{i18n('并行合成中')}...") audio_fragments = self.v3_synthesis_batched_infer( - idx_list, - pred_semantic_list, - batch_phones, - speed=speed_factor, - sample_steps=sample_steps - ) + idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps + ) batch_audio_fragment.extend(audio_fragments) else: for i, idx in enumerate(tqdm(idx_list)): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) - _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 + _pred_semantic = ( + pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0) + ) # .unsqueeze(0)#mq要多unsqueeze一次 audio_fragment = self.v3_synthesis( - _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps - ) - batch_audio_fragment.append( - audio_fragment - ) + _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps + ) + batch_audio_fragment.append(audio_fragment) t5 = time.perf_counter() t_45 += t5 - t4 if return_fragment: print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4)) - yield self.audio_postprocess([batch_audio_fragment], - output_sr, - None, - speed_factor, - False, - fragment_interval, - super_sampling if self.configs.is_v3_synthesizer else False - ) + yield self.audio_postprocess( + [batch_audio_fragment], + output_sr, + None, + speed_factor, + False, + fragment_interval, + super_sampling if self.configs.is_v3_synthesizer else False, + ) else: audio.append(batch_audio_fragment) @@ -1166,14 +1183,15 @@ class TTS: if len(audio) == 0: yield 16000, np.zeros(int(16000), dtype=np.int16) return - yield self.audio_postprocess(audio, - output_sr, - batch_index_list, - speed_factor, - split_bucket, - fragment_interval, - super_sampling if self.configs.is_v3_synthesizer else False - ) + yield self.audio_postprocess( + audio, + output_sr, + batch_index_list, + speed_factor, + split_bucket, + fragment_interval, + super_sampling if self.configs.is_v3_synthesizer else False, + ) except Exception as e: traceback.print_exc() @@ -1192,7 +1210,7 @@ class TTS: def empty_cache(self): try: - gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 + gc.collect() # 触发gc的垃圾回收。避免内存一直增长。 if "cuda" in str(self.configs.device): torch.cuda.empty_cache() elif str(self.configs.device) == "mps": @@ -1200,29 +1218,28 @@ class TTS: except: pass - def audio_postprocess(self, - audio:List[torch.Tensor], - sr:int, - batch_index_list:list=None, - speed_factor:float=1.0, - split_bucket:bool=True, - fragment_interval:float=0.3, - super_sampling:bool=False, - )->Tuple[int, np.ndarray]: + def audio_postprocess( + self, + audio: List[torch.Tensor], + sr: int, + batch_index_list: list = None, + speed_factor: float = 1.0, + split_bucket: bool = True, + fragment_interval: float = 0.3, + super_sampling: bool = False, + ) -> Tuple[int, np.ndarray]: zero_wav = torch.zeros( - int(self.configs.sampling_rate * fragment_interval), - dtype=self.precision, - device=self.configs.device - ) + int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device + ) for i, batch in enumerate(audio): for j, audio_fragment in enumerate(batch): - max_audio=torch.abs(audio_fragment).max()#简单防止16bit爆音 - if max_audio>1: audio_fragment/=max_audio - audio_fragment:torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) + max_audio = torch.abs(audio_fragment).max() # 简单防止16bit爆音 + if max_audio > 1: + audio_fragment /= max_audio + audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) audio[i][j] = audio_fragment - if split_bucket: audio = self.recovery_order(audio, batch_index_list) else: @@ -1236,11 +1253,12 @@ class TTS: t1 = time.perf_counter() self.init_sr_model() if not self.sr_model_not_exist: - audio,sr=self.sr_model(audio.unsqueeze(0),sr) - max_audio=np.abs(audio).max() - if max_audio > 1: audio /= max_audio + audio, sr = self.sr_model(audio.unsqueeze(0), sr) + max_audio = np.abs(audio).max() + if max_audio > 1: + audio /= max_audio t2 = time.perf_counter() - print(f"超采样用时:{t2-t1:.3f}s") + print(f"超采样用时:{t2 - t1:.3f}s") else: audio = audio.cpu().numpy() @@ -1254,51 +1272,49 @@ class TTS: return sr, audio - - def v3_synthesis(self, - semantic_tokens:torch.Tensor, - phones:torch.Tensor, - speed:float=1.0, - sample_steps:int=32 - ): - + def v3_synthesis( + self, semantic_tokens: torch.Tensor, phones: torch.Tensor, speed: float = 1.0, sample_steps: int = 32 + ): prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device) prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device) refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device) - fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) - ref_audio:torch.Tensor = self.prompt_cache["raw_audio"] + fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) + ref_audio: torch.Tensor = self.prompt_cache["raw_audio"] ref_sr = self.prompt_cache["raw_sr"] - ref_audio=ref_audio.to(self.configs.device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(self.configs.device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if ref_sr!=24000: - ref_audio=resample(ref_audio, ref_sr, self.configs.device) + if ref_sr != 24000: + ref_audio = resample(ref_audio, ref_sr, self.configs.device) mel2 = mel_fn(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): + if T_min > 468: mel2 = mel2[:, :, -468:] fea_ref = fea_ref[:, :, -468:] T_min = 468 chunk_len = 934 - T_min - mel2=mel2.to(self.precision) + mel2 = mel2.to(self.precision) fea_todo, ge = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed) cfm_resss = [] idx = 0 - while (1): - fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] - if (fea_todo_chunk.shape[-1] == 0): break + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) - cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) - cfm_res = cfm_res[:, :, mel2.shape[2]:] + cfm_res = self.vits_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] fea_ref = fea_todo_chunk[:, :, -T_min:] @@ -1307,49 +1323,45 @@ class TTS: cfm_res = torch.cat(cfm_resss, 2) cfm_res = denorm_spec(cfm_res) - with torch.inference_mode(): wav_gen = self.bigvgan_model(cfm_res) - audio=wav_gen[0][0]#.cpu().detach().numpy() - + audio = wav_gen[0][0] # .cpu().detach().numpy() + return audio - - - def v3_synthesis_batched_infer(self, - idx_list:List[int], - semantic_tokens_list:List[torch.Tensor], - batch_phones:List[torch.Tensor], - speed:float=1.0, - sample_steps:int=32 - )->List[torch.Tensor]: - + def v3_synthesis_batched_infer( + self, + idx_list: List[int], + semantic_tokens_list: List[torch.Tensor], + batch_phones: List[torch.Tensor], + speed: float = 1.0, + sample_steps: int = 32, + ) -> List[torch.Tensor]: prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device) prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device) refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device) - fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) - ref_audio:torch.Tensor = self.prompt_cache["raw_audio"] + fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec) + ref_audio: torch.Tensor = self.prompt_cache["raw_audio"] ref_sr = self.prompt_cache["raw_sr"] - ref_audio=ref_audio.to(self.configs.device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(self.configs.device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if ref_sr!=24000: - ref_audio=resample(ref_audio, ref_sr, self.configs.device) + if ref_sr != 24000: + ref_audio = resample(ref_audio, ref_sr, self.configs.device) mel2 = mel_fn(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): + if T_min > 468: mel2 = mel2[:, :, -468:] fea_ref = fea_ref[:, :, -468:] T_min = 468 chunk_len = 934 - T_min - mel2=mel2.to(self.precision) - + mel2 = mel2.to(self.precision) # #### batched inference overlapped_len = 12 @@ -1359,96 +1371,93 @@ class TTS: for i, idx in enumerate(idx_list): phones = batch_phones[i].unsqueeze(0).to(self.configs.device) - semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次 + semantic_tokens = ( + semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) + ) # .unsqueeze(0)#mq要多unsqueeze一次 feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed) feat_list.append(feat) feat_lens.append(feat.shape[2]) feats = torch.cat(feat_list, 2) - feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0) + feats_padded = F.pad(feats, (overlapped_len, 0), "constant", 0) pos = 0 padding_len = 0 while True: - if pos ==0: - chunk = feats_padded[:, :, pos:pos + chunk_len] + if pos == 0: + chunk = feats_padded[:, :, pos : pos + chunk_len] else: pos = pos - overlapped_len - chunk = feats_padded[:, :, pos:pos + chunk_len] + chunk = feats_padded[:, :, pos : pos + chunk_len] pos += chunk_len - if (chunk.shape[-1] == 0): break + if chunk.shape[-1] == 0: + break # padding for the last chunk padding_len = chunk_len - chunk.shape[2] if padding_len != 0: - chunk = F.pad(chunk, (0,padding_len), "constant", 0) + chunk = F.pad(chunk, (0, padding_len), "constant", 0) feat_chunks.append(chunk) - - feat_chunks = torch.cat(feat_chunks, 0) bs = feat_chunks.shape[0] - fea_ref = fea_ref.repeat(bs,1,1) + fea_ref = fea_ref.repeat(bs, 1, 1) fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1) - pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) + pred_spec = self.vits_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) pred_spec = pred_spec[:, :, -chunk_len:] dd = pred_spec.shape[1] pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0) # pred_spec = pred_spec[..., :-padding_len] - pred_spec = denorm_spec(pred_spec) - + with torch.no_grad(): wav_gen = self.bigvgan_model(pred_spec) - audio = wav_gen[0][0]#.cpu().detach().numpy() - + audio = wav_gen[0][0] # .cpu().detach().numpy() audio_fragments = [] upsample_rate = 256 pos = 0 while pos < audio.shape[-1]: - audio_fragment = audio[pos:pos+chunk_len*upsample_rate] + audio_fragment = audio[pos : pos + chunk_len * upsample_rate] audio_fragments.append(audio_fragment) - pos += chunk_len*upsample_rate + pos += chunk_len * upsample_rate - audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate) - audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate] + audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate) + audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate] audio_fragments = [] for feat_len in feat_lens: - audio_fragment = audio[:feat_len*upsample_rate] + audio_fragment = audio[: feat_len * upsample_rate] audio_fragments.append(audio_fragment) - audio = audio[feat_len*upsample_rate:] - + audio = audio[feat_len * upsample_rate :] return audio_fragments - - - def sola_algorithm(self, - audio_fragments:List[torch.Tensor], - overlap_len:int, - ): - - for i in range(len(audio_fragments)-1): + def sola_algorithm( + self, + audio_fragments: List[torch.Tensor], + overlap_len: int, + ): + for i in range(len(audio_fragments) - 1): f1 = audio_fragments[i] - f2 = audio_fragments[i+1] + f2 = audio_fragments[i + 1] w1 = f1[-overlap_len:] w2 = f2[:overlap_len] assert w1.shape == w2.shape - corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1] + corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1] idx = corr.argmax() - f1_ = f1[:-(overlap_len-idx)] + f1_ = f1[: -(overlap_len - idx)] audio_fragments[i] = f1_ f2_ = f2[idx:] - window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype) - f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):] - audio_fragments[i+1] = f2_ - + window = torch.hann_window((overlap_len - idx) * 2, device=f1.device, dtype=f1.dtype) + f2_[: (overlap_len - idx)] = ( + window[: (overlap_len - idx)] * f2_[: (overlap_len - idx)] + + window[(overlap_len - idx) :] * f1[-(overlap_len - idx) :] + ) + audio_fragments[i + 1] = f2_ return torch.cat(audio_fragments, 0) - - - diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 0ebe553..426929f 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -1,8 +1,9 @@ - -import os, sys +import os +import sys import threading from tqdm import tqdm + now_dir = os.getcwd() sys.path.append(now_dir) @@ -18,17 +19,19 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_ from tools.i18n.i18n import I18nAuto, scan_language_list -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) -punctuation = set(['!', '?', '…', ',', '.', '-']) +punctuation = set(["!", "?", "…", ",", ".", "-"]) -def get_first(text:str) -> str: + +def get_first(text: str) -> str: pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]" text = re.split(pattern, text)[0].strip() return text -def merge_short_text_in_array(texts:str, threshold:int) -> list: + +def merge_short_text_in_array(texts: str, threshold: int) -> list: if (len(texts)) < 2: return texts result = [] @@ -38,7 +41,7 @@ def merge_short_text_in_array(texts:str, threshold:int) -> list: if len(text) >= threshold: result.append(text) text = "" - if (len(text) > 0): + if len(text) > 0: if len(result) == 0: result.append(text) else: @@ -46,28 +49,24 @@ def merge_short_text_in_array(texts:str, threshold:int) -> list: return result - - - class TextPreprocessor: - def __init__(self, bert_model:AutoModelForMaskedLM, - tokenizer:AutoTokenizer, device:torch.device): + def __init__(self, bert_model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, device: torch.device): self.bert_model = bert_model self.tokenizer = tokenizer self.device = device self.bert_lock = threading.RLock() - def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v2")->List[Dict]: - print(f'############ {i18n("切分文本")} ############') + def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]: + print(f"############ {i18n('切分文本')} ############") text = self.replace_consecutive_punctuation(text) texts = self.pre_seg_text(text, lang, text_split_method) result = [] - print(f'############ {i18n("提取文本Bert特征")} ############') + print(f"############ {i18n('提取文本Bert特征')} ############") for text in tqdm(texts): phones, bert_features, norm_text = self.segment_and_extract_feature_for_text(text, lang, version) - if phones is None or norm_text=="": + if phones is None or norm_text == "": continue - res={ + res = { "phones": phones, "bert_features": bert_features, "norm_text": norm_text, @@ -75,11 +74,11 @@ class TextPreprocessor: result.append(res) return result - def pre_seg_text(self, text:str, lang:str, text_split_method:str): + def pre_seg_text(self, text: str, lang: str, text_split_method: str): text = text.strip("\n") if len(text) == 0: return [] - if (text[0] not in splits and len(get_first(text)) < 4): + if text[0] not in splits and len(get_first(text)) < 4: text = "。" + text if lang != "en" else "." + text print(i18n("实际输入的目标文本:")) print(text) @@ -95,18 +94,18 @@ class TextPreprocessor: _texts = merge_short_text_in_array(_texts, 5) texts = [] - for text in _texts: # 解决输入目标文本的空行导致报错的问题 - if (len(text.strip()) == 0): - continue + if len(text.strip()) == 0: + continue if not re.sub("\W+", "", text): # 检测一下,如果是纯符号,就跳过。 continue - if (text[-1] not in splits): text += "。" if lang != "en" else "." + if text[-1] not in splits: + text += "。" if lang != "en" else "." # 解决句子过长导致Bert报错的问题 - if (len(text) > 510): + if len(text) > 510: texts.extend(split_big_text(text)) else: texts.append(text) @@ -115,78 +114,79 @@ class TextPreprocessor: print(texts) return texts - def segment_and_extract_feature_for_text(self, text:str, language:str, version:str="v1")->Tuple[list, torch.Tensor, str]: + def segment_and_extract_feature_for_text( + self, text: str, language: str, version: str = "v1" + ) -> Tuple[list, torch.Tensor, str]: return self.get_phones_and_bert(text, language, version) - def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False): + def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False): with self.bert_lock: - if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: - # language = language.replace("all_","") - formattext = text - while " " in formattext: - formattext = formattext.replace(" ", " ") - if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return self.get_phones_and_bert(formattext,"zh",version) - else: - phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) - bert = self.get_bert_feature(norm_text, word2ph).to(self.device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return self.get_phones_and_bert(formattext,"yue",version) - else: - phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) - bert = torch.zeros( - (1024, len(phones)), - dtype=torch.float32, - ).to(self.device) - elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] - if language == "auto": - for tmp in LangSegmenter.getTexts(text): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "auto_yue": - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "zh": - tmp["lang"] = "yue" - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - else: - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "en": - langlist.append(tmp["lang"]) - else: - # 因无法区别中日韩文汉字,以用户输入为准 - langlist.append(language) - textlist.append(tmp["text"]) - # print(textlist) - # print(langlist) - phones_list = [] - bert_list = [] - norm_text_list = [] - for i in range(len(textlist)): - lang = langlist[i] - phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version) - bert = self.get_bert_inf(phones, word2ph, norm_text, lang) - phones_list.append(phones) - norm_text_list.append(norm_text) - bert_list.append(bert) - bert = torch.cat(bert_list, dim=1) - phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: + # language = language.replace("all_","") + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + if language == "all_zh": + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return self.get_phones_and_bert(formattext, "zh", version) + else: + phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) + bert = self.get_bert_feature(norm_text, word2ph).to(self.device) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return self.get_phones_and_bert(formattext, "yue", version) + else: + phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version) + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float32, + ).to(self.device) + elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: + textlist = [] + langlist = [] + if language == "auto": + for tmp in LangSegmenter.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + elif language == "auto_yue": + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "zh": + tmp["lang"] = "yue" + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日韩文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + # print(textlist) + # print(langlist) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version) + bert = self.get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = "".join(norm_text_list) - if not final and len(phones) < 6: - return self.get_phones_and_bert("." + text,language,version,final=True) + if not final and len(phones) < 6: + return self.get_phones_and_bert("." + text, language, version, final=True) - return phones, bert, norm_text + return phones, bert, norm_text - - def get_bert_feature(self, text:str, word2ph:list)->torch.Tensor: + def get_bert_feature(self, text: str, word2ph: list) -> torch.Tensor: with torch.no_grad(): inputs = self.tokenizer(text, return_tensors="pt") for i in inputs: @@ -201,14 +201,14 @@ class TextPreprocessor: phone_level_feature = torch.cat(phone_level_feature, dim=0) return phone_level_feature.T - def clean_text_inf(self, text:str, language:str, version:str="v2"): - language = language.replace("all_","") + def clean_text_inf(self, text: str, language: str, version: str = "v2"): + language = language.replace("all_", "") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text - def get_bert_inf(self, phones:list, word2ph:list, norm_text:str, language:str): - language=language.replace("all_","") + def get_bert_inf(self, phones: list, word2ph: list, norm_text: str, language: str): + language = language.replace("all_", "") if language == "zh": feature = self.get_bert_feature(norm_text, word2ph).to(self.device) else: @@ -219,21 +219,19 @@ class TextPreprocessor: return feature - - def filter_text(self,texts): - _text=[] - if all(text in [None, " ", "\n",""] for text in texts): + def filter_text(self, texts): + _text = [] + if all(text in [None, " ", "\n", ""] for text in texts): raise ValueError(i18n("请输入有效文本")) for text in texts: - if text in [None, " ", ""]: + if text in [None, " ", ""]: pass else: _text.append(text) return _text - - def replace_consecutive_punctuation(self,text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + def replace_consecutive_punctuation(self, text): + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result diff --git a/GPT_SoVITS/TTS_infer_pack/__init__.py b/GPT_SoVITS/TTS_infer_pack/__init__.py index 7438198..8579a63 100644 --- a/GPT_SoVITS/TTS_infer_pack/__init__.py +++ b/GPT_SoVITS/TTS_infer_pack/__init__.py @@ -1 +1 @@ -from . import TTS, text_segmentation_method \ No newline at end of file +from . import TTS, text_segmentation_method diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py index 4ee0cfb..fda70a4 100644 --- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py +++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py @@ -1,41 +1,57 @@ - - - - import re from typing import Callable -punctuation = set(['!', '?', '…', ',', '.', '-'," "]) +punctuation = set(["!", "?", "…", ",", ".", "-", " "]) METHODS = dict() -def get_method(name:str)->Callable: + +def get_method(name: str) -> Callable: method = METHODS.get(name, None) if method is None: raise ValueError(f"Method {name} not found") return method -def get_method_names()->list: + +def get_method_names() -> list: return list(METHODS.keys()) + def register_method(name): def decorator(func): METHODS[name] = func return func + return decorator -splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } + +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} + def split_big_text(text, max_len=510): # 定义全角和半角标点符号 punctuation = "".join(splits) # 切割文本 - segments = re.split('([' + punctuation + '])', text) - + segments = re.split("([" + punctuation + "])", text) + # 初始化结果列表和当前片段 result = [] - current_segment = '' - + current_segment = "" + for segment in segments: # 如果当前片段加上新的片段长度超过max_len,就将当前片段加入结果列表,并重置当前片段 if len(current_segment + segment) > max_len: @@ -43,13 +59,12 @@ def split_big_text(text, max_len=510): current_segment = segment else: current_segment += segment - + # 将最后一个片段加入结果列表 if current_segment: result.append(current_segment) - - return result + return result def split(todo_text): @@ -90,7 +105,7 @@ def cut1(inp): if len(split_idx) > 1: opts = [] for idx in range(len(split_idx) - 1): - opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) + opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]])) else: opts = [inp] opts = [item for item in opts if not set(item).issubset(punctuation)] @@ -123,6 +138,7 @@ def cut2(inp): opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) + # 按中文句号。切 @register_method("cut3") def cut3(inp): @@ -131,26 +147,28 @@ def cut3(inp): opts = [item for item in opts if not set(item).issubset(punctuation)] return "\n".join(opts) -#按英文句号.切 + +# 按英文句号.切 @register_method("cut4") def cut4(inp): inp = inp.strip("\n") - opts = re.split(r'(? 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): + if char == "." and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): items.append(char) else: items.append(char) @@ -166,8 +184,6 @@ def cut5(inp): return "\n".join(opt) - -if __name__ == '__main__': +if __name__ == "__main__": method = get_method("cut5") print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。")) - diff --git a/GPT_SoVITS/download.py b/GPT_SoVITS/download.py index e7c8c97..fc4ead6 100644 --- a/GPT_SoVITS/download.py +++ b/GPT_SoVITS/download.py @@ -1,5 +1,13 @@ -import os, sys +import os +import sys + now_dir = os.getcwd() sys.path.insert(0, now_dir) from text.g2pw import G2PWPinyin -g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",v_to_u=False, neutral_tone_with_five=True) \ No newline at end of file + +g2pw = G2PWPinyin( + model_dir="GPT_SoVITS/text/G2PWModel", + model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + v_to_u=False, + neutral_tone_with_five=True, +) diff --git a/GPT_SoVITS/export_torch_script.py b/GPT_SoVITS/export_torch_script.py index 3f2c296..69817a3 100644 --- a/GPT_SoVITS/export_torch_script.py +++ b/GPT_SoVITS/export_torch_script.py @@ -3,7 +3,6 @@ import argparse from typing import Optional from my_utils import load_audio -from text import cleaned_text_to_sequence import torch import torchaudio @@ -33,7 +32,8 @@ default_config = { "EOS": 1024, } -def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: + +def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: config = dict_s1["config"] config["model"]["dropout"] = float(config["model"]["dropout"]) t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) @@ -41,6 +41,7 @@ def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule: t2s_model = t2s_model.eval() return t2s_model + @torch.jit.script def logits_to_probs( logits, @@ -57,39 +58,35 @@ def logits_to_probs( if previous_tokens is not None and repetition_penalty != 1.0: previous_tokens = previous_tokens.long() score = torch.gather(logits, dim=1, index=previous_tokens) - score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty - ) + score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty) logits.scatter_(dim=1, index=previous_tokens, src=score) if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 - ) + cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1) sorted_indices_to_remove = cum_probs > top_p sorted_indices_to_remove[:, 0] = False # keep at least one option - indices_to_remove = sorted_indices_to_remove.scatter( - dim=1, index=sorted_indices, src=sorted_indices_to_remove - ) + indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) logits = logits.masked_fill(indices_to_remove, -float("Inf")) logits = logits / max(temperature, 1e-5) if top_k is not None: v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - pivot = v[: , -1].unsqueeze(-1) + pivot = v[:, -1].unsqueeze(-1) logits = torch.where(logits < pivot, -float("Inf"), logits) probs = torch.nn.functional.softmax(logits, dim=-1) return probs + @torch.jit.script -def multinomial_sample_one_no_sync(probs_sort): +def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization q = torch.randn_like(probs_sort) return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + @torch.jit.script def sample( logits, @@ -100,15 +97,20 @@ def sample( repetition_penalty: float = 1.0, ): probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty + logits=logits, + previous_tokens=previous_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, ) idx_next = multinomial_sample_one_no_sync(probs) return idx_next, probs @torch.jit.script -def spectrogram_torch(y:Tensor, n_fft:int, sampling_rate:int, hop_size:int, win_size:int, center:bool=False): - hann_window = torch.hann_window(win_size,device=y.device,dtype=y.dtype) +def spectrogram_torch(y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False): + hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype) y = torch.nn.functional.pad( y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), @@ -158,6 +160,7 @@ class DictToAttrRecursive(dict): except KeyError: raise AttributeError(f"Attribute {item} not found") + @torch.jit.script class T2SMLP: def __init__(self, w1, b1, w2, b2): @@ -171,23 +174,24 @@ class T2SMLP: x = F.linear(x, self.w2, self.b2) return x + @torch.jit.script class T2SBlock: def __init__( - self, - num_heads: int, - hidden_dim: int, - mlp: T2SMLP, - qkv_w, - qkv_b, - out_w, - out_b, - norm_w1, - norm_b1, - norm_eps1: float, - norm_w2, - norm_b2, - norm_eps2: float, + self, + num_heads: int, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1: float, + norm_w2, + norm_b2, + norm_eps2: float, ): self.num_heads = num_heads self.mlp = mlp @@ -206,22 +210,22 @@ class T2SBlock: self.false = torch.tensor(False, dtype=torch.bool) @torch.jit.ignore - def to_mask(self, x:torch.Tensor, padding_mask:Optional[torch.Tensor]): + def to_mask(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor]): if padding_mask is None: return x - + if padding_mask.dtype == torch.bool: return x.masked_fill(padding_mask, 0) else: return x * padding_mask - - def process_prompt(self, x:torch.Tensor, attn_mask : torch.Tensor, padding_mask:Optional[torch.Tensor]=None): + + def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1) batch_size = q.shape[0] q_len = q.shape[1] kv_len = k.shape[1] - + q = self.to_mask(q, padding_mask) k_cache = self.to_mask(k, padding_mask) v_cache = self.to_mask(v, padding_mask) @@ -232,22 +236,20 @@ class T2SBlock: attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask) - attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim) + attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b) if padding_mask is not None: for i in range(batch_size): # mask = padding_mask[i,:,0] - if self.false.device!= padding_mask.device: + if self.false.device != padding_mask.device: self.false = self.false.to(padding_mask.device) - idx = torch.where(padding_mask[i,:,0]==self.false)[0] - x_item = x[i,idx,:].unsqueeze(0) - attn_item = attn[i,idx,:].unsqueeze(0) + idx = torch.where(padding_mask[i, :, 0] == self.false)[0] + x_item = x[i, idx, :].unsqueeze(0) + attn_item = attn[i, idx, :].unsqueeze(0) x_item = x_item + attn_item - x_item = F.layer_norm( - x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x_item = F.layer_norm(x_item, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x_item = x_item + self.mlp.forward(x_item) x_item = F.layer_norm( x_item, @@ -256,13 +258,11 @@ class T2SBlock: self.norm_b2, self.norm_eps2, ) - x[i,idx,:] = x_item.squeeze(0) + x[i, idx, :] = x_item.squeeze(0) x = self.to_mask(x, padding_mask) else: x = x + attn - x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x = x + self.mlp.forward(x) x = F.layer_norm( x, @@ -272,13 +272,13 @@ class T2SBlock: self.norm_eps2, ) return x, k_cache, v_cache - - def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor): + + def decode_next_token(self, x: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor): q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) k_cache = torch.cat([k_cache, k], dim=1) v_cache = torch.cat([v_cache, v], dim=1) - + batch_size = q.shape[0] q_len = q.shape[1] kv_len = k_cache.shape[1] @@ -289,14 +289,12 @@ class T2SBlock: attn = F.scaled_dot_product_attention(q, k, v) - attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim) + attn = attn.permute(2, 0, 1, 3).reshape(batch_size * q_len, self.hidden_dim) attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0) attn = F.linear(attn, self.out_w, self.out_b) x = x + attn - x = F.layer_norm( - x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) + x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1) x = x + self.mlp.forward(x) x = F.layer_norm( x, @@ -307,48 +305,46 @@ class T2SBlock: ) return x, k_cache, v_cache + @torch.jit.script class T2STransformer: - def __init__(self, num_blocks : int, blocks: list[T2SBlock]): - self.num_blocks : int = num_blocks + def __init__(self, num_blocks: int, blocks: list[T2SBlock]): + self.num_blocks: int = num_blocks self.blocks = blocks - def process_prompt( - self, x:torch.Tensor, attn_mask : torch.Tensor,padding_mask : Optional[torch.Tensor]=None): - k_cache : list[torch.Tensor] = [] - v_cache : list[torch.Tensor] = [] + def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + k_cache: list[torch.Tensor] = [] + v_cache: list[torch.Tensor] = [] for i in range(self.num_blocks): x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask) k_cache.append(k_cache_) v_cache.append(v_cache_) return x, k_cache, v_cache - def decode_next_token( - self, x:torch.Tensor, - k_cache: list[torch.Tensor], - v_cache: list[torch.Tensor]): + def decode_next_token(self, x: torch.Tensor, k_cache: list[torch.Tensor], v_cache: list[torch.Tensor]): for i in range(self.num_blocks): x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i]) return x, k_cache, v_cache + class VitsModel(nn.Module): def __init__(self, vits_path): super().__init__() # dict_s2 = torch.load(vits_path,map_location="cpu") dict_s2 = torch.load(vits_path) self.hps = dict_s2["config"] - if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: self.hps["model"]["version"] = "v1" else: self.hps["model"]["version"] = "v2" - + self.hps = DictToAttrRecursive(self.hps) self.hps.model.semantic_frame_rate = "25hz" self.vq_model = SynthesizerTrn( self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, - **self.hps.model + **self.hps.model, ) self.vq_model.eval() self.vq_model.load_state_dict(dict_s2["weight"], strict=False) @@ -360,12 +356,13 @@ class VitsModel(nn.Module): self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length, - center=False + center=False, ) return self.vq_model(pred_semantic, text_seq, refer, speed)[0, 0] + class T2SModel(nn.Module): - def __init__(self,raw_t2s:Text2SemanticLightningModule): + def __init__(self, raw_t2s: Text2SemanticLightningModule): super(T2SModel, self).__init__() self.model_dim = raw_t2s.model.model_dim self.embedding_dim = raw_t2s.model.embedding_dim @@ -374,7 +371,7 @@ class T2SModel(nn.Module): self.vocab_size = raw_t2s.model.vocab_size self.phoneme_vocab_size = raw_t2s.model.phoneme_vocab_size # self.p_dropout = float(raw_t2s.model.p_dropout) - self.EOS:int = int(raw_t2s.model.EOS) + self.EOS: int = int(raw_t2s.model.EOS) self.norm_first = raw_t2s.model.norm_first assert self.EOS == self.vocab_size - 1 self.hz = 50 @@ -384,7 +381,7 @@ class T2SModel(nn.Module): self.ar_text_position = raw_t2s.model.ar_text_position self.ar_audio_embedding = raw_t2s.model.ar_audio_embedding self.ar_audio_position = raw_t2s.model.ar_audio_position - + # self.t2s_transformer = T2STransformer(self.num_layers, blocks) # self.t2s_transformer = raw_t2s.model.t2s_transformer @@ -393,12 +390,7 @@ class T2SModel(nn.Module): for i in range(self.num_layers): layer = h.layers[i] - t2smlp = T2SMLP( - layer.linear1.weight, - layer.linear1.bias, - layer.linear2.weight, - layer.linear2.bias - ) + t2smlp = T2SMLP(layer.linear1.weight, layer.linear1.bias, layer.linear2.weight, layer.linear2.bias) block = T2SBlock( self.num_head, @@ -413,11 +405,11 @@ class T2SModel(nn.Module): layer.norm1.eps, layer.norm2.weight, layer.norm2.bias, - layer.norm2.eps + layer.norm2.eps, ) blocks.append(block) - + self.t2s_transformer = T2STransformer(self.num_layers, blocks) # self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) @@ -426,20 +418,27 @@ class T2SModel(nn.Module): self.max_sec = raw_t2s.config["data"]["max_sec"] self.top_k = int(raw_t2s.config["inference"]["top_k"]) self.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) - - def forward(self,prompts:LongTensor, ref_seq:LongTensor, text_seq:LongTensor, ref_bert:torch.Tensor, text_bert:torch.Tensor,top_k:LongTensor): + + def forward( + self, + prompts: LongTensor, + ref_seq: LongTensor, + text_seq: LongTensor, + ref_bert: torch.Tensor, + text_bert: torch.Tensor, + top_k: LongTensor, + ): bert = torch.cat([ref_bert.T, text_bert.T], 1) all_phoneme_ids = torch.cat([ref_seq, text_seq], 1) bert = bert.unsqueeze(0) x = self.ar_text_embedding(all_phoneme_ids) x = x + self.bert_proj(bert.transpose(1, 2)) - x:torch.Tensor = self.ar_text_position(x) + x: torch.Tensor = self.ar_text_position(x) early_stop_num = self.early_stop_num - - #[1,N,512] [1,N] + # [1,N,512] [1,N] # y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) y = prompts # x_example = x[:,:,0] * 0.0 @@ -465,15 +464,17 @@ class T2SModel(nn.Module): (x_len, 0), value=False, ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)\ - .unsqueeze(0)\ - .expand(bsz*self.num_head, -1, -1)\ - .view(bsz, self.num_head, src_len, src_len)\ - .to(device=x.device, dtype=torch.bool) - + xy_attn_mask = ( + torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + .unsqueeze(0) + .expand(bsz * self.num_head, -1, -1) + .view(bsz, self.num_head, src_len, src_len) + .to(device=x.device, dtype=torch.bool) + ) + idx = 0 top_k = int(top_k) - + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None) logits = self.ar_predict_layer(xy_dec[:, -1]) @@ -481,23 +482,25 @@ class T2SModel(nn.Module): samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] y = torch.concat([y, samples], dim=1) y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) stop = False # for idx in range(1, 50): for idx in range(1, 1500): - #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] # y, k, v, y_emb, logits, samples = self.stage_decoder(y, k, v, y_emb, x_example) xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) logits = self.ar_predict_layer(xy_dec[:, -1]) - if(idx<11):###至少预测出10个token不然不给停止(0.4s) + if idx < 11: ###至少预测出10个token不然不给停止(0.4s) logits = logits[:, :-1] - + samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0] y = torch.concat([y, samples], dim=1) - + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: stop = True if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: @@ -508,20 +511,22 @@ class T2SModel(nn.Module): break y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) - - y[0,-1] = 0 - + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[ + :, y_len + idx + ].to(dtype=y_emb.dtype, device=y_emb.device) + + y[0, -1] = 0 + return y[:, -idx:].unsqueeze(0) -bert_path = os.environ.get( - "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" -) + +bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" cnhubert.cnhubert_base_path = cnhubert_base_path + @torch.jit.script -def build_phone_level_feature(res:Tensor, word2ph:IntTensor): +def build_phone_level_feature(res: Tensor, word2ph: IntTensor): phone_level_feature = [] for i in range(word2ph.shape[0]): repeat_feature = res[i].repeat(word2ph[i].item(), 1) @@ -530,103 +535,111 @@ def build_phone_level_feature(res:Tensor, word2ph:IntTensor): # [sum(word2ph), 1024] return phone_level_feature + class MyBertModel(torch.nn.Module): def __init__(self, bert_model): super(MyBertModel, self).__init__() self.bert = bert_model - def forward(self, input_ids:torch.Tensor, attention_mask:torch.Tensor, token_type_ids:torch.Tensor, word2ph:IntTensor): + def forward( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor, token_type_ids: torch.Tensor, word2ph: IntTensor + ): outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) # res = torch.cat(outputs["hidden_states"][-3:-2], -1)[0][1:-1] res = torch.cat(outputs[1][-3:-2], -1)[0][1:-1] return build_phone_level_feature(res, word2ph) + class SSLModel(torch.nn.Module): def __init__(self): super().__init__() self.ssl = cnhubert.get_model().model - def forward(self, ref_audio_16k)-> torch.Tensor: + def forward(self, ref_audio_16k) -> torch.Tensor: ssl_content = self.ssl(ref_audio_16k)["last_hidden_state"].transpose(1, 2) return ssl_content + class ExportSSLModel(torch.nn.Module): - def __init__(self,ssl:SSLModel): + def __init__(self, ssl: SSLModel): super().__init__() self.ssl = ssl - def forward(self, ref_audio:torch.Tensor): + def forward(self, ref_audio: torch.Tensor): return self.ssl(ref_audio) - + @torch.jit.export - def resample(self,ref_audio:torch.Tensor,src_sr:int,dst_sr:int)->torch.Tensor: - audio = resamplex(ref_audio,src_sr,dst_sr).float() + def resample(self, ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torch.Tensor: + audio = resamplex(ref_audio, src_sr, dst_sr).float() return audio + def export_bert(output_path): tokenizer = AutoTokenizer.from_pretrained(bert_path) - + text = "叹息声一声接着一声传出,木兰对着房门织布.听不见织布机织布的声音,只听见木兰在叹息.问木兰在想什么?问木兰在惦记什么?木兰答道,我也没有在想什么,也没有在惦记什么." ref_bert_inputs = tokenizer(text, return_tensors="pt") word2ph = [] for c in text: - if c in [',','。',':','?',",",".","?"]: + if c in [",", "。", ":", "?", ",", ".", "?"]: word2ph.append(1) else: word2ph.append(2) - ref_bert_inputs['word2ph'] = torch.Tensor(word2ph).int() + ref_bert_inputs["word2ph"] = torch.Tensor(word2ph).int() - bert_model = AutoModelForMaskedLM.from_pretrained(bert_path,output_hidden_states=True,torchscript=True) + bert_model = AutoModelForMaskedLM.from_pretrained(bert_path, output_hidden_states=True, torchscript=True) my_bert_model = MyBertModel(bert_model) ref_bert_inputs = { - 'input_ids': ref_bert_inputs['input_ids'], - 'attention_mask': ref_bert_inputs['attention_mask'], - 'token_type_ids': ref_bert_inputs['token_type_ids'], - 'word2ph': ref_bert_inputs['word2ph'] + "input_ids": ref_bert_inputs["input_ids"], + "attention_mask": ref_bert_inputs["attention_mask"], + "token_type_ids": ref_bert_inputs["token_type_ids"], + "word2ph": ref_bert_inputs["word2ph"], } - torch._dynamo.mark_dynamic(ref_bert_inputs['input_ids'], 1) - torch._dynamo.mark_dynamic(ref_bert_inputs['attention_mask'], 1) - torch._dynamo.mark_dynamic(ref_bert_inputs['token_type_ids'], 1) - torch._dynamo.mark_dynamic(ref_bert_inputs['word2ph'], 0) + torch._dynamo.mark_dynamic(ref_bert_inputs["input_ids"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["attention_mask"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["token_type_ids"], 1) + torch._dynamo.mark_dynamic(ref_bert_inputs["word2ph"], 0) - my_bert_model = torch.jit.trace(my_bert_model,example_kwarg_inputs=ref_bert_inputs) + my_bert_model = torch.jit.trace(my_bert_model, example_kwarg_inputs=ref_bert_inputs) output_path = os.path.join(output_path, "bert_model.pt") my_bert_model.save(output_path) - print('#### exported bert ####') + print("#### exported bert ####") -def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_bert_and_ssl=False, device='cpu'): + +def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_bert_and_ssl=False, device="cpu"): if not os.path.exists(output_path): os.makedirs(output_path) print(f"目录已创建: {output_path}") else: print(f"目录已存在: {output_path}") - + ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float() ssl = SSLModel() if export_bert_and_ssl: - s = ExportSSLModel(torch.jit.trace(ssl,example_inputs=(ref_audio))) + s = ExportSSLModel(torch.jit.trace(ssl, example_inputs=(ref_audio))) ssl_path = os.path.join(output_path, "ssl_model.pt") torch.jit.script(s).save(ssl_path) - print('#### exported ssl ####') + print("#### exported ssl ####") export_bert(output_path) else: s = ExportSSLModel(ssl) print(f"device: {device}") - - ref_seq_id,ref_bert_T,ref_norm_text = get_phones_and_bert(ref_text,"all_zh",'v2') + ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2") ref_seq = torch.LongTensor([ref_seq_id]).to(device) ref_bert = ref_bert_T.T.to(ref_seq.device) - text_seq_id,text_bert_T,norm_text = get_phones_and_bert("这是一条测试语音,说什么无所谓,只是给它一个例子","all_zh",'v2') + text_seq_id, text_bert_T, norm_text = get_phones_and_bert( + "这是一条测试语音,说什么无所谓,只是给它一个例子", "all_zh", "v2" + ) text_seq = torch.LongTensor([text_seq_id]).to(device) text_bert = text_bert_T.T.to(text_seq.device) ssl_content = ssl(ref_audio).to(device) - # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" + # vits_path = "SoVITS_weights_v2/xw_e8_s216.pth" vits = VitsModel(vits_path).to(device) vits.eval() @@ -634,18 +647,18 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be # dict_s1 = torch.load(gpt_path, map_location=device) dict_s1 = torch.load(gpt_path) raw_t2s = get_raw_t2s_model(dict_s1).to(device) - print('#### get_raw_t2s_model ####') + print("#### get_raw_t2s_model ####") print(raw_t2s.config) t2s_m = T2SModel(raw_t2s) t2s_m.eval() t2s = torch.jit.script(t2s_m).to(device) - print('#### script t2s_m ####') - - print("vits.hps.data.sampling_rate:",vits.hps.data.sampling_rate) - gpt_sovits = GPT_SoVITS(t2s,vits).to(device) + print("#### script t2s_m ####") + + print("vits.hps.data.sampling_rate:", vits.hps.data.sampling_rate) + gpt_sovits = GPT_SoVITS(t2s, vits).to(device) gpt_sovits.eval() - - ref_audio_sr = s.resample(ref_audio,16000,32000).to(device) + + ref_audio_sr = s.resample(ref_audio, 16000, 32000).to(device) torch._dynamo.mark_dynamic(ssl_content, 2) torch._dynamo.mark_dynamic(ref_audio_sr, 1) @@ -658,32 +671,28 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be with torch.no_grad(): gpt_sovits_export = torch.jit.trace( - gpt_sovits, - example_inputs=( - ssl_content, - ref_audio_sr, - ref_seq, - text_seq, - ref_bert, - text_bert, - top_k)) - + gpt_sovits, example_inputs=(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, text_bert, top_k) + ) + gpt_sovits_path = os.path.join(output_path, "gpt_sovits_model.pt") gpt_sovits_export.save(gpt_sovits_path) - print('#### exported gpt_sovits ####') + print("#### exported gpt_sovits ####") + @torch.jit.script def parse_audio(ref_audio): - ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float()#.to(ref_audio.device) - ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,32000).float()#.to(ref_audio.device) - return ref_audio_16k,ref_audio_sr + ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() # .to(ref_audio.device) + ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, 32000).float() # .to(ref_audio.device) + return ref_audio_16k, ref_audio_sr + @torch.jit.script -def resamplex(ref_audio:torch.Tensor,src_sr:int,dst_sr:int)->torch.Tensor: - return torchaudio.functional.resample(ref_audio,src_sr,dst_sr).float() +def resamplex(ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torch.Tensor: + return torchaudio.functional.resample(ref_audio, src_sr, dst_sr).float() + class GPT_SoVITS(nn.Module): - def __init__(self, t2s:T2SModel,vits:VitsModel): + def __init__(self, t2s: T2SModel, vits: VitsModel): super().__init__() self.t2s = t2s self.vits = vits @@ -710,12 +719,11 @@ class GPT_SoVITS(nn.Module): def test(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--output_path', required=True, help="Path to the output directory") - + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument("--output_path", required=True, help="Path to the output directory") args = parser.parse_args() gpt_path = args.gpt_model @@ -726,7 +734,7 @@ def test(): tokenizer = AutoTokenizer.from_pretrained(bert_path) # bert_model = AutoModelForMaskedLM.from_pretrained(bert_path,output_hidden_states=True,torchscript=True) # bert = MyBertModel(bert_model) - my_bert = torch.jit.load("onnx/bert_model.pt",map_location='cuda') + my_bert = torch.jit.load("onnx/bert_model.pt", map_location="cuda") # dict_s1 = torch.load(gpt_path, map_location="cuda") # raw_t2s = get_raw_t2s_model(dict_s1) @@ -740,95 +748,97 @@ def test(): # ssl = ExportSSLModel(SSLModel()).to('cuda') # ssl.eval() - ssl = torch.jit.load("onnx/by/ssl_model.pt",map_location='cuda') + ssl = torch.jit.load("onnx/by/ssl_model.pt", map_location="cuda") # gpt_sovits = GPT_SoVITS(t2s,vits) - gpt_sovits = torch.jit.load("onnx/by/gpt_sovits_model.pt",map_location='cuda') + gpt_sovits = torch.jit.load("onnx/by/gpt_sovits_model.pt", map_location="cuda") - ref_seq_id,ref_bert_T,ref_norm_text = get_phones_and_bert(ref_text,"all_zh",'v2') + ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2") ref_seq = torch.LongTensor([ref_seq_id]) ref_bert = ref_bert_T.T.to(ref_seq.device) # text_seq_id,text_bert_T,norm_text = get_phones_and_bert("昨天晚上看见征兵文书,知道君主在大规模征兵,那么多卷征兵文册,每一卷上都有父亲的名字.","all_zh",'v2') text = "昨天晚上看见征兵文书,知道君主在大规模征兵,那么多卷征兵文册,每一卷上都有父亲的名字." - text_seq_id,text_bert_T,norm_text = get_phones_and_bert(text,"all_zh",'v2') - + text_seq_id, text_bert_T, norm_text = get_phones_and_bert(text, "all_zh", "v2") + test_bert = tokenizer(text, return_tensors="pt") word2ph = [] for c in text: - if c in [',','。',':','?',"?",",","."]: + if c in [",", "。", ":", "?", "?", ",", "."]: word2ph.append(1) else: word2ph.append(2) - test_bert['word2ph'] = torch.Tensor(word2ph).int() + test_bert["word2ph"] = torch.Tensor(word2ph).int() test_bert = my_bert( - test_bert['input_ids'].to('cuda'), - test_bert['attention_mask'].to('cuda'), - test_bert['token_type_ids'].to('cuda'), - test_bert['word2ph'].to('cuda') + test_bert["input_ids"].to("cuda"), + test_bert["attention_mask"].to("cuda"), + test_bert["token_type_ids"].to("cuda"), + test_bert["word2ph"].to("cuda"), ) - + text_seq = torch.LongTensor([text_seq_id]) text_bert = text_bert_T.T.to(text_seq.device) - print('text_bert:',text_bert.shape,text_bert) - print('test_bert:',test_bert.shape,test_bert) - print(torch.allclose(text_bert.to('cuda'),test_bert)) + print("text_bert:", text_bert.shape, text_bert) + print("test_bert:", test_bert.shape, test_bert) + print(torch.allclose(text_bert.to("cuda"), test_bert)) - print('text_seq:',text_seq.shape) - print('text_bert:',text_bert.shape,text_bert.type()) + print("text_seq:", text_seq.shape) + print("text_bert:", text_bert.shape, text_bert.type()) - #[1,N] - ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float().to('cuda') - print('ref_audio:',ref_audio.shape) - - ref_audio_sr = ssl.resample(ref_audio,16000,32000) - print('start ssl') + # [1,N] + ref_audio = torch.tensor([load_audio(ref_audio_path, 16000)]).float().to("cuda") + print("ref_audio:", ref_audio.shape) + + ref_audio_sr = ssl.resample(ref_audio, 16000, 32000) + print("start ssl") ssl_content = ssl(ref_audio) - print('start gpt_sovits:') - print('ssl_content:',ssl_content.shape) - print('ref_audio_sr:',ref_audio_sr.shape) - print('ref_seq:',ref_seq.shape) - ref_seq=ref_seq.to('cuda') - print('text_seq:',text_seq.shape) - text_seq=text_seq.to('cuda') - print('ref_bert:',ref_bert.shape) - ref_bert=ref_bert.to('cuda') - print('text_bert:',text_bert.shape) - text_bert=text_bert.to('cuda') + print("start gpt_sovits:") + print("ssl_content:", ssl_content.shape) + print("ref_audio_sr:", ref_audio_sr.shape) + print("ref_seq:", ref_seq.shape) + ref_seq = ref_seq.to("cuda") + print("text_seq:", text_seq.shape) + text_seq = text_seq.to("cuda") + print("ref_bert:", ref_bert.shape) + ref_bert = ref_bert.to("cuda") + print("text_bert:", text_bert.shape) + text_bert = text_bert.to("cuda") - top_k = torch.LongTensor([5]).to('cuda') + top_k = torch.LongTensor([5]).to("cuda") with torch.no_grad(): audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert, top_k) - print('start write wav') + print("start write wav") soundfile.write("out.wav", audio.detach().cpu().numpy(), 32000) import text import json -def export_symbel(version='v2'): - if version=='v1': + +def export_symbel(version="v2"): + if version == "v1": symbols = text._symbol_to_id_v1 - with open(f"onnx/symbols_v1.json", "w") as file: + with open("onnx/symbols_v1.json", "w") as file: json.dump(symbols, file, indent=4) else: symbols = text._symbol_to_id_v2 - with open(f"onnx/symbols_v2.json", "w") as file: + with open("onnx/symbols_v2.json", "w") as file: json.dump(symbols, file, indent=4) + def main(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--output_path', required=True, help="Path to the output directory") - parser.add_argument('--export_common_model', action='store_true', help="Export Bert and SSL model") - parser.add_argument('--device', help="Device to use") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument("--output_path", required=True, help="Path to the output directory") + parser.add_argument("--export_common_model", action="store_true", help="Export Bert and SSL model") + parser.add_argument("--device", help="Device to use") args = parser.parse_args() export( @@ -841,9 +851,11 @@ def main(): export_bert_and_ssl=args.export_common_model, ) + import inference_webui + if __name__ == "__main__": - inference_webui.is_half=False - inference_webui.dtype=torch.float32 + inference_webui.is_half = False + inference_webui.dtype = torch.float32 main() # test() diff --git a/GPT_SoVITS/export_torch_script_v3.py b/GPT_SoVITS/export_torch_script_v3.py index 8b73d30..b34495a 100644 --- a/GPT_SoVITS/export_torch_script_v3.py +++ b/GPT_SoVITS/export_torch_script_v3.py @@ -6,16 +6,16 @@ from export_torch_script import ( spectrogram_torch, ) from f5_tts.model.backbones.dit import DiT -from feature_extractor import cnhubert from inference_webui import get_phones_and_bert import librosa from module import commons -from module.mel_processing import mel_spectrogram_torch, spectral_normalize_torch +from module.mel_processing import mel_spectrogram_torch from module.models_onnx import CFM, SynthesizerTrnV3 import numpy as np import torch._dynamo.config import torchaudio -import logging, uvicorn +import logging +import uvicorn import torch import soundfile from librosa.filters import mel as librosa_mel_fn @@ -32,7 +32,6 @@ now_dir = os.getcwd() class MelSpectrgram(torch.nn.Module): - def __init__( self, dtype, @@ -48,14 +47,12 @@ class MelSpectrgram(torch.nn.Module): ): super().__init__() self.hann_window = torch.hann_window(1024).to(device=device, dtype=dtype) - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device) - self.n_fft:int = n_fft - self.hop_size:int = hop_size - self.win_size:int = win_size - self.center:bool = center + self.n_fft: int = n_fft + self.hop_size: int = hop_size + self.win_size: int = win_size + self.center: bool = center def forward(self, y): y = torch.nn.functional.pad( @@ -172,9 +169,7 @@ class ExportCFM(torch.nn.Module): ): T_min = fea_ref.size(2) fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) - cfm_res = self.cfm( - fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps - ) + cfm_res = self.cfm(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps) cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] fea_ref = fea_todo_chunk[:, :, -T_min:] @@ -198,6 +193,7 @@ mel_fn = lambda x: mel_spectrogram_torch( spec_min = -12 spec_max = 2 + @torch.jit.script def norm_spec(x): spec_min = -12 @@ -212,7 +208,6 @@ def denorm_spec(x): class ExportGPTSovitsHalf(torch.nn.Module): - def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3): super().__init__() self.hps = hps @@ -231,15 +226,15 @@ class ExportGPTSovitsHalf(torch.nn.Module): center=False, ) # self.dtype = dtype - self.filter_length:int = hps.data.filter_length - self.sampling_rate:int = hps.data.sampling_rate - self.hop_length:int = hps.data.hop_length - self.win_length:int = hps.data.win_length + self.filter_length: int = hps.data.filter_length + self.sampling_rate: int = hps.data.sampling_rate + self.hop_length: int = hps.data.hop_length + self.win_length: int = hps.data.win_length def forward( self, ssl_content, - ref_audio_32k:torch.FloatTensor, + ref_audio_32k: torch.FloatTensor, phoneme_ids0, phoneme_ids1, bert1, @@ -255,21 +250,17 @@ class ExportGPTSovitsHalf(torch.nn.Module): center=False, ).to(ssl_content.dtype) - codes = self.vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0) # print('extract_latent',codes.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - pred_semantic = self.t2s_m( - prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k - ) + pred_semantic = self.t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) # print('t2s_m',pred_semantic.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - ge = self.vq_model.create_ge(refer) # print('create_ge',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - + prompt_ = prompt.unsqueeze(0) fea_ref = self.vq_model(prompt_, phoneme_ids0, ge) # print('fea_ref',datetime.now().strftime("%Y-%m-%d %H:%M:%S")) @@ -293,6 +284,7 @@ class ExportGPTSovitsHalf(torch.nn.Module): return fea_ref, fea_todo, mel2 + class GPTSoVITSV3(torch.nn.Module): def __init__(self, gpt_sovits_half, cfm, bigvgan): super().__init__() @@ -303,9 +295,9 @@ class GPTSoVITSV3(torch.nn.Module): def forward( self, ssl_content, - ref_audio_32k:torch.FloatTensor, - phoneme_ids0:torch.LongTensor, - phoneme_ids1:torch.LongTensor, + ref_audio_32k: torch.FloatTensor, + phoneme_ids0: torch.LongTensor, + phoneme_ids1: torch.LongTensor, bert1, bert2, top_k: torch.LongTensor, @@ -313,7 +305,9 @@ class GPTSoVITSV3(torch.nn.Module): ): # current_time = datetime.now() # print("gpt_sovits_half",current_time.strftime("%Y-%m-%d %H:%M:%S")) - fea_ref, fea_todo, mel2 = self.gpt_sovits_half(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + fea_ref, fea_todo, mel2 = self.gpt_sovits_half( + ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) chunk_len = 934 - fea_ref.shape[2] wav_gen_list = [] idx = 0 @@ -331,7 +325,13 @@ class GPTSoVITSV3(torch.nn.Module): # 经过 bigvgan 之后音频长度就是 fea_todo.shape[2] * 256 complete_len = chunk_len - fea_todo_chunk.shape[-1] if complete_len != 0: - fea_todo_chunk = torch.cat([fea_todo_chunk, torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype)], 2) + fea_todo_chunk = torch.cat( + [ + fea_todo_chunk, + torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype), + ], + 2, + ) cfm_res, fea_ref, mel2 = self.cfm(fea_ref, fea_todo_chunk, mel2, sample_steps) idx += chunk_len @@ -339,17 +339,17 @@ class GPTSoVITSV3(torch.nn.Module): cfm_res = denorm_spec(cfm_res) bigvgan_res = self.bigvgan(cfm_res) wav_gen_list.append(bigvgan_res) - + wav_gen = torch.cat(wav_gen_list, 2) return wav_gen[0][0][:wav_gen_length] + def init_bigvgan(): global bigvgan_model from BigVGAN import bigvgan bigvgan_model = bigvgan.BigVGAN.from_pretrained( - "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" - % (now_dir,), + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False, ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode @@ -467,10 +467,7 @@ def export_cfm( cfm = e_cfm.cfm B, T = mu.size(0), mu.size(1) - x = ( - torch.randn([B, cfm.in_channels, T], device=mu.device, dtype=mu.dtype) - * temperature - ) + x = torch.randn([B, cfm.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature print("x:", x.shape, x.dtype) prompt_len = prompt.size(-1) prompt_x = torch.zeros_like(x, dtype=mu.dtype) @@ -565,11 +562,7 @@ def export(): wav16k = wav16k.to(device) zero_wav_torch = zero_wav_torch.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() codes = sovits.vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0).to(device) @@ -626,10 +619,7 @@ def export(): "create_ge": refer, } - - trace_vq_model = torch.jit.trace_module( - sovits.vq_model, inputs, optimize=True - ) + trace_vq_model = torch.jit.trace_module(sovits.vq_model, inputs, optimize=True) trace_vq_model.save("onnx/ad/vq_model.pt") print(fea_ref.shape, fea_ref.dtype, ge.shape) @@ -714,9 +704,7 @@ def export(): idx += chunk_len - cfm_res, fea_ref, mel2 = export_cfm_( - fea_ref, fea_todo_chunk, mel2, sample_steps - ) + cfm_res, fea_ref, mel2 = export_cfm_(fea_ref, fea_todo_chunk, mel2, sample_steps) cfm_resss.append(cfm_res) continue @@ -726,9 +714,7 @@ def export(): with torch.inference_mode(): cmf_res_rand = torch.randn(1, 100, 934).to(device).to(dtype) torch._dynamo.mark_dynamic(cmf_res_rand, 2) - bigvgan_model_ = torch.jit.trace( - bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,) - ) + bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,)) bigvgan_model_.save("onnx/ad/bigvgan_model.pt") wav_gen = bigvgan_model(cmf_res) print("wav_gen:", wav_gen.shape, wav_gen.dtype) @@ -748,7 +734,6 @@ def test_export( bigvgan, output, ): - # hps = sovits.hps ref_wav_path = "onnx/ad/ref.wav" speed = 1.0 @@ -773,13 +758,9 @@ def test_export( wav16k = wav16k.to(device) zero_wav_torch = zero_wav_torch.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() - ref_audio_32k,_ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k, _ = librosa.load(ref_wav_path, sr=32000) ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() phones1, bert1, norm_text1 = get_phones_and_bert( @@ -799,8 +780,18 @@ def test_export( current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") logger.info("start inference %s", current_time) - print(ssl_content.shape, ref_audio_32k.shape, phoneme_ids0.shape, phoneme_ids1.shape, bert1.shape, bert2.shape, top_k.shape) - fea_ref, fea_todo, mel2 = gpt_sovits_v3_half(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k) + print( + ssl_content.shape, + ref_audio_32k.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) + fea_ref, fea_todo, mel2 = gpt_sovits_v3_half( + ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k + ) chunk_len = 934 - fea_ref.shape[2] print(fea_ref.shape, fea_todo.shape, mel2.shape) @@ -812,7 +803,6 @@ def test_export( wav_gen_length = fea_todo.shape[2] * 256 while 1: - current_time = datetime.now() print("idx:", idx, current_time.strftime("%Y-%m-%d %H:%M:%S")) fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] @@ -861,7 +851,6 @@ def test_export1( gpt_sovits_v3, output, ): - # hps = sovits.hps ref_wav_path = "onnx/ad/ref.wav" speed = 1.0 @@ -886,14 +875,10 @@ def test_export1( wav16k = wav16k.to(device) zero_wav_torch = zero_wav_torch.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() print("ssl_content:", ssl_content.shape, ssl_content.dtype) - ref_audio_32k,_ = librosa.load(ref_wav_path, sr=32000) + ref_audio_32k, _ = librosa.load(ref_wav_path, sr=32000) ref_audio_32k = torch.from_numpy(ref_audio_32k).unsqueeze(0).to(device).float() phones1, bert1, norm_text1 = get_phones_and_bert( @@ -913,11 +898,19 @@ def test_export1( current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") logger.info("start inference %s", current_time) - print(ssl_content.shape, ref_audio_32k.shape, phoneme_ids0.shape, phoneme_ids1.shape, bert1.shape, bert2.shape, top_k.shape) + print( + ssl_content.shape, + ref_audio_32k.shape, + phoneme_ids0.shape, + phoneme_ids1.shape, + bert1.shape, + bert2.shape, + top_k.shape, + ) wav_gen = gpt_sovits_v3(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps) print("wav_gen:", wav_gen.shape, wav_gen.dtype) - wav_gen = torch.cat([wav_gen,zero_wav_torch],0) + wav_gen = torch.cat([wav_gen, zero_wav_torch], 0) audio = wav_gen.cpu().detach().numpy() logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) @@ -929,20 +922,19 @@ import time def test_(): - sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth") # cfm = ExportCFM(sovits.cfm) # cfm.cfm.estimator = dit sovits.cfm = None - + cfm = torch.jit.load("onnx/ad/cfm.pt", map_location=device) # cfm = torch.jit.optimize_for_inference(cfm) cfm = cfm.half().to(device) - + cfm.eval() - logger.info(f"cfm ok") + logger.info("cfm ok") dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt") # v2 的 gpt 也可以用 @@ -957,17 +949,14 @@ def test_(): t2s_m = torch.jit.script(t2s_m) t2s_m.eval() # t2s_m.top_k = 15 - logger.info(f"t2s_m ok") + logger.info("t2s_m ok") - - vq_model: torch.jit.ScriptModule = torch.jit.load( - "onnx/ad/vq_model.pt", map_location=device - ) + vq_model: torch.jit.ScriptModule = torch.jit.load("onnx/ad/vq_model.pt", map_location=device) # vq_model = torch.jit.optimize_for_inference(vq_model) # vq_model = vq_model.half().to(device) vq_model.eval() # vq_model = sovits.vq_model - logger.info(f"vq_model ok") + logger.info("vq_model ok") # gpt_sovits_v3_half = torch.jit.load("onnx/ad/gpt_sovits_v3_half.pt") # gpt_sovits_v3_half = torch.jit.optimize_for_inference(gpt_sovits_v3_half) @@ -975,7 +964,7 @@ def test_(): # gpt_sovits_v3_half = gpt_sovits_v3_half.cuda() # gpt_sovits_v3_half.eval() gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model) - logger.info(f"gpt_sovits_v3_half ok") + logger.info("gpt_sovits_v3_half ok") # init_bigvgan() # global bigvgan_model @@ -985,7 +974,7 @@ def test_(): bigvgan_model = bigvgan_model.cuda() bigvgan_model.eval() - logger.info(f"bigvgan ok") + logger.info("bigvgan ok") gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model) gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3) @@ -1020,8 +1009,9 @@ def test_(): # "out2.wav", # ) + def test_export_gpt_sovits_v3(): - gpt_sovits_v3 = torch.jit.load("onnx/ad/gpt_sovits_v3.pt",map_location=device) + gpt_sovits_v3 = torch.jit.load("onnx/ad/gpt_sovits_v3.pt", map_location=device) # test_export1( # "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....", # gpt_sovits_v3, diff --git a/GPT_SoVITS/f5_tts/model/backbones/dit.py b/GPT_SoVITS/f5_tts/model/backbones/dit.py index ac32fa5..7d98a85 100644 --- a/GPT_SoVITS/f5_tts/model/backbones/dit.py +++ b/GPT_SoVITS/f5_tts/model/backbones/dit.py @@ -11,7 +11,6 @@ from __future__ import annotations import torch from torch import nn -import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from x_transformers.x_transformers import RotaryEmbedding @@ -28,6 +27,7 @@ from GPT_SoVITS.f5_tts.model.modules import ( from module.commons import sequence_mask + class TextEmbedding(nn.Module): def __init__(self, text_dim, conv_layers=0, conv_mult=2): super().__init__() @@ -130,26 +130,24 @@ class DiT(nn.Module): return ckpt_forward - def forward(#x, prompt_x, x_lens, t, style,cond - self,#d is channel,n is T + def forward( # x, prompt_x, x_lens, t, style,cond + self, # d is channel,n is T x0: float["b n d"], # nosied input audio # noqa: F722 cond0: float["b n d"], # masked cond audio # noqa: F722 x_lens, time: float["b"] | float[""], # time step # noqa: F821 F722 - dt_base_bootstrap, + dt_base_bootstrap, text0, # : int["b nt"] # noqa: F722#####condition feature use_grad_ckpt=False, # bool ###no-use drop_audio_cond=False, # cfg for cond audio drop_text=False, # cfg for text # mask: bool["b n"] | None = None, # noqa: F722 - ): - - x=x0.transpose(2,1) - cond=cond0.transpose(2,1) - text=text0.transpose(2,1) - mask = sequence_mask(x_lens,max_length=x.size(1)).to(x.device) + x = x0.transpose(2, 1) + cond = cond0.transpose(2, 1) + text = text0.transpose(2, 1) + mask = sequence_mask(x_lens, max_length=x.size(1)).to(x.device) batch, seq_len = x.shape[0], x.shape[1] if time.ndim == 0: @@ -158,8 +156,8 @@ class DiT(nn.Module): # t: conditioning time, c: context (text + masked cond audio), x: noised input audio t = self.time_embed(time) dt = self.d_embed(dt_base_bootstrap) - t+=dt - text_embed = self.text_embed(text, seq_len, drop_text=drop_text)###need to change + t += dt + text_embed = self.text_embed(text, seq_len, drop_text=drop_text) ###need to change x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond) rope = self.rotary_embed.forward_from_seq_len(seq_len) @@ -179,4 +177,4 @@ class DiT(nn.Module): x = self.norm_out(x, t) output = self.proj_out(x) - return output \ No newline at end of file + return output diff --git a/GPT_SoVITS/f5_tts/model/modules.py b/GPT_SoVITS/f5_tts/model/modules.py index 5f6f5cf..9f030d9 100644 --- a/GPT_SoVITS/f5_tts/model/modules.py +++ b/GPT_SoVITS/f5_tts/model/modules.py @@ -391,6 +391,7 @@ class Attention(nn.Module): # Attention processor + # from torch.nn.attention import SDPBackend # torch.backends.cuda.enable_flash_sdp(True) class AttnProcessor: @@ -545,6 +546,7 @@ class JointAttnProcessor: # DiT Block + class DiTBlock(nn.Module): def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1): super().__init__() diff --git a/GPT_SoVITS/feature_extractor/__init__.py b/GPT_SoVITS/feature_extractor/__init__.py index 79aa929..01ef5dd 100644 --- a/GPT_SoVITS/feature_extractor/__init__.py +++ b/GPT_SoVITS/feature_extractor/__init__.py @@ -1,6 +1,3 @@ from . import cnhubert, whisper_enc -content_module_map = { - 'cnhubert': cnhubert, - 'whisper': whisper_enc -} \ No newline at end of file +content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc} diff --git a/GPT_SoVITS/feature_extractor/cnhubert.py b/GPT_SoVITS/feature_extractor/cnhubert.py index 013e462..f22b8d0 100644 --- a/GPT_SoVITS/feature_extractor/cnhubert.py +++ b/GPT_SoVITS/feature_extractor/cnhubert.py @@ -1,14 +1,11 @@ -import time - -import librosa import torch -import torch.nn.functional as F -import soundfile as sf import os from transformers import logging as tf_logging + tf_logging.set_verbosity_error() import logging + logging.getLogger("numba").setLevel(logging.WARNING) from transformers import ( @@ -23,21 +20,19 @@ cnhubert_base_path = None class CNHubert(nn.Module): - def __init__(self, base_path:str=None): + def __init__(self, base_path: str = None): super().__init__() if base_path is None: base_path = cnhubert_base_path - if os.path.exists(base_path):... - else:raise FileNotFoundError(base_path) + if os.path.exists(base_path): + ... + else: + raise FileNotFoundError(base_path) self.model = HubertModel.from_pretrained(base_path, local_files_only=True) - self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - base_path, local_files_only=True - ) + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True) def forward(self, x): - input_values = self.feature_extractor( - x, return_tensors="pt", sampling_rate=16000 - ).input_values.to(x.device) + input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) feats = self.model(input_values)["last_hidden_state"] return feats diff --git a/GPT_SoVITS/feature_extractor/whisper_enc.py b/GPT_SoVITS/feature_extractor/whisper_enc.py index 983c3e4..260539b 100644 --- a/GPT_SoVITS/feature_extractor/whisper_enc.py +++ b/GPT_SoVITS/feature_extractor/whisper_enc.py @@ -19,7 +19,5 @@ def get_content(model=None, wav_16k_tensor=None): feature_len = mel.shape[-1] // 2 assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" with torch.no_grad(): - feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ - :1, :feature_len, : - ].transpose(1, 2) + feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2) return feature diff --git a/GPT_SoVITS/inference_cli.py b/GPT_SoVITS/inference_cli.py index bd987aa..459a3d3 100644 --- a/GPT_SoVITS/inference_cli.py +++ b/GPT_SoVITS/inference_cli.py @@ -7,13 +7,23 @@ from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights i18n = I18nAuto() -def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path): + +def synthesize( + GPT_model_path, + SoVITS_model_path, + ref_audio_path, + ref_text_path, + ref_language, + target_text_path, + target_language, + output_path, +): # Read reference text - with open(ref_text_path, 'r', encoding='utf-8') as file: + with open(ref_text_path, "r", encoding="utf-8") as file: ref_text = file.read() # Read target text - with open(target_text_path, 'r', encoding='utf-8') as file: + with open(target_text_path, "r", encoding="utf-8") as file: target_text = file.read() # Change model weights @@ -21,12 +31,16 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, change_sovits_weights(sovits_path=SoVITS_model_path) # Synthesize audio - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=i18n(ref_language), - text=target_text, - text_language=i18n(target_language), top_p=1, temperature=1) - + synthesis_result = get_tts_wav( + ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=i18n(ref_language), + text=target_text, + text_language=i18n(target_language), + top_p=1, + temperature=1, + ) + result_list = list(synthesis_result) if result_list: @@ -35,21 +49,38 @@ def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, sf.write(output_wav_path, last_audio_data, last_sampling_rate) print(f"Audio saved to {output_wav_path}") + def main(): parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--ref_language', required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio") - parser.add_argument('--target_text', required=True, help="Path to the target text file") - parser.add_argument('--target_language', required=True, choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], help="Language of the target text") - parser.add_argument('--output_path', required=True, help="Path to the output directory") + parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file") + parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") + parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") + parser.add_argument("--ref_text", required=True, help="Path to the reference text file") + parser.add_argument( + "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" + ) + parser.add_argument("--target_text", required=True, help="Path to the target text file") + parser.add_argument( + "--target_language", + required=True, + choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], + help="Language of the target text", + ) + parser.add_argument("--output_path", required=True, help="Path to the output directory") args = parser.parse_args() - synthesize(args.gpt_model, args.sovits_model, args.ref_audio, args.ref_text, args.ref_language, args.target_text, args.target_language, args.output_path) + synthesize( + args.gpt_model, + args.sovits_model, + args.ref_audio, + args.ref_text, + args.ref_language, + args.target_text, + args.target_language, + args.output_path, + ) -if __name__ == '__main__': + +if __name__ == "__main__": main() - diff --git a/GPT_SoVITS/inference_gui.py b/GPT_SoVITS/inference_gui.py index 2059155..379f7fa 100644 --- a/GPT_SoVITS/inference_gui.py +++ b/GPT_SoVITS/inference_gui.py @@ -6,6 +6,7 @@ from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QSta import soundfile as sf from tools.i18n.i18n import I18nAuto + i18n = I18nAuto() from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav @@ -18,7 +19,7 @@ class GPTSoVITSGUI(QMainWindow): def __init__(self): super().__init__() - self.setWindowTitle('GPT-SoVITS GUI') + self.setWindowTitle("GPT-SoVITS GUI") self.setGeometry(800, 450, 950, 850) self.setStyleSheet(""" @@ -61,11 +62,12 @@ class GPTSoVITSGUI(QMainWindow): border: 1px solid #45a049; box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); } - """) + """) license_text = ( - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " - "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " + "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." + ) license_label = QLabel(license_text) license_label.setWordWrap(True) @@ -124,14 +126,16 @@ class GPTSoVITSGUI(QMainWindow): self.output_text = QTextEdit() self.output_text.setReadOnly(True) - self.add_drag_drop_events([ - self.GPT_model_input, - self.SoVITS_model_input, - self.ref_audio_input, - self.ref_text_input, - self.target_text_input, - self.output_input, - ]) + self.add_drag_drop_events( + [ + self.GPT_model_input, + self.SoVITS_model_input, + self.ref_audio_input, + self.ref_text_input, + self.target_text_input, + self.output_input, + ] + ) self.synthesize_button = QPushButton("合成") self.synthesize_button.clicked.connect(self.synthesize) @@ -235,14 +239,14 @@ class GPTSoVITSGUI(QMainWindow): def upload_ref_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") if file_path: - with open(file_path, 'r', encoding='utf-8') as file: + with open(file_path, "r", encoding="utf-8") as file: content = file.read() self.ref_text_input.setText(content) def upload_target_text(self): file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") if file_path: - with open(file_path, 'r', encoding='utf-8') as file: + with open(file_path, "r", encoding="utf-8") as file: content = file.read() self.target_text_input.setText(content) @@ -284,17 +288,19 @@ class GPTSoVITSGUI(QMainWindow): change_sovits_weights(sovits_path=SoVITS_model_path) self.SoVITS_Path = SoVITS_model_path - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=language_combobox, - text=target_text, - text_language=target_language_combobox) + synthesis_result = get_tts_wav( + ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=language_combobox, + text=target_text, + text_language=target_language_combobox, + ) result_list = list(synthesis_result) if result_list: last_sampling_rate, last_audio_data = result_list[-1] - output_wav_path = os.path.join(output_path, "output.wav") + output_wav_path = os.path.join(output_path, "output.wav") sf.write(output_wav_path, last_audio_data, last_sampling_rate) result = "Audio saved to " + output_wav_path @@ -303,8 +309,8 @@ class GPTSoVITSGUI(QMainWindow): self.output_text.append("处理结果:\n" + result) -if __name__ == '__main__': +if __name__ == "__main__": app = QApplication(sys.argv) mainWin = GPTSoVITSGUI() mainWin.show() - sys.exit(app.exec_()) \ No newline at end of file + sys.exit(app.exec_()) diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index dd9086f..3f9750a 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -1,13 +1,18 @@ -''' +""" 按中英混合识别 按日英混合识别 多语种启动切分识别语种 全部按中文识别 全部按英文识别 全部按日文识别 -''' +""" + import logging -import traceback,torchaudio,warnings +import traceback +import warnings + +import torchaudio + logging.getLogger("markdown_it").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.ERROR) logging.getLogger("httpcore").setLevel(logging.ERROR) @@ -16,59 +21,68 @@ logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR) -warnings.simplefilter(action='ignore', category=FutureWarning) +warnings.simplefilter(action="ignore", category=FutureWarning) + +import json +import os +import re +import sys -import os, re, sys, json -import pdb import torch from text.LangSegmenter import LangSegmenter try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... -version=model_version=os.environ.get("version","v2") -path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" -is_exist_s2gv3=os.path.exists(path_sovits_v3) -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",path_sovits_v3] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] + + analytics.version_check = lambda: None +except: + ... +version = model_version = os.environ.get("version", "v2") +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +is_exist_s2gv3 = os.path.exists(path_sovits_v3) +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + path_sovits_v3, +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] - -_ =[[],[]] +_ = [[], []] for i in range(3): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) -pretrained_gpt_name,pretrained_sovits_name = _ + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ -if os.path.exists(f"./weight.json"): +if os.path.exists("./weight.json"): pass else: - with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file) + with open("./weight.json", "w", encoding="utf-8") as file: + json.dump({"GPT": {}, "SoVITS": {}}, file) -with open(f"./weight.json", 'r', encoding="utf-8") as file: +with open("./weight.json", "r", encoding="utf-8") as file: weight_data = file.read() - weight_data=json.loads(weight_data) - gpt_path = os.environ.get( - "gpt_path", weight_data.get('GPT',{}).get(version,pretrained_gpt_name)) - sovits_path = os.environ.get( - "sovits_path", weight_data.get('SoVITS',{}).get(version,pretrained_sovits_name)) - if isinstance(gpt_path,list): + weight_data = json.loads(weight_data) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) + if isinstance(gpt_path, list): gpt_path = gpt_path[0] - if isinstance(sovits_path,list): + if isinstance(sovits_path, list): sovits_path = sovits_path[0] # gpt_path = os.environ.get( # "gpt_path", pretrained_gpt_name # ) # sovits_path = os.environ.get("sovits_path", pretrained_sovits_name) -cnhubert_base_path = os.environ.get( - "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base" -) -bert_path = os.environ.get( - "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" -) +cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base") +bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large") infer_ttswebui = os.environ.get("infer_ttswebui", 9872) infer_ttswebui = int(infer_ttswebui) is_share = os.environ.get("is_share", "False") @@ -77,18 +91,20 @@ if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() # is_half=False -punctuation = set(['!', '?', '…', ',', '.', '-'," "]) +punctuation = set(["!", "?", "…", ",", ".", "-", " "]) import gradio as gr -from transformers import AutoModelForMaskedLM, AutoTokenizer -import numpy as np import librosa +import numpy as np from feature_extractor import cnhubert +from transformers import AutoModelForMaskedLM, AutoTokenizer cnhubert.cnhubert_base_path = cnhubert_base_path -from GPT_SoVITS.module.models import SynthesizerTrn,SynthesizerTrnV3 -import numpy as np import random + +from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3 + + def set_seed(seed): if seed == -1: seed = random.randint(0, 1000000) @@ -98,18 +114,21 @@ def set_seed(seed): np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) + + # set_seed(42) +from time import time as ttime + from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from peft import LoraConfig, get_peft_model from text import cleaned_text_to_sequence from text.cleaner import clean_text -from time import time as ttime -from tools.my_utils import load_audio -from tools.i18n.i18n import I18nAuto, scan_language_list -from peft import LoraConfig, PeftModel, get_peft_model -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 @@ -120,27 +139,27 @@ else: device = "cpu" dict_language_v1 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 } dict_language_v2 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("粤语"): "all_yue",#全部按中文识别 - i18n("韩文"): "all_ko",#全部按韩文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("粤英混合"): "yue",#按粤英混合识别####不变 - i18n("韩英混合"): "ko",#按韩英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 - i18n("多语种混合(粤语)"): "auto_yue",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("粤语"): "all_yue", # 全部按中文识别 + i18n("韩文"): "all_ko", # 全部按韩文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("粤英混合"): "yue", # 按粤英混合识别####不变 + i18n("韩英混合"): "ko", # 按韩英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 + i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种 } -dict_language = dict_language_v1 if version =='v1' else dict_language_v2 +dict_language = dict_language_v1 if version == "v1" else dict_language_v2 tokenizer = AutoTokenizer.from_pretrained(bert_path) bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) @@ -200,87 +219,109 @@ if is_half == True: else: ssl_model = ssl_model.to(device) -resample_transform_dict={} +resample_transform_dict = {} + + def resample(audio_tensor, sr0): global resample_transform_dict if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample( - sr0, 24000 - ).to(device) + resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) return resample_transform_dict[sr0](audio_tensor) + ###todo:put them to process_ckpt and modify my_save func (save sovits weights), gpt save weights use my_save in process_ckpt -#symbol_version-model_version-if_lora_v3 -from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new -def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): - global vq_model, hps, version, model_version, dict_language,if_lora_v3 - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) +# symbol_version-model_version-if_lora_v3 +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + + +def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): + global vq_model, hps, version, model_version, dict_language, if_lora_v3 + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) # print(sovits_path,version, model_version, if_lora_v3) - if if_lora_v3==True and is_exist_s2gv3==False: - info= "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + if if_lora_v3 == True and is_exist_s2gv3 == False: + info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") gr.Warning(info) raise FileExistsError(info) - dict_language = dict_language_v1 if version =='v1' else dict_language_v2 + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 if prompt_language is not None and text_language is not None: if prompt_language in list(dict_language.keys()): - prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language} + prompt_text_update, prompt_language_update = ( + {"__type__": "update"}, + {"__type__": "update", "value": prompt_language}, + ) else: - prompt_text_update = {'__type__':'update', 'value':''} - prompt_language_update = {'__type__':'update', 'value':i18n("中文")} + prompt_text_update = {"__type__": "update", "value": ""} + prompt_language_update = {"__type__": "update", "value": i18n("中文")} if text_language in list(dict_language.keys()): - text_update, text_language_update = {'__type__':'update'}, {'__type__':'update', 'value':text_language} + text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language} else: - text_update = {'__type__':'update', 'value':''} - text_language_update = {'__type__':'update', 'value':i18n("中文")} - if model_version=="v3": - visible_sample_steps=True - visible_inp_refs=False + text_update = {"__type__": "update", "value": ""} + text_language_update = {"__type__": "update", "value": i18n("中文")} + if model_version == "v3": + visible_sample_steps = True + visible_inp_refs = False else: - visible_sample_steps=False - visible_inp_refs=True - yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False} + visible_sample_steps = False + visible_inp_refs = True + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "visible": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, + ) dict_s2 = load_sovits_new(sovits_path) hps = dict_s2["config"] hps = DictToAttrRecursive(hps) hps.model.semantic_frame_rate = "25hz" - if 'enc_p.text_embedding.weight'not in dict_s2['weight']: - hps.model.version = "v2"#v3model,v2sybomls - elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: hps.model.version = "v1" else: hps.model.version = "v2" - version=hps.model.version + version = hps.model.version # print("sovits版本:",hps.model.version) - if model_version!="v3": + if model_version != "v3": vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model + **hps.model, ) - model_version=version + model_version = version else: vq_model = SynthesizerTrnV3( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **hps.model + **hps.model, ) - if ("pretrained" not in sovits_path): + if "pretrained" not in sovits_path: try: del vq_model.enc_q - except:pass + except: + pass if is_half == True: vq_model = vq_model.half().to(device) else: vq_model = vq_model.to(device) vq_model.eval() - if if_lora_v3==False: - print("loading sovits_%s"%model_version,vq_model.load_state_dict(dict_s2["weight"], strict=False)) + if if_lora_v3 == False: + print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False)) else: - print("loading sovits_v3pretrained_G", vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False)) - lora_rank=dict_s2["lora_rank"] + print( + "loading sovits_v3pretrained_G", + vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False), + ) + lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, @@ -288,22 +329,38 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): init_lora_weights=True, ) vq_model.cfm = get_peft_model(vq_model.cfm, lora_config) - print("loading sovits_v3_lora%s"%(lora_rank)) + print("loading sovits_v3_lora%s" % (lora_rank)) vq_model.load_state_dict(dict_s2["weight"], strict=False) vq_model.cfm = vq_model.cfm.merge_and_unload() # torch.save(vq_model.state_dict(),"merge_win.pth") vq_model.eval() - yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True} - with open("./weight.json")as f: - data=f.read() - data=json.loads(data) - data["SoVITS"][version]=sovits_path - with open("./weight.json","w")as f:f.write(json.dumps(data)) + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "visible": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "value": False, "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["SoVITS"][version] = sovits_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) -try:next(change_sovits_weights(sovits_path)) -except:pass +try: + next(change_sovits_weights(sovits_path)) +except: + pass + def change_gpt_weights(gpt_path): global hz, max_sec, t2s_model, config @@ -319,23 +376,29 @@ def change_gpt_weights(gpt_path): t2s_model.eval() # total = sum([param.nelement() for param in t2s_model.parameters()]) # print("Number of parameter: %.2fM" % (total / 1e6)) - with open("./weight.json")as f: - data=f.read() - data=json.loads(data) - data["GPT"][version]=gpt_path - with open("./weight.json","w")as f:f.write(json.dumps(data)) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["GPT"][version] = gpt_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) change_gpt_weights(gpt_path) -os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" -import torch,soundfile +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +import torch + now_dir = os.getcwd() -import soundfile + def init_bigvgan(): global bigvgan_model from BigVGAN import bigvgan - bigvgan_model = bigvgan.BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode bigvgan_model.remove_weight_norm() bigvgan_model = bigvgan_model.eval() @@ -344,16 +407,20 @@ def init_bigvgan(): else: bigvgan_model = bigvgan_model.to(device) -if model_version!="v3":bigvgan_model=None -else:init_bigvgan() + +if model_version != "v3": + bigvgan_model = None +else: + init_bigvgan() def get_spepc(hps, filename): # audio = load_audio(filename, int(hps.data.sampling_rate)) audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate)) audio = torch.FloatTensor(audio) - maxx=audio.abs().max() - if(maxx>1):audio/=min(2,maxx) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) spec = spectrogram_torch( @@ -366,17 +433,21 @@ def get_spepc(hps, filename): ) return spec + def clean_text_inf(text, language, version): - language = language.replace("all_","") + language = language.replace("all_", "") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text -dtype=torch.float16 if is_half == True else torch.float32 + +dtype = torch.float16 if is_half == True else torch.float32 + + def get_bert_inf(phones, word2ph, norm_text, language): - language=language.replace("all_","") + language = language.replace("all_", "") if language == "zh": - bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype) + bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype) else: bert = torch.zeros( (1024, len(phones)), @@ -386,7 +457,21 @@ def get_bert_inf(phones, word2ph, norm_text, language): return bert -splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} def get_first(text): @@ -394,24 +479,27 @@ def get_first(text): text = re.split(pattern, text)[0].strip() return text + from text import chinese -def get_phones_and_bert(text,language,version,final=False): + + +def get_phones_and_bert(text, language, version, final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: formattext = text while " " in formattext: formattext = formattext.replace(" ", " ") if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"zh",version) + return get_phones_and_bert(formattext, "zh", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = get_bert_feature(norm_text, word2ph).to(device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"yue",version) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = torch.zeros( @@ -419,8 +507,8 @@ def get_phones_and_bert(text,language,version,final=False): dtype=torch.float16 if is_half == True else torch.float32, ).to(device) elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] + textlist = [] + langlist = [] if language == "auto": for tmp in LangSegmenter.getTexts(text): langlist.append(tmp["lang"]) @@ -453,30 +541,42 @@ def get_phones_and_bert(text,language,version,final=False): bert_list.append(bert) bert = torch.cat(bert_list, dim=1) phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + norm_text = "".join(norm_text_list) if not final and len(phones) < 6: - return get_phones_and_bert("." + text,language,version,final=True) + return get_phones_and_bert("." + text, language, version, final=True) - return phones,bert.to(dtype),norm_text + return phones, bert.to(dtype), norm_text + + +from module.mel_processing import mel_spectrogram_torch, spectrogram_torch -from module.mel_processing import spectrogram_torch,mel_spectrogram_torch spec_min = -12 spec_max = 2 + + def norm_spec(x): return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + def denorm_spec(x): return (x + 1) / 2 * (spec_max - spec_min) + spec_min -mel_fn=lambda x: mel_spectrogram_torch(x, **{ - "n_fft": 1024, - "win_size": 1024, - "hop_size": 256, - "num_mels": 100, - "sampling_rate": 24000, - "fmin": 0, - "fmax": None, - "center": False -}) + + +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + def merge_short_text_in_array(texts, threshold): if (len(texts)) < 2: @@ -488,50 +588,77 @@ def merge_short_text_in_array(texts, threshold): if len(text) >= threshold: result.append(text) text = "" - if (len(text) > 0): + if len(text) > 0: if len(result) == 0: result.append(text) else: result[len(result) - 1] += text return result -sr_model=None -def audio_sr(audio,sr): + +sr_model = None + + +def audio_sr(audio, sr): global sr_model - if sr_model==None: + if sr_model == None: from tools.audio_sr import AP_BWE + try: - sr_model=AP_BWE(device,DictToAttrRecursive) + sr_model = AP_BWE(device, DictToAttrRecursive) except FileNotFoundError: gr.Warning(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好")) - return audio.cpu().detach().numpy(),sr - return sr_model(audio,sr) + return audio.cpu().detach().numpy(), sr + return sr_model(audio, sr) ##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature # cache_tokens={}#暂未实现清理机制 -cache= {} -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False,inp_refs=None,sample_steps=8,if_sr=False,pause_second=0.3): +cache = {} + + +def get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut=i18n("不切"), + top_k=20, + top_p=0.6, + temperature=0.6, + ref_free=False, + speed=1, + if_freeze=False, + inp_refs=None, + sample_steps=8, + if_sr=False, + pause_second=0.3, +): global cache - if ref_wav_path:pass - else:gr.Warning(i18n('请上传参考音频')) - if text:pass - else:gr.Warning(i18n('请填入推理文本')) + if ref_wav_path: + pass + else: + gr.Warning(i18n("请上传参考音频")) + if text: + pass + else: + gr.Warning(i18n("请填入推理文本")) t = [] if prompt_text is None or len(prompt_text) == 0: ref_free = True - if model_version=="v3": - ref_free=False#s2v3暂不支持ref_free + if model_version == "v3": + ref_free = False # s2v3暂不支持ref_free else: - if_sr=False + if_sr = False t0 = ttime() prompt_language = dict_language[prompt_language] text_language = dict_language[text_language] - if not ref_free: prompt_text = prompt_text.strip("\n") - if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_language != "en" else "." print(i18n("实际输入的参考文本:"), prompt_text) text = text.strip("\n") # if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text @@ -549,7 +676,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, if not ref_free: with torch.no_grad(): wav16k, sr = librosa.load(ref_wav_path, sr=16000) - if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000): + if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000: gr.Warning(i18n("参考音频在3~10秒范围外,请更换!")) raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) wav16k = torch.from_numpy(wav16k) @@ -558,27 +685,23 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, else: wav16k = wav16k.to(device) wav16k = torch.cat([wav16k, zero_wav_torch]) - ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ - "last_hidden_state" - ].transpose( - 1, 2 - ) # .float() + ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2) # .float() codes = vq_model.extract_latent(ssl_content) prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0).to(device) t1 = ttime() - t.append(t1-t0) + t.append(t1 - t0) - if (how_to_cut == i18n("凑四句一切")): + if how_to_cut == i18n("凑四句一切"): text = cut1(text) - elif (how_to_cut == i18n("凑50字一切")): + elif how_to_cut == i18n("凑50字一切"): text = cut2(text) - elif (how_to_cut == i18n("按中文句号。切")): + elif how_to_cut == i18n("按中文句号。切"): text = cut3(text) - elif (how_to_cut == i18n("按英文句号.切")): + elif how_to_cut == i18n("按英文句号.切"): text = cut4(text) - elif (how_to_cut == i18n("按标点符号切")): + elif how_to_cut == i18n("按标点符号切"): text = cut5(text) while "\n\n" in text: text = text.replace("\n\n", "\n") @@ -589,19 +712,20 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, audio_opt = [] ###s2v3暂不支持ref_free if not ref_free: - phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language, version) + phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language, version) - for i_text,text in enumerate(texts): + for i_text, text in enumerate(texts): # 解决输入目标文本的空行导致报错的问题 - if (len(text.strip()) == 0): + if len(text.strip()) == 0: continue - if (text[-1] not in splits): text += "。" if text_language != "en" else "." + if text[-1] not in splits: + text += "。" if text_language != "en" else "." print(i18n("实际输入的目标文本(每句):"), text) - phones2,bert2,norm_text2=get_phones_and_bert(text, text_language, version) + phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version) print(i18n("前端处理后的文本(每句):"), norm_text2) if not ref_free: bert = torch.cat([bert1, bert2], 1) - all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) + all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0) else: bert = bert2 all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) @@ -612,7 +736,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, t2 = ttime() # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature) # print(cache.keys(),if_freeze) - if(i_text in cache and if_freeze==True):pred_semantic=cache[i_text] + if i_text in cache and if_freeze == True: + pred_semantic = cache[i_text] else: with torch.no_grad(): pred_semantic, idx = t2s_model.model.infer_panel( @@ -627,59 +752,65 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, early_stop_num=hz * max_sec, ) pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) - cache[i_text]=pred_semantic + cache[i_text] = pred_semantic t3 = ttime() ###v3不存在以下逻辑和inp_refs - if model_version!="v3": - refers=[] - if(inp_refs): + if model_version != "v3": + refers = [] + if inp_refs: for path in inp_refs: try: refer = get_spepc(hps, path.name).to(dtype).to(device) refers.append(refer) except: traceback.print_exc() - if(len(refers)==0):refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] - audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers,speed=speed)[0][0]#.cpu().detach().numpy() + if len(refers) == 0: + refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] + audio = vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed + )[0][0] # .cpu().detach().numpy() else: refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) - phoneme_ids0=torch.LongTensor(phones1).to(device).unsqueeze(0) - phoneme_ids1=torch.LongTensor(phones2).to(device).unsqueeze(0) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) # print(11111111, phoneme_ids0, phoneme_ids1) - fea_ref,ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) + fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) ref_audio, sr = torchaudio.load(ref_wav_path) - ref_audio=ref_audio.to(device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if sr!=24000: - ref_audio=resample(ref_audio,sr) + if sr != 24000: + ref_audio = resample(ref_audio, sr) # print("ref_audio",ref_audio.abs().mean()) mel2 = mel_fn(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): + if T_min > 468: mel2 = mel2[:, :, -468:] fea_ref = fea_ref[:, :, -468:] T_min = 468 chunk_len = 934 - T_min # print("fea_ref",fea_ref,fea_ref.shape) # print("mel2",mel2) - mel2=mel2.to(dtype) - fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge,speed) + mel2 = mel2.to(dtype) + fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) # print("fea_todo",fea_todo) # print("ge",ge.abs().mean()) cfm_resss = [] idx = 0 - while (1): - fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] - if (fea_todo_chunk.shape[-1] == 0): break + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) # set_seed(123) - cfm_res = vq_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) - cfm_res = cfm_res[:, :, mel2.shape[2]:] + cfm_res = vq_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] # print("fea", fea) # print("mel2in", mel2) @@ -687,27 +818,30 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, cfm_resss.append(cfm_res) cmf_res = torch.cat(cfm_resss, 2) cmf_res = denorm_spec(cmf_res) - if bigvgan_model==None:init_bigvgan() + if bigvgan_model == None: + init_bigvgan() with torch.inference_mode(): wav_gen = bigvgan_model(cmf_res) - audio=wav_gen[0][0]#.cpu().detach().numpy() - max_audio=torch.abs(audio).max()#简单防止16bit爆音 - if max_audio>1:audio=audio/max_audio + audio = wav_gen[0][0] # .cpu().detach().numpy() + max_audio = torch.abs(audio).max() # 简单防止16bit爆音 + if max_audio > 1: + audio = audio / max_audio audio_opt.append(audio) - audio_opt.append(zero_wav_torch)#zero_wav + audio_opt.append(zero_wav_torch) # zero_wav t4 = ttime() - t.extend([t2 - t1,t3 - t2, t4 - t3]) + t.extend([t2 - t1, t3 - t2, t4 - t3]) t1 = ttime() print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))) - audio_opt=torch.cat(audio_opt, 0)#np.concatenate - sr=hps.data.sampling_rate if model_version!="v3"else 24000 - if if_sr==True and sr==24000: + audio_opt = torch.cat(audio_opt, 0) # np.concatenate + sr = hps.data.sampling_rate if model_version != "v3" else 24000 + if if_sr == True and sr == 24000: print(i18n("音频超分中")) - audio_opt,sr=audio_sr(audio_opt.unsqueeze(0),sr) - max_audio=np.abs(audio_opt).max() - if max_audio > 1: audio_opt /= max_audio + audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr) + max_audio = np.abs(audio_opt).max() + if max_audio > 1: + audio_opt /= max_audio else: - audio_opt=audio_opt.cpu().detach().numpy() + audio_opt = audio_opt.cpu().detach().numpy() yield sr, (audio_opt * 32767).astype(np.int16) @@ -738,7 +872,7 @@ def cut1(inp): if len(split_idx) > 1: opts = [] for idx in range(len(split_idx) - 1): - opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) + opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]])) else: opts = [inp] opts = [item for item in opts if not set(item).issubset(punctuation)] @@ -774,11 +908,12 @@ def cut3(inp): inp = inp.strip("\n") opts = ["%s" % item for item in inp.strip("。").split("。")] opts = [item for item in opts if not set(item).issubset(punctuation)] - return "\n".join(opts) + return "\n".join(opts) + def cut4(inp): inp = inp.strip("\n") - opts = re.split(r'(? 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): + if char == "." and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit(): items.append(char) else: items.append(char) @@ -810,17 +945,18 @@ def cut5(inp): def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts + def process_text(texts): - _text=[] - if all(text in [None, " ", "\n",""] for text in texts): + _text = [] + if all(text in [None, " ", "\n", ""] for text in texts): raise ValueError(i18n("请输入有效文本")) for text in texts: - if text in [None, " ", ""]: + if text in [None, " ", ""]: pass else: _text.append(text) @@ -829,35 +965,42 @@ def process_text(texts): def change_choices(): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) - return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } -SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] -GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3"] +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) def get_weights_names(GPT_weight_root, SoVITS_weight_root): SoVITS_names = [i for i in pretrained_sovits_name] for path in SoVITS_weight_root: for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) GPT_names = [i for i in pretrained_gpt_name] for path in GPT_weight_root: for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) return SoVITS_names, GPT_names SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) -def html_center(text, label='p'): + +def html_center(text, label="p"): return f"""
<{label} style="margin: 0; padding: 0;">{text}
""" -def html_left(text, label='p'): + +def html_left(text, label="p"): return f"""
<{label} style="margin: 0; padding: 0;">{text}
""" @@ -865,66 +1008,196 @@ def html_left(text, label='p'): with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( - value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Group(): - gr.Markdown(html_center(i18n("模型切换"),'h3')) + gr.Markdown(html_center(i18n("模型切换"), "h3")) with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True, scale=14) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True, scale=14) + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, + interactive=True, + scale=14, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + scale=14, + ) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14) refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - gr.Markdown(html_center(i18n("*请上传并填写参考信息"),'h3')) + gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3")) with gr.Row(): inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13) with gr.Column(scale=13): - ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式,使用了会报错。"), value=False, interactive=True if model_version!="v3"else False, show_label=True,scale=1) - gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"
"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))) - prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1) + ref_text_free = gr.Checkbox( + label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。") + + i18n("v3暂不支持该模式,使用了会报错。"), + value=False, + interactive=True if model_version != "v3" else False, + show_label=True, + scale=1, + ) + gr.Markdown( + html_left( + i18n("使用无参考文本模式时建议使用微调的GPT") + + "
" + + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。") + ) + ) + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1) with gr.Column(scale=14): prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"), + label=i18n("参考音频的语种"), + choices=list(dict_language.keys()), + value=i18n("中文"), ) - inp_refs = gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple")if model_version!="v3"else gr.File(label=i18n("可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"),file_count="multiple",visible=False) - sample_steps = gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),value=32,choices=[4,8,16,32],visible=True)if model_version=="v3"else gr.Radio(label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),choices=[4,8,16,32],visible=False,value=32) - if_sr_Checkbox=gr.Checkbox(label=i18n("v3输出如果觉得闷可以试试开超分"), value=False, interactive=True, show_label=True,visible=False if model_version!="v3"else True) - gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3')) + inp_refs = ( + gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + ) + if model_version != "v3" + else gr.File( + label=i18n( + "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。" + ), + file_count="multiple", + visible=False, + ) + ) + sample_steps = ( + gr.Radio( + label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), + value=32, + choices=[4, 8, 16, 32], + visible=True, + ) + if model_version == "v3" + else gr.Radio( + label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), + choices=[4, 8, 16, 32], + visible=False, + value=32, + ) + ) + if_sr_Checkbox = gr.Checkbox( + label=i18n("v3输出如果觉得闷可以试试开超分"), + value=False, + interactive=True, + show_label=True, + visible=False if model_version != "v3" else True, + ) + gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) with gr.Row(): with gr.Column(scale=13): text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26) with gr.Column(scale=7): text_language = gr.Dropdown( - label=i18n("需要合成的语种")+i18n(".限制范围越小判别效果越好。"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1 - ) + label=i18n("需要合成的语种") + i18n(".限制范围越小判别效果越好。"), + choices=list(dict_language.keys()), + value=i18n("中文"), + scale=1, + ) how_to_cut = gr.Dropdown( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, scale=1 - ) + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + scale=1, + ) gr.Markdown(value=html_center(i18n("语速调整,高为更快"))) - if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), value=False, interactive=True,show_label=True, scale=1) + if_freeze = gr.Checkbox( + label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"), + value=False, + interactive=True, + show_label=True, + scale=1, + ) with gr.Row(): - speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True, scale=1) - pause_second_slider = gr.Slider(minimum=0.1,maximum=0.5,step=0.01,label=i18n("句间停顿秒数"),value=0.3,interactive=True, scale=1) + speed = gr.Slider( + minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1 + ) + pause_second_slider = gr.Slider( + minimum=0.1, + maximum=0.5, + step=0.01, + label=i18n("句间停顿秒数"), + value=0.3, + interactive=True, + scale=1, + ) gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):"))) - top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=15,interactive=True, scale=1) - top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True, scale=1) - temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True, scale=1) + top_k = gr.Slider( + minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True, scale=1 + ) + top_p = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True, scale=1 + ) + temperature = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1 + ) # with gr.Column(): # gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。")) # phoneme=gr.Textbox(label=i18n("音素框"), value="") # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") with gr.Row(): - inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size='lg', scale=25) + inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25) output = gr.Audio(label=i18n("输出的语音"), scale=14) inference_button.click( get_tts_wav, - [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider], + [ + inp_ref, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_text_free, + speed, + if_freeze, + inp_refs, + sample_steps, + if_sr_Checkbox, + pause_second_slider, + ], [output], ) - SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox,inference_button]) + SoVITS_dropdown.change( + change_sovits_weights, + [SoVITS_dropdown, prompt_language, text_language], + [ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + if_sr_Checkbox, + inference_button, + ], + ) GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) @@ -943,8 +1216,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: # button5.click(cut5, [text_inp], [text_opt]) # gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))) -if __name__ == '__main__': - app.queue().launch(#concurrency_count=511, max_size=1022 +if __name__ == "__main__": + app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 40cdae9..837a2e4 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -1,14 +1,19 @@ -''' +""" 按中英混合识别 按日英混合识别 多语种启动切分识别语种 全部按中文识别 全部按英文识别 全部按日文识别 -''' +""" + +import json +import logging +import os import random -import os, re, logging, json +import re import sys + now_dir = os.getcwd() sys.path.append(now_dir) sys.path.append("%s/GPT_SoVITS" % (now_dir)) @@ -20,13 +25,14 @@ logging.getLogger("httpx").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) -import pdb import torch try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... + + analytics.version_check = lambda: None +except: + ... infer_ttswebui = os.environ.get("infer_ttswebui", 9872) @@ -41,16 +47,16 @@ gpt_path = os.environ.get("gpt_path", None) sovits_path = os.environ.get("sovits_path", None) cnhubert_base_path = os.environ.get("cnhubert_base_path", None) bert_path = os.environ.get("bert_path", None) -version=model_version=os.environ.get("version","v2") +version = model_version = os.environ.get("version", "v2") import gradio as gr -from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR from TTS_infer_pack.text_segmentation_method import get_method -from tools.i18n.i18n import I18nAuto, scan_language_list -from inference_webui import DictToAttrRecursive +from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config -language=os.environ.get("language","Auto") -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = os.environ.get("language", "Auto") +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) @@ -67,30 +73,30 @@ else: # device = "cpu" dict_language_v1 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 } dict_language_v2 = { - i18n("中文"): "all_zh",#全部按中文识别 - i18n("英文"): "en",#全部按英文识别#######不变 - i18n("日文"): "all_ja",#全部按日文识别 - i18n("粤语"): "all_yue",#全部按中文识别 - i18n("韩文"): "all_ko",#全部按韩文识别 - i18n("中英混合"): "zh",#按中英混合识别####不变 - i18n("日英混合"): "ja",#按日英混合识别####不变 - i18n("粤英混合"): "yue",#按粤英混合识别####不变 - i18n("韩英混合"): "ko",#按韩英混合识别####不变 - i18n("多语种混合"): "auto",#多语种启动切分识别语种 - i18n("多语种混合(粤语)"): "auto_yue",#多语种启动切分识别语种 + i18n("中文"): "all_zh", # 全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("粤语"): "all_yue", # 全部按中文识别 + i18n("韩文"): "all_ko", # 全部按韩文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("粤英混合"): "yue", # 按粤英混合识别####不变 + i18n("韩英混合"): "ko", # 按韩英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 + i18n("多语种混合(粤语)"): "auto_yue", # 多语种启动切分识别语种 } -dict_language = dict_language_v1 if version =='v1' else dict_language_v2 +dict_language = dict_language_v1 if version == "v1" else dict_language_v2 cut_method = { - i18n("不切"):"cut0", + i18n("不切"): "cut0", i18n("凑四句一切"): "cut1", i18n("凑50字一切"): "cut2", i18n("按中文句号。切"): "cut3", @@ -117,22 +123,33 @@ gpt_path = tts_config.t2s_weights_path sovits_path = tts_config.vits_weights_path version = tts_config.version -def inference(text, text_lang, - ref_audio_path, - aux_ref_audio_paths, - prompt_text, - prompt_lang, top_k, - top_p, temperature, - text_split_method, batch_size, - speed_factor, ref_text_free, - split_bucket,fragment_interval, - seed, keep_random, parallel_infer, - repetition_penalty, sample_steps, super_sampling, - ): +def inference( + text, + text_lang, + ref_audio_path, + aux_ref_audio_paths, + prompt_text, + prompt_lang, + top_k, + top_p, + temperature, + text_split_method, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + keep_random, + parallel_infer, + repetition_penalty, + sample_steps, + super_sampling, +): seed = -1 if keep_random else seed actual_seed = seed if seed not in [-1, "", None] else random.randint(0, 2**32 - 1) - inputs={ + inputs = { "text": text, "text_lang": dict_language[text_lang], "ref_audio_path": ref_audio_path, @@ -143,12 +160,12 @@ def inference(text, text_lang, "top_p": top_p, "temperature": temperature, "text_split_method": cut_method[text_split_method], - "batch_size":int(batch_size), - "speed_factor":float(speed_factor), - "split_bucket":split_bucket, - "return_fragment":False, - "fragment_interval":fragment_interval, - "seed":actual_seed, + "batch_size": int(batch_size), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "return_fragment": False, + "fragment_interval": fragment_interval, + "seed": actual_seed, "parallel_infer": parallel_infer, "repetition_penalty": repetition_penalty, "sample_steps": int(sample_steps), @@ -158,11 +175,12 @@ def inference(text, text_lang, for item in tts_pipeline.run(inputs): yield item, actual_seed except NO_PROMPT_ERROR: - gr.Warning(i18n('V3不支持无参考文本模式,请填写参考文本!')) + gr.Warning(i18n("V3不支持无参考文本模式,请填写参考文本!")) + def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts @@ -170,125 +188,193 @@ def custom_sort_key(s): def change_choices(): SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) - return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } -path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",path_sovits_v3] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] -_ =[[],[]] +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + path_sovits_v3, +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + +_ = [[], []] for i in range(3): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) -pretrained_gpt_name,pretrained_sovits_name = _ + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ -if os.path.exists(f"./weight.json"): +if os.path.exists("./weight.json"): pass else: - with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file) + with open("./weight.json", "w", encoding="utf-8") as file: + json.dump({"GPT": {}, "SoVITS": {}}, file) -with open(f"./weight.json", 'r', encoding="utf-8") as file: +with open("./weight.json", "r", encoding="utf-8") as file: weight_data = file.read() - weight_data=json.loads(weight_data) - gpt_path = os.environ.get( - "gpt_path", weight_data.get('GPT',{}).get(version,pretrained_gpt_name)) - sovits_path = os.environ.get( - "sovits_path", weight_data.get('SoVITS',{}).get(version,pretrained_sovits_name)) - if isinstance(gpt_path,list): + weight_data = json.loads(weight_data) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) + if isinstance(gpt_path, list): gpt_path = gpt_path[0] - if isinstance(sovits_path,list): + if isinstance(sovits_path, list): sovits_path = sovits_path[0] +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3"] +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) -SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] -GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) def get_weights_names(GPT_weight_root, SoVITS_weight_root): SoVITS_names = [i for i in pretrained_sovits_name] for path in SoVITS_weight_root: for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) GPT_names = [i for i in pretrained_gpt_name] for path in GPT_weight_root: for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) return SoVITS_names, GPT_names SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) -from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new -def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): - global version, model_version, dict_language,if_lora_v3 - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) +from process_ckpt import get_sovits_version_from_path_fast + + +def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): + global version, model_version, dict_language, if_lora_v3 + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) # print(sovits_path,version, model_version, if_lora_v3) if if_lora_v3 and not os.path.exists(path_sovits_v3): - info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") + info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") gr.Warning(info) raise FileExistsError(info) - dict_language = dict_language_v1 if version =='v1' else dict_language_v2 + dict_language = dict_language_v1 if version == "v1" else dict_language_v2 if prompt_language is not None and text_language is not None: if prompt_language in list(dict_language.keys()): - prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language} + prompt_text_update, prompt_language_update = ( + {"__type__": "update"}, + {"__type__": "update", "value": prompt_language}, + ) else: - prompt_text_update = {'__type__':'update', 'value':''} - prompt_language_update = {'__type__':'update', 'value':i18n("中文")} + prompt_text_update = {"__type__": "update", "value": ""} + prompt_language_update = {"__type__": "update", "value": i18n("中文")} if text_language in list(dict_language.keys()): - text_update, text_language_update = {'__type__':'update'}, {'__type__':'update', 'value':text_language} + text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language} else: - text_update = {'__type__':'update', 'value':''} - text_language_update = {'__type__':'update', 'value':i18n("中文")} - if model_version=="v3": - visible_sample_steps=True - visible_inp_refs=False + text_update = {"__type__": "update", "value": ""} + text_language_update = {"__type__": "update", "value": i18n("中文")} + if model_version == "v3": + visible_sample_steps = True + visible_inp_refs = False else: - visible_sample_steps=False - visible_inp_refs=True - #prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free, - yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False} + visible_sample_steps = False + visible_inp_refs = True + # prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free, + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, + ) tts_pipeline.init_vits_weights(sovits_path) - yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True} - with open("./weight.json")as f: - data=f.read() - data=json.loads(data) - data["SoVITS"][version]=sovits_path - with open("./weight.json","w")as f:f.write(json.dumps(data)) + yield ( + {"__type__": "update", "choices": list(dict_language.keys())}, + {"__type__": "update", "choices": list(dict_language.keys())}, + prompt_text_update, + prompt_language_update, + text_update, + text_language_update, + {"__type__": "update", "interactive": visible_sample_steps, "value": 32}, + {"__type__": "update", "visible": visible_inp_refs}, + {"__type__": "update", "interactive": True if model_version != "v3" else False}, + {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, + ) + with open("./weight.json") as f: + data = f.read() + data = json.loads(data) + data["SoVITS"][version] = sovits_path + with open("./weight.json", "w") as f: + f.write(json.dumps(data)) + with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( - value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Column(): # with gr.Group(): gr.Markdown(value=i18n("模型切换")) with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True) + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + ) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - with gr.Row(): with gr.Column(): gr.Markdown(value=i18n("*请上传并填写参考信息")) with gr.Row(): inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频,超过会报错!)"), type="filepath") - inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple", visible=True if model_version!="v3"else False) + inp_refs = gr.File( + label=i18n("辅参考音频(可选多个,或不选)"), + file_count="multiple", + visible=True if model_version != "v3" else False, + ) prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2) with gr.Row(): prompt_language = gr.Dropdown( label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文") ) with gr.Column(): - ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True if model_version!="v3"else False, show_label=True) - gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"
"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")) + ref_text_free = gr.Checkbox( + label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), + value=False, + interactive=True if model_version != "v3" else False, + show_label=True, + ) + gr.Markdown( + i18n("使用无参考文本模式时建议使用微调的GPT") + + "
" + + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。") + ) with gr.Column(): gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) @@ -297,42 +383,66 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: label=i18n("需要合成的文本的语种"), choices=list(dict_language.keys()), value=i18n("中文") ) - with gr.Group(): gr.Markdown(value=i18n("推理设置")) with gr.Row(): - with gr.Column(): with gr.Row(): - batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True) - sample_steps = gr.Radio(label=i18n("采样步数(仅对V3生效)"),value=32,choices=[4,8,16,32],visible=True) + batch_size = gr.Slider( + minimum=1, maximum=200, step=1, label=i18n("batch_size"), value=20, interactive=True + ) + sample_steps = gr.Radio( + label=i18n("采样步数(仅对V3生效)"), value=32, choices=[4, 8, 16, 32], visible=True + ) with gr.Row(): - fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True) - speed_factor = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label="语速",value=1.0,interactive=True) + fragment_interval = gr.Slider( + minimum=0.01, maximum=1, step=0.01, label=i18n("分段间隔(秒)"), value=0.3, interactive=True + ) + speed_factor = gr.Slider( + minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True + ) with gr.Row(): - top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) - top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) + top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True) + top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True) with gr.Row(): - temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) - repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True) - + temperature = gr.Slider( + minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True + ) + repetition_penalty = gr.Slider( + minimum=0, maximum=2, step=0.05, label=i18n("重复惩罚"), value=1.35, interactive=True + ) + with gr.Column(): with gr.Row(): how_to_cut = gr.Dropdown( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, scale=1 - ) - super_sampling = gr.Checkbox(label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True) + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + scale=1, + ) + super_sampling = gr.Checkbox( + label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True + ) with gr.Row(): parallel_infer = gr.Checkbox(label=i18n("并行推理"), value=True, interactive=True, show_label=True) - split_bucket = gr.Checkbox(label=i18n("数据分桶(并行推理时会降低一点计算量)"), value=True, interactive=True, show_label=True) + split_bucket = gr.Checkbox( + label=i18n("数据分桶(并行推理时会降低一点计算量)"), + value=True, + interactive=True, + show_label=True, + ) with gr.Row(): - - seed = gr.Number(label=i18n("随机种子"),value=-1) + seed = gr.Number(label=i18n("随机种子"), value=-1) keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True) output = gr.Audio(label=i18n("输出的语音")) @@ -340,40 +450,78 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: inference_button = gr.Button(i18n("合成语音"), variant="primary") stop_infer = gr.Button(i18n("终止合成"), variant="primary") - inference_button.click( inference, [ - text,text_language, inp_ref, inp_refs, - prompt_text, prompt_language, - top_k, top_p, temperature, - how_to_cut, batch_size, - speed_factor, ref_text_free, - split_bucket,fragment_interval, - seed, keep_random, parallel_infer, - repetition_penalty, sample_steps, super_sampling, - ], + text, + text_language, + inp_ref, + inp_refs, + prompt_text, + prompt_language, + top_k, + top_p, + temperature, + how_to_cut, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + keep_random, + parallel_infer, + repetition_penalty, + sample_steps, + super_sampling, + ], [output, seed], ) stop_infer.click(tts_pipeline.stop, [], []) - SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,inference_button])# + SoVITS_dropdown.change( + change_sovits_weights, + [SoVITS_dropdown, prompt_language, text_language], + [ + prompt_language, + text_language, + prompt_text, + prompt_language, + text, + text_language, + sample_steps, + inp_refs, + ref_text_free, + inference_button, + ], + ) # GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) with gr.Group(): - gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) + gr.Markdown( + value=i18n( + "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。" + ) + ) with gr.Row(): text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="", lines=4) with gr.Column(): _how_to_cut = gr.Radio( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), - interactive=True, - ) - cut_text= gr.Button(i18n("切分"), variant="primary") + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + ) + cut_text = gr.Button(i18n("切分"), variant="primary") def to_cut(text_inp, how_to_cut): - if len(text_inp.strip()) == 0 or text_inp==[]: + if len(text_inp.strip()) == 0 or text_inp == []: return "" method = get_method(cut_method[how_to_cut]) return method(text_inp) @@ -382,8 +530,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt]) gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -if __name__ == '__main__': - app.queue().launch(#concurrency_count=511, max_size=1022 +if __name__ == "__main__": + app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, diff --git a/GPT_SoVITS/module/attentions.py b/GPT_SoVITS/module/attentions.py index a2e9e51..341de4a 100644 --- a/GPT_SoVITS/module/attentions.py +++ b/GPT_SoVITS/module/attentions.py @@ -18,7 +18,7 @@ class Encoder(nn.Module): p_dropout=0.0, window_size=4, isflow=False, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -56,9 +56,7 @@ class Encoder(nn.Module): ) self.norm_layers_2.append(LayerNorm(hidden_channels)) if isflow: - cond_layer = torch.nn.Conv1d( - kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 - ) + cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1) self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) self.cond_layer = weight_norm_modules(cond_layer, name="weight") self.gin_channels = kwargs["gin_channels"] @@ -74,9 +72,7 @@ class Encoder(nn.Module): x = self.cond_pre(x) cond_offset = i * 2 * self.hidden_channels g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - x = commons.fused_add_tanh_sigmoid_multiply( - x, g_l, torch.IntTensor([self.hidden_channels]) - ) + x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels])) y = self.attn_layers[i](x, x, attn_mask) y = self.drop(y) x = self.norm_layers_1[i](x + y) @@ -99,7 +95,7 @@ class Decoder(nn.Module): p_dropout=0.0, proximal_bias=False, proximal_init=True, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -131,9 +127,7 @@ class Decoder(nn.Module): ) self.norm_layers_0.append(LayerNorm(hidden_channels)) self.encdec_attn_layers.append( - MultiHeadAttention( - hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout - ) + MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout) ) self.norm_layers_1.append(LayerNorm(hidden_channels)) self.ffn_layers.append( @@ -153,9 +147,7 @@ class Decoder(nn.Module): x: decoder input h: encoder output """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask for i in range(self.n_layers): @@ -211,14 +203,8 @@ class MultiHeadAttention(nn.Module): if window_size is not None: n_heads_rel = 1 if heads_share else n_heads rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) + self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) nn.init.xavier_uniform_(self.conv_q.weight) nn.init.xavier_uniform_(self.conv_k.weight) @@ -247,46 +233,28 @@ class MultiHeadAttention(nn.Module): scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) if self.window_size is not None: - assert ( - t_s == t_t - ), "Relative attention is only available for self-attention." + assert t_s == t_t, "Relative attention is only available for self-attention." key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) + rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings) scores_local = self._relative_position_to_absolute_position(rel_logits) scores = scores + scores_local if self.proximal_bias: assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) + scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) if mask is not None: scores = scores.masked_fill(mask == 0, -1e4) if self.block_length is not None: - assert ( - t_s == t_t - ), "Local attention is only available for self-attention." - block_mask = ( - torch.ones_like(scores) - .triu(-self.block_length) - .tril(self.block_length) - ) + assert t_s == t_t, "Local attention is only available for self-attention." + block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) scores = scores.masked_fill(block_mask == 0, -1e4) p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] p_attn = self.drop(p_attn) output = torch.matmul(p_attn, value) if self.window_size is not None: relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s - ) - output = output + self._matmul_with_relative_values( - relative_weights, value_relative_embeddings - ) - output = ( - output.transpose(2, 3).contiguous().view(b, d, t_t) - ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] return output, p_attn def _matmul_with_relative_values(self, x, y): @@ -320,9 +288,7 @@ class MultiHeadAttention(nn.Module): ) else: padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] + used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] return used_relative_embeddings def _relative_position_to_absolute_position(self, x): @@ -336,14 +302,10 @@ class MultiHeadAttention(nn.Module): # Concat extra elements so to add up to shape (len+1, 2*len-1). x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) # Reshape and slice out the padded elements. - x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :] return x_final def _absolute_position_to_relative_position(self, x): @@ -353,9 +315,7 @@ class MultiHeadAttention(nn.Module): """ batch, heads, length, _ = x.size() # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) # add 0's in the beginning that will skew the elements after reshape x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) @@ -537,9 +497,7 @@ class Depthwise_Separable_TransposeConv1D(nn.Module): def weight_norm_modules(module, name="weight", dim=0): - if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( - module, Depthwise_Separable_TransposeConv1D - ): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D): module.weight_norm() return module else: @@ -547,9 +505,7 @@ def weight_norm_modules(module, name="weight", dim=0): def remove_weight_norm_modules(module, name="weight"): - if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( - module, Depthwise_Separable_TransposeConv1D - ): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D): module.remove_weight_norm() else: remove_weight_norm(module, name) @@ -567,7 +523,7 @@ class FFT(nn.Module): proximal_bias=False, proximal_init=True, isflow=False, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -579,9 +535,7 @@ class FFT(nn.Module): self.proximal_bias = proximal_bias self.proximal_init = proximal_init if isflow: - cond_layer = torch.nn.Conv1d( - kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 - ) + cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1) self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) self.cond_layer = weight_norm_modules(cond_layer, name="weight") self.gin_channels = kwargs["gin_channels"] @@ -622,18 +576,14 @@ class FFT(nn.Module): if g is not None: g = self.cond_layer(g) - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) x = x * x_mask for i in range(self.n_layers): if g is not None: x = self.cond_pre(x) cond_offset = i * 2 * self.hidden_channels g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - x = commons.fused_add_tanh_sigmoid_multiply( - x, g_l, torch.IntTensor([self.hidden_channels]) - ) + x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels])) y = self.self_attn_layers[i](x, x, self_attn_mask) y = self.drop(y) x = self.norm_layers_0[i](x + y) diff --git a/GPT_SoVITS/module/attentions_onnx.py b/GPT_SoVITS/module/attentions_onnx.py index 097b1b9..9961f98 100644 --- a/GPT_SoVITS/module/attentions_onnx.py +++ b/GPT_SoVITS/module/attentions_onnx.py @@ -7,6 +7,7 @@ from module import commons from typing import Optional + class LayerNorm(nn.Module): def __init__(self, channels, eps=1e-5): super().__init__() @@ -43,7 +44,7 @@ class Encoder(nn.Module): p_dropout=0.0, window_size=4, isflow=True, - **kwargs + **kwargs, ): super().__init__() self.hidden_channels = hidden_channels @@ -65,13 +66,9 @@ class Encoder(nn.Module): if self.gin_channels != 0: self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels) # vits2 says 3rd block, so idx is 2 by default - self.cond_layer_idx = ( - kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 - ) + self.cond_layer_idx = kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 logging.debug(self.gin_channels, self.cond_layer_idx) - assert ( - self.cond_layer_idx < self.n_layers - ), "cond_layer_idx should be less than n_layers" + assert self.cond_layer_idx < self.n_layers, "cond_layer_idx should be less than n_layers" self.drop = nn.Dropout(p_dropout) self.attn_layers = nn.ModuleList() self.norm_layers_1 = nn.ModuleList() @@ -117,11 +114,13 @@ class Encoder(nn.Module): # x = self.norm_layers_2[i](x + y) # x = x * x_mask # return x - + def forward(self, x, x_mask): attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) x = x * x_mask - for attn_layers,norm_layers_1,ffn_layers,norm_layers_2 in zip(self.attn_layers,self.norm_layers_1,self.ffn_layers,self.norm_layers_2): + for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zip( + self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 + ): y = attn_layers(x, x, attn_mask) y = self.drop(y) x = norm_layers_1(x + y) @@ -170,14 +169,8 @@ class MultiHeadAttention(nn.Module): if window_size is not None: n_heads_rel = 1 if heads_share else n_heads rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) + self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) nn.init.xavier_uniform_(self.conv_q.weight) nn.init.xavier_uniform_(self.conv_k.weight) @@ -187,7 +180,7 @@ class MultiHeadAttention(nn.Module): self.conv_k.weight.copy_(self.conv_q.weight) self.conv_k.bias.copy_(self.conv_q.bias) - def forward(self, x, c, attn_mask:Optional[torch.Tensor]=None): + def forward(self, x, c, attn_mask: Optional[torch.Tensor] = None): q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) @@ -198,7 +191,7 @@ class MultiHeadAttention(nn.Module): x = self.conv_o(x) return x - def attention(self, query, key, value, mask:Optional[torch.Tensor]=None): + def attention(self, query, key, value, mask: Optional[torch.Tensor] = None): # reshape [b, d, t] -> [b, n_h, t, d_k] b, d, t_s, _ = (*key.size(), query.size(2)) query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3) @@ -223,8 +216,8 @@ class MultiHeadAttention(nn.Module): relative_weights = self._absolute_position_to_relative_position(p_attn) value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) - - output = (output.transpose(2, 3).contiguous().view(b, d, -1)) + + output = output.transpose(2, 3).contiguous().view(b, d, -1) return output, p_attn def _matmul_with_relative_values(self, x, y): @@ -248,19 +241,17 @@ class MultiHeadAttention(nn.Module): def _get_relative_embeddings(self, relative_embeddings, length): max_relative_position = 2 * self.window_size + 1 # Pad first before slice to avoid using cond ops. - pad_l = torch.zeros((1), dtype = torch.int64) + length - (self.window_size + 1) - pad_s = torch.zeros((1), dtype = torch.int64) + (self.window_size + 1) - length - pad_length = torch.max(pad_l, other=torch.zeros((1), dtype = torch.int64)) - slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype = torch.int64)) + pad_l = torch.zeros((1), dtype=torch.int64) + length - (self.window_size + 1) + pad_s = torch.zeros((1), dtype=torch.int64) + (self.window_size + 1) - length + pad_length = torch.max(pad_l, other=torch.zeros((1), dtype=torch.int64)) + slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype=torch.int64)) slice_end_position = slice_start_position + 2 * length - 1 padded_relative_embeddings = F.pad( relative_embeddings, commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), ) - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] + used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] return used_relative_embeddings def _relative_position_to_absolute_position(self, x): @@ -274,14 +265,10 @@ class MultiHeadAttention(nn.Module): # Concat extra elements so to add up to shape (len+1, 2*len-1). x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) # Reshape and slice out the padded elements. - x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :] return x_final def _absolute_position_to_relative_position(self, x): @@ -291,9 +278,7 @@ class MultiHeadAttention(nn.Module): """ batch, heads, length, _ = x.size() # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) # add 0's in the beginning that will skew the elements after reshape x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) @@ -351,7 +336,7 @@ class FFN(nn.Module): x = self.drop(x) x = self.conv_2(self.padding(x * x_mask)) return x * x_mask - + def padding(self, x): return self._same_padding(x) @@ -395,12 +380,6 @@ class MRTE(nn.Module): ssl_enc = self.c_pre(ssl_enc * ssl_mask) text_enc = self.text_pre(text * text_mask) - x = ( - self.cross_attention( - ssl_enc * ssl_mask, text_enc * text_mask, attn_mask - ) - + ssl_enc - + ge - ) + x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge x = self.c_post(x * ssl_mask) return x diff --git a/GPT_SoVITS/module/commons.py b/GPT_SoVITS/module/commons.py index 6083535..20392f9 100644 --- a/GPT_SoVITS/module/commons.py +++ b/GPT_SoVITS/module/commons.py @@ -28,9 +28,7 @@ def intersperse(lst, item): def kl_divergence(m_p, logs_p, m_q, logs_q): """KL(P||Q)""" kl = (logs_q - logs_p) - 0.5 - kl += ( - 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) - ) + kl += 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) return kl @@ -67,9 +65,7 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4): def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): position = torch.arange(length, dtype=torch.float) num_timescales = channels // 2 - log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( - num_timescales - 1 - ) + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (num_timescales - 1) inv_timescales = min_timescale * torch.exp( torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment ) diff --git a/GPT_SoVITS/module/core_vq.py b/GPT_SoVITS/module/core_vq.py index a5e22d6..b7dab31 100644 --- a/GPT_SoVITS/module/core_vq.py +++ b/GPT_SoVITS/module/core_vq.py @@ -30,6 +30,7 @@ # SOFTWARE. """Core vector quantization implementation.""" + import typing as tp from einops import rearrange, repeat @@ -121,9 +122,7 @@ class EuclideanCodebook(nn.Module): ): super().__init__() self.decay = decay - init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = ( - uniform_init if not kmeans_init else torch.zeros - ) + init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros embed = init_fn(codebook_size, dim) self.codebook_size = codebook_size @@ -151,9 +150,7 @@ class EuclideanCodebook(nn.Module): # broadcast_tensors(self.buffers()) def replace_(self, samples, mask): - modified_codebook = torch.where( - mask[..., None], sample_vectors(samples, self.codebook_size), self.embed - ) + modified_codebook = torch.where(mask[..., None], sample_vectors(samples, self.codebook_size), self.embed) self.embed.data.copy_(modified_codebook) def expire_codes_(self, batch_samples): @@ -174,11 +171,7 @@ class EuclideanCodebook(nn.Module): def quantize(self, x): embed = self.embed.t() - dist = -( - x.pow(2).sum(1, keepdim=True) - - 2 * x @ embed - + embed.pow(2).sum(0, keepdim=True) - ) + dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True)) embed_ind = dist.max(dim=-1).indices return embed_ind @@ -222,8 +215,7 @@ class EuclideanCodebook(nn.Module): embed_sum = x.t() @ embed_onehot ema_inplace(self.embed_avg, embed_sum.t(), self.decay) cluster_size = ( - laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) - * self.cluster_size.sum() + laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) * self.cluster_size.sum() ) embed_normalized = self.embed_avg / cluster_size.unsqueeze(1) self.embed.data.copy_(embed_normalized) @@ -264,12 +256,8 @@ class VectorQuantization(nn.Module): _codebook_dim: int = default(codebook_dim, dim) requires_projection = _codebook_dim != dim - self.project_in = ( - nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity() - ) - self.project_out = ( - nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity() - ) + self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity() + self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity() self.epsilon = epsilon self.commitment_weight = commitment_weight @@ -330,13 +318,9 @@ class ResidualVectorQuantization(nn.Module): def __init__(self, *, num_quantizers, **kwargs): super().__init__() - self.layers = nn.ModuleList( - [VectorQuantization(**kwargs) for _ in range(num_quantizers)] - ) + self.layers = nn.ModuleList([VectorQuantization(**kwargs) for _ in range(num_quantizers)]) - def forward( - self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None - ): + def forward(self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None): quantized_out = 0.0 residual = x @@ -359,9 +343,7 @@ class ResidualVectorQuantization(nn.Module): out_losses, out_indices = map(torch.stack, (all_losses, all_indices)) return quantized_out, out_indices, out_losses, out_quantized - def encode( - self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None - ) -> torch.Tensor: + def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor: residual = x all_indices = [] n_q = n_q or len(self.layers) diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 6ceca20..4a9a50c 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -1,24 +1,18 @@ -import time -import logging import os import random import traceback -import numpy as np import torch import torch.utils.data from tqdm import tqdm -from module import commons -from module.mel_processing import spectrogram_torch,spec_to_mel_torch +from module.mel_processing import spectrogram_torch, spec_to_mel_torch from text import cleaned_text_to_sequence -from utils import load_wav_to_torch, load_filepaths_and_text import torch.nn.functional as F -from functools import lru_cache -import requests -from scipy.io import wavfile -from io import BytesIO from tools.my_utils import load_audio -version = os.environ.get('version',None) + +version = os.environ.get("version", None) + + # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): """ @@ -43,7 +37,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if (len(tmp) != 4): + if len(tmp) != 4: continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -51,7 +45,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if (leng < min_num): + if leng < min_num: self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -76,7 +70,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") phoneme_ids = cleaned_text_to_sequence(phoneme, version) except Exception: print(f"{audiopath} not in self.phoneme_data !") @@ -111,7 +105,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") - if (ssl.shape[-1] != spec.shape[-1]): + if ssl.shape[-1] != spec.shape[-1]: typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -129,8 +123,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, - center=False) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) spec = torch.squeeze(spec, 0) return spec, audio_norm @@ -146,12 +141,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): return len(self.audiopaths_sid_text) def random_slice(self, ssl, wav, mel): - assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ( - "first", ssl.shape, wav.shape) + assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ("first", ssl.shape, wav.shape) len_mel = mel.shape[1] if self.val: - reference_mel = mel[:, :len_mel // 3] + reference_mel = mel[:, : len_mel // 3] return reference_mel, ssl, wav, mel dir = random.randint(0, 1) sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2)) @@ -159,20 +153,29 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): if dir == 0: reference_mel = mel[:, :sep_point] ssl = ssl[:, :, sep_point:] - wav2 = wav[:, sep_point * self.hop_length:] + wav2 = wav[:, sep_point * self.hop_length :] mel = mel[:, sep_point:] else: reference_mel = mel[:, sep_point:] ssl = ssl[:, :, :sep_point] - wav2 = wav[:, :sep_point * self.hop_length] + wav2 = wav[:, : sep_point * self.hop_length] mel = mel[:, :sep_point] assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, ( - ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir) + ssl.shape, + wav.shape, + wav2.shape, + mel.shape, + sep_point, + self.hop_length, + sep_point * self.hop_length, + dir, + ) return reference_mel, ssl, wav2, mel -class TextAudioSpeakerCollate(): - """ Zero-pads model inputs and targets - """ + + +class TextAudioSpeakerCollate: + """Zero-pads model inputs and targets""" def __init__(self, return_ids=False): self.return_ids = return_ids @@ -184,9 +187,7 @@ class TextAudioSpeakerCollate(): batch: [text_normalized, spec_normalized, wav_normalized, sid] """ # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) @@ -214,22 +215,24 @@ class TextAudioSpeakerCollate(): row = batch[ids_sorted_decreasing[i]] ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav + wav_padded[i, :, : wav.size(1)] = wav wav_lengths[i] = wav.size(1) text = row[3] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths + + class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -253,7 +256,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if (len(tmp) != 4): + if len(tmp) != 4: continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -261,7 +264,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if (leng < min_num): + if leng < min_num: self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -286,7 +289,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") phoneme_ids = cleaned_text_to_sequence(phoneme, version) except Exception: print(f"{audiopath} not in self.phoneme_data !") @@ -313,15 +316,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo self.audiopaths_sid_text = audiopaths_sid_text_new self.lengths = lengths - self.spec_min=-12 - self.spec_max=2 + self.spec_min = -12 + self.spec_max = 2 + + self.filter_length_mel = self.win_length_mel = 1024 + self.hop_length_mel = 256 + self.n_mel_channels = 100 + self.sampling_rate_mel = 24000 + self.mel_fmin = 0 + self.mel_fmax = None - self.filter_length_mel=self.win_length_mel=1024 - self.hop_length_mel=256 - self.n_mel_channels=100 - self.sampling_rate_mel=24000 - self.mel_fmin=0 - self.mel_fmax=None def norm_spec(self, x): return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 @@ -332,7 +336,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") - if (ssl.shape[-1] != spec.shape[-1]): + if ssl.shape[-1] != spec.shape[-1]: typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -347,25 +351,35 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): return (ssl, spec, mel, text) def get_audio(self, filename): - audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768 - audio=torch.FloatTensor(audio_array)#/32768 + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - audio_array24 = load_audio(filename,24000)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 - audio24=torch.FloatTensor(audio_array24)#/32768 + audio_array24 = load_audio( + filename, 24000 + ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 + audio24 = torch.FloatTensor(audio_array24) # /32768 audio_norm24 = audio24 audio_norm24 = audio_norm24.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) spec = torch.squeeze(spec, 0) - - spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False) - mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax) + spec1 = spectrogram_torch( + audio_norm24, + self.filter_length_mel, + self.sampling_rate_mel, + self.hop_length_mel, + self.win_length_mel, + center=False, + ) + mel = spec_to_mel_torch( + spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax + ) mel = torch.squeeze(mel, 0) - mel=self.norm_spec(mel) + mel = self.norm_spec(mel) # print(1111111,spec.shape,mel.shape) return spec, mel @@ -379,9 +393,10 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset): def __len__(self): return len(self.audiopaths_sid_text) -class TextAudioSpeakerCollateV3(): - """ Zero-pads model inputs and targets - """ + + +class TextAudioSpeakerCollateV3: + """Zero-pads model inputs and targets""" def __init__(self, return_ids=False): self.return_ids = return_ids @@ -392,12 +407,10 @@ class TextAudioSpeakerCollateV3(): ------ batch: [text_normalized, spec_normalized, wav_normalized, sid] """ - #ssl, spec, wav,mel, text + # ssl, spec, wav,mel, text # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) -#(ssl, spec,mel, text) + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) + # (ssl, spec,mel, text) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1)) @@ -411,7 +424,7 @@ class TextAudioSpeakerCollateV3(): # max_wav_len = max([x[2].size(1) for x in batch]) max_text_len = max([x[3].size(0) for x in batch]) - max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320 + max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320 ssl_lengths = torch.LongTensor(len(batch)) spec_lengths = torch.LongTensor(len(batch)) @@ -422,7 +435,7 @@ class TextAudioSpeakerCollateV3(): spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) - text_padded = torch.LongTensor(len(batch), max_text_len) + text_padded = torch.LongTensor(len(batch), max_text_len) # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) spec_padded.zero_() @@ -435,11 +448,11 @@ class TextAudioSpeakerCollateV3(): row = batch[ids_sorted_decreasing[i]] # ssl, spec, wav,mel, text ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) # wav = row[2] @@ -447,15 +460,17 @@ class TextAudioSpeakerCollateV3(): # wav_lengths[i] = wav.size(1) mel = row[2] - mel_padded[i, :, :mel.size(1)] = mel + mel_padded[i, :, : mel.size(1)] = mel mel_lengths[i] = mel.size(1) text = row[3] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths - return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths + return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths + + class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -479,7 +494,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): for line in lines: tmp = line.split("\t") - if (len(tmp) != 4): + if len(tmp) != 4: continue self.phoneme_data[tmp[0]] = [tmp[1]] @@ -487,7 +502,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): tmp = self.audiopaths_sid_text leng = len(tmp) min_num = 100 - if (leng < min_num): + if leng < min_num: self.audiopaths_sid_text = [] for _ in range(max(2, int(min_num / leng))): self.audiopaths_sid_text += tmp @@ -512,7 +527,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): for audiopath in tqdm(self.audiopaths_sid_text): try: phoneme = self.phoneme_data[audiopath][0] - phoneme = phoneme.split(' ') + phoneme = phoneme.split(" ") phoneme_ids = cleaned_text_to_sequence(phoneme, version) except Exception: print(f"{audiopath} not in self.phoneme_data !") @@ -539,15 +554,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo self.audiopaths_sid_text = audiopaths_sid_text_new self.lengths = lengths - self.spec_min=-12 - self.spec_max=2 + self.spec_min = -12 + self.spec_max = 2 + + self.filter_length_mel = self.win_length_mel = 1024 + self.hop_length_mel = 256 + self.n_mel_channels = 100 + self.sampling_rate_mel = 24000 + self.mel_fmin = 0 + self.mel_fmax = None - self.filter_length_mel=self.win_length_mel=1024 - self.hop_length_mel=256 - self.n_mel_channels=100 - self.sampling_rate_mel=24000 - self.mel_fmin=0 - self.mel_fmax=None def norm_spec(self, x): return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 @@ -555,10 +571,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): audiopath, phoneme_ids = audiopath_sid_text text = torch.FloatTensor(phoneme_ids) try: - spec, mel,wav = self.get_audio("%s/%s" % (self.path5, audiopath)) + spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) with torch.no_grad(): ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") - if (ssl.shape[-1] != spec.shape[-1]): + if ssl.shape[-1] != spec.shape[-1]: typee = ssl.dtype ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) ssl.requires_grad = False @@ -573,27 +589,37 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): return (ssl, spec, wav, mel, text) def get_audio(self, filename): - audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768 - audio=torch.FloatTensor(audio_array)#/32768 + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio = torch.FloatTensor(audio_array) # /32768 audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - audio_array24 = load_audio(filename,24000)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 - audio24=torch.FloatTensor(audio_array24)#/32768 + audio_array24 = load_audio( + filename, 24000 + ) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速 + audio24 = torch.FloatTensor(audio_array24) # /32768 audio_norm24 = audio24 audio_norm24 = audio_norm24.unsqueeze(0) - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) + spec = spectrogram_torch( + audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False + ) spec = torch.squeeze(spec, 0) - - spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False) - mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax) + spec1 = spectrogram_torch( + audio_norm24, + self.filter_length_mel, + self.sampling_rate_mel, + self.hop_length_mel, + self.win_length_mel, + center=False, + ) + mel = spec_to_mel_torch( + spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax + ) mel = torch.squeeze(mel, 0) - mel=self.norm_spec(mel) + mel = self.norm_spec(mel) # print(1111111,spec.shape,mel.shape) - return spec, mel,audio_norm + return spec, mel, audio_norm def get_sid(self, sid): sid = torch.LongTensor([int(sid)]) @@ -605,9 +631,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset): def __len__(self): return len(self.audiopaths_sid_text) -class TextAudioSpeakerCollateV3b(): - """ Zero-pads model inputs and targets - """ + + +class TextAudioSpeakerCollateV3b: + """Zero-pads model inputs and targets""" def __init__(self, return_ids=False): self.return_ids = return_ids @@ -618,12 +645,10 @@ class TextAudioSpeakerCollateV3b(): ------ batch: [text_normalized, spec_normalized, wav_normalized, sid] """ - #ssl, spec, wav,mel, text + # ssl, spec, wav,mel, text # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) -#(ssl, spec,mel, text) + _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True) + # (ssl, spec,mel, text) max_ssl_len = max([x[0].size(2) for x in batch]) max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1)) @@ -636,7 +661,7 @@ class TextAudioSpeakerCollateV3b(): max_spec_len = int(2 * ((max_spec_len // 2) + 1)) max_wav_len = max([x[2].size(1) for x in batch]) max_text_len = max([x[4].size(0) for x in batch]) - max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320 + max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320 ssl_lengths = torch.LongTensor(len(batch)) spec_lengths = torch.LongTensor(len(batch)) @@ -647,7 +672,7 @@ class TextAudioSpeakerCollateV3b(): spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) mel_padded = torch.FloatTensor(len(batch), batch[0][3].size(0), max_mel_len) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) - text_padded = torch.LongTensor(len(batch), max_text_len) + text_padded = torch.LongTensor(len(batch), max_text_len) wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) spec_padded.zero_() @@ -660,28 +685,40 @@ class TextAudioSpeakerCollateV3b(): row = batch[ids_sorted_decreasing[i]] # ssl, spec, wav,mel, text ssl = row[0] - ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :] ssl_lengths[i] = ssl.size(2) spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec + spec_padded[i, :, : spec.size(1)] = spec spec_lengths[i] = spec.size(1) wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav + wav_padded[i, :, : wav.size(1)] = wav wav_lengths[i] = wav.size(1) mel = row[3] - mel_padded[i, :, :mel.size(1)] = mel + mel_padded[i, :, : mel.size(1)] = mel mel_lengths[i] = mel.size(1) text = row[4] - text_padded[i, :text.size(0)] = text + text_padded[i, : text.size(0)] = text text_lengths[i] = text.size(0) - return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths + return ( + ssl_padded, + spec_padded, + mel_padded, + ssl_lengths, + spec_lengths, + text_padded, + text_lengths, + wav_padded, + wav_lengths, + mel_lengths, + ) # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths + class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): """ Maintain similar input lengths in a batch. @@ -745,12 +782,12 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): num_samples_bucket = self.num_samples_per_bucket[i] rem = num_samples_bucket - len_bucket - ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] + ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)] - ids_bucket = ids_bucket[self.rank::self.num_replicas] + ids_bucket = ids_bucket[self.rank :: self.num_replicas] for j in range(len(ids_bucket) // self.batch_size): - batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] + batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]] batches.append(batch) if self.shuffle: @@ -777,4 +814,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): return -1 def __len__(self): - return self.num_samples // self.batch_size \ No newline at end of file + return self.num_samples // self.batch_size diff --git a/GPT_SoVITS/module/losses.py b/GPT_SoVITS/module/losses.py index b23fc8c..2b642db 100644 --- a/GPT_SoVITS/module/losses.py +++ b/GPT_SoVITS/module/losses.py @@ -1,7 +1,6 @@ import math import torch -from torch.nn import functional as F def feature_loss(fmap_r, fmap_g): @@ -66,8 +65,6 @@ def mle_loss(z, m, logs, logdet, mask): torch.exp(-2 * logs) * ((z - m) ** 2) ) # neg normal likelihood w/o the constant term l = l - torch.sum(logdet) # log jacobian determinant - l = l / torch.sum( - torch.ones_like(z) * mask - ) # averaging across batch, channel and time axes + l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term return l diff --git a/GPT_SoVITS/module/mel_processing.py b/GPT_SoVITS/module/mel_processing.py index d94b045..7718b4a 100644 --- a/GPT_SoVITS/module/mel_processing.py +++ b/GPT_SoVITS/module/mel_processing.py @@ -1,16 +1,5 @@ -import math -import os -import random import torch -from torch import nn -import torch.nn.functional as F import torch.utils.data -import numpy as np -import librosa -import librosa.util as librosa_util -from librosa.util import normalize, pad_center, tiny -from scipy.signal import get_window -from scipy.io.wavfile import read from librosa.filters import mel as librosa_mel_fn MAX_WAV_VALUE = 32768.0 @@ -58,9 +47,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) dtype_device = str(y.dtype) + "_" + str(y.device) wnsize_dtype_device = str(win_size) + "_" + dtype_device if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) y = torch.nn.functional.pad( y.unsqueeze(1), @@ -90,20 +77,14 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): dtype_device = str(spec.dtype) + "_" + str(spec.device) fmax_dtype_device = str(fmax) + "_" + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=spec.dtype, device=spec.device - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) spec = torch.matmul(mel_basis[fmax_dtype_device], spec) spec = spectral_normalize_torch(spec) return spec -def mel_spectrogram_torch( - y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False -): +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): if torch.min(y) < -1.0: print("min value is ", torch.min(y)) if torch.max(y) > 1.0: @@ -114,16 +95,10 @@ def mel_spectrogram_torch( fmax_dtype_device = str(fmax) + "_" + dtype_device wnsize_dtype_device = str(win_size) + "_" + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=y.dtype, device=y.device - ) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) y = torch.nn.functional.pad( y.unsqueeze(1), diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 33bd607..aac520a 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -1,9 +1,7 @@ import warnings + warnings.filterwarnings("ignore") -import copy import math -import os -import pdb import torch from torch import nn @@ -13,16 +11,18 @@ from module import commons from module import modules from module import attentions from f5_tts.model import DiT -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from module.commons import init_weights, get_padding from module.mrte_model import MRTE from module.quantize import ResidualVectorQuantizer + # from text import symbols from text import symbols as symbols_v1 from text import symbols2 as symbols_v2 from torch.cuda.amp import autocast -import contextlib,random +import contextlib +import random class StochasticDurationPredictor(nn.Module): @@ -48,29 +48,21 @@ class StochasticDurationPredictor(nn.Module): self.flows = nn.ModuleList() self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -91,10 +83,7 @@ class StochasticDurationPredictor(nn.Module): h_w = self.post_pre(w) h_w = self.post_convs(h_w, x_mask) h_w = self.post_proj(h_w) * x_mask - e_q = ( - torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) - * x_mask - ) + e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask z_q = e_q for flow in self.post_flows: z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) @@ -102,13 +91,8 @@ class StochasticDurationPredictor(nn.Module): z_u, z1 = torch.split(z_q, [1, 1], 1) u = torch.sigmoid(z_u) * x_mask z0 = (w - u) * x_mask - logdet_tot_q += torch.sum( - (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] - ) - logq = ( - torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - - logdet_tot_q - ) + logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) + logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) @@ -117,18 +101,12 @@ class StochasticDurationPredictor(nn.Module): for flow in flows: z, logdet = flow(z, x_mask, g=x, reverse=reverse) logdet_tot = logdet_tot + logdet - nll = ( - torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - - logdet_tot - ) + nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot return nll + logq # [b] else: flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = ( - torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) - * noise_scale - ) + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale for flow in flows: z = flow(z, x_mask, g=x, reverse=reverse) z0, z1 = torch.split(z, [1, 1], 1) @@ -137,9 +115,7 @@ class StochasticDurationPredictor(nn.Module): class DurationPredictor(nn.Module): - def __init__( - self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 - ): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): super().__init__() self.in_channels = in_channels @@ -149,13 +125,9 @@ class DurationPredictor(nn.Module): self.gin_channels = gin_channels self.drop = nn.Dropout(p_dropout) - self.conv_1 = nn.Conv1d( - in_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d( - filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) @@ -190,7 +162,7 @@ class TextEncoder(nn.Module): kernel_size, p_dropout, latent_channels=192, - version = "v2", + version="v2", ): super().__init__() self.out_channels = out_channels @@ -237,26 +209,22 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, y, y_lengths, text, text_lengths, ge, speed=1,test=None): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( - y.dtype - ) + def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=None): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) y = self.ssl_proj(y * y_mask) * y_mask y = self.encoder_ssl(y * y_mask, y_mask) - text_mask = torch.unsqueeze( - commons.sequence_mask(text_lengths, text.size(1)), 1 - ).to(y.dtype) + text_mask = torch.unsqueeze(commons.sequence_mask(text_lengths, text.size(1)), 1).to(y.dtype) if test == 1: text[:, :] = 0 text = self.text_embedding(text).transpose(1, 2) text = self.encoder_text(text * text_mask, text_mask) y = self.mrte(y, y_mask, text, text_mask, ge) y = self.encoder2(y * y_mask, y_mask) - if(speed!=1): - y = F.interpolate(y, size=int(y.shape[-1] / speed)+1, mode="linear") + if speed != 1: + y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear") y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest") stats = self.proj(y) * y_mask m, logs = torch.split(stats, self.out_channels, dim=1) @@ -360,9 +328,7 @@ class PosteriorEncoder(nn.Module): def forward(self, x, x_lengths, g=None): if g != None: g = g.detach() - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -372,14 +338,9 @@ class PosteriorEncoder(nn.Module): class Encoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): + def __init__( + self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0 + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -394,7 +355,7 @@ class Encoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels, 1) def forward(self, x, x_lengths, g=None): - if(g!=None): + if g != None: g = g.detach() x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask @@ -402,6 +363,7 @@ class Encoder(nn.Module): stats = self.proj(x) * x_mask return stats, x_mask + class WNEncoder(nn.Module): def __init__( self, @@ -434,9 +396,7 @@ class WNEncoder(nn.Module): self.norm = modules.LayerNorm(out_channels) def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) out = self.proj(x) * x_mask @@ -459,9 +419,7 @@ class Generator(torch.nn.Module): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 self.ups = nn.ModuleList() @@ -481,9 +439,7 @@ class Generator(torch.nn.Module): self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) @@ -636,9 +592,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): @@ -738,10 +692,7 @@ class Quantizer(torch.nn.Module): super(Quantizer, self).__init__() assert embed_dim % n_code_groups == 0 self.quantizer_modules = nn.ModuleList( - [ - Quantizer_module(n_codes, embed_dim // n_code_groups) - for _ in range(n_code_groups) - ] + [Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups)] ) self.n_code_groups = n_code_groups self.embed_dim = embed_dim @@ -759,9 +710,7 @@ class Quantizer(torch.nn.Module): z_q.append(_z_q) min_indicies.append(_min_indicies) # B * T, z_q = torch.cat(z_q, -1).reshape(xin.shape) - loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( - (z_q - xin.detach()) ** 2 - ) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2) z_q = xin + (z_q - xin).detach() z_q = z_q.transpose(1, 2) codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) @@ -801,13 +750,9 @@ class CodePredictor(nn.Module): self.p_dropout = p_dropout self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) - self.ref_enc = modules.MelStyleEncoder( - ssl_dim, style_vector_dim=hidden_channels - ) + self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) + self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) self.n_q = n_q @@ -820,9 +765,7 @@ class CodePredictor(nn.Module): x = x + g x = self.encoder(x * x_mask, x_mask) x = self.out_proj(x * x_mask) * x_mask - logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( - 2, 3 - ) + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3) target = codes[1:].transpose(0, 1) if not infer: logits = logits.reshape(-1, self.dims) @@ -870,8 +813,8 @@ class SynthesizerTrn(nn.Module): use_sdp=True, semantic_frame_rate=None, freeze_quantizer=None, - version = "v2", - **kwargs + version="v2", + **kwargs, ): super().__init__() self.spec_channels = spec_channels @@ -902,7 +845,7 @@ class SynthesizerTrn(nn.Module): n_layers, kernel_size, p_dropout, - version = version, + version=version, ) self.dec = Generator( inter_channels, @@ -923,12 +866,10 @@ class SynthesizerTrn(nn.Module): 16, gin_channels=gin_channels, ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels - ) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) # self.version=os.environ.get("version","v1") - if(self.version=="v1"): + if self.version == "v1": self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels) else: self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) @@ -945,13 +886,11 @@ class SynthesizerTrn(nn.Module): self.freeze_quantizer = freeze_quantizer def forward(self, ssl, y, y_lengths, text, text_lengths): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( - y.dtype - ) - if(self.version=="v1"): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + if self.version == "v1": ge = self.ref_enc(y * y_mask, y_mask) else: - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) with autocast(enabled=False): maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext() with maybe_no_grad: @@ -959,24 +898,16 @@ class SynthesizerTrn(nn.Module): self.ssl_proj.eval() self.quantizer.eval() ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, quantized_list = self.quantizer( - ssl, layers=[0] - ) + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0]) if self.semantic_frame_rate == "25hz": - quantized = F.interpolate( - quantized, size=int(quantized.shape[-1] * 2), mode="nearest" - ) + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") - x, m_p, logs_p, y_mask = self.enc_p( - quantized, y_lengths, text, text_lengths, ge - ) + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) z_p = self.flow(z, y_mask, g=ge) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) + z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=ge) return ( o, @@ -989,24 +920,18 @@ class SynthesizerTrn(nn.Module): ) def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, noise_scale=0.5): - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( - y.dtype - ) - if(self.version=="v1"): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) + if self.version == "v1": ge = self.ref_enc(y * y_mask, y_mask) else: - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) ssl = self.ssl_proj(ssl) quantized, codes, commit_loss, _ = self.quantizer(ssl, layers=[0]) if self.semantic_frame_rate == "25hz": - quantized = F.interpolate( - quantized, size=int(quantized.shape[-1] * 2), mode="nearest" - ) + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") - x, m_p, logs_p, y_mask = self.enc_p( - quantized, y_lengths, text, text_lengths, ge, test=test - ) + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, test=test) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) @@ -1015,39 +940,34 @@ class SynthesizerTrn(nn.Module): return o, y_mask, (z, z_p, m_p, logs_p) @torch.no_grad() - def decode(self, codes, text, refer, noise_scale=0.5,speed=1): + def decode(self, codes, text, refer, noise_scale=0.5, speed=1): def get_ge(refer): ge = None if refer is not None: refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) - refer_mask = torch.unsqueeze( - commons.sequence_mask(refer_lengths, refer.size(2)), 1 - ).to(refer.dtype) - if (self.version == "v1"): + refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) + if self.version == "v1": ge = self.ref_enc(refer * refer_mask, refer_mask) else: ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) return ge - if(type(refer)==list): - ges=[] + + if type(refer) == list: + ges = [] for _refer in refer: - ge=get_ge(_refer) + ge = get_ge(_refer) ges.append(ge) - ge=torch.stack(ges,0).mean(0) + ge = torch.stack(ges, 0).mean(0) else: - ge=get_ge(refer) + ge = get_ge(refer) y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) if self.semantic_frame_rate == "25hz": - quantized = F.interpolate( - quantized, size=int(quantized.shape[-1] * 2), mode="nearest" - ) - x, m_p, logs_p, y_mask = self.enc_p( - quantized, y_lengths, text, text_lengths, ge,speed - ) + quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest") + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) @@ -1059,11 +979,10 @@ class SynthesizerTrn(nn.Module): ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) return codes.transpose(0, 1) + + class CFM(torch.nn.Module): - def __init__( - self, - in_channels,dit - ): + def __init__(self, in_channels, dit): super().__init__() self.sigma_min = 1e-6 @@ -1077,41 +996,54 @@ class CFM(torch.nn.Module): def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0): """Forward diffusion""" B, T = mu.size(0), mu.size(1) - x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) * temperature + x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature prompt_len = prompt.size(-1) - prompt_x = torch.zeros_like(x,dtype=mu.dtype) + prompt_x = torch.zeros_like(x, dtype=mu.dtype) prompt_x[..., :prompt_len] = prompt[..., :prompt_len] x[..., :prompt_len] = 0 - mu=mu.transpose(2,1) + mu = mu.transpose(2, 1) t = 0 d = 1 / n_timesteps for j in range(n_timesteps): - t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t - d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d + t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t + d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d # v_pred = model(x, t_tensor, d_tensor, **extra_args) - v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu, use_grad_ckpt=False,drop_audio_cond=False,drop_text=False).transpose(2, 1) - if inference_cfg_rate>1e-5: - neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1) - v_pred=v_pred+(v_pred-neg)*inference_cfg_rate + v_pred = self.estimator( + x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False + ).transpose(2, 1) + if inference_cfg_rate > 1e-5: + neg = self.estimator( + x, + prompt_x, + x_lens, + t_tensor, + d_tensor, + mu, + use_grad_ckpt=False, + drop_audio_cond=True, + drop_text=True, + ).transpose(2, 1) + v_pred = v_pred + (v_pred - neg) * inference_cfg_rate x = x + d * v_pred t = t + d x[:, :, :prompt_len] = 0 return x + def forward(self, x1, x_lens, prompt_lens, mu, use_grad_ckpt): b, _, t = x1.shape t = torch.rand([b], device=mu.device, dtype=x1.dtype) - x0 = torch.randn_like(x1,device=mu.device) + x0 = torch.randn_like(x1, device=mu.device) vt = x1 - x0 xt = x0 + t[:, None, None] * vt - dt = torch.zeros_like(t,device=mu.device) + dt = torch.zeros_like(t, device=mu.device) prompt = torch.zeros_like(x1) for i in range(b): - prompt[i, :, :prompt_lens[i]] = x1[i, :, :prompt_lens[i]] - xt[i, :, :prompt_lens[i]] = 0 - gailv=0.3# if ttime()>1736250488 else 0.1 + prompt[i, :, : prompt_lens[i]] = x1[i, :, : prompt_lens[i]] + xt[i, :, : prompt_lens[i]] = 0 + gailv = 0.3 # if ttime()>1736250488 else 0.1 if random.random() < gailv: base = torch.randint(2, 8, (t.shape[0],), device=mu.device) - d = 1/torch.pow(2, base) + d = 1 / torch.pow(2, base) d_input = d.clone() d_input[d_input < 1e-2] = 0 # with torch.no_grad(): @@ -1119,52 +1051,55 @@ class CFM(torch.nn.Module): # v_pred_1 = self.diffusion(xt, t, d_input, cond=conditioning).detach() x_mid = xt + d[:, None, None] * v_pred_1 # v_pred_2 = self.diffusion(x_mid, t+d, d_input, cond=conditioning).detach() - v_pred_2 = self.estimator(x_mid, prompt, x_lens, t+d, d_input, mu, use_grad_ckpt).transpose(2, 1).detach() + v_pred_2 = self.estimator(x_mid, prompt, x_lens, t + d, d_input, mu, use_grad_ckpt).transpose(2, 1).detach() vt = (v_pred_1 + v_pred_2) / 2 vt = vt.detach() - dt = 2*d + dt = 2 * d - vt_pred = self.estimator(xt, prompt, x_lens, t,dt, mu, use_grad_ckpt).transpose(2,1) + vt_pred = self.estimator(xt, prompt, x_lens, t, dt, mu, use_grad_ckpt).transpose(2, 1) loss = 0 for i in range(b): - loss += self.criterion(vt_pred[i, :, prompt_lens[i]:x_lens[i]], vt[i, :, prompt_lens[i]:x_lens[i]]) + loss += self.criterion(vt_pred[i, :, prompt_lens[i] : x_lens[i]], vt[i, :, prompt_lens[i] : x_lens[i]]) loss /= b return loss + def set_no_grad(net_g): for name, param in net_g.named_parameters(): - param.requires_grad=False + param.requires_grad = False + class SynthesizerTrnV3(nn.Module): """ Synthesizer for Training """ - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - version="v3", - **kwargs): - + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + version="v3", + **kwargs, + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -1185,132 +1120,133 @@ class SynthesizerTrnV3(nn.Module): self.gin_channels = gin_channels self.version = version - self.model_dim=512 + self.model_dim = 512 self.use_sdp = use_sdp - self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback - self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback # self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, # upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) # self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, # gin_channels=gin_channels) # self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) - self.freeze_quantizer=freeze_quantizer - inter_channels2=512 - self.bridge=nn.Sequential( - nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), - nn.LeakyReLU() - ) - self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) - self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) - self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim - if self.freeze_quantizer==True: + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) + self.freeze_quantizer = freeze_quantizer + inter_channels2 = 512 + self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU()) + self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels) + self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1) + self.cfm = CFM( + 100, + DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)), + ) # text_dim is condition feature dim + if self.freeze_quantizer == True: set_no_grad(self.ssl_proj) set_no_grad(self.quantizer) set_no_grad(self.enc_p) - def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths, use_grad_ckpt):#ssl_lengths no need now + def forward( + self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths, use_grad_ckpt + ): # ssl_lengths no need now with autocast(enabled=False): y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext() with maybe_no_grad: if self.freeze_quantizer: - self.ssl_proj.eval()# + self.ssl_proj.eval() # self.quantizer.eval() self.enc_p.eval() ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, quantized_list = self.quantizer( - ssl, layers=[0] - ) - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0]) + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT - fea, y_mask_ = self.wns1(fea, mel_lengths, ge)##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate. - B=ssl.shape[0] - prompt_len_max = mel_lengths*2/3 + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT + fea, y_mask_ = self.wns1( + fea, mel_lengths, ge + ) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate. + B = ssl.shape[0] + prompt_len_max = mel_lengths * 2 / 3 prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long) - minn=min(mel.shape[-1],fea.shape[-1]) - mel=mel[:,:,:minn] - fea=fea[:,:,:minn] - cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt) + minn = min(mel.shape[-1], fea.shape[-1]) + mel = mel[:, :, :minn] + fea = fea[:, :, :minn] + cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt) return cfm_loss @torch.no_grad() - def decode_encp(self, codes,text, refer,ge=None,speed=1): + def decode_encp(self, codes, text, refer, ge=None, speed=1): # print(2333333,refer.shape) # ge=None - if(ge==None): + if ge == None: refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) - ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) - y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device) - if speed==1: - sizee=int(codes.size(2)*2.5*1.5) + ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) + y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device) + if speed == 1: + sizee = int(codes.size(2) * 2.5 * 1.5) else: - sizee=int(codes.size(2)*2.5*1.5/speed)+1 + sizee = int(codes.size(2) * 2.5 * 1.5 / speed) + 1 y_lengths1 = torch.LongTensor([sizee]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT - x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge,speed) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT + x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed) + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) - return fea,ge + return fea, ge def extract_latent(self, x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) + return codes.transpose(0, 1) + class SynthesizerTrnV3b(nn.Module): """ Synthesizer for Training """ - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - **kwargs): - + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + **kwargs, + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -1330,47 +1266,52 @@ class SynthesizerTrnV3b(nn.Module): self.n_speakers = n_speakers self.gin_channels = gin_channels - self.model_dim=512 + self.model_dim = 512 self.use_sdp = use_sdp - self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback - self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback - self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, - gin_channels=gin_channels) + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels + ) self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) - self.freeze_quantizer=freeze_quantizer + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) + self.freeze_quantizer = freeze_quantizer - inter_channels2=512 - self.bridge=nn.Sequential( - nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), - nn.LeakyReLU() - ) - self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) - self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) - self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim + inter_channels2 = 512 + self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU()) + self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels) + self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1) + self.cfm = CFM( + 100, + DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)), + ) # text_dim is condition feature dim - - def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths):#ssl_lengths no need now + def forward(self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths): # ssl_lengths no need now with autocast(enabled=False): y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype) - ge = self.ref_enc(y[:,:704] * y_mask, y_mask) + ge = self.ref_enc(y[:, :704] * y_mask, y_mask) # ge = self.ref_enc(y * y_mask, y_mask)#change back, new spec setting is whole 24k # ge=None maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext() @@ -1379,51 +1320,59 @@ class SynthesizerTrnV3b(nn.Module): self.ssl_proj.eval() self.quantizer.eval() ssl = self.ssl_proj(ssl) - quantized, codes, commit_loss, quantized_list = self.quantizer( - ssl, layers=[0] - ) - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0]) + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) z_p = self.flow(z, y_mask, g=ge) z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=ge) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT fea, y_mask_ = self.wns1(fea, mel_lengths, ge) learned_mel = self.linear_mel(fea) - B=ssl.shape[0] - prompt_len_max = mel_lengths*2/3 - prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long)# - minn=min(mel.shape[-1],fea.shape[-1]) - mel=mel[:,:,:minn] - fea=fea[:,:,:minn] - cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea)#fea==cond,y_lengths==target_mel_lengths#ge not need - return commit_loss,cfm_loss,F.mse_loss(learned_mel, mel),o, ids_slice, y_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), quantized + B = ssl.shape[0] + prompt_len_max = mel_lengths * 2 / 3 + prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long) # + minn = min(mel.shape[-1], fea.shape[-1]) + mel = mel[:, :, :minn] + fea = fea[:, :, :minn] + cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea) # fea==cond,y_lengths==target_mel_lengths#ge not need + return ( + commit_loss, + cfm_loss, + F.mse_loss(learned_mel, mel), + o, + ids_slice, + y_mask, + y_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + quantized, + ) @torch.no_grad() - def decode_encp(self, codes,text, refer,ge=None): + def decode_encp(self, codes, text, refer, ge=None): # print(2333333,refer.shape) # ge=None - if(ge==None): + if ge == None: refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) - ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) - y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device) - y_lengths1 = torch.LongTensor([int(codes.size(2)*2.5*1.5)]).to(codes.device) + ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) + y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device) + y_lengths1 = torch.LongTensor([int(codes.size(2) * 2.5 * 1.5)]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) - return fea,ge + return fea, ge def extract_latent(self, x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) + return codes.transpose(0, 1) diff --git a/GPT_SoVITS/module/models_onnx.py b/GPT_SoVITS/module/models_onnx.py index 1c24056..8a3ad13 100644 --- a/GPT_SoVITS/module/models_onnx.py +++ b/GPT_SoVITS/module/models_onnx.py @@ -1,4 +1,3 @@ -import copy import math from typing import Optional import torch @@ -11,14 +10,14 @@ from module import attentions_onnx as attentions from f5_tts.model import DiT -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from module.commons import init_weights, get_padding from module.quantize import ResidualVectorQuantizer + # from text import symbols from text import symbols as symbols_v1 from text import symbols2 as symbols_v2 -from torch.cuda.amp import autocast class StochasticDurationPredictor(nn.Module): @@ -44,29 +43,21 @@ class StochasticDurationPredictor(nn.Module): self.flows = nn.ModuleList() self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append( - modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) - ) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = modules.DDSConv( - filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout - ) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -87,10 +78,7 @@ class StochasticDurationPredictor(nn.Module): h_w = self.post_pre(w) h_w = self.post_convs(h_w, x_mask) h_w = self.post_proj(h_w) * x_mask - e_q = ( - torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) - * x_mask - ) + e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask z_q = e_q for flow in self.post_flows: z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) @@ -98,13 +86,8 @@ class StochasticDurationPredictor(nn.Module): z_u, z1 = torch.split(z_q, [1, 1], 1) u = torch.sigmoid(z_u) * x_mask z0 = (w - u) * x_mask - logdet_tot_q += torch.sum( - (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] - ) - logq = ( - torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - - logdet_tot_q - ) + logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) + logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) @@ -113,18 +96,12 @@ class StochasticDurationPredictor(nn.Module): for flow in flows: z, logdet = flow(z, x_mask, g=x, reverse=reverse) logdet_tot = logdet_tot + logdet - nll = ( - torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - - logdet_tot - ) + nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot return nll + logq # [b] else: flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = ( - torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) - * noise_scale - ) + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale for flow in flows: z = flow(z, x_mask, g=x, reverse=reverse) z0, z1 = torch.split(z, [1, 1], 1) @@ -133,9 +110,7 @@ class StochasticDurationPredictor(nn.Module): class DurationPredictor(nn.Module): - def __init__( - self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 - ): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): super().__init__() self.in_channels = in_channels @@ -145,13 +120,9 @@ class DurationPredictor(nn.Module): self.gin_channels = gin_channels self.drop = nn.Dropout(p_dropout) - self.conv_1 = nn.Conv1d( - in_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d( - filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) @@ -234,7 +205,7 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, y, text, ge, speed=1): - y_mask = torch.ones_like(y[:1,:1,:]) + y_mask = torch.ones_like(y[:1, :1, :]) y = self.ssl_proj(y * y_mask) * y_mask y = self.encoder_ssl(y * y_mask, y_mask) @@ -246,8 +217,8 @@ class TextEncoder(nn.Module): y = self.mrte(y, y_mask, text, text_mask, ge) y = self.encoder2(y * y_mask, y_mask) - if(speed!=1): - y = F.interpolate(y, size=int(y.shape[-1] / speed)+1, mode="linear") + if speed != 1: + y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear") y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest") stats = self.proj(y) * y_mask @@ -333,9 +304,7 @@ class PosteriorEncoder(nn.Module): def forward(self, x, x_lengths, g=None): if g != None: g = g.detach() - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -345,14 +314,9 @@ class PosteriorEncoder(nn.Module): class Encoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): + def __init__( + self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0 + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -367,7 +331,7 @@ class Encoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels, 1) def forward(self, x, x_lengths, g=None): - if(g!=None): + if g != None: g = g.detach() x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask @@ -375,6 +339,7 @@ class Encoder(nn.Module): stats = self.proj(x) * x_mask return stats, x_mask + class WNEncoder(nn.Module): def __init__( self, @@ -407,9 +372,7 @@ class WNEncoder(nn.Module): self.norm = modules.LayerNorm(out_channels) def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) out = self.proj(x) * x_mask @@ -432,9 +395,7 @@ class Generator(torch.nn.Module): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 self.ups = nn.ModuleList() @@ -454,9 +415,7 @@ class Generator(torch.nn.Module): self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) @@ -465,7 +424,7 @@ class Generator(torch.nn.Module): if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - def forward(self, x, g:Optional[torch.Tensor]=None): + def forward(self, x, g: Optional[torch.Tensor] = None): x = self.conv_pre(x) if g is not None: x = x + self.cond(g) @@ -609,9 +568,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): @@ -711,10 +668,7 @@ class Quantizer(torch.nn.Module): super(Quantizer, self).__init__() assert embed_dim % n_code_groups == 0 self.quantizer_modules = nn.ModuleList( - [ - Quantizer_module(n_codes, embed_dim // n_code_groups) - for _ in range(n_code_groups) - ] + [Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups)] ) self.n_code_groups = n_code_groups self.embed_dim = embed_dim @@ -732,9 +686,7 @@ class Quantizer(torch.nn.Module): z_q.append(_z_q) min_indicies.append(_min_indicies) # B * T, z_q = torch.cat(z_q, -1).reshape(xin.shape) - loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( - (z_q - xin.detach()) ** 2 - ) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2) z_q = xin + (z_q - xin).detach() z_q = z_q.transpose(1, 2) codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) @@ -774,13 +726,9 @@ class CodePredictor(nn.Module): self.p_dropout = p_dropout self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) - self.ref_enc = modules.MelStyleEncoder( - ssl_dim, style_vector_dim=hidden_channels - ) + self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) + self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout) self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) self.n_q = n_q @@ -793,9 +741,7 @@ class CodePredictor(nn.Module): x = x + g x = self.encoder(x * x_mask, x_mask) x = self.out_proj(x * x_mask) * x_mask - logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( - 2, 3 - ) + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3) target = codes[1:].transpose(0, 1) if not infer: logits = logits.reshape(-1, self.dims) @@ -844,7 +790,7 @@ class SynthesizerTrn(nn.Module): semantic_frame_rate=None, freeze_quantizer=None, version="v2", - **kwargs + **kwargs, ): super().__init__() self.spec_channels = spec_channels @@ -896,9 +842,7 @@ class SynthesizerTrn(nn.Module): # 16, # gin_channels=gin_channels, # ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels - ) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) # self.version=os.environ.get("version","v1") if self.version == "v1": @@ -923,9 +867,9 @@ class SynthesizerTrn(nn.Module): # self.enc_p.encoder_text.requires_grad_(False) # self.enc_p.mrte.requires_grad_(False) - def forward(self, codes, text, refer,noise_scale=0.5, speed=1): - refer_mask = torch.ones_like(refer[:1,:1,:]) - if (self.version == "v1"): + def forward(self, codes, text, refer, noise_scale=0.5, speed=1): + refer_mask = torch.ones_like(refer[:1, :1, :]) + if self.version == "v1": ge = self.ref_enc(refer * refer_mask, refer_mask) else: ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) @@ -935,10 +879,8 @@ class SynthesizerTrn(nn.Module): dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0) quantized = dquantized.contiguous().view(1, self.ssl_dim, -1) - x, m_p, logs_p, y_mask = self.enc_p( - quantized, text, ge, speed - ) - + x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed) + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=ge, reverse=True) @@ -951,11 +893,9 @@ class SynthesizerTrn(nn.Module): quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) return codes.transpose(0, 1) + class CFM(torch.nn.Module): - def __init__( - self, - in_channels,dit - ): + def __init__(self, in_channels, dit): super().__init__() # self.sigma_min = 1e-6 @@ -965,27 +905,34 @@ class CFM(torch.nn.Module): # self.criterion = torch.nn.MSELoss() - def forward(self, mu:torch.Tensor, x_lens:torch.LongTensor, prompt:torch.Tensor, n_timesteps:torch.LongTensor, temperature:float=1.0): + def forward( + self, + mu: torch.Tensor, + x_lens: torch.LongTensor, + prompt: torch.Tensor, + n_timesteps: torch.LongTensor, + temperature: float = 1.0, + ): """Forward diffusion""" B, T = mu.size(0), mu.size(1) - x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) + x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype) ntimesteps = int(n_timesteps) prompt_len = prompt.size(-1) - prompt_x = torch.zeros_like(x,dtype=mu.dtype) + prompt_x = torch.zeros_like(x, dtype=mu.dtype) prompt_x[..., :prompt_len] = prompt[..., :prompt_len] x[..., :prompt_len] = 0.0 - mu=mu.transpose(2,1) - t = torch.tensor(0.0,dtype=x.dtype,device=x.device) - d = torch.tensor(1.0/ntimesteps,dtype=x.dtype,device=x.device) - d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d + mu = mu.transpose(2, 1) + t = torch.tensor(0.0, dtype=x.dtype, device=x.device) + d = torch.tensor(1.0 / ntimesteps, dtype=x.dtype, device=x.device) + d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d for j in range(ntimesteps): - t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t + t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t # d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d # v_pred = model(x, t_tensor, d_tensor, **extra_args) - v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu).transpose(2, 1) + v_pred = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu).transpose(2, 1) # if inference_cfg_rate>1e-5: # neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1) # v_pred=v_pred+(v_pred-neg)*inference_cfg_rate @@ -997,47 +944,51 @@ class CFM(torch.nn.Module): def set_no_grad(net_g): for name, param in net_g.named_parameters(): - param.requires_grad=False + param.requires_grad = False + @torch.jit.script_if_tracing def compile_codes_length(codes): y_lengths1 = torch.LongTensor([codes.size(2)]).to(codes.device) return y_lengths1 * 2.5 * 1.5 + @torch.jit.script_if_tracing def compile_ref_length(refer): refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) return refer_lengths + class SynthesizerTrnV3(nn.Module): """ Synthesizer for Training """ - def __init__(self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - semantic_frame_rate=None, - freeze_quantizer=None, - version="v3", - **kwargs): - + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + version="v3", + **kwargs, + ): super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -1058,41 +1009,38 @@ class SynthesizerTrnV3(nn.Module): self.gin_channels = gin_channels self.version = version - self.model_dim=512 + self.model_dim = 512 self.use_sdp = use_sdp - self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) # self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback - self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback + self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback # self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, # upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) # self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, # gin_channels=gin_channels) # self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - ssl_dim = 768 - assert semantic_frame_rate in ['25hz', "50hz"] + assert semantic_frame_rate in ["25hz", "50hz"] self.semantic_frame_rate = semantic_frame_rate - if semantic_frame_rate == '25hz': + if semantic_frame_rate == "25hz": self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) else: self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) - self.quantizer = ResidualVectorQuantizer( - dimension=ssl_dim, - n_q=1, - bins=1024 - ) + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) freeze_quantizer - inter_channels2=512 - self.bridge=nn.Sequential( - nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), - nn.LeakyReLU() - ) - self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels) - self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1) - self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim - if freeze_quantizer==True: + inter_channels2 = 512 + self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU()) + self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels) + self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1) + self.cfm = CFM( + 100, + DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)), + ) # text_dim is condition feature dim + if freeze_quantizer == True: set_no_grad(self.ssl_proj) set_no_grad(self.quantizer) set_no_grad(self.enc_p) @@ -1100,24 +1048,23 @@ class SynthesizerTrnV3(nn.Module): def create_ge(self, refer): refer_lengths = compile_ref_length(refer) refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype) - ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask) + ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) return ge - def forward(self, codes, text,ge,speed=1): + def forward(self, codes, text, ge, speed=1): + y_lengths1 = compile_codes_length(codes) - y_lengths1=compile_codes_length(codes) - quantized = self.quantizer.decode(codes) - if self.semantic_frame_rate == '25hz': - quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT - x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge,speed) - fea=self.bridge(x) - fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT + x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed) + fea = self.bridge(x) + fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) return fea def extract_latent(self, x): - ssl = self.ssl_proj(x) + ssl = self.ssl_proj(x) quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) - return codes.transpose(0,1) \ No newline at end of file + return codes.transpose(0, 1) diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py index f444745..7493f0b 100644 --- a/GPT_SoVITS/module/modules.py +++ b/GPT_SoVITS/module/modules.py @@ -52,11 +52,7 @@ class ConvReluNorm(nn.Module): self.conv_layers = nn.ModuleList() self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) self.norm_layers.append(LayerNorm(hidden_channels)) self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) for _ in range(n_layers - 1): @@ -156,9 +152,7 @@ class WN(torch.nn.Module): self.drop = nn.Dropout(p_dropout) if gin_channels != 0: - cond_layer = torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) + cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") for i in range(n_layers): @@ -479,9 +473,7 @@ class ConvFlow(nn.Module): self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) - self.proj = nn.Conv1d( - filter_channels, self.half_channels * (num_bins * 3 - 1), 1 - ) + self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) self.proj.weight.data.zero_() self.proj.bias.data.zero_() @@ -495,9 +487,7 @@ class ConvFlow(nn.Module): h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels) unnormalized_derivatives = h[..., 2 * self.num_bins :] x1, logabsdet = piecewise_rational_quadratic_transform( @@ -616,9 +606,7 @@ class MultiHeadAttention(nn.Module): self.w_ks = nn.Linear(d_model, n_head * d_k) self.w_vs = nn.Linear(d_model, n_head * d_v) - self.attention = ScaledDotProductAttention( - temperature=np.power(d_model, 0.5), dropout=dropout - ) + self.attention = ScaledDotProductAttention(temperature=np.power(d_model, 0.5), dropout=dropout) self.fc = nn.Linear(n_head * d_v, d_model) self.dropout = nn.Dropout(dropout) @@ -649,9 +637,7 @@ class MultiHeadAttention(nn.Module): output, attn = self.attention(q, k, v, mask=slf_mask) output = output.view(n_head, sz_b, len_x, d_v) - output = ( - output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) - ) # b x lq x (n*dv) + output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) # b x lq x (n*dv) output = self.fc(output) @@ -741,9 +727,7 @@ class MelStyleEncoder(nn.Module): if mask is not None: mask = (mask.int() == 0).squeeze(1) max_len = x.shape[1] - slf_attn_mask = ( - mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None - ) + slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None # spectral x = self.spectral(x) @@ -785,9 +769,7 @@ class MelStyleEncoderVAE(nn.Module): mu = self.fc1(enc_out) logvar = self.fc2(enc_out) posterior = D.Normal(mu, torch.exp(logvar)) - kl_divergence = D.kl_divergence( - posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar)) - ) + kl_divergence = D.kl_divergence(posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar))) loss_kl = kl_divergence.mean() z = posterior.rsample() @@ -825,9 +807,7 @@ class ActNorm(nn.Module): def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs): if x_mask is None: - x_mask = torch.ones(x.size(0), 1, x.size(2)).to( - device=x.device, dtype=x.dtype - ) + x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype) x_len = torch.sum(x_mask, [1, 2]) if not self.initialized: self.initialize(x, x_mask) @@ -856,9 +836,7 @@ class ActNorm(nn.Module): v = m_sq - (m**2) logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) - bias_init = ( - (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) - ) + bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) self.bias.data.copy_(bias_init) @@ -873,9 +851,7 @@ class InvConvNear(nn.Module): self.n_split = n_split self.no_jacobian = no_jacobian - w_init = torch.linalg.qr( - torch.FloatTensor(self.n_split, self.n_split).normal_() - )[0] + w_init = torch.linalg.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] self.weight = nn.Parameter(w_init) @@ -890,11 +866,7 @@ class InvConvNear(nn.Module): x_len = torch.sum(x_mask, [1, 2]) x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) - x = ( - x.permute(0, 1, 3, 2, 4) - .contiguous() - .view(b, self.n_split, c // self.n_split, t) - ) + x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t) if reverse: if hasattr(self, "weight_inv"): diff --git a/GPT_SoVITS/module/mrte_model.py b/GPT_SoVITS/module/mrte_model.py index b0cd242..e889b7e 100644 --- a/GPT_SoVITS/module/mrte_model.py +++ b/GPT_SoVITS/module/mrte_model.py @@ -31,32 +31,15 @@ class MRTE(nn.Module): text_enc = self.text_pre(text * text_mask) if test != None: if test == 0: - x = ( - self.cross_attention( - ssl_enc * ssl_mask, text_enc * text_mask, attn_mask - ) - + ssl_enc - + ge - ) + x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge elif test == 1: x = ssl_enc + ge elif test == 2: - x = ( - self.cross_attention( - ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask - ) - + ge - ) + x = self.cross_attention(ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask) + ge else: raise ValueError("test should be 0,1,2") else: - x = ( - self.cross_attention( - ssl_enc * ssl_mask, text_enc * text_mask, attn_mask - ) - + ssl_enc - + ge - ) + x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge x = self.c_post(x * ssl_mask) return x @@ -70,9 +53,7 @@ class SpeakerEncoder(torch.nn.Module): model_embedding_size=256, ): super(SpeakerEncoder, self).__init__() - self.lstm = nn.LSTM( - mel_n_channels, model_hidden_size, model_num_layers, batch_first=True - ) + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() diff --git a/GPT_SoVITS/module/quantize.py b/GPT_SoVITS/module/quantize.py index f9a5c63..0afed83 100644 --- a/GPT_SoVITS/module/quantize.py +++ b/GPT_SoVITS/module/quantize.py @@ -7,7 +7,6 @@ """Residual vector quantizer implementation.""" from dataclasses import dataclass, field -import math import typing as tp import torch @@ -88,14 +87,10 @@ class ResidualVectorQuantizer(nn.Module): raise ValueError( f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." ) - quantized, codes, commit_loss, quantized_list = self.vq( - x, n_q=n_q, layers=layers - ) + quantized, codes, commit_loss, quantized_list = self.vq(x, n_q=n_q, layers=layers) return quantized, codes, torch.mean(commit_loss), quantized_list - def encode( - self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None - ) -> torch.Tensor: + def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor: """Encode a given input tensor with the specified sample rate at the given bandwidth. The RVQ encode method sets the appropriate number of quantizer to use and returns indices for each quantizer. diff --git a/GPT_SoVITS/module/transforms.py b/GPT_SoVITS/module/transforms.py index a11f799..16b5498 100644 --- a/GPT_SoVITS/module/transforms.py +++ b/GPT_SoVITS/module/transforms.py @@ -37,7 +37,7 @@ def piecewise_rational_quadratic_transform( min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative, - **spline_kwargs + **spline_kwargs, ) return outputs, logabsdet @@ -175,8 +175,7 @@ def rational_quadratic_spline( theta_one_minus_theta = root * (1 - root) denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta ) derivative_numerator = input_delta.pow(2) * ( input_derivatives_plus_one * root.pow(2) @@ -190,12 +189,9 @@ def rational_quadratic_spline( theta = (inputs - input_cumwidths) / input_bin_widths theta_one_minus_theta = theta * (1 - theta) - numerator = input_heights * ( - input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta - ) + numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta) denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta ) outputs = input_cumheights + numerator / denominator diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index 43aac19..fd68013 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -1,23 +1,22 @@ -from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 -from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule import torch import torchaudio -from torch import nn +from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule from feature_extractor import cnhubert +from module.models_onnx import SynthesizerTrn, symbols_v1, symbols_v2 +from torch import nn cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" cnhubert.cnhubert_base_path = cnhubert_base_path ssl_model = cnhubert.get_model() -from text import cleaned_text_to_sequence -import soundfile -from tools.my_utils import load_audio -import os import json +import os + +import soundfile +from text import cleaned_text_to_sequence + def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - hann_window = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) + hann_window = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) y = torch.nn.functional.pad( y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), @@ -73,7 +72,7 @@ class T2SEncoder(nn.Module): super().__init__() self.encoder = t2s.onnx_encoder self.vits = vits - + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): codes = self.vits.extract_latent(ssl_content) prompt_semantic = codes[0, 0] @@ -102,22 +101,22 @@ class T2SModel(nn.Module): self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model) self.first_stage_decoder = self.t2s_model.first_stage_decoder self.stage_decoder = self.t2s_model.stage_decoder - #self.t2s_model = torch.jit.script(self.t2s_model) + # self.t2s_model = torch.jit.script(self.t2s_model) def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): early_stop_num = self.t2s_model.early_stop_num - #[1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] + # [1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) prefix_len = prompts.shape[1] - #[1,N,512] [1,N] + # [1,N,512] [1,N] y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) stop = False for idx in range(1, 1500): - #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] enco = self.stage_decoder(y, k, v, y_emb, x_example) y, k, v, y_emb, logits, samples = enco if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: @@ -131,13 +130,11 @@ class T2SModel(nn.Module): return y[:, -idx:].unsqueeze(0) def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False): - #self.onnx_encoder = torch.jit.script(self.onnx_encoder) + # self.onnx_encoder = torch.jit.script(self.onnx_encoder) if dynamo: export_options = torch.onnx.ExportOptions(dynamic_shapes=True) onnx_encoder_export_output = torch.onnx.dynamo_export( - self.onnx_encoder, - (ref_seq, text_seq, ref_bert, text_bert, ssl_content), - export_options=export_options + self.onnx_encoder, (ref_seq, text_seq, ref_bert, text_bert, ssl_content), export_options=export_options ) onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx") return @@ -149,13 +146,13 @@ class T2SModel(nn.Module): input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"], output_names=["x", "prompts"], dynamic_axes={ - "ref_seq": {1 : "ref_length"}, - "text_seq": {1 : "text_length"}, - "ref_bert": {0 : "ref_length"}, - "text_bert": {0 : "text_length"}, - "ssl_content": {2 : "ssl_length"}, + "ref_seq": {1: "ref_length"}, + "text_seq": {1: "text_length"}, + "ref_bert": {0: "ref_length"}, + "text_bert": {0: "text_length"}, + "ssl_content": {2: "ssl_length"}, }, - opset_version=16 + opset_version=16, ) x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) @@ -166,11 +163,11 @@ class T2SModel(nn.Module): input_names=["x", "prompts"], output_names=["y", "k", "v", "y_emb", "x_example"], dynamic_axes={ - "x": {1 : "x_length"}, - "prompts": {1 : "prompts_length"}, + "x": {1: "x_length"}, + "prompts": {1: "prompts_length"}, }, verbose=False, - opset_version=16 + opset_version=16, ) y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) @@ -181,38 +178,38 @@ class T2SModel(nn.Module): input_names=["iy", "ik", "iv", "iy_emb", "ix_example"], output_names=["y", "k", "v", "y_emb", "logits", "samples"], dynamic_axes={ - "iy": {1 : "iy_length"}, - "ik": {1 : "ik_length"}, - "iv": {1 : "iv_length"}, - "iy_emb": {1 : "iy_emb_length"}, - "ix_example": {1 : "ix_example_length"}, + "iy": {1: "iy_length"}, + "ik": {1: "ik_length"}, + "iv": {1: "iv_length"}, + "iy_emb": {1: "iy_emb_length"}, + "ix_example": {1: "ix_example_length"}, }, verbose=False, - opset_version=16 + opset_version=16, ) class VitsModel(nn.Module): def __init__(self, vits_path): super().__init__() - dict_s2 = torch.load(vits_path,map_location="cpu") + dict_s2 = torch.load(vits_path, map_location="cpu") self.hps = dict_s2["config"] - if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: self.hps["model"]["version"] = "v1" else: self.hps["model"]["version"] = "v2" - + self.hps = DictToAttrRecursive(self.hps) self.hps.model.semantic_frame_rate = "25hz" self.vq_model = SynthesizerTrn( self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, - **self.hps.model + **self.hps.model, ) self.vq_model.eval() self.vq_model.load_state_dict(dict_s2["weight"], strict=False) - + def forward(self, text_seq, pred_semantic, ref_audio): refer = spectrogram_torch( ref_audio, @@ -220,7 +217,7 @@ class VitsModel(nn.Module): self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length, - center=False + center=False, ) return self.vq_model(pred_semantic, text_seq, refer)[0, 0] @@ -230,18 +227,22 @@ class GptSoVits(nn.Module): super().__init__() self.vits = vits self.t2s = t2s - + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, debug=False): pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) audio = self.vits(text_seq, pred_semantic, ref_audio) if debug: import onnxruntime + sess = onnxruntime.InferenceSession("onnx/koharu/koharu_vits.onnx", providers=["CPU"]) - audio1 = sess.run(None, { - "text_seq" : text_seq.detach().cpu().numpy(), - "pred_semantic" : pred_semantic.detach().cpu().numpy(), - "ref_audio" : ref_audio.detach().cpu().numpy() - }) + audio1 = sess.run( + None, + { + "text_seq": text_seq.detach().cpu().numpy(), + "pred_semantic": pred_semantic.detach().cpu().numpy(), + "ref_audio": ref_audio.detach().cpu().numpy(), + }, + ) return audio, audio1 return audio @@ -255,12 +256,12 @@ class GptSoVits(nn.Module): input_names=["text_seq", "pred_semantic", "ref_audio"], output_names=["audio"], dynamic_axes={ - "text_seq": {1 : "text_length"}, - "pred_semantic": {2 : "pred_length"}, - "ref_audio": {1 : "audio_length"}, + "text_seq": {1: "text_length"}, + "pred_semantic": {2: "pred_length"}, + "ref_audio": {1: "audio_length"}, }, opset_version=17, - verbose=False + verbose=False, ) @@ -278,14 +279,67 @@ def export(vits_path, gpt_path, project_name, vits_model="v2"): gpt = T2SModel(gpt_path, vits) gpt_sovits = GptSoVits(vits, gpt) ssl = SSLModel() - ref_seq = torch.LongTensor([cleaned_text_to_sequence(["n", "i2", "h", "ao3", ",", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"],version=vits_model)]) - text_seq = torch.LongTensor([cleaned_text_to_sequence(["w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"],version=vits_model)]) + ref_seq = torch.LongTensor( + [ + cleaned_text_to_sequence( + [ + "n", + "i2", + "h", + "ao3", + ",", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + ], + version=vits_model, + ) + ] + ) + text_seq = torch.LongTensor( + [ + cleaned_text_to_sequence( + [ + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + "w", + "o3", + "sh", + "i4", + "b", + "ai2", + "y", + "e4", + ], + version=vits_model, + ) + ] + ) ref_bert = torch.randn((ref_seq.shape[1], 1024)).float() text_bert = torch.randn((text_seq.shape[1], 1024)).float() ref_audio = torch.randn((1, 48000 * 5)).float() # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float() - ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float() - ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,vits.hps.data.sampling_rate).float() + ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float() + ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, vits.hps.data.sampling_rate).float() try: os.mkdir(f"onnx/{project_name}") @@ -326,8 +380,8 @@ def export(vits_path, gpt_path, project_name, vits_model="v2"): } MoeVSConfJson = json.dumps(MoeVSConf) - with open(f"onnx/{project_name}.json", 'w') as MoeVsConfFile: - json.dump(MoeVSConf, MoeVsConfFile, indent = 4) + with open(f"onnx/{project_name}.json", "w") as MoeVsConfFile: + json.dump(MoeVSConf, MoeVsConfFile, indent=4) if __name__ == "__main__": @@ -341,4 +395,4 @@ if __name__ == "__main__": exp_path = "nahida" export(vits_path, gpt_path, exp_path) - # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) \ No newline at end of file + # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) diff --git a/GPT_SoVITS/prepare_datasets/1-get-text.py b/GPT_SoVITS/prepare_datasets/1-get-text.py index bdeacc7..8d83e79 100644 --- a/GPT_SoVITS/prepare_datasets/1-get-text.py +++ b/GPT_SoVITS/prepare_datasets/1-get-text.py @@ -8,19 +8,17 @@ exp_name = os.environ.get("exp_name") i_part = os.environ.get("i_part") all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] opt_dir = os.environ.get("opt_dir") bert_pretrained_dir = os.environ.get("bert_pretrained_dir") import torch + is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -version = os.environ.get('version', None) -import sys, numpy as np, traceback, pdb +version = os.environ.get("version", None) +import traceback import os.path -from glob import glob -from tqdm import tqdm from text.cleaner import clean_text from transformers import AutoModelForMaskedLM, AutoTokenizer -import numpy as np from tools.my_utils import clean_path # inp_text=sys.argv[1] @@ -36,13 +34,13 @@ from time import time as ttime import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - tmp_path="%s%s.pth"%(ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) + tmp_path = "%s%s.pth" % (ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) @@ -56,8 +54,10 @@ if os.path.exists(txt_path) == False: # device = "mps" else: device = "cpu" - if os.path.exists(bert_pretrained_dir):... - else:raise FileNotFoundError(bert_pretrained_dir) + if os.path.exists(bert_pretrained_dir): + ... + else: + raise FileNotFoundError(bert_pretrained_dir) tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) if is_half == True: @@ -86,12 +86,10 @@ if os.path.exists(txt_path) == False: def process(data, res): for name, text, lan in data: try: - name=clean_path(name) + name = clean_path(name) name = os.path.basename(name) print(name) - phones, word2ph, norm_text = clean_text( - text.replace("%", "-").replace("¥", ","), lan, version - ) + phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("¥", ","), lan, version) path_bert = "%s/%s.pt" % (bert_dir, name) if os.path.exists(path_bert) == False and lan == "zh": bert_feature = get_bert_feature(norm_text, word2ph) @@ -131,9 +129,7 @@ if os.path.exists(txt_path) == False: wav_name, spk_name, language, text = line.split("|") # todo.append([name,text,"zh"]) if language in language_v1_to_language_v2.keys(): - todo.append( - [wav_name, text, language_v1_to_language_v2.get(language, language)] - ) + todo.append([wav_name, text, language_v1_to_language_v2.get(language, language)]) else: print(f"\033[33m[Waring] The {language = } of {wav_name} is not supported for training.\033[0m") except: diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 27b61f2..3a84c01 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -1,25 +1,31 @@ # -*- coding: utf-8 -*- -import sys,os -inp_text= os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") +import sys +import os + +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] from feature_extractor import cnhubert -opt_dir= os.environ.get("opt_dir") -cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") + +opt_dir = os.environ.get("opt_dir") +cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir") import torch + is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -import pdb,traceback,numpy as np,logging +import traceback +import numpy as np from scipy.io import wavfile import librosa + now_dir = os.getcwd() sys.path.append(now_dir) -from tools.my_utils import load_audio,clean_path +from tools.my_utils import load_audio, clean_path # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path @@ -34,90 +40,95 @@ from tools.my_utils import load_audio,clean_path from time import time as ttime import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - tmp_path="%s%s.pth"%(ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) + tmp_path = "%s%s.pth" % (ttime(), i_part) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) -hubert_dir="%s/4-cnhubert"%(opt_dir) -wav32dir="%s/5-wav32k"%(opt_dir) -os.makedirs(opt_dir,exist_ok=True) -os.makedirs(hubert_dir,exist_ok=True) -os.makedirs(wav32dir,exist_ok=True) -maxx=0.95 -alpha=0.5 +hubert_dir = "%s/4-cnhubert" % (opt_dir) +wav32dir = "%s/5-wav32k" % (opt_dir) +os.makedirs(opt_dir, exist_ok=True) +os.makedirs(hubert_dir, exist_ok=True) +os.makedirs(wav32dir, exist_ok=True) + +maxx = 0.95 +alpha = 0.5 if torch.cuda.is_available(): device = "cuda:0" # elif torch.backends.mps.is_available(): # device = "mps" else: device = "cpu" -model=cnhubert.get_model() +model = cnhubert.get_model() # is_half=False -if(is_half==True): - model=model.half().to(device) +if is_half == True: + model = model.half().to(device) else: model = model.to(device) -nan_fails=[] -def name2go(wav_name,wav_path): - hubert_path="%s/%s.pt"%(hubert_dir,wav_name) - if(os.path.exists(hubert_path)):return +nan_fails = [] + + +def name2go(wav_name, wav_path): + hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) + if os.path.exists(hubert_path): + return tmp_audio = load_audio(wav_path, 32000) tmp_max = np.abs(tmp_audio).max() if tmp_max > 2.2: print("%s-filtered,%s" % (wav_name, tmp_max)) return - tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio - tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio - tmp_audio = librosa.resample( - tmp_audio32b, orig_sr=32000, target_sr=16000 - )#不是重采样问题 + tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * tmp_audio + tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * tmp_audio + tmp_audio = librosa.resample(tmp_audio32b, orig_sr=32000, target_sr=16000) # 不是重采样问题 tensor_wav16 = torch.from_numpy(tmp_audio) - if (is_half == True): - tensor_wav16=tensor_wav16.half().to(device) + if is_half == True: + tensor_wav16 = tensor_wav16.half().to(device) else: tensor_wav16 = tensor_wav16.to(device) - ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) - if np.isnan(ssl.detach().numpy()).sum()!= 0: - nan_fails.append((wav_name,wav_path)) - print("nan filtered:%s"%wav_name) + ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu() # torch.Size([1, 768, 215]) + if np.isnan(ssl.detach().numpy()).sum() != 0: + nan_fails.append((wav_name, wav_path)) + print("nan filtered:%s" % wav_name) return wavfile.write( - "%s/%s"%(wav32dir,wav_name), + "%s/%s" % (wav32dir, wav_name), 32000, tmp_audio32.astype("int16"), ) - my_save(ssl,hubert_path) + my_save(ssl, hubert_path) -with open(inp_text,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") -for line in lines[int(i_part)::int(all_parts)]: +with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + +for line in lines[int(i_part) :: int(all_parts)]: try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name=clean_path(wav_name) - if (inp_wav_dir != "" and inp_wav_dir != None): + wav_name = clean_path(wav_name) + if inp_wav_dir != "" and inp_wav_dir != None: wav_name = os.path.basename(wav_name) - wav_path = "%s/%s"%(inp_wav_dir, wav_name) + wav_path = "%s/%s" % (inp_wav_dir, wav_name) else: - wav_path=wav_name + wav_path = wav_name wav_name = os.path.basename(wav_name) - name2go(wav_name,wav_path) + name2go(wav_name, wav_path) except: - print(line,traceback.format_exc()) + print(line, traceback.format_exc()) -if(len(nan_fails)>0 and is_half==True): - is_half=False - model=model.float() +if len(nan_fails) > 0 and is_half == True: + is_half = False + model = model.float() for wav in nan_fails: try: - name2go(wav[0],wav[1]) + name2go(wav[0], wav[1]) except: - print(wav_name,traceback.format_exc()) + print(wav_name, traceback.format_exc()) diff --git a/GPT_SoVITS/prepare_datasets/3-get-semantic.py b/GPT_SoVITS/prepare_datasets/3-get-semantic.py index b213a8a..ddb0607 100644 --- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py +++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py @@ -5,13 +5,15 @@ exp_name = os.environ.get("exp_name") i_part = os.environ.get("i_part") all_parts = os.environ.get("all_parts") if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] opt_dir = os.environ.get("opt_dir") pretrained_s2G = os.environ.get("pretrained_s2G") s2config_path = os.environ.get("s2config_path") -if os.path.exists(pretrained_s2G):... -else:raise FileNotFoundError(pretrained_s2G) +if os.path.exists(pretrained_s2G): + ... +else: + raise FileNotFoundError(pretrained_s2G) # version=os.environ.get("version","v2") size = os.path.getsize(pretrained_s2G) if size < 82978 * 1024: @@ -25,23 +27,22 @@ elif size < 700 * 1024 * 1024: else: version = "v3" import torch + is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -import math, traceback -import multiprocessing -import sys, pdb +import traceback +import sys now_dir = os.getcwd() sys.path.append(now_dir) -from random import shuffle -import torch.multiprocessing as mp -from glob import glob -from tqdm import tqdm -import logging, librosa, utils -if version!="v3": +import logging +import utils + +if version != "v3": from module.models import SynthesizerTrn else: from module.models import SynthesizerTrnV3 as SynthesizerTrn from tools.my_utils import clean_path + logging.getLogger("numba").setLevel(logging.WARNING) # from config import pretrained_s2G @@ -70,7 +71,7 @@ if os.path.exists(semantic_path) == False: hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, version=version, - **hps.model + **hps.model, ) if is_half == True: vq_model = vq_model.half().to(device) @@ -107,7 +108,7 @@ if os.path.exists(semantic_path) == False: try: # wav_name,text=line.split("\t") wav_name, spk_name, language, text = line.split("|") - wav_name=clean_path(wav_name) + wav_name = clean_path(wav_name) wav_name = os.path.basename(wav_name) # name2go(name,lines1) name2go(wav_name, lines1) diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 36ef434..147f3bd 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -1,37 +1,44 @@ import traceback from collections import OrderedDict from time import time as ttime -import shutil,os +import shutil +import os import torch from tools.i18n.i18n import I18nAuto i18n = I18nAuto() -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s.pth"%(ttime()) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) -''' +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s.pth" % (ttime()) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + + +""" 00:v1 01:v2 02:v3 03:v3lora -''' +""" from io import BytesIO -def my_save2(fea,path): + + +def my_save2(fea, path): bio = BytesIO() torch.save(fea, bio) bio.seek(0) data = bio.getvalue() - data = b'03' + data[2:]###temp for v3lora only, todo - with open(path, "wb") as f: f.write(data) + data = b"03" + data[2:] ###temp for v3lora only, todo + with open(path, "wb") as f: + f.write(data) -def savee(ckpt, name, epoch, steps, hps,lora_rank=None): + +def savee(ckpt, name, epoch, steps, hps, lora_rank=None): try: opt = OrderedDict() opt["weight"] = {} @@ -42,7 +49,7 @@ def savee(ckpt, name, epoch, steps, hps,lora_rank=None): opt["config"] = hps opt["info"] = "%sepoch_%siteration" % (epoch, steps) if lora_rank: - opt["lora_rank"]=lora_rank + opt["lora_rank"] = lora_rank my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) else: my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) @@ -50,41 +57,48 @@ def savee(ckpt, name, epoch, steps, hps,lora_rank=None): except: return traceback.format_exc() -head2version={ - b'00':["v1","v1",False], - b'01':["v2","v2",False], - b'02':["v2","v3",False], - b'03':["v2","v3",True], + +head2version = { + b"00": ["v1", "v1", False], + b"01": ["v2", "v2", False], + b"02": ["v2", "v3", False], + b"03": ["v2", "v3", True], } -hash_pretrained_dict={ - "dc3c97e17592963677a4a1681f30c653":["v2","v2",False],#s2G488k.pth#sovits_v1_pretrained - "43797be674a37c1c83ee81081941ed0f":["v2","v3",False],#s2Gv3.pth#sovits_v3_pretrained - "6642b37f3dbb1f76882b69937c95a5f3":["v2","v2",False],#s2G2333K.pth#sovits_v2_pretrained +hash_pretrained_dict = { + "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained + "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained + "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained } import hashlib + + def get_hash_from_file(sovits_path): - with open(sovits_path,"rb")as f:data=f.read(8192) + with open(sovits_path, "rb") as f: + data = f.read(8192) hash_md5 = hashlib.md5() hash_md5.update(data) return hash_md5.hexdigest() + + def get_sovits_version_from_path_fast(sovits_path): ###1-if it is pretrained sovits models, by hash - hash=get_hash_from_file(sovits_path) + hash = get_hash_from_file(sovits_path) if hash in hash_pretrained_dict: return hash_pretrained_dict[hash] ###2-new weights or old weights, by head - with open(sovits_path,"rb")as f:version=f.read(2) - if version!=b"PK": + with open(sovits_path, "rb") as f: + version = f.read(2) + if version != b"PK": return head2version[version] ###3-old weights, by file size - if_lora_v3=False - size=os.path.getsize(sovits_path) - ''' + if_lora_v3 = False + size = os.path.getsize(sovits_path) + """ v1weights:about 82942KB half thr:82978KB v2weights:about 83014KB v3weights:about 750MB - ''' + """ if size < 82978 * 1024: model_version = version = "v1" elif size < 700 * 1024 * 1024: @@ -92,15 +106,16 @@ def get_sovits_version_from_path_fast(sovits_path): else: version = "v2" model_version = "v3" - return version,model_version,if_lora_v3 + return version, model_version, if_lora_v3 + def load_sovits_new(sovits_path): - f=open(sovits_path,"rb") - meta=f.read(2) - if meta!="PK": - data = b'PK' + f.read() + f = open(sovits_path, "rb") + meta = f.read(2) + if meta != "PK": + data = b"PK" + f.read() bio = BytesIO() bio.write(data) bio.seek(0) return torch.load(bio, map_location="cpu", weights_only=False) - return torch.load(sovits_path,map_location="cpu", weights_only=False) \ No newline at end of file + return torch.load(sovits_path, map_location="cpu", weights_only=False) diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py index 4311db9..1176f0b 100644 --- a/GPT_SoVITS/s1_train.py +++ b/GPT_SoVITS/s1_train.py @@ -1,31 +1,28 @@ # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py import os -import pdb if "_CUDA_VISIBLE_DEVICES" in os.environ: os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] import argparse import logging +import platform from pathlib import Path -import torch, platform -from pytorch_lightning import seed_everything -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger -from pytorch_lightning.strategies import DDPStrategy +import torch from AR.data.data_module import Text2SemanticDataModule from AR.models.t2s_lightning_module import Text2SemanticLightningModule from AR.utils.io import load_yaml_config +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger +from pytorch_lightning.strategies import DDPStrategy logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("matplotlib").setLevel(logging.WARNING) torch.set_float32_matmul_precision("high") -from AR.utils import get_newest_ckpt - from collections import OrderedDict -from time import time as ttime -import shutil + +from AR.utils import get_newest_ckpt from process_ckpt import my_save @@ -37,7 +34,7 @@ class my_model_ckpt(ModelCheckpoint): if_save_every_weights, half_weights_save_dir, exp_name, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.if_save_latest = if_save_latest @@ -50,10 +47,7 @@ class my_model_ckpt(ModelCheckpoint): # if not self._should_skip_saving_checkpoint(trainer) and self._should_save_on_train_epoch_end(trainer): if self._should_save_on_train_epoch_end(trainer): monitor_candidates = self._monitor_candidates(trainer) - if ( - self._every_n_epochs >= 1 - and (trainer.current_epoch + 1) % self._every_n_epochs == 0 - ): + if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0: if ( self.if_save_latest == True ): ####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt @@ -75,7 +69,7 @@ class my_model_ckpt(ModelCheckpoint): to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) # torch.save( # print(os.environ) - if(os.environ.get("LOCAL_RANK","0")=="0"): + if os.environ.get("LOCAL_RANK", "0") == "0": my_save( to_save_od, "%s/%s-e%s.ckpt" @@ -112,7 +106,7 @@ def main(args): dirpath=ckpt_dir, ) logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir) - os.environ["MASTER_ADDR"]="localhost" + os.environ["MASTER_ADDR"] = "localhost" os.environ["USE_LIBUV"] = "0" trainer: Trainer = Trainer( max_epochs=config["train"]["epochs"], @@ -123,9 +117,9 @@ def main(args): devices=-1 if torch.cuda.is_available() else 1, benchmark=False, fast_dev_run=False, - strategy = DDPStrategy( - process_group_backend="nccl" if platform.system() != "Windows" else "gloo" - ) if torch.cuda.is_available() else "auto", + strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo") + if torch.cuda.is_available() + else "auto", precision=config["train"]["precision"], logger=logger, num_sanity_val_steps=0, @@ -133,9 +127,7 @@ def main(args): use_distributed_sampler=False, # 非常简单的修改,但解决了采用自定义的 bucket_sampler 下训练步数不一致的问题! ) - model: Text2SemanticLightningModule = Text2SemanticLightningModule( - config, output_dir - ) + model: Text2SemanticLightningModule = Text2SemanticLightningModule(config, output_dir) data_module: Text2SemanticDataModule = Text2SemanticDataModule( config, diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py index ea863b0..ab46118 100644 --- a/GPT_SoVITS/s2_train.py +++ b/GPT_SoVITS/s2_train.py @@ -1,36 +1,41 @@ import warnings + warnings.filterwarnings("ignore") -import utils, os +import os + +import utils + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging, traceback logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) from random import randint -from module import commons +from module import commons from module.data_utils import ( - TextAudioSpeakerLoader, - TextAudioSpeakerCollate, DistributedBucketSampler, + TextAudioSpeakerCollate, + TextAudioSpeakerLoader, ) -from module.models import ( - SynthesizerTrn, - MultiPeriodDiscriminator, -) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss +from module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from module.models import ( + MultiPeriodDiscriminator, + SynthesizerTrn, +) from process_ckpt import savee torch.backends.cudnn.benchmark = False @@ -46,7 +51,6 @@ device = "cpu" # cuda以外的设备,等mps优化后加入 def main(): - if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: @@ -74,7 +78,7 @@ def run(rank, n_gpus, hps): writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank, @@ -128,19 +132,27 @@ def run(rank, n_gpus, hps): # batch_size=1, pin_memory=True, # drop_last=False, collate_fn=collate_fn) - net_g = SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).to(device) + net_g = ( + SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) + if torch.cuda.is_available() + else SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + ) - net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) + net_d = ( + MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) + if torch.cuda.is_available() + else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) + ) for name, param in net_g.named_parameters(): if not param.requires_grad: print(name, "not requires_grad") @@ -193,7 +205,7 @@ def run(rank, n_gpus, hps): try: # 如果能加载自动resume _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_*.pth"), + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "D_*.pth"), net_d, optim_d, ) # D多半加载没事 @@ -201,11 +213,11 @@ def run(rank, n_gpus, hps): logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_*.pth"), + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), net_g, optim_g, ) - epoch_str+=1 + epoch_str += 1 global_step = (epoch_str - 1) * len(train_loader) # epoch_str = 1 # global_step = 0 @@ -213,37 +225,55 @@ def run(rank, n_gpus, hps): # traceback.print_exc() epoch_str = 1 global_step = 0 - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - print("loaded pretrained %s" % hps.train.pretrained_s2G, + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, net_g.module.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) if torch.cuda.is_available() else net_g.load_state_dict( + ) + if torch.cuda.is_available() + else net_g.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) + ), ) ##测试不加载优化器 - if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D): + if ( + hps.train.pretrained_s2D != "" + and hps.train.pretrained_s2D != None + and os.path.exists(hps.train.pretrained_s2D) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2D) - print("loaded pretrained %s" % hps.train.pretrained_s2D, + print( + "loaded pretrained %s" % hps.train.pretrained_s2D, net_d.module.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] - ) if torch.cuda.is_available() else net_d.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"], ) + if torch.cuda.is_available() + else net_d.load_state_dict( + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"], + ), ) # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 + optim_g, + gamma=hps.train.lr_decay, + last_epoch=-1, ) scheduler_d = torch.optim.lr_scheduler.ExponentialLR( - optim_d, gamma=hps.train.lr_decay, last_epoch=-1 + optim_d, + gamma=hps.train.lr_decay, + last_epoch=-1, ) for _ in range(epoch_str): scheduler_g.step() @@ -285,9 +315,7 @@ def run(rank, n_gpus, hps): print("training done") -def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers -): +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): net_g, net_d = nets optim_g, optim_d = optims # scheduler_g, scheduler_d = schedulers @@ -311,17 +339,38 @@ def train_and_evaluate( text_lengths, ) in enumerate(tqdm(train_loader)): if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), ) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( - rank, non_blocking=True + y, y_lengths = ( + y.cuda( + rank, + non_blocking=True, + ), + y_lengths.cuda( + rank, + non_blocking=True, + ), ) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), ) else: spec, spec_lengths = spec.to(device), spec_lengths.to(device) @@ -350,9 +399,7 @@ def train_and_evaluate( hps.data.mel_fmin, hps.data.mel_fmax, ) - y_mel = commons.slice_segments( - mel, ids_slice, hps.train.segment_size // hps.data.hop_length - ) + y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) y_hat_mel = mel_spectrogram_torch( y_hat.squeeze(1), hps.data.filter_length, @@ -364,15 +411,14 @@ def train_and_evaluate( hps.data.mel_fmax, ) - y = commons.slice_segments( - y, ids_slice * hps.data.hop_length, hps.train.segment_size - ) # slice + y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) with autocast(enabled=False): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g + y_d_hat_r, + y_d_hat_g, ) loss_disc_all = loss_disc optim_d.zero_grad() @@ -405,7 +451,8 @@ def train_and_evaluate( losses = [loss_disc, loss_gen, loss_fm, loss_mel, kl_ssl, loss_kl] logger.info( "Train Epoch: {} [{:.0f}%]".format( - epoch, 100.0 * batch_idx / len(train_loader) + epoch, + 100.0 * batch_idx / len(train_loader), ) ) logger.info([x.item() for x in losses] + [global_step, lr]) @@ -429,25 +476,37 @@ def train_and_evaluate( # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - image_dict=None - try:###Some people installed the wrong version of matplotlib. + image_dict = None + try: ###Some people installed the wrong version of matplotlib. image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy( - y_mel[0].data.cpu().numpy() + y_mel[0].data.cpu().numpy(), ), "slice/mel_gen": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].data.cpu().numpy() + y_hat_mel[0].data.cpu().numpy(), ), "all/mel": utils.plot_spectrogram_to_numpy( - mel[0].data.cpu().numpy() + mel[0].data.cpu().numpy(), ), "all/stats_ssl": utils.plot_spectrogram_to_numpy( - stats_ssl[0].data.cpu().numpy() + stats_ssl[0].data.cpu().numpy(), ), } - except:pass - if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,) - else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,) + except: + pass + if image_dict: + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + else: + utils.summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: @@ -457,7 +516,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(global_step) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(global_step), ), ) utils.save_checkpoint( @@ -466,7 +526,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(global_step) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "D_{}.pth".format(global_step), ), ) else: @@ -476,7 +537,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(233333333333) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(233333333333), ), ) utils.save_checkpoint( @@ -485,7 +547,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "D_{}.pth".format(233333333333) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "D_{}.pth".format(233333333333), ), ) if rank == 0 and hps.train.if_save_every_weights == True: @@ -540,10 +603,24 @@ def evaluate(hps, generator, eval_loader, writer_eval): ssl = ssl.to(device) text, text_lengths = text.to(device), text_lengths.to(device) for test in [0, 1]: - y_hat, mask, *_ = generator.module.infer( - ssl, spec, spec_lengths, text, text_lengths, test=test - ) if torch.cuda.is_available() else generator.infer( - ssl, spec, spec_lengths, text, text_lengths, test=test + y_hat, mask, *_ = ( + generator.module.infer( + ssl, + spec, + spec_lengths, + text, + text_lengths, + test=test, + ) + if torch.cuda.is_available() + else generator.infer( + ssl, + spec, + spec_lengths, + text, + text_lengths, + test=test, + ) ) y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length @@ -568,19 +645,19 @@ def evaluate(hps, generator, eval_loader, writer_eval): image_dict.update( { f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].cpu().numpy() - ) + y_hat_mel[0].cpu().numpy(), + ), } ) audio_dict.update( - {f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]]} + { + f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]], + }, ) image_dict.update( { - f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy( - mel[0].cpu().numpy() - ) - } + f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()), + }, ) audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) diff --git a/GPT_SoVITS/s2_train_v3.py b/GPT_SoVITS/s2_train_v3.py index 9933dee..71d2196 100644 --- a/GPT_SoVITS/s2_train_v3.py +++ b/GPT_SoVITS/s2_train_v3.py @@ -1,36 +1,41 @@ import warnings + warnings.filterwarnings("ignore") -import utils, os +import os + +import utils + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + import torch -from torch.nn import functional as F +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging, traceback logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) from random import randint -from module import commons +from module import commons +from module.data_utils import ( + DistributedBucketSampler, +) +from module.data_utils import ( + TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, +) from module.data_utils import ( TextAudioSpeakerLoaderV3 as TextAudioSpeakerLoader, - TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, - DistributedBucketSampler, ) from module.models import ( SynthesizerTrnV3 as SynthesizerTrn, - MultiPeriodDiscriminator, ) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch from process_ckpt import savee torch.backends.cudnn.benchmark = False @@ -46,7 +51,6 @@ device = "cpu" # cuda以外的设备,等mps优化后加入 def main(): - if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: @@ -74,7 +78,7 @@ def run(rank, n_gpus, hps): writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank, @@ -128,17 +132,21 @@ def run(rank, n_gpus, hps): # batch_size=1, pin_memory=True, # drop_last=False, collate_fn=collate_fn) - net_g = SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).to(device) + net_g = ( + SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) + if torch.cuda.is_available() + else SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + ) # net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) # for name, param in net_g.named_parameters(): @@ -146,7 +154,7 @@ def run(rank, n_gpus, hps): # print(name, "not requires_grad") optim_g = torch.optim.AdamW( - filter(lambda p: p.requires_grad, net_g.parameters()),###默认所有层lr一致 + filter(lambda p: p.requires_grad, net_g.parameters()), ###默认所有层lr一致 hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps, @@ -174,11 +182,11 @@ def run(rank, n_gpus, hps): # logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_*.pth"), + utils.latest_checkpoint_path("%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), "G_*.pth"), net_g, optim_g, ) - epoch_str+=1 + epoch_str += 1 global_step = (epoch_str - 1) * len(train_loader) # epoch_str = 1 # global_step = 0 @@ -186,17 +194,24 @@ def run(rank, n_gpus, hps): # traceback.print_exc() epoch_str = 1 global_step = 0 - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - print("loaded pretrained %s" % hps.train.pretrained_s2G, + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, net_g.module.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) if torch.cuda.is_available() else net_g.load_state_dict( + ) + if torch.cuda.is_available() + else net_g.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) + ), ) ##测试不加载优化器 # if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D): # if rank == 0: @@ -212,9 +227,7 @@ def run(rank, n_gpus, hps): # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 - ) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) # scheduler_d = torch.optim.lr_scheduler.ExponentialLR( # optim_d, gamma=hps.train.lr_decay, last_epoch=-1 # ) @@ -224,7 +237,7 @@ def run(rank, n_gpus, hps): scaler = GradScaler(enabled=hps.train.fp16_run) - net_d=optim_d=scheduler_d=None + net_d = optim_d = scheduler_d = None print("start training from epoch %s" % epoch_str) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: @@ -260,7 +273,16 @@ def run(rank, n_gpus, hps): def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers + rank, + epoch, + hps, + nets, + optims, + schedulers, + scaler, + loaders, + logger, + writers, ): net_g, net_d = nets optim_g, optim_d = optims @@ -284,19 +306,33 @@ def train_and_evaluate( # text, # text_lengths, # ) in enumerate(tqdm(train_loader)): - for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate(tqdm(train_loader)): + for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate( + tqdm(train_loader) + ): if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True - ) - mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda( - rank, non_blocking=True + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), ) + mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), ) else: spec, spec_lengths = spec.to(device), spec_lengths.to(device) @@ -307,8 +343,18 @@ def train_and_evaluate( text, text_lengths = text.to(device), text_lengths.to(device) with autocast(enabled=hps.train.fp16_run): - cfm_loss = net_g(ssl, spec, mel,ssl_lengths,spec_lengths, text, text_lengths,mel_lengths, use_grad_ckpt=hps.train.grad_ckpt) - loss_gen_all=cfm_loss + cfm_loss = net_g( + ssl, + spec, + mel, + ssl_lengths, + spec_lengths, + text, + text_lengths, + mel_lengths, + use_grad_ckpt=hps.train.grad_ckpt, + ) + loss_gen_all = cfm_loss optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) @@ -318,12 +364,15 @@ def train_and_evaluate( if rank == 0: if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] + lr = optim_g.param_groups[0]["lr"] # losses = [commit_loss,cfm_loss,mel_loss,loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] losses = [cfm_loss] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. * batch_idx / len(train_loader))) + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, + 100.0 * batch_idx / len(train_loader), + ) + ) logger.info([x.item() for x in losses] + [global_step, lr]) scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g} @@ -337,7 +386,8 @@ def train_and_evaluate( writer=writer, global_step=global_step, # images=image_dict, - scalars=scalar_dict) + scalars=scalar_dict, + ) # if global_step % hps.train.eval_interval == 0: # # evaluate(hps, net_g, eval_loader, writer_eval) @@ -347,7 +397,6 @@ def train_and_evaluate( # # if keep_ckpts > 0: # # utils.clean_checkpoints(path_to_models=hps.s2_ckpt_dir, n_ckpts_to_keep=keep_ckpts, sort_by_time=True) - global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: if hps.train.if_save_latest == 0: @@ -357,7 +406,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(global_step) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(global_step), ), ) # utils.save_checkpoint( @@ -376,7 +426,8 @@ def train_and_evaluate( hps.train.learning_rate, epoch, os.path.join( - "%s/logs_s2_%s" % (hps.data.exp_dir,hps.model.version), "G_{}.pth".format(233333333333) + "%s/logs_s2_%s" % (hps.data.exp_dir, hps.model.version), + "G_{}.pth".format(233333333333), ), ) # utils.save_checkpoint( diff --git a/GPT_SoVITS/s2_train_v3_lora.py b/GPT_SoVITS/s2_train_v3_lora.py index 75b3415..42582b4 100644 --- a/GPT_SoVITS/s2_train_v3_lora.py +++ b/GPT_SoVITS/s2_train_v3_lora.py @@ -1,38 +1,45 @@ import warnings + warnings.filterwarnings("ignore") -import utils, os +import os + +import utils + hps = utils.get_hparams(stage=2) os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import logging + import torch -from torch.nn import functional as F +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler from tqdm import tqdm -import logging, traceback logging.getLogger("matplotlib").setLevel(logging.INFO) logging.getLogger("h5py").setLevel(logging.INFO) logging.getLogger("numba").setLevel(logging.INFO) +from collections import OrderedDict as od from random import randint + from module import commons -from peft import LoraConfig, PeftModel, get_peft_model +from module.data_utils import ( + DistributedBucketSampler, +) +from module.data_utils import ( + TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, +) from module.data_utils import ( TextAudioSpeakerLoaderV3 as TextAudioSpeakerLoader, - TextAudioSpeakerCollateV3 as TextAudioSpeakerCollate, - DistributedBucketSampler, ) from module.models import ( SynthesizerTrnV3 as SynthesizerTrn, - MultiPeriodDiscriminator, ) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from peft import LoraConfig, get_peft_model from process_ckpt import savee -from collections import OrderedDict as od + torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = False ###反正A100fp32更快,那试试tf32吧 @@ -46,7 +53,6 @@ device = "cpu" # cuda以外的设备,等mps优化后加入 def main(): - if torch.cuda.is_available(): n_gpus = torch.cuda.device_count() else: @@ -65,7 +71,7 @@ def main(): def run(rank, n_gpus, hps): - global global_step,no_grad_names,save_root,lora_rank + global global_step, no_grad_names, save_root, lora_rank if rank == 0: logger = utils.get_logger(hps.data.exp_dir) logger.info(hps) @@ -74,7 +80,7 @@ def run(rank, n_gpus, hps): writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", init_method="env://?use_libuv=False", world_size=n_gpus, rank=rank, @@ -122,21 +128,24 @@ def run(rank, n_gpus, hps): persistent_workers=True, prefetch_factor=4, ) - save_root="%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir,hps.model.version,hps.train.lora_rank) - os.makedirs(save_root,exist_ok=True) - lora_rank=int(hps.train.lora_rank) + save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank) + os.makedirs(save_root, exist_ok=True) + lora_rank = int(hps.train.lora_rank) lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, lora_alpha=lora_rank, init_lora_weights=True, ) - def get_model(hps):return SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ) + + def get_model(hps): + return SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ) + def get_optim(net_g): return torch.optim.AdamW( filter(lambda p: p.requires_grad, net_g.parameters()), ###默认所有层lr一致 @@ -144,61 +153,66 @@ def run(rank, n_gpus, hps): betas=hps.train.betas, eps=hps.train.eps, ) - def model2cuda(net_g,rank): + + def model2cuda(net_g, rank): if torch.cuda.is_available(): net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True) else: net_g = net_g.to(device) return net_g - try:# 如果能加载自动resume + + try: # 如果能加载自动resume net_g = get_model(hps) net_g.cfm = get_peft_model(net_g.cfm, lora_config) - net_g=model2cuda(net_g,rank) - optim_g=get_optim(net_g) + net_g = model2cuda(net_g, rank) + optim_g = get_optim(net_g) # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) _, _, _, epoch_str = utils.load_checkpoint( utils.latest_checkpoint_path(save_root, "G_*.pth"), net_g, optim_g, ) - epoch_str+=1 + epoch_str += 1 global_step = (epoch_str - 1) * len(train_loader) except: # 如果首次不能加载,加载pretrain # traceback.print_exc() epoch_str = 1 global_step = 0 net_g = get_model(hps) - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): + if ( + hps.train.pretrained_s2G != "" + and hps.train.pretrained_s2G != None + and os.path.exists(hps.train.pretrained_s2G) + ): if rank == 0: logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - print("loaded pretrained %s" % hps.train.pretrained_s2G, + print( + "loaded pretrained %s" % hps.train.pretrained_s2G, net_g.load_state_dict( torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], strict=False, - ) + ), ) net_g.cfm = get_peft_model(net_g.cfm, lora_config) - net_g=model2cuda(net_g,rank) + net_g = model2cuda(net_g, rank) optim_g = get_optim(net_g) - no_grad_names=set() + no_grad_names = set() for name, param in net_g.named_parameters(): if not param.requires_grad: - no_grad_names.add(name.replace("module.","")) + no_grad_names.add(name.replace("module.", "")) # print(name, "not requires_grad") # print(no_grad_names) # os._exit(233333) - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 - ) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=-1) for _ in range(epoch_str): scheduler_g.step() scaler = GradScaler(enabled=hps.train.fp16_run) - net_d=optim_d=scheduler_d=None - print("start training from epoch %s"%epoch_str) + net_d = optim_d = scheduler_d = None + print("start training from epoch %s" % epoch_str) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: train_and_evaluate( @@ -230,9 +244,8 @@ def run(rank, n_gpus, hps): scheduler_g.step() print("training done") -def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers -): + +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): net_g, net_d = nets optim_g, optim_d = optims # scheduler_g, scheduler_d = schedulers @@ -244,18 +257,32 @@ def train_and_evaluate( global global_step net_g.train() - for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate(tqdm(train_loader)): + for batch_idx, (ssl, spec, mel, ssl_lengths, spec_lengths, text, text_lengths, mel_lengths) in enumerate( + tqdm(train_loader) + ): if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True - ) - mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda( - rank, non_blocking=True + spec, spec_lengths = ( + spec.cuda( + rank, + non_blocking=True, + ), + spec_lengths.cuda( + rank, + non_blocking=True, + ), ) + mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) ssl = ssl.cuda(rank, non_blocking=True) ssl.requires_grad = False - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True + text, text_lengths = ( + text.cuda( + rank, + non_blocking=True, + ), + text_lengths.cuda( + rank, + non_blocking=True, + ), ) else: spec, spec_lengths = spec.to(device), spec_lengths.to(device) @@ -265,8 +292,18 @@ def train_and_evaluate( text, text_lengths = text.to(device), text_lengths.to(device) with autocast(enabled=hps.train.fp16_run): - cfm_loss = net_g(ssl, spec, mel,ssl_lengths,spec_lengths, text, text_lengths,mel_lengths, use_grad_ckpt=hps.train.grad_ckpt) - loss_gen_all=cfm_loss + cfm_loss = net_g( + ssl, + spec, + mel, + ssl_lengths, + spec_lengths, + text, + text_lengths, + mel_lengths, + use_grad_ckpt=hps.train.grad_ckpt, + ) + loss_gen_all = cfm_loss optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) @@ -276,18 +313,17 @@ def train_and_evaluate( if rank == 0: if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] + lr = optim_g.param_groups[0]["lr"] losses = [cfm_loss] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. * batch_idx / len(train_loader))) + logger.info("Train Epoch: {} [{:.0f}%]".format(epoch, 100.0 * batch_idx / len(train_loader))) logger.info([x.item() for x in losses] + [global_step, lr]) scalar_dict = {"loss/g/total": loss_gen_all, "learning_rate": lr, "grad_norm_g": grad_norm_g} utils.summarize( writer=writer, global_step=global_step, - scalars=scalar_dict) + scalars=scalar_dict, + ) global_step += 1 if epoch % hps.train.save_every_epoch == 0 and rank == 0: @@ -297,9 +333,7 @@ def train_and_evaluate( optim_g, hps.train.learning_rate, epoch, - os.path.join( - save_root, "G_{}.pth".format(global_step) - ), + os.path.join(save_root, "G_{}.pth".format(global_step)), ) else: utils.save_checkpoint( @@ -307,21 +341,19 @@ def train_and_evaluate( optim_g, hps.train.learning_rate, epoch, - os.path.join( - save_root, "G_{}.pth".format(233333333333) - ), + os.path.join(save_root, "G_{}.pth".format(233333333333)), ) if rank == 0 and hps.train.if_save_every_weights == True: if hasattr(net_g, "module"): ckpt = net_g.module.state_dict() else: ckpt = net_g.state_dict() - sim_ckpt=od() + sim_ckpt = od() for key in ckpt: # if "cfm"not in key: # print(key) if key not in no_grad_names: - sim_ckpt[key]=ckpt[key].half().cpu() + sim_ckpt[key] = ckpt[key].half().cpu() logger.info( "saving ckpt %s_e%s:%s" % ( @@ -329,10 +361,11 @@ def train_and_evaluate( epoch, savee( sim_ckpt, - hps.name + "_e%s_s%s_l%s" % (epoch, global_step,lora_rank), + hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank), epoch, global_step, - hps,lora_rank=lora_rank + hps, + lora_rank=lora_rank, ), ) ) diff --git a/GPT_SoVITS/text/LangSegmenter/__init__.py b/GPT_SoVITS/text/LangSegmenter/__init__.py index 6fe6095..0a76490 100644 --- a/GPT_SoVITS/text/LangSegmenter/__init__.py +++ b/GPT_SoVITS/text/LangSegmenter/__init__.py @@ -1 +1 @@ -from .langsegmenter import LangSegmenter \ No newline at end of file +from .langsegmenter import LangSegmenter diff --git a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py index c558348..1740a54 100644 --- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py +++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py @@ -3,38 +3,44 @@ import re # jieba静音 import jieba + jieba.setLogLevel(logging.CRITICAL) # 更改fast_langdetect大模型位置 from pathlib import Path import fast_langdetect -fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect")) + +fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector( + fast_langdetect.infer.LangDetectConfig( + cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect" + ) +) from split_lang import LangSplitter def full_en(text): - pattern = r'^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$' + pattern = r"^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$" return bool(re.match(pattern, text)) def full_cjk(text): # 来自wiki cjk_ranges = [ - (0x4E00, 0x9FFF), # CJK Unified Ideographs - (0x3400, 0x4DB5), # CJK Extension A - (0x20000, 0x2A6DD), # CJK Extension B - (0x2A700, 0x2B73F), # CJK Extension C - (0x2B740, 0x2B81F), # CJK Extension D - (0x2B820, 0x2CEAF), # CJK Extension E - (0x2CEB0, 0x2EBEF), # CJK Extension F - (0x30000, 0x3134A), # CJK Extension G - (0x31350, 0x323AF), # CJK Extension H - (0x2EBF0, 0x2EE5D), # CJK Extension H + (0x4E00, 0x9FFF), # CJK Unified Ideographs + (0x3400, 0x4DB5), # CJK Extension A + (0x20000, 0x2A6DD), # CJK Extension B + (0x2A700, 0x2B73F), # CJK Extension C + (0x2B740, 0x2B81F), # CJK Extension D + (0x2B820, 0x2CEAF), # CJK Extension E + (0x2CEB0, 0x2EBEF), # CJK Extension F + (0x30000, 0x3134A), # CJK Extension G + (0x31350, 0x323AF), # CJK Extension H + (0x2EBF0, 0x2EE5D), # CJK Extension H ] - pattern = r'[0-9、-〜。!?.!?… ]+$' + pattern = r"[0-9、-〜。!?.!?… ]+$" cjk_text = "" for char in text: @@ -45,7 +51,7 @@ def full_cjk(text): return cjk_text -def split_jako(tag_lang,item): +def split_jako(tag_lang, item): if tag_lang == "ja": pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)" else: @@ -53,41 +59,40 @@ def split_jako(tag_lang,item): lang_list: list[dict] = [] tag = 0 - for match in re.finditer(pattern, item['text']): + for match in re.finditer(pattern, item["text"]): if match.start() > tag: - lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]}) + lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]}) tag = match.end() - lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]}) + lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]}) - if tag < len(item['text']): - lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]}) + if tag < len(item["text"]): + lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]}) return lang_list def merge_lang(lang_list, item): - if lang_list and item['lang'] == lang_list[-1]['lang']: - lang_list[-1]['text'] += item['text'] + if lang_list and item["lang"] == lang_list[-1]["lang"]: + lang_list[-1]["text"] += item["text"] else: lang_list.append(item) return lang_list -class LangSegmenter(): +class LangSegmenter: # 默认过滤器, 基于gsv目前四种语言 DEFAULT_LANG_MAP = { "zh": "zh", "yue": "zh", # 粤语 "wuu": "zh", # 吴语 "zh-cn": "zh", - "zh-tw": "x", # 繁体设置为x + "zh-tw": "x", # 繁体设置为x "ko": "ko", "ja": "ja", "en": "en", } - def getTexts(text): lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP) substr = lang_splitter.split_by_lang(text=text) @@ -95,18 +100,18 @@ class LangSegmenter(): lang_list: list[dict] = [] for _, item in enumerate(substr): - dict_item = {'lang':item.lang,'text':item.text} + dict_item = {"lang": item.lang, "text": item.text} # 处理短英文被识别为其他语言的问题 - if full_en(dict_item['text']): - dict_item['lang'] = 'en' - lang_list = merge_lang(lang_list,dict_item) + if full_en(dict_item["text"]): + dict_item["lang"] = "en" + lang_list = merge_lang(lang_list, dict_item) continue # 处理非日语夹日文的问题(不包含CJK) ja_list: list[dict] = [] - if dict_item['lang'] != 'ja': - ja_list = split_jako('ja',dict_item) + if dict_item["lang"] != "ja": + ja_list = split_jako("ja", dict_item) if not ja_list: ja_list.append(dict_item) @@ -115,8 +120,8 @@ class LangSegmenter(): ko_list: list[dict] = [] temp_list: list[dict] = [] for _, ko_item in enumerate(ja_list): - if ko_item["lang"] != 'ko': - ko_list = split_jako('ko',ko_item) + if ko_item["lang"] != "ko": + ko_list = split_jako("ko", ko_item) if ko_list: temp_list.extend(ko_list) @@ -126,28 +131,28 @@ class LangSegmenter(): # 未存在非日韩文夹日韩文 if len(temp_list) == 1: # 未知语言检查是否为CJK - if dict_item['lang'] == 'x': - cjk_text = full_cjk(dict_item['text']) + if dict_item["lang"] == "x": + cjk_text = full_cjk(dict_item["text"]) if cjk_text: - dict_item = {'lang':'zh','text':cjk_text} - lang_list = merge_lang(lang_list,dict_item) + dict_item = {"lang": "zh", "text": cjk_text} + lang_list = merge_lang(lang_list, dict_item) continue else: - lang_list = merge_lang(lang_list,dict_item) + lang_list = merge_lang(lang_list, dict_item) continue # 存在非日韩文夹日韩文 for _, temp_item in enumerate(temp_list): # 未知语言检查是否为CJK - if temp_item['lang'] == 'x': - cjk_text = full_cjk(dict_item['text']) + if temp_item["lang"] == "x": + cjk_text = full_cjk(dict_item["text"]) if cjk_text: - dict_item = {'lang':'zh','text':cjk_text} - lang_list = merge_lang(lang_list,dict_item) + dict_item = {"lang": "zh", "text": cjk_text} + lang_list = merge_lang(lang_list, dict_item) else: - lang_list = merge_lang(lang_list,temp_item) + lang_list = merge_lang(lang_list, temp_item) return lang_list - + if __name__ == "__main__": text = "MyGO?,你也喜欢まいご吗?" @@ -155,4 +160,3 @@ if __name__ == "__main__": text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。" print(LangSegmenter.getTexts(text)) - diff --git a/GPT_SoVITS/text/__init__.py b/GPT_SoVITS/text/__init__.py index 2791d7a..82df1fb 100644 --- a/GPT_SoVITS/text/__init__.py +++ b/GPT_SoVITS/text/__init__.py @@ -10,18 +10,19 @@ from text import symbols2 as symbols_v2 _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} + def cleaned_text_to_sequence(cleaned_text, version=None): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. Args: text: string to convert to a sequence Returns: List of integers corresponding to the symbols in the text - ''' - if version is None:version=os.environ.get('version', 'v2') - if version == "v1": - phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] - else: - phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] - - return phones + """ + if version is None: + version = os.environ.get("version", "v2") + if version == "v1": + phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] + else: + phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] + return phones diff --git a/GPT_SoVITS/text/cantonese.py b/GPT_SoVITS/text/cantonese.py index 970be20..1f07c41 100644 --- a/GPT_SoVITS/text/cantonese.py +++ b/GPT_SoVITS/text/cantonese.py @@ -1,6 +1,5 @@ # reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py -import sys import re import cn2an import ToJyutping @@ -99,9 +98,7 @@ def replace_punctuation(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text @@ -115,7 +112,9 @@ def text_normalize(text): return dest_text -punctuation_set=set(punctuation) +punctuation_set = set(punctuation) + + def jyuping_to_initials_finals_tones(jyuping_syllables): initials_finals = [] tones = [] @@ -160,12 +159,14 @@ def jyuping_to_initials_finals_tones(jyuping_syllables): assert len(initials_finals) == len(tones) ###魔改为辅音+带音调的元音 - phones=[] - for a,b in zip(initials_finals,tones): - if(b not in [-1,0]):###防止粤语和普通话重合开头加Y,如果是标点,不加。 - todo="%s%s"%(a,b) - else:todo=a - if(todo not in punctuation_set):todo="Y%s"%todo + phones = [] + for a, b in zip(initials_finals, tones): + if b not in [-1, 0]: ###防止粤语和普通话重合开头加Y,如果是标点,不加。 + todo = "%s%s" % (a, b) + else: + todo = a + if todo not in punctuation_set: + todo = "Y%s" % todo phones.append(todo) # return initials_finals, tones, word2ph @@ -218,4 +219,4 @@ if __name__ == "__main__": # phones, tones, word2ph = g2p(text) phones, word2ph = g2p(text) # print(phones, tones, word2ph) - print(phones, word2ph) \ No newline at end of file + print(phones, word2ph) diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py index 55dc997..ce44215 100644 --- a/GPT_SoVITS/text/chinese.py +++ b/GPT_SoVITS/text/chinese.py @@ -1,5 +1,4 @@ import os -import pdb import re import cn2an @@ -17,7 +16,9 @@ pinyin_to_symbol_map = { for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() } -import jieba_fast, logging +import jieba_fast +import logging + jieba_fast.setLogLevel(logging.CRITICAL) import jieba_fast.posseg as psg @@ -37,7 +38,7 @@ rep_map = { "/": ",", "—": "-", "~": "…", - "~":"…", + "~": "…", } tone_modifier = ToneSandhi() @@ -49,9 +50,7 @@ def replace_punctuation(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text @@ -62,17 +61,15 @@ def replace_punctuation_with_en(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text def replace_consecutive_punctuation(text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result @@ -87,9 +84,7 @@ def _get_initials_finals(word): initials = [] finals = [] orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) - orig_finals = lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 - ) + orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for c, v in zip(orig_initials, orig_finals): initials.append(c) finals.append(v) diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py index 2b4599d..612aa3a 100644 --- a/GPT_SoVITS/text/chinese2.py +++ b/GPT_SoVITS/text/chinese2.py @@ -1,10 +1,9 @@ import os -import pdb import re import cn2an from pypinyin import lazy_pinyin, Style -from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals +from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials from text.symbols import punctuation from text.tone_sandhi import ToneSandhi @@ -18,18 +17,26 @@ pinyin_to_symbol_map = { for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() } -import jieba_fast, logging +import jieba_fast +import logging + jieba_fast.setLogLevel(logging.CRITICAL) import jieba_fast.posseg as psg # is_g2pw_str = os.environ.get("is_g2pw", "True")##默认开启 # is_g2pw = False#True if is_g2pw_str.lower() == 'true' else False -is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False +is_g2pw = True # True if is_g2pw_str.lower() == 'true' else False if is_g2pw: # print("当前使用g2pw进行拼音推理") from text.g2pw import G2PWPinyin, correct_pronunciation + parent_directory = os.path.dirname(current_file_path) - g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True) + g2pw = G2PWPinyin( + model_dir="GPT_SoVITS/text/G2PWModel", + model_source=os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"), + v_to_u=False, + neutral_tone_with_five=True, + ) rep_map = { ":": ",", @@ -46,7 +53,7 @@ rep_map = { "/": ",", "—": "-", "~": "…", - "~":"…", + "~": "…", } tone_modifier = ToneSandhi() @@ -58,9 +65,7 @@ def replace_punctuation(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text @@ -77,9 +82,7 @@ def _get_initials_finals(word): finals = [] orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) - orig_finals = lazy_pinyin( - word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 - ) + orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for c, v in zip(orig_initials, orig_finals): initials.append(c) @@ -87,31 +90,66 @@ def _get_initials_finals(word): return initials, finals -must_erhua = { - "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" -} +must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"} not_erhua = { - "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", - "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", - "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", - "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿", "少儿" + "虐儿", + "为儿", + "护儿", + "瞒儿", + "救儿", + "替儿", + "有儿", + "一儿", + "我儿", + "俺儿", + "妻儿", + "拐儿", + "聋儿", + "乞儿", + "患儿", + "幼儿", + "孤儿", + "婴儿", + "婴幼儿", + "连体儿", + "脑瘫儿", + "流浪儿", + "体弱儿", + "混血儿", + "蜜雪儿", + "舫儿", + "祖儿", + "美儿", + "应采儿", + "可儿", + "侄儿", + "孙儿", + "侄孙儿", + "女儿", + "男儿", + "红孩儿", + "花儿", + "虫儿", + "马儿", + "鸟儿", + "猪儿", + "猫儿", + "狗儿", + "少儿", } -def _merge_erhua(initials: list[str], - finals: list[str], - word: str, - pos: str) -> list[list[str]]: + + +def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) -> list[list[str]]: """ Do erhub. """ # fix er1 for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': - finals[i] = 'er2' + if i == len(finals) - 1 and word[i] == "儿" and phn == "er1": + finals[i] = "er2" # 发音 - if word not in must_erhua and (word in not_erhua or - pos in {"a", "j", "nr"}): + if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}): return initials, finals # "……" 等情况直接返回 @@ -124,9 +162,13 @@ def _merge_erhua(initials: list[str], new_initials = [] new_finals = [] for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn in { - "er2", "er5" - } and word[-2:] not in not_erhua and new_finals: + if ( + i == len(finals) - 1 + and word[i] == "儿" + and phn in {"er2", "er5"} + and word[-2:] not in not_erhua + and new_finals + ): phn = "er" + new_finals[-1][-1] new_initials.append(initials[i]) @@ -160,7 +202,7 @@ def _g2p(segments): # assert len(sub_initials) == len(sub_finals) == len(word) initials = sum(initials, []) finals = sum(finals, []) - print("pypinyin结果",initials,finals) + print("pypinyin结果", initials, finals) else: # g2pw采用整句推理 pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3) @@ -171,19 +213,19 @@ def _g2p(segments): sub_finals = [] now_word_length = pre_word_length + len(word) - if pos == 'eng': + if pos == "eng": pre_word_length = now_word_length continue word_pinyins = pinyins[pre_word_length:now_word_length] # 多音字消歧 - word_pinyins = correct_pronunciation(word,word_pinyins) + word_pinyins = correct_pronunciation(word, word_pinyins) for pinyin in word_pinyins: if pinyin[0].isalpha(): sub_initials.append(to_initials(pinyin)) - sub_finals.append(to_finals_tone3(pinyin,neutral_tone_with_five=True)) + sub_finals.append(to_finals_tone3(pinyin, neutral_tone_with_five=True)) else: sub_initials.append(pinyin) sub_finals.append(pinyin) @@ -259,18 +301,18 @@ def replace_punctuation_with_en(text): replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) - replaced_text = re.sub( - r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text - ) + replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text) return replaced_text + def replace_consecutive_punctuation(text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result + def text_normalize(text): # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization tx = TextNormalizer() @@ -283,6 +325,7 @@ def text_normalize(text): dest_text = replace_consecutive_punctuation(dest_text) return dest_text + # 不排除英文的文本格式化 def mix_text_normalize(text): # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 98535f2..7ba8f37 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -19,55 +19,57 @@ special = [ def clean_text(text, language, version=None): - if version is None:version=os.environ.get('version', 'v2') + if version is None: + version = os.environ.get("version", "v2") if version == "v1": symbols = symbols_v1.symbols language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} else: symbols = symbols_v2.symbols - language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} + language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} - if(language not in language_module_map): - language="en" - text=" " + if language not in language_module_map: + language = "en" + text = " " for special_s, special_l, target_symbol in special: if special_s in text and language == special_l: return clean_special(text, language, special_s, target_symbol, version) - language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]]) - if hasattr(language_module,"text_normalize"): + language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) + if hasattr(language_module, "text_normalize"): norm_text = language_module.text_normalize(text) else: - norm_text=text - if language == "zh" or language=="yue":########## + norm_text = text + if language == "zh" or language == "yue": ########## phones, word2ph = language_module.g2p(norm_text) assert len(phones) == sum(word2ph) assert len(norm_text) == len(word2ph) elif language == "en": phones = language_module.g2p(norm_text) if len(phones) < 4: - phones = [','] + phones + phones = [","] + phones word2ph = None else: phones = language_module.g2p(norm_text) word2ph = None - phones = ['UNK' if ph not in symbols else ph for ph in phones] + phones = ["UNK" if ph not in symbols else ph for ph in phones] return phones, word2ph, norm_text def clean_special(text, language, special_s, target_symbol, version=None): - if version is None:version=os.environ.get('version', 'v2') + if version is None: + version = os.environ.get("version", "v2") if version == "v1": symbols = symbols_v1.symbols language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} else: symbols = symbols_v2.symbols - language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} + language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} """ 特殊静音段sp符号处理 """ text = text.replace(special_s, ",") - language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]]) + language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) norm_text = language_module.text_normalize(text) phones = language_module.g2p(norm_text) new_ph = [] @@ -81,8 +83,9 @@ def clean_special(text, language, special_s, target_symbol, version=None): def text_to_sequence(text, language, version=None): - version = os.environ.get('version',version) - if version is None:version='v2' + version = os.environ.get("version", version) + if version is None: + version = "v2" phones = clean_text(text) return cleaned_text_to_sequence(phones, version) diff --git a/GPT_SoVITS/text/en_normalization/expend.py b/GPT_SoVITS/text/en_normalization/expend.py index c0bad75..bbd607c 100644 --- a/GPT_SoVITS/text/en_normalization/expend.py +++ b/GPT_SoVITS/text/en_normalization/expend.py @@ -9,17 +9,17 @@ import unicodedata # 后缀计量单位替换表 measurement_map = { "m": ["meter", "meters"], - 'km': ["kilometer", "kilometers"], + "km": ["kilometer", "kilometers"], "km/h": ["kilometer per hour", "kilometers per hour"], "ft": ["feet", "feet"], "L": ["liter", "liters"], "tbsp": ["tablespoon", "tablespoons"], - 'tsp': ["teaspoon", "teaspoons"], + "tsp": ["teaspoon", "teaspoons"], "h": ["hour", "hours"], "min": ["minute", "minutes"], "s": ["second", "seconds"], "°C": ["degree celsius", "degrees celsius"], - "°F": ["degree fahrenheit", "degrees fahrenheit"] + "°F": ["degree fahrenheit", "degrees fahrenheit"], } @@ -27,41 +27,42 @@ measurement_map = { _inflect = inflect.engine() # 转化数字序数词 -_ordinal_number_re = re.compile(r'\b([0-9]+)\. ') +_ordinal_number_re = re.compile(r"\b([0-9]+)\. ") # 我听说好像对于数字正则识别其实用 \d 会好一点 -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") # 时间识别 -_time_re = re.compile(r'\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b') +_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b") # 后缀计量单位识别 -_measurement_re = re.compile(r'\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b') +_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b") # 前后 £ 识别 ( 写了识别两边某一边的,但是不知道为什么失败了┭┮﹏┭┮ ) -_pounds_re_start = re.compile(r'£([0-9\.\,]*[0-9]+)') -_pounds_re_end = re.compile(r'([0-9\.\,]*[0-9]+)£') +_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)") +_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£") # 前后 $ 识别 -_dollars_re_start = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_dollars_re_end = re.compile(r'([(0-9\.\,]*[0-9]+)\$') +_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)") +_dollars_re_end = re.compile(r"([(0-9\.\,]*[0-9]+)\$") # 小数的识别 -_decimal_number_re = re.compile(r'([0-9]+\.\s*[0-9]+)') +_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)") # 分数识别 (形式 "3/4" ) -_fraction_re = re.compile(r'([0-9]+/[0-9]+)') +_fraction_re = re.compile(r"([0-9]+/[0-9]+)") # 序数词识别 -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") # 数字处理 -_number_re = re.compile(r'[0-9]+') +_number_re = re.compile(r"[0-9]+") + def _convert_ordinal(m): """ - 标准化序数词, 例如: 1. 2. 3. 4. 5. 6. + 标准化序数词, 例如: 1. 2. 3. 4. 5. 6. Examples: input: "1. " output: "1st" @@ -70,24 +71,26 @@ def _convert_ordinal(m): ordinal = _inflect.ordinal(m.group(1)) return ordinal + ", " + def _remove_commas(m): - return m.group(1).replace(',', '') + return m.group(1).replace(",", "") + def _expand_time(m): """ 将 24 小时制的时间转换为 12 小时制的时间表示方式。 - + Examples: input: "13:00 / 4:00 / 13:30" output: "one o'clock p.m. / four o'clock am. / one thirty p.m." """ hours, minutes = map(int, m.group(1, 2)) - period = 'a.m.' if hours < 12 else 'p.m.' + period = "a.m." if hours < 12 else "p.m." if hours > 12: hours -= 12 hour_word = _inflect.number_to_words(hours) - minute_word = _inflect.number_to_words(minutes) if minutes != 0 else '' + minute_word = _inflect.number_to_words(minutes) if minutes != 0 else "" if minutes == 0: return f"{hour_word} o'clock {period}" @@ -103,7 +106,7 @@ def _expand_measurement(m): sign = m.group(3) ptr = 1 # 想不到怎么方便的取数字,又懒得改正则,诶,1.2 反正也是复数读法,干脆直接去掉 "." - num = int(m.group(1).replace(sign, '').replace(".",'')) + num = int(m.group(1).replace(sign, "").replace(".", "")) decimal_part = m.group(2) # 上面判断的漏洞,比如 0.1 的情况,在这里排除了 if decimal_part == None and num == 1: @@ -116,23 +119,24 @@ def _expand_pounds(m): 没找到特别规范的说明,和美元的处理一样,其实可以把两个合并在一起 """ match = m.group(1) - parts = match.split('.') + parts = match.split(".") if len(parts) > 2: - return match + ' pounds' # Unexpected format + return match + " pounds" # Unexpected format pounds = int(parts[0]) if parts[0] else 0 - pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0 if pounds and pence: - pound_unit = 'pound' if pounds == 1 else 'pounds' - penny_unit = 'penny' if pence == 1 else 'pence' - return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit) + pound_unit = "pound" if pounds == 1 else "pounds" + penny_unit = "penny" if pence == 1 else "pence" + return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit) elif pounds: - pound_unit = 'pound' if pounds == 1 else 'pounds' - return '%s %s' % (pounds, pound_unit) + pound_unit = "pound" if pounds == 1 else "pounds" + return "%s %s" % (pounds, pound_unit) elif pence: - penny_unit = 'penny' if pence == 1 else 'pence' - return '%s %s' % (pence, penny_unit) + penny_unit = "penny" if pence == 1 else "pence" + return "%s %s" % (pence, penny_unit) else: - return 'zero pounds' + return "zero pounds" + def _expand_dollars(m): """ @@ -142,23 +146,24 @@ def _expand_dollars(m): output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents" """ match = m.group(1) - parts = match.split('.') + parts = match.split(".") if len(parts) > 2: - return match + ' dollars' # Unexpected format + return match + " dollars" # Unexpected format dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0 if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit) + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit) elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) + dollar_unit = "dollar" if dollars == 1 else "dollars" + return "%s %s" % (dollars, dollar_unit) elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s" % (cents, cent_unit) else: - return 'zero dollars' + return "zero dollars" + # 小数的处理 def _expand_decimal_number(m): @@ -168,11 +173,11 @@ def _expand_decimal_number(m): output: "thirteen point two three four" """ match = m.group(1) - parts = match.split('.') + parts = match.split(".") words = [] # 遍历字符串中的每个字符 for char in parts[1]: - if char == '.': + if char == ".": words.append("point") else: words.append(char) @@ -186,7 +191,7 @@ def _expend_fraction(m): 规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法. 规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves. Examples: - + | Written | Said | |:---:|:---:| | 1/3 | one third | @@ -196,39 +201,41 @@ def _expend_fraction(m): | 3/2 | three halves | """ match = m.group(0) - numerator, denominator = map(int, match.split('/')) + numerator, denominator = map(int, match.split("/")) numerator_part = _inflect.number_to_words(numerator) if denominator == 2: if numerator == 1: - denominator_part = 'half' + denominator_part = "half" else: - denominator_part = 'halves' + denominator_part = "halves" elif denominator == 1: - return f'{numerator_part}' + return f"{numerator_part}" else: denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator)) if numerator > 1: - denominator_part += 's' + denominator_part += "s" + + return f"{numerator_part} {denominator_part}" - return f'{numerator_part} {denominator_part}' def _expand_ordinal(m): return _inflect.number_to_words(m.group(0)) + def _expand_number(m): num = int(m.group(0)) if num > 1000 and num < 3000: if num == 2000: - return 'two thousand' + return "two thousand" elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) + return "two thousand " + _inflect.number_to_words(num % 100) elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' + return _inflect.number_to_words(num // 100) + " hundred" else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") else: - return _inflect.number_to_words(num, andword='') + return _inflect.number_to_words(num, andword="") def normalize(text): @@ -238,7 +245,7 @@ def normalize(text): """ text = re.sub(_ordinal_number_re, _convert_ordinal, text) - text = re.sub(r'(? Dict[str, np.array]: +def prepare_onnx_input( + tokenizer, + labels: List[str], + char2phonemes: Dict[str, List[int]], + chars: List[str], + texts: List[str], + query_ids: List[int], + use_mask: bool = False, + window_size: int = None, + max_len: int = 512, +) -> Dict[str, np.array]: if window_size is not None: truncated_texts, truncated_query_ids = _truncate_texts( - window_size=window_size, texts=texts, query_ids=query_ids) + window_size=window_size, texts=texts, query_ids=query_ids + ) input_ids = [] token_type_ids = [] attention_masks = [] @@ -50,33 +54,27 @@ def prepare_onnx_input(tokenizer, query_id = (truncated_query_ids if window_size else query_ids)[idx] try: - tokens, text2token, token2text = tokenize_and_map( - tokenizer=tokenizer, text=text) + tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text) except Exception: print(f'warning: text "{text}" is invalid') return {} text, query_id, tokens, text2token, token2text = _truncate( - max_len=max_len, - text=text, - query_id=query_id, - tokens=tokens, - text2token=text2token, - token2text=token2text) + max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text + ) - processed_tokens = ['[CLS]'] + tokens + ['[SEP]'] + processed_tokens = ["[CLS]"] + tokens + ["[SEP]"] - input_id = list( - np.array(tokenizer.convert_tokens_to_ids(processed_tokens))) - token_type_id = list(np.zeros((len(processed_tokens), ), dtype=int)) - attention_mask = list(np.ones((len(processed_tokens), ), dtype=int)) + input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens))) + token_type_id = list(np.zeros((len(processed_tokens),), dtype=int)) + attention_mask = list(np.ones((len(processed_tokens),), dtype=int)) query_char = text[query_id] - phoneme_mask = [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] \ - if use_mask else [1] * len(labels) + phoneme_mask = ( + [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels) + ) char_id = chars.index(query_char) - position_id = text2token[ - query_id] + 1 # [CLS] token locate at first place + position_id = text2token[query_id] + 1 # [CLS] token locate at first place input_ids.append(input_id) token_type_ids.append(token_type_id) @@ -86,18 +84,17 @@ def prepare_onnx_input(tokenizer, position_ids.append(position_id) outputs = { - 'input_ids': np.array(input_ids).astype(np.int64), - 'token_type_ids': np.array(token_type_ids).astype(np.int64), - 'attention_masks': np.array(attention_masks).astype(np.int64), - 'phoneme_masks': np.array(phoneme_masks).astype(np.float32), - 'char_ids': np.array(char_ids).astype(np.int64), - 'position_ids': np.array(position_ids).astype(np.int64), + "input_ids": np.array(input_ids).astype(np.int64), + "token_type_ids": np.array(token_type_ids).astype(np.int64), + "attention_masks": np.array(attention_masks).astype(np.int64), + "phoneme_masks": np.array(phoneme_masks).astype(np.float32), + "char_ids": np.array(char_ids).astype(np.int64), + "position_ids": np.array(position_ids).astype(np.int64), } return outputs -def _truncate_texts(window_size: int, texts: List[str], - query_ids: List[int]) -> Tuple[List[str], List[int]]: +def _truncate_texts(window_size: int, texts: List[str], query_ids: List[int]) -> Tuple[List[str], List[int]]: truncated_texts = [] truncated_query_ids = [] for text, query_id in zip(texts, query_ids): @@ -111,12 +108,9 @@ def _truncate_texts(window_size: int, texts: List[str], return truncated_texts, truncated_query_ids -def _truncate(max_len: int, - text: str, - query_id: int, - tokens: List[str], - text2token: List[int], - token2text: List[Tuple[int]]): +def _truncate( + max_len: int, text: str, query_id: int, tokens: List[str], text2token: List[int], token2text: List[Tuple[int]] +): truncate_len = max_len - 2 if len(tokens) <= truncate_len: return (text, query_id, tokens, text2token, token2text) @@ -137,14 +131,16 @@ def _truncate(max_len: int, start = token2text[token_start][0] end = token2text[token_end - 1][1] - return (text[start:end], query_id - start, tokens[token_start:token_end], [ - i - token_start if i is not None else None - for i in text2token[start:end] - ], [(s - start, e - start) for s, e in token2text[token_start:token_end]]) + return ( + text[start:end], + query_id - start, + tokens[token_start:token_end], + [i - token_start if i is not None else None for i in text2token[start:end]], + [(s - start, e - start) for s, e in token2text[token_start:token_end]], + ) -def get_phoneme_labels(polyphonic_chars: List[List[str]] - ) -> Tuple[List[str], Dict[str, List[int]]]: +def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]: labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) char2phonemes = {} for char, phoneme in polyphonic_chars: @@ -154,13 +150,11 @@ def get_phoneme_labels(polyphonic_chars: List[List[str]] return labels, char2phonemes -def get_char_phoneme_labels(polyphonic_chars: List[List[str]] - ) -> Tuple[List[str], Dict[str, List[int]]]: - labels = sorted( - list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars]))) +def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]: + labels = sorted(list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars]))) char2phonemes = {} for char, phoneme in polyphonic_chars: if char not in char2phonemes: char2phonemes[char] = [] - char2phonemes[char].append(labels.index(f'{char} {phoneme}')) + char2phonemes[char].append(labels.index(f"{char} {phoneme}")) return labels, char2phonemes diff --git a/GPT_SoVITS/text/g2pw/g2pw.py b/GPT_SoVITS/text/g2pw/g2pw.py index e81b24d..08525e9 100644 --- a/GPT_SoVITS/text/g2pw/g2pw.py +++ b/GPT_SoVITS/text/g2pw/g2pw.py @@ -17,17 +17,25 @@ PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep") class G2PWPinyin(Pinyin): - def __init__(self, model_dir='G2PWModel/', model_source=None, - enable_non_tradional_chinese=True, - v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs): + def __init__( + self, + model_dir="G2PWModel/", + model_source=None, + enable_non_tradional_chinese=True, + v_to_u=False, + neutral_tone_with_five=False, + tone_sandhi=False, + **kwargs, + ): self._g2pw = G2PWOnnxConverter( model_dir=model_dir, - style='pinyin', + style="pinyin", model_source=model_source, enable_non_tradional_chinese=enable_non_tradional_chinese, ) self._converter = Converter( - self._g2pw, v_to_u=v_to_u, + self._g2pw, + v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, ) @@ -37,31 +45,25 @@ class G2PWPinyin(Pinyin): class Converter(UltimateConverter): - def __init__(self, g2pw_instance, v_to_u=False, - neutral_tone_with_five=False, - tone_sandhi=False, **kwargs): + def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs): super(Converter, self).__init__( - v_to_u=v_to_u, - neutral_tone_with_five=neutral_tone_with_five, - tone_sandhi=tone_sandhi, **kwargs) + v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, **kwargs + ) self._g2pw = g2pw_instance def convert(self, words, style, heteronym, errors, strict, **kwargs): pys = [] if RE_HANS.match(words): - pys = self._to_pinyin(words, style=style, heteronym=heteronym, - errors=errors, strict=strict) + pys = self._to_pinyin(words, style=style, heteronym=heteronym, errors=errors, strict=strict) post_data = self.post_pinyin(words, heteronym, pys) if post_data is not None: pys = post_data - pys = self.convert_styles( - pys, words, style, heteronym, errors, strict) + pys = self.convert_styles(pys, words, style, heteronym, errors, strict) else: - py = self.handle_nopinyin(words, style=style, errors=errors, - heteronym=heteronym, strict=strict) + py = self.handle_nopinyin(words, style=style, errors=errors, heteronym=heteronym, strict=strict) if py: pys.extend(py) @@ -73,13 +75,11 @@ class Converter(UltimateConverter): g2pw_pinyin = self._g2pw(han) if not g2pw_pinyin: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑 - return super(Converter, self).convert( - han, Style.TONE, heteronym, errors, strict, **kwargs) + return super(Converter, self).convert(han, Style.TONE, heteronym, errors, strict, **kwargs) for i, item in enumerate(g2pw_pinyin[0]): if item is None: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑 - py = super(Converter, self).convert( - han[i], Style.TONE, heteronym, errors, strict, **kwargs) + py = super(Converter, self).convert(han[i], Style.TONE, heteronym, errors, strict, **kwargs) pinyins.extend(py) else: pinyins.append([to_tone(item)]) @@ -104,7 +104,7 @@ def _remove_dup_and_empty(lst_list): if lst: new_lst_list.append(lst) else: - new_lst_list.append(['']) + new_lst_list.append([""]) return new_lst_list @@ -127,17 +127,17 @@ def get_dict(): def read_dict(): polyphonic_dict = {} - with open(PP_DICT_PATH,encoding="utf-8") as f: + with open(PP_DICT_PATH, encoding="utf-8") as f: line = f.readline() while line: - key, value_str = line.split(':') + key, value_str = line.split(":") value = eval(value_str.strip()) polyphonic_dict[key.strip()] = value line = f.readline() - with open(PP_FIX_DICT_PATH,encoding="utf-8") as f: + with open(PP_FIX_DICT_PATH, encoding="utf-8") as f: line = f.readline() while line: - key, value_str = line.split(':') + key, value_str = line.split(":") value = eval(value_str.strip()) polyphonic_dict[key.strip()] = value line = f.readline() diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index 78a4c93..bf3109e 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -2,44 +2,43 @@ # This code is modified from https://github.com/GitYCC/g2pW import warnings + warnings.filterwarnings("ignore") import json import os -import zipfile,requests -from typing import Any -from typing import Dict -from typing import List -from typing import Tuple +import zipfile +from typing import Any, Dict, List, Tuple import numpy as np import onnxruntime +import requests + onnxruntime.set_default_logger_severity(3) from opencc import OpenCC +from pypinyin import Style, pinyin from transformers import AutoTokenizer -from pypinyin import pinyin -from pypinyin import Style -from .dataset import get_char_phoneme_labels -from .dataset import get_phoneme_labels -from .dataset import prepare_onnx_input -from .utils import load_config from ..zh_normalization.char_convert import tranditional_to_simplified +from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input +from .utils import load_config -model_version = '1.1' +model_version = "1.1" -def predict(session, onnx_input: Dict[str, Any], - labels: List[str]) -> Tuple[List[str], List[float]]: +def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[List[str], List[float]]: all_preds = [] all_confidences = [] - probs = session.run([], { - "input_ids": onnx_input['input_ids'], - "token_type_ids": onnx_input['token_type_ids'], - "attention_mask": onnx_input['attention_masks'], - "phoneme_mask": onnx_input['phoneme_masks'], - "char_ids": onnx_input['char_ids'], - "position_ids": onnx_input['position_ids'] - })[0] + probs = session.run( + [], + { + "input_ids": onnx_input["input_ids"], + "token_type_ids": onnx_input["token_type_ids"], + "attention_mask": onnx_input["attention_masks"], + "phoneme_mask": onnx_input["phoneme_masks"], + "char_ids": onnx_input["char_ids"], + "position_ids": onnx_input["position_ids"], + }, + )[0] preds = np.argmax(probs, axis=1).tolist() max_probs = [] @@ -51,17 +50,17 @@ def predict(session, onnx_input: Dict[str, Any], return all_preds, all_confidences -def download_and_decompress(model_dir: str='G2PWModel/'): +def download_and_decompress(model_dir: str = "G2PWModel/"): if not os.path.exists(model_dir): parent_directory = os.path.dirname(model_dir) - zip_dir = os.path.join(parent_directory,"G2PWModel_1.1.zip") - extract_dir = os.path.join(parent_directory,"G2PWModel_1.1") - extract_dir_new = os.path.join(parent_directory,"G2PWModel") + zip_dir = os.path.join(parent_directory, "G2PWModel_1.1.zip") + extract_dir = os.path.join(parent_directory, "G2PWModel_1.1") + extract_dir_new = os.path.join(parent_directory, "G2PWModel") print("Downloading g2pw model...") - modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" + modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" with requests.get(modelscope_url, stream=True) as r: r.raise_for_status() - with open(zip_dir, 'wb') as f: + with open(zip_dir, "wb") as f: for chunk in r.iter_content(chunk_size=8192): if chunk: f.write(chunk) @@ -69,17 +68,20 @@ def download_and_decompress(model_dir: str='G2PWModel/'): print("Extracting g2pw model...") with zipfile.ZipFile(zip_dir, "r") as zip_ref: zip_ref.extractall(parent_directory) - + os.rename(extract_dir, extract_dir_new) return model_dir + class G2PWOnnxConverter: - def __init__(self, - model_dir: str='G2PWModel/', - style: str='bopomofo', - model_source: str=None, - enable_non_tradional_chinese: bool=False): + def __init__( + self, + model_dir: str = "G2PWModel/", + style: str = "bopomofo", + model_source: str = None, + enable_non_tradional_chinese: bool = False, + ): uncompress_path = download_and_decompress(model_dir) sess_options = onnxruntime.SessionOptions() @@ -87,41 +89,59 @@ class G2PWOnnxConverter: sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL sess_options.intra_op_num_threads = 2 try: - self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + self.session_g2pW = onnxruntime.InferenceSession( + os.path.join(uncompress_path, "g2pW.onnx"), + sess_options=sess_options, + providers=["CUDAExecutionProvider", "CPUExecutionProvider"], + ) except: - self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CPUExecutionProvider']) - self.config = load_config( - config_path=os.path.join(uncompress_path, 'config.py'), - use_default=True) + self.session_g2pW = onnxruntime.InferenceSession( + os.path.join(uncompress_path, "g2pW.onnx"), + sess_options=sess_options, + providers=["CPUExecutionProvider"], + ) + self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True) self.model_source = model_source if model_source else self.config.model_source self.enable_opencc = enable_non_tradional_chinese self.tokenizer = AutoTokenizer.from_pretrained(self.model_source) - polyphonic_chars_path = os.path.join(uncompress_path, - 'POLYPHONIC_CHARS.txt') - monophonic_chars_path = os.path.join(uncompress_path, - 'MONOPHONIC_CHARS.txt') + polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt") + monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt") self.polyphonic_chars = [ - line.split('\t') - for line in open(polyphonic_chars_path, encoding='utf-8').read() - .strip().split('\n') + line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n") ] self.non_polyphonic = { - '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听', '噢' + "一", + "不", + "和", + "咋", + "嗲", + "剖", + "差", + "攢", + "倒", + "難", + "奔", + "勁", + "拗", + "肖", + "瘙", + "誒", + "泊", + "听", + "噢", } - self.non_monophonic = {'似', '攢'} + self.non_monophonic = {"似", "攢"} self.monophonic_chars = [ - line.split('\t') - for line in open(monophonic_chars_path, encoding='utf-8').read() - .strip().split('\n') + line.split("\t") for line in open(monophonic_chars_path, encoding="utf-8").read().strip().split("\n") ] - self.labels, self.char2phonemes = get_char_phoneme_labels( - polyphonic_chars=self.polyphonic_chars - ) if self.config.use_char_phoneme else get_phoneme_labels( - polyphonic_chars=self.polyphonic_chars) + self.labels, self.char2phonemes = ( + get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars) + if self.config.use_char_phoneme + else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars) + ) self.chars = sorted(list(self.char2phonemes.keys())) @@ -130,41 +150,29 @@ class G2PWOnnxConverter: if char in self.polyphonic_chars_new: self.polyphonic_chars_new.remove(char) - self.monophonic_chars_dict = { - char: phoneme - for char, phoneme in self.monophonic_chars - } + self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars} for char in self.non_monophonic: if char in self.monophonic_chars_dict: self.monophonic_chars_dict.pop(char) - self.pos_tags = [ - 'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI' - ] + self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"] - with open( - os.path.join(uncompress_path, - 'bopomofo_to_pinyin_wo_tune_dict.json'), - 'r', - encoding='utf-8') as fr: + with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr: self.bopomofo_convert_dict = json.load(fr) self.style_convert_func = { - 'bopomofo': lambda x: x, - 'pinyin': self._convert_bopomofo_to_pinyin, + "bopomofo": lambda x: x, + "pinyin": self._convert_bopomofo_to_pinyin, }[style] - with open( - os.path.join(uncompress_path, 'char_bopomofo_dict.json'), - 'r', - encoding='utf-8') as fr: + with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr: self.char_bopomofo_dict = json.load(fr) if self.enable_opencc: - self.cc = OpenCC('s2tw') + self.cc = OpenCC("s2tw") def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str: tone = bopomofo[-1] - assert tone in '12345' + assert tone in "12345" component = self.bopomofo_convert_dict.get(bopomofo[:-1]) if component: return component + tone @@ -184,8 +192,7 @@ class G2PWOnnxConverter: translated_sentences.append(translated_sent) sentences = translated_sentences - texts, query_ids, sent_ids, partial_results = self._prepare_data( - sentences=sentences) + texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences) if len(texts) == 0: # sentences no polyphonic words return partial_results @@ -198,14 +205,12 @@ class G2PWOnnxConverter: texts=texts, query_ids=query_ids, use_mask=self.config.use_mask, - window_size=None) + window_size=None, + ) - preds, confidences = predict( - session=self.session_g2pW, - onnx_input=onnx_input, - labels=self.labels) + preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels) if self.config.use_char_phoneme: - preds = [pred.split(' ')[1] for pred in preds] + preds = [pred.split(" ")[1] for pred in preds] results = partial_results for sent_id, query_id, pred in zip(sent_ids, query_ids, preds): @@ -213,15 +218,12 @@ class G2PWOnnxConverter: return results - def _prepare_data( - self, sentences: List[str] - ) -> Tuple[List[str], List[int], List[int], List[List[str]]]: + def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]: texts, query_ids, sent_ids, partial_results = [], [], [], [] for sent_id, sent in enumerate(sentences): # pypinyin works well for Simplified Chinese than Traditional Chinese sent_s = tranditional_to_simplified(sent) - pypinyin_result = pinyin( - sent_s, neutral_tone_with_five=True, style=Style.TONE3) + pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3) partial_result = [None] * len(sent) for i, char in enumerate(sent): if char in self.polyphonic_chars_new: @@ -229,8 +231,7 @@ class G2PWOnnxConverter: query_ids.append(i) sent_ids.append(sent_id) elif char in self.monophonic_chars_dict: - partial_result[i] = self.style_convert_func( - self.monophonic_chars_dict[char]) + partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char]) elif char in self.char_bopomofo_dict: partial_result[i] = pypinyin_result[i][0] # partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0]) diff --git a/GPT_SoVITS/text/g2pw/utils.py b/GPT_SoVITS/text/g2pw/utils.py index ba9ce51..a86b2bc 100644 --- a/GPT_SoVITS/text/g2pw/utils.py +++ b/GPT_SoVITS/text/g2pw/utils.py @@ -15,6 +15,7 @@ Credits This code is modified from https://github.com/GitYCC/g2pW """ + import os import re @@ -24,14 +25,14 @@ def wordize_and_map(text: str): index_map_from_text_to_word = [] index_map_from_word_to_text = [] while len(text) > 0: - match_space = re.match(r'^ +', text) + match_space = re.match(r"^ +", text) if match_space: space_str = match_space.group(0) index_map_from_text_to_word += [None] * len(space_str) - text = text[len(space_str):] + text = text[len(space_str) :] continue - match_en = re.match(r'^[a-zA-Z0-9]+', text) + match_en = re.match(r"^[a-zA-Z0-9]+", text) if match_en: en_word = match_en.group(0) @@ -42,7 +43,7 @@ def wordize_and_map(text: str): index_map_from_text_to_word += [len(words)] * len(en_word) words.append(en_word) - text = text[len(en_word):] + text = text[len(en_word) :] else: word_start_pos = len(index_map_from_text_to_word) word_end_pos = word_start_pos + 1 @@ -63,15 +64,14 @@ def tokenize_and_map(tokenizer, text: str): for word, (word_start, word_end) in zip(words, word2text): word_tokens = tokenizer.tokenize(word) - if len(word_tokens) == 0 or word_tokens == ['[UNK]']: + if len(word_tokens) == 0 or word_tokens == ["[UNK]"]: index_map_from_token_to_text.append((word_start, word_end)) - tokens.append('[UNK]') + tokens.append("[UNK]") else: current_word_start = word_start for word_token in word_tokens: - word_token_len = len(re.sub(r'^##', '', word_token)) - index_map_from_token_to_text.append( - (current_word_start, current_word_start + word_token_len)) + word_token_len = len(re.sub(r"^##", "", word_token)) + index_map_from_token_to_text.append((current_word_start, current_word_start + word_token_len)) current_word_start = current_word_start + word_token_len tokens.append(word_token) @@ -85,53 +85,51 @@ def tokenize_and_map(tokenizer, text: str): def _load_config(config_path: os.PathLike): import importlib.util - spec = importlib.util.spec_from_file_location('__init__', config_path) + + spec = importlib.util.spec_from_file_location("__init__", config_path) config = importlib.util.module_from_spec(spec) spec.loader.exec_module(config) return config default_config_dict = { - 'manual_seed': 1313, - 'model_source': 'bert-base-chinese', - 'window_size': 32, - 'num_workers': 2, - 'use_mask': True, - 'use_char_phoneme': False, - 'use_conditional': True, - 'param_conditional': { - 'affect_location': 'softmax', - 'bias': True, - 'char-linear': True, - 'pos-linear': False, - 'char+pos-second': True, - 'char+pos-second_lowrank': False, - 'lowrank_size': 0, - 'char+pos-second_fm': False, - 'fm_size': 0, - 'fix_mode': None, - 'count_json': 'train.count.json' + "manual_seed": 1313, + "model_source": "bert-base-chinese", + "window_size": 32, + "num_workers": 2, + "use_mask": True, + "use_char_phoneme": False, + "use_conditional": True, + "param_conditional": { + "affect_location": "softmax", + "bias": True, + "char-linear": True, + "pos-linear": False, + "char+pos-second": True, + "char+pos-second_lowrank": False, + "lowrank_size": 0, + "char+pos-second_fm": False, + "fm_size": 0, + "fix_mode": None, + "count_json": "train.count.json", }, - 'lr': 5e-5, - 'val_interval': 200, - 'num_iter': 10000, - 'use_focal': False, - 'param_focal': { - 'alpha': 0.0, - 'gamma': 0.7 + "lr": 5e-5, + "val_interval": 200, + "num_iter": 10000, + "use_focal": False, + "param_focal": {"alpha": 0.0, "gamma": 0.7}, + "use_pos": True, + "param_pos ": { + "weight": 0.1, + "pos_joint_training": True, + "train_pos_path": "train.pos", + "valid_pos_path": "dev.pos", + "test_pos_path": "test.pos", }, - 'use_pos': True, - 'param_pos ': { - 'weight': 0.1, - 'pos_joint_training': True, - 'train_pos_path': 'train.pos', - 'valid_pos_path': 'dev.pos', - 'test_pos_path': 'test.pos' - } } -def load_config(config_path: os.PathLike, use_default: bool=False): +def load_config(config_path: os.PathLike, use_default: bool = False): config = _load_config(config_path) if use_default: for attr, val in default_config_dict.items(): diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py index e023ce7..a54d0cf 100644 --- a/GPT_SoVITS/text/japanese.py +++ b/GPT_SoVITS/text/japanese.py @@ -2,43 +2,51 @@ import re import os import hashlib + try: import pyopenjtalk + current_file_path = os.path.dirname(__file__) # 防止win下无法读取模型 - if os.name == 'nt': + if os.name == "nt": python_dir = os.getcwd() OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8") - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', OPEN_JTALK_DICT_DIR)): - if (OPEN_JTALK_DICT_DIR[:len(python_dir)].upper() == python_dir.upper()): - OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR,python_dir)) + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", OPEN_JTALK_DICT_DIR)): + if OPEN_JTALK_DICT_DIR[: len(python_dir)].upper() == python_dir.upper(): + OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR, python_dir)) else: import shutil - if not os.path.exists('TEMP'): - os.mkdir('TEMP') + + if not os.path.exists("TEMP"): + os.mkdir("TEMP") if not os.path.exists(os.path.join("TEMP", "ja")): os.mkdir(os.path.join("TEMP", "ja")) if os.path.exists(os.path.join("TEMP", "ja", "open_jtalk_dic")): shutil.rmtree(os.path.join("TEMP", "ja", "open_jtalk_dic")) - shutil.copytree(pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"), os.path.join("TEMP", "ja", "open_jtalk_dic"), ) + shutil.copytree( + pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"), + os.path.join("TEMP", "ja", "open_jtalk_dic"), + ) OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic") pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8") - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', current_file_path)): - if (current_file_path[:len(python_dir)].upper() == python_dir.upper()): - current_file_path = os.path.join(os.path.relpath(current_file_path,python_dir)) + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", current_file_path)): + if current_file_path[: len(python_dir)].upper() == python_dir.upper(): + current_file_path = os.path.join(os.path.relpath(current_file_path, python_dir)) else: - if not os.path.exists('TEMP'): - os.mkdir('TEMP') + if not os.path.exists("TEMP"): + os.mkdir("TEMP") if not os.path.exists(os.path.join("TEMP", "ja")): os.mkdir(os.path.join("TEMP", "ja")) if not os.path.exists(os.path.join("TEMP", "ja", "ja_userdic")): os.mkdir(os.path.join("TEMP", "ja", "ja_userdic")) - shutil.copyfile(os.path.join(current_file_path, "ja_userdic", "userdict.csv"),os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv")) + shutil.copyfile( + os.path.join(current_file_path, "ja_userdic", "userdict.csv"), + os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"), + ) current_file_path = os.path.join("TEMP", "ja") - def get_hash(fp: str) -> str: hash_md5 = hashlib.md5() with open(fp, "rb") as f: @@ -51,21 +59,26 @@ try: USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5") # 如果没有用户词典,就生成一个;如果有,就检查md5,如果不一样,就重新生成 if os.path.exists(USERDIC_CSV_PATH): - if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r",encoding='utf-8').read(): + if ( + not os.path.exists(USERDIC_BIN_PATH) + or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read() + ): pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH) - with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f: + with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f: f.write(get_hash(USERDIC_CSV_PATH)) if os.path.exists(USERDIC_BIN_PATH): - pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH) -except Exception as e: + pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH) +except Exception: # print(e) import pyopenjtalk + # failed to load user dictionary, ignore. pass from text.symbols import punctuation + # Regular expression matching Japanese without punctuation marks: _japanese_characters = re.compile( r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" @@ -123,9 +136,9 @@ def post_replace_ph(ph): def replace_consecutive_punctuation(text): - punctuations = ''.join(re.escape(p) for p in punctuation) - pattern = f'([{punctuations}])([{punctuations}])+' - result = re.sub(pattern, r'\1', text) + punctuations = "".join(re.escape(p) for p in punctuation) + pattern = f"([{punctuations}])([{punctuations}])+" + result = re.sub(pattern, r"\1", text) return result @@ -152,7 +165,7 @@ def preprocess_jap(text, with_prosody=False): text += p.split(" ") if i < len(marks): - if marks[i] == " ":# 防止意外的UNK + if marks[i] == " ": # 防止意外的UNK continue text += [marks[i].replace(" ", "")] return text @@ -165,6 +178,7 @@ def text_normalize(text): text = replace_consecutive_punctuation(text) return text + # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): """Extract phoneme + prosoody symbol sequence from input full-context labels. @@ -241,6 +255,7 @@ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): return phones + # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py def _numeric_feature_by_regex(regex, s): match = re.search(regex, s) @@ -248,6 +263,7 @@ def _numeric_feature_by_regex(regex, s): return -50 return int(match.group(1)) + def g2p(norm_text, with_prosody=True): phones = preprocess_jap(norm_text, with_prosody) phones = [post_replace_ph(i) for i in phones] diff --git a/GPT_SoVITS/text/korean.py b/GPT_SoVITS/text/korean.py index daae41f..254b05c 100644 --- a/GPT_SoVITS/text/korean.py +++ b/GPT_SoVITS/text/korean.py @@ -9,39 +9,43 @@ import importlib import os # 防止win下无法读取模型 -if os.name == 'nt': +if os.name == "nt": + class win_G2p(G2p): def check_mecab(self): super().check_mecab() spam_spec = importlib.util.find_spec("eunjeon") non_found = spam_spec is None if non_found: - print(f'you have to install eunjeon. install it...') + print("you have to install eunjeon. install it...") else: installpath = spam_spec.submodule_search_locations[0] - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)): - + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)): import sys from eunjeon import Mecab as _Mecab + class Mecab(_Mecab): def get_dicpath(installpath): - if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)): + if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)): import shutil - python_dir = os.getcwd() - if (installpath[:len(python_dir)].upper() == python_dir.upper()): - dicpath = os.path.join(os.path.relpath(installpath,python_dir),'data','mecabrc') - else: - if not os.path.exists('TEMP'): - os.mkdir('TEMP') - if not os.path.exists(os.path.join('TEMP', 'ko')): - os.mkdir(os.path.join('TEMP', 'ko')) - if os.path.exists(os.path.join('TEMP', 'ko', 'ko_dict')): - shutil.rmtree(os.path.join('TEMP', 'ko', 'ko_dict')) - shutil.copytree(os.path.join(installpath, 'data'), os.path.join('TEMP', 'ko', 'ko_dict')) - dicpath = os.path.join('TEMP', 'ko', 'ko_dict', 'mecabrc') + python_dir = os.getcwd() + if installpath[: len(python_dir)].upper() == python_dir.upper(): + dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc") + else: + if not os.path.exists("TEMP"): + os.mkdir("TEMP") + if not os.path.exists(os.path.join("TEMP", "ko")): + os.mkdir(os.path.join("TEMP", "ko")) + if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")): + shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict")) + + shutil.copytree( + os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict") + ) + dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc") else: - dicpath=os.path.abspath(os.path.join(installpath, 'data/mecabrc')) + dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc")) return dicpath def __init__(self, dicpath=get_dicpath(installpath)): @@ -52,97 +56,108 @@ if os.name == 'nt': G2p = win_G2p -from text.symbols2 import symbols +from text.symbols2 import symbols # This is a list of Korean classifiers preceded by pure Korean numerals. -_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' +_korean_classifiers = ( + "군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통" +) # List of (hangul, hangul divided) pairs: -_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ - # ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule - # ('ㄵ', 'ㄴㅈ'), - # ('ㄶ', 'ㄴㅎ'), - # ('ㄺ', 'ㄹㄱ'), - # ('ㄻ', 'ㄹㅁ'), - # ('ㄼ', 'ㄹㅂ'), - # ('ㄽ', 'ㄹㅅ'), - # ('ㄾ', 'ㄹㅌ'), - # ('ㄿ', 'ㄹㅍ'), - # ('ㅀ', 'ㄹㅎ'), - # ('ㅄ', 'ㅂㅅ'), - ('ㅘ', 'ㅗㅏ'), - ('ㅙ', 'ㅗㅐ'), - ('ㅚ', 'ㅗㅣ'), - ('ㅝ', 'ㅜㅓ'), - ('ㅞ', 'ㅜㅔ'), - ('ㅟ', 'ㅜㅣ'), - ('ㅢ', 'ㅡㅣ'), - ('ㅑ', 'ㅣㅏ'), - ('ㅒ', 'ㅣㅐ'), - ('ㅕ', 'ㅣㅓ'), - ('ㅖ', 'ㅣㅔ'), - ('ㅛ', 'ㅣㅗ'), - ('ㅠ', 'ㅣㅜ') -]] +_hangul_divided = [ + (re.compile("%s" % x[0]), x[1]) + for x in [ + # ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule + # ('ㄵ', 'ㄴㅈ'), + # ('ㄶ', 'ㄴㅎ'), + # ('ㄺ', 'ㄹㄱ'), + # ('ㄻ', 'ㄹㅁ'), + # ('ㄼ', 'ㄹㅂ'), + # ('ㄽ', 'ㄹㅅ'), + # ('ㄾ', 'ㄹㅌ'), + # ('ㄿ', 'ㄹㅍ'), + # ('ㅀ', 'ㄹㅎ'), + # ('ㅄ', 'ㅂㅅ'), + ("ㅘ", "ㅗㅏ"), + ("ㅙ", "ㅗㅐ"), + ("ㅚ", "ㅗㅣ"), + ("ㅝ", "ㅜㅓ"), + ("ㅞ", "ㅜㅔ"), + ("ㅟ", "ㅜㅣ"), + ("ㅢ", "ㅡㅣ"), + ("ㅑ", "ㅣㅏ"), + ("ㅒ", "ㅣㅐ"), + ("ㅕ", "ㅣㅓ"), + ("ㅖ", "ㅣㅔ"), + ("ㅛ", "ㅣㅗ"), + ("ㅠ", "ㅣㅜ"), + ] +] # List of (Latin alphabet, hangul) pairs: -_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ - ('a', '에이'), - ('b', '비'), - ('c', '시'), - ('d', '디'), - ('e', '이'), - ('f', '에프'), - ('g', '지'), - ('h', '에이치'), - ('i', '아이'), - ('j', '제이'), - ('k', '케이'), - ('l', '엘'), - ('m', '엠'), - ('n', '엔'), - ('o', '오'), - ('p', '피'), - ('q', '큐'), - ('r', '아르'), - ('s', '에스'), - ('t', '티'), - ('u', '유'), - ('v', '브이'), - ('w', '더블유'), - ('x', '엑스'), - ('y', '와이'), - ('z', '제트') -]] +_latin_to_hangul = [ + (re.compile("%s" % x[0], re.IGNORECASE), x[1]) + for x in [ + ("a", "에이"), + ("b", "비"), + ("c", "시"), + ("d", "디"), + ("e", "이"), + ("f", "에프"), + ("g", "지"), + ("h", "에이치"), + ("i", "아이"), + ("j", "제이"), + ("k", "케이"), + ("l", "엘"), + ("m", "엠"), + ("n", "엔"), + ("o", "오"), + ("p", "피"), + ("q", "큐"), + ("r", "아르"), + ("s", "에스"), + ("t", "티"), + ("u", "유"), + ("v", "브이"), + ("w", "더블유"), + ("x", "엑스"), + ("y", "와이"), + ("z", "제트"), + ] +] # List of (ipa, lazy ipa) pairs: -_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ - ('t͡ɕ','ʧ'), - ('d͡ʑ','ʥ'), - ('ɲ','n^'), - ('ɕ','ʃ'), - ('ʷ','w'), - ('ɭ','l`'), - ('ʎ','ɾ'), - ('ɣ','ŋ'), - ('ɰ','ɯ'), - ('ʝ','j'), - ('ʌ','ə'), - ('ɡ','g'), - ('\u031a','#'), - ('\u0348','='), - ('\u031e',''), - ('\u0320',''), - ('\u0339','') -]] +_ipa_to_lazy_ipa = [ + (re.compile("%s" % x[0], re.IGNORECASE), x[1]) + for x in [ + ("t͡ɕ", "ʧ"), + ("d͡ʑ", "ʥ"), + ("ɲ", "n^"), + ("ɕ", "ʃ"), + ("ʷ", "w"), + ("ɭ", "l`"), + ("ʎ", "ɾ"), + ("ɣ", "ŋ"), + ("ɰ", "ɯ"), + ("ʝ", "j"), + ("ʌ", "ə"), + ("ɡ", "g"), + ("\u031a", "#"), + ("\u0348", "="), + ("\u031e", ""), + ("\u0320", ""), + ("\u0339", ""), + ] +] def fix_g2pk2_error(text): new_text = "" i = 0 while i < len(text) - 4: - if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == 'ㄹ': - new_text += text[i:i+3] + ' ' + 'ㄴ' + if (text[i : i + 3] == "ㅇㅡㄹ" or text[i : i + 3] == "ㄹㅡㄹ") and text[i + 3] == " " and text[i + 4] == "ㄹ": + new_text += text[i : i + 3] + " " + "ㄴ" i += 5 else: new_text += text[i] @@ -166,20 +181,20 @@ def divide_hangul(text): def hangul_number(num, sino=True): - '''Reference https://github.com/Kyubyong/g2pK''' - num = re.sub(',', '', num) + """Reference https://github.com/Kyubyong/g2pK""" + num = re.sub(",", "", num) - if num == '0': - return '영' - if not sino and num == '20': - return '스무' + if num == "0": + return "영" + if not sino and num == "20": + return "스무" - digits = '123456789' - names = '일이삼사오육칠팔구' + digits = "123456789" + names = "일이삼사오육칠팔구" digit2name = {d: n for d, n in zip(digits, names)} - modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' - decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' + modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉" + decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔" digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} @@ -188,75 +203,75 @@ def hangul_number(num, sino=True): i = len(num) - i - 1 if sino: if i == 0: - name = digit2name.get(digit, '') + name = digit2name.get(digit, "") elif i == 1: - name = digit2name.get(digit, '') + '십' - name = name.replace('일십', '십') + name = digit2name.get(digit, "") + "십" + name = name.replace("일십", "십") else: if i == 0: - name = digit2mod.get(digit, '') + name = digit2mod.get(digit, "") elif i == 1: - name = digit2dec.get(digit, '') - if digit == '0': + name = digit2dec.get(digit, "") + if digit == "0": if i % 4 == 0: - last_three = spelledout[-min(3, len(spelledout)):] - if ''.join(last_three) == '': - spelledout.append('') + last_three = spelledout[-min(3, len(spelledout)) :] + if "".join(last_three) == "": + spelledout.append("") continue else: - spelledout.append('') + spelledout.append("") continue if i == 2: - name = digit2name.get(digit, '') + '백' - name = name.replace('일백', '백') + name = digit2name.get(digit, "") + "백" + name = name.replace("일백", "백") elif i == 3: - name = digit2name.get(digit, '') + '천' - name = name.replace('일천', '천') + name = digit2name.get(digit, "") + "천" + name = name.replace("일천", "천") elif i == 4: - name = digit2name.get(digit, '') + '만' - name = name.replace('일만', '만') + name = digit2name.get(digit, "") + "만" + name = name.replace("일만", "만") elif i == 5: - name = digit2name.get(digit, '') + '십' - name = name.replace('일십', '십') + name = digit2name.get(digit, "") + "십" + name = name.replace("일십", "십") elif i == 6: - name = digit2name.get(digit, '') + '백' - name = name.replace('일백', '백') + name = digit2name.get(digit, "") + "백" + name = name.replace("일백", "백") elif i == 7: - name = digit2name.get(digit, '') + '천' - name = name.replace('일천', '천') + name = digit2name.get(digit, "") + "천" + name = name.replace("일천", "천") elif i == 8: - name = digit2name.get(digit, '') + '억' + name = digit2name.get(digit, "") + "억" elif i == 9: - name = digit2name.get(digit, '') + '십' + name = digit2name.get(digit, "") + "십" elif i == 10: - name = digit2name.get(digit, '') + '백' + name = digit2name.get(digit, "") + "백" elif i == 11: - name = digit2name.get(digit, '') + '천' + name = digit2name.get(digit, "") + "천" elif i == 12: - name = digit2name.get(digit, '') + '조' + name = digit2name.get(digit, "") + "조" elif i == 13: - name = digit2name.get(digit, '') + '십' + name = digit2name.get(digit, "") + "십" elif i == 14: - name = digit2name.get(digit, '') + '백' + name = digit2name.get(digit, "") + "백" elif i == 15: - name = digit2name.get(digit, '') + '천' + name = digit2name.get(digit, "") + "천" spelledout.append(name) - return ''.join(elem for elem in spelledout) + return "".join(elem for elem in spelledout) def number_to_hangul(text): - '''Reference https://github.com/Kyubyong/g2pK''' - tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) + """Reference https://github.com/Kyubyong/g2pK""" + tokens = set(re.findall(r"(\d[\d,]*)([\uac00-\ud71f]+)", text)) for token in tokens: num, classifier = token if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: spelledout = hangul_number(num, sino=False) else: spelledout = hangul_number(num, sino=True) - text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') + text = text.replace(f"{num}{classifier}", f"{spelledout}{classifier}") # digit by digit for remaining digits - digits = '0123456789' - names = '영일이삼사오육칠팔구' + digits = "0123456789" + names = "영일이삼사오육칠팔구" for d, n in zip(digits, names): text = text.replace(d, n) return text @@ -265,19 +280,23 @@ def number_to_hangul(text): def korean_to_lazy_ipa(text): text = latin_to_hangul(text) text = number_to_hangul(text) - text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) + text = re.sub("[\uac00-\ud7af]+", lambda x: ko_pron.romanise(x.group(0), "ipa").split("] ~ [")[0], text) for regex, replacement in _ipa_to_lazy_ipa: text = re.sub(regex, replacement, text) return text -_g2p=G2p() + +_g2p = G2p() + + def korean_to_ipa(text): text = latin_to_hangul(text) text = number_to_hangul(text) text = _g2p(text) text = fix_g2pk2_error(text) text = korean_to_lazy_ipa(text) - return text.replace('ʧ','tʃ').replace('ʥ','dʑ') + return text.replace("ʧ", "tʃ").replace("ʥ", "dʑ") + def post_replace_ph(ph): rep_map = { @@ -301,12 +320,13 @@ def post_replace_ph(ph): ph = "停" return ph + def g2p(text): text = latin_to_hangul(text) text = _g2p(text) text = divide_hangul(text) text = fix_g2pk2_error(text) - text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) + text = re.sub(r"([\u3131-\u3163])$", r"\1.", text) # text = "".join([post_replace_ph(i) for i in text]) text = [post_replace_ph(i) for i in text] return text @@ -314,4 +334,4 @@ def g2p(text): if __name__ == "__main__": text = "안녕하세요" - print(g2p(text)) \ No newline at end of file + print(g2p(text)) diff --git a/GPT_SoVITS/text/symbols.py b/GPT_SoVITS/text/symbols.py index 7049949..b012882 100644 --- a/GPT_SoVITS/text/symbols.py +++ b/GPT_SoVITS/text/symbols.py @@ -1,5 +1,3 @@ -import os - # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 punctuation.append("-") diff --git a/GPT_SoVITS/text/symbols2.py b/GPT_SoVITS/text/symbols2.py index a442350..2f159d2 100644 --- a/GPT_SoVITS/text/symbols2.py +++ b/GPT_SoVITS/text/symbols2.py @@ -1,5 +1,3 @@ -import os - # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 punctuation.append("-") @@ -396,24 +394,404 @@ arpa = { "SH", } -ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停' +ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停" # ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ ' -yue_symbols={'Yeot3', 'Yip1', 'Yyu3', 'Yeng4', 'Yut5', 'Yaan5', 'Ym5', 'Yaan6', 'Yang1', 'Yun4', 'Yon2', 'Yui5', 'Yun2', 'Yat3', 'Ye', 'Yeot1', 'Yoeng5', 'Yoek2', 'Yam2', 'Yeon6', 'Yu6', 'Yiu3', 'Yaang6', 'Yp5', 'Yai4', 'Yoek4', 'Yit6', 'Yam5', 'Yoeng6', 'Yg1', 'Yk3', 'Yoe4', 'Yam3', 'Yc', 'Yyu4', 'Yyut1', 'Yiu4', 'Ying3', 'Yip3', 'Yaap3', 'Yau3', 'Yan4', 'Yau1', 'Yap4', 'Yk6', 'Yok3', 'Yai1', 'Yeot6', 'Yan2', 'Yoek6', 'Yt1', 'Yoi1', 'Yit5', 'Yn4', 'Yaau3', 'Yau4', 'Yuk6', 'Ys', 'Yuk', 'Yin6', 'Yung6', 'Ya', 'You', 'Yaai5', 'Yau5', 'Yoi3', 'Yaak3', 'Yaat3', 'Ying2', 'Yok5', 'Yeng2', 'Yyut3', 'Yam1', 'Yip5', 'You1', 'Yam6', 'Yaa5', 'Yi6', 'Yek4', 'Yyu2', 'Yuk5', 'Yaam1', 'Yang2', 'Yai', 'Yiu6', 'Yin4', 'Yok4', 'Yot3', 'Yui2', 'Yeoi5', 'Yyun6', 'Yyu5', 'Yoi5', 'Yeot2', 'Yim4', 'Yeoi2', 'Yaan1', 'Yang6', 'Yong1', 'Yaang4', 'Yung5', 'Yeon1', 'Yin2', 'Ya3', 'Yaang3', 'Yg', 'Yk2', 'Yaau5', 'Yut1', 'Yt5', 'Yip4', 'Yung4', 'Yj', 'Yong3', 'Ya1', 'Yg6', 'Yaau6', 'Yit3', 'Yun3', 'Ying1', 'Yn2', 'Yg4', 'Yl', 'Yp3', 'Yn3', 'Yak1', 'Yang5', 'Yoe6', 'You2', 'Yap2', 'Yak2', 'Yt3', 'Yot5', 'Yim2', 'Yi1', 'Yn6', 'Yaat5', 'Yaam3', 'Yoek5', 'Ye3', 'Yeon4', 'Yaa2', 'Yu3', 'Yim6', 'Ym', 'Yoe3', 'Yaai2', 'Ym2', 'Ya6', 'Yeng6', 'Yik4', 'Yot4', 'Yaai4', 'Yyun3', 'Yu1', 'Yoeng1', 'Yaap2', 'Yuk3', 'Yoek3', 'Yeng5', 'Yeoi1', 'Yiu2', 'Yok1', 'Yo1', 'Yoek1', 'Yoeng2', 'Yeon5', 'Yiu1', 'Yoeng4', 'Yuk2', 'Yat4', 'Yg5', 'Yut4', 'Yan6', 'Yin3', 'Yaa6', 'Yap1', 'Yg2', 'Yoe5', 'Yt4', 'Ya5', 'Yo4', 'Yyu1', 'Yak3', 'Yeon2', 'Yong4', 'Ym1', 'Ye2', 'Yaang5', 'Yoi2', 'Yeng3', 'Yn', 'Yyut4', 'Yau', 'Yaak2', 'Yaan4', 'Yek2', 'Yin1', 'Yi5', 'Yoe2', 'Yei5', 'Yaat6', 'Yak5', 'Yp6', 'Yok6', 'Yei2', 'Yaap1', 'Yyut5', 'Yi4', 'Yim1', 'Yk5', 'Ye4', 'Yok2', 'Yaam6', 'Yat2', 'Yon6', 'Yei3', 'Yyu6', 'Yeot5', 'Yk4', 'Yai6', 'Yd', 'Yg3', 'Yei6', 'Yau2', 'Yok', 'Yau6', 'Yung3', 'Yim5', 'Yut6', 'Yit1', 'Yon3', 'Yat1', 'Yaam2', 'Yyut2', 'Yui6', 'Yt2', 'Yek6', 'Yt', 'Ye6', 'Yang3', 'Ying6', 'Yaau1', 'Yeon3', 'Yng', 'Yh', 'Yang4', 'Ying5', 'Yaap6', 'Yoeng3', 'Yyun4', 'You3', 'Yan5', 'Yat5', 'Yot1', 'Yun1', 'Yi3', 'Yaa1', 'Yaap4', 'You6', 'Yaang2', 'Yaap5', 'Yaa3', 'Yaak6', 'Yeng1', 'Yaak1', 'Yo5', 'Yoi4', 'Yam4', 'Yik1', 'Ye1', 'Yai5', 'Yung1', 'Yp2', 'Yui4', 'Yaak4', 'Yung2', 'Yak4', 'Yaat4', 'Yeoi4', 'Yut2', 'Yin5', 'Yaau4', 'Yap6', 'Yb', 'Yaam4', 'Yw', 'Yut3', 'Yong2', 'Yt6', 'Yaai6', 'Yap5', 'Yik5', 'Yun6', 'Yaam5', 'Yun5', 'Yik3', 'Ya2', 'Yyut6', 'Yon4', 'Yk1', 'Yit4', 'Yak6', 'Yaan2', 'Yuk1', 'Yai2', 'Yik2', 'Yaat2', 'Yo3', 'Ykw', 'Yn5', 'Yaa', 'Ye5', 'Yu4', 'Yei1', 'Yai3', 'Yyun5', 'Yip2', 'Yaau2', 'Yiu5', 'Ym4', 'Yeoi6', 'Yk', 'Ym6', 'Yoe1', 'Yeoi3', 'Yon', 'Yuk4', 'Yaai3', 'Yaa4', 'Yot6', 'Yaang1', 'Yei4', 'Yek1', 'Yo', 'Yp', 'Yo6', 'Yp4', 'Yan3', 'Yoi', 'Yap3', 'Yek3', 'Yim3', 'Yz', 'Yot2', 'Yoi6', 'Yit2', 'Yu5', 'Yaan3', 'Yan1', 'Yon5', 'Yp1', 'Yong5', 'Ygw', 'Yak', 'Yat6', 'Ying4', 'Yu2', 'Yf', 'Ya4', 'Yon1', 'You4', 'Yik6', 'Yui1', 'Yaat1', 'Yeot4', 'Yi2', 'Yaai1', 'Yek5', 'Ym3', 'Yong6', 'You5', 'Yyun1', 'Yn1', 'Yo2', 'Yip6', 'Yui3', 'Yaak5', 'Yyun2'} +yue_symbols = { + "Yeot3", + "Yip1", + "Yyu3", + "Yeng4", + "Yut5", + "Yaan5", + "Ym5", + "Yaan6", + "Yang1", + "Yun4", + "Yon2", + "Yui5", + "Yun2", + "Yat3", + "Ye", + "Yeot1", + "Yoeng5", + "Yoek2", + "Yam2", + "Yeon6", + "Yu6", + "Yiu3", + "Yaang6", + "Yp5", + "Yai4", + "Yoek4", + "Yit6", + "Yam5", + "Yoeng6", + "Yg1", + "Yk3", + "Yoe4", + "Yam3", + "Yc", + "Yyu4", + "Yyut1", + "Yiu4", + "Ying3", + "Yip3", + "Yaap3", + "Yau3", + "Yan4", + "Yau1", + "Yap4", + "Yk6", + "Yok3", + "Yai1", + "Yeot6", + "Yan2", + "Yoek6", + "Yt1", + "Yoi1", + "Yit5", + "Yn4", + "Yaau3", + "Yau4", + "Yuk6", + "Ys", + "Yuk", + "Yin6", + "Yung6", + "Ya", + "You", + "Yaai5", + "Yau5", + "Yoi3", + "Yaak3", + "Yaat3", + "Ying2", + "Yok5", + "Yeng2", + "Yyut3", + "Yam1", + "Yip5", + "You1", + "Yam6", + "Yaa5", + "Yi6", + "Yek4", + "Yyu2", + "Yuk5", + "Yaam1", + "Yang2", + "Yai", + "Yiu6", + "Yin4", + "Yok4", + "Yot3", + "Yui2", + "Yeoi5", + "Yyun6", + "Yyu5", + "Yoi5", + "Yeot2", + "Yim4", + "Yeoi2", + "Yaan1", + "Yang6", + "Yong1", + "Yaang4", + "Yung5", + "Yeon1", + "Yin2", + "Ya3", + "Yaang3", + "Yg", + "Yk2", + "Yaau5", + "Yut1", + "Yt5", + "Yip4", + "Yung4", + "Yj", + "Yong3", + "Ya1", + "Yg6", + "Yaau6", + "Yit3", + "Yun3", + "Ying1", + "Yn2", + "Yg4", + "Yl", + "Yp3", + "Yn3", + "Yak1", + "Yang5", + "Yoe6", + "You2", + "Yap2", + "Yak2", + "Yt3", + "Yot5", + "Yim2", + "Yi1", + "Yn6", + "Yaat5", + "Yaam3", + "Yoek5", + "Ye3", + "Yeon4", + "Yaa2", + "Yu3", + "Yim6", + "Ym", + "Yoe3", + "Yaai2", + "Ym2", + "Ya6", + "Yeng6", + "Yik4", + "Yot4", + "Yaai4", + "Yyun3", + "Yu1", + "Yoeng1", + "Yaap2", + "Yuk3", + "Yoek3", + "Yeng5", + "Yeoi1", + "Yiu2", + "Yok1", + "Yo1", + "Yoek1", + "Yoeng2", + "Yeon5", + "Yiu1", + "Yoeng4", + "Yuk2", + "Yat4", + "Yg5", + "Yut4", + "Yan6", + "Yin3", + "Yaa6", + "Yap1", + "Yg2", + "Yoe5", + "Yt4", + "Ya5", + "Yo4", + "Yyu1", + "Yak3", + "Yeon2", + "Yong4", + "Ym1", + "Ye2", + "Yaang5", + "Yoi2", + "Yeng3", + "Yn", + "Yyut4", + "Yau", + "Yaak2", + "Yaan4", + "Yek2", + "Yin1", + "Yi5", + "Yoe2", + "Yei5", + "Yaat6", + "Yak5", + "Yp6", + "Yok6", + "Yei2", + "Yaap1", + "Yyut5", + "Yi4", + "Yim1", + "Yk5", + "Ye4", + "Yok2", + "Yaam6", + "Yat2", + "Yon6", + "Yei3", + "Yyu6", + "Yeot5", + "Yk4", + "Yai6", + "Yd", + "Yg3", + "Yei6", + "Yau2", + "Yok", + "Yau6", + "Yung3", + "Yim5", + "Yut6", + "Yit1", + "Yon3", + "Yat1", + "Yaam2", + "Yyut2", + "Yui6", + "Yt2", + "Yek6", + "Yt", + "Ye6", + "Yang3", + "Ying6", + "Yaau1", + "Yeon3", + "Yng", + "Yh", + "Yang4", + "Ying5", + "Yaap6", + "Yoeng3", + "Yyun4", + "You3", + "Yan5", + "Yat5", + "Yot1", + "Yun1", + "Yi3", + "Yaa1", + "Yaap4", + "You6", + "Yaang2", + "Yaap5", + "Yaa3", + "Yaak6", + "Yeng1", + "Yaak1", + "Yo5", + "Yoi4", + "Yam4", + "Yik1", + "Ye1", + "Yai5", + "Yung1", + "Yp2", + "Yui4", + "Yaak4", + "Yung2", + "Yak4", + "Yaat4", + "Yeoi4", + "Yut2", + "Yin5", + "Yaau4", + "Yap6", + "Yb", + "Yaam4", + "Yw", + "Yut3", + "Yong2", + "Yt6", + "Yaai6", + "Yap5", + "Yik5", + "Yun6", + "Yaam5", + "Yun5", + "Yik3", + "Ya2", + "Yyut6", + "Yon4", + "Yk1", + "Yit4", + "Yak6", + "Yaan2", + "Yuk1", + "Yai2", + "Yik2", + "Yaat2", + "Yo3", + "Ykw", + "Yn5", + "Yaa", + "Ye5", + "Yu4", + "Yei1", + "Yai3", + "Yyun5", + "Yip2", + "Yaau2", + "Yiu5", + "Ym4", + "Yeoi6", + "Yk", + "Ym6", + "Yoe1", + "Yeoi3", + "Yon", + "Yuk4", + "Yaai3", + "Yaa4", + "Yot6", + "Yaang1", + "Yei4", + "Yek1", + "Yo", + "Yp", + "Yo6", + "Yp4", + "Yan3", + "Yoi", + "Yap3", + "Yek3", + "Yim3", + "Yz", + "Yot2", + "Yoi6", + "Yit2", + "Yu5", + "Yaan3", + "Yan1", + "Yon5", + "Yp1", + "Yong5", + "Ygw", + "Yak", + "Yat6", + "Ying4", + "Yu2", + "Yf", + "Ya4", + "Yon1", + "You4", + "Yik6", + "Yui1", + "Yaat1", + "Yeot4", + "Yi2", + "Yaai1", + "Yek5", + "Ym3", + "Yong6", + "You5", + "Yyun1", + "Yn1", + "Yo2", + "Yip6", + "Yui3", + "Yaak5", + "Yyun2", +} # symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)+list(ko_symbols)#+list(yue_symbols)###直接这么加yue顺序乱了 symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) symbols = sorted(set(symbols)) # print(len(symbols)) -symbols+=["[","]"]##日文新增上升下降调型 -symbols+=sorted(list(ko_symbols)) -symbols+=sorted(list(yue_symbols))##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复 +symbols += ["[", "]"] ##日文新增上升下降调型 +symbols += sorted(list(ko_symbols)) +symbols += sorted(list(yue_symbols)) ##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复 # print(len(symbols)) if __name__ == "__main__": print(len(symbols)) -''' +""" 粤语: 732-353=379 韩文+粤语: 732-322=410 -''' +""" diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py index f6b0a94..964ea38 100644 --- a/GPT_SoVITS/text/tone_sandhi.py +++ b/GPT_SoVITS/text/tone_sandhi.py @@ -510,12 +510,7 @@ class ToneSandhi: # e.g. 走了, 看着, 去过 elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: finals[-1] = finals[-1][:-1] + "5" - elif ( - len(word) > 1 - and word[-1] in "们子" - and pos in {"r", "n"} - and word not in self.must_not_neural_tone_words - ): + elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words: finals[-1] = finals[-1][:-1] + "5" # e.g. 桌上, 地下, 家里 elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: @@ -525,25 +520,18 @@ class ToneSandhi: finals[-1] = finals[-1][:-1] + "5" # 个做量词 elif ( - ge_idx >= 1 - and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") ) or word == "个": finals[ge_idx] = finals[ge_idx][:-1] + "5" else: - if ( - word in self.must_neural_tone_words - or word[-2:] in self.must_neural_tone_words - ): + if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: finals[-1] = finals[-1][:-1] + "5" word_list = self._split_word(word) finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] for i, word in enumerate(word_list): # conventional neural in Chinese - if ( - word in self.must_neural_tone_words - or word[-2:] in self.must_neural_tone_words - ): + if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: finals_list[i][-1] = finals_list[i][-1][:-1] + "5" finals = sum(finals_list, []) return finals @@ -561,9 +549,7 @@ class ToneSandhi: def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: # "一" in number sequences, e.g. 一零零, 二一零 - if word.find("一") != -1 and all( - [item.isnumeric() for item in word if item != "一"] - ): + if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]): return finals # "一" between reduplication words shold be yi5, e.g. 看一看 elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: @@ -697,13 +683,10 @@ class ToneSandhi: return new_seg # the first and the second words are all_tone_three - def _merge_continuous_three_tones( - self, seg: List[Tuple[str, str]] - ) -> List[Tuple[str, str]]: + def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) - for (word, pos) in seg + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) @@ -715,10 +698,7 @@ class ToneSandhi: and not merge_last[i - 1] ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if ( - not self._is_reduplication(seg[i - 1][0]) - and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 - ): + if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -732,13 +712,10 @@ class ToneSandhi: return len(word) == 2 and word[0] == word[1] # the last char of first word and the first char of second word is tone_three - def _merge_continuous_three_tones_2( - self, seg: List[Tuple[str, str]] - ) -> List[Tuple[str, str]]: + def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ - lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) - for (word, pos) in seg + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) @@ -750,10 +727,7 @@ class ToneSandhi: and not merge_last[i - 1] ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if ( - not self._is_reduplication(seg[i - 1][0]) - and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 - ): + if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: diff --git a/GPT_SoVITS/text/zh_normalization/char_convert.py b/GPT_SoVITS/text/zh_normalization/char_convert.py index dcf95d7..5b57ed9 100644 --- a/GPT_SoVITS/text/zh_normalization/char_convert.py +++ b/GPT_SoVITS/text/zh_normalization/char_convert.py @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters. -""" -simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' +"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters.""" -traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' +simplified_charcters = "制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤" + +traditional_characters = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤" assert len(simplified_charcters) == len(simplified_charcters) @@ -28,13 +28,11 @@ for i, item in enumerate(simplified_charcters): def tranditional_to_simplified(text: str) -> str: - return "".join( - [t2s_dict[item] if item in t2s_dict else item for item in text]) + return "".join([t2s_dict[item] if item in t2s_dict else item for item in text]) def simplified_to_traditional(text: str) -> str: - return "".join( - [s2t_dict[item] if item in s2t_dict else item for item in text]) + return "".join([s2t_dict[item] if item in s2t_dict else item for item in text]) if __name__ == "__main__": diff --git a/GPT_SoVITS/text/zh_normalization/chronology.py b/GPT_SoVITS/text/zh_normalization/chronology.py index ea4558e..2a6f66c 100644 --- a/GPT_SoVITS/text/zh_normalization/chronology.py +++ b/GPT_SoVITS/text/zh_normalization/chronology.py @@ -21,25 +21,29 @@ from .num import verbalize_digit def _time_num2str(num_string: str) -> str: """A special case for verbalizing number in time.""" - result = num2str(num_string.lstrip('0')) - if num_string.startswith('0'): - result = DIGITS['0'] + result + result = num2str(num_string.lstrip("0")) + if num_string.startswith("0"): + result = DIGITS["0"] + result return result # 时刻表达式 -RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' - r':([0-5][0-9])' - r'(:([0-5][0-9]))?') +RE_TIME = re.compile( + r"([0-1]?[0-9]|2[0-3])" + r":([0-5][0-9])" + r"(:([0-5][0-9]))?" +) # 时间范围,如8:30-12:30 -RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' - r':([0-5][0-9])' - r'(:([0-5][0-9]))?' - r'(~|-)' - r'([0-1]?[0-9]|2[0-3])' - r':([0-5][0-9])' - r'(:([0-5][0-9]))?') +RE_TIME_RANGE = re.compile( + r"([0-1]?[0-9]|2[0-3])" + r":([0-5][0-9])" + r"(:([0-5][0-9]))?" + r"(~|-)" + r"([0-1]?[0-9]|2[0-3])" + r":([0-5][0-9])" + r"(:([0-5][0-9]))?" +) def replace_time(match) -> str: @@ -62,31 +66,33 @@ def replace_time(match) -> str: second_2 = match.group(9) result = f"{num2str(hour)}点" - if minute.lstrip('0'): + if minute.lstrip("0"): if int(minute) == 30: result += "半" else: result += f"{_time_num2str(minute)}分" - if second and second.lstrip('0'): + if second and second.lstrip("0"): result += f"{_time_num2str(second)}秒" if is_range: result += "至" result += f"{num2str(hour_2)}点" - if minute_2.lstrip('0'): + if minute_2.lstrip("0"): if int(minute) == 30: result += "半" else: result += f"{_time_num2str(minute_2)}分" - if second_2 and second_2.lstrip('0'): + if second_2 and second_2.lstrip("0"): result += f"{_time_num2str(second_2)}秒" return result -RE_DATE = re.compile(r'(\d{4}|\d{2})年' - r'((0?[1-9]|1[0-2])月)?' - r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') +RE_DATE = re.compile( + r"(\d{4}|\d{2})年" + r"((0?[1-9]|1[0-2])月)?" + r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?" +) def replace_date(match) -> str: @@ -110,8 +116,7 @@ def replace_date(match) -> str: # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 -RE_DATE2 = re.compile( - r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') +RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])") def replace_date2(match) -> str: diff --git a/GPT_SoVITS/text/zh_normalization/constants.py b/GPT_SoVITS/text/zh_normalization/constants.py index 6423ad7..4218a55 100644 --- a/GPT_SoVITS/text/zh_normalization/constants.py +++ b/GPT_SoVITS/text/zh_normalization/constants.py @@ -18,10 +18,7 @@ from pypinyin.constants import SUPPORT_UCS4 # 全角半角转换 # 英文字符全角 -> 半角映射表 (num: 52) -F2H_ASCII_LETTERS = { - ord(char) + 65248: ord(char) - for char in string.ascii_letters -} +F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters} # 英文字符半角 -> 全角映射表 H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} @@ -37,26 +34,29 @@ F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} # 空格 (num: 1) -F2H_SPACE = {'\u3000': ' '} -H2F_SPACE = {' ': '\u3000'} +F2H_SPACE = {"\u3000": " "} +H2F_SPACE = {" ": "\u3000"} # 非"有拼音的汉字"的字符串,可用于NSW提取 if SUPPORT_UCS4: - RE_NSW = re.compile(r'(?:[^' - r'\u3007' # 〇 - r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] - r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] - r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] - r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] - r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] - r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] - r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] - r'])+') + RE_NSW = re.compile( + r"(?:[^" + r"\u3007" # 〇 + r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] + r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] + r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] + r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF] + r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F] + r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D] + r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F] + r"])+" + ) else: RE_NSW = re.compile( # pragma: no cover - r'(?:[^' - r'\u3007' # 〇 - r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] - r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] - r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] - r'])+') + r"(?:[^" + r"\u3007" # 〇 + r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] + r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] + r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] + r"])+" + ) diff --git a/GPT_SoVITS/text/zh_normalization/num.py b/GPT_SoVITS/text/zh_normalization/num.py index c0460a0..c3af4d6 100644 --- a/GPT_SoVITS/text/zh_normalization/num.py +++ b/GPT_SoVITS/text/zh_normalization/num.py @@ -15,23 +15,26 @@ Rules to verbalize numbers into Chinese characters. https://zh.wikipedia.org/wiki/中文数字#現代中文 """ + import re from collections import OrderedDict from typing import List -DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} -UNITS = OrderedDict({ - 1: '十', - 2: '百', - 3: '千', - 4: '万', - 8: '亿', -}) +DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")} +UNITS = OrderedDict( + { + 1: "十", + 2: "百", + 3: "千", + 4: "万", + 8: "亿", + } +) -COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)" # 分数表达式 -RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') +RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)") def replace_frac(match) -> str: @@ -52,7 +55,7 @@ def replace_frac(match) -> str: # 百分数表达式 -RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') +RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%") def replace_percentage(match) -> str: @@ -72,7 +75,7 @@ def replace_percentage(match) -> str: # 整数表达式 # 带负号的整数 -10 -RE_INTEGER = re.compile(r'(-)' r'(\d+)') +RE_INTEGER = re.compile(r"(-)" r"(\d+)") def replace_negative_num(match) -> str: @@ -92,7 +95,7 @@ def replace_negative_num(match) -> str: # 编号-无符号整形 # 00078 -RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') +RE_DEFAULT_NUM = re.compile(r"\d{3}\d*") def replace_default_num(match): @@ -110,15 +113,11 @@ def replace_default_num(match): # RE_ASMD = re.compile( # r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))') RE_ASMD = re.compile( - r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))') + r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))" +) + +asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"} -asmd_map = { - '+': '加', - '-': '减', - '×': '乘', - '÷': '除', - '=': '等于' -} def replace_asmd(match) -> str: """ @@ -132,24 +131,25 @@ def replace_asmd(match) -> str: # 次方专项 -RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+') +RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+") power_map = { - '⁰': '0', - '¹': '1', - '²': '2', - '³': '3', - '⁴': '4', - '⁵': '5', - '⁶': '6', - '⁷': '7', - '⁸': '8', - '⁹': '9', - 'ˣ': 'x', - 'ʸ': 'y', - 'ⁿ': 'n' + "⁰": "0", + "¹": "1", + "²": "2", + "³": "3", + "⁴": "4", + "⁵": "5", + "⁶": "6", + "⁷": "7", + "⁸": "8", + "⁹": "9", + "ˣ": "x", + "ʸ": "y", + "ⁿ": "n", } + def replace_power(match) -> str: """ Args: @@ -166,10 +166,10 @@ def replace_power(match) -> str: # 数字表达式 # 纯小数 -RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') +RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))") # 正整数 + 量词 RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS) -RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') +RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))") def replace_positive_quantifier(match) -> str: @@ -220,7 +220,9 @@ RE_RANGE = re.compile( [-~] # 匹配范围分隔符 ((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数) (?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符 - """, re.VERBOSE) + """, + re.VERBOSE, +) def replace_range(match) -> str: @@ -239,7 +241,9 @@ def replace_range(match) -> str: # ~至表达式 RE_TO_RANGE = re.compile( - r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)') + r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)" +) + def replace_to_range(match) -> str: """ @@ -248,71 +252,66 @@ def replace_to_range(match) -> str: Returns: str """ - result = match.group(0).replace('~', '至') + result = match.group(0).replace("~", "至") return result -def _get_value(value_string: str, use_zero: bool=True) -> List[str]: - stripped = value_string.lstrip('0') +def _get_value(value_string: str, use_zero: bool = True) -> List[str]: + stripped = value_string.lstrip("0") if len(stripped) == 0: return [] elif len(stripped) == 1: if use_zero and len(stripped) < len(value_string): - return [DIGITS['0'], DIGITS[stripped]] + return [DIGITS["0"], DIGITS[stripped]] else: return [DIGITS[stripped]] else: - largest_unit = next( - power for power in reversed(UNITS.keys()) if power < len(stripped)) + largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped)) first_part = value_string[:-largest_unit] second_part = value_string[-largest_unit:] - return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( - second_part) + return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part) def verbalize_cardinal(value_string: str) -> str: if not value_string: - return '' + return "" # 000 -> '零' , 0 -> '零' - value_string = value_string.lstrip('0') + value_string = value_string.lstrip("0") if len(value_string) == 0: - return DIGITS['0'] + return DIGITS["0"] result_symbols = _get_value(value_string) # verbalized number starting with '一十*' is abbreviated as `十*` - if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ - '1'] and result_symbols[1] == UNITS[1]: + if len(result_symbols) >= 2 and result_symbols[0] == DIGITS["1"] and result_symbols[1] == UNITS[1]: result_symbols = result_symbols[1:] - return ''.join(result_symbols) + return "".join(result_symbols) def verbalize_digit(value_string: str, alt_one=False) -> str: result_symbols = [DIGITS[digit] for digit in value_string] - result = ''.join(result_symbols) + result = "".join(result_symbols) if alt_one: result = result.replace("一", "幺") return result def num2str(value_string: str) -> str: - integer_decimal = value_string.split('.') + integer_decimal = value_string.split(".") if len(integer_decimal) == 1: integer = integer_decimal[0] - decimal = '' + decimal = "" elif len(integer_decimal) == 2: integer, decimal = integer_decimal else: - raise ValueError( - f"The value string: '${value_string}' has more than one point in it." - ) + raise ValueError(f"The value string: '${value_string}' has more than one point in it.") result = verbalize_cardinal(integer) - decimal = decimal.rstrip('0') + decimal = decimal.rstrip("0") if decimal: # '.22' is verbalized as '零点二二' # '3.20' is verbalized as '三点二 result = result if result else "零" - result += '点' + verbalize_digit(decimal) + result += "点" + verbalize_digit(decimal) return result diff --git a/GPT_SoVITS/text/zh_normalization/phonecode.py b/GPT_SoVITS/text/zh_normalization/phonecode.py index 5183511..3560ac2 100644 --- a/GPT_SoVITS/text/zh_normalization/phonecode.py +++ b/GPT_SoVITS/text/zh_normalization/phonecode.py @@ -21,10 +21,8 @@ from .num import verbalize_digit # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 # 联通:130、131、132、156、155、186、185、176 # 电信:133、153、189、180、181、177 -RE_MOBILE_PHONE = re.compile( - r"(? str: if mobile: - sp_parts = phone_string.strip('+').split() - result = ','.join( - [verbalize_digit(part, alt_one=True) for part in sp_parts]) + sp_parts = phone_string.strip("+").split() + result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: - sil_parts = phone_string.split('-') - result = ','.join( - [verbalize_digit(part, alt_one=True) for part in sil_parts]) + sil_parts = phone_string.split("-") + result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/GPT_SoVITS/text/zh_normalization/quantifier.py b/GPT_SoVITS/text/zh_normalization/quantifier.py index 598030e..1e7f2aa 100644 --- a/GPT_SoVITS/text/zh_normalization/quantifier.py +++ b/GPT_SoVITS/text/zh_normalization/quantifier.py @@ -17,7 +17,7 @@ from .num import num2str # 温度表达式,温度会影响负号的读法 # -3°C 零下三度 -RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') +RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)") measure_dict = { "cm2": "平方厘米", "cm²": "平方厘米", @@ -35,7 +35,7 @@ measure_dict = { "ml": "毫升", "m": "米", "mm": "毫米", - "s": "秒" + "s": "秒", } diff --git a/GPT_SoVITS/text/zh_normalization/text_normlization.py b/GPT_SoVITS/text/zh_normalization/text_normlization.py index 400b30f..099b01b 100644 --- a/GPT_SoVITS/text/zh_normalization/text_normlization.py +++ b/GPT_SoVITS/text/zh_normalization/text_normlization.py @@ -56,9 +56,9 @@ from .quantifier import replace_measure from .quantifier import replace_temperature -class TextNormalizer(): +class TextNormalizer: def __init__(self): - self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') + self.SENTENCE_SPLITOR = re.compile(r"([:、,;。?!,;?!][”’]?)") def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. @@ -71,66 +71,64 @@ class TextNormalizer(): if lang == "zh": text = text.replace(" ", "") # 过滤掉特殊字符 - text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text) - text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) + text = re.sub(r"[——《》【】<>{}()()#&@“”^_|\\]", "", text) + text = self.SENTENCE_SPLITOR.sub(r"\1\n", text) text = text.strip() - sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] + sentences = [sentence.strip() for sentence in re.split(r"\n+", text)] return sentences def _post_replace(self, sentence: str) -> str: - sentence = sentence.replace('/', '每') + sentence = sentence.replace("/", "每") # sentence = sentence.replace('~', '至') # sentence = sentence.replace('~', '至') - sentence = sentence.replace('①', '一') - sentence = sentence.replace('②', '二') - sentence = sentence.replace('③', '三') - sentence = sentence.replace('④', '四') - sentence = sentence.replace('⑤', '五') - sentence = sentence.replace('⑥', '六') - sentence = sentence.replace('⑦', '七') - sentence = sentence.replace('⑧', '八') - sentence = sentence.replace('⑨', '九') - sentence = sentence.replace('⑩', '十') - sentence = sentence.replace('α', '阿尔法') - sentence = sentence.replace('β', '贝塔') - sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') - sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') - sentence = sentence.replace('ε', '艾普西龙') - sentence = sentence.replace('ζ', '捷塔') - sentence = sentence.replace('η', '依塔') - sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') - sentence = sentence.replace('ι', '艾欧塔') - sentence = sentence.replace('κ', '喀帕') - sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') - sentence = sentence.replace('μ', '缪') - sentence = sentence.replace('ν', '拗') - sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') - sentence = sentence.replace('ο', '欧米克伦') - sentence = sentence.replace('π', '派').replace('Π', '派') - sentence = sentence.replace('ρ', '肉') - sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( - 'σ', '西格玛') - sentence = sentence.replace('τ', '套') - sentence = sentence.replace('υ', '宇普西龙') - sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') - sentence = sentence.replace('χ', '器') - sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') - sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') + sentence = sentence.replace("①", "一") + sentence = sentence.replace("②", "二") + sentence = sentence.replace("③", "三") + sentence = sentence.replace("④", "四") + sentence = sentence.replace("⑤", "五") + sentence = sentence.replace("⑥", "六") + sentence = sentence.replace("⑦", "七") + sentence = sentence.replace("⑧", "八") + sentence = sentence.replace("⑨", "九") + sentence = sentence.replace("⑩", "十") + sentence = sentence.replace("α", "阿尔法") + sentence = sentence.replace("β", "贝塔") + sentence = sentence.replace("γ", "伽玛").replace("Γ", "伽玛") + sentence = sentence.replace("δ", "德尔塔").replace("Δ", "德尔塔") + sentence = sentence.replace("ε", "艾普西龙") + sentence = sentence.replace("ζ", "捷塔") + sentence = sentence.replace("η", "依塔") + sentence = sentence.replace("θ", "西塔").replace("Θ", "西塔") + sentence = sentence.replace("ι", "艾欧塔") + sentence = sentence.replace("κ", "喀帕") + sentence = sentence.replace("λ", "拉姆达").replace("Λ", "拉姆达") + sentence = sentence.replace("μ", "缪") + sentence = sentence.replace("ν", "拗") + sentence = sentence.replace("ξ", "克西").replace("Ξ", "克西") + sentence = sentence.replace("ο", "欧米克伦") + sentence = sentence.replace("π", "派").replace("Π", "派") + sentence = sentence.replace("ρ", "肉") + sentence = sentence.replace("ς", "西格玛").replace("Σ", "西格玛").replace("σ", "西格玛") + sentence = sentence.replace("τ", "套") + sentence = sentence.replace("υ", "宇普西龙") + sentence = sentence.replace("φ", "服艾").replace("Φ", "服艾") + sentence = sentence.replace("χ", "器") + sentence = sentence.replace("ψ", "普赛").replace("Ψ", "普赛") + sentence = sentence.replace("ω", "欧米伽").replace("Ω", "欧米伽") # 兜底数学运算,顺便兼容懒人用语 - sentence = sentence.replace('+', '加') - sentence = sentence.replace('-', '减') - sentence = sentence.replace('×', '乘') - sentence = sentence.replace('÷', '除') - sentence = sentence.replace('=', '等') + sentence = sentence.replace("+", "加") + sentence = sentence.replace("-", "减") + sentence = sentence.replace("×", "乘") + sentence = sentence.replace("÷", "除") + sentence = sentence.replace("=", "等") # re filter special characters, have one more character "-" than line 68 - sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence) + sentence = re.sub(r"[-——《》【】<=>{}()()#&@“”^_|\\]", "", sentence) return sentence def normalize_sentence(self, sentence: str) -> str: # basic character conversions sentence = tranditional_to_simplified(sentence) - sentence = sentence.translate(F2H_ASCII_LETTERS).translate( - F2H_DIGITS).translate(F2H_SPACE) + sentence = sentence.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS).translate(F2H_SPACE) # number related NSW verbalization sentence = RE_DATE.sub(replace_date, sentence) @@ -161,8 +159,7 @@ class TextNormalizer(): sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) - sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, - sentence) + sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence) sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) sentence = RE_NUMBER.sub(replace_number, sentence) sentence = self._post_replace(sentence) diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py index 177eda1..1cc2d97 100644 --- a/GPT_SoVITS/utils.py +++ b/GPT_SoVITS/utils.py @@ -1,17 +1,15 @@ -import os -import glob -import sys import argparse -import logging +import glob import json +import logging +import os import subprocess +import sys import traceback import librosa import numpy as np -from scipy.io.wavfile import read import torch -import logging logging.getLogger("numba").setLevel(logging.ERROR) logging.getLogger("matplotlib").setLevel(logging.ERROR) @@ -27,11 +25,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") iteration = checkpoint_dict["iteration"] learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None - and not skip_optimizer - and checkpoint_dict["optimizer"] is not None - ): + if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None: optimizer.load_state_dict(checkpoint_dict["optimizer"]) saved_state_dict = checkpoint_dict["model"] if hasattr(model, "module"): @@ -50,9 +44,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False ) except: traceback.print_exc() - print( - "error, %s is not in the checkpoint" % k - ) # shape不对也会,比如text_embedding当cleaner修改时 + print("error, %s is not in the checkpoint" % k) # shape不对也会,比如text_embedding当cleaner修改时 new_state_dict[k] = v if hasattr(model, "module"): model.module.load_state_dict(new_state_dict) @@ -60,25 +52,28 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False model.load_state_dict(new_state_dict) print("load ") logger.info( - "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration) + "Loaded checkpoint '{}' (iteration {})".format( + checkpoint_path, + iteration, + ) ) return model, optimizer, learning_rate, iteration -from time import time as ttime + import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s.pth"%(ttime()) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) +from time import time as ttime + + +def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path + dir = os.path.dirname(path) + name = os.path.basename(path) + tmp_path = "%s.pth" % (ttime()) + torch.save(fea, tmp_path) + shutil.move(tmp_path, "%s/%s" % (dir, name)) + def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at iteration {} to {}".format( - iteration, checkpoint_path - ) - ) + logger.info("Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path)) if hasattr(model, "module"): state_dict = model.module.state_dict() else: @@ -132,7 +127,6 @@ def plot_spectrogram_to_numpy(spectrogram): mpl_logger = logging.getLogger("matplotlib") mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt - import numpy as np fig, ax = plt.subplots(figsize=(10, 2)) im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") @@ -158,11 +152,13 @@ def plot_alignment_to_numpy(alignment, info=None): mpl_logger = logging.getLogger("matplotlib") mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt - import numpy as np fig, ax = plt.subplots(figsize=(6, 4)) im = ax.imshow( - alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + alignment.transpose(), + aspect="auto", + origin="lower", + interpolation="none", ) fig.colorbar(im, ax=ax) xlabel = "Decoder timestep" @@ -199,9 +195,7 @@ def get_hparams(init=True, stage=1): default="./configs/s2.json", help="JSON file for configuration", ) - parser.add_argument( - "-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir" - ) + parser.add_argument("-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir") parser.add_argument( "-rs", "--resume_step", @@ -250,11 +244,7 @@ def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_tim """ import re - ckpts_files = [ - f - for f in os.listdir(path_to_models) - if os.path.isfile(os.path.join(path_to_models, f)) - ] + ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))] name_key = lambda _f: int(re.compile("._(\d+)\.pth").match(_f).group(1)) time_key = lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)) sort_key = time_key if sort_by_time else name_key @@ -263,8 +253,7 @@ def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_tim key=sort_key, ) to_del = [ - os.path.join(path_to_models, fn) - for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + os.path.join(path_to_models, fn) for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) ] del_info = lambda fn: logger.info(f".. Free up space by deleting ckpt {fn}") del_routine = lambda x: [os.remove(x), del_info(x)] @@ -296,7 +285,7 @@ def check_git_hash(model_dir): if not os.path.exists(os.path.join(source_dir, ".git")): logger.warn( "{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir + source_dir, ) ) return @@ -309,7 +298,8 @@ def check_git_hash(model_dir): if saved_hash != cur_hash: logger.warn( "git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8] + saved_hash[:8], + cur_hash[:8], ) ) else: @@ -366,6 +356,6 @@ class HParams: if __name__ == "__main__": print( load_wav_to_torch( - "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac" + "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac", ) ) diff --git a/api.py b/api.py index d92d9c8..c1c917a 100644 --- a/api.py +++ b/api.py @@ -140,9 +140,9 @@ RESP: 无 """ - import argparse -import os,re +import os +import re import sys now_dir = os.getcwd() @@ -152,10 +152,11 @@ sys.path.append("%s/GPT_SoVITS" % (now_dir)) import signal from text.LangSegmenter import LangSegmenter from time import time as ttime -import torch, torchaudio +import torch +import torchaudio import librosa import soundfile as sf -from fastapi import FastAPI, Request, Query, HTTPException +from fastapi import FastAPI, Request, Query from fastapi.responses import StreamingResponse, JSONResponse import uvicorn from transformers import AutoModelForMaskedLM, AutoTokenizer @@ -163,12 +164,11 @@ import numpy as np from feature_extractor import cnhubert from io import BytesIO from module.models import SynthesizerTrn, SynthesizerTrnV3 -from peft import LoraConfig, PeftModel, get_peft_model +from peft import LoraConfig, get_peft_model from AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence from text.cleaner import clean_text from module.mel_processing import spectrogram_torch -from tools.my_utils import load_audio import config as global_config import logging import subprocess @@ -201,7 +201,11 @@ def is_full(*items): # 任意一项为空返回False def init_bigvgan(): global bigvgan_model from BigVGAN import bigvgan - bigvgan_model = bigvgan.BigVGAN.from_pretrained("%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False) # if True, RuntimeError: Ninja is required to load C++ extensions + + bigvgan_model = bigvgan.BigVGAN.from_pretrained( + "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), + use_cuda_kernel=False, + ) # if True, RuntimeError: Ninja is required to load C++ extensions # remove weight norm in the model and set to eval mode bigvgan_model.remove_weight_norm() bigvgan_model = bigvgan_model.eval() @@ -211,57 +215,71 @@ def init_bigvgan(): bigvgan_model = bigvgan_model.to(device) -resample_transform_dict={} +resample_transform_dict = {} + + def resample(audio_tensor, sr0): global resample_transform_dict if sr0 not in resample_transform_dict: - resample_transform_dict[sr0] = torchaudio.transforms.Resample( - sr0, 24000 - ).to(device) + resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) return resample_transform_dict[sr0](audio_tensor) -from module.mel_processing import spectrogram_torch,mel_spectrogram_torch +from module.mel_processing import mel_spectrogram_torch + spec_min = -12 spec_max = 2 + + def norm_spec(x): return (x - spec_min) / (spec_max - spec_min) * 2 - 1 + + def denorm_spec(x): return (x + 1) / 2 * (spec_max - spec_min) + spec_min -mel_fn=lambda x: mel_spectrogram_torch(x, **{ - "n_fft": 1024, - "win_size": 1024, - "hop_size": 256, - "num_mels": 100, - "sampling_rate": 24000, - "fmin": 0, - "fmax": None, - "center": False -}) -sr_model=None -def audio_sr(audio,sr): +mel_fn = lambda x: mel_spectrogram_torch( + x, + **{ + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "num_mels": 100, + "sampling_rate": 24000, + "fmin": 0, + "fmax": None, + "center": False, + }, +) + + +sr_model = None + + +def audio_sr(audio, sr): global sr_model - if sr_model==None: + if sr_model == None: from tools.audio_sr import AP_BWE + try: - sr_model=AP_BWE(device,DictToAttrRecursive) + sr_model = AP_BWE(device, DictToAttrRecursive) except FileNotFoundError: logger.info("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载") - return audio.cpu().detach().numpy(),sr - return sr_model(audio,sr) + return audio.cpu().detach().numpy(), sr + return sr_model(audio, sr) class Speaker: - def __init__(self, name, gpt, sovits, phones = None, bert = None, prompt = None): + def __init__(self, name, gpt, sovits, phones=None, bert=None, prompt=None): self.name = name self.sovits = sovits self.gpt = gpt self.phones = phones self.bert = bert self.prompt = prompt - + + speaker_list = {} @@ -270,22 +288,25 @@ class Sovits: self.vq_model = vq_model self.hps = hps -from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new -def get_sovits_weights(sovits_path): - path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth" - is_exist_s2gv3=os.path.exists(path_sovits_v3) - version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) - if if_lora_v3==True and is_exist_s2gv3==False: +from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new + + +def get_sovits_weights(sovits_path): + path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + is_exist_s2gv3 = os.path.exists(path_sovits_v3) + + version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) + if if_lora_v3 == True and is_exist_s2gv3 == False: logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") dict_s2 = load_sovits_new(sovits_path) hps = dict_s2["config"] hps = DictToAttrRecursive(hps) hps.model.semantic_frame_rate = "25hz" - if 'enc_p.text_embedding.weight' not in dict_s2['weight']: - hps.model.version = "v2"#v3model,v2sybomls - elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322: + if "enc_p.text_embedding.weight" not in dict_s2["weight"]: + hps.model.version = "v2" # v3model,v2sybomls + elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322: hps.model.version = "v1" else: hps.model.version = "v2" @@ -294,27 +315,28 @@ def get_sovits_weights(sovits_path): hps.model.version = "v3" model_params_dict = vars(hps.model) - if model_version!="v3": + if model_version != "v3": vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **model_params_dict + **model_params_dict, ) else: vq_model = SynthesizerTrnV3( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, - **model_params_dict + **model_params_dict, ) init_bigvgan() - model_version=hps.model.version + model_version = hps.model.version logger.info(f"模型版本: {model_version}") - if ("pretrained" not in sovits_path): + if "pretrained" not in sovits_path: try: del vq_model.enc_q - except:pass + except: + pass if is_half == True: vq_model = vq_model.half().to(device) else: @@ -324,7 +346,7 @@ def get_sovits_weights(sovits_path): vq_model.load_state_dict(dict_s2["weight"], strict=False) else: vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False) - lora_rank=dict_s2["lora_rank"] + lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], r=lora_rank, @@ -340,13 +362,17 @@ def get_sovits_weights(sovits_path): sovits = Sovits(vq_model, hps) return sovits + class Gpt: def __init__(self, max_sec, t2s_model): self.max_sec = max_sec self.t2s_model = t2s_model + global hz hz = 50 + + def get_gpt_weights(gpt_path): dict_s1 = torch.load(gpt_path, map_location="cpu") config = dict_s1["config"] @@ -363,7 +389,8 @@ def get_gpt_weights(gpt_path): gpt = Gpt(max_sec, t2s_model) return gpt -def change_gpt_sovits_weights(gpt_path,sovits_path): + +def change_gpt_sovits_weights(gpt_path, sovits_path): try: gpt = get_gpt_weights(gpt_path) sovits = get_sovits_weights(sovits_path) @@ -392,16 +419,16 @@ def get_bert_feature(text, word2ph): def clean_text_inf(text, language, version): - language = language.replace("all_","") + language = language.replace("all_", "") phones, word2ph, norm_text = clean_text(text, language, version) phones = cleaned_text_to_sequence(phones, version) return phones, word2ph, norm_text def get_bert_inf(phones, word2ph, norm_text, language): - language=language.replace("all_","") + language = language.replace("all_", "") if language == "zh": - bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype) + bert = get_bert_feature(norm_text, word2ph).to(device) # .to(dtype) else: bert = torch.zeros( (1024, len(phones)), @@ -410,24 +437,27 @@ def get_bert_inf(phones, word2ph, norm_text, language): return bert + from text import chinese -def get_phones_and_bert(text,language,version,final=False): + + +def get_phones_and_bert(text, language, version, final=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: formattext = text while " " in formattext: formattext = formattext.replace(" ", " ") if language == "all_zh": - if re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"zh",version) + return get_phones_and_bert(formattext, "zh", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = get_bert_feature(norm_text, word2ph).to(device) - elif language == "all_yue" and re.search(r'[A-Za-z]', formattext): - formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) - formattext = chinese.mix_text_normalize(formattext) - return get_phones_and_bert(formattext,"yue",version) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) else: phones, word2ph, norm_text = clean_text_inf(formattext, language, version) bert = torch.zeros( @@ -435,8 +465,8 @@ def get_phones_and_bert(text,language,version,final=False): dtype=torch.float16 if is_half == True else torch.float32, ).to(device) elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: - textlist=[] - langlist=[] + textlist = [] + langlist = [] if language == "auto": for tmp in LangSegmenter.getTexts(text): langlist.append(tmp["lang"]) @@ -467,12 +497,12 @@ def get_phones_and_bert(text,language,version,final=False): bert_list.append(bert) bert = torch.cat(bert_list, dim=1) phones = sum(phones_list, []) - norm_text = ''.join(norm_text_list) + norm_text = "".join(norm_text_list) if not final and len(phones) < 6: - return get_phones_and_bert("." + text,language,version,final=True) + return get_phones_and_bert("." + text, language, version, final=True) - return phones,bert.to(torch.float16 if is_half == True else torch.float32),norm_text + return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text class DictToAttrRecursive(dict): @@ -504,15 +534,21 @@ class DictToAttrRecursive(dict): def get_spepc(hps, filename): - audio,_ = librosa.load(filename, int(hps.data.sampling_rate)) + audio, _ = librosa.load(filename, int(hps.data.sampling_rate)) audio = torch.FloatTensor(audio) - maxx=audio.abs().max() - if(maxx>1): - audio/=min(2,maxx) + maxx = audio.abs().max() + if maxx > 1: + audio /= min(2, maxx) audio_norm = audio audio_norm = audio_norm.unsqueeze(0) - spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length, - hps.data.win_length, center=False) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) return spec @@ -546,10 +582,11 @@ def pack_ogg(audio_bytes, data, rate): # Or split the whole audio data into smaller audio segment to avoid stack overflow? def handle_pack_ogg(): - with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file: + with sf.SoundFile(audio_bytes, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: audio_file.write(data) import threading + # See: https://docs.python.org/3/library/threading.html # The stack size of this thread is at least 32768 # If stack overflow error still occurs, just modify the `stack_size`. @@ -581,35 +618,47 @@ def pack_raw(audio_bytes, data, rate): def pack_wav(audio_bytes, rate): if is_int32: - data = np.frombuffer(audio_bytes.getvalue(),dtype=np.int32) + data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int32) wav_bytes = BytesIO() - sf.write(wav_bytes, data, rate, format='WAV', subtype='PCM_32') + sf.write(wav_bytes, data, rate, format="WAV", subtype="PCM_32") else: - data = np.frombuffer(audio_bytes.getvalue(),dtype=np.int16) + data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int16) wav_bytes = BytesIO() - sf.write(wav_bytes, data, rate, format='WAV') + sf.write(wav_bytes, data, rate, format="WAV") return wav_bytes def pack_aac(audio_bytes, data, rate): if is_int32: - pcm = 's32le' - bit_rate = '256k' + pcm = "s32le" + bit_rate = "256k" else: - pcm = 's16le' - bit_rate = '128k' - process = subprocess.Popen([ - 'ffmpeg', - '-f', pcm, # 输入16位有符号小端整数PCM - '-ar', str(rate), # 设置采样率 - '-ac', '1', # 单声道 - '-i', 'pipe:0', # 从管道读取输入 - '-c:a', 'aac', # 音频编码器为AAC - '-b:a', bit_rate, # 比特率 - '-vn', # 不包含视频 - '-f', 'adts', # 输出AAC数据流格式 - 'pipe:1' # 将输出写入管道 - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + pcm = "s16le" + bit_rate = "128k" + process = subprocess.Popen( + [ + "ffmpeg", + "-f", + pcm, # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + bit_rate, # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) out, _ = process.communicate(input=data.tobytes()) audio_bytes.write(out) @@ -632,7 +681,7 @@ def cut_text(text, punc): items = re.split(f"({punds})", text) mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])] # 在句子不存在符号或句尾无符号的时候保证文本完整 - if len(items)%2 == 1: + if len(items) % 2 == 1: mergeitems.append(items[-1]) text = "\n".join(mergeitems) @@ -646,8 +695,38 @@ def only_punc(text): return not any(t.isalnum() or t.isalpha() for t in text) -splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, top_k= 15, top_p = 0.6, temperature = 0.6, speed = 1, inp_refs = None, sample_steps = 32, if_sr = False, spk = "default"): +splits = { + ",", + "。", + "?", + "!", + ",", + ".", + "?", + "!", + "~", + ":", + ":", + "—", + "…", +} + + +def get_tts_wav( + ref_wav_path, + prompt_text, + prompt_language, + text, + text_language, + top_k=15, + top_p=0.6, + temperature=0.6, + speed=1, + inp_refs=None, + sample_steps=32, + if_sr=False, + spk="default", +): infer_sovits = speaker_list[spk].sovits vq_model = infer_sovits.vq_model hps = infer_sovits.hps @@ -659,7 +738,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, t0 = ttime() prompt_text = prompt_text.strip("\n") - if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." + if prompt_text[-1] not in splits: + prompt_text += "。" if prompt_language != "en" else "." prompt_language, text = prompt_language, text.strip("\n") dtype = torch.float16 if is_half == True else torch.float32 zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32) @@ -667,7 +747,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, wav16k, sr = librosa.load(ref_wav_path, sr=16000) wav16k = torch.from_numpy(wav16k) zero_wav_torch = torch.from_numpy(zero_wav) - if (is_half == True): + if is_half == True: wav16k = wav16k.half().to(device) zero_wav_torch = zero_wav_torch.half().to(device) else: @@ -680,15 +760,15 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, prompt = prompt_semantic.unsqueeze(0).to(device) if version != "v3": - refers=[] - if(inp_refs): + refers = [] + if inp_refs: for path in inp_refs: try: refer = get_spepc(hps, path).to(dtype).to(device) refers.append(refer) except Exception as e: logger.error(e) - if(len(refers)==0): + if len(refers) == 0: refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] else: refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) @@ -707,7 +787,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, continue audio_opt = [] - if (text[-1] not in splits): text += "。" if text_language != "en" else "." + if text[-1] not in splits: + text += "。" if text_language != "en" else "." phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version) bert = torch.cat([bert1, bert2], 1) @@ -722,56 +803,62 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, prompt, bert, # prompt_phone_len=ph_offset, - top_k = top_k, - top_p = top_p, - temperature = temperature, - early_stop_num=hz * max_sec) + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) t3 = ttime() if version != "v3": - audio = \ - vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), - refers,speed=speed).detach().cpu().numpy()[ - 0, 0] ###试试重建不带上prompt部分 + audio = ( + vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 else: - phoneme_ids0=torch.LongTensor(phones1).to(device).unsqueeze(0) - phoneme_ids1=torch.LongTensor(phones2).to(device).unsqueeze(0) + phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) + phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) # print(11111111, phoneme_ids0, phoneme_ids1) - fea_ref,ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) + fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) ref_audio, sr = torchaudio.load(ref_wav_path) - ref_audio=ref_audio.to(device).float() - if (ref_audio.shape[0] == 2): + ref_audio = ref_audio.to(device).float() + if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - if sr!=24000: - ref_audio=resample(ref_audio,sr) + if sr != 24000: + ref_audio = resample(ref_audio, sr) # print("ref_audio",ref_audio.abs().mean()) mel2 = mel_fn(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - if (T_min > 468): + if T_min > 468: mel2 = mel2[:, :, -468:] fea_ref = fea_ref[:, :, -468:] T_min = 468 chunk_len = 934 - T_min # print("fea_ref",fea_ref,fea_ref.shape) # print("mel2",mel2) - mel2=mel2.to(dtype) - fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge,speed) + mel2 = mel2.to(dtype) + fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) # print("fea_todo",fea_todo) # print("ge",ge.abs().mean()) cfm_resss = [] idx = 0 - while (1): - fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len] - if (fea_todo_chunk.shape[-1] == 0): break + while 1: + fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len] + if fea_todo_chunk.shape[-1] == 0: + break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) # set_seed(123) - cfm_res = vq_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) - cfm_res = cfm_res[:, :, mel2.shape[2]:] + cfm_res = vq_model.cfm.inference( + fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 + ) + cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] # print("fea", fea) # print("mel2in", mel2) @@ -779,14 +866,15 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, cfm_resss.append(cfm_res) cmf_res = torch.cat(cfm_resss, 2) cmf_res = denorm_spec(cmf_res) - if bigvgan_model==None:init_bigvgan() + if bigvgan_model == None: + init_bigvgan() with torch.inference_mode(): wav_gen = bigvgan_model(cmf_res) - audio=wav_gen[0][0].cpu().detach().numpy() + audio = wav_gen[0][0].cpu().detach().numpy() - max_audio=np.abs(audio).max() - if max_audio>1: - audio/=max_audio + max_audio = np.abs(audio).max() + if max_audio > 1: + audio /= max_audio audio_opt.append(audio) audio_opt.append(zero_wav) audio_opt = np.concatenate(audio_opt, 0) @@ -795,29 +883,29 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, sr = hps.data.sampling_rate if version != "v3" else 24000 if if_sr and sr == 24000: audio_opt = torch.from_numpy(audio_opt).float().to(device) - audio_opt,sr=audio_sr(audio_opt.unsqueeze(0),sr) - max_audio=np.abs(audio_opt).max() - if max_audio > 1: audio_opt /= max_audio + audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr) + max_audio = np.abs(audio_opt).max() + if max_audio > 1: + audio_opt /= max_audio sr = 48000 if is_int32: - audio_bytes = pack_audio(audio_bytes,(audio_opt * 2147483647).astype(np.int32),sr) + audio_bytes = pack_audio(audio_bytes, (audio_opt * 2147483647).astype(np.int32), sr) else: - audio_bytes = pack_audio(audio_bytes,(audio_opt * 32768).astype(np.int16),sr) - # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + audio_bytes = pack_audio(audio_bytes, (audio_opt * 32768).astype(np.int16), sr) + # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) if stream_mode == "normal": audio_bytes, audio_chunk = read_clean_buffer(audio_bytes) yield audio_chunk - - if not stream_mode == "normal": + + if not stream_mode == "normal": if media_type == "wav": sr = 48000 if if_sr else 24000 sr = hps.data.sampling_rate if version != "v3" else sr - audio_bytes = pack_wav(audio_bytes,sr) + audio_bytes = pack_wav(audio_bytes, sr) yield audio_bytes.getvalue() - def handle_control(command): if command == "restart": os.execl(g_config.python_exec, g_config.python_exec, *sys.argv) @@ -828,7 +916,9 @@ def handle_control(command): def handle_change(path, text, language): if is_empty(path, text, language): - return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400) + return JSONResponse( + {"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400 + ) if path != "" or path is not None: default_refer.path = path @@ -842,15 +932,31 @@ def handle_change(path, text, language): logger.info(f"当前默认参考音频语种: {default_refer.language}") logger.info(f"is_ready: {default_refer.is_ready()}") - return JSONResponse({"code": 0, "message": "Success"}, status_code=200) -def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr): +def handle( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + cut_punc, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, +): if ( - refer_wav_path == "" or refer_wav_path is None - or prompt_text == "" or prompt_text is None - or prompt_language == "" or prompt_language is None + refer_wav_path == "" + or refer_wav_path is None + or prompt_text == "" + or prompt_text is None + or prompt_language == "" + or prompt_language is None ): refer_wav_path, prompt_text, prompt_language = ( default_refer.path, @@ -860,17 +966,31 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cu if not default_refer.is_ready(): return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400) - if not sample_steps in [4,8,16,32]: + if sample_steps not in [4, 8, 16, 32]: sample_steps = 32 if cut_punc == None: - text = cut_text(text,default_cut_punc) + text = cut_text(text, default_cut_punc) else: - text = cut_text(text,cut_punc) - - return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr), media_type="audio/"+media_type) - + text = cut_text(text, cut_punc) + return StreamingResponse( + get_tts_wav( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, + ), + media_type="audio/" + media_type, + ) # -------------------------------- @@ -886,7 +1006,7 @@ dict_language = { "粤英混合": "yue", "日英混合": "ja", "韩英混合": "ko", - "多语种混合": "auto", #多语种启动切分识别语种 + "多语种混合": "auto", # 多语种启动切分识别语种 "多语种混合(粤语)": "auto_yue", "all_zh": "all_zh", "all_yue": "all_yue", @@ -903,7 +1023,7 @@ dict_language = { # logger logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG) -logger = logging.getLogger('uvicorn') +logger = logging.getLogger("uvicorn") # 获取配置 g_config = global_config.Config() @@ -919,8 +1039,12 @@ parser.add_argument("-dl", "--default_refer_language", type=str, default="", hel parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu") parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0") parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880") -parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度") -parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度") +parser.add_argument( + "-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度" +) +parser.add_argument( + "-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度" +) # bool值的用法为 `python ./api.py -fp ...` # 此时 full_precision==True, half_precision==False parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="流式返回模式, close / normal / keepalive") @@ -972,14 +1096,14 @@ if args.full_precision and args.half_precision: logger.info(f"半精: {is_half}") # 流式返回模式 -if args.stream_mode.lower() in ["normal","n"]: +if args.stream_mode.lower() in ["normal", "n"]: stream_mode = "normal" logger.info("流式返回已开启") else: stream_mode = "close" # 音频编码格式 -if args.media_type.lower() in ["aac","ogg"]: +if args.media_type.lower() in ["aac", "ogg"]: media_type = args.media_type.lower() elif stream_mode == "close": media_type = "wav" @@ -988,12 +1112,12 @@ else: logger.info(f"编码格式: {media_type}") # 音频数据类型 -if args.sub_type.lower() == 'int32': +if args.sub_type.lower() == "int32": is_int32 = True - logger.info(f"数据类型: int32") + logger.info("数据类型: int32") else: is_int32 = False - logger.info(f"数据类型: int16") + logger.info("数据类型: int16") # 初始化模型 cnhubert.cnhubert_base_path = cnhubert_base_path @@ -1006,8 +1130,7 @@ if is_half: else: bert_model = bert_model.to(device) ssl_model = ssl_model.to(device) -change_gpt_sovits_weights(gpt_path = gpt_path, sovits_path = sovits_path) - +change_gpt_sovits_weights(gpt_path=gpt_path, sovits_path=sovits_path) # -------------------------------- @@ -1015,21 +1138,21 @@ change_gpt_sovits_weights(gpt_path = gpt_path, sovits_path = sovits_path) # -------------------------------- app = FastAPI() + @app.post("/set_model") async def set_model(request: Request): json_post_raw = await request.json() return change_gpt_sovits_weights( - gpt_path = json_post_raw.get("gpt_model_path"), - sovits_path = json_post_raw.get("sovits_model_path") + gpt_path=json_post_raw.get("gpt_model_path"), sovits_path=json_post_raw.get("sovits_model_path") ) @app.get("/set_model") async def set_model( - gpt_model_path: str = None, - sovits_model_path: str = None, + gpt_model_path: str = None, + sovits_model_path: str = None, ): - return change_gpt_sovits_weights(gpt_path = gpt_model_path, sovits_path = sovits_model_path) + return change_gpt_sovits_weights(gpt_path=gpt_model_path, sovits_path=sovits_model_path) @app.post("/control") @@ -1047,18 +1170,12 @@ async def control(command: str = None): async def change_refer(request: Request): json_post_raw = await request.json() return handle_change( - json_post_raw.get("refer_wav_path"), - json_post_raw.get("prompt_text"), - json_post_raw.get("prompt_language") + json_post_raw.get("refer_wav_path"), json_post_raw.get("prompt_text"), json_post_raw.get("prompt_language") ) @app.get("/change_refer") -async def change_refer( - refer_wav_path: str = None, - prompt_text: str = None, - prompt_language: str = None -): +async def change_refer(refer_wav_path: str = None, prompt_text: str = None, prompt_language: str = None): return handle_change(refer_wav_path, prompt_text, prompt_language) @@ -1078,27 +1195,41 @@ async def tts_endpoint(request: Request): json_post_raw.get("speed", 1.0), json_post_raw.get("inp_refs", []), json_post_raw.get("sample_steps", 32), - json_post_raw.get("if_sr", False) + json_post_raw.get("if_sr", False), ) @app.get("/") async def tts_endpoint( - refer_wav_path: str = None, - prompt_text: str = None, - prompt_language: str = None, - text: str = None, - text_language: str = None, - cut_punc: str = None, - top_k: int = 15, - top_p: float = 1.0, - temperature: float = 1.0, - speed: float = 1.0, - inp_refs: list = Query(default=[]), - sample_steps: int = 32, - if_sr: bool = False + refer_wav_path: str = None, + prompt_text: str = None, + prompt_language: str = None, + text: str = None, + text_language: str = None, + cut_punc: str = None, + top_k: int = 15, + top_p: float = 1.0, + temperature: float = 1.0, + speed: float = 1.0, + inp_refs: list = Query(default=[]), + sample_steps: int = 32, + if_sr: bool = False, ): - return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr) + return handle( + refer_wav_path, + prompt_text, + prompt_language, + text, + text_language, + cut_punc, + top_k, + top_p, + temperature, + speed, + inp_refs, + sample_steps, + if_sr, + ) if __name__ == "__main__": diff --git a/api_v2.py b/api_v2.py index 3a8566a..8708207 100644 --- a/api_v2.py +++ b/api_v2.py @@ -78,7 +78,7 @@ GET: ``` http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt ``` -RESP: +RESP: 成功: 返回"success", http code 200 失败: 返回包含错误信息的 json, http code 400 @@ -92,11 +92,12 @@ GET: http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth ``` -RESP: +RESP: 成功: 返回"success", http code 200 失败: 返回包含错误信息的 json, http code 400 - + """ + import os import sys import traceback @@ -112,16 +113,15 @@ import wave import signal import numpy as np import soundfile as sf -from fastapi import FastAPI, Request, HTTPException, Response +from fastapi import FastAPI, Response from fastapi.responses import StreamingResponse, JSONResponse -from fastapi import FastAPI, UploadFile, File import uvicorn from io import BytesIO from tools.i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names -from fastapi.responses import StreamingResponse from pydantic import BaseModel + # print(sys.path) i18n = I18nAuto() cut_method_names = get_cut_method_names() @@ -145,6 +145,8 @@ print(tts_config) tts_pipeline = TTS(tts_config) APP = FastAPI() + + class TTS_Request(BaseModel): text: str = None text_lang: str = None @@ -152,58 +154,73 @@ class TTS_Request(BaseModel): aux_ref_audio_paths: list = None prompt_lang: str = None prompt_text: str = "" - top_k:int = 5 - top_p:float = 1 - temperature:float = 1 - text_split_method:str = "cut5" - batch_size:int = 1 - batch_threshold:float = 0.75 - split_bucket:bool = True - speed_factor:float = 1.0 - fragment_interval:float = 0.3 - seed:int = -1 - media_type:str = "wav" - streaming_mode:bool = False - parallel_infer:bool = True - repetition_penalty:float = 1.35 - sample_steps:int = 32 - super_sampling:bool = False + top_k: int = 5 + top_p: float = 1 + temperature: float = 1 + text_split_method: str = "cut5" + batch_size: int = 1 + batch_threshold: float = 0.75 + split_bucket: bool = True + speed_factor: float = 1.0 + fragment_interval: float = 0.3 + seed: int = -1 + media_type: str = "wav" + streaming_mode: bool = False + parallel_infer: bool = True + repetition_penalty: float = 1.35 + sample_steps: int = 32 + super_sampling: bool = False + ### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files -def pack_ogg(io_buffer:BytesIO, data:np.ndarray, rate:int): - with sf.SoundFile(io_buffer, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file: +def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int): + with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: audio_file.write(data) return io_buffer -def pack_raw(io_buffer:BytesIO, data:np.ndarray, rate:int): +def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int): io_buffer.write(data.tobytes()) return io_buffer -def pack_wav(io_buffer:BytesIO, data:np.ndarray, rate:int): +def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int): io_buffer = BytesIO() - sf.write(io_buffer, data, rate, format='wav') + sf.write(io_buffer, data, rate, format="wav") return io_buffer -def pack_aac(io_buffer:BytesIO, data:np.ndarray, rate:int): - process = subprocess.Popen([ - 'ffmpeg', - '-f', 's16le', # 输入16位有符号小端整数PCM - '-ar', str(rate), # 设置采样率 - '-ac', '1', # 单声道 - '-i', 'pipe:0', # 从管道读取输入 - '-c:a', 'aac', # 音频编码器为AAC - '-b:a', '192k', # 比特率 - '-vn', # 不包含视频 - '-f', 'adts', # 输出AAC数据流格式 - 'pipe:1' # 将输出写入管道 - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + +def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int): + process = subprocess.Popen( + [ + "ffmpeg", + "-f", + "s16le", # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + "192k", # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) out, _ = process.communicate(input=data.tobytes()) io_buffer.write(out) return io_buffer -def pack_audio(io_buffer:BytesIO, data:np.ndarray, rate:int, media_type:str): + +def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str): if media_type == "ogg": io_buffer = pack_ogg(io_buffer, data, rate) elif media_type == "aac": @@ -216,7 +233,6 @@ def pack_audio(io_buffer:BytesIO, data:np.ndarray, rate:int, media_type:str): return io_buffer - # from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000): # This will create a wave header then append the frame input @@ -233,7 +249,7 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=3 return wav_buf.read() -def handle_control(command:str): +def handle_control(command: str): if command == "restart": os.execl(sys.executable, sys.executable, *argv) elif command == "exit": @@ -241,43 +257,52 @@ def handle_control(command:str): exit(0) -def check_params(req:dict): - text:str = req.get("text", "") - text_lang:str = req.get("text_lang", "") - ref_audio_path:str = req.get("ref_audio_path", "") - streaming_mode:bool = req.get("streaming_mode", False) - media_type:str = req.get("media_type", "wav") - prompt_lang:str = req.get("prompt_lang", "") - text_split_method:str = req.get("text_split_method", "cut5") +def check_params(req: dict): + text: str = req.get("text", "") + text_lang: str = req.get("text_lang", "") + ref_audio_path: str = req.get("ref_audio_path", "") + streaming_mode: bool = req.get("streaming_mode", False) + media_type: str = req.get("media_type", "wav") + prompt_lang: str = req.get("prompt_lang", "") + text_split_method: str = req.get("text_split_method", "cut5") if ref_audio_path in [None, ""]: return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"}) if text in [None, ""]: return JSONResponse(status_code=400, content={"message": "text is required"}) - if (text_lang in [None, ""]) : + if text_lang in [None, ""]: return JSONResponse(status_code=400, content={"message": "text_lang is required"}) elif text_lang.lower() not in tts_config.languages: - return JSONResponse(status_code=400, content={"message": f"text_lang: {text_lang} is not supported in version {tts_config.version}"}) - if (prompt_lang in [None, ""]) : + return JSONResponse( + status_code=400, + content={"message": f"text_lang: {text_lang} is not supported in version {tts_config.version}"}, + ) + if prompt_lang in [None, ""]: return JSONResponse(status_code=400, content={"message": "prompt_lang is required"}) elif prompt_lang.lower() not in tts_config.languages: - return JSONResponse(status_code=400, content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"}) + return JSONResponse( + status_code=400, + content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"}, + ) if media_type not in ["wav", "raw", "ogg", "aac"]: return JSONResponse(status_code=400, content={"message": f"media_type: {media_type} is not supported"}) - elif media_type == "ogg" and not streaming_mode: + elif media_type == "ogg" and not streaming_mode: return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"}) - + if text_split_method not in cut_method_names: - return JSONResponse(status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"}) + return JSONResponse( + status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"} + ) return None -async def tts_handle(req:dict): + +async def tts_handle(req: dict): """ Text to speech handler. - + Args: - req (dict): + req (dict): { "text": "", # str.(required) text to be synthesized "text_lang: "", # str.(required) language of the text to be synthesized @@ -298,14 +323,14 @@ async def tts_handle(req:dict): "media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac". "streaming_mode": False, # bool. whether to return a streaming response. "parallel_infer": True, # bool.(optional) whether to use parallel inference. - "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. + "repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model. "sample_steps": 32, # int. number of sampling steps for VITS model V3. - "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. + "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3. } returns: StreamingResponse: audio stream response. """ - + streaming_mode = req.get("streaming_mode", False) return_fragment = req.get("return_fragment", False) media_type = req.get("media_type", "wav") @@ -316,12 +341,13 @@ async def tts_handle(req:dict): if streaming_mode or return_fragment: req["return_fragment"] = True - + try: - tts_generator=tts_pipeline.run(req) - + tts_generator = tts_pipeline.run(req) + if streaming_mode: - def streaming_generator(tts_generator:Generator, media_type:str): + + def streaming_generator(tts_generator: Generator, media_type: str): if_frist_chunk = True for sr, chunk in tts_generator: if if_frist_chunk and media_type == "wav": @@ -329,19 +355,22 @@ async def tts_handle(req:dict): media_type = "raw" if_frist_chunk = False yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue() + # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}" - return StreamingResponse(streaming_generator(tts_generator, media_type, ), media_type=f"audio/{media_type}") - + return StreamingResponse( + streaming_generator( + tts_generator, + media_type, + ), + media_type=f"audio/{media_type}", + ) + else: sr, audio_data = next(tts_generator) audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue() return Response(audio_data, media_type=f"audio/{media_type}") except Exception as e: - return JSONResponse(status_code=400, content={"message": f"tts failed", "Exception": str(e)}) - - - - + return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)}) @APP.get("/control") @@ -351,32 +380,31 @@ async def control(command: str = None): handle_control(command) - @APP.get("/tts") async def tts_get_endpoint( - text: str = None, - text_lang: str = None, - ref_audio_path: str = None, - aux_ref_audio_paths:list = None, - prompt_lang: str = None, - prompt_text: str = "", - top_k:int = 5, - top_p:float = 1, - temperature:float = 1, - text_split_method:str = "cut0", - batch_size:int = 1, - batch_threshold:float = 0.75, - split_bucket:bool = True, - speed_factor:float = 1.0, - fragment_interval:float = 0.3, - seed:int = -1, - media_type:str = "wav", - streaming_mode:bool = False, - parallel_infer:bool = True, - repetition_penalty:float = 1.35, - sample_steps:int =32, - super_sampling:bool = False - ): + text: str = None, + text_lang: str = None, + ref_audio_path: str = None, + aux_ref_audio_paths: list = None, + prompt_lang: str = None, + prompt_text: str = "", + top_k: int = 5, + top_p: float = 1, + temperature: float = 1, + text_split_method: str = "cut0", + batch_size: int = 1, + batch_threshold: float = 0.75, + split_bucket: bool = True, + speed_factor: float = 1.0, + fragment_interval: float = 0.3, + seed: int = -1, + media_type: str = "wav", + streaming_mode: bool = False, + parallel_infer: bool = True, + repetition_penalty: float = 1.35, + sample_steps: int = 32, + super_sampling: bool = False, +): req = { "text": text, "text_lang": text_lang.lower(), @@ -388,21 +416,21 @@ async def tts_get_endpoint( "top_p": top_p, "temperature": temperature, "text_split_method": text_split_method, - "batch_size":int(batch_size), - "batch_threshold":float(batch_threshold), - "speed_factor":float(speed_factor), - "split_bucket":split_bucket, - "fragment_interval":fragment_interval, - "seed":seed, - "media_type":media_type, - "streaming_mode":streaming_mode, - "parallel_infer":parallel_infer, - "repetition_penalty":float(repetition_penalty), - "sample_steps":int(sample_steps), - "super_sampling":super_sampling + "batch_size": int(batch_size), + "batch_threshold": float(batch_threshold), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "fragment_interval": fragment_interval, + "seed": seed, + "media_type": media_type, + "streaming_mode": streaming_mode, + "parallel_infer": parallel_infer, + "repetition_penalty": float(repetition_penalty), + "sample_steps": int(sample_steps), + "super_sampling": super_sampling, } return await tts_handle(req) - + @APP.post("/tts") async def tts_post_endpoint(request: TTS_Request): @@ -415,7 +443,7 @@ async def set_refer_aduio(refer_audio_path: str = None): try: tts_pipeline.set_ref_audio(refer_audio_path) except Exception as e: - return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)}) + return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)}) return JSONResponse(status_code=200, content={"message": "success"}) @@ -425,18 +453,19 @@ async def set_refer_aduio(refer_audio_path: str = None): # # 检查文件类型,确保是音频文件 # if not audio_file.content_type.startswith("audio/"): # return JSONResponse(status_code=400, content={"message": "file type is not supported"}) - + # os.makedirs("uploaded_audio", exist_ok=True) # save_path = os.path.join("uploaded_audio", audio_file.filename) # # 保存音频文件到服务器上的一个目录 # with open(save_path , "wb") as buffer: # buffer.write(await audio_file.read()) - + # tts_pipeline.set_ref_audio(save_path) # except Exception as e: # return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)}) # return JSONResponse(status_code=200, content={"message": "success"}) + @APP.get("/set_gpt_weights") async def set_gpt_weights(weights_path: str = None): try: @@ -444,7 +473,7 @@ async def set_gpt_weights(weights_path: str = None): return JSONResponse(status_code=400, content={"message": "gpt weight path is required"}) tts_pipeline.init_t2s_weights(weights_path) except Exception as e: - return JSONResponse(status_code=400, content={"message": f"change gpt weight failed", "Exception": str(e)}) + return JSONResponse(status_code=400, content={"message": "change gpt weight failed", "Exception": str(e)}) return JSONResponse(status_code=200, content={"message": "success"}) @@ -456,17 +485,16 @@ async def set_sovits_weights(weights_path: str = None): return JSONResponse(status_code=400, content={"message": "sovits weight path is required"}) tts_pipeline.init_vits_weights(weights_path) except Exception as e: - return JSONResponse(status_code=400, content={"message": f"change sovits weight failed", "Exception": str(e)}) + return JSONResponse(status_code=400, content={"message": "change sovits weight failed", "Exception": str(e)}) return JSONResponse(status_code=200, content={"message": "success"}) - if __name__ == "__main__": try: - if host == 'None': # 在调用时使用 -a None 参数,可以让api监听双栈 + if host == "None": # 在调用时使用 -a None 参数,可以让api监听双栈 host = None uvicorn.run(app=APP, host=host, port=port, workers=1) - except Exception as e: + except Exception: traceback.print_exc() os.kill(os.getpid(), signal.SIGTERM) exit(0) diff --git a/colab_webui.ipynb b/colab_webui.ipynb index 226cc21..5faee51 100644 --- a/colab_webui.ipynb +++ b/colab_webui.ipynb @@ -33,11 +33,8 @@ "condacolab.install_from_url(\"https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh\")\n", "%cd -q /content\n", "!git clone https://github.com/RVC-Boss/GPT-SoVITS\n", - "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n", "%cd -q /content/GPT-SoVITS\n", - "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n", - "!/usr/local/bin/pip install -r extra-req.txt --no-deps\n", - "!/usr/local/bin/pip install -r requirements.txt" + "!bash install.sh" ] }, { diff --git a/config.py b/config.py index 1f74128..5f90c5c 100644 --- a/config.py +++ b/config.py @@ -1,4 +1,5 @@ -import sys,os +import sys +import os import torch @@ -6,9 +7,9 @@ import torch sovits_path = "" gpt_path = "" is_half_str = os.environ.get("is_half", "True") -is_half = True if is_half_str.lower() == 'true' else False -is_share_str = os.environ.get("is_share","False") -is_share= True if is_share_str.lower() == 'true' else False +is_half = True if is_half_str.lower() == "true" else False +is_share_str = os.environ.get("is_share", "False") +is_share = True if is_share_str.lower() == "true" else False cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" @@ -32,16 +33,18 @@ api_port = 9880 if infer_device == "cuda": gpu_name = torch.cuda.get_device_name(0) if ( - ("16" in gpu_name and "V100" not in gpu_name.upper()) - or "P40" in gpu_name.upper() - or "P10" in gpu_name.upper() - or "1060" in gpu_name - or "1070" in gpu_name - or "1080" in gpu_name + ("16" in gpu_name and "V100" not in gpu_name.upper()) + or "P40" in gpu_name.upper() + or "P10" in gpu_name.upper() + or "1060" in gpu_name + or "1070" in gpu_name + or "1080" in gpu_name ): - is_half=False + is_half = False + +if infer_device == "cpu": + is_half = False -if(infer_device=="cpu"):is_half=False class Config: def __init__(self): diff --git a/gpt-sovits_kaggle.ipynb b/gpt-sovits_kaggle.ipynb index 67ad473..9f28f6f 100644 --- a/gpt-sovits_kaggle.ipynb +++ b/gpt-sovits_kaggle.ipynb @@ -101,21 +101,31 @@ "import time\n", "import socket\n", "import urllib.request\n", + "\n", + "\n", "def iframe_thread(port):\n", " while True:\n", " time.sleep(0.5)\n", - " sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", + " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + " result = sock.connect_ex((\"127.0.0.1\", port))\n", " if result == 0:\n", " break\n", " sock.close()\n", "\n", " from colorama import Fore, Style\n", - " print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. RESET_ALL)\n", + " print(\n", + " Fore.GREEN + \"\\nIP: \",\n", + " Fore.RED,\n", + " urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n", + " \"\\n\",\n", + " Style.RESET_ALL,\n", + " )\n", " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "threading.Thread (target=iframe_thread, daemon=True, args=(9874,)).start()\n", + " print(line.decode(), end=\"\")\n", + "\n", + "\n", + "threading.Thread(target=iframe_thread, daemon=True, args=(9874,)).start()\n", "\n", "!python webui.py" ] @@ -143,26 +153,32 @@ "# 开启推理页面\n", "%cd /kaggle/working/GPT-SoVITS/\n", "!npm install -g localtunnel\n", - "import subprocess\n", "import threading\n", - "import time\n", - "import socket\n", - "import urllib.request\n", + "\n", + "\n", "def iframe_thread(port):\n", " while True:\n", " time.sleep(0.5)\n", - " sock= socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", + " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + " result = sock.connect_ex((\"127.0.0.1\", port))\n", " if result == 0:\n", " break\n", " sock.close()\n", "\n", " from colorama import Fore, Style\n", - " print (Fore.GREEN + \"\\nIP: \", Fore. RED, urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"), \"\\n\", Style. RESET_ALL)\n", + " print(\n", + " Fore.GREEN + \"\\nIP: \",\n", + " Fore.RED,\n", + " urllib.request.urlopen(\"https://ipv4.icanhazip.com\").read().decode(\"utf8\").strip(\"\\n\"),\n", + " \"\\n\",\n", + " Style.RESET_ALL,\n", + " )\n", " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "threading.Thread (target=iframe_thread, daemon=True, args=(9872,)).start()\n", + " print(line.decode(), end=\"\")\n", + "\n", + "\n", + "threading.Thread(target=iframe_thread, daemon=True, args=(9872,)).start()\n", "\n", "!python ./GPT_SoVITS/inference_webui.py" ] diff --git a/install.sh b/install.sh index 66a3a49..2f93e9f 100644 --- a/install.sh +++ b/install.sh @@ -48,13 +48,13 @@ fi if [ "$USE_CUDA" = true ]; then echo "Installing PyTorch with CUDA support..." - conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia + pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124 elif [ "$USE_ROCM" = true ]; then echo "Installing PyTorch with ROCm support..." - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 + pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 else echo "Installing PyTorch for CPU..." - conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch + pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu fi echo "Installing Python dependencies from requirements.txt..." diff --git a/tools/AP_BWE_main/datasets1/dataset.py b/tools/AP_BWE_main/datasets1/dataset.py index b5ccd43..40f993b 100644 --- a/tools/AP_BWE_main/datasets1/dataset.py +++ b/tools/AP_BWE_main/datasets1/dataset.py @@ -5,24 +5,31 @@ import torchaudio import torch.utils.data import torchaudio.functional as aF -def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True): +def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True): hann_window = torch.hann_window(win_size).to(audio.device) - stft_spec = torch.stft(audio, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, - center=center, pad_mode='reflect', normalized=False, return_complex=True) - log_amp = torch.log(torch.abs(stft_spec)+1e-4) + stft_spec = torch.stft( + audio, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + pad_mode="reflect", + normalized=False, + return_complex=True, + ) + log_amp = torch.log(torch.abs(stft_spec) + 1e-4) pha = torch.angle(stft_spec) - com = torch.stack((torch.exp(log_amp)*torch.cos(pha), - torch.exp(log_amp)*torch.sin(pha)), dim=-1) + com = torch.stack((torch.exp(log_amp) * torch.cos(pha), torch.exp(log_amp) * torch.sin(pha)), dim=-1) return log_amp, pha, com def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True): - amp = torch.exp(log_amp) - com = torch.complex(amp*torch.cos(pha), amp*torch.sin(pha)) + com = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha)) hann_window = torch.hann_window(win_size).to(com.device) audio = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center) @@ -30,18 +37,28 @@ def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True): def get_dataset_filelist(a): - with open(a.input_training_file, 'r', encoding='utf-8') as fi: - training_indexes = [x.split('|')[0] for x in fi.read().split('\n') if len(x) > 0] + with open(a.input_training_file, "r", encoding="utf-8") as fi: + training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] - with open(a.input_validation_file, 'r', encoding='utf-8') as fi: - validation_indexes = [x.split('|')[0] for x in fi.read().split('\n') if len(x) > 0] + with open(a.input_validation_file, "r", encoding="utf-8") as fi: + validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0] return training_indexes, validation_indexes class Dataset(torch.utils.data.Dataset): - def __init__(self, training_indexes, wavs_dir, segment_size, hr_sampling_rate, lr_sampling_rate, - split=True, shuffle=True, n_cache_reuse=1, device=None): + def __init__( + self, + training_indexes, + wavs_dir, + segment_size, + hr_sampling_rate, + lr_sampling_rate, + split=True, + shuffle=True, + n_cache_reuse=1, + device=None, + ): self.audio_indexes = training_indexes random.seed(1234) if shuffle: @@ -59,7 +76,7 @@ class Dataset(torch.utils.data.Dataset): def __getitem__(self, index): filename = self.audio_indexes[index] if self._cache_ref_count == 0: - audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + '.wav')) + audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + ".wav")) self.cached_wav = audio self._cache_ref_count = self.n_cache_reuse else: @@ -79,14 +96,13 @@ class Dataset(torch.utils.data.Dataset): if audio_hr.size(1) >= self.segment_size: max_audio_start = audio_hr.size(1) - self.segment_size audio_start = random.randint(0, max_audio_start) - audio_hr = audio_hr[:, audio_start: audio_start+self.segment_size] - audio_lr = audio_lr[:, audio_start: audio_start+self.segment_size] + audio_hr = audio_hr[:, audio_start : audio_start + self.segment_size] + audio_lr = audio_lr[:, audio_start : audio_start + self.segment_size] else: - audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), 'constant') - audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), 'constant') + audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), "constant") + audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), "constant") return (audio_hr.squeeze(), audio_lr.squeeze()) def __len__(self): - return len(self.audio_indexes) diff --git a/tools/AP_BWE_main/models/model.py b/tools/AP_BWE_main/models/model.py index 0c235d6..e538600 100644 --- a/tools/AP_BWE_main/models/model.py +++ b/tools/AP_BWE_main/models/model.py @@ -1,20 +1,26 @@ import torch import torch.nn.functional as F import torch.nn as nn -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from torch.nn.utils import weight_norm, spectral_norm + + # from utils import init_weights, get_padding def get_padding(kernel_size, dilation=1): - return int((kernel_size*dilation - dilation)/2) + return int((kernel_size * dilation - dilation) / 2) + + def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ if classname.find("Conv") != -1: m.weight.data.normal_(mean, std) + import numpy as np from typing import Tuple, List LRELU_SLOPE = 0.1 + class ConvNeXtBlock(nn.Module): """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. @@ -30,24 +36,24 @@ class ConvNeXtBlock(nn.Module): def __init__( self, dim: int, - layer_scale_init_value= None, - adanorm_num_embeddings = None, + layer_scale_init_value=None, + adanorm_num_embeddings=None, ): super().__init__() self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv self.adanorm = adanorm_num_embeddings is not None - + self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, dim*3) # pointwise/1x1 convs, implemented with linear layers + self.pwconv1 = nn.Linear(dim, dim * 3) # pointwise/1x1 convs, implemented with linear layers self.act = nn.GELU() - self.pwconv2 = nn.Linear(dim*3, dim) + self.pwconv2 = nn.Linear(dim * 3, dim) self.gamma = ( nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) if layer_scale_init_value > 0 else None ) - def forward(self, x, cond_embedding_id = None) : + def forward(self, x, cond_embedding_id=None): residual = x x = self.dwconv(x) x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) @@ -72,11 +78,11 @@ class APNet_BWE_Model(torch.nn.Module): super(APNet_BWE_Model, self).__init__() self.h = h self.adanorm_num_embeddings = None - layer_scale_init_value = 1 / h.ConvNeXt_layers + layer_scale_init_value = 1 / h.ConvNeXt_layers - self.conv_pre_mag = nn.Conv1d(h.n_fft//2+1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) + self.conv_pre_mag = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) self.norm_pre_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) - self.conv_pre_pha = nn.Conv1d(h.n_fft//2+1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) + self.conv_pre_pha = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1)) self.norm_pre_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) self.convnext_mag = nn.ModuleList( @@ -104,9 +110,9 @@ class APNet_BWE_Model(torch.nn.Module): self.norm_post_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) self.norm_post_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6) self.apply(self._init_weights) - self.linear_post_mag = nn.Linear(h.ConvNeXt_channels, h.n_fft//2+1) - self.linear_post_pha_r = nn.Linear(h.ConvNeXt_channels, h.n_fft//2+1) - self.linear_post_pha_i = nn.Linear(h.ConvNeXt_channels, h.n_fft//2+1) + self.linear_post_mag = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1) + self.linear_post_pha_r = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1) + self.linear_post_pha_i = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1) def _init_weights(self, m): if isinstance(m, (nn.Conv1d, nn.Linear)): @@ -114,7 +120,6 @@ class APNet_BWE_Model(torch.nn.Module): nn.init.constant_(m.bias, 0) def forward(self, mag_nb, pha_nb): - x_mag = self.conv_pre_mag(mag_nb) x_pha = self.conv_pre_pha(pha_nb) x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2) @@ -134,11 +139,9 @@ class APNet_BWE_Model(torch.nn.Module): x_pha_i = self.linear_post_pha_i(x_pha) pha_wb = torch.atan2(x_pha_i, x_pha_r).transpose(1, 2) - com_wb = torch.stack((torch.exp(mag_wb)*torch.cos(pha_wb), - torch.exp(mag_wb)*torch.sin(pha_wb)), dim=-1) - - return mag_wb, pha_wb, com_wb + com_wb = torch.stack((torch.exp(mag_wb) * torch.cos(pha_wb), torch.exp(mag_wb) * torch.sin(pha_wb)), dim=-1) + return mag_wb, pha_wb, com_wb class DiscriminatorP(torch.nn.Module): @@ -146,13 +149,15 @@ class DiscriminatorP(torch.nn.Module): super(DiscriminatorP, self).__init__() self.period = period norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), - norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), - ]) + self.convs = nn.ModuleList( + [ + norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): @@ -160,13 +165,13 @@ class DiscriminatorP(torch.nn.Module): # 1d to 2d b, c, t = x.shape - if t % self.period != 0: # pad first + if t % self.period != 0: # pad first n_pad = self.period - (t % self.period) x = F.pad(x, (0, n_pad), "reflect") t = t + n_pad x = x.view(b, c, t // self.period, self.period) - for i,l in enumerate(self.convs): + for i, l in enumerate(self.convs): x = l(x) x = F.leaky_relu(x, LRELU_SLOPE) if i > 0: @@ -181,13 +186,15 @@ class DiscriminatorP(torch.nn.Module): class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self): super(MultiPeriodDiscriminator, self).__init__() - self.discriminators = nn.ModuleList([ - DiscriminatorP(2), - DiscriminatorP(3), - DiscriminatorP(5), - DiscriminatorP(7), - DiscriminatorP(11), - ]) + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) def forward(self, y, y_hat): y_d_rs = [] @@ -264,8 +271,8 @@ class DiscriminatorAR(nn.Module): self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None ) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] - x=x.squeeze(1) - + x = x.squeeze(1) + x = self.spectrogram(x) x = x.unsqueeze(1) for l in self.convs: @@ -358,8 +365,8 @@ class DiscriminatorPR(nn.Module): self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None ) -> Tuple[torch.Tensor, List[torch.Tensor]]: fmap = [] - x=x.squeeze(1) - + x = x.squeeze(1) + x = self.spectrogram(x) x = x.unsqueeze(1) for l in self.convs: @@ -407,11 +414,11 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs): r_losses = [] g_losses = [] for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - r_loss = torch.mean(torch.clamp(1 - dr, min=0)) - g_loss = torch.mean(torch.clamp(1 + dg, min=0)) - loss += r_loss + g_loss - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) + r_loss = torch.mean(torch.clamp(1 - dr, min=0)) + g_loss = torch.mean(torch.clamp(1 + dg, min=0)) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) return loss, r_losses, g_losses @@ -420,35 +427,37 @@ def generator_loss(disc_outputs): loss = 0 gen_losses = [] for dg in disc_outputs: - l = torch.mean(torch.clamp(1 - dg, min=0)) - gen_losses.append(l) - loss += l + l = torch.mean(torch.clamp(1 - dg, min=0)) + gen_losses.append(l) + loss += l return loss, gen_losses def phase_losses(phase_r, phase_g): - ip_loss = torch.mean(anti_wrapping_function(phase_r - phase_g)) gd_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=1) - torch.diff(phase_g, dim=1))) iaf_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=2) - torch.diff(phase_g, dim=2))) return ip_loss, gd_loss, iaf_loss -def anti_wrapping_function(x): +def anti_wrapping_function(x): return torch.abs(x - torch.round(x / (2 * np.pi)) * 2 * np.pi) + def stft_mag(audio, n_fft=2048, hop_length=512): hann_window = torch.hann_window(n_fft).to(audio.device) stft_spec = torch.stft(audio, n_fft, hop_length, window=hann_window, return_complex=True) stft_mag = torch.abs(stft_spec) - return(stft_mag) + return stft_mag + def cal_snr(pred, target): snr = (20 * torch.log10(torch.norm(target, dim=-1) / torch.norm(pred - target, dim=-1).clamp(min=1e-8))).mean() return snr + def cal_lsd(pred, target): sp = torch.log10(stft_mag(pred).square().clamp(1e-8)) st = torch.log10(stft_mag(target).square().clamp(1e-8)) diff --git a/tools/asr/config.py b/tools/asr/config.py index 4b0d37a..c04069b 100644 --- a/tools/asr/config.py +++ b/tools/asr/config.py @@ -1,33 +1,36 @@ import os + def check_fw_local_models(): - ''' + """ 启动时检查本地是否有 Faster Whisper 模型. - ''' + """ model_size_list = [ - "tiny", "tiny.en", - "base", "base.en", - "small", "small.en", - "medium", "medium.en", - "large", "large-v1", - "large-v2", "large-v3"] + "tiny", + "tiny.en", + "base", + "base.en", + "small", + "small.en", + "medium", + "medium.en", + "large", + "large-v1", + "large-v2", + "large-v3", + ] for i, size in enumerate(model_size_list): - if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): - model_size_list[i] = size + '-local' + if os.path.exists(f"tools/asr/models/faster-whisper-{size}"): + model_size_list[i] = size + "-local" return model_size_list + asr_dict = { - "达摩 ASR (中文)": { - 'lang': ['zh','yue'], - 'size': ['large'], - 'path': 'funasr_asr.py', - 'precision': ['float32'] - }, + "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, "Faster Whisper (多语种)": { - 'lang': ['auto', 'zh', 'en', 'ja', 'ko', 'yue'], - 'size': check_fw_local_models(), - 'path': 'fasterwhisper_asr.py', - 'precision': ['float32', 'float16', 'int8'] + "lang": ["auto", "zh", "en", "ja", "ko", "yue"], + "size": check_fw_local_models(), + "path": "fasterwhisper_asr.py", + "precision": ["float32", "float16", "int8"], }, } - diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py index d46cbbd..e570f17 100644 --- a/tools/asr/fasterwhisper_asr.py +++ b/tools/asr/fasterwhisper_asr.py @@ -2,7 +2,7 @@ import argparse import os import traceback -os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" import torch @@ -11,6 +11,7 @@ from tqdm import tqdm from tools.asr.config import check_fw_local_models +# fmt: off language_code_list = [ "af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", @@ -32,82 +33,97 @@ language_code_list = [ "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi", "yi", "yo", "zh", "yue", - "auto"] + "auto"] +# fmt: on + def execute_asr(input_folder, output_folder, model_size, language, precision): - if '-local' in model_size: + if "-local" in model_size: model_size = model_size[:-6] - model_path = f'tools/asr/models/faster-whisper-{model_size}' + model_path = f"tools/asr/models/faster-whisper-{model_size}" else: model_path = model_size - if language == 'auto': - language = None #不设置语种由模型自动输出概率最高的语种 - print("loading faster whisper model:",model_size,model_path) - device = 'cuda' if torch.cuda.is_available() else 'cpu' + if language == "auto": + language = None # 不设置语种由模型自动输出概率最高的语种 + print("loading faster whisper model:", model_size, model_path) + device = "cuda" if torch.cuda.is_available() else "cpu" try: model = WhisperModel(model_path, device=device, compute_type=precision) except: return print(traceback.format_exc()) - + input_file_names = os.listdir(input_folder) input_file_names.sort() output = [] output_file_name = os.path.basename(input_folder) - + for file_name in tqdm(input_file_names): try: file_path = os.path.join(input_folder, file_name) segments, info = model.transcribe( - audio = file_path, - beam_size = 5, - vad_filter = True, - vad_parameters = dict(min_silence_duration_ms=700), - language = language) - text = '' + audio=file_path, + beam_size=5, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=700), + language=language, + ) + text = "" if info.language == "zh": print("检测为中文文本, 转 FunASR 处理") - if("only_asr" not in globals()): - from tools.asr.funasr_asr import only_asr #如果用英文就不需要导入下载模型 + if "only_asr" not in globals(): + from tools.asr.funasr_asr import only_asr # 如果用英文就不需要导入下载模型 text = only_asr(file_path, language=info.language.lower()) - if text == '': + if text == "": for segment in segments: text += segment.text output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}") except: print(traceback.format_exc()) - + output_folder = output_folder or "output/asr_opt" os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") with open(output_file_path, "w", encoding="utf-8") as f: f.write("\n".join(output)) print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") return output_file_path -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large-v3', - choices=check_fw_local_models(), - help="Model Size of Faster Whisper") - parser.add_argument("-l", "--language", type=str, default='ja', - choices=language_code_list, - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32','int8'], - help="fp16, int8 or fp32") + parser.add_argument( + "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." + ) + parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") + parser.add_argument( + "-s", + "--model_size", + type=str, + default="large-v3", + choices=check_fw_local_models(), + help="Model Size of Faster Whisper", + ) + parser.add_argument( + "-l", "--language", type=str, default="ja", choices=language_code_list, help="Language of the audio files." + ) + parser.add_argument( + "-p", + "--precision", + type=str, + default="float16", + choices=["float16", "float32", "int8"], + help="fp16, int8 or fp32", + ) cmd = parser.parse_args() output_file_path = execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, - precision = cmd.precision, + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + model_size=cmd.model_size, + language=cmd.language, + precision=cmd.precision, ) diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py index fe520e2..b0ffceb 100644 --- a/tools/asr/funasr_asr.py +++ b/tools/asr/funasr_asr.py @@ -9,31 +9,41 @@ import traceback from funasr import AutoModel from tqdm import tqdm -funasr_models = {} # 存储模型避免重复加载 +funasr_models = {} # 存储模型避免重复加载 + def only_asr(input_file, language): try: model = create_model(language) text = model.generate(input=input_file)[0]["text"] except: - text = '' + text = "" print(traceback.format_exc()) return text + def create_model(language="zh"): - path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' - path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' - path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" + path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch" + path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" + path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" vad_model_revision = punc_model_revision = "v2.0.4" if language == "zh": - path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' - path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" + path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" + path_asr = ( + path_asr + if os.path.exists(path_asr) + else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" + ) model_revision = "v2.0.4" elif language == "yue": - path_asr = 'tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online' - path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" + path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" + path_asr = ( + path_asr + if os.path.exists(path_asr) + else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" + ) model_revision = "master" path_vad = path_punc = None vad_model_revision = punc_model_revision = None @@ -45,25 +55,26 @@ def create_model(language="zh"): return funasr_models[language] else: model = AutoModel( - model = path_asr, - model_revision = model_revision, - vad_model = path_vad, - vad_model_revision = vad_model_revision, - punc_model = path_punc, - punc_model_revision = punc_model_revision, + model=path_asr, + model_revision=model_revision, + vad_model=path_vad, + vad_model_revision=vad_model_revision, + punc_model=path_punc, + punc_model_revision=punc_model_revision, ) print(f"FunASR 模型加载完成: {language.upper()}") funasr_models[language] = model return model + def execute_asr(input_folder, output_folder, model_size, language): input_file_names = os.listdir(input_folder) input_file_names.sort() - + output = [] output_file_name = os.path.basename(input_folder) - + model = create_model(language) for file_name in tqdm(input_file_names): @@ -77,29 +88,31 @@ def execute_asr(input_folder, output_folder, model_size, language): output_folder = output_folder or "output/asr_opt" os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") with open(output_file_path, "w", encoding="utf-8") as f: f.write("\n".join(output)) print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") return output_file_path -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large', - help="Model Size of FunASR is Large") - parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh','yue','auto'], - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 + parser.add_argument( + "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." + ) + parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") + parser.add_argument("-s", "--model_size", type=str, default="large", help="Model Size of FunASR is Large") + parser.add_argument( + "-l", "--language", type=str, default="zh", choices=["zh", "yue", "auto"], help="Language of the audio files." + ) + parser.add_argument( + "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" + ) # 还没接入 cmd = parser.parse_args() execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + model_size=cmd.model_size, + language=cmd.language, ) diff --git a/tools/audio_sr.py b/tools/audio_sr.py index 009ad26..58df6d2 100644 --- a/tools/audio_sr.py +++ b/tools/audio_sr.py @@ -1,50 +1,44 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import sys,os -import traceback -AP_BWE_main_dir_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'AP_BWE_main') +import sys +import os + +AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main") sys.path.append(AP_BWE_main_dir_path) -import glob -import argparse import json -from re import S import torch -import numpy as np -import torchaudio -import time import torchaudio.functional as aF # from attrdict import AttrDict####will be bug in py3.10 from datasets1.dataset import amp_pha_stft, amp_pha_istft from models.model import APNet_BWE_Model -import soundfile as sf -import matplotlib.pyplot as plt -from rich.progress import track -class AP_BWE(): - def __init__(self,device,DictToAttrRecursive,checkpoint_file=None): - if checkpoint_file==None: - checkpoint_file="%s/24kto48k/g_24kto48k.zip"%(AP_BWE_main_dir_path) - if os.path.exists(checkpoint_file)==False: + +class AP_BWE: + def __init__(self, device, DictToAttrRecursive, checkpoint_file=None): + if checkpoint_file == None: + checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path) + if os.path.exists(checkpoint_file) == False: raise FileNotFoundError - config_file = os.path.join(os.path.split(checkpoint_file)[0], 'config.json') - with open(config_file) as f:data = f.read() + config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json") + with open(config_file) as f: + data = f.read() json_config = json.loads(data) # h = AttrDict(json_config) h = DictToAttrRecursive(json_config) model = APNet_BWE_Model(h).to(device) - state_dict = torch.load(checkpoint_file,map_location="cpu",weights_only=False) - model.load_state_dict(state_dict['generator']) + state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False) + model.load_state_dict(state_dict["generator"]) model.eval() - self.device=device - self.model=model - self.h=h + self.device = device + self.model = model + self.h = h def to(self, *arg, **kwargs): self.model.to(*arg, **kwargs) self.device = self.model.conv_pre_mag.weight.device return self - def __call__(self, audio,orig_sampling_rate): + def __call__(self, audio, orig_sampling_rate): with torch.no_grad(): # audio, orig_sampling_rate = torchaudio.load(inp_path) # audio = audio.to(self.device) @@ -53,4 +47,4 @@ class AP_BWE(): amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb) audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size) # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16') - return audio_hr_g.squeeze().cpu().numpy(),self.h.hr_sampling_rate + return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate diff --git a/tools/cmd-denoise.py b/tools/cmd-denoise.py index 1fdcab6..bbf6847 100644 --- a/tools/cmd-denoise.py +++ b/tools/cmd-denoise.py @@ -1,33 +1,38 @@ -import os,argparse +import os +import argparse import traceback from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from tqdm import tqdm -path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' -path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" -ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) -def execute_denoise(input_folder,output_folder): - os.makedirs(output_folder,exist_ok=True) +path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k" +path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" +ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise) + + +def execute_denoise(input_folder, output_folder): + os.makedirs(output_folder, exist_ok=True) # print(input_folder) # print(list(os.listdir(input_folder).sort())) for name in tqdm(os.listdir(input_folder)): try: - ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) + ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name)) except: traceback.print_exc() -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 + parser.add_argument( + "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." + ) + parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") + parser.add_argument( + "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" + ) # 还没接入 cmd = parser.parse_args() execute_denoise( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - ) \ No newline at end of file + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + ) diff --git a/tools/i18n/i18n.py b/tools/i18n/i18n.py index e256941..4cd123f 100644 --- a/tools/i18n/i18n.py +++ b/tools/i18n/i18n.py @@ -2,23 +2,27 @@ import json import locale import os -I18N_JSON_DIR : os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale') +I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") + def load_language_list(language): with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: language_list = json.load(f) return language_list + def scan_language_list(): language_list = [] for name in os.listdir(I18N_JSON_DIR): - if name.endswith(".json"):language_list.append(name.split('.')[0]) + if name.endswith(".json"): + language_list.append(name.split(".")[0]) return language_list + class I18nAuto: def __init__(self, language=None): if language in ["Auto", None]: - language = locale.getdefaultlocale()[0] + language = locale.getdefaultlocale()[0] # getlocale can't identify the system's language ((None, None)) if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): language = "en_US" @@ -31,6 +35,7 @@ class I18nAuto: def __repr__(self): return "Use Language: " + self.language + if __name__ == "__main__": - i18n = I18nAuto(language='en_US') - print(i18n) \ No newline at end of file + i18n = I18nAuto(language="en_US") + print(i18n) diff --git a/tools/i18n/scan_i18n.py b/tools/i18n/scan_i18n.py index d2bd12b..0f12091 100644 --- a/tools/i18n/scan_i18n.py +++ b/tools/i18n/scan_i18n.py @@ -4,21 +4,18 @@ import json import os from collections import OrderedDict -I18N_JSON_DIR : os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale') -DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言 -TITLE_LEN : int = 60 # 标题显示长度 -KEY_LEN : int = 30 # 键名显示长度 -SHOW_KEYS : bool = False # 是否显示键信息 -SORT_KEYS : bool = False # 是否按全局键名写入文件 +I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") +DEFAULT_LANGUAGE: str = "zh_CN" # 默认语言 +TITLE_LEN: int = 60 # 标题显示长度 +KEY_LEN: int = 30 # 键名显示长度 +SHOW_KEYS: bool = False # 是否显示键信息 +SORT_KEYS: bool = False # 是否按全局键名写入文件 + def extract_i18n_strings(node): i18n_strings = [] - if ( - isinstance(node, ast.Call) - and isinstance(node.func, ast.Name) - and node.func.id == "i18n" - ): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "i18n": for arg in node.args: if isinstance(arg, ast.Str): i18n_strings.append(arg.s) @@ -28,6 +25,7 @@ def extract_i18n_strings(node): return i18n_strings + def scan_i18n_strings(): """ scan the directory for all .py files (recursively) @@ -43,7 +41,7 @@ def scan_i18n_strings(): if "I18nAuto" in code: tree = ast.parse(code) i18n_strings = extract_i18n_strings(tree) - print(f"{filename.ljust(KEY_LEN*3//2)}: {len(i18n_strings)}") + print(f"{filename.ljust(KEY_LEN * 3 // 2)}: {len(i18n_strings)}") if SHOW_KEYS: print("\n".join([s for s in i18n_strings])) strings.extend(i18n_strings) @@ -51,9 +49,10 @@ def scan_i18n_strings(): print(f"\033[31m[Failed] Error occur at {filename}: {e}\033[0m") code_keys = set(strings) - print(f"{'Total Unique'.ljust(KEY_LEN*3//2)}: {len(code_keys)}") + print(f"{'Total Unique'.ljust(KEY_LEN * 3 // 2)}: {len(code_keys)}") return code_keys + def update_i18n_json(json_file, standard_keys): standard_keys = sorted(standard_keys) print(f" Process {json_file} ".center(TITLE_LEN, "=")) @@ -89,8 +88,10 @@ def update_i18n_json(json_file, standard_keys): sorted( json_data.items(), key=lambda x: ( - list(standard_keys).index(x[0]) if x[0] in standard_keys and not x[1].startswith('#!') else len(json_data), - ) + list(standard_keys).index(x[0]) + if x[0] in standard_keys and not x[1].startswith("#!") + else len(json_data), + ), ) ) # 打印处理后的 JSON 条目数 @@ -111,21 +112,26 @@ def update_i18n_json(json_file, standard_keys): # 打印是否有重复的值 for value, keys in duplicate_items.items(): if len(keys) > 1: - print("\n".join([f"\033[31m{'[Failed] Duplicate Value'.ljust(KEY_LEN)}: {key} -> {value}\033[0m" for key in keys])) + print( + "\n".join( + [f"\033[31m{'[Failed] Duplicate Value'.ljust(KEY_LEN)}: {key} -> {value}\033[0m" for key in keys] + ) + ) if num_miss_translation > 0: print(f"\033[31m{'[Failed] Missing Translation'.ljust(KEY_LEN)}: {num_miss_translation}\033[0m") else: - print(f"\033[32m[Passed] All Keys Translated\033[0m") + print("\033[32m[Passed] All Keys Translated\033[0m") # 将处理后的结果写入 JSON 文件 with open(json_file, "w", encoding="utf-8") as f: json.dump(json_data, f, ensure_ascii=False, indent=4, sort_keys=SORT_KEYS) f.write("\n") - print(f" Updated {json_file} ".center(TITLE_LEN, "=") + '\n') + print(f" Updated {json_file} ".center(TITLE_LEN, "=") + "\n") + if __name__ == "__main__": code_keys = scan_i18n_strings() for json_file in os.listdir(I18N_JSON_DIR): if json_file.endswith(r".json"): json_file = os.path.join(I18N_JSON_DIR, json_file) - update_i18n_json(json_file, code_keys) \ No newline at end of file + update_i18n_json(json_file, code_keys) diff --git a/tools/my_utils.py b/tools/my_utils.py index 3369248..44d326e 100644 --- a/tools/my_utils.py +++ b/tools/my_utils.py @@ -1,10 +1,13 @@ -import platform,os,traceback +import os +import traceback import ffmpeg import numpy as np import gradio as gr from tools.i18n.i18n import I18nAuto import pandas as pd -i18n = I18nAuto(language=os.environ.get('language','Auto')) + +i18n = I18nAuto(language=os.environ.get("language", "Auto")) + def load_audio(file, sr): try: @@ -13,45 +16,49 @@ def load_audio(file, sr): # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 if os.path.exists(file) == False: - raise RuntimeError( - "You input a wrong audio path that does not exists, please fix it!" - ) + raise RuntimeError("You input a wrong audio path that does not exists, please fix it!") out, _ = ( ffmpeg.input(file, threads=0) .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) - except Exception as e: + except Exception: traceback.print_exc() raise RuntimeError(i18n("音频加载失败")) return np.frombuffer(out, np.float32).flatten() -def clean_path(path_str:str): - if path_str.endswith(('\\','/')): +def clean_path(path_str: str): + if path_str.endswith(("\\", "/")): return clean_path(path_str[0:-1]) - path_str = path_str.replace('/', os.sep).replace('\\', os.sep) - return path_str.strip(" \'\n\"\u202a")#path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") + path_str = path_str.replace("/", os.sep).replace("\\", os.sep) + return path_str.strip( + " '\n\"\u202a" + ) # path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a") -def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False): - files_status=[] +def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False): + files_status = [] if is_train == True and file_list: - file_list.append(os.path.join(file_list[0],'2-name2text.txt')) - file_list.append(os.path.join(file_list[0],'3-bert')) - file_list.append(os.path.join(file_list[0],'4-cnhubert')) - file_list.append(os.path.join(file_list[0],'5-wav32k')) - file_list.append(os.path.join(file_list[0],'6-name2semantic.tsv')) + file_list.append(os.path.join(file_list[0], "2-name2text.txt")) + file_list.append(os.path.join(file_list[0], "3-bert")) + file_list.append(os.path.join(file_list[0], "4-cnhubert")) + file_list.append(os.path.join(file_list[0], "5-wav32k")) + file_list.append(os.path.join(file_list[0], "6-name2semantic.tsv")) for file in file_list: - if os.path.exists(file):files_status.append(True) - else:files_status.append(False) - if sum(files_status)!=len(files_status): + if os.path.exists(file): + files_status.append(True) + else: + files_status.append(False) + if sum(files_status) != len(files_status): if is_train: - for file,status in zip(file_list,files_status): - if status:pass - else:gr.Warning(file) - gr.Warning(i18n('以下文件或文件夹不存在')) + for file, status in zip(file_list, files_status): + if status: + pass + else: + gr.Warning(file) + gr.Warning(i18n("以下文件或文件夹不存在")) return False elif is_dataset_processing: if files_status[0]: @@ -60,56 +67,63 @@ def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing gr.Warning(file_list[0]) elif not files_status[1] and file_list[1]: gr.Warning(file_list[1]) - gr.Warning(i18n('以下文件或文件夹不存在')) + gr.Warning(i18n("以下文件或文件夹不存在")) return False else: if file_list[0]: gr.Warning(file_list[0]) - gr.Warning(i18n('以下文件或文件夹不存在')) + gr.Warning(i18n("以下文件或文件夹不存在")) else: - gr.Warning(i18n('路径不能为空')) + gr.Warning(i18n("路径不能为空")) return False return True -def check_details(path_list=None,is_train=False,is_dataset_processing=False): + +def check_details(path_list=None, is_train=False, is_dataset_processing=False): if is_dataset_processing: list_path, audio_path = path_list - if (not list_path.endswith('.list')): - gr.Warning(i18n('请填入正确的List路径')) + if not list_path.endswith(".list"): + gr.Warning(i18n("请填入正确的List路径")) return if audio_path: if not os.path.isdir(audio_path): - gr.Warning(i18n('请填入正确的音频文件夹路径')) + gr.Warning(i18n("请填入正确的音频文件夹路径")) return - with open(list_path,"r",encoding="utf8")as f: - line=f.readline().strip("\n").split("\n") + with open(list_path, "r", encoding="utf8") as f: + line = f.readline().strip("\n").split("\n") wav_name, _, __, ___ = line[0].split("|") - wav_name=clean_path(wav_name) - if (audio_path != "" and audio_path != None): + wav_name = clean_path(wav_name) + if audio_path != "" and audio_path != None: wav_name = os.path.basename(wav_name) - wav_path = "%s/%s"%(audio_path, wav_name) + wav_path = "%s/%s" % (audio_path, wav_name) else: - wav_path=wav_name + wav_path = wav_name if os.path.exists(wav_path): ... else: - gr.Warning(i18n('路径错误')) + gr.Warning(i18n("路径错误")) return if is_train: - path_list.append(os.path.join(path_list[0],'2-name2text.txt')) - path_list.append(os.path.join(path_list[0],'4-cnhubert')) - path_list.append(os.path.join(path_list[0],'5-wav32k')) - path_list.append(os.path.join(path_list[0],'6-name2semantic.tsv')) + path_list.append(os.path.join(path_list[0], "2-name2text.txt")) + path_list.append(os.path.join(path_list[0], "4-cnhubert")) + path_list.append(os.path.join(path_list[0], "5-wav32k")) + path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv")) phone_path, hubert_path, wav_path, semantic_path = path_list[1:] - with open(phone_path,'r',encoding='utf-8') as f: - if f.read(1):... - else:gr.Warning(i18n('缺少音素数据集')) - if os.listdir(hubert_path):... - else:gr.Warning(i18n('缺少Hubert数据集')) - if os.listdir(wav_path):... - else:gr.Warning(i18n('缺少音频数据集')) - df = pd.read_csv( - semantic_path, delimiter="\t", encoding="utf-8" - ) - if len(df) >= 1:... - else:gr.Warning(i18n('缺少语义数据集')) + with open(phone_path, "r", encoding="utf-8") as f: + if f.read(1): + ... + else: + gr.Warning(i18n("缺少音素数据集")) + if os.listdir(hubert_path): + ... + else: + gr.Warning(i18n("缺少Hubert数据集")) + if os.listdir(wav_path): + ... + else: + gr.Warning(i18n("缺少音频数据集")) + df = pd.read_csv(semantic_path, delimiter="\t", encoding="utf-8") + if len(df) >= 1: + ... + else: + gr.Warning(i18n("缺少语义数据集")) diff --git a/tools/slice_audio.py b/tools/slice_audio.py index 8a06292..66fafa9 100644 --- a/tools/slice_audio.py +++ b/tools/slice_audio.py @@ -1,30 +1,34 @@ -import os,sys,numpy as np +import os +import sys +import numpy as np import traceback from scipy.io import wavfile + # parent_directory = os.path.dirname(os.path.abspath(__file__)) # sys.path.append(parent_directory) from tools.my_utils import load_audio from slicer2 import Slicer -def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): - os.makedirs(opt_root,exist_ok=True) + +def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part): + os.makedirs(opt_root, exist_ok=True) if os.path.isfile(inp): - input=[inp] + input = [inp] elif os.path.isdir(inp): - input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] + input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] else: return "输入路径存在但既不是文件也不是文件夹" slicer = Slicer( sr=32000, # 长音频采样率 - threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 - min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 - min_interval= int(min_interval), # 最短切割间隔 - hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) - max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 + threshold=int(threshold), # 音量小于这个值视作静音的备选切割点 + min_length=int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 + min_interval=int(min_interval), # 最短切割间隔 + hop_size=int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) + max_sil_kept=int(max_sil_kept), # 切完后静音最多留多长 ) - _max=float(_max) - alpha=float(alpha) - for inp_path in input[int(i_part)::int(all_part)]: + _max = float(_max) + alpha = float(alpha) + for inp_path in input[int(i_part) :: int(all_part)]: # print(inp_path) try: name = os.path.basename(inp_path) @@ -32,7 +36,8 @@ def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_ # print(audio.shape) for chunk, start, end in slicer.slice(audio): # start和end是帧数 tmp_max = np.abs(chunk).max() - if(tmp_max>1):chunk/=tmp_max + if tmp_max > 1: + chunk /= tmp_max chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk wavfile.write( "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), @@ -41,8 +46,8 @@ def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_ (chunk * 32767).astype(np.int16), ) except: - print(inp_path,"->fail->",traceback.format_exc()) + print(inp_path, "->fail->", traceback.format_exc()) return "执行完毕,请检查输出文件" -print(slice(*sys.argv[1:])) +print(slice(*sys.argv[1:])) diff --git a/tools/slicer2.py b/tools/slicer2.py index ba6794b..8d80f1b 100644 --- a/tools/slicer2.py +++ b/tools/slicer2.py @@ -46,13 +46,9 @@ class Slicer: max_sil_kept: int = 5000, ): if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) + raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size") if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) + raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size") min_interval = sr * min_interval / 1000 self.threshold = 10 ** (threshold / 20.0) self.hop_size = round(sr * hop_size / 1000) @@ -63,13 +59,9 @@ class Slicer: def _apply_slice(self, waveform, begin, end): if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] + return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)] else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] + return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)] # @timeit def slice(self, waveform): @@ -79,9 +71,7 @@ class Slicer: samples = waveform if samples.shape[0] <= self.min_length: return [waveform] - rms_list = get_rms( - y=samples, frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) + rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) sil_tags = [] silence_start = None clip_start = 0 @@ -97,10 +87,7 @@ class Slicer: continue # Clear recorded silence start if interval is not enough or clip is too short is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) + need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length if not is_leading_silence and not need_slice_middle: silence_start = None continue @@ -113,21 +100,10 @@ class Slicer: sil_tags.append((pos, pos)) clip_start = pos elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() + pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin() pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) + pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept if silence_start == 0: sil_tags.append((0, pos_r)) clip_start = pos_r @@ -135,17 +111,8 @@ class Slicer: sil_tags.append((min(pos_l, pos), max(pos_r, pos))) clip_start = max(pos_r, pos) else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) + pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept if silence_start == 0: sil_tags.append((0, pos_r)) else: @@ -154,28 +121,33 @@ class Slicer: silence_start = None # Deal with trailing silence. total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): + if silence_start is not None and total_frames - silence_start >= self.min_interval: silence_end = min(total_frames, silence_start + self.max_sil_kept) pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start sil_tags.append((pos, total_frames + 1)) # Apply and return slices. ####音频+起始时间+终止时间 if len(sil_tags) == 0: - return [[waveform,0,int(total_frames*self.hop_size)]] + return [[waveform, 0, int(total_frames * self.hop_size)]] else: chunks = [] if sil_tags[0][0] > 0: - chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)]) + chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)]) for i in range(len(sil_tags) - 1): chunks.append( - [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)] + [ + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), + int(sil_tags[i][1] * self.hop_size), + int(sil_tags[i + 1][0] * self.hop_size), + ] ) if sil_tags[-1][1] < total_frames: chunks.append( - [self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)] + [ + self._apply_slice(waveform, sil_tags[-1][1], total_frames), + int(sil_tags[-1][1] * self.hop_size), + int(total_frames * self.hop_size), + ] ) return chunks @@ -189,9 +161,7 @@ def main(): parser = ArgumentParser() parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument( - "--out", type=str, help="Output directory of the sliced audio clips" - ) + parser.add_argument("--out", type=str, help="Output directory of the sliced audio clips") parser.add_argument( "--db_thresh", type=float, @@ -249,8 +219,7 @@ def main(): soundfile.write( os.path.join( out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + "%s_%d.wav" % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), ), chunk, sr, diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py index 9ae6c7c..85877e8 100644 --- a/tools/subfix_webui.py +++ b/tools/subfix_webui.py @@ -1,13 +1,15 @@ -import argparse,os +import argparse +import os import copy import json -import os import uuid try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... + + analytics.version_check = lambda: None +except: + ... import librosa import gradio as gr @@ -33,15 +35,10 @@ def reload_data(index, batch): g_index = index global g_batch g_batch = batch - datas = g_data_json[index:index+batch] + datas = g_data_json[index : index + batch] output = [] for d in datas: - output.append( - { - g_json_key_text: d[g_json_key_text], - g_json_key_path: d[g_json_key_path] - } - ) + output.append({g_json_key_text: d[g_json_key_text], g_json_key_path: d[g_json_key_path]}) return output @@ -50,17 +47,13 @@ def b_change_index(index, batch): g_index, g_batch = index, batch datas = reload_data(index, batch) output = [] - for i , _ in enumerate(datas): + for i, _ in enumerate(datas): output.append( # gr.Textbox( # label=f"Text {i+index}", # value=_[g_json_key_text]#text # ) - { - "__type__":"update", - "label":f"Text {i+index}", - "value":_[g_json_key_text] - } + {"__type__": "update", "label": f"Text {i + index}", "value": _[g_json_key_text]} ) for _ in range(g_batch - len(datas)): output.append( @@ -68,11 +61,7 @@ def b_change_index(index, batch): # label=f"Text", # value="" # ) - { - "__type__": "update", - "label": f"Text", - "value": "" - } + {"__type__": "update", "label": "Text", "value": ""} ) for _ in datas: output.append(_[g_json_key_path]) @@ -86,7 +75,7 @@ def b_change_index(index, batch): def b_next_index(index, batch): b_save_file() if (index + batch) <= g_max_json_index: - return index + batch , *b_change_index(index + batch, batch) + return index + batch, *b_change_index(index + batch, batch) else: return index, *b_change_index(index, batch) @@ -94,7 +83,7 @@ def b_next_index(index, batch): def b_previous_index(index, batch): b_save_file() if (index - batch) >= 0: - return index - batch , *b_change_index(index - batch, batch) + return index - batch, *b_change_index(index - batch, batch) else: return 0, *b_change_index(0, batch) @@ -104,8 +93,8 @@ def b_submit_change(*text_list): change = False for i, new_text in enumerate(text_list): if g_index + i <= g_max_json_index: - new_text = new_text.strip()+' ' - if (g_data_json[g_index + i][g_json_key_text] != new_text): + new_text = new_text.strip() + " " + if g_data_json[g_index + i][g_json_key_text] != new_text: g_data_json[g_index + i][g_json_key_text] = new_text change = True if change: @@ -119,18 +108,22 @@ def b_delete_audio(*checkbox_list): change = False for i, checkbox in reversed(list(enumerate(checkbox_list))): if g_index + i < len(g_data_json): - if (checkbox == True): + if checkbox == True: g_data_json.pop(g_index + i) change = True - - g_max_json_index = len(g_data_json)-1 + + g_max_json_index = len(g_data_json) - 1 if g_index > g_max_json_index: g_index = g_max_json_index g_index = g_index if g_index >= 0 else 0 if change: b_save_file() # return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) - return {"value":g_index,"__type__":"update","maximum":(g_max_json_index if g_max_json_index>=0 else 0)},*b_change_index(g_index, g_batch) + return { + "value": g_index, + "__type__": "update", + "maximum": (g_max_json_index if g_max_json_index >= 0 else 0), + }, *b_change_index(g_index, g_batch) def b_invert_selection(*checkbox_list): @@ -143,18 +136,18 @@ def get_next_path(filename): base_name = os.path.splitext(os.path.basename(filename))[0] for i in range(100): new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") - if not os.path.exists(new_path) : + if not os.path.exists(new_path): return new_path - return os.path.join(base_dir, f'{str(uuid.uuid4())}.wav') + return os.path.join(base_dir, f"{str(uuid.uuid4())}.wav") def b_audio_split(audio_breakpoint, *checkbox_list): - global g_data_json , g_max_json_index + global g_data_json, g_max_json_index checked_index = [] for i, checkbox in enumerate(checkbox_list): - if (checkbox == True and g_index+i < len(g_data_json)): + if checkbox == True and g_index + i < len(g_data_json): checked_index.append(g_index + i) - if len(checked_index) == 1 : + if len(checked_index) == 1: index = checked_index[0] audio_json = copy.deepcopy(g_data_json[index]) path = audio_json[g_json_key_path] @@ -162,7 +155,7 @@ def b_audio_split(audio_breakpoint, *checkbox_list): audio_maxframe = len(data) break_frame = int(audio_breakpoint * sample_rate) - if (break_frame >= 1 and break_frame < audio_maxframe): + if break_frame >= 1 and break_frame < audio_maxframe: audio_first = data[0:break_frame] audio_second = data[break_frame:] nextpath = get_next_path(path) @@ -174,19 +167,20 @@ def b_audio_split(audio_breakpoint, *checkbox_list): g_max_json_index = len(g_data_json) - 1 # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch) + return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch) + def b_merge_audio(interval_r, *checkbox_list): - global g_data_json , g_max_json_index + global g_data_json, g_max_json_index b_save_file() checked_index = [] audios_path = [] audios_text = [] for i, checkbox in enumerate(checkbox_list): - if (checkbox == True and g_index+i < len(g_data_json)): + if checkbox == True and g_index + i < len(g_data_json): checked_index.append(g_index + i) - - if (len(checked_index)>1): + + if len(checked_index) > 1: for i in checked_index: audios_path.append(g_data_json[i][g_json_key_path]) audios_text.append(g_data_json[i][g_json_key_text]) @@ -202,7 +196,7 @@ def b_merge_audio(interval_r, *checkbox_list): for i, path in enumerate(audios_path): data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) l_sample_rate = sample_rate - if (i > 0): + if i > 0: silence = np.zeros(int(l_sample_rate * interval_r)) audio_list.append(silence) @@ -213,32 +207,32 @@ def b_merge_audio(interval_r, *checkbox_list): soundfile.write(base_path, audio_concat, l_sample_rate) b_save_file() - + g_max_json_index = len(g_data_json) - 1 - + # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch) + return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch) def b_save_json(): - with open(g_load_file,'w', encoding="utf-8") as file: + with open(g_load_file, "w", encoding="utf-8") as file: for data in g_data_json: - file.write(f'{json.dumps(data, ensure_ascii = False)}\n') + file.write(f"{json.dumps(data, ensure_ascii=False)}\n") def b_save_list(): - with open(g_load_file,'w', encoding="utf-8") as file: + with open(g_load_file, "w", encoding="utf-8") as file: for data in g_data_json: wav_path = data["wav_path"] speaker_name = data["speaker_name"] language = data["language"] text = data["text"] - file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') + file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip() + "\n") def b_load_json(): global g_data_json, g_max_json_index - with open(g_load_file, 'r', encoding="utf-8") as file: + with open(g_load_file, "r", encoding="utf-8") as file: g_data_json = file.readlines() g_data_json = [json.loads(line) for line in g_data_json] g_max_json_index = len(g_data_json) - 1 @@ -246,19 +240,14 @@ def b_load_json(): def b_load_list(): global g_data_json, g_max_json_index - with open(g_load_file, 'r', encoding="utf-8") as source: + with open(g_load_file, "r", encoding="utf-8") as source: data_list = source.readlines() for _ in data_list: - data = _.split('|') - if (len(data) == 4): + data = _.split("|") + if len(data) == 4: wav_path, speaker_name, language, text = data g_data_json.append( - { - 'wav_path':wav_path, - 'speaker_name':speaker_name, - 'language':language, - 'text':text.strip() - } + {"wav_path": wav_path, "speaker_name": speaker_name, "language": language, "text": text.strip()} ) else: print("error line:", data) @@ -283,17 +272,17 @@ def set_global(load_json, load_list, json_key_text, json_key_path, batch): global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch g_batch = int(batch) - - if (load_json != "None"): + + if load_json != "None": g_load_format = "json" g_load_file = load_json - elif (load_list != "None"): + elif load_list != "None": g_load_format = "list" g_load_file = load_list else: g_load_format = "list" g_load_file = "demo.list" - + g_json_key_text = json_key_text g_json_key_path = json_key_path @@ -301,21 +290,20 @@ def set_global(load_json, load_list, json_key_text, json_key_path, batch): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Process some integers.') - parser.add_argument('--load_json', default="None", help='source file, like demo.json') - parser.add_argument('--is_share', default="False", help='whether webui is_share=True') - parser.add_argument('--load_list', default="None", help='source file, like demo.list') - parser.add_argument('--webui_port_subfix', default=9871, help='source file, like demo.list') - parser.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text') - parser.add_argument('--json_key_path', default="wav_path", help='the path key name in json, Default: wav_path') - parser.add_argument('--g_batch', default=10, help='max number g_batch wav to display, Default: 10') + parser = argparse.ArgumentParser(description="Process some integers.") + parser.add_argument("--load_json", default="None", help="source file, like demo.json") + parser.add_argument("--is_share", default="False", help="whether webui is_share=True") + parser.add_argument("--load_list", default="None", help="source file, like demo.list") + parser.add_argument("--webui_port_subfix", default=9871, help="source file, like demo.list") + parser.add_argument("--json_key_text", default="text", help="the text key name in json, Default: text") + parser.add_argument("--json_key_path", default="wav_path", help="the path key name in json, Default: wav_path") + parser.add_argument("--g_batch", default=10, help="max number g_batch wav to display, Default: 10") args = parser.parse_args() set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch) - - with gr.Blocks() as demo: + with gr.Blocks() as demo: with gr.Row(): btn_change_index = gr.Button("Change Index") btn_submit_change = gr.Button("Submit Text") @@ -323,79 +311,50 @@ if __name__ == "__main__": btn_delete_audio = gr.Button("Delete Audio") btn_previous_index = gr.Button("Previous Index") btn_next_index = gr.Button("Next Index") - + with gr.Row(): - index_slider = gr.Slider( - minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3 - ) + index_slider = gr.Slider(minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3) splitpoint_slider = gr.Slider( - minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3 + minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3 ) btn_audio_split = gr.Button("Split Audio", scale=1) btn_save_json = gr.Button("Save File", visible=True, scale=1) btn_invert_selection = gr.Button("Invert Selection", scale=1) - + with gr.Row(): with gr.Column(): - for _ in range(0,g_batch): + for _ in range(0, g_batch): with gr.Row(): - text = gr.Textbox( - label = "Text", - visible = True, - scale=5 - ) - audio_output = gr.Audio( - label="Output Audio", - visible = True, - scale=5 - ) - audio_check = gr.Checkbox( - label="Yes", - show_label = True, - info = "Choose Audio", - scale=1 - ) + text = gr.Textbox(label="Text", visible=True, scale=5) + audio_output = gr.Audio(label="Output Audio", visible=True, scale=5) + audio_check = gr.Checkbox(label="Yes", show_label=True, info="Choose Audio", scale=1) g_text_list.append(text) g_audio_list.append(audio_output) g_checkbox_list.append(audio_check) - - with gr.Row(): batchsize_slider = gr.Slider( - minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False - ) - interval_slider = gr.Slider( - minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3 + minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False ) + interval_slider = gr.Slider(minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3) btn_theme_dark = gr.Button("Light Theme", link="?__theme=light", scale=1) btn_theme_light = gr.Button("Dark Theme", link="?__theme=dark", scale=1) - + btn_change_index.click( b_change_index, inputs=[ index_slider, batchsize_slider, ], - outputs=[ - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[*g_text_list, *g_audio_list, *g_checkbox_list], ) - btn_submit_change.click( b_submit_change, inputs=[ *g_text_list, ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_previous_index.click( @@ -404,82 +363,39 @@ if __name__ == "__main__": index_slider, batchsize_slider, ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) - + btn_next_index.click( b_next_index, inputs=[ index_slider, batchsize_slider, ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_delete_audio.click( b_delete_audio, - inputs=[ - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] + inputs=[*g_checkbox_list], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_merge_audio.click( b_merge_audio, - inputs=[ - interval_slider, - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] + inputs=[interval_slider, *g_checkbox_list], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) btn_audio_split.click( b_audio_split, - inputs=[ - splitpoint_slider, - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] + inputs=[splitpoint_slider, *g_checkbox_list], + outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list], ) - btn_invert_selection.click( - b_invert_selection, - inputs=[ - *g_checkbox_list - ], - outputs=[ - *g_checkbox_list - ] - ) + btn_invert_selection.click(b_invert_selection, inputs=[*g_checkbox_list], outputs=[*g_checkbox_list]) - btn_save_json.click( - b_save_file - ) + btn_save_json.click(b_save_file) demo.load( b_change_index, @@ -487,17 +403,13 @@ if __name__ == "__main__": index_slider, batchsize_slider, ], - outputs=[ - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], + outputs=[*g_text_list, *g_audio_list, *g_checkbox_list], ) - + demo.launch( server_name="0.0.0.0", inbrowser=True, quiet=True, share=eval(args.is_share), - server_port=int(args.webui_port_subfix) - ) \ No newline at end of file + server_port=int(args.webui_port_subfix), + ) diff --git a/tools/uvr5/bs_roformer/attend.py b/tools/uvr5/bs_roformer/attend.py index 2e3555a..29dad18 100644 --- a/tools/uvr5/bs_roformer/attend.py +++ b/tools/uvr5/bs_roformer/attend.py @@ -7,23 +7,22 @@ import torch.nn.functional as F def exists(val): return val is not None + def default(v, d): return v if exists(v) else d + class Attend(nn.Module): - def __init__( - self, - dropout = 0., - flash = False, - scale = None - ): + def __init__(self, dropout=0.0, flash=False, scale=None): super().__init__() self.scale = scale self.dropout = dropout self.attn_dropout = nn.Dropout(dropout) self.flash = flash - assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above' + assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), ( + "in order to use flash attention, you must be using pytorch 2.0 or above" + ) def flash_attn(self, q, k, v): # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device @@ -34,7 +33,7 @@ class Attend(nn.Module): # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - return F.scaled_dot_product_attention(q, k, v,dropout_p = self.dropout if self.training else 0.) + return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0) def forward(self, q, k, v): """ @@ -54,7 +53,7 @@ class Attend(nn.Module): # similarity - sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale + sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale # attention @@ -63,6 +62,6 @@ class Attend(nn.Module): # aggregate values - out = einsum(f"b h i j, b h j d -> b h i d", attn, v) + out = einsum("b h i j, b h j d -> b h i d", attn, v) return out diff --git a/tools/uvr5/bs_roformer/bs_roformer.py b/tools/uvr5/bs_roformer/bs_roformer.py index 45c46d5..8c9d1f3 100644 --- a/tools/uvr5/bs_roformer/bs_roformer.py +++ b/tools/uvr5/bs_roformer/bs_roformer.py @@ -1,14 +1,14 @@ from functools import partial import torch -from torch import nn, einsum, Tensor +from torch import nn from torch.nn import Module, ModuleList import torch.nn.functional as F from bs_roformer.attend import Attend from torch.utils.checkpoint import checkpoint -from typing import Tuple, Optional, List, Callable +from typing import Tuple, Optional, Callable # from beartype.typing import Tuple, Optional, List, Callable # from beartype import beartype @@ -19,6 +19,7 @@ from einops.layers.torch import Rearrange # helper functions + def exists(val): return val is not None @@ -37,14 +38,15 @@ def unpack_one(t, ps, pattern): # norm + def l2norm(t): - return F.normalize(t, dim = -1, p = 2) + return F.normalize(t, dim=-1, p=2) class RMSNorm(Module): def __init__(self, dim): super().__init__() - self.scale = dim ** 0.5 + self.scale = dim**0.5 self.gamma = nn.Parameter(torch.ones(dim)) def forward(self, x): @@ -53,13 +55,9 @@ class RMSNorm(Module): # attention + class FeedForward(Module): - def __init__( - self, - dim, - mult=4, - dropout=0. - ): + def __init__(self, dim, mult=4, dropout=0.0): super().__init__() dim_inner = int(dim * mult) self.net = nn.Sequential( @@ -68,7 +66,7 @@ class FeedForward(Module): nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), - nn.Dropout(dropout) + nn.Dropout(dropout), ) def forward(self, x): @@ -76,18 +74,10 @@ class FeedForward(Module): class Attention(Module): - def __init__( - self, - dim, - heads=8, - dim_head=64, - dropout=0., - rotary_embed=None, - flash=True - ): + def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True): super().__init__() self.heads = heads - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 dim_inner = heads * dim_head self.rotary_embed = rotary_embed @@ -99,15 +89,12 @@ class Attention(Module): self.to_gates = nn.Linear(dim, heads) - self.to_out = nn.Sequential( - nn.Linear(dim_inner, dim, bias=False), - nn.Dropout(dropout) - ) + self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)) def forward(self, x): x = self.norm(x) - q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads) + q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads) if exists(self.rotary_embed): q = self.rotary_embed.rotate_queries_or_keys(q) @@ -116,9 +103,9 @@ class Attention(Module): out = self.attend(q, k, v) gates = self.to_gates(x) - out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid() + out = out * rearrange(gates, "b n h -> b h n 1").sigmoid() - out = rearrange(out, 'b h n d -> b n (h d)') + out = rearrange(out, "b h n d -> b n (h d)") return self.to_out(out) @@ -128,42 +115,22 @@ class LinearAttention(Module): """ # @beartype - def __init__( - self, - *, - dim, - dim_head=32, - heads=8, - scale=8, - flash=False, - dropout=0. - ): + def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0): super().__init__() dim_inner = dim_head * heads self.norm = RMSNorm(dim) self.to_qkv = nn.Sequential( - nn.Linear(dim, dim_inner * 3, bias=False), - Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads) + nn.Linear(dim, dim_inner * 3, bias=False), Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads) ) self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) - self.attend = Attend( - scale=scale, - dropout=dropout, - flash=flash - ) + self.attend = Attend(scale=scale, dropout=dropout, flash=flash) - self.to_out = nn.Sequential( - Rearrange('b h d n -> b n (h d)'), - nn.Linear(dim_inner, dim, bias=False) - ) + self.to_out = nn.Sequential(Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)) - def forward( - self, - x - ): + def forward(self, x): x = self.norm(x) q, k, v = self.to_qkv(x) @@ -178,19 +145,19 @@ class LinearAttention(Module): class Transformer(Module): def __init__( - self, - *, - dim, - depth, - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - ff_mult=4, - norm_output=True, - rotary_embed=None, - flash_attn=True, - linear_attn=False + self, + *, + dim, + depth, + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + ff_mult=4, + norm_output=True, + rotary_embed=None, + flash_attn=True, + linear_attn=False, ): super().__init__() self.layers = ModuleList([]) @@ -199,18 +166,20 @@ class Transformer(Module): if linear_attn: attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn) else: - attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, - rotary_embed=rotary_embed, flash=flash_attn) + attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + rotary_embed=rotary_embed, + flash=flash_attn, + ) - self.layers.append(ModuleList([ - attn, - FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - ])) + self.layers.append(ModuleList([attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)])) self.norm = RMSNorm(dim) if norm_output else nn.Identity() def forward(self, x): - for attn, ff in self.layers: x = attn(x) + x x = ff(x) + x @@ -220,22 +189,16 @@ class Transformer(Module): # bandsplit module + class BandSplit(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...] - ): + def __init__(self, dim, dim_inputs: Tuple[int, ...]): super().__init__() self.dim_inputs = dim_inputs self.to_features = ModuleList([]) for dim_in in dim_inputs: - net = nn.Sequential( - RMSNorm(dim_in), - nn.Linear(dim_in, dim) - ) + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) self.to_features.append(net) @@ -250,13 +213,7 @@ class BandSplit(Module): return torch.stack(outs, dim=-2) -def MLP( - dim_in, - dim_out, - dim_hidden=None, - depth=1, - activation=nn.Tanh -): +def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh): dim_hidden = default(dim_hidden, dim_in) net = [] @@ -277,13 +234,7 @@ def MLP( class MaskEstimator(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...], - depth, - mlp_expansion_factor=4 - ): + def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4): super().__init__() self.dim_inputs = dim_inputs self.to_freqs = ModuleList([]) @@ -292,10 +243,7 @@ class MaskEstimator(Module): for dim_in in dim_inputs: net = [] - mlp = nn.Sequential( - MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), - nn.GLU(dim=-1) - ) + mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)) self.to_freqs.append(mlp) @@ -314,53 +262,106 @@ class MaskEstimator(Module): # main class DEFAULT_FREQS_PER_BANDS = ( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 12, 12, 12, 12, 12, 12, 12, 12, - 24, 24, 24, 24, 24, 24, 24, 24, - 48, 48, 48, 48, 48, 48, 48, 48, - 128, 129, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 12, + 12, + 12, + 12, + 12, + 12, + 12, + 12, + 24, + 24, + 24, + 24, + 24, + 24, + 24, + 24, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 128, + 129, ) class BSRoformer(Module): - # @beartype def __init__( - self, - dim, - *, - depth, - stereo=False, - num_stems=1, - time_transformer_depth=2, - freq_transformer_depth=2, - linear_transformer_depth=0, - freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS, - # in the paper, they divide into ~60 bands, test with 1 for starters - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - flash_attn=True, - dim_freqs_in=1025, - stft_n_fft=2048, - stft_hop_length=512, - # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction - stft_win_length=2048, - stft_normalized=False, - stft_window_fn: Optional[Callable] = None, - mask_estimator_depth=2, - multi_stft_resolution_loss_weight=1., - multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), - multi_stft_hop_size=147, - multi_stft_normalized=False, - multi_stft_window_fn: Callable = torch.hann_window, - mlp_expansion_factor=4, - use_torch_checkpoint=False, - skip_connection=False, + self, + dim, + *, + depth, + stereo=False, + num_stems=1, + time_transformer_depth=2, + freq_transformer_depth=2, + linear_transformer_depth=0, + freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS, + # in the paper, they divide into ~60 bands, test with 1 for starters + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + flash_attn=True, + dim_freqs_in=1025, + stft_n_fft=2048, + stft_hop_length=512, + # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction + stft_win_length=2048, + stft_normalized=False, + stft_window_fn: Optional[Callable] = None, + mask_estimator_depth=2, + multi_stft_resolution_loss_weight=1.0, + multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), + multi_stft_hop_size=147, + multi_stft_normalized=False, + multi_stft_window_fn: Callable = torch.hann_window, + mlp_expansion_factor=4, + use_torch_checkpoint=False, + skip_connection=False, ): super().__init__() @@ -379,7 +380,7 @@ class BSRoformer(Module): attn_dropout=attn_dropout, ff_dropout=ff_dropout, flash_attn=flash_attn, - norm_output=False + norm_output=False, ) time_rotary_embed = RotaryEmbedding(dim=dim_head) @@ -400,26 +401,23 @@ class BSRoformer(Module): self.final_norm = RMSNorm(dim) self.stft_kwargs = dict( - n_fft=stft_n_fft, - hop_length=stft_hop_length, - win_length=stft_win_length, - normalized=stft_normalized + n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized ) self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length) - freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_win_length), return_complex=True).shape[1] + freqs = torch.stft( + torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_win_length), return_complex=True + ).shape[1] assert len(freqs_per_bands) > 1 - assert sum( - freqs_per_bands) == freqs, f'the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}' + assert sum(freqs_per_bands) == freqs, ( + f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}" + ) freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands) - self.band_split = BandSplit( - dim=dim, - dim_inputs=freqs_per_bands_with_complex - ) + self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex) self.mask_estimators = nn.ModuleList([]) @@ -440,17 +438,9 @@ class BSRoformer(Module): self.multi_stft_n_fft = stft_n_fft self.multi_stft_window_fn = multi_stft_window_fn - self.multi_stft_kwargs = dict( - hop_length=multi_stft_hop_size, - normalized=multi_stft_normalized - ) + self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized) - def forward( - self, - raw_audio, - target=None, - return_loss_breakdown=False - ): + def forward(self, raw_audio, target=None, return_loss_breakdown=False): """ einops @@ -469,14 +459,16 @@ class BSRoformer(Module): x_is_mps = True if device.type == "mps" else False if raw_audio.ndim == 2: - raw_audio = rearrange(raw_audio, 'b t -> b 1 t') + raw_audio = rearrange(raw_audio, "b t -> b 1 t") channels = raw_audio.shape[1] - assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)' + assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), ( + "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)" + ) # to stft - raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t') + raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t") stft_window = self.stft_window_fn(device=device) @@ -485,16 +477,21 @@ class BSRoformer(Module): try: stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True) except: - stft_repr = torch.stft(raw_audio.cpu() if x_is_mps else raw_audio, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=True).to(device) + stft_repr = torch.stft( + raw_audio.cpu() if x_is_mps else raw_audio, + **self.stft_kwargs, + window=stft_window.cpu() if x_is_mps else stft_window, + return_complex=True, + ).to(device) stft_repr = torch.view_as_real(stft_repr) - stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c') + stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c") # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting - stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c') + stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c") - x = rearrange(stft_repr, 'b f t c -> b t (f c)') + x = rearrange(stft_repr, "b f t c -> b t (f c)") if self.use_torch_checkpoint: x = checkpoint(self.band_split, x, use_reentrant=False) @@ -505,16 +502,15 @@ class BSRoformer(Module): store = [None] * len(self.layers) for i, transformer_block in enumerate(self.layers): - if len(transformer_block) == 3: linear_transformer, time_transformer, freq_transformer = transformer_block - x, ft_ps = pack([x], 'b * d') + x, ft_ps = pack([x], "b * d") if self.use_torch_checkpoint: x = checkpoint(linear_transformer, x, use_reentrant=False) else: x = linear_transformer(x) - x, = unpack(x, ft_ps, 'b * d') + (x,) = unpack(x, ft_ps, "b * d") else: time_transformer, freq_transformer = transformer_block @@ -523,24 +519,24 @@ class BSRoformer(Module): for j in range(i): x = x + store[j] - x = rearrange(x, 'b t f d -> b f t d') - x, ps = pack([x], '* t d') + x = rearrange(x, "b t f d -> b f t d") + x, ps = pack([x], "* t d") if self.use_torch_checkpoint: x = checkpoint(time_transformer, x, use_reentrant=False) else: x = time_transformer(x) - x, = unpack(x, ps, '* t d') - x = rearrange(x, 'b f t d -> b t f d') - x, ps = pack([x], '* f d') + (x,) = unpack(x, ps, "* t d") + x = rearrange(x, "b f t d -> b t f d") + x, ps = pack([x], "* f d") if self.use_torch_checkpoint: x = checkpoint(freq_transformer, x, use_reentrant=False) else: x = freq_transformer(x) - x, = unpack(x, ps, '* f d') + (x,) = unpack(x, ps, "* f d") if self.skip_connection: store[i] = x @@ -553,11 +549,11 @@ class BSRoformer(Module): mask = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators], dim=1) else: mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) - mask = rearrange(mask, 'b n t (f c) -> b n f t c', c=2) + mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2) # modulate frequency representation - stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c') + stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c") # complex number multiplication @@ -568,18 +564,26 @@ class BSRoformer(Module): # istft - stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels) + stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels) # same as torch.stft() fix for MacOS MPS above try: - recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=raw_audio.shape[-1]) + recon_audio = torch.istft( + stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=raw_audio.shape[-1] + ) except: - recon_audio = torch.istft(stft_repr.cpu() if x_is_mps else stft_repr, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=False, length=raw_audio.shape[-1]).to(device) + recon_audio = torch.istft( + stft_repr.cpu() if x_is_mps else stft_repr, + **self.stft_kwargs, + window=stft_window.cpu() if x_is_mps else stft_window, + return_complex=False, + length=raw_audio.shape[-1], + ).to(device) - recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', s=self.audio_channels, n=num_stems) + recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems) if num_stems == 1: - recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t') + recon_audio = rearrange(recon_audio, "b 1 s t -> b s t") # if a target is passed in, calculate loss for learning @@ -590,13 +594,13 @@ class BSRoformer(Module): assert target.ndim == 4 and target.shape[1] == self.num_stems if target.ndim == 2: - target = rearrange(target, '... t -> ... 1 t') + target = rearrange(target, "... t -> ... 1 t") - target = target[..., :recon_audio.shape[-1]] # protect against lost length on istft + target = target[..., : recon_audio.shape[-1]] # protect against lost length on istft loss = F.l1_loss(recon_audio, target) - multi_stft_resolution_loss = 0. + multi_stft_resolution_loss = 0.0 for window_size in self.multi_stft_resolutions_window_sizes: res_stft_kwargs = dict( @@ -607,8 +611,8 @@ class BSRoformer(Module): **self.multi_stft_kwargs, ) - recon_Y = torch.stft(rearrange(recon_audio, '... s t -> (... s) t'), **res_stft_kwargs) - target_Y = torch.stft(rearrange(target, '... s t -> (... s) t'), **res_stft_kwargs) + recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs) + target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs) multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y) @@ -619,4 +623,4 @@ class BSRoformer(Module): if not return_loss_breakdown: return total_loss - return total_loss, (loss, multi_stft_resolution_loss) \ No newline at end of file + return total_loss, (loss, multi_stft_resolution_loss) diff --git a/tools/uvr5/bs_roformer/mel_band_roformer.py b/tools/uvr5/bs_roformer/mel_band_roformer.py index 7b6c485..78a3904 100644 --- a/tools/uvr5/bs_roformer/mel_band_roformer.py +++ b/tools/uvr5/bs_roformer/mel_band_roformer.py @@ -1,14 +1,14 @@ from functools import partial import torch -from torch import nn, einsum, Tensor +from torch import nn from torch.nn import Module, ModuleList import torch.nn.functional as F from bs_roformer.attend import Attend from torch.utils.checkpoint import checkpoint -from typing import Tuple, Optional, List, Callable +from typing import Tuple, Optional, Callable # from beartype.typing import Tuple, Optional, List, Callable # from beartype import beartype @@ -22,6 +22,7 @@ from librosa import filters # helper functions + def exists(val): return val is not None @@ -38,9 +39,9 @@ def unpack_one(t, ps, pattern): return unpack(t, ps, pattern)[0] -def pad_at_dim(t, pad, dim=-1, value=0.): - dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1) - zeros = ((0, 0) * dims_from_right) +def pad_at_dim(t, pad, dim=-1, value=0.0): + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right return F.pad(t, (*zeros, *pad), value=value) @@ -50,10 +51,11 @@ def l2norm(t): # norm + class RMSNorm(Module): def __init__(self, dim): super().__init__() - self.scale = dim ** 0.5 + self.scale = dim**0.5 self.gamma = nn.Parameter(torch.ones(dim)) def forward(self, x): @@ -62,13 +64,9 @@ class RMSNorm(Module): # attention + class FeedForward(Module): - def __init__( - self, - dim, - mult=4, - dropout=0. - ): + def __init__(self, dim, mult=4, dropout=0.0): super().__init__() dim_inner = int(dim * mult) self.net = nn.Sequential( @@ -77,7 +75,7 @@ class FeedForward(Module): nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), - nn.Dropout(dropout) + nn.Dropout(dropout), ) def forward(self, x): @@ -85,18 +83,10 @@ class FeedForward(Module): class Attention(Module): - def __init__( - self, - dim, - heads=8, - dim_head=64, - dropout=0., - rotary_embed=None, - flash=True - ): + def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True): super().__init__() self.heads = heads - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 dim_inner = heads * dim_head self.rotary_embed = rotary_embed @@ -108,15 +98,12 @@ class Attention(Module): self.to_gates = nn.Linear(dim, heads) - self.to_out = nn.Sequential( - nn.Linear(dim_inner, dim, bias=False), - nn.Dropout(dropout) - ) + self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)) def forward(self, x): x = self.norm(x) - q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads) + q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads) if exists(self.rotary_embed): q = self.rotary_embed.rotate_queries_or_keys(q) @@ -125,9 +112,9 @@ class Attention(Module): out = self.attend(q, k, v) gates = self.to_gates(x) - out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid() + out = out * rearrange(gates, "b n h -> b h n 1").sigmoid() - out = rearrange(out, 'b h n d -> b n (h d)') + out = rearrange(out, "b h n d -> b n (h d)") return self.to_out(out) @@ -137,42 +124,22 @@ class LinearAttention(Module): """ # @beartype - def __init__( - self, - *, - dim, - dim_head=32, - heads=8, - scale=8, - flash=False, - dropout=0. - ): + def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0): super().__init__() dim_inner = dim_head * heads self.norm = RMSNorm(dim) self.to_qkv = nn.Sequential( - nn.Linear(dim, dim_inner * 3, bias=False), - Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads) + nn.Linear(dim, dim_inner * 3, bias=False), Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads) ) self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) - self.attend = Attend( - scale=scale, - dropout=dropout, - flash=flash - ) + self.attend = Attend(scale=scale, dropout=dropout, flash=flash) - self.to_out = nn.Sequential( - Rearrange('b h d n -> b n (h d)'), - nn.Linear(dim_inner, dim, bias=False) - ) + self.to_out = nn.Sequential(Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)) - def forward( - self, - x - ): + def forward(self, x): x = self.norm(x) q, k, v = self.to_qkv(x) @@ -187,19 +154,19 @@ class LinearAttention(Module): class Transformer(Module): def __init__( - self, - *, - dim, - depth, - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - ff_mult=4, - norm_output=True, - rotary_embed=None, - flash_attn=True, - linear_attn=False + self, + *, + dim, + depth, + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + ff_mult=4, + norm_output=True, + rotary_embed=None, + flash_attn=True, + linear_attn=False, ): super().__init__() self.layers = ModuleList([]) @@ -208,18 +175,20 @@ class Transformer(Module): if linear_attn: attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn) else: - attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, - rotary_embed=rotary_embed, flash=flash_attn) + attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + rotary_embed=rotary_embed, + flash=flash_attn, + ) - self.layers.append(ModuleList([ - attn, - FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - ])) + self.layers.append(ModuleList([attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)])) self.norm = RMSNorm(dim) if norm_output else nn.Identity() def forward(self, x): - for attn, ff in self.layers: x = attn(x) + x x = ff(x) + x @@ -229,22 +198,16 @@ class Transformer(Module): # bandsplit module + class BandSplit(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...] - ): + def __init__(self, dim, dim_inputs: Tuple[int, ...]): super().__init__() self.dim_inputs = dim_inputs self.to_features = ModuleList([]) for dim_in in dim_inputs: - net = nn.Sequential( - RMSNorm(dim_in), - nn.Linear(dim_in, dim) - ) + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) self.to_features.append(net) @@ -259,13 +222,7 @@ class BandSplit(Module): return torch.stack(outs, dim=-2) -def MLP( - dim_in, - dim_out, - dim_hidden=None, - depth=1, - activation=nn.Tanh -): +def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh): dim_hidden = default(dim_hidden, dim_in) net = [] @@ -286,13 +243,7 @@ def MLP( class MaskEstimator(Module): # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...], - depth, - mlp_expansion_factor=4 - ): + def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4): super().__init__() self.dim_inputs = dim_inputs self.to_freqs = ModuleList([]) @@ -301,10 +252,7 @@ class MaskEstimator(Module): for dim_in in dim_inputs: net = [] - mlp = nn.Sequential( - MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), - nn.GLU(dim=-1) - ) + mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)) self.to_freqs.append(mlp) @@ -322,43 +270,43 @@ class MaskEstimator(Module): # main class -class MelBandRoformer(Module): +class MelBandRoformer(Module): # @beartype def __init__( - self, - dim, - *, - depth, - stereo=False, - num_stems=1, - time_transformer_depth=2, - freq_transformer_depth=2, - linear_transformer_depth=0, - num_bands=60, - dim_head=64, - heads=8, - attn_dropout=0.1, - ff_dropout=0.1, - flash_attn=True, - dim_freqs_in=1025, - sample_rate=44100, # needed for mel filter bank from librosa - stft_n_fft=2048, - stft_hop_length=512, - # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction - stft_win_length=2048, - stft_normalized=False, - stft_window_fn: Optional[Callable] = None, - mask_estimator_depth=1, - multi_stft_resolution_loss_weight=1., - multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), - multi_stft_hop_size=147, - multi_stft_normalized=False, - multi_stft_window_fn: Callable = torch.hann_window, - match_input_audio_length=False, # if True, pad output tensor to match length of input tensor - mlp_expansion_factor=4, - use_torch_checkpoint=False, - skip_connection=False, + self, + dim, + *, + depth, + stereo=False, + num_stems=1, + time_transformer_depth=2, + freq_transformer_depth=2, + linear_transformer_depth=0, + num_bands=60, + dim_head=64, + heads=8, + attn_dropout=0.1, + ff_dropout=0.1, + flash_attn=True, + dim_freqs_in=1025, + sample_rate=44100, # needed for mel filter bank from librosa + stft_n_fft=2048, + stft_hop_length=512, + # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction + stft_win_length=2048, + stft_normalized=False, + stft_window_fn: Optional[Callable] = None, + mask_estimator_depth=1, + multi_stft_resolution_loss_weight=1.0, + multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256), + multi_stft_hop_size=147, + multi_stft_normalized=False, + multi_stft_window_fn: Callable = torch.hann_window, + match_input_audio_length=False, # if True, pad output tensor to match length of input tensor + mlp_expansion_factor=4, + use_torch_checkpoint=False, + skip_connection=False, ): super().__init__() @@ -376,7 +324,7 @@ class MelBandRoformer(Module): dim_head=dim_head, attn_dropout=attn_dropout, ff_dropout=ff_dropout, - flash_attn=flash_attn + flash_attn=flash_attn, ) time_rotary_embed = RotaryEmbedding(dim=dim_head) @@ -397,13 +345,12 @@ class MelBandRoformer(Module): self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length) self.stft_kwargs = dict( - n_fft=stft_n_fft, - hop_length=stft_hop_length, - win_length=stft_win_length, - normalized=stft_normalized + n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized ) - freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_n_fft), return_complex=True).shape[1] + freqs = torch.stft( + torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_n_fft), return_complex=True + ).shape[1] # create mel filter bank # with librosa.filters.mel as in section 2 of paper @@ -414,43 +361,40 @@ class MelBandRoformer(Module): # for some reason, it doesn't include the first freq? just force a value for now - mel_filter_bank[0][0] = 1. + mel_filter_bank[0][0] = 1.0 # In some systems/envs we get 0.0 instead of ~1.9e-18 in the last position, # so let's force a positive value - mel_filter_bank[-1, -1] = 1. + mel_filter_bank[-1, -1] = 1.0 # binary as in paper (then estimated masks are averaged for overlapping regions) freqs_per_band = mel_filter_bank > 0 - assert freqs_per_band.any(dim=0).all(), 'all frequencies need to be covered by all bands for now' + assert freqs_per_band.any(dim=0).all(), "all frequencies need to be covered by all bands for now" - repeated_freq_indices = repeat(torch.arange(freqs), 'f -> b f', b=num_bands) + repeated_freq_indices = repeat(torch.arange(freqs), "f -> b f", b=num_bands) freq_indices = repeated_freq_indices[freqs_per_band] if stereo: - freq_indices = repeat(freq_indices, 'f -> f s', s=2) + freq_indices = repeat(freq_indices, "f -> f s", s=2) freq_indices = freq_indices * 2 + torch.arange(2) - freq_indices = rearrange(freq_indices, 'f s -> (f s)') + freq_indices = rearrange(freq_indices, "f s -> (f s)") - self.register_buffer('freq_indices', freq_indices, persistent=False) - self.register_buffer('freqs_per_band', freqs_per_band, persistent=False) + self.register_buffer("freq_indices", freq_indices, persistent=False) + self.register_buffer("freqs_per_band", freqs_per_band, persistent=False) - num_freqs_per_band = reduce(freqs_per_band, 'b f -> b', 'sum') - num_bands_per_freq = reduce(freqs_per_band, 'b f -> f', 'sum') + num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum") + num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum") - self.register_buffer('num_freqs_per_band', num_freqs_per_band, persistent=False) - self.register_buffer('num_bands_per_freq', num_bands_per_freq, persistent=False) + self.register_buffer("num_freqs_per_band", num_freqs_per_band, persistent=False) + self.register_buffer("num_bands_per_freq", num_bands_per_freq, persistent=False) # band split and mask estimator freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in num_freqs_per_band.tolist()) - self.band_split = BandSplit( - dim=dim, - dim_inputs=freqs_per_bands_with_complex - ) + self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex) self.mask_estimators = nn.ModuleList([]) @@ -471,19 +415,11 @@ class MelBandRoformer(Module): self.multi_stft_n_fft = stft_n_fft self.multi_stft_window_fn = multi_stft_window_fn - self.multi_stft_kwargs = dict( - hop_length=multi_stft_hop_size, - normalized=multi_stft_normalized - ) + self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized) self.match_input_audio_length = match_input_audio_length - def forward( - self, - raw_audio, - target=None, - return_loss_breakdown=False - ): + def forward(self, raw_audio, target=None, return_loss_breakdown=False): """ einops @@ -499,28 +435,29 @@ class MelBandRoformer(Module): device = raw_audio.device if raw_audio.ndim == 2: - raw_audio = rearrange(raw_audio, 'b t -> b 1 t') + raw_audio = rearrange(raw_audio, "b t -> b 1 t") batch, channels, raw_audio_length = raw_audio.shape istft_length = raw_audio_length if self.match_input_audio_length else None - assert (not self.stereo and channels == 1) or ( - self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)' + assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), ( + "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)" + ) # to stft - raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t') + raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t") stft_window = self.stft_window_fn(device=device) stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True) stft_repr = torch.view_as_real(stft_repr) - stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c') + stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c") # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting - stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c') + stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c") # index out all frequencies for all frequency ranges across bands ascending in one go @@ -532,7 +469,7 @@ class MelBandRoformer(Module): # fold the complex (real and imag) into the frequencies dimension - x = rearrange(x, 'b f t c -> b t (f c)') + x = rearrange(x, "b f t c -> b t (f c)") if self.use_torch_checkpoint: x = checkpoint(self.band_split, x, use_reentrant=False) @@ -543,16 +480,15 @@ class MelBandRoformer(Module): store = [None] * len(self.layers) for i, transformer_block in enumerate(self.layers): - if len(transformer_block) == 3: linear_transformer, time_transformer, freq_transformer = transformer_block - x, ft_ps = pack([x], 'b * d') + x, ft_ps = pack([x], "b * d") if self.use_torch_checkpoint: x = checkpoint(linear_transformer, x, use_reentrant=False) else: x = linear_transformer(x) - x, = unpack(x, ft_ps, 'b * d') + (x,) = unpack(x, ft_ps, "b * d") else: time_transformer, freq_transformer = transformer_block @@ -561,24 +497,24 @@ class MelBandRoformer(Module): for j in range(i): x = x + store[j] - x = rearrange(x, 'b t f d -> b f t d') - x, ps = pack([x], '* t d') + x = rearrange(x, "b t f d -> b f t d") + x, ps = pack([x], "* t d") if self.use_torch_checkpoint: x = checkpoint(time_transformer, x, use_reentrant=False) else: x = time_transformer(x) - x, = unpack(x, ps, '* t d') - x = rearrange(x, 'b f t d -> b t f d') - x, ps = pack([x], '* f d') + (x,) = unpack(x, ps, "* t d") + x = rearrange(x, "b f t d -> b t f d") + x, ps = pack([x], "* f d") if self.use_torch_checkpoint: x = checkpoint(freq_transformer, x, use_reentrant=False) else: x = freq_transformer(x) - x, = unpack(x, ps, '* f d') + (x,) = unpack(x, ps, "* f d") if self.skip_connection: store[i] = x @@ -588,11 +524,11 @@ class MelBandRoformer(Module): masks = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators], dim=1) else: masks = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) - masks = rearrange(masks, 'b n t (f c) -> b n f t c', c=2) + masks = rearrange(masks, "b n t (f c) -> b n f t c", c=2) # modulate frequency representation - stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c') + stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c") # complex number multiplication @@ -603,12 +539,12 @@ class MelBandRoformer(Module): # need to average the estimated mask for the overlapped frequencies - scatter_indices = repeat(self.freq_indices, 'f -> b n f t', b=batch, n=num_stems, t=stft_repr.shape[-1]) + scatter_indices = repeat(self.freq_indices, "f -> b n f t", b=batch, n=num_stems, t=stft_repr.shape[-1]) - stft_repr_expanded_stems = repeat(stft_repr, 'b 1 ... -> b n ...', n=num_stems) + stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=num_stems) masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(2, scatter_indices, masks) - denom = repeat(self.num_bands_per_freq, 'f -> (f r) 1', r=channels) + denom = repeat(self.num_bands_per_freq, "f -> (f r) 1", r=channels) masks_averaged = masks_summed / denom.clamp(min=1e-8) @@ -618,15 +554,16 @@ class MelBandRoformer(Module): # istft - stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels) + stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels) - recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, - length=istft_length) + recon_audio = torch.istft( + stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=istft_length + ) - recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', b=batch, s=self.audio_channels, n=num_stems) + recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", b=batch, s=self.audio_channels, n=num_stems) if num_stems == 1: - recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t') + recon_audio = rearrange(recon_audio, "b 1 s t -> b s t") # if a target is passed in, calculate loss for learning @@ -637,13 +574,13 @@ class MelBandRoformer(Module): assert target.ndim == 4 and target.shape[1] == self.num_stems if target.ndim == 2: - target = rearrange(target, '... t -> ... 1 t') + target = rearrange(target, "... t -> ... 1 t") - target = target[..., :recon_audio.shape[-1]] # protect against lost length on istft + target = target[..., : recon_audio.shape[-1]] # protect against lost length on istft loss = F.l1_loss(recon_audio, target) - multi_stft_resolution_loss = 0. + multi_stft_resolution_loss = 0.0 for window_size in self.multi_stft_resolutions_window_sizes: res_stft_kwargs = dict( @@ -654,8 +591,8 @@ class MelBandRoformer(Module): **self.multi_stft_kwargs, ) - recon_Y = torch.stft(rearrange(recon_audio, '... s t -> (... s) t'), **res_stft_kwargs) - target_Y = torch.stft(rearrange(target, '... s t -> (... s) t'), **res_stft_kwargs) + recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs) + target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs) multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y) diff --git a/tools/uvr5/bsroformer.py b/tools/uvr5/bsroformer.py index 9ac09a9..ddcbfa7 100644 --- a/tools/uvr5/bsroformer.py +++ b/tools/uvr5/bsroformer.py @@ -1,28 +1,31 @@ # This code is modified from https://github.com/ZFTurbo/ -import librosa -from tqdm import tqdm import os -import torch +import warnings + +import librosa import numpy as np import soundfile as sf +import torch import torch.nn as nn import yaml -import warnings +from tqdm import tqdm + warnings.filterwarnings("ignore") class Roformer_Loader: def get_config(self, config_path): - with open(config_path, 'r', encoding='utf-8') as f: + with open(config_path, "r", encoding="utf-8") as f: # use fullloader to load tag !!python/tuple, code can be improved config = yaml.load(f, Loader=yaml.FullLoader) return config def get_default_config(self): default_config = None - if self.model_type == 'bs_roformer': + if self.model_type == "bs_roformer": # Use model_bs_roformer_ep_368_sdr_12.9628.yaml and model_bs_roformer_ep_317_sdr_12.9755.yaml as default configuration files # Other BS_Roformer models may not be compatible + # fmt: off default_config = { "audio": {"chunk_size": 352800, "sample_rate": 44100}, "model": { @@ -51,9 +54,10 @@ class Roformer_Loader: "multi_stft_normalized": False, }, "training": {"instruments": ["vocals", "other"], "target_instrument": "vocals"}, - "inference": {"batch_size": 2, "num_overlap": 2} + "inference": {"batch_size": 2, "num_overlap": 2}, } - elif self.model_type == 'mel_band_roformer': + # fmt: on + elif self.model_type == "mel_band_roformer": # Use model_mel_band_roformer_ep_3005_sdr_11.4360.yaml as default configuration files # Other Mel_Band_Roformer models may not be compatible default_config = { @@ -82,29 +86,30 @@ class Roformer_Loader: "multi_stft_resolution_loss_weight": 1.0, "multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256), "multi_stft_hop_size": 147, - "multi_stft_normalized": False + "multi_stft_normalized": False, }, "training": {"instruments": ["vocals", "other"], "target_instrument": "vocals"}, - "inference": {"batch_size": 2, "num_overlap": 2} + "inference": {"batch_size": 2, "num_overlap": 2}, } + return default_config - def get_model_from_config(self): - if self.model_type == 'bs_roformer': + if self.model_type == "bs_roformer": from bs_roformer.bs_roformer import BSRoformer + model = BSRoformer(**dict(self.config["model"])) - elif self.model_type == 'mel_band_roformer': + elif self.model_type == "mel_band_roformer": from bs_roformer.mel_band_roformer import MelBandRoformer + model = MelBandRoformer(**dict(self.config["model"])) else: - print('Error: Unknown model: {}'.format(self.model_type)) + print("Error: Unknown model: {}".format(self.model_type)) model = None return model - def demix_track(self, model, mix, device): - C = self.config["audio"]["chunk_size"] # chunk_size + C = self.config["audio"]["chunk_size"] # chunk_size N = self.config["inference"]["num_overlap"] fade_size = C // 10 step = int(C // N) @@ -116,7 +121,7 @@ class Roformer_Loader: # Do pad from the beginning and end to account floating window results better if length_init > 2 * border and (border > 0): - mix = nn.functional.pad(mix, (border, border), mode='reflect') + mix = nn.functional.pad(mix, (border, border), mode="reflect") # Prepare windows arrays (do 1 time for speed up). This trick repairs click problems on the edges of segment window_size = C @@ -125,17 +130,17 @@ class Roformer_Loader: window_start = torch.ones(window_size) window_middle = torch.ones(window_size) window_finish = torch.ones(window_size) - window_start[-fade_size:] *= fadeout # First audio chunk, no fadein - window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout + window_start[-fade_size:] *= fadeout # First audio chunk, no fadein + window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout window_middle[-fade_size:] *= fadeout window_middle[:fade_size] *= fadein - with torch.amp.autocast('cuda'): + with torch.amp.autocast("cuda"): with torch.inference_mode(): if self.config["training"]["target_instrument"] is None: req_shape = (len(self.config["training"]["instruments"]),) + tuple(mix.shape) else: - req_shape = (1, ) + tuple(mix.shape) + req_shape = (1,) + tuple(mix.shape) result = torch.zeros(req_shape, dtype=torch.float32) counter = torch.zeros(req_shape, dtype=torch.float32) @@ -143,15 +148,15 @@ class Roformer_Loader: batch_data = [] batch_locations = [] while i < mix.shape[1]: - part = mix[:, i:i + C].to(device) + part = mix[:, i : i + C].to(device) length = part.shape[-1] if length < C: if length > C // 2 + 1: - part = nn.functional.pad(input=part, pad=(0, C - length), mode='reflect') + part = nn.functional.pad(input=part, pad=(0, C - length), mode="reflect") else: - part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0) + part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode="constant", value=0) if self.is_half: - part=part.half() + part = part.half() batch_data.append(part) batch_locations.append((i, length)) i += step @@ -170,8 +175,8 @@ class Roformer_Loader: for j in range(len(batch_locations)): start, l = batch_locations[j] - result[..., start:start+l] += x[j][..., :l].cpu() * window[..., :l] - counter[..., start:start+l] += window[..., :l] + result[..., start : start + l] += x[j][..., :l].cpu() * window[..., :l] + counter[..., start : start + l] += window[..., :l] batch_data = [] batch_locations = [] @@ -191,7 +196,6 @@ class Roformer_Loader: else: return {k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources)} - def run_folder(self, input, vocal_root, others_root, format): self.model.eval() path = input @@ -200,20 +204,20 @@ class Roformer_Loader: file_base_name = os.path.splitext(os.path.basename(path))[0] sample_rate = 44100 - if 'sample_rate' in self.config["audio"]: - sample_rate = self.config["audio"]['sample_rate'] + if "sample_rate" in self.config["audio"]: + sample_rate = self.config["audio"]["sample_rate"] try: mix, sr = librosa.load(path, sr=sample_rate, mono=False) except Exception as e: - print('Can read track: {}'.format(path)) - print('Error message: {}'.format(str(e))) + print("Can read track: {}".format(path)) + print("Error message: {}".format(str(e))) return # in case if model only supports mono tracks isstereo = self.config["model"].get("stereo", True) if not isstereo and len(mix.shape) != 1: - mix = np.mean(mix, axis=0) # if more than 2 channels, take mean + mix = np.mean(mix, axis=0) # if more than 2 channels, take mean print("Warning: Track has more than 1 channels, but model is mono, taking mean of all channels.") mix_orig = mix.copy() @@ -226,7 +230,7 @@ class Roformer_Loader: # other instruments are caculated by subtracting target instrument from mixture target_instrument = self.config["training"]["target_instrument"] other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument] - other = mix_orig - res[target_instrument] # caculate other instruments + other = mix_orig - res[target_instrument] # caculate other instruments path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, target_instrument) path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other_instruments[0]) @@ -237,11 +241,10 @@ class Roformer_Loader: vocal_inst = self.config["training"]["instruments"][0] path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, vocal_inst) self.save_audio(path_vocal, res[vocal_inst].T, sr, format) - for other in self.config["training"]["instruments"][1:]: # save other instruments + for other in self.config["training"]["instruments"][1:]: # save other instruments path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other) self.save_audio(path_other, res[other].T, sr, format) - def save_audio(self, path, data, sr, format): # input path should be endwith '.wav' if format in ["wav", "flac"]: @@ -250,10 +253,11 @@ class Roformer_Loader: sf.write(path, data, sr) else: sf.write(path, data, sr) - os.system("ffmpeg -i \"{}\" -vn \"{}\" -q:a 2 -y".format(path, path[:-3] + format)) - try: os.remove(path) - except: pass - + os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format)) + try: + os.remove(path) + except: + pass def __init__(self, model_path, config_path, device, is_half): self.device = device @@ -270,7 +274,9 @@ class Roformer_Loader: if not os.path.exists(config_path): if self.model_type is None: # if model_type is still None, raise an error - raise ValueError("Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again.") + raise ValueError( + "Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again." + ) self.config = self.get_default_config() else: # if there is a configuration file @@ -289,12 +295,10 @@ class Roformer_Loader: state_dict = torch.load(model_path, map_location="cpu") model.load_state_dict(state_dict) - if(is_half==False): + if is_half == False: self.model = model.to(device) else: self.model = model.half().to(device) - def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False): self.run_folder(input, vocal_root, others_root, format) - diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py index 372db25..e109827 100644 --- a/tools/uvr5/mdxnet.py +++ b/tools/uvr5/mdxnet.py @@ -13,9 +13,7 @@ cpu = torch.device("cpu") class ConvTDFNetTrim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): + def __init__(self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024): super(ConvTDFNetTrim, self).__init__() self.dim_f = dim_f @@ -24,17 +22,13 @@ class ConvTDFNetTrim: self.hop = hop self.n_bins = self.n_fft // 2 + 1 self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) + self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device) self.target_name = target_name self.blender = "blender" in model_name self.dim_c = 4 out_c = self.dim_c * 4 if target_name == "*" else self.dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) + self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device) self.n = L // 2 @@ -50,28 +44,18 @@ class ConvTDFNetTrim: ) x = torch.view_as_real(x) x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, self.dim_c, self.n_bins, self.dim_t] - ) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, self.dim_c, self.n_bins, self.dim_t]) return x[:, :, : self.dim_f] def istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) + freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad x = torch.cat([x, freq_pad], -2) c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) + x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t]) x = x.permute([0, 2, 3, 1]) x = x.contiguous() x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) + x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True) return x.reshape([-1, c, self.chunk_size]) @@ -93,9 +77,7 @@ class Predictor: logger.info(ort.get_available_providers()) self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) + self.model_ = get_models(device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft) self.model = ort.InferenceSession( os.path.join(args.onnx, self.model_.target_name + ".onnx"), providers=[ @@ -152,9 +134,7 @@ class Predictor: trim = model.n_fft // 2 gen_size = model.chunk_size - 2 * trim pad = gen_size - n_sample % gen_size - mix_p = np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) + mix_p = np.concatenate((np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1) mix_waves = [] i = 0 while i < n_sample + pad: @@ -172,15 +152,8 @@ class Predictor: ) tar_waves = model.istft(torch.tensor(spec_pred)) else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) + tar_waves = model.istft(torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])) + tar_signal = tar_waves[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).numpy()[:, :-pad] start = 0 if mix == 0 else margin_size end = None if mix == list(mixes.keys())[::-1][0] else -margin_size @@ -207,9 +180,7 @@ class Predictor: sources = self.demix(mix.T) opt = sources[0].T if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) + sf.write("%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate) sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) else: path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) @@ -219,18 +190,14 @@ class Predictor: opt_path_vocal = path_vocal[:-4] + ".%s" % format opt_path_other = path_other[:-4] + ".%s" % format if os.path.exists(path_vocal): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal) - ) + os.system("ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)) if os.path.exists(opt_path_vocal): try: os.remove(path_vocal) except: pass if os.path.exists(path_other): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other) - ) + os.system("ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)) if os.path.exists(opt_path_other): try: os.remove(path_other) @@ -240,7 +207,7 @@ class Predictor: class MDXNetDereverb: def __init__(self, chunks): - self.onnx = "%s/uvr5_weights/onnx_dereverb_By_FoxJoy"%os.path.dirname(os.path.abspath(__file__)) + self.onnx = "%s/uvr5_weights/onnx_dereverb_By_FoxJoy" % os.path.dirname(os.path.abspath(__file__)) self.shifts = 10 # 'Predict with randomised equivariant stabilisation' self.mixing = "min_mag" # ['default','min_mag','max_mag'] self.chunks = chunks diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py index 640392a..4ca8a3b 100644 --- a/tools/uvr5/vr.py +++ b/tools/uvr5/vr.py @@ -1,6 +1,8 @@ -import os,sys +import os + parent_directory = os.path.dirname(os.path.abspath(__file__)) -import logging,pdb +import logging + logger = logging.getLogger(__name__) import librosa @@ -27,7 +29,7 @@ class AudioPre: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json"%parent_directory) + mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json" % parent_directory) model = Nets.CascadedASPPNet(mp.param["bins"] * 2) cpk = torch.load(model_path, map_location="cpu") model.load_state_dict(cpk) @@ -40,9 +42,7 @@ class AudioPre: self.mp = mp self.model = model - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False - ): + def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False): if ins_root is None and vocal_root is None: return "No save root." name = os.path.basename(music_file) @@ -61,19 +61,19 @@ class AudioPre: _, ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 music_file, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) if X_wave[d].ndim == 1: X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) else: # lower bands X_wave[d] = librosa.core.resample( X_wave[d + 1], - orig_sr = self.mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=self.mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) # Stft of wave source X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( @@ -89,9 +89,7 @@ class AudioPre: input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] + input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :] X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) aggresive_set = float(self.data["agg"] / 100) @@ -100,9 +98,7 @@ class AudioPre: "split_bin": self.mp.param["band"][1]["crop_stop"], } with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) + pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data) # Postprocess if self.data["postprocess"]: pred_inv = np.clip(X_mag - pred, 0, np.inf) @@ -111,13 +107,11 @@ class AudioPre: v_spec_m = X_spec_m - y_spec_m if is_hp3 == True: - ins_root,vocal_root = vocal_root,ins_root + ins_root, vocal_root = vocal_root, ins_root if ins_root is not None: if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], y_spec_m, input_high_end, self.mp) wav_instrument = spec_utils.cmb_spectrogram_to_wave( y_spec_m, self.mp, input_high_end_h, input_high_end_ ) @@ -138,9 +132,7 @@ class AudioPre: self.mp.param["sr"], ) # else: - path = os.path.join( - ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(ins_root, head + "{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_instrument) * 32768).astype("int16"), @@ -160,12 +152,8 @@ class AudioPre: else: head = "vocal_" if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], v_spec_m, input_high_end, self.mp) + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) else: wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) logger.info("%s vocals done" % name) @@ -179,9 +167,7 @@ class AudioPre: self.mp.param["sr"], ) else: - path = os.path.join( - vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_vocals) * 32768).astype("int16"), @@ -210,7 +196,7 @@ class AudioPreDeEcho: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json"%parent_directory) + mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json" % parent_directory) nout = 64 if "DeReverb" in model_path else 48 model = CascadedNet(mp.param["bins"] * 2, nout) cpk = torch.load(model_path, map_location="cpu") @@ -245,19 +231,19 @@ class AudioPreDeEcho: _, ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 music_file, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) if X_wave[d].ndim == 1: X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) else: # lower bands X_wave[d] = librosa.core.resample( X_wave[d + 1], - orig_sr = self.mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=self.mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) # Stft of wave source X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( @@ -273,9 +259,7 @@ class AudioPreDeEcho: input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] + input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :] X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) aggresive_set = float(self.data["agg"] / 100) @@ -284,9 +268,7 @@ class AudioPreDeEcho: "split_bin": self.mp.param["band"][1]["crop_stop"], } with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) + pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data) # Postprocess if self.data["postprocess"]: pred_inv = np.clip(X_mag - pred, 0, np.inf) @@ -296,9 +278,7 @@ class AudioPreDeEcho: if ins_root is not None: if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], y_spec_m, input_high_end, self.mp) wav_instrument = spec_utils.cmb_spectrogram_to_wave( y_spec_m, self.mp, input_high_end_h, input_high_end_ ) @@ -315,9 +295,7 @@ class AudioPreDeEcho: self.mp.param["sr"], ) # else: - path = os.path.join( - ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_instrument) * 32768).astype("int16"), @@ -333,12 +311,8 @@ class AudioPreDeEcho: pass if vocal_root is not None: if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) + input_high_end_ = spec_utils.mirroring(self.data["high_end_process"], v_spec_m, input_high_end, self.mp) + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) else: wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) logger.info("%s vocals done" % name) @@ -352,9 +326,7 @@ class AudioPreDeEcho: self.mp.param["sr"], ) else: - path = os.path.join( - vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) + path = os.path.join(vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])) sf.write( path, (np.array(wav_vocals) * 32768).astype("int16"), diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index ce52af5..cc826bf 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -1,13 +1,14 @@ import os -import traceback,gradio as gr +import traceback +import gradio as gr import logging from tools.i18n.i18n import I18nAuto from tools.my_utils import clean_path + i18n = I18nAuto() logger = logging.getLogger(__name__) -import librosa,ffmpeg -import soundfile as sf +import ffmpeg import torch import sys from mdxnet import MDXNetDereverb @@ -16,8 +17,10 @@ from bsroformer import Roformer_Loader try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... + + analytics.version_check = lambda: None +except: + ... weight_uvr5_root = "tools/uvr5/uvr5_weights" uvr5_names = [] @@ -25,21 +28,24 @@ for name in os.listdir(weight_uvr5_root): if name.endswith(".pth") or name.endswith(".ckpt") or "onnx" in name: uvr5_names.append(name.replace(".pth", "").replace(".ckpt", "")) -device=sys.argv[1] -is_half=eval(sys.argv[2]) -webui_port_uvr5=int(sys.argv[3]) -is_share=eval(sys.argv[4]) +device = sys.argv[1] +is_half = eval(sys.argv[2]) +webui_port_uvr5 = int(sys.argv[3]) +is_share = eval(sys.argv[4]) -def html_left(text, label='p'): + +def html_left(text, label="p"): return f"""
<{label} style="margin: 0; padding: 0;">{text}
""" -def html_center(text, label='p'): + +def html_center(text, label="p"): return f"""
<{label} style="margin: 0; padding: 0;">{text}
""" + def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] try: @@ -52,13 +58,15 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format elif "roformer" in model_name.lower(): func = Roformer_Loader pre_fun = func( - model_path = os.path.join(weight_uvr5_root, model_name + ".ckpt"), - config_path = os.path.join(weight_uvr5_root, model_name + ".yaml"), - device = device, - is_half=is_half + model_path=os.path.join(weight_uvr5_root, model_name + ".ckpt"), + config_path=os.path.join(weight_uvr5_root, model_name + ".yaml"), + device=device, + is_half=is_half, ) if not os.path.exists(os.path.join(weight_uvr5_root, model_name + ".yaml")): - infos.append("Warning: You are using a model without a configuration file. The program will automatically use the default configuration file. However, the default configuration file cannot guarantee that all models will run successfully. You can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again. (For example, the configuration file corresponding to the model 'bs_roformer_ep_368_sdr_12.9628.ckpt' should be 'bs_roformer_ep_368_sdr_12.9628.yaml'.) Or you can just ignore this warning.") + infos.append( + "Warning: You are using a model without a configuration file. The program will automatically use the default configuration file. However, the default configuration file cannot guarantee that all models will run successfully. You can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '.yaml' then try it again. (For example, the configuration file corresponding to the model 'bs_roformer_ep_368_sdr_12.9628.ckpt' should be 'bs_roformer_ep_368_sdr_12.9628.yaml'.) Or you can just ignore this warning." + ) yield "\n".join(infos) else: func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho @@ -74,19 +82,15 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format paths = [path.name for path in paths] for path in paths: inp_path = os.path.join(inp_root, path) - if(os.path.isfile(inp_path)==False):continue + if os.path.isfile(inp_path) == False: + continue need_reformat = 1 done = 0 try: info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): + if info["streams"][0]["channels"] == 2 and info["streams"][0]["sample_rate"] == "44100": need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0,is_hp3 - ) + pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal, format0, is_hp3) done = 1 except: need_reformat = 1 @@ -96,21 +100,15 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format os.path.join(os.environ["TEMP"]), os.path.basename(inp_path), ) - os.system( - f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y' - ) + os.system(f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y') inp_path = tmp_path try: if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0,is_hp3 - ) + pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal, format0, is_hp3) infos.append("%s->Success" % (os.path.basename(inp_path))) yield "\n".join(infos) except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) + infos.append("%s->%s" % (os.path.basename(inp_path), traceback.format_exc())) yield "\n".join(infos) except: infos.append(traceback.format_exc()) @@ -130,80 +128,98 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format torch.cuda.empty_cache() yield "\n".join(infos) + with gr.Blocks(title="UVR5 WebUI") as app: gr.Markdown( - value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Group(): - gr.Markdown(html_center(i18n("伴奏人声分离&去混响&去回声"),'h2')) + gr.Markdown(html_center(i18n("伴奏人声分离&去混响&去回声"), "h2")) with gr.Group(): - gr.Markdown( - value=html_left(i18n("人声伴奏分离批量处理, 使用UVR5模型。") + "
" + \ - i18n("合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。")+ "
" + \ - i18n("模型分为三类:") + "
" + \ - i18n("1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;") + "
" + \ - i18n("2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;") + "
" + \ - i18n("3、去混响、去延迟模型(by FoxJoy):") + "
  " + \ - i18n("(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;") + "
 " + \ - i18n("(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。") + "
" + \ - i18n("去混响/去延迟,附:") + "
" + \ - i18n("1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;") + "
" + \ - i18n("2、MDX-Net-Dereverb模型挺慢的;") + "
" + \ - i18n("3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"),'h4') - ) - with gr.Row(): - with gr.Column(): - model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) - dir_wav_input = gr.Textbox( - label=i18n("输入待处理音频文件夹路径"), - placeholder="C:\\Users\\Desktop\\todo-songs", - ) - wav_inputs = gr.File( - file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") - ) - with gr.Column(): - agg = gr.Slider( - minimum=0, - maximum=20, - step=1, - label=i18n("人声提取激进程度"), - value=10, - interactive=True, - visible=False, # 先不开放调整 - ) - opt_vocal_root = gr.Textbox( - label=i18n("指定输出主人声文件夹"), value="output/uvr5_opt" - ) - opt_ins_root = gr.Textbox( - label=i18n("指定输出非主人声文件夹"), value="output/uvr5_opt" - ) - format0 = gr.Radio( - label=i18n("导出文件格式"), - choices=["wav", "flac", "mp3", "m4a"], - value="flac", - interactive=True, - ) - with gr.Column(): - with gr.Row(): - but2 = gr.Button(i18n("转换"), variant="primary") - with gr.Row(): - vc_output4 = gr.Textbox(label=i18n("输出信息"),lines=3) - but2.click( - uvr, - [ - model_choose, - dir_wav_input, - opt_vocal_root, - wav_inputs, - opt_ins_root, - agg, - format0, - ], - [vc_output4], - api_name="uvr_convert", + gr.Markdown( + value=html_left( + i18n("人声伴奏分离批量处理, 使用UVR5模型。") + + "
" + + i18n( + "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。" ) -app.queue().launch(#concurrency_count=511, max_size=1022 + + "
" + + i18n("模型分为三类:") + + "
" + + i18n( + "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;" + ) + + "
" + + i18n("2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;") + + "
" + + i18n("3、去混响、去延迟模型(by FoxJoy):") + + "
  " + + i18n("(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;") + + "
 " + + i18n( + "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。" + ) + + "
" + + i18n("去混响/去延迟,附:") + + "
" + + i18n("1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;") + + "
" + + i18n("2、MDX-Net-Dereverb模型挺慢的;") + + "
" + + i18n("3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"), + "h4", + ) + ) + with gr.Row(): + with gr.Column(): + model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) + dir_wav_input = gr.Textbox( + label=i18n("输入待处理音频文件夹路径"), + placeholder="C:\\Users\\Desktop\\todo-songs", + ) + wav_inputs = gr.File( + file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") + ) + with gr.Column(): + agg = gr.Slider( + minimum=0, + maximum=20, + step=1, + label=i18n("人声提取激进程度"), + value=10, + interactive=True, + visible=False, # 先不开放调整 + ) + opt_vocal_root = gr.Textbox(label=i18n("指定输出主人声文件夹"), value="output/uvr5_opt") + opt_ins_root = gr.Textbox(label=i18n("指定输出非主人声文件夹"), value="output/uvr5_opt") + format0 = gr.Radio( + label=i18n("导出文件格式"), + choices=["wav", "flac", "mp3", "m4a"], + value="flac", + interactive=True, + ) + with gr.Column(): + with gr.Row(): + but2 = gr.Button(i18n("转换"), variant="primary") + with gr.Row(): + vc_output4 = gr.Textbox(label=i18n("输出信息"), lines=3) + but2.click( + uvr, + [ + model_choose, + dir_wav_input, + opt_vocal_root, + wav_inputs, + opt_ins_root, + agg, + format0, + ], + [vc_output4], + api_name="uvr_convert", + ) +app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share, diff --git a/webui.py b/webui.py index 41955a5..bdc9441 100644 --- a/webui.py +++ b/webui.py @@ -1,25 +1,36 @@ -import os,sys -if len(sys.argv)==1:sys.argv.append('v2') -version="v1"if sys.argv[1]=="v1" else"v2" -os.environ["version"]=version +import os +import sys + +if len(sys.argv) == 1: + sys.argv.append("v2") +version = "v1" if sys.argv[1] == "v1" else "v2" +os.environ["version"] = version now_dir = os.getcwd() sys.path.insert(0, now_dir) import warnings + warnings.filterwarnings("ignore") -import json,yaml,torch,pdb,re,shutil +import json import platform -import psutil +import re +import shutil import signal -os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO' + +import psutil +import torch +import yaml + +os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO" torch.manual_seed(233333) tmp = os.path.join(now_dir, "TEMP") os.makedirs(tmp, exist_ok=True) os.environ["TEMP"] = tmp -if(os.path.exists(tmp)): +if os.path.exists(tmp): for name in os.listdir(tmp): - if(name=="jieba.cache"):continue - path="%s/%s"%(tmp,name) - delete=os.remove if os.path.isfile(path) else shutil.rmtree + if name == "jieba.cache": + continue + path = "%s/%s" % (tmp, name) + delete = os.remove if os.path.isfile(path) else shutil.rmtree try: delete(path) except Exception as e: @@ -27,12 +38,14 @@ if(os.path.exists(tmp)): pass import site import traceback + site_packages_roots = [] for path in site.getsitepackages(): if "packages" in path: site_packages_roots.append(path) -if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] -#os.environ["OPENBLAS_NUM_THREADS"] = "4" +if site_packages_roots == []: + site_packages_roots = ["%s/runtime/Lib/site-packages" % now_dir] +# os.environ["OPENBLAS_NUM_THREADS"] = "4" os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" os.environ["all_proxy"] = "" for site_packages_root in site_packages_roots: @@ -45,29 +58,43 @@ for site_packages_root in site_packages_roots: % (now_dir, now_dir, now_dir, now_dir, now_dir, now_dir) ) break - except PermissionError as e: + except PermissionError: traceback.print_exc() -from tools import my_utils import shutil -import pdb import subprocess from subprocess import Popen -import signal -from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share + +from config import ( + exp_root, + infer_device, + is_half, + is_share, + python_exec, + webui_port_infer_tts, + webui_port_main, + webui_port_subfix, + webui_port_uvr5, +) +from tools import my_utils from tools.i18n.i18n import I18nAuto, scan_language_list -language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" -os.environ["language"]=language + +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" +os.environ["language"] = language i18n = I18nAuto(language=language) -from scipy.io import wavfile -from tools.my_utils import load_audio, check_for_existance, check_details from multiprocessing import cpu_count + +from tools.my_utils import check_details, check_for_existance + # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu try: import gradio.analytics as analytics - analytics.version_check = lambda:None -except:... + + analytics.version_check = lambda: None +except: + ... import gradio as gr -n_cpu=cpu_count() + +n_cpu = cpu_count() ngpu = torch.cuda.device_count() gpu_infos = [] @@ -75,25 +102,62 @@ mem = [] if_gpu_ok = False # 判断是否有能用来训练和加速推理的N卡 -ok_gpu_keywords={"10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060","H","600","506","507","508","509"} -set_gpu_numbers=set() +ok_gpu_keywords = { + "10", + "16", + "20", + "30", + "40", + "A2", + "A3", + "A4", + "P4", + "A50", + "500", + "A60", + "70", + "80", + "90", + "M4", + "T4", + "TITAN", + "L4", + "4060", + "H", + "600", + "506", + "507", + "508", + "509", +} +set_gpu_numbers = set() if torch.cuda.is_available() or ngpu != 0: for i in range(ngpu): gpu_name = torch.cuda.get_device_name(i) - if any(value in gpu_name.upper()for value in ok_gpu_keywords): + if any(value in gpu_name.upper() for value in ok_gpu_keywords): # A10#A100#V100#A40#P40#M40#K80#A4500 if_gpu_ok = True # 至少有一张能用的N卡 gpu_infos.append("%s\t%s" % (i, gpu_name)) set_gpu_numbers.add(i) - mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4)) + mem.append(int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)) # # 判断是否支持mps加速 # if torch.backends.mps.is_available(): # if_gpu_ok = True # gpu_infos.append("%s\t%s" % ("0", "Apple GPU")) # mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存 + def set_default(): - global default_batch_size,default_max_batch_size,gpu_info,default_sovits_epoch,default_sovits_save_every_epoch,max_sovits_epoch,max_sovits_save_every_epoch,default_batch_size_s1,if_force_ckpt + global \ + default_batch_size, \ + default_max_batch_size, \ + gpu_info, \ + default_sovits_epoch, \ + default_sovits_save_every_epoch, \ + max_sovits_epoch, \ + max_sovits_save_every_epoch, \ + default_batch_size_s1, \ + if_force_ckpt if_force_ckpt = False if if_gpu_ok and len(gpu_infos) > 0: gpu_info = "\n".join(gpu_infos) @@ -117,100 +181,139 @@ def set_default(): # minmem = 14 # except RuntimeError as _: # print("显存不足以开启V3训练") - default_batch_size = minmem // 2 if version!="v3"else minmem//8 - default_batch_size_s1=minmem // 2 + default_batch_size = minmem // 2 if version != "v3" else minmem // 8 + default_batch_size_s1 = minmem // 2 else: - gpu_info = ("%s\t%s" % ("0", "CPU")) + gpu_info = "%s\t%s" % ("0", "CPU") gpu_infos.append("%s\t%s" % ("0", "CPU")) set_gpu_numbers.add(0) - default_batch_size = default_batch_size_s1 = int(psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 4) - if version!="v3": - default_sovits_epoch=8 - default_sovits_save_every_epoch=4 - max_sovits_epoch=25#40 - max_sovits_save_every_epoch=25#10 + default_batch_size = default_batch_size_s1 = int(psutil.virtual_memory().total / 1024 / 1024 / 1024 / 4) + if version != "v3": + default_sovits_epoch = 8 + default_sovits_save_every_epoch = 4 + max_sovits_epoch = 25 # 40 + max_sovits_save_every_epoch = 25 # 10 else: - default_sovits_epoch=2 - default_sovits_save_every_epoch=1 - max_sovits_epoch=3#40 - max_sovits_save_every_epoch=3#10 + default_sovits_epoch = 2 + default_sovits_save_every_epoch = 1 + max_sovits_epoch = 3 # 40 + max_sovits_save_every_epoch = 3 # 10 default_batch_size = max(1, default_batch_size) default_batch_size_s1 = max(1, default_batch_size_s1) default_max_batch_size = default_batch_size * 3 + set_default() gpus = "-".join([i[0] for i in gpu_infos]) -default_gpu_numbers=str(sorted(list(set_gpu_numbers))[0]) -def fix_gpu_number(input):#将越界的number强制改到界内 +default_gpu_numbers = str(sorted(list(set_gpu_numbers))[0]) + + +def fix_gpu_number(input): # 将越界的number强制改到界内 try: - if(int(input)not in set_gpu_numbers):return default_gpu_numbers - except:return input + if int(input) not in set_gpu_numbers: + return default_gpu_numbers + except: + return input return input + + def fix_gpu_numbers(inputs): - output=[] + output = [] try: - for input in inputs.split(","):output.append(str(fix_gpu_number(input))) + for input in inputs.split(","): + output.append(str(fix_gpu_number(input))) return ",".join(output) except: return inputs -pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth","GPT_SoVITS/pretrained_models/s2Gv3.pth"] -pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"] -pretrained_model_list = (pretrained_sovits_name[int(version[-1])-1],pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D"),pretrained_gpt_name[int(version[-1])-1],"GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large","GPT_SoVITS/pretrained_models/chinese-hubert-base") +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] -_ = '' +pretrained_model_list = ( + pretrained_sovits_name[int(version[-1]) - 1], + pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + pretrained_gpt_name[int(version[-1]) - 1], + "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + "GPT_SoVITS/pretrained_models/chinese-hubert-base", +) + +_ = "" for i in pretrained_model_list: if "s2Dv3" not in i and os.path.exists(i) == False: - _ += f'\n {i}' + _ += f"\n {i}" if _: - print("warning: ", i18n('以下模型不存在:') + _) + print("warning: ", i18n("以下模型不存在:") + _) -_ = [[],[]] +_ = [[], []] for i in range(3): - if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i]) - else:_[0].append("")##没有下pretrained模型的,说不定他们是想自己从零训底模呢 - if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i]) - else:_[-1].append("") -pretrained_gpt_name,pretrained_sovits_name = _ + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + else: + _[0].append("") ##没有下pretrained模型的,说不定他们是想自己从零训底模呢 + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) + else: + _[-1].append("") +pretrained_gpt_name, pretrained_sovits_name = _ + +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3"] +for root in SoVITS_weight_root + GPT_weight_root: + os.makedirs(root, exist_ok=True) + -SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"] -GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"] -for root in SoVITS_weight_root+GPT_weight_root: - os.makedirs(root,exist_ok=True) def get_weights_names(): - SoVITS_names = [name for name in pretrained_sovits_name if name!=""] + SoVITS_names = [name for name in pretrained_sovits_name if name != ""] for path in SoVITS_weight_root: for name in os.listdir(path): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (path, name)) - GPT_names = [name for name in pretrained_gpt_name if name!=""] + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) + GPT_names = [name for name in pretrained_gpt_name if name != ""] for path in GPT_weight_root: for name in os.listdir(path): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (path, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) return SoVITS_names, GPT_names -SoVITS_names,GPT_names = get_weights_names() -for path in SoVITS_weight_root+GPT_weight_root: - os.makedirs(path,exist_ok=True) + +SoVITS_names, GPT_names = get_weights_names() +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) + def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts + def change_choices(): SoVITS_names, GPT_names = get_weights_names() - return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"} + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } + + +p_label = None +p_uvr5 = None +p_asr = None +p_denoise = None +p_tts_inference = None -p_label=None -p_uvr5=None -p_asr=None -p_denoise=None -p_tts_inference=None def kill_proc_tree(pid, including_parent=True): try: @@ -231,16 +334,20 @@ def kill_proc_tree(pid, including_parent=True): except OSError: pass -system=platform.system() + +system = platform.system() + + def kill_process(pid, process_name=""): - if(system=="Windows"): + if system == "Windows": cmd = "taskkill /t /f /pid %s" % pid # os.system(cmd) - subprocess.run(cmd,shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) else: kill_proc_tree(pid) print(process_name + i18n("进程已终止")) + def process_info(process_name="", indicator=""): if indicator == "opened": return process_name + i18n("已开启") @@ -263,298 +370,528 @@ def process_info(process_name="", indicator=""): else: return process_name + process_name_subfix = i18n("音频标注WebUI") + + def change_label(path_list): global p_label if p_label is None: check_for_existance([path_list]) path_list = my_utils.clean_path(path_list) - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share) - yield process_info(process_name_subfix, "opened"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} + cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % ( + python_exec, + path_list, + webui_port_subfix, + is_share, + ) + yield ( + process_info(process_name_subfix, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_label = Popen(cmd, shell=True) else: kill_process(p_label.pid, process_name_subfix) p_label = None - yield process_info(process_name_subfix, "closed"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} + yield ( + process_info(process_name_subfix, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + process_name_uvr5 = i18n("人声分离WebUI") + + def change_uvr5(): global p_uvr5 if p_uvr5 is None: - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share) - yield process_info(process_name_uvr5, "opened"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} + cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s' % (python_exec, infer_device, is_half, webui_port_uvr5, is_share) + yield ( + process_info(process_name_uvr5, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_uvr5 = Popen(cmd, shell=True) else: kill_process(p_uvr5.pid, process_name_uvr5) p_uvr5 = None - yield process_info(process_name_uvr5, "closed"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} + yield ( + process_info(process_name_uvr5, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + process_name_tts = i18n("TTS推理WebUI") -def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path, batched_infer_enabled): + + +def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled): global p_tts_inference if batched_infer_enabled: - cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language) + cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) else: - cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) + cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) # #####v3暂不支持加速推理 # if version=="v3": # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) if p_tts_inference is None: - os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path) - os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path) - os.environ["cnhubert_base_path"]=cnhubert_base_path - os.environ["bert_path"]=bert_path - os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_number(gpu_number) - os.environ["is_half"]=str(is_half) - os.environ["infer_ttswebui"]=str(webui_port_infer_tts) - os.environ["is_share"]=str(is_share) - yield process_info(process_name_tts, "opened"), {'__type__':'update','visible':False}, {'__type__':'update','visible':True} + os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) + os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path) + os.environ["cnhubert_base_path"] = cnhubert_base_path + os.environ["bert_path"] = bert_path + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_number(gpu_number) + os.environ["is_half"] = str(is_half) + os.environ["infer_ttswebui"] = str(webui_port_infer_tts) + os.environ["is_share"] = str(is_share) + yield ( + process_info(process_name_tts, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_tts_inference = Popen(cmd, shell=True) else: kill_process(p_tts_inference.pid, process_name_tts) p_tts_inference = None - yield process_info(process_name_tts, "closed"), {'__type__':'update','visible':True}, {'__type__':'update','visible':False} + yield ( + process_info(process_name_tts, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + from tools.asr.config import asr_dict process_name_asr = i18n("语音识别") + + def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_precision): global p_asr if p_asr is None: - asr_inp_dir=my_utils.clean_path(asr_inp_dir) - asr_opt_dir=my_utils.clean_path(asr_opt_dir) + asr_inp_dir = my_utils.clean_path(asr_inp_dir) + asr_opt_dir = my_utils.clean_path(asr_opt_dir) check_for_existance([asr_inp_dir]) cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' cmd += f' -i "{asr_inp_dir}"' cmd += f' -o "{asr_opt_dir}"' - cmd += f' -s {asr_model_size}' - cmd += f' -l {asr_lang}' + cmd += f" -s {asr_model_size}" + cmd += f" -l {asr_lang}" cmd += f" -p {asr_precision}" output_file_name = os.path.basename(asr_inp_dir) output_folder = asr_opt_dir or "output/asr_opt" - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - yield process_info(process_name_asr, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list") + yield ( + process_info(process_name_asr, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) print(cmd) p_asr = Popen(cmd, shell=True) p_asr.wait() p_asr = None - yield process_info(process_name_asr, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update", "value": output_file_path}, {"__type__": "update", "value": output_file_path}, {"__type__": "update", "value": asr_inp_dir} + yield ( + process_info(process_name_asr, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": output_file_path}, + {"__type__": "update", "value": output_file_path}, + {"__type__": "update", "value": asr_inp_dir}, + ) else: - yield process_info(process_name_asr, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_asr, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close_asr(): global p_asr if p_asr is not None: kill_process(p_asr.pid, process_name_asr) p_asr = None - return process_info(process_name_asr, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_asr, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + process_name_denoise = i18n("语音降噪") + + def open_denoise(denoise_inp_dir, denoise_opt_dir): global p_denoise - if(p_denoise==None): - denoise_inp_dir=my_utils.clean_path(denoise_inp_dir) - denoise_opt_dir=my_utils.clean_path(denoise_opt_dir) + if p_denoise == None: + denoise_inp_dir = my_utils.clean_path(denoise_inp_dir) + denoise_opt_dir = my_utils.clean_path(denoise_opt_dir) check_for_existance([denoise_inp_dir]) - cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32") + cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % ( + python_exec, + denoise_inp_dir, + denoise_opt_dir, + "float16" if is_half == True else "float32", + ) - yield process_info(process_name_denoise, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_denoise, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) print(cmd) p_denoise = Popen(cmd, shell=True) p_denoise.wait() - p_denoise=None - yield process_info(process_name_denoise, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update", "value": denoise_opt_dir}, {"__type__": "update", "value": denoise_opt_dir} + p_denoise = None + yield ( + process_info(process_name_denoise, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": denoise_opt_dir}, + {"__type__": "update", "value": denoise_opt_dir}, + ) else: - yield process_info(process_name_denoise, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_denoise, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close_denoise(): global p_denoise if p_denoise is not None: kill_process(p_denoise.pid, process_name_denoise) p_denoise = None - return process_info(process_name_denoise, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_denoise, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -p_train_SoVITS=None + +p_train_SoVITS = None process_name_sovits = i18n("SoVITS训练") -def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D,if_grad_ckpt,lora_rank): + + +def open1Ba( + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + if_grad_ckpt, + lora_rank, +): global p_train_SoVITS - if(p_train_SoVITS==None): - with open("GPT_SoVITS/configs/s2.json")as f: - data=f.read() - data=json.loads(data) - s2_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s2_%s"%(s2_dir,version),exist_ok=True) - if check_for_existance([s2_dir],is_train=True): - check_details([s2_dir],is_train=True) - if(is_half==False): - data["train"]["fp16_run"]=False - batch_size=max(1,batch_size//2) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["train"]["text_low_lr_rate"]=text_low_lr_rate - data["train"]["pretrained_s2G"]=pretrained_s2G - data["train"]["pretrained_s2D"]=pretrained_s2D - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["save_every_epoch"]=save_every_epoch - data["train"]["gpu_numbers"]=gpu_numbers1Ba - data["train"]["grad_ckpt"]=if_grad_ckpt - data["train"]["lora_rank"]=lora_rank - data["model"]["version"]=version - data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir - data["save_weight_dir"]=SoVITS_weight_root[int(version[-1])-1] - data["name"]=exp_name - data["version"]=version - tmp_config_path="%s/tmp_s2.json"%tmp - with open(tmp_config_path,"w")as f:f.write(json.dumps(data)) - if version in ["v1","v2"]: - cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path) + if p_train_SoVITS == None: + with open("GPT_SoVITS/configs/s2.json") as f: + data = f.read() + data = json.loads(data) + s2_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s2_%s" % (s2_dir, version), exist_ok=True) + if check_for_existance([s2_dir], is_train=True): + check_details([s2_dir], is_train=True) + if is_half == False: + data["train"]["fp16_run"] = False + batch_size = max(1, batch_size // 2) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["train"]["text_low_lr_rate"] = text_low_lr_rate + data["train"]["pretrained_s2G"] = pretrained_s2G + data["train"]["pretrained_s2D"] = pretrained_s2D + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["save_every_epoch"] = save_every_epoch + data["train"]["gpu_numbers"] = gpu_numbers1Ba + data["train"]["grad_ckpt"] = if_grad_ckpt + data["train"]["lora_rank"] = lora_rank + data["model"]["version"] = version + data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir + data["save_weight_dir"] = SoVITS_weight_root[int(version[-1]) - 1] + data["name"] = exp_name + data["version"] = version + tmp_config_path = "%s/tmp_s2.json" % tmp + with open(tmp_config_path, "w") as f: + f.write(json.dumps(data)) + if version in ["v1", "v2"]: + cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path) else: - cmd = '"%s" GPT_SoVITS/s2_train_v3_lora.py --config "%s"'%(python_exec,tmp_config_path) - yield process_info(process_name_sovits, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + cmd = '"%s" GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path) + yield ( + process_info(process_name_sovits, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_train_SoVITS = Popen(cmd, shell=True) p_train_SoVITS.wait() p_train_SoVITS = None - yield process_info(process_name_sovits, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_sovits, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_sovits, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_sovits, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1Ba(): global p_train_SoVITS if p_train_SoVITS is not None: kill_process(p_train_SoVITS.pid, process_name_sovits) p_train_SoVITS = None - return process_info(process_name_sovits, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_sovits, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -p_train_GPT=None + +p_train_GPT = None process_name_gpt = i18n("GPT训练") -def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1): + + +def open1Bb( + batch_size, + total_epoch, + exp_name, + if_dpo, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers, + pretrained_s1, +): global p_train_GPT - if(p_train_GPT==None): - with open("GPT_SoVITS/configs/s1longer.yaml"if version=="v1"else "GPT_SoVITS/configs/s1longer-v2.yaml")as f: - data=f.read() - data=yaml.load(data, Loader=yaml.FullLoader) - s1_dir="%s/%s"%(exp_root,exp_name) - os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True) - if check_for_existance([s1_dir],is_train=True): - check_details([s1_dir],is_train=True) - if(is_half==False): - data["train"]["precision"]="32" + if p_train_GPT == None: + with open( + "GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml" + ) as f: + data = f.read() + data = yaml.load(data, Loader=yaml.FullLoader) + s1_dir = "%s/%s" % (exp_root, exp_name) + os.makedirs("%s/logs_s1" % (s1_dir), exist_ok=True) + if check_for_existance([s1_dir], is_train=True): + check_details([s1_dir], is_train=True) + if is_half == False: + data["train"]["precision"] = "32" batch_size = max(1, batch_size // 2) - data["train"]["batch_size"]=batch_size - data["train"]["epochs"]=total_epoch - data["pretrained_s1"]=pretrained_s1 - data["train"]["save_every_n_epoch"]=save_every_epoch - data["train"]["if_save_every_weights"]=if_save_every_weights - data["train"]["if_save_latest"]=if_save_latest - data["train"]["if_dpo"]=if_dpo - data["train"]["half_weights_save_dir"]=GPT_weight_root[int(version[-1])-1] - data["train"]["exp_name"]=exp_name - data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir - data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir - data["output_dir"]="%s/logs_s1_%s"%(s1_dir,version) + data["train"]["batch_size"] = batch_size + data["train"]["epochs"] = total_epoch + data["pretrained_s1"] = pretrained_s1 + data["train"]["save_every_n_epoch"] = save_every_epoch + data["train"]["if_save_every_weights"] = if_save_every_weights + data["train"]["if_save_latest"] = if_save_latest + data["train"]["if_dpo"] = if_dpo + data["train"]["half_weights_save_dir"] = GPT_weight_root[int(version[-1]) - 1] + data["train"]["exp_name"] = exp_name + data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir + data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir + data["output_dir"] = "%s/logs_s1_%s" % (s1_dir, version) # data["version"]=version - os.environ["_CUDA_VISIBLE_DEVICES"]=fix_gpu_numbers(gpu_numbers.replace("-",",")) - os.environ["hz"]="25hz" - tmp_config_path="%s/tmp_s1.yaml"%tmp - with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False)) + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_numbers(gpu_numbers.replace("-", ",")) + os.environ["hz"] = "25hz" + tmp_config_path = "%s/tmp_s1.yaml" % tmp + with open(tmp_config_path, "w") as f: + f.write(yaml.dump(data, default_flow_style=False)) # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) - cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path) - yield process_info(process_name_gpt, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path) + yield ( + process_info(process_name_gpt, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) print(cmd) p_train_GPT = Popen(cmd, shell=True) p_train_GPT.wait() p_train_GPT = None - yield process_info(process_name_gpt, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_gpt, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_gpt, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_gpt, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1Bb(): global p_train_GPT if p_train_GPT is not None: kill_process(p_train_GPT.pid, process_name_gpt) p_train_GPT = None - return process_info(process_name_gpt, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_gpt, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps_slice=[] + +ps_slice = [] process_name_slice = i18n("语音切分") -def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts): + + +def open_slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, n_parts): global ps_slice inp = my_utils.clean_path(inp) opt_root = my_utils.clean_path(opt_root) check_for_existance([inp]) - if(os.path.exists(inp)==False): - yield i18n("输入路径不存在"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + if os.path.exists(inp) == False: + yield ( + i18n("输入路径不存在"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) return - if os.path.isfile(inp):n_parts=1 - elif os.path.isdir(inp):pass + if os.path.isfile(inp): + n_parts = 1 + elif os.path.isdir(inp): + pass else: - yield i18n("输入路径存在但不可用"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + i18n("输入路径存在但不可用"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) return - if (ps_slice == []): + if ps_slice == []: for i_part in range(n_parts): - cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts) + cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % ( + python_exec, + inp, + opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + i_part, + n_parts, + ) print(cmd) p = Popen(cmd, shell=True) ps_slice.append(p) - yield process_info(process_name_slice, "opened"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_slice, "opened"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) for p in ps_slice: p.wait() - ps_slice=[] - yield process_info(process_name_slice, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}, {"__type__": "update", "value": opt_root}, {"__type__": "update", "value": opt_root}, {"__type__": "update", "value": opt_root} + ps_slice = [] + yield ( + process_info(process_name_slice, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + {"__type__": "update", "value": opt_root}, + {"__type__": "update", "value": opt_root}, + {"__type__": "update", "value": opt_root}, + ) else: - yield process_info(process_name_slice, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"} + yield ( + process_info(process_name_slice, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, + {"__type__": "update"}, + ) + def close_slice(): global ps_slice - if (ps_slice != []): + if ps_slice != []: for p_slice in ps_slice: try: kill_process(p_slice.pid, process_name_slice) except: traceback.print_exc() - ps_slice=[] - return process_info(process_name_slice, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps_slice = [] + return ( + process_info(process_name_slice, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1a=[] + +ps1a = [] process_name_1a = i18n("文本分词与特征提取") -def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): + + +def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): global ps1a inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1a == []): - opt_dir="%s/%s"%(exp_root,exp_name) - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1a == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { "i_part": str(i_part), "all_parts": str(all_parts), "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), - "is_half": str(is_half) + "is_half": str(is_half), } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1a.append(p) - yield process_info(process_name_1a, "running"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1a, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) for p in ps1a: p.wait() opt = [] @@ -566,13 +903,26 @@ def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir): path_text = "%s/2-name2text.txt" % opt_dir with open(path_text, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - ps1a=[] + ps1a = [] if len("".join(opt)) > 0: - yield process_info(process_name_1a, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1a, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1a, "failed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1a, "failed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1a, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1a, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1a(): global ps1a @@ -583,27 +933,34 @@ def close1a(): except: traceback.print_exc() ps1a = [] - return process_info(process_name_1a, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + return ( + process_info(process_name_1a, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1b=[] + +ps1b = [] process_name_1b = i18n("语音自监督特征提取") -def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): + + +def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): global ps1b inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1b == []): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir": "%s/%s"%(exp_root,exp_name), - "cnhubert_base_dir":ssl_pretrained_dir, - "is_half": str(is_half) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1b == []: + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": "%s/%s" % (exp_root, exp_name), + "cnhubert_base_dir": ssl_pretrained_dir, + "is_half": str(is_half), } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -613,48 +970,68 @@ def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1b.append(p) - yield process_info(process_name_1b, "running"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1b, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) for p in ps1b: p.wait() - ps1b=[] - yield process_info(process_name_1b, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1b = [] + yield ( + process_info(process_name_1b, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1b, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1b, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1b(): global ps1b - if (ps1b != []): + if ps1b != []: for p1b in ps1b: try: kill_process(p1b.pid, process_name_1b) except: traceback.print_exc() - ps1b=[] - return process_info(process_name_1b, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1b = [] + return ( + process_info(process_name_1b, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1c=[] + +ps1c = [] process_name_1c = i18n("语义Token提取") -def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): + + +def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): global ps1c inp_text = my_utils.clean_path(inp_text) - if check_for_existance([inp_text,''], is_dataset_processing=True): - check_details([inp_text,''], is_dataset_processing=True) - if (ps1c == []): - opt_dir="%s/%s"%(exp_root,exp_name) - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", - "is_half": str(is_half) + if check_for_existance([inp_text, ""], is_dataset_processing=True): + check_details([inp_text, ""], is_dataset_processing=True) + if ps1c == []: + opt_dir = "%s/%s" % (exp_root, exp_name) + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", + "is_half": str(is_half), } - gpu_names=gpu_numbers.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -664,11 +1041,15 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1c.append(p) - yield process_info(process_name_1c, "running"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1c, "running"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) for p in ps1c: p.wait() opt = ["item_name\tsemantic_audio"] @@ -680,46 +1061,75 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): os.remove(semantic_path) with open(path_semantic, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - ps1c=[] - yield process_info(process_name_1c, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1c = [] + yield ( + process_info(process_name_1c, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1c, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1c, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1c(): global ps1c - if (ps1c != []): + if ps1c != []: for p1c in ps1c: try: kill_process(p1c.pid, process_name_1c) except: traceback.print_exc() - ps1c=[] - return process_info(process_name_1c, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1c = [] + return ( + process_info(process_name_1c, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) -ps1abc=[] + +ps1abc = [] process_name_1abc = i18n("训练集格式化一键三连") -def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path): + + +def open1abc( + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + ssl_pretrained_dir, + pretrained_s2G_path, +): global ps1abc inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) - if check_for_existance([inp_text,inp_wav_dir], is_dataset_processing=True): - check_details([inp_text,inp_wav_dir], is_dataset_processing=True) - if (ps1abc == []): - opt_dir="%s/%s"%(exp_root,exp_name) + if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): + check_details([inp_text, inp_wav_dir], is_dataset_processing=True) + if ps1abc == []: + opt_dir = "%s/%s" % (exp_root, exp_name) try: #############################1a - path_text="%s/2-name2text.txt" % opt_dir - if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)): - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "bert_pretrained_dir":bert_pretrained_dir, - "is_half": str(is_half) + path_text = "%s/2-name2text.txt" % opt_dir + if os.path.exists(path_text) == False or ( + os.path.exists(path_text) == True + and len(open(path_text, "r", encoding="utf8").read().strip("\n").split("\n")) < 2 + ): + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "bert_pretrained_dir": bert_pretrained_dir, + "is_half": str(is_half), } - gpu_names=gpu_numbers1a.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1a.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -729,34 +1139,43 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield i18n("进度") + ": 1A-Doing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() + yield ( + i18n("进度") + ": 1A-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() opt = [] - for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) + for i_part in range(all_parts): # txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part) txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) - with open(txt_path, "r",encoding="utf8") as f: + with open(txt_path, "r", encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(txt_path) - with open(path_text, "w",encoding="utf8") as f: + with open(path_text, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") assert len("".join(opt)) > 0, process_info(process_name_1a, "failed") - yield i18n("进度") + ": 1A-Done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] + yield ( + i18n("进度") + ": 1A-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] #############################1b - config={ - "inp_text":inp_text, - "inp_wav_dir":inp_wav_dir, - "exp_name":exp_name, - "opt_dir":opt_dir, - "cnhubert_base_dir":ssl_pretrained_dir, + config = { + "inp_text": inp_text, + "inp_wav_dir": inp_wav_dir, + "exp_name": exp_name, + "opt_dir": opt_dir, + "cnhubert_base_dir": ssl_pretrained_dir, } - gpu_names=gpu_numbers1Ba.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1Ba.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -766,26 +1185,37 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield i18n("进度") + ": 1A-Done, 1B-Doing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() - yield i18n("进度") + ": 1A-Done, 1B-Done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - ps1abc=[] + yield ( + i18n("进度") + ": 1A-Done, 1B-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() + yield ( + i18n("进度") + ": 1A-Done, 1B-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + ps1abc = [] #############################1c path_semantic = "%s/6-name2semantic.tsv" % opt_dir - if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)): - config={ - "inp_text":inp_text, - "exp_name":exp_name, - "opt_dir":opt_dir, - "pretrained_s2G":pretrained_s2G_path, - "s2config_path":"GPT_SoVITS/configs/s2.json", + if os.path.exists(path_semantic) == False or ( + os.path.exists(path_semantic) == True and os.path.getsize(path_semantic) < 31 + ): + config = { + "inp_text": inp_text, + "exp_name": exp_name, + "opt_dir": opt_dir, + "pretrained_s2G": pretrained_s2G_path, + "s2config_path": "GPT_SoVITS/configs/s2.json", } - gpu_names=gpu_numbers1c.split("-") - all_parts=len(gpu_names) + gpu_names = gpu_numbers1c.split("-") + all_parts = len(gpu_names) for i_part in range(all_parts): config.update( { @@ -795,335 +1225,734 @@ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numb } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec + cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) - yield i18n("进度") + ": 1A-Done, 1B-Done, 1C-Doing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} - for p in ps1abc:p.wait() + yield ( + i18n("进度") + ": 1A-Done, 1B-Done, 1C-Doing", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + for p in ps1abc: + p.wait() opt = ["item_name\tsemantic_audio"] for i_part in range(all_parts): semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) - with open(semantic_path, "r",encoding="utf8") as f: + with open(semantic_path, "r", encoding="utf8") as f: opt += f.read().strip("\n").split("\n") os.remove(semantic_path) - with open(path_semantic, "w",encoding="utf8") as f: + with open(path_semantic, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") - yield i18n("进度") + ": 1A-Done, 1B-Done, 1C-Done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + i18n("进度") + ": 1A-Done, 1B-Done, 1C-Done", + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) ps1abc = [] - yield process_info(process_name_1abc, "finish"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1abc, "finish"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) except: traceback.print_exc() close1abc() - yield process_info(process_name_1abc, "failed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + yield ( + process_info(process_name_1abc, "failed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) else: - yield process_info(process_name_1abc, "occupy"), {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield ( + process_info(process_name_1abc, "occupy"), + {"__type__": "update", "visible": False}, + {"__type__": "update", "visible": True}, + ) + def close1abc(): global ps1abc - if (ps1abc != []): + if ps1abc != []: for p1abc in ps1abc: try: kill_process(p1abc.pid, process_name_1abc) except: traceback.print_exc() - ps1abc=[] - return process_info(process_name_1abc, "closed"), {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False} + ps1abc = [] + return ( + process_info(process_name_1abc, "closed"), + {"__type__": "update", "visible": True}, + {"__type__": "update", "visible": False}, + ) + def switch_version(version_): - os.environ["version"]=version_ + os.environ["version"] = version_ global version version = version_ - if pretrained_sovits_name[int(version[-1])-1] !='' and pretrained_gpt_name[int(version[-1])-1] !='':... + if pretrained_sovits_name[int(version[-1]) - 1] != "" and pretrained_gpt_name[int(version[-1]) - 1] != "": + ... else: - gr.Warning(i18n('未下载模型') + ": " + version.upper()) + gr.Warning(i18n("未下载模型") + ": " + version.upper()) set_default() - return {'__type__': 'update', 'value': pretrained_sovits_name[int(version[-1])-1]}, \ - {'__type__': 'update', 'value': pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D")}, \ - {'__type__': 'update', 'value': pretrained_gpt_name[int(version[-1])-1]}, \ - {'__type__': 'update', 'value': pretrained_gpt_name[int(version[-1])-1]}, \ - {'__type__': 'update', 'value': pretrained_sovits_name[int(version[-1])-1]}, \ - {'__type__': 'update', "value": default_batch_size, "maximum": default_max_batch_size}, \ - {'__type__': 'update', "value": default_sovits_epoch, "maximum": max_sovits_epoch}, \ - {'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \ - {'__type__': 'update', "visible": True if version!="v3"else False}, \ - {'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \ - {'__type__': 'update', "interactive": True, "value": False}, \ - {'__type__': 'update', "visible": True if version== "v3" else False} # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer + return ( + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D")}, + {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]}, + {"__type__": "update", "value": default_batch_size, "maximum": default_max_batch_size}, + {"__type__": "update", "value": default_sovits_epoch, "maximum": max_sovits_epoch}, + {"__type__": "update", "value": default_sovits_save_every_epoch, "maximum": max_sovits_save_every_epoch}, + {"__type__": "update", "visible": True if version != "v3" else False}, + { + "__type__": "update", + "value": False if not if_force_ckpt else True, + "interactive": True if not if_force_ckpt else False, + }, + {"__type__": "update", "interactive": True, "value": False}, + {"__type__": "update", "visible": True if version == "v3" else False}, + ) # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer -if os.path.exists('GPT_SoVITS/text/G2PWModel'):... + +if os.path.exists("GPT_SoVITS/text/G2PWModel"): + ... else: - cmd = '"%s" GPT_SoVITS/download.py'%python_exec + cmd = '"%s" GPT_SoVITS/download.py' % python_exec p = Popen(cmd, shell=True) p.wait() + def sync(text): - return {'__type__': 'update', 'value': text} + return {"__type__": "update", "value": text} + with gr.Blocks(title="GPT-SoVITS WebUI") as app: gr.Markdown( - value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - ) - gr.Markdown( - value= - i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e" + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + + "
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) + gr.Markdown(value=i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e") with gr.Tabs(): - with gr.TabItem("0-"+i18n("前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 - gr.Markdown(value="0a-"+i18n("UVR5人声伴奏分离&去混响去延迟工具")) + with gr.TabItem("0-" + i18n("前置数据集获取工具")): # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标 + gr.Markdown(value="0a-" + i18n("UVR5人声伴奏分离&去混响去延迟工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): uvr5_info = gr.Textbox(label=process_info(process_name_uvr5, "info")) - open_uvr5 = gr.Button(value=process_info(process_name_uvr5, "open"),variant="primary",visible=True) - close_uvr5 = gr.Button(value=process_info(process_name_uvr5, "close"),variant="primary",visible=False) + open_uvr5 = gr.Button(value=process_info(process_name_uvr5, "open"), variant="primary", visible=True) + close_uvr5 = gr.Button(value=process_info(process_name_uvr5, "close"), variant="primary", visible=False) - gr.Markdown(value="0b-"+i18n("语音切分工具")) + gr.Markdown(value="0b-" + i18n("语音切分工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="") - slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt") + slice_inp_path = gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"), value="") + slice_opt_root = gr.Textbox(label=i18n("切分后的子音频的输出根目录"), value="output/slicer_opt") with gr.Row(): - threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34") - min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000") - min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300") - hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10") - max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500") + threshold = gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"), value="-34") + min_length = gr.Textbox( + label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"), + value="4000", + ) + min_interval = gr.Textbox(label=i18n("min_interval:最短切割间隔"), value="300") + hop_size = gr.Textbox( + label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"), + value="10", + ) + max_sil_kept = gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"), value="500") with gr.Row(): - _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True) - alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True) + _max = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("max:归一化后最大值多少"), + value=0.9, + interactive=True, + ) + alpha = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("alpha_mix:混多少比例归一化后音频进来"), + value=0.25, + interactive=True, + ) with gr.Row(): - n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True) + n_process = gr.Slider( + minimum=1, maximum=n_cpu, step=1, label=i18n("切割使用的进程数"), value=4, interactive=True + ) slicer_info = gr.Textbox(label=process_info(process_name_slice, "info")) - open_slicer_button = gr.Button(value=process_info(process_name_slice, "open"),variant="primary",visible=True) - close_slicer_button = gr.Button(value=process_info(process_name_slice, "close"),variant="primary",visible=False) + open_slicer_button = gr.Button( + value=process_info(process_name_slice, "open"), variant="primary", visible=True + ) + close_slicer_button = gr.Button( + value=process_info(process_name_slice, "close"), variant="primary", visible=False + ) - gr.Markdown(value="0bb-"+i18n("语音降噪工具")) + gr.Markdown(value="0bb-" + i18n("语音降噪工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - denoise_input_dir=gr.Textbox(label=i18n("输入文件夹路径"),value="") - denoise_output_dir=gr.Textbox(label=i18n("输出文件夹路径"),value="output/denoise_opt") + denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="") + denoise_output_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/denoise_opt") with gr.Row(): denoise_info = gr.Textbox(label=process_info(process_name_denoise, "info")) - open_denoise_button = gr.Button(value=process_info(process_name_denoise, "open"),variant="primary",visible=True) - close_denoise_button = gr.Button(value=process_info(process_name_denoise, "close"),variant="primary",visible=False) + open_denoise_button = gr.Button( + value=process_info(process_name_denoise, "open"), variant="primary", visible=True + ) + close_denoise_button = gr.Button( + value=process_info(process_name_denoise, "close"), variant="primary", visible=False + ) - gr.Markdown(value="0c-"+i18n("语音识别工具")) + gr.Markdown(value="0c-" + i18n("语音识别工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - asr_inp_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True) + asr_inp_dir = gr.Textbox( + label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True + ) asr_opt_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/asr_opt", interactive=True) with gr.Row(): - asr_model = gr.Dropdown(label=i18n("ASR 模型"), choices=list(asr_dict.keys()), interactive=True, value="达摩 ASR (中文)") - asr_size = gr.Dropdown(label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large") - asr_lang = gr.Dropdown(label=i18n("ASR 语言设置"), choices=["zh","yue"], interactive=True, value="zh") - asr_precision = gr.Dropdown(label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32") + asr_model = gr.Dropdown( + label=i18n("ASR 模型"), + choices=list(asr_dict.keys()), + interactive=True, + value="达摩 ASR (中文)", + ) + asr_size = gr.Dropdown( + label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large" + ) + asr_lang = gr.Dropdown( + label=i18n("ASR 语言设置"), choices=["zh", "yue"], interactive=True, value="zh" + ) + asr_precision = gr.Dropdown( + label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32" + ) with gr.Row(): asr_info = gr.Textbox(label=process_info(process_name_asr, "info")) - open_asr_button = gr.Button(value=process_info(process_name_asr, "open"),variant="primary",visible=True) - close_asr_button = gr.Button(value=process_info(process_name_asr, "close"),variant="primary",visible=False) + open_asr_button = gr.Button( + value=process_info(process_name_asr, "open"), variant="primary", visible=True + ) + close_asr_button = gr.Button( + value=process_info(process_name_asr, "close"), variant="primary", visible=False + ) - def change_lang_choices(key): #根据选择的模型修改可选的语言 - return {"__type__": "update", "choices": asr_dict[key]['lang'], "value": asr_dict[key]['lang'][0]} - def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 - return {"__type__": "update", "choices": asr_dict[key]['size'], "value": asr_dict[key]['size'][-1]} - def change_precision_choices(key): #根据选择的模型修改可选的语言 - if key =="Faster Whisper (多语种)": + def change_lang_choices(key): # 根据选择的模型修改可选的语言 + return {"__type__": "update", "choices": asr_dict[key]["lang"], "value": asr_dict[key]["lang"][0]} + + def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 + return {"__type__": "update", "choices": asr_dict[key]["size"], "value": asr_dict[key]["size"][-1]} + + def change_precision_choices(key): # 根据选择的模型修改可选的语言 + if key == "Faster Whisper (多语种)": if default_batch_size <= 4: - precision = 'int8' + precision = "int8" elif is_half: - precision = 'float16' + precision = "float16" else: - precision = 'float32' + precision = "float32" else: - precision = 'float32' - return {"__type__": "update", "choices": asr_dict[key]['precision'], "value": precision} + precision = "float32" + return {"__type__": "update", "choices": asr_dict[key]["precision"], "value": precision} + asr_model.change(change_lang_choices, [asr_model], [asr_lang]) asr_model.change(change_size_choices, [asr_model], [asr_size]) asr_model.change(change_precision_choices, [asr_model], [asr_precision]) - gr.Markdown(value="0d-"+i18n("语音文本校对标注工具")) + gr.Markdown(value="0d-" + i18n("语音文本校对标注工具")) with gr.Row(): with gr.Column(scale=3): with gr.Row(): - path_list = gr.Textbox(label=i18n("标注文件路径 (含文件后缀 *.list)"), value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", interactive=True) + path_list = gr.Textbox( + label=i18n("标注文件路径 (含文件后缀 *.list)"), + value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", + interactive=True, + ) label_info = gr.Textbox(label=process_info(process_name_subfix, "info")) - open_label = gr.Button(value=process_info(process_name_subfix, "open"),variant="primary",visible=True) - close_label = gr.Button(value=process_info(process_name_subfix, "close"),variant="primary",visible=False) + open_label = gr.Button(value=process_info(process_name_subfix, "open"), variant="primary", visible=True) + close_label = gr.Button( + value=process_info(process_name_subfix, "close"), variant="primary", visible=False + ) - open_label.click(change_label, [path_list], [label_info,open_label,close_label]) - close_label.click(change_label, [path_list], [label_info,open_label,close_label]) - open_uvr5.click(change_uvr5, [], [uvr5_info,open_uvr5,close_uvr5]) - close_uvr5.click(change_uvr5, [], [uvr5_info,open_uvr5,close_uvr5]) + open_label.click(change_label, [path_list], [label_info, open_label, close_label]) + close_label.click(change_label, [path_list], [label_info, open_label, close_label]) + open_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) + close_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) with gr.TabItem(i18n("1-GPT-SoVITS-TTS")): with gr.Row(): with gr.Row(): exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True) gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False) - version_checkbox = gr.Radio(label=i18n("版本"),value=version,choices=['v1','v2','v3']) + version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v3"]) with gr.Row(): - pretrained_s2G = gr.Textbox(label=i18n("预训练SoVITS-G模型路径"), value=pretrained_sovits_name[int(version[-1])-1], interactive=True, lines=2, max_lines=3,scale=9) - pretrained_s2D = gr.Textbox(label=i18n("预训练SoVITS-D模型路径"), value=pretrained_sovits_name[int(version[-1])-1].replace("s2G","s2D"), interactive=True, lines=2, max_lines=3,scale=9) - pretrained_s1 = gr.Textbox(label=i18n("预训练GPT模型路径"), value=pretrained_gpt_name[int(version[-1])-1], interactive=True, lines=2, max_lines=3,scale=10) + pretrained_s2G = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=True, + lines=2, + max_lines=3, + scale=9, + ) + pretrained_s2D = gr.Textbox( + label=i18n("预训练SoVITS-D模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + interactive=True, + lines=2, + max_lines=3, + scale=9, + ) + pretrained_s1 = gr.Textbox( + label=i18n("预训练GPT模型路径"), + value=pretrained_gpt_name[int(version[-1]) - 1], + interactive=True, + lines=2, + max_lines=3, + scale=10, + ) - with gr.TabItem("1A-"+i18n("训练集格式化工具")): + with gr.TabItem("1A-" + i18n("训练集格式化工具")): gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹")) with gr.Row(): with gr.Row(): - inp_text = gr.Textbox(label=i18n("*文本标注文件"),value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True,scale=10) + inp_text = gr.Textbox( + label=i18n("*文本标注文件"), + value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list", + interactive=True, + scale=10, + ) with gr.Row(): inp_wav_dir = gr.Textbox( label=i18n("*训练集音频文件目录"), # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", interactive=True, - placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。"), scale=10 + placeholder=i18n( + "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。" + ), + scale=10, ) - gr.Markdown(value="1Aa-"+process_name_1a) + gr.Markdown(value="1Aa-" + process_name_1a) with gr.Row(): with gr.Row(): - gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) + gpu_numbers1a = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - bert_pretrained_dir = gr.Textbox(label=i18n("预训练中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False,lines=2) + bert_pretrained_dir = gr.Textbox( + label=i18n("预训练中文BERT模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + interactive=False, + lines=2, + ) with gr.Row(): - button1a_open = gr.Button(value=process_info(process_name_1a, "open"),variant="primary",visible=True) - button1a_close = gr.Button(value=process_info(process_name_1a, "close"),variant="primary",visible=False) + button1a_open = gr.Button( + value=process_info(process_name_1a, "open"), variant="primary", visible=True + ) + button1a_close = gr.Button( + value=process_info(process_name_1a, "close"), variant="primary", visible=False + ) with gr.Row(): - info1a=gr.Textbox(label=process_info(process_name_1a, "info")) + info1a = gr.Textbox(label=process_info(process_name_1a, "info")) - gr.Markdown(value="1Ab-"+process_name_1b) + gr.Markdown(value="1Ab-" + process_name_1b) with gr.Row(): with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - cnhubert_base_dir = gr.Textbox(label=i18n("预训练SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False,lines=2) + cnhubert_base_dir = gr.Textbox( + label=i18n("预训练SSL模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-hubert-base", + interactive=False, + lines=2, + ) with gr.Row(): - button1b_open = gr.Button(value=process_info(process_name_1b, "open"),variant="primary",visible=True) - button1b_close = gr.Button(value=process_info(process_name_1b, "close"),variant="primary",visible=False) + button1b_open = gr.Button( + value=process_info(process_name_1b, "open"), variant="primary", visible=True + ) + button1b_close = gr.Button( + value=process_info(process_name_1b, "close"), variant="primary", visible=False + ) with gr.Row(): - info1b=gr.Textbox(label=process_info(process_name_1b, "info")) + info1b = gr.Textbox(label=process_info(process_name_1b, "info")) - gr.Markdown(value="1Ac-"+process_name_1c) + gr.Markdown(value="1Ac-" + process_name_1c) with gr.Row(): with gr.Row(): - gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True) + gpu_numbers1c = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - pretrained_s2G_ = gr.Textbox(label=i18n("预训练SoVITS-G模型路径"), value=pretrained_sovits_name[int(version[-1])-1], interactive=False,lines=2) + pretrained_s2G_ = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=False, + lines=2, + ) with gr.Row(): - button1c_open = gr.Button(value=process_info(process_name_1c, "open"),variant="primary",visible=True) - button1c_close = gr.Button(value=process_info(process_name_1c, "close"),variant="primary",visible=False) + button1c_open = gr.Button( + value=process_info(process_name_1c, "open"), variant="primary", visible=True + ) + button1c_close = gr.Button( + value=process_info(process_name_1c, "close"), variant="primary", visible=False + ) with gr.Row(): - info1c=gr.Textbox(label=process_info(process_name_1c, "info")) + info1c = gr.Textbox(label=process_info(process_name_1c, "info")) - gr.Markdown(value="1Aabc-"+process_name_1abc) + gr.Markdown(value="1Aabc-" + process_name_1abc) with gr.Row(): with gr.Row(): - button1abc_open = gr.Button(value=process_info(process_name_1abc, "open"),variant="primary",visible=True) - button1abc_close = gr.Button(value=process_info(process_name_1abc, "close"),variant="primary",visible=False) + button1abc_open = gr.Button( + value=process_info(process_name_1abc, "open"), variant="primary", visible=True + ) + button1abc_close = gr.Button( + value=process_info(process_name_1abc, "close"), variant="primary", visible=False + ) with gr.Row(): - info1abc=gr.Textbox(label=process_info(process_name_1abc, "info")) + info1abc = gr.Textbox(label=process_info(process_name_1abc, "info")) - pretrained_s2G.change(sync,[pretrained_s2G],[pretrained_s2G_]) - open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], [asr_info,open_asr_button,close_asr_button,path_list,inp_text,inp_wav_dir]) - close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button]) - open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button,asr_inp_dir,denoise_input_dir,inp_wav_dir]) - close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button]) - open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button,asr_inp_dir,inp_wav_dir]) - close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button]) + pretrained_s2G.change(sync, [pretrained_s2G], [pretrained_s2G_]) + open_asr_button.click( + open_asr, + [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], + [asr_info, open_asr_button, close_asr_button, path_list, inp_text, inp_wav_dir], + ) + close_asr_button.click(close_asr, [], [asr_info, open_asr_button, close_asr_button]) + open_slicer_button.click( + open_slice, + [ + slice_inp_path, + slice_opt_root, + threshold, + min_length, + min_interval, + hop_size, + max_sil_kept, + _max, + alpha, + n_process, + ], + [slicer_info, open_slicer_button, close_slicer_button, asr_inp_dir, denoise_input_dir, inp_wav_dir], + ) + close_slicer_button.click(close_slice, [], [slicer_info, open_slicer_button, close_slicer_button]) + open_denoise_button.click( + open_denoise, + [denoise_input_dir, denoise_output_dir], + [denoise_info, open_denoise_button, close_denoise_button, asr_inp_dir, inp_wav_dir], + ) + close_denoise_button.click(close_denoise, [], [denoise_info, open_denoise_button, close_denoise_button]) - button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close]) - button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close]) - button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close]) - button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close]) - button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close]) - button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close]) - button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close]) - button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close]) + button1a_open.click( + open1a, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1a, bert_pretrained_dir], + [info1a, button1a_open, button1a_close], + ) + button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close]) + button1b_open.click( + open1b, + [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], + [info1b, button1b_open, button1b_close], + ) + button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close]) + button1c_open.click( + open1c, [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close] + ) + button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close]) + button1abc_open.click( + open1abc, + [ + inp_text, + inp_wav_dir, + exp_name, + gpu_numbers1a, + gpu_numbers1Ba, + gpu_numbers1c, + bert_pretrained_dir, + cnhubert_base_dir, + pretrained_s2G, + ], + [info1abc, button1abc_open, button1abc_close], + ) + button1abc_close.click(close1abc, [], [info1abc, button1abc_open, button1abc_close]) - with gr.TabItem("1B-"+i18n("微调训练")): - gr.Markdown(value="1Ba-"+i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")) + with gr.TabItem("1B-" + i18n("微调训练")): + gr.Markdown(value="1Ba-" + i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")) with gr.Row(): with gr.Column(): with gr.Row(): - batch_size = gr.Slider(minimum=1,maximum=default_max_batch_size,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True) - total_epoch = gr.Slider(minimum=1,maximum=max_sovits_epoch,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=default_sovits_epoch,interactive=True) + batch_size = gr.Slider( + minimum=1, + maximum=default_max_batch_size, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, + ) + total_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_epoch, + step=1, + label=i18n("总训练轮数total_epoch,不建议太高"), + value=default_sovits_epoch, + interactive=True, + ) with gr.Row(): - text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,visible=True if version!="v3"else False)#v3 not need - lora_rank = gr.Radio(label=i18n("LoRA秩"), value="32", choices=['16', '32', '64', '128'],visible=True if version=="v3"else False)#v1v2 not need - save_every_epoch = gr.Slider(minimum=1,maximum=max_sovits_save_every_epoch,step=1,label=i18n("保存频率save_every_epoch"),value=default_sovits_save_every_epoch,interactive=True) + text_low_lr_rate = gr.Slider( + minimum=0.2, + maximum=0.6, + step=0.05, + label=i18n("文本模块学习率权重"), + value=0.4, + visible=True if version != "v3" else False, + ) # v3 not need + lora_rank = gr.Radio( + label=i18n("LoRA秩"), + value="32", + choices=["16", "32", "64", "128"], + visible=True if version == "v3" else False, + ) # v1v2 not need + save_every_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_save_every_epoch, + step=1, + label=i18n("保存频率save_every_epoch"), + value=default_sovits_save_every_epoch, + interactive=True, + ) with gr.Column(): with gr.Column(): - if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) - if_grad_ckpt = gr.Checkbox(label="v3是否开启梯度检查点节省显存占用", value=False, interactive=True if version == "v3" else False, show_label=True,visible=False) # 只有V3s2可以用 + if_save_latest = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + if_grad_ckpt = gr.Checkbox( + label="v3是否开启梯度检查点节省显存占用", + value=False, + interactive=True if version == "v3" else False, + show_label=True, + visible=False, + ) # 只有V3s2可以用 with gr.Row(): - gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True) + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) with gr.Row(): with gr.Row(): - button1Ba_open = gr.Button(value=process_info(process_name_sovits, "open"),variant="primary",visible=True) - button1Ba_close = gr.Button(value=process_info(process_name_sovits, "close"),variant="primary",visible=False) + button1Ba_open = gr.Button( + value=process_info(process_name_sovits, "open"), variant="primary", visible=True + ) + button1Ba_close = gr.Button( + value=process_info(process_name_sovits, "close"), variant="primary", visible=False + ) with gr.Row(): - info1Ba=gr.Textbox(label=process_info(process_name_sovits, "info")) - gr.Markdown(value="1Bb-"+i18n("GPT 训练: 模型权重文件在 GPT_weights/")) + info1Ba = gr.Textbox(label=process_info(process_name_sovits, "info")) + gr.Markdown(value="1Bb-" + i18n("GPT 训练: 模型权重文件在 GPT_weights/")) with gr.Row(): with gr.Column(): with gr.Row(): - batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size_s1,interactive=True) - total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True) + batch_size1Bb = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size_s1, + interactive=True, + ) + total_epoch1Bb = gr.Slider( + minimum=2, + maximum=50, + step=1, + label=i18n("总训练轮数total_epoch"), + value=15, + interactive=True, + ) with gr.Row(): - save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True) - if_dpo = gr.Checkbox(label=i18n("是否开启DPO训练选项(实验性)"), value=False, interactive=True, show_label=True) + save_every_epoch1Bb = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + interactive=True, + ) + if_dpo = gr.Checkbox( + label=i18n("是否开启DPO训练选项(实验性)"), + value=False, + interactive=True, + show_label=True, + ) with gr.Column(): with gr.Column(): - if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), value=True, interactive=True, show_label=True) - if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True) + if_save_latest1Bb = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights1Bb = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) with gr.Row(): - gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True) + gpu_numbers1Bb = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) with gr.Row(): with gr.Row(): - button1Bb_open = gr.Button(value=process_info(process_name_gpt, "open"),variant="primary",visible=True) - button1Bb_close = gr.Button(value=process_info(process_name_gpt, "close"),variant="primary",visible=False) + button1Bb_open = gr.Button( + value=process_info(process_name_gpt, "open"), variant="primary", visible=True + ) + button1Bb_close = gr.Button( + value=process_info(process_name_gpt, "close"), variant="primary", visible=False + ) with gr.Row(): - info1Bb=gr.Textbox(label=process_info(process_name_gpt, "info")) + info1Bb = gr.Textbox(label=process_info(process_name_gpt, "info")) - button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D,if_grad_ckpt,lora_rank], [info1Ba,button1Ba_open,button1Ba_close]) - button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close]) - button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close]) - button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close]) + button1Ba_open.click( + open1Ba, + [ + batch_size, + total_epoch, + exp_name, + text_low_lr_rate, + if_save_latest, + if_save_every_weights, + save_every_epoch, + gpu_numbers1Ba, + pretrained_s2G, + pretrained_s2D, + if_grad_ckpt, + lora_rank, + ], + [info1Ba, button1Ba_open, button1Ba_close], + ) + button1Ba_close.click(close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close]) + button1Bb_open.click( + open1Bb, + [ + batch_size1Bb, + total_epoch1Bb, + exp_name, + if_dpo, + if_save_latest1Bb, + if_save_every_weights1Bb, + save_every_epoch1Bb, + gpu_numbers1Bb, + pretrained_s1, + ], + [info1Bb, button1Bb_open, button1Bb_close], + ) + button1Bb_close.click(close1Bb, [], [info1Bb, button1Bb_open, button1Bb_close]) - with gr.TabItem("1C-"+i18n("推理")): - gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。")) + with gr.TabItem("1C-" + i18n("推理")): + gr.Markdown( + value=i18n( + "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。" + ) + ) with gr.Row(): with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name[0],interactive=True) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name[0],interactive=True) + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=pretrained_gpt_name[0], + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=pretrained_sovits_name[0], + interactive=True, + ) with gr.Row(): - gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) + gpu_number_1C = gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") - refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown]) + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) with gr.Row(): with gr.Row(): - batched_infer_enabled = gr.Checkbox(label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True) + batched_infer_enabled = gr.Checkbox( + label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True + ) with gr.Row(): - open_tts = gr.Button(value=process_info(process_name_tts, "open"),variant='primary',visible=True) - close_tts = gr.Button(value=process_info(process_name_tts, "close"),variant='primary',visible=False) + open_tts = gr.Button( + value=process_info(process_name_tts, "open"), variant="primary", visible=True + ) + close_tts = gr.Button( + value=process_info(process_name_tts, "close"), variant="primary", visible=False + ) with gr.Row(): tts_info = gr.Textbox(label=process_info(process_name_tts, "info")) - open_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts]) - close_tts.click(change_tts_inference, [bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown, batched_infer_enabled], [tts_info,open_tts,close_tts]) + open_tts.click( + change_tts_inference, + [ + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + batched_infer_enabled, + ], + [tts_info, open_tts, close_tts], + ) + close_tts.click( + change_tts_inference, + [ + bert_pretrained_dir, + cnhubert_base_dir, + gpu_number_1C, + GPT_dropdown, + SoVITS_dropdown, + batched_infer_enabled, + ], + [tts_info, open_tts, close_tts], + ) - version_checkbox.change(switch_version,[version_checkbox],[pretrained_s2G,pretrained_s2D,pretrained_s1,GPT_dropdown,SoVITS_dropdown,batch_size,total_epoch,save_every_epoch,text_low_lr_rate, if_grad_ckpt, batched_infer_enabled, lora_rank]) + version_checkbox.change( + switch_version, + [version_checkbox], + [ + pretrained_s2G, + pretrained_s2D, + pretrained_s1, + GPT_dropdown, + SoVITS_dropdown, + batch_size, + total_epoch, + save_every_epoch, + text_low_lr_rate, + if_grad_ckpt, + batched_infer_enabled, + lora_rank, + ], + ) - with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音")) + with gr.TabItem(i18n("2-GPT-SoVITS-变声")): + gr.Markdown(value=i18n("施工中,请静候佳音")) - app.queue().launch(#concurrency_count=511, max_size=1022 + app.queue().launch( # concurrency_count=511, max_size=1022 server_name="0.0.0.0", inbrowser=True, share=is_share,