Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2025-04-05 19:41:56 +08:00
[fast_inference] Revert strategies, reduce padding impact, expose options, sync code (#986)
* Update README
* Optimize-English-G2P
* docs: change awkward expression
* docs: update Changelog_KO.md
* Fix CN punc in EN, add 's match
* Adjust normalize and g2p logic
* Update zh_CN.json
* Update README (#827): Update README.md, update some outdated file paths and commands
* Fix English homographs, adjust dictionary hot reloading, add name matching (#869)
* Fix homograph dict
* Add JSON in dict
* Adjust hot dict to hot reload
* Add English name dict
* Adjust get name dict logic
* Make API Great Again (#894)
* Add zh/jp/en mix
* Optimize code readability and formatted output
* Try OGG streaming
* Add stream mode arg
* Add media type arg
* Add cut punc arg
* Eliminate punc risk
* Update README (#895)
* Update README
* Update README
* update README
* update README
* fix typo s/Licence /License (#904)
* fix reformat cmd (#917) (Co-authored-by: starylan <starylan@outlook.com>)
* Update README.md
* Normalize Chinese arithmetic operations (#947)
* Change the mask strategy for training and inference to fix repeated speech when batch_size > 1
* Sync code from the main branch and add a "keep random" option
* Fix missing uvr5 models when running colab_webui.ipynb in Colab (#968): downloading the uvr5 models with git failed with "fatal: destination path 'uvr5_weights' already exists and is not an empty directory"; deleting the uvr5_weights folder previously downloaded from this repository before downloading fixes it
* [ASR] Fix FasterWhisper failing to traverse input paths (#956)
* remove glob
* rename
* reset mirror pos
* Revert the mask strategy; revert the pad strategy; add padding_mask in T2SBlock to reduce the impact of padding; expose the repetition_penalty parameter so users can tune the repetition penalty strength; add a parallel_infer parameter to enable or disable parallel inference (disabled matches the 0307 behavior); add a "keep random" option in the WebUI; sync code from the main branch
* Remove unused comments

---------

Co-authored-by: Lion <drain.daters.0p@icloud.com>
Co-authored-by: RVC-Boss <129054828+RVC-Boss@users.noreply.github.com>
Co-authored-by: KamioRinn <snowsdream@live.com>
Co-authored-by: Pengoose <pengoose_dev@naver.com>
Co-authored-by: Yuan-Man <68322456+Yuan-ManX@users.noreply.github.com>
Co-authored-by: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Co-authored-by: KamioRinn <63162909+KamioRinn@users.noreply.github.com>
Co-authored-by: Lion-Wu <130235128+Lion-Wu@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: SapphireLab <36986837+SapphireLab@users.noreply.github.com>
Co-authored-by: starylan <starylan@outlook.com>
Co-authored-by: shadow01a <141255649+shadow01a@users.noreply.github.com>
This commit is contained in:
parent 959269b5ae
commit 29f22115fb
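The diff below reverts the earlier mask and pad strategies, adds a padding_mask inside T2SBlock to limit the effect of padding, and exposes repetition_penalty, parallel_infer, and a "keep random" seed option through the TTS pipeline and WebUI. As a rough sketch of how these options are passed in, the snippet below builds a request dict for the pipeline's run() method; the import path, config path, and all keys other than seed, parallel_infer, and repetition_penalty are assumptions rather than facts from this diff.

```python
# Rough usage sketch for the options exposed by this PR. Module/config paths
# and the keys other than "seed", "parallel_infer", and "repetition_penalty"
# are assumptions, not taken from this diff.
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config  # assumed import path

tts = TTS(TTS_Config("GPT_SoVITS/configs/tts_infer.yaml"))  # assumed config location
inputs = {
    "text": "先帝创业未半而中道崩殂。",  # text to synthesize (assumed key name)
    "text_lang": "zh",                   # assumed key name
    "ref_audio_path": "ref.wav",         # assumed key name
    "prompt_text": "一二三。",            # assumed key name
    "prompt_lang": "zh",                 # assumed key name
    "seed": -1,                  # -1 keeps sampling random ("keep random" in the WebUI)
    "parallel_infer": True,      # new switch; False falls back to the 0307 single-sample path
    "repetition_penalty": 1.35,  # now user-tunable instead of being hard-coded
}
# run() is a generator yielding (sampling_rate, audio_ndarray) fragments.
for sampling_rate, audio in tts.run(inputs):
    pass  # write or play each fragment here
```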
@ -13,11 +13,11 @@ from AR.modules.lr_schedulers import WarmupCosineLRSchedule
|
||||
from AR.modules.optim import ScaledAdam
|
||||
|
||||
class Text2SemanticLightningModule(LightningModule):
|
||||
def __init__(self, config, output_dir, is_train=True, flash_attn_enabled:bool = False):
|
||||
def __init__(self, config, output_dir, is_train=True):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.top_k = 3
|
||||
self.model = Text2SemanticDecoder(config=config, top_k=self.top_k,flash_attn_enabled=flash_attn_enabled)
|
||||
self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
|
||||
pretrained_s1 = config.get("pretrained_s1")
|
||||
if pretrained_s1 and is_train:
|
||||
# print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
|
||||
|
@ -85,15 +85,22 @@ class T2SBlock:
|
||||
self.norm_b2 = norm_b2
|
||||
self.norm_eps2 = norm_eps2
|
||||
|
||||
def process_prompt(self, x, attn_mask : torch.Tensor):
|
||||
q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1)
|
||||
@torch.jit.ignore
|
||||
def to_mask(self, x, padding_mask):
|
||||
return x*padding_mask if padding_mask is not None else x
|
||||
|
||||
def process_prompt(self, x, attn_mask : torch.Tensor, padding_mask:torch.Tensor=None):
|
||||
|
||||
|
||||
q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1)
|
||||
|
||||
batch_size = q.shape[0]
|
||||
q_len = q.shape[1]
|
||||
kv_len = k.shape[1]
|
||||
|
||||
k_cache = k
|
||||
v_cache = v
|
||||
|
||||
q = self.to_mask(q, padding_mask)
|
||||
k_cache = self.to_mask(k, padding_mask)
|
||||
v_cache = self.to_mask(v, padding_mask)
|
||||
|
||||
q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2)
|
||||
k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
|
||||
@ -103,13 +110,15 @@ class T2SBlock:
|
||||
|
||||
attn = attn.permute(2, 0, 1, 3).reshape(batch_size*q_len, self.hidden_dim)
|
||||
attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
|
||||
attn = F.linear(attn, self.out_w, self.out_b)
|
||||
attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b)
|
||||
|
||||
x = self.to_mask(x + attn, padding_mask)
|
||||
x = F.layer_norm(
|
||||
x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
|
||||
x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
|
||||
)
|
||||
x = self.to_mask(x + self.mlp.forward(self.to_mask(x, padding_mask)), padding_mask)
|
||||
x = F.layer_norm(
|
||||
x + self.mlp.forward(x),
|
||||
x,
|
||||
[self.hidden_dim],
|
||||
self.norm_w2,
|
||||
self.norm_b2,
|
||||
@ -138,11 +147,13 @@ class T2SBlock:
|
||||
attn = attn.view(q_len, batch_size, self.hidden_dim).transpose(1, 0)
|
||||
attn = F.linear(attn, self.out_w, self.out_b)
|
||||
|
||||
x = x + attn
|
||||
x = F.layer_norm(
|
||||
x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
|
||||
x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1
|
||||
)
|
||||
x = x + self.mlp.forward(x)
|
||||
x = F.layer_norm(
|
||||
x + self.mlp.forward(x),
|
||||
x,
|
||||
[self.hidden_dim],
|
||||
self.norm_w2,
|
||||
self.norm_b2,
|
||||
@ -158,11 +169,13 @@ class T2STransformer:
|
||||
self.blocks = blocks
|
||||
|
||||
def process_prompt(
|
||||
self, x, attn_mask : torch.Tensor):
|
||||
self, x, attn_mask : torch.Tensor,
|
||||
padding_mask : torch.Tensor=None,
|
||||
):
|
||||
k_cache : List[torch.Tensor] = []
|
||||
v_cache : List[torch.Tensor] = []
|
||||
for i in range(self.num_blocks):
|
||||
x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask)
|
||||
x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask)
|
||||
k_cache.append(k_cache_)
|
||||
v_cache.append(v_cache_)
|
||||
return x, k_cache, v_cache
|
||||
@ -176,7 +189,7 @@ class T2STransformer:
|
||||
|
||||
|
||||
class Text2SemanticDecoder(nn.Module):
|
||||
def __init__(self, config, norm_first=False, top_k=3, flash_attn_enabled:bool=False):
|
||||
def __init__(self, config, norm_first=False, top_k=3):
|
||||
super(Text2SemanticDecoder, self).__init__()
|
||||
self.model_dim = config["model"]["hidden_dim"]
|
||||
self.embedding_dim = config["model"]["embedding_dim"]
|
||||
@ -228,47 +241,37 @@ class Text2SemanticDecoder(nn.Module):
|
||||
multidim_average="global",
|
||||
ignore_index=self.EOS,
|
||||
)
|
||||
|
||||
blocks = []
|
||||
|
||||
for i in range(self.num_layers):
|
||||
layer = self.h.layers[i]
|
||||
t2smlp = T2SMLP(
|
||||
layer.linear1.weight,
|
||||
layer.linear1.bias,
|
||||
layer.linear2.weight,
|
||||
layer.linear2.bias
|
||||
)
|
||||
|
||||
block = T2SBlock(
|
||||
self.num_head,
|
||||
self.model_dim,
|
||||
t2smlp,
|
||||
layer.self_attn.in_proj_weight,
|
||||
layer.self_attn.in_proj_bias,
|
||||
layer.self_attn.out_proj.weight,
|
||||
layer.self_attn.out_proj.bias,
|
||||
layer.norm1.weight,
|
||||
layer.norm1.bias,
|
||||
layer.norm1.eps,
|
||||
layer.norm2.weight,
|
||||
layer.norm2.bias,
|
||||
layer.norm2.eps
|
||||
)
|
||||
|
||||
blocks.append(block)
|
||||
|
||||
self.enable_flash_attn(flash_attn_enabled)
|
||||
|
||||
def enable_flash_attn(self, enable:bool=True):
|
||||
|
||||
if not enable:
|
||||
print("Not Using Flash Attention")
|
||||
self.infer_panel = self.infer_panel_batch_only
|
||||
else:
|
||||
self.infer_panel = self.infer_panel_batch_infer_with_flash_attn
|
||||
print("Using Flash Attention")
|
||||
blocks = []
|
||||
|
||||
for i in range(self.num_layers):
|
||||
layer = self.h.layers[i]
|
||||
t2smlp = T2SMLP(
|
||||
layer.linear1.weight,
|
||||
layer.linear1.bias,
|
||||
layer.linear2.weight,
|
||||
layer.linear2.bias
|
||||
)
|
||||
|
||||
block = T2SBlock(
|
||||
self.num_head,
|
||||
self.model_dim,
|
||||
t2smlp,
|
||||
layer.self_attn.in_proj_weight,
|
||||
layer.self_attn.in_proj_bias,
|
||||
layer.self_attn.out_proj.weight,
|
||||
layer.self_attn.out_proj.bias,
|
||||
layer.norm1.weight,
|
||||
layer.norm1.bias,
|
||||
layer.norm1.eps,
|
||||
layer.norm2.weight,
|
||||
layer.norm2.bias,
|
||||
layer.norm2.eps
|
||||
)
|
||||
|
||||
blocks.append(block)
|
||||
|
||||
self.t2s_transformer = T2STransformer(self.num_layers, blocks)
|
||||
self.t2s_transformer = T2STransformer(self.num_layers, blocks)
|
||||
|
||||
def make_input_data(self, x, x_lens, y, y_lens, bert_feature):
|
||||
x = self.ar_text_embedding(x)
|
||||
@ -297,8 +300,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
(0, y_len),
|
||||
value=True,
|
||||
)
|
||||
# 取消对y[0]的mask,以防止复读,详见https://github.com/RVC-Boss/GPT-SoVITS/issues/965
|
||||
x_attn_mask[:, x_len]=False
|
||||
# x_attn_mask[:, x_len]=False
|
||||
y_attn_mask = F.pad(
|
||||
torch.triu(
|
||||
torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
|
||||
@ -394,8 +396,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
(0, y_len),
|
||||
value=True,
|
||||
)
|
||||
# 取消对y[0]的mask,以防止复读,详见https://github.com/RVC-Boss/GPT-SoVITS/issues/965
|
||||
x_attn_mask[:, x_len]=False
|
||||
# x_attn_mask[:, x_len]=False
|
||||
y_attn_mask = F.pad(
|
||||
torch.triu(
|
||||
torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
|
||||
@ -461,7 +462,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
value=True,
|
||||
)
|
||||
y_attn_mask = F.pad(
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=0),# diagonal必须为0,否则会导致batch_size>1时的复读情况
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=0),
|
||||
(x_len, 0),
|
||||
value=False,
|
||||
)
|
||||
@ -515,21 +516,31 @@ class Text2SemanticDecoder(nn.Module):
|
||||
top_p: int = 100,
|
||||
early_stop_num: int = -1,
|
||||
temperature: float = 1.0,
|
||||
repetition_penalty: float = 1.35,
|
||||
**kwargs,
|
||||
):
|
||||
## 先对phones进行embedding、对bert_features进行project,再pad到相同长度(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略)
|
||||
# max_len = 0
|
||||
# # fp16 会对结果产生影响(和没pad相比)
|
||||
# bert_feature_dtype = bert_feature[0].dtype
|
||||
# if not hasattr(self.bert_proj, "dtype"):
|
||||
# self.bert_proj.dtype = torch.float32
|
||||
# self.bert_proj=self.bert_proj.float()
|
||||
|
||||
## 先对phones进行embedding、对bert_features进行project,再pad到相同长度(padding策略会影响T2S模型生成的结果。)
|
||||
## pad之后再进行Linear会有误差(和没pad相比),就离谱。。。
|
||||
max_len = kwargs.get("max_len",x_lens.max())
|
||||
# for x_item, bert_item in zip(x, bert_feature):
|
||||
# max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
|
||||
# x_list = [self.ar_text_embedding(item) for item in x]
|
||||
# x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
|
||||
# x = torch.stack(x_list, dim=0)
|
||||
x_list = [self.ar_text_embedding(item) for item in x]
|
||||
x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
|
||||
x = torch.stack(x_list, dim=0)
|
||||
|
||||
# bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
|
||||
# bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
|
||||
# bert_feature = torch.stack(bert_features_list, dim=0)
|
||||
|
||||
bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
|
||||
x = self.ar_text_embedding(x)
|
||||
bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
|
||||
bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
|
||||
bert_feature = torch.stack(bert_features_list, dim=0)
|
||||
|
||||
|
||||
# bert_feature = self.bert_proj(bert_feature.transpose(1, 2).float()).to(dtype=bert_feature_dtype)
|
||||
# x = self.ar_text_embedding(x)
|
||||
x = x + bert_feature
|
||||
x = self.ar_text_position(x)
|
||||
|
||||
@ -539,7 +550,6 @@ class Text2SemanticDecoder(nn.Module):
|
||||
x_len = x.shape[1]
|
||||
x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
|
||||
stop = False
|
||||
# print(1111111,self.num_layers)
|
||||
|
||||
k_cache = None
|
||||
v_cache = None
|
||||
@ -548,6 +558,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
y_emb = self.ar_audio_embedding(y)
|
||||
y_len = y_emb.shape[1]
|
||||
prefix_len = y.shape[1]
|
||||
y_lens = torch.LongTensor([y_emb.shape[1]]*y_emb.shape[0]).to(x.device)
|
||||
y_pos = self.ar_audio_position(y_emb)
|
||||
xy_pos = torch.concat([x, y_pos], dim=1)
|
||||
ref_free = False
|
||||
@ -555,6 +566,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
y_emb = None
|
||||
y_len = 0
|
||||
prefix_len = 0
|
||||
y_lens = torch.LongTensor([y_len]*x.shape[0]).to(x.device)
|
||||
y_pos = None
|
||||
xy_pos = x
|
||||
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
|
||||
@ -564,39 +576,41 @@ class Text2SemanticDecoder(nn.Module):
|
||||
##### create mask #####
|
||||
bsz = x.shape[0]
|
||||
src_len = x_len + y_len
|
||||
y_lens = torch.LongTensor([y_len]*bsz).to(x.device)
|
||||
y_mask = make_pad_mask(y_lens)
|
||||
x_mask = make_pad_mask(x_lens)
|
||||
y_paddind_mask = make_pad_mask(y_lens, y_len)
|
||||
x_paddind_mask = make_pad_mask(x_lens, max_len)
|
||||
|
||||
# (bsz, x_len + y_len)
|
||||
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
|
||||
xy_padding_mask = torch.concat([x_paddind_mask, y_paddind_mask], dim=1)
|
||||
|
||||
x_mask = F.pad(
|
||||
x_attn_mask,
|
||||
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y)
|
||||
value=True,
|
||||
)
|
||||
y_mask = F.pad( ###yy的右上0扩展到左边xy的0,(y,x+y)
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=0), # diagonal必须为0,否则会导致batch_size>1时的复读情况
|
||||
y_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
|
||||
(x_len, 0),
|
||||
value=False,
|
||||
)
|
||||
|
||||
xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz, -1, -1).to(x.device)
|
||||
# xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
|
||||
xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(-1, src_len, src_len)
|
||||
xy_attn_mask = xy_mask.logical_or(xy_padding_mask)
|
||||
# xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1).view(1 , src_len, src_len).expand(bsz, -1, -1).to(x.device)
|
||||
_xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(-1, src_len, src_len)
|
||||
xy_attn_mask = xy_mask.logical_or(_xy_padding_mask)
|
||||
xy_attn_mask = xy_attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1)
|
||||
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
|
||||
xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))
|
||||
|
||||
|
||||
xy_padding_mask = ~xy_padding_mask.view(bsz, src_len, 1).expand(-1, -1, self.model_dim)
|
||||
xy_padding_mask = xy_padding_mask.to(dtype=x.dtype)
|
||||
|
||||
###### decode #####
|
||||
y_list = [None]*y.shape[0]
|
||||
batch_idx_map = list(range(y.shape[0]))
|
||||
idx_list = [None]*y.shape[0]
|
||||
for idx in tqdm(range(1500)):
|
||||
if idx == 0:
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask)
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, xy_padding_mask)
|
||||
else:
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache)
|
||||
|
||||
@ -609,7 +623,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
logits = logits[:, :-1]
|
||||
|
||||
samples = sample(
|
||||
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
|
||||
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
|
||||
)[0]
|
||||
|
||||
y = torch.concat([y, samples], dim=1)
|
||||
@ -659,7 +673,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
|
||||
####################### update next step ###################################
|
||||
y_emb = self.ar_audio_embedding(y[:, -1:])
|
||||
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device)
|
||||
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to( dtype= y_emb.dtype,device=y_emb.device)
|
||||
|
||||
if (None in idx_list):
|
||||
for i in range(x.shape[0]):
|
||||
@ -670,7 +684,37 @@ class Text2SemanticDecoder(nn.Module):
|
||||
return y_list, [0]*x.shape[0]
|
||||
return y_list, idx_list
|
||||
|
||||
def infer_panel_batch_only(
|
||||
def infer_panel_0307(self,
|
||||
x:List[torch.LongTensor], #####全部文本token
|
||||
x_lens:torch.LongTensor,
|
||||
prompts:torch.LongTensor, ####参考音频token
|
||||
bert_feature:torch.LongTensor,
|
||||
top_k: int = -100,
|
||||
top_p: int = 100,
|
||||
early_stop_num: int = -1,
|
||||
temperature: float = 1.0,
|
||||
repetition_penalty: float = 1.35,
|
||||
**kwargs
|
||||
):
|
||||
y_list = []
|
||||
idx_list = []
|
||||
for i in range(len(x)):
|
||||
y, idx = self.infer_panel_with_flash_attn_only(x[i].unsqueeze(0),
|
||||
x_lens[i],
|
||||
prompts[i].unsqueeze(0),
|
||||
bert_feature[i].unsqueeze(0),
|
||||
top_k,
|
||||
top_p,
|
||||
early_stop_num,
|
||||
temperature,
|
||||
repetition_penalty,
|
||||
**kwargs)
|
||||
y_list.append(y[0])
|
||||
idx_list.append(idx)
|
||||
|
||||
return y_list, idx_list
|
||||
|
||||
def infer_panel_with_flash_attn_only(
|
||||
self,
|
||||
x:torch.LongTensor, #####全部文本token
|
||||
x_lens:torch.LongTensor,
|
||||
@ -680,22 +724,11 @@ class Text2SemanticDecoder(nn.Module):
|
||||
top_p: int = 100,
|
||||
early_stop_num: int = -1,
|
||||
temperature: float = 1.0,
|
||||
repetition_penalty: float = 1.35,
|
||||
**kwargs
|
||||
):
|
||||
## 先对phones进行embedding、对bert_features进行project,再pad到相同长度(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略)
|
||||
# max_len = 0
|
||||
# for x_item, bert_item in zip(x, bert_feature):
|
||||
# max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
|
||||
# x_list = [self.ar_text_embedding(item) for item in x]
|
||||
# x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
|
||||
# x = torch.stack(x_list, dim=0)
|
||||
|
||||
# bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
|
||||
# bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
|
||||
# bert_feature = torch.stack(bert_features_list, dim=0)
|
||||
|
||||
bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
|
||||
x = self.ar_text_embedding(x)
|
||||
x = x + bert_feature
|
||||
x = x + self.bert_proj(bert_feature.transpose(1, 2))
|
||||
x = self.ar_text_position(x)
|
||||
|
||||
# AR Decoder
|
||||
@ -705,17 +738,9 @@ class Text2SemanticDecoder(nn.Module):
|
||||
x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
|
||||
stop = False
|
||||
# print(1111111,self.num_layers)
|
||||
cache = {
|
||||
"all_stage": self.num_layers,
|
||||
"k": [None] * self.num_layers, ###根据配置自己手写
|
||||
"v": [None] * self.num_layers,
|
||||
# "xy_pos":None,##y_pos位置编码每次都不一样的没法缓存,每次都要重新拼xy_pos.主要还是写法原因,其实是可以历史统一一样的,但也没啥计算量就不管了
|
||||
"y_emb": None, ##只需要对最新的samples求emb,再拼历史的就行
|
||||
# "logits":None,###原版就已经只对结尾求再拼接了,不用管
|
||||
# "xy_dec":None,###不需要,本来只需要最后一个做logits
|
||||
"first_infer": 1,
|
||||
"stage": 0,
|
||||
}
|
||||
|
||||
k_cache = None
|
||||
v_cache = None
|
||||
################### first step ##########################
|
||||
if y is not None:
|
||||
y_emb = self.ar_audio_embedding(y)
|
||||
@ -723,7 +748,6 @@ class Text2SemanticDecoder(nn.Module):
|
||||
prefix_len = y.shape[1]
|
||||
y_pos = self.ar_audio_position(y_emb)
|
||||
xy_pos = torch.concat([x, y_pos], dim=1)
|
||||
cache["y_emb"] = y_emb
|
||||
ref_free = False
|
||||
else:
|
||||
y_emb = None
|
||||
@ -734,127 +758,58 @@ class Text2SemanticDecoder(nn.Module):
|
||||
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
|
||||
ref_free = True
|
||||
|
||||
##### create mask #####
|
||||
bsz = x.shape[0]
|
||||
src_len = x_len + y_len
|
||||
y_lens = torch.LongTensor([y_len]*bsz).to(x.device)
|
||||
y_mask = make_pad_mask(y_lens)
|
||||
x_mask = make_pad_mask(x_lens)
|
||||
|
||||
# (bsz, x_len + y_len)
|
||||
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
|
||||
|
||||
x_mask = F.pad(
|
||||
x_attn_mask_pad = F.pad(
|
||||
x_attn_mask,
|
||||
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y)
|
||||
value=True,
|
||||
)
|
||||
y_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=0), # diagonal必须为0,否则会导致batch_size>1时的复读情况
|
||||
y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
|
||||
(x_len, 0),
|
||||
value=False,
|
||||
)
|
||||
|
||||
xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz*self.num_head, -1, -1).to(x.device)
|
||||
# xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
|
||||
xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(bsz, src_len, src_len).repeat(self.num_head, 1, 1)
|
||||
xy_attn_mask = xy_mask.logical_or(xy_padding_mask)
|
||||
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).unsqueeze(0).expand(bsz*self.num_head, -1, -1).view(bsz, self.num_head, src_len, src_len).to(x.device)
|
||||
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
|
||||
xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))
|
||||
|
||||
y_list = [None]*y.shape[0]
|
||||
batch_idx_map = list(range(y.shape[0]))
|
||||
idx_list = [None]*y.shape[0]
|
||||
for idx in tqdm(range(1500)):
|
||||
|
||||
xy_dec, _ = self.h((xy_pos, None), mask=xy_attn_mask, cache=cache)
|
||||
if xy_attn_mask is not None:
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None)
|
||||
else:
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache)
|
||||
|
||||
logits = self.ar_predict_layer(
|
||||
xy_dec[:, -1]
|
||||
) ##不用改,如果用了cache的默认就是只有一帧,取最后一帧一样的
|
||||
# samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
|
||||
if(idx==0):###第一次跑不能EOS否则没有了
|
||||
logits = logits[:, :-1] ###刨除1024终止符号的概率
|
||||
samples = sample(
|
||||
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature
|
||||
)[0]
|
||||
# 本次生成的 semantic_ids 和之前的 y 构成新的 y
|
||||
# print(samples.shape)#[1,1]#第一个1是bs
|
||||
y = torch.concat([y, samples], dim=1)
|
||||
)
|
||||
|
||||
if idx == 0:
|
||||
xy_attn_mask = None
|
||||
logits = logits[:, :-1]
|
||||
|
||||
samples = sample(
|
||||
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
|
||||
)[0]
|
||||
|
||||
y = torch.concat([y, samples], dim=1)
|
||||
|
||||
# 移除已经生成完毕的序列
|
||||
reserved_idx_of_batch_for_y = None
|
||||
if (self.EOS in torch.argmax(logits, dim=-1)) or \
|
||||
(self.EOS in samples[:, 0]): ###如果生成到EOS,则停止
|
||||
l = samples[:, 0]==self.EOS
|
||||
removed_idx_of_batch_for_y = torch.where(l==True)[0].tolist()
|
||||
reserved_idx_of_batch_for_y = torch.where(l==False)[0]
|
||||
# batch_indexs = torch.tensor(batch_idx_map, device=y.device)[removed_idx_of_batch_for_y]
|
||||
for i in removed_idx_of_batch_for_y:
|
||||
batch_index = batch_idx_map[i]
|
||||
idx_list[batch_index] = idx - 1
|
||||
y_list[batch_index] = y[i, :-1]
|
||||
|
||||
batch_idx_map = [batch_idx_map[i] for i in reserved_idx_of_batch_for_y.tolist()]
|
||||
|
||||
# 只保留未生成完毕的序列
|
||||
if reserved_idx_of_batch_for_y is not None:
|
||||
# index = torch.LongTensor(batch_idx_map).to(y.device)
|
||||
y = torch.index_select(y, dim=0, index=reserved_idx_of_batch_for_y)
|
||||
if cache["y_emb"] is not None:
|
||||
cache["y_emb"] = torch.index_select(cache["y_emb"], dim=0, index=reserved_idx_of_batch_for_y)
|
||||
if cache["k"] is not None:
|
||||
for i in range(self.num_layers):
|
||||
# 因为kv转置了,所以batch dim是1
|
||||
cache["k"][i] = torch.index_select(cache["k"][i], dim=1, index=reserved_idx_of_batch_for_y)
|
||||
cache["v"][i] = torch.index_select(cache["v"][i], dim=1, index=reserved_idx_of_batch_for_y)
|
||||
|
||||
|
||||
if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
|
||||
print("use early stop num:", early_stop_num)
|
||||
stop = True
|
||||
|
||||
if not (None in idx_list):
|
||||
# print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
|
||||
|
||||
if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
|
||||
stop = True
|
||||
if stop:
|
||||
# if prompts.shape[1] == y.shape[1]:
|
||||
# y = torch.concat([y, torch.zeros_like(samples)], dim=1)
|
||||
# print("bad zero prediction")
|
||||
if y.shape[1]==0:
|
||||
y = torch.concat([y, torch.zeros_like(samples)], dim=1)
|
||||
print("bad zero prediction")
|
||||
print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
|
||||
break
|
||||
|
||||
####################### update next step ###################################
|
||||
cache["first_infer"] = 0
|
||||
if cache["y_emb"] is not None:
|
||||
y_emb = torch.cat(
|
||||
[cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], dim = 1
|
||||
)
|
||||
cache["y_emb"] = y_emb
|
||||
y_pos = self.ar_audio_position(y_emb)
|
||||
xy_pos = y_pos[:, -1:]
|
||||
else:
|
||||
y_emb = self.ar_audio_embedding(y[:, -1:])
|
||||
cache["y_emb"] = y_emb
|
||||
y_pos = self.ar_audio_position(y_emb)
|
||||
xy_pos = y_pos
|
||||
y_len = y_pos.shape[1]
|
||||
|
||||
###最右边一列(是错的)
|
||||
# xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
|
||||
# xy_attn_mask[:,-1]=False
|
||||
###最下面一行(是对的)
|
||||
xy_attn_mask = torch.zeros(
|
||||
(1, x_len + y_len), dtype=torch.bool, device=xy_pos.device
|
||||
)
|
||||
|
||||
if (None in idx_list):
|
||||
for i in range(x.shape[0]):
|
||||
if idx_list[i] is None:
|
||||
idx_list[i] = 1500-1 ###如果没有生成到EOS,就用最大长度代替
|
||||
|
||||
####################### update next step ###################################
|
||||
y_emb = self.ar_audio_embedding(y[:, -1:])
|
||||
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device)
|
||||
|
||||
if ref_free:
|
||||
return y_list, [0]*x.shape[0]
|
||||
return y_list, idx_list
|
||||
return y[:, :-1], 0
|
||||
return y[:, :-1], idx - 1
|
||||
|
@ -37,7 +37,6 @@ default:
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
|
||||
flash_attn_enabled: true
|
||||
|
||||
custom:
|
||||
device: cuda
|
||||
@ -46,7 +45,6 @@ custom:
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
|
||||
flash_attn_enabled: true
|
||||
|
||||
|
||||
"""
|
||||
@ -66,6 +64,9 @@ def set_seed(seed:int):
|
||||
# torch.backends.cudnn.deterministic = True
|
||||
# torch.backends.cudnn.benchmark = False
|
||||
# torch.backends.cudnn.enabled = True
|
||||
# 开启后会影响精度
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
except:
|
||||
pass
|
||||
return seed
|
||||
@ -78,7 +79,6 @@ class TTS_Config:
|
||||
"vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
|
||||
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
|
||||
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
|
||||
"flash_attn_enabled": True
|
||||
}
|
||||
configs:dict = None
|
||||
def __init__(self, configs: Union[dict, str]=None):
|
||||
@ -108,7 +108,6 @@ class TTS_Config:
|
||||
|
||||
self.device = self.configs.get("device", torch.device("cpu"))
|
||||
self.is_half = self.configs.get("is_half", False)
|
||||
self.flash_attn_enabled = self.configs.get("flash_attn_enabled", True)
|
||||
self.t2s_weights_path = self.configs.get("t2s_weights_path", None)
|
||||
self.vits_weights_path = self.configs.get("vits_weights_path", None)
|
||||
self.bert_base_path = self.configs.get("bert_base_path", None)
|
||||
@ -141,7 +140,7 @@ class TTS_Config:
|
||||
self.n_speakers:int = 300
|
||||
|
||||
self.languages:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
|
||||
# print(self)
|
||||
|
||||
|
||||
def _load_configs(self, configs_path: str)->dict:
|
||||
with open(configs_path, 'r') as f:
|
||||
@ -169,7 +168,6 @@ class TTS_Config:
|
||||
"vits_weights_path" : self.vits_weights_path,
|
||||
"bert_base_path" : self.bert_base_path,
|
||||
"cnhuhbert_base_path": self.cnhuhbert_base_path,
|
||||
"flash_attn_enabled" : self.flash_attn_enabled
|
||||
}
|
||||
return self.config
|
||||
|
||||
@ -289,8 +287,7 @@ class TTS:
|
||||
dict_s1 = torch.load(weights_path, map_location=self.configs.device)
|
||||
config = dict_s1["config"]
|
||||
self.configs.max_sec = config["data"]["max_sec"]
|
||||
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False,
|
||||
flash_attn_enabled=self.configs.flash_attn_enabled)
|
||||
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
||||
t2s_model.load_state_dict(dict_s1["weight"])
|
||||
t2s_model = t2s_model.to(self.configs.device)
|
||||
t2s_model = t2s_model.eval()
|
||||
@ -435,8 +432,6 @@ class TTS:
|
||||
device:torch.device=torch.device("cpu"),
|
||||
precision:torch.dtype=torch.float32,
|
||||
):
|
||||
# 但是这里不能套,反而会负优化
|
||||
# with torch.no_grad():
|
||||
_data:list = []
|
||||
index_and_len_list = []
|
||||
for idx, item in enumerate(data):
|
||||
@ -484,8 +479,6 @@ class TTS:
|
||||
norm_text_batch = []
|
||||
bert_max_len = 0
|
||||
phones_max_len = 0
|
||||
# 但是这里也不能套,反而会负优化
|
||||
# with torch.no_grad():
|
||||
for item in item_list:
|
||||
if prompt_data is not None:
|
||||
all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\
|
||||
@ -518,11 +511,11 @@ class TTS:
|
||||
max_len = max(bert_max_len, phones_max_len)
|
||||
# phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len)
|
||||
#### 直接对phones和bert_features进行pad。(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略)
|
||||
all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
|
||||
all_bert_features_batch = all_bert_features_list
|
||||
all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precision, device=device)
|
||||
for idx, item in enumerate(all_bert_features_list):
|
||||
all_bert_features_batch[idx, :, : item.shape[-1]] = item
|
||||
# all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
|
||||
# all_bert_features_batch = all_bert_features_list
|
||||
# all_bert_features_batch = torch.zeros((len(all_bert_features_list), 1024, max_len), dtype=precision, device=device)
|
||||
# for idx, item in enumerate(all_bert_features_list):
|
||||
# all_bert_features_batch[idx, :, : item.shape[-1]] = item
|
||||
|
||||
# #### 先对phones进行embedding、对bert_features进行project,再pad到相同长度,(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略)
|
||||
# all_phones_list = [self.t2s_model.model.ar_text_embedding(item.to(self.t2s_model.device)) for item in all_phones_list]
|
||||
@ -539,7 +532,8 @@ class TTS:
|
||||
"all_phones": all_phones_batch,
|
||||
"all_phones_len": torch.LongTensor(all_phones_len_list).to(device),
|
||||
"all_bert_features": all_bert_features_batch,
|
||||
"norm_text": norm_text_batch
|
||||
"norm_text": norm_text_batch,
|
||||
"max_len": max_len,
|
||||
}
|
||||
_data.append(batch)
|
||||
|
||||
@ -569,7 +563,6 @@ class TTS:
|
||||
'''
|
||||
self.stop_flag = True
|
||||
|
||||
# 使用装饰器
|
||||
@torch.no_grad()
|
||||
def run(self, inputs:dict):
|
||||
"""
|
||||
@ -594,6 +587,8 @@ class TTS:
|
||||
"speed_factor":1.0, # float. control the speed of the synthesized audio.
|
||||
"fragment_interval":0.3, # float. to control the interval of the audio fragment.
|
||||
"seed": -1, # int. random seed for reproducibility.
|
||||
"parallel_infer": True, # bool. whether to use parallel inference.
|
||||
"repetition_penalty": 1.35 # float. repetition penalty for T2S model.
|
||||
}
|
||||
returns:
|
||||
tuple[int, np.ndarray]: sampling rate and audio data.
|
||||
@ -618,9 +613,17 @@ class TTS:
|
||||
seed = inputs.get("seed", -1)
|
||||
seed = -1 if seed in ["", None] else seed
|
||||
actual_seed = set_seed(seed)
|
||||
parallel_infer = inputs.get("parallel_infer", True)
|
||||
repetition_penalty = inputs.get("repetition_penalty", 1.35)
|
||||
|
||||
if parallel_infer:
|
||||
print(i18n("并行推理模式已开启"))
|
||||
self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_batch_infer_with_flash_attn
|
||||
else:
|
||||
print(i18n("并行推理模式已关闭"))
|
||||
self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_0307
|
||||
|
||||
if return_fragment:
|
||||
# split_bucket = False
|
||||
print(i18n("分段返回模式已开启"))
|
||||
if split_bucket:
|
||||
split_bucket = False
|
||||
@ -740,6 +743,7 @@ class TTS:
|
||||
all_phoneme_lens:torch.LongTensor = item["all_phones_len"]
|
||||
all_bert_features:torch.LongTensor = item["all_bert_features"]
|
||||
norm_text:str = item["norm_text"]
|
||||
max_len = item["max_len"]
|
||||
|
||||
print(i18n("前端处理后的文本(每句):"), norm_text)
|
||||
if no_prompt_text :
|
||||
@ -758,6 +762,8 @@ class TTS:
|
||||
top_p=top_p,
|
||||
temperature=temperature,
|
||||
early_stop_num=self.configs.hz * self.configs.max_sec,
|
||||
max_len=max_len,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
t4 = ttime()
|
||||
t_34 += t4 - t3
|
||||
|
@ -2,7 +2,6 @@ custom:
|
||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
device: cuda
|
||||
flash_attn_enabled: true
|
||||
is_half: true
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
|
||||
@ -10,7 +9,6 @@ default:
|
||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
device: cpu
|
||||
flash_attn_enabled: true
|
||||
is_half: false
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
|
||||
|
@ -93,8 +93,11 @@ def inference(text, text_lang,
|
||||
text_split_method, batch_size,
|
||||
speed_factor, ref_text_free,
|
||||
split_bucket,fragment_interval,
|
||||
seed,
|
||||
seed, keep_random, parallel_infer,
|
||||
repetition_penalty
|
||||
):
|
||||
|
||||
seed = -1 if keep_random else seed
|
||||
actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
|
||||
inputs={
|
||||
"text": text,
|
||||
@ -112,6 +115,8 @@ def inference(text, text_lang,
|
||||
"return_fragment":False,
|
||||
"fragment_interval":fragment_interval,
|
||||
"seed":actual_seed,
|
||||
"parallel_infer": parallel_infer,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
}
|
||||
for item in tts_pipeline.run(inputs):
|
||||
yield item, actual_seed
|
||||
@ -197,6 +202,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
|
||||
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
|
||||
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
|
||||
repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True)
|
||||
with gr.Column():
|
||||
how_to_cut = gr.Radio(
|
||||
label=i18n("怎么切"),
|
||||
@ -205,8 +211,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
interactive=True,
|
||||
)
|
||||
with gr.Row():
|
||||
split_bucket = gr.Checkbox(label=i18n("数据分桶(可能会降低一点计算量,选就对了)"), value=True, interactive=True, show_label=True)
|
||||
parallel_infer = gr.Checkbox(label=i18n("并行推理(速度更快,但可能增大复读概率)"), value=True, interactive=True, show_label=True)
|
||||
split_bucket = gr.Checkbox(label=i18n("数据分桶(并行推理时会降低一点计算量)"), value=True, interactive=True, show_label=True)
|
||||
seed = gr.Number(label=i18n("随机种子"),value=-1)
|
||||
keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True)
|
||||
|
||||
# with gr.Column():
|
||||
output = gr.Audio(label=i18n("输出的语音"))
|
||||
with gr.Row():
|
||||
@ -223,7 +232,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
how_to_cut, batch_size,
|
||||
speed_factor, ref_text_free,
|
||||
split_bucket,fragment_interval,
|
||||
seed
|
||||
seed, keep_random, parallel_infer,
|
||||
repetition_penalty
|
||||
],
|
||||
[output, seed],
|
||||
)
|
||||
|
GPT_SoVITS/text/cmudict.rep (232173 lines): file diff suppressed because it is too large.
@ -1 +1,2 @@
|
||||
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
|
||||
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
|
||||
JSON JH EY1 S AH0 N
|
Binary file not shown.
@ -1,18 +1,26 @@
|
||||
import pickle
|
||||
import os
|
||||
import re
|
||||
import wordsegment
|
||||
from g2p_en import G2p
|
||||
|
||||
from string import punctuation
|
||||
|
||||
from text import symbols
|
||||
|
||||
import unicodedata
|
||||
from builtins import str as unicode
|
||||
from g2p_en.expand import normalize_numbers
|
||||
from nltk.tokenize import TweetTokenizer
|
||||
word_tokenize = TweetTokenizer().tokenize
|
||||
from nltk import pos_tag
|
||||
|
||||
current_file_path = os.path.dirname(__file__)
|
||||
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
|
||||
CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
|
||||
CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
|
||||
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
|
||||
_g2p = G2p()
|
||||
NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
|
||||
|
||||
arpa = {
|
||||
"AH0",
|
||||
@ -90,7 +98,7 @@ arpa = {
|
||||
|
||||
|
||||
def replace_phs(phs):
|
||||
rep_map = {";": ",", ":": ",", "'": "-", '"': "-"}
|
||||
rep_map = {"'": "-"}
|
||||
phs_new = []
|
||||
for ph in phs:
|
||||
if ph in symbols:
|
||||
@ -112,7 +120,7 @@ def read_dict():
|
||||
if line_index >= start_line:
|
||||
line = line.strip()
|
||||
word_split = line.split(" ")
|
||||
word = word_split[0]
|
||||
word = word_split[0].lower()
|
||||
|
||||
syllable_split = word_split[1].split(" - ")
|
||||
g2p_dict[word] = []
|
||||
@ -132,16 +140,11 @@ def read_dict_new():
|
||||
line = f.readline()
|
||||
line_index = 1
|
||||
while line:
|
||||
if line_index >= 49:
|
||||
if line_index >= 57:
|
||||
line = line.strip()
|
||||
word_split = line.split(" ")
|
||||
word = word_split[0]
|
||||
|
||||
syllable_split = word_split[1].split(" - ")
|
||||
g2p_dict[word] = []
|
||||
for syllable in syllable_split:
|
||||
phone_split = syllable.split(" ")
|
||||
g2p_dict[word].append(phone_split)
|
||||
word = word_split[0].lower()
|
||||
g2p_dict[word] = [word_split[1].split(" ")]
|
||||
|
||||
line_index = line_index + 1
|
||||
line = f.readline()
|
||||
@ -153,14 +156,16 @@ def read_dict_new():
|
||||
if line_index >= 0:
|
||||
line = line.strip()
|
||||
word_split = line.split(" ")
|
||||
word = word_split[0]
|
||||
word = word_split[0].lower()
|
||||
if word not in g2p_dict:
|
||||
g2p_dict[word] = []
|
||||
g2p_dict[word].append(word_split[1:])
|
||||
g2p_dict[word] = [word_split[1:]]
|
||||
|
||||
line_index = line_index + 1
|
||||
line = f.readline()
|
||||
|
||||
return g2p_dict
|
||||
|
||||
def hot_reload_hot(g2p_dict):
|
||||
with open(CMU_DICT_HOT_PATH) as f:
|
||||
line = f.readline()
|
||||
line_index = 1
|
||||
@ -168,14 +173,13 @@ def read_dict_new():
|
||||
if line_index >= 0:
|
||||
line = line.strip()
|
||||
word_split = line.split(" ")
|
||||
word = word_split[0]
|
||||
#if word not in g2p_dict:
|
||||
g2p_dict[word] = []
|
||||
g2p_dict[word].append(word_split[1:])
|
||||
word = word_split[0].lower()
|
||||
# 自定义发音词直接覆盖字典
|
||||
g2p_dict[word] = [word_split[1:]]
|
||||
|
||||
line_index = line_index + 1
|
||||
line = f.readline()
|
||||
|
||||
|
||||
return g2p_dict
|
||||
|
||||
|
||||
@ -192,43 +196,167 @@ def get_dict():
|
||||
g2p_dict = read_dict_new()
|
||||
cache_dict(g2p_dict, CACHE_PATH)
|
||||
|
||||
g2p_dict = hot_reload_hot(g2p_dict)
|
||||
|
||||
return g2p_dict
|
||||
|
||||
|
||||
eng_dict = get_dict()
|
||||
def get_namedict():
|
||||
if os.path.exists(NAMECACHE_PATH):
|
||||
with open(NAMECACHE_PATH, "rb") as pickle_file:
|
||||
name_dict = pickle.load(pickle_file)
|
||||
else:
|
||||
name_dict = {}
|
||||
|
||||
return name_dict
|
||||
|
||||
|
||||
def text_normalize(text):
|
||||
# todo: eng text normalize
|
||||
return text.replace(";", ",")
|
||||
# 适配中文及 g2p_en 标点
|
||||
rep_map = {
|
||||
"[;::,;]": ",",
|
||||
'["’]': "'",
|
||||
"。": ".",
|
||||
"!": "!",
|
||||
"?": "?",
|
||||
}
|
||||
for p, r in rep_map.items():
|
||||
text = re.sub(p, r, text)
|
||||
|
||||
# 来自 g2p_en 文本格式化处理
|
||||
# 增加大写兼容
|
||||
text = unicode(text)
|
||||
text = normalize_numbers(text)
|
||||
text = ''.join(char for char in unicodedata.normalize('NFD', text)
|
||||
if unicodedata.category(char) != 'Mn') # Strip accents
|
||||
text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
|
||||
text = re.sub(r"(?i)i\.e\.", "that is", text)
|
||||
text = re.sub(r"(?i)e\.g\.", "for example", text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
class en_G2p(G2p):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# 分词初始化
|
||||
wordsegment.load()
|
||||
|
||||
# 扩展过时字典, 添加姓名字典
|
||||
self.cmu = get_dict()
|
||||
self.namedict = get_namedict()
|
||||
|
||||
# 剔除读音错误的几个缩写
|
||||
for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
|
||||
del self.cmu[word.lower()]
|
||||
|
||||
# 修正多音字
|
||||
self.homograph2features["read"] = (['R', 'IY1', 'D'], ['R', 'EH1', 'D'], 'VBP')
|
||||
self.homograph2features["complex"] = (['K', 'AH0', 'M', 'P', 'L', 'EH1', 'K', 'S'], ['K', 'AA1', 'M', 'P', 'L', 'EH0', 'K', 'S'], 'JJ')
|
||||
|
||||
|
||||
def __call__(self, text):
|
||||
# tokenization
|
||||
words = word_tokenize(text)
|
||||
tokens = pos_tag(words) # tuples of (word, tag)
|
||||
|
||||
# steps
|
||||
prons = []
|
||||
for o_word, pos in tokens:
|
||||
# 还原 g2p_en 小写操作逻辑
|
||||
word = o_word.lower()
|
||||
|
||||
if re.search("[a-z]", word) is None:
|
||||
pron = [word]
|
||||
# 先把单字母推出去
|
||||
elif len(word) == 1:
|
||||
# 单读 A 发音修正, 这里需要原格式 o_word 判断大写
|
||||
if o_word == "A":
|
||||
pron = ['EY1']
|
||||
else:
|
||||
pron = self.cmu[word][0]
|
||||
# g2p_en 原版多音字处理
|
||||
elif word in self.homograph2features: # Check homograph
|
||||
pron1, pron2, pos1 = self.homograph2features[word]
|
||||
if pos.startswith(pos1):
|
||||
pron = pron1
|
||||
# pos1比pos长仅出现在read
|
||||
elif len(pos) < len(pos1) and pos == pos1[:len(pos)]:
|
||||
pron = pron1
|
||||
else:
|
||||
pron = pron2
|
||||
else:
|
||||
# 递归查找预测
|
||||
pron = self.qryword(o_word)
|
||||
|
||||
prons.extend(pron)
|
||||
prons.extend([" "])
|
||||
|
||||
return prons[:-1]
|
||||
|
||||
|
||||
def qryword(self, o_word):
|
||||
word = o_word.lower()
|
||||
|
||||
# 查字典, 单字母除外
|
||||
if len(word) > 1 and word in self.cmu: # lookup CMU dict
|
||||
return self.cmu[word][0]
|
||||
|
||||
# 单词仅首字母大写时查找姓名字典
|
||||
if o_word.istitle() and word in self.namedict:
|
||||
return self.namedict[word][0]
|
||||
|
||||
# oov 长度小于等于 3 直接读字母
|
||||
if len(word) <= 3:
|
||||
phones = []
|
||||
for w in word:
|
||||
# 单读 A 发音修正, 此处不存在大写的情况
|
||||
if w == "a":
|
||||
phones.extend(['EY1'])
|
||||
else:
|
||||
phones.extend(self.cmu[w][0])
|
||||
return phones
|
||||
|
||||
# 尝试分离所有格
|
||||
if re.match(r"^([a-z]+)('s)$", word):
|
||||
phones = self.qryword(word[:-2])
|
||||
# P T K F TH HH 无声辅音结尾 's 发 ['S']
|
||||
if phones[-1] in ['P', 'T', 'K', 'F', 'TH', 'HH']:
|
||||
phones.extend(['S'])
|
||||
# S Z SH ZH CH JH 擦声结尾 's 发 ['IH1', 'Z'] 或 ['AH0', 'Z']
|
||||
elif phones[-1] in ['S', 'Z', 'SH', 'ZH', 'CH', 'JH']:
|
||||
phones.extend(['AH0', 'Z'])
|
||||
# B D G DH V M N NG L R W Y 有声辅音结尾 's 发 ['Z']
|
||||
# AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
|
||||
# ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2 元音结尾 's 发 ['Z']
|
||||
else:
|
||||
phones.extend(['Z'])
|
||||
return phones
|
||||
|
||||
# 尝试进行分词,应对复合词
|
||||
comps = wordsegment.segment(word.lower())
|
||||
|
||||
# 无法分词的送回去预测
|
||||
if len(comps)==1:
|
||||
return self.predict(word)
|
||||
|
||||
# 可以分词的递归处理
|
||||
return [phone for comp in comps for phone in self.qryword(comp)]
|
||||
|
||||
|
||||
_g2p = en_G2p()
|
||||
|
||||
|
||||
def g2p(text):
|
||||
phones = []
|
||||
words = re.split(r"([,;.\-\?\!\s+])", text)
|
||||
for w in words:
|
||||
if w.upper() in eng_dict:
|
||||
phns = eng_dict[w.upper()]
|
||||
for ph in phns:
|
||||
phones += ph
|
||||
else:
|
||||
phone_list = list(filter(lambda p: p != " ", _g2p(w)))
|
||||
for ph in phone_list:
|
||||
if ph in arpa:
|
||||
phones.append(ph)
|
||||
else:
|
||||
phones.append(ph)
|
||||
# g2p_en 整段推理,剔除不存在的arpa返回
|
||||
phone_list = _g2p(text)
|
||||
phones = [ph if ph != "<unk>" else "UNK" for ph in phone_list if ph not in [" ", "<pad>", "UW", "</s>", "<s>"]]
|
||||
|
||||
return replace_phs(phones)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# print(get_dict())
|
||||
print(g2p("hello"))
|
||||
print(g2p("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
|
||||
# all_phones = set()
|
||||
# for k, syllables in eng_dict.items():
|
||||
# for group in syllables:
|
||||
# for ph in group:
|
||||
# all_phones.add(ph)
|
||||
# print(all_phones)
|
||||
print(g2p(text_normalize("e.g. I used openai's AI tool to draw a picture.")))
|
||||
print(g2p(text_normalize("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")))
|
GPT_SoVITS/text/namedict_cache.pickle (new binary file, contents not shown).
@ -106,6 +106,29 @@ def replace_default_num(match):
|
||||
return verbalize_digit(number, alt_one=True)
|
||||
|
||||
|
||||
# 加减乘除
|
||||
RE_ASMD = re.compile(
|
||||
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
|
||||
asmd_map = {
|
||||
'+': '加',
|
||||
'-': '减',
|
||||
'×': '乘',
|
||||
'÷': '除',
|
||||
'=': '等于'
|
||||
}
|
||||
|
||||
|
||||
def replace_asmd(match) -> str:
|
||||
"""
|
||||
Args:
|
||||
match (re.Match)
|
||||
Returns:
|
||||
str
|
||||
"""
|
||||
result = match.group(1) + asmd_map[match.group(8)] + match.group(9)
|
||||
return result
|
||||
|
||||
|
||||
# 数字表达式
|
||||
# 纯小数
|
||||
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
|
||||
@ -155,7 +178,13 @@ def replace_number(match) -> str:
|
||||
# match.group(1) and match.group(8) are copy from RE_NUMBER
|
||||
|
||||
RE_RANGE = re.compile(
|
||||
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
|
||||
r"""
|
||||
(?<![\d\+\-\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
|
||||
((-?)((\d+)(\.\d+)?)) # 匹配范围起始的负数或正数(整数或小数)
|
||||
[-~] # 匹配范围分隔符
|
||||
((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
|
||||
(?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
|
||||
""", re.VERBOSE)
|
||||
|
||||
|
||||
def replace_range(match) -> str:
|
||||
@ -165,7 +194,7 @@ def replace_range(match) -> str:
|
||||
Returns:
|
||||
str
|
||||
"""
|
||||
first, second = match.group(1), match.group(8)
|
||||
first, second = match.group(1), match.group(6)
|
||||
first = RE_NUMBER.sub(replace_number, first)
|
||||
second = RE_NUMBER.sub(replace_number, second)
|
||||
result = f"{first}到{second}"
|
||||
|
@ -34,6 +34,7 @@ from .num import RE_PERCENTAGE
|
||||
from .num import RE_POSITIVE_QUANTIFIERS
|
||||
from .num import RE_RANGE
|
||||
from .num import RE_TO_RANGE
|
||||
from .num import RE_ASMD
|
||||
from .num import replace_default_num
|
||||
from .num import replace_frac
|
||||
from .num import replace_negative_num
|
||||
@ -42,6 +43,7 @@ from .num import replace_percentage
|
||||
from .num import replace_positive_quantifier
|
||||
from .num import replace_range
|
||||
from .num import replace_to_range
|
||||
from .num import replace_asmd
|
||||
from .phonecode import RE_MOBILE_PHONE
|
||||
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
|
||||
from .phonecode import RE_TELEPHONE
|
||||
@ -67,7 +69,7 @@ class TextNormalizer():
|
||||
if lang == "zh":
|
||||
text = text.replace(" ", "")
|
||||
# 过滤掉特殊字符
|
||||
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|\\]', '', text)
|
||||
text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text)
|
||||
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
|
||||
text = text.strip()
|
||||
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
|
||||
@ -142,6 +144,11 @@ class TextNormalizer():
|
||||
sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
|
||||
|
||||
sentence = RE_RANGE.sub(replace_range, sentence)
|
||||
|
||||
# 处理加减乘除
|
||||
while RE_ASMD.search(sentence):
|
||||
sentence = RE_ASMD.sub(replace_asmd, sentence)
|
||||
|
||||
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
|
||||
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
|
||||
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
|
||||
|
README.md (29 changed lines)
@ -8,8 +8,10 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
||||
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
||||
|
||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
|
||||
[](https://discord.gg/dnrgs5GHfG)
|
||||
|
||||
|
||||
[**English**](./README.md) | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md)
|
||||
|
||||
@ -43,7 +45,8 @@ For users in China region, you can [click here](https://www.codewithgpu.com/i/RV
|
||||
|
||||
- Python 3.9, PyTorch 2.0.1, CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
|
||||
- Python 3.9, PyTorch 2.3.0.dev20240122, macOS 14.3 (Apple silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, CPU devices
|
||||
|
||||
_Note: numba==0.56.4 requires py<3.11_
|
||||
|
||||
@ -51,6 +54,10 @@ _Note: numba==0.56.4 requires py<3.11_
|
||||
|
||||
If you are a Windows user (tested with win>=10), you can directly download the [pre-packaged distribution](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
|
||||
|
||||
Users in China region can download [the 0217 package](https://www.icloud.com.cn/iclouddrive/061bfkcVJcBfsMfLF5R2XKdTQ#GPT-SoVITS-beta0217) or [the 0306fix2 package](https://www.icloud.com.cn/iclouddrive/09aaTLf96aa92dbLe0fPNM5CQ#GPT-SoVITS-beta0306fix2) by clicking the links and then selecting "Download a copy."
|
||||
|
||||
_Note: The 0306fix2 version doubles the inference speed and fixes all issues with the no reference text mode._
|
||||
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
@ -63,7 +70,9 @@ bash install.sh
|
||||
|
||||
**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.**
|
||||
|
||||
First make sure you have installed FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`, then install by using the following commands:
|
||||
1. Install Xcode command-line tools by running `xcode-select --install`
|
||||
2. Install FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`.
|
||||
3. Install the program by running the following commands:
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
@ -139,7 +148,15 @@ Users in China region can download these two models by entering the links below
|
||||
|
||||
- [UVR5 Weights](https://www.icloud.com.cn/iclouddrive/0bekRKDiJXboFhbfm3lM2fVbA#UVR5_Weights)
|
||||
|
||||
For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/damo_asr/models`.
|
||||
For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
|
||||
|
||||
For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have a similar effect with a smaller disk footprint.
|
||||
|
||||
Users in China region can download this model by entering the links below
|
||||
|
||||
- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/0c4pQxFs7oWyVU1iMTq2DbmLA#faster-whisper-large-v3) (clicking "Download a copy")
|
||||
|
||||
- [Faster Whisper Large V3](https://hf-mirror.com/Systran/faster-whisper-large-v3) (HuggingFace mirror site)
|
||||
|
||||
## Dataset Format
|
||||
|
||||
@ -202,13 +219,13 @@ python audio_slicer.py \
|
||||
```
|
||||
This is how dataset ASR processing is done using the command line(Only Chinese)
|
||||
```
|
||||
python tools/damo_asr/cmd-asr.py "<Path to the directory containing input audio files>"
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
|
||||
|
||||
(No progress bars, GPU performance may cause time delays)
|
||||
```
|
||||
python ./tools/damo_asr/WhisperASR.py -i <input> -o <output> -f <file_name.list> -l <language>
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language>
|
||||
```
|
||||
A custom list save path is enabled
|
||||
|
||||
|
api.py (515 changed lines)
@ -18,6 +18,9 @@
|
||||
`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`
|
||||
`-fp` - `覆盖 config.py 使用全精度`
|
||||
`-hp` - `覆盖 config.py 使用半精度`
|
||||
`-sm` - `流式返回模式, 默认不启用, "close","c", "normal","n", "keepalive","k"`
|
||||
`-mt` - `返回的音频编码格式, 流式默认ogg, 非流式默认wav, "wav", "ogg", "aac"`
|
||||
`-cp` - `文本切分符号设定, 默认为空, 以",.,。"字符串的方式传入`
|
||||
|
||||
`-hb` - `cnhubert路径`
|
||||
`-b` - `bert路径`
|
||||
@ -39,6 +42,18 @@ POST:
|
||||
}
|
||||
```
|
||||
|
||||
使用执行参数指定的参考音频并设定分割符号:
|
||||
GET:
|
||||
`http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。`
|
||||
POST:
|
||||
```json
|
||||
{
|
||||
"text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
|
||||
"text_language": "zh",
|
||||
"cut_punc": ",。",
|
||||
}
|
||||
```
|
||||
|
||||
手动指定当次推理所使用的参考音频:
|
||||
GET:
|
||||
`http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
|
||||
@ -103,14 +118,10 @@ RESP: 无
|
||||
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import os,re
|
||||
import sys
|
||||
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
||||
|
||||
import signal
|
||||
import LangSegment
|
||||
from time import time as ttime
|
||||
import torch
|
||||
import librosa
|
||||
@ -129,35 +140,8 @@ from text.cleaner import clean_text
|
||||
from module.mel_processing import spectrogram_torch
|
||||
from my_utils import load_audio
|
||||
import config as global_config
|
||||
|
||||
g_config = global_config.Config()
|
||||
|
||||
# AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
parser = argparse.ArgumentParser(description="GPT-SoVITS api")
|
||||
|
||||
parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
|
||||
parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
|
||||
|
||||
parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径")
|
||||
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
|
||||
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
|
||||
|
||||
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
|
||||
parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
|
||||
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
|
||||
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
|
||||
parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
|
||||
# bool值的用法为 `python ./api.py -fp ...`
|
||||
# 此时 full_precision==True, half_precision==False
|
||||
|
||||
parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
|
||||
parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
sovits_path = args.sovits_path
|
||||
gpt_path = args.gpt_path
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
|
||||
class DefaultRefer:
|
||||
@ -170,50 +154,6 @@ class DefaultRefer:
|
||||
return is_full(self.path, self.text, self.language)
|
||||
|
||||
|
||||
default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
|
||||
|
||||
device = args.device
|
||||
port = args.port
|
||||
host = args.bind_addr
|
||||
|
||||
if sovits_path == "":
|
||||
sovits_path = g_config.pretrained_sovits_path
|
||||
print(f"[WARN] 未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
|
||||
if gpt_path == "":
|
||||
gpt_path = g_config.pretrained_gpt_path
|
||||
print(f"[WARN] 未指定GPT模型路径, fallback后当前值: {gpt_path}")
|
||||
|
||||
# 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用
|
||||
if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
|
||||
default_refer.path, default_refer.text, default_refer.language = "", "", ""
|
||||
print("[INFO] 未指定默认参考音频")
|
||||
else:
|
||||
print(f"[INFO] 默认参考音频路径: {default_refer.path}")
|
||||
print(f"[INFO] 默认参考音频文本: {default_refer.text}")
|
||||
print(f"[INFO] 默认参考音频语种: {default_refer.language}")
|
||||
|
||||
is_half = g_config.is_half
|
||||
if args.full_precision:
|
||||
is_half = False
|
||||
if args.half_precision:
|
||||
is_half = True
|
||||
if args.full_precision and args.half_precision:
|
||||
is_half = g_config.is_half # 炒饭fallback
|
||||
|
||||
print(f"[INFO] 半精: {is_half}")
|
||||
|
||||
cnhubert_base_path = args.hubert_path
|
||||
bert_path = args.bert_path
|
||||
|
||||
cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||
tokenizer = AutoTokenizer.from_pretrained(bert_path)
|
||||
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
|
||||
if is_half:
|
||||
bert_model = bert_model.half().to(device)
|
||||
else:
|
||||
bert_model = bert_model.to(device)
|
||||
|
||||
|
||||
def is_empty(*items): # 任意一项不为空返回False
|
||||
for item in items:
|
||||
if item is not None and item != "":
|
||||
@ -227,6 +167,7 @@ def is_full(*items): # 任意一项为空返回False
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def change_sovits_weights(sovits_path):
|
||||
global vq_model, hps
|
||||
dict_s2 = torch.load(sovits_path, map_location="cpu")
|
||||
@ -246,9 +187,9 @@ def change_sovits_weights(sovits_path):
|
||||
else:
|
||||
vq_model = vq_model.to(device)
|
||||
vq_model.eval()
|
||||
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
||||
with open("./sweight.txt", "w", encoding="utf-8") as f:
|
||||
f.write(sovits_path)
|
||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
|
||||
|
||||
def change_gpt_weights(gpt_path):
|
||||
global hz, max_sec, t2s_model, config
|
||||
hz = 50
|
||||
@ -262,8 +203,7 @@ def change_gpt_weights(gpt_path):
|
||||
t2s_model = t2s_model.to(device)
|
||||
t2s_model.eval()
|
||||
total = sum([param.nelement() for param in t2s_model.parameters()])
|
||||
print("Number of parameter: %.2fM" % (total / 1e6))
|
||||
with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path)
|
||||
logger.info("Number of parameter: %.2fM" % (total / 1e6))
|
||||
|
||||
|
||||
def get_bert_feature(text, word2ph):
|
||||
@ -283,9 +223,81 @@ def get_bert_feature(text, word2ph):
|
||||
return phone_level_feature.T
|
||||
|
||||
|
||||
n_semantic = 1024
|
||||
dict_s2 = torch.load(sovits_path, map_location="cpu")
|
||||
hps = dict_s2["config"]
|
||||
def clean_text_inf(text, language):
|
||||
phones, word2ph, norm_text = clean_text(text, language)
|
||||
phones = cleaned_text_to_sequence(phones)
|
||||
return phones, word2ph, norm_text
|
||||
|
||||
|
||||
def get_bert_inf(phones, word2ph, norm_text, language):
|
||||
language=language.replace("all_","")
|
||||
if language == "zh":
|
||||
bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
|
||||
else:
|
||||
bert = torch.zeros(
|
||||
(1024, len(phones)),
|
||||
dtype=torch.float16 if is_half == True else torch.float32,
|
||||
).to(device)
|
||||
|
||||
return bert
|
||||
|
||||
|
||||
def get_phones_and_bert(text,language):
|
||||
if language in {"en","all_zh","all_ja"}:
|
||||
language = language.replace("all_","")
|
||||
if language == "en":
|
||||
LangSegment.setfilters(["en"])
|
||||
formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
|
||||
else:
|
||||
# 因无法区别中日文汉字,以用户输入为准
|
||||
formattext = text
|
||||
while " " in formattext:
|
||||
formattext = formattext.replace(" ", " ")
|
||||
phones, word2ph, norm_text = clean_text_inf(formattext, language)
|
||||
if language == "zh":
|
||||
bert = get_bert_feature(norm_text, word2ph).to(device)
|
||||
else:
|
||||
bert = torch.zeros(
|
||||
(1024, len(phones)),
|
||||
dtype=torch.float16 if is_half == True else torch.float32,
|
||||
).to(device)
|
||||
elif language in {"zh", "ja","auto"}:
|
||||
textlist=[]
|
||||
langlist=[]
|
||||
LangSegment.setfilters(["zh","ja","en","ko"])
|
||||
if language == "auto":
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
if tmp["lang"] == "ko":
|
||||
langlist.append("zh")
|
||||
textlist.append(tmp["text"])
|
||||
else:
|
||||
langlist.append(tmp["lang"])
|
||||
textlist.append(tmp["text"])
|
||||
else:
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
if tmp["lang"] == "en":
|
||||
langlist.append(tmp["lang"])
|
||||
else:
|
||||
# 因无法区别中日文汉字,以用户输入为准
|
||||
langlist.append(language)
|
||||
textlist.append(tmp["text"])
|
||||
# logger.info(textlist)
|
||||
# logger.info(langlist)
|
||||
phones_list = []
|
||||
bert_list = []
|
||||
norm_text_list = []
|
||||
for i in range(len(textlist)):
|
||||
lang = langlist[i]
|
||||
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
|
||||
bert = get_bert_inf(phones, word2ph, norm_text, lang)
|
||||
phones_list.append(phones)
|
||||
norm_text_list.append(norm_text)
|
||||
bert_list.append(bert)
|
||||
bert = torch.cat(bert_list, dim=1)
|
||||
phones = sum(phones_list, [])
|
||||
norm_text = ''.join(norm_text_list)
|
||||
|
||||
return phones,bert.to(torch.float16 if is_half == True else torch.float32),norm_text
|
||||
|
||||
|
||||
class DictToAttrRecursive:
|
||||
@ -298,39 +310,6 @@ class DictToAttrRecursive:
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
hps = DictToAttrRecursive(hps)
|
||||
hps.model.semantic_frame_rate = "25hz"
|
||||
dict_s1 = torch.load(gpt_path, map_location="cpu")
|
||||
config = dict_s1["config"]
|
||||
ssl_model = cnhubert.get_model()
|
||||
if is_half:
|
||||
ssl_model = ssl_model.half().to(device)
|
||||
else:
|
||||
ssl_model = ssl_model.to(device)
|
||||
|
||||
vq_model = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**hps.model)
|
||||
if is_half:
|
||||
vq_model = vq_model.half().to(device)
|
||||
else:
|
||||
vq_model = vq_model.to(device)
|
||||
vq_model.eval()
|
||||
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
||||
hz = 50
|
||||
max_sec = config['data']['max_sec']
|
||||
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
||||
t2s_model.load_state_dict(dict_s1["weight"])
|
||||
if is_half:
|
||||
t2s_model = t2s_model.half()
|
||||
t2s_model = t2s_model.to(device)
|
||||
t2s_model.eval()
|
||||
total = sum([param.nelement() for param in t2s_model.parameters()])
|
||||
print("Number of parameter: %.2fM" % (total / 1e6))
|
||||
|
||||
|
||||
def get_spepc(hps, filename):
|
||||
audio = load_audio(filename, int(hps.data.sampling_rate))
|
||||
audio = torch.FloatTensor(audio)
|
||||
@ -341,17 +320,86 @@ def get_spepc(hps, filename):
|
||||
return spec
|
||||
|
||||
|
||||
dict_language = {
|
||||
"中文": "zh",
|
||||
"英文": "en",
|
||||
"日文": "ja",
|
||||
"ZH": "zh",
|
||||
"EN": "en",
|
||||
"JA": "ja",
|
||||
"zh": "zh",
|
||||
"en": "en",
|
||||
"ja": "ja"
|
||||
}
|
||||
def pack_audio(audio_bytes, data, rate):
|
||||
if media_type == "ogg":
|
||||
audio_bytes = pack_ogg(audio_bytes, data, rate)
|
||||
elif media_type == "aac":
|
||||
audio_bytes = pack_aac(audio_bytes, data, rate)
|
||||
else:
|
||||
# wav无法流式, 先暂存raw
|
||||
audio_bytes = pack_raw(audio_bytes, data, rate)
|
||||
|
||||
return audio_bytes
|
||||
|
||||
|
||||
def pack_ogg(audio_bytes, data, rate):
|
||||
with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
|
||||
audio_file.write(data)
|
||||
|
||||
return audio_bytes
|
||||
|
||||
|
||||
def pack_raw(audio_bytes, data, rate):
|
||||
audio_bytes.write(data.tobytes())
|
||||
|
||||
return audio_bytes
|
||||
|
||||
|
||||
def pack_wav(audio_bytes, rate):
|
||||
data = np.frombuffer(audio_bytes.getvalue(),dtype=np.int16)
|
||||
wav_bytes = BytesIO()
|
||||
sf.write(wav_bytes, data, rate, format='wav')
|
||||
|
||||
return wav_bytes
|
||||
|
||||
|
||||
def pack_aac(audio_bytes, data, rate):
|
||||
process = subprocess.Popen([
|
||||
'ffmpeg',
|
||||
'-f', 's16le', # 输入16位有符号小端整数PCM
|
||||
'-ar', str(rate), # 设置采样率
|
||||
'-ac', '1', # 单声道
|
||||
'-i', 'pipe:0', # 从管道读取输入
|
||||
'-c:a', 'aac', # 音频编码器为AAC
|
||||
'-b:a', '192k', # 比特率
|
||||
'-vn', # 不包含视频
|
||||
'-f', 'adts', # 输出AAC数据流格式
|
||||
'pipe:1' # 将输出写入管道
|
||||
], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
out, _ = process.communicate(input=data.tobytes())
|
||||
audio_bytes.write(out)
|
||||
|
||||
return audio_bytes
|
||||
|
||||
|
||||
def read_clean_buffer(audio_bytes):
|
||||
audio_chunk = audio_bytes.getvalue()
|
||||
audio_bytes.truncate(0)
|
||||
audio_bytes.seek(0)
|
||||
|
||||
return audio_bytes, audio_chunk
|
||||
|
||||
|
||||
def cut_text(text, punc):
|
||||
punc_list = [p for p in punc if p in {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}]
|
||||
if len(punc_list) > 0:
|
||||
punds = r"[" + "".join(punc_list) + r"]"
|
||||
text = text.strip("\n")
|
||||
items = re.split(f"({punds})", text)
|
||||
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
|
||||
# 在句子不存在符号或句尾无符号的时候保证文本完整
|
||||
if len(items)%2 == 1:
|
||||
mergeitems.append(items[-1])
|
||||
text = "\n".join(mergeitems)
|
||||
|
||||
while "\n\n" in text:
|
||||
text = text.replace("\n\n", "\n")
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def only_punc(text):
|
||||
return not any(t.isalnum() or t.isalpha() for t in text)
|
||||
|
||||
|
||||
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
|
||||
@ -374,25 +422,19 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
|
||||
codes = vq_model.extract_latent(ssl_content)
|
||||
prompt_semantic = codes[0, 0]
|
||||
t1 = ttime()
|
||||
prompt_language = dict_language[prompt_language]
|
||||
text_language = dict_language[text_language]
|
||||
phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
|
||||
phones1 = cleaned_text_to_sequence(phones1)
|
||||
prompt_language = dict_language[prompt_language.lower()]
|
||||
text_language = dict_language[text_language.lower()]
|
||||
phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
|
||||
texts = text.split("\n")
|
||||
audio_opt = []
|
||||
audio_bytes = BytesIO()
|
||||
|
||||
for text in texts:
|
||||
phones2, word2ph2, norm_text2 = clean_text(text, text_language)
|
||||
phones2 = cleaned_text_to_sequence(phones2)
|
||||
if (prompt_language == "zh"):
|
||||
bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
|
||||
else:
|
||||
bert1 = torch.zeros((1024, len(phones1)), dtype=torch.float16 if is_half == True else torch.float32).to(
|
||||
device)
|
||||
if (text_language == "zh"):
|
||||
bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
|
||||
else:
|
||||
bert2 = torch.zeros((1024, len(phones2))).to(bert1)
|
||||
# 简单防止纯符号引发参考音频泄露
|
||||
if only_punc(text):
|
||||
continue
|
||||
|
||||
audio_opt = []
|
||||
phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
|
||||
bert = torch.cat([bert1, bert2], 1)
|
||||
|
||||
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
|
||||
@ -426,8 +468,17 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
|
||||
audio_opt.append(audio)
|
||||
audio_opt.append(zero_wav)
|
||||
t4 = ttime()
|
||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
|
||||
yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
|
||||
audio_bytes = pack_audio(audio_bytes,(np.concatenate(audio_opt, 0) * 32768).astype(np.int16),hps.data.sampling_rate)
|
||||
# logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
|
||||
if stream_mode == "normal":
|
||||
audio_bytes, audio_chunk = read_clean_buffer(audio_bytes)
|
||||
yield audio_chunk
|
||||
|
||||
if not stream_mode == "normal":
|
||||
if media_type == "wav":
|
||||
audio_bytes = pack_wav(audio_bytes,hps.data.sampling_rate)
|
||||
yield audio_bytes.getvalue()
|
||||
|
||||
|
||||
|
||||
def handle_control(command):
|
||||
@ -449,15 +500,16 @@ def handle_change(path, text, language):
|
||||
if language != "" or language is not None:
|
||||
default_refer.language = language
|
||||
|
||||
print(f"[INFO] 当前默认参考音频路径: {default_refer.path}")
|
||||
print(f"[INFO] 当前默认参考音频文本: {default_refer.text}")
|
||||
print(f"[INFO] 当前默认参考音频语种: {default_refer.language}")
|
||||
print(f"[INFO] is_ready: {default_refer.is_ready()}")
|
||||
logger.info(f"当前默认参考音频路径: {default_refer.path}")
|
||||
logger.info(f"当前默认参考音频文本: {default_refer.text}")
|
||||
logger.info(f"当前默认参考音频语种: {default_refer.language}")
|
||||
logger.info(f"is_ready: {default_refer.is_ready()}")
|
||||
|
||||
|
||||
return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
|
||||
|
||||
|
||||
def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
|
||||
def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc):
|
||||
if (
|
||||
refer_wav_path == "" or refer_wav_path is None
|
||||
or prompt_text == "" or prompt_text is None
|
||||
@ -471,24 +523,145 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
|
||||
if not default_refer.is_ready():
|
||||
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
|
||||
|
||||
with torch.no_grad():
|
||||
gen = get_tts_wav(
|
||||
refer_wav_path, prompt_text, prompt_language, text, text_language
|
||||
)
|
||||
sampling_rate, audio_data = next(gen)
|
||||
if cut_punc == None:
|
||||
text = cut_text(text,default_cut_punc)
|
||||
else:
|
||||
text = cut_text(text,cut_punc)
|
||||
|
||||
wav = BytesIO()
|
||||
sf.write(wav, audio_data, sampling_rate, format="wav")
|
||||
wav.seek(0)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
return StreamingResponse(wav, media_type="audio/wav")
|
||||
return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language), media_type="audio/"+media_type)
|
||||
|
||||
|
||||
|
||||
|
||||
# --------------------------------
|
||||
# 初始化部分
|
||||
# --------------------------------
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
||||
|
||||
dict_language = {
|
||||
"中文": "all_zh",
|
||||
"英文": "en",
|
||||
"日文": "all_ja",
|
||||
"中英混合": "zh",
|
||||
"日英混合": "ja",
|
||||
"多语种混合": "auto", #多语种启动切分识别语种
|
||||
"all_zh": "all_zh",
|
||||
"en": "en",
|
||||
"all_ja": "all_ja",
|
||||
"zh": "zh",
|
||||
"ja": "ja",
|
||||
"auto": "auto",
|
||||
}
|
||||
|
||||
# logger
|
||||
logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG)
|
||||
logger = logging.getLogger('uvicorn')
|
||||
|
||||
# 获取配置
|
||||
g_config = global_config.Config()
|
||||
|
||||
# 获取参数
|
||||
parser = argparse.ArgumentParser(description="GPT-SoVITS api")
|
||||
|
||||
parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
|
||||
parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
|
||||
parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径")
|
||||
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
|
||||
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
|
||||
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
|
||||
parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
|
||||
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
|
||||
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
|
||||
parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
|
||||
# bool值的用法为 `python ./api.py -fp ...`
|
||||
# 此时 full_precision==True, half_precision==False
|
||||
parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="流式返回模式, close / normal / keepalive")
|
||||
parser.add_argument("-mt", "--media_type", type=str, default="wav", help="音频编码格式, wav / ogg / aac")
|
||||
parser.add_argument("-cp", "--cut_punc", type=str, default="", help="文本切分符号设定, 符号范围,.;?!、,。?!;:…")
|
||||
# 切割常用分句符为 `python ./api.py -cp ".?!。?!"`
|
||||
parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
|
||||
parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
|
||||
|
||||
args = parser.parse_args()
|
||||
sovits_path = args.sovits_path
|
||||
gpt_path = args.gpt_path
|
||||
device = args.device
|
||||
port = args.port
|
||||
host = args.bind_addr
|
||||
cnhubert_base_path = args.hubert_path
|
||||
bert_path = args.bert_path
|
||||
default_cut_punc = args.cut_punc
|
||||
|
||||
# 应用参数配置
|
||||
default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
|
||||
|
||||
# 模型路径检查
|
||||
if sovits_path == "":
|
||||
sovits_path = g_config.pretrained_sovits_path
|
||||
logger.warn(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
|
||||
if gpt_path == "":
|
||||
gpt_path = g_config.pretrained_gpt_path
|
||||
logger.warn(f"未指定GPT模型路径, fallback后当前值: {gpt_path}")
|
||||
|
||||
# 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用
|
||||
if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
|
||||
default_refer.path, default_refer.text, default_refer.language = "", "", ""
|
||||
logger.info("未指定默认参考音频")
|
||||
else:
|
||||
logger.info(f"默认参考音频路径: {default_refer.path}")
|
||||
logger.info(f"默认参考音频文本: {default_refer.text}")
|
||||
logger.info(f"默认参考音频语种: {default_refer.language}")
|
||||
|
||||
# 获取半精度
|
||||
is_half = g_config.is_half
|
||||
if args.full_precision:
|
||||
is_half = False
|
||||
if args.half_precision:
|
||||
is_half = True
|
||||
if args.full_precision and args.half_precision:
|
||||
is_half = g_config.is_half # 炒饭fallback
|
||||
logger.info(f"半精: {is_half}")
|
||||
|
||||
# 流式返回模式
|
||||
if args.stream_mode.lower() in ["normal","n"]:
|
||||
stream_mode = "normal"
|
||||
logger.info("流式返回已开启")
|
||||
else:
|
||||
stream_mode = "close"
|
||||
|
||||
# 音频编码格式
|
||||
if args.media_type.lower() in ["aac","ogg"]:
|
||||
media_type = args.media_type.lower()
|
||||
elif stream_mode == "close":
|
||||
media_type = "wav"
|
||||
else:
|
||||
media_type = "ogg"
|
||||
logger.info(f"编码格式: {media_type}")
|
||||
|
||||
# 初始化模型
|
||||
cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||
tokenizer = AutoTokenizer.from_pretrained(bert_path)
|
||||
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
|
||||
ssl_model = cnhubert.get_model()
|
||||
if is_half:
|
||||
bert_model = bert_model.half().to(device)
|
||||
ssl_model = ssl_model.half().to(device)
|
||||
else:
|
||||
bert_model = bert_model.to(device)
|
||||
ssl_model = ssl_model.to(device)
|
||||
change_sovits_weights(sovits_path)
|
||||
change_gpt_weights(gpt_path)
|
||||
|
||||
|
||||
|
||||
|
||||
# --------------------------------
|
||||
# 接口部分
|
||||
# --------------------------------
|
||||
app = FastAPI()
|
||||
|
||||
#clark新增-----2024-02-21
|
||||
#可在启动后动态修改模型,以此满足同一个api不同的朗读者请求
|
||||
@app.post("/set_model")
|
||||
async def set_model(request: Request):
|
||||
json_post_raw = await request.json()
|
||||
@ -496,11 +669,11 @@ async def set_model(request: Request):
|
||||
gpt_path=json_post_raw.get("gpt_model_path")
|
||||
global sovits_path
|
||||
sovits_path=json_post_raw.get("sovits_model_path")
|
||||
print("gptpath"+gpt_path+";vitspath"+sovits_path)
|
||||
logger.info("gptpath"+gpt_path+";vitspath"+sovits_path)
|
||||
change_sovits_weights(sovits_path)
|
||||
change_gpt_weights(gpt_path)
|
||||
return "ok"
|
||||
# 新增-----end------
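For reference, a rough client-side sketch of switching models at runtime through this endpoint (the checkpoint paths below are placeholders, not files shipped with the repository):

```python
import requests

# hypothetical paths: point these at your own fine-tuned GPT and SoVITS checkpoints
requests.post(
    "http://127.0.0.1:9880/set_model",
    json={
        "gpt_model_path": "GPT_weights/my_speaker-e15.ckpt",
        "sovits_model_path": "SoVITS_weights/my_speaker_e8_s800.pth",
    },
)
```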
|
||||
|
||||
|
||||
@app.post("/control")
|
||||
async def control(request: Request):
|
||||
@ -541,6 +714,7 @@ async def tts_endpoint(request: Request):
|
||||
json_post_raw.get("prompt_language"),
|
||||
json_post_raw.get("text"),
|
||||
json_post_raw.get("text_language"),
|
||||
json_post_raw.get("cut_punc"),
|
||||
)
|
||||
|
||||
|
||||
@ -551,8 +725,9 @@ async def tts_endpoint(
|
||||
prompt_language: str = None,
|
||||
text: str = None,
|
||||
text_language: str = None,
|
||||
cut_punc: str = None,
|
||||
):
|
||||
return handle(refer_wav_path, prompt_text, prompt_language, text, text_language)
|
||||
return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
api_v2.py
@ -22,7 +22,7 @@ POST:
|
||||
```json
{
"text": "", # str.(required) text to be synthesized
"text_lang": "", # str.(required) language of the text to be synthesized
"ref_audio_path": "", # str.(required) reference audio path.
"prompt_text": "", # str.(optional) prompt text for the reference audio
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
@ -32,12 +32,14 @@ POST:
"text_split_method": "cut5", # str.(optional) text split method, see text_segmentation_method.py for details.
"batch_size": 1, # int.(optional) batch size for inference
"batch_threshold": 0.75, # float.(optional) threshold for batch splitting.
"split_bucket": true, # bool.(optional) whether to split the batch into multiple buckets.
"speed_factor": 1.0, # float.(optional) control the speed of the synthesized audio.
"fragment_interval": 0.3, # float.(optional) to control the interval of the audio fragments.
"seed": -1, # int.(optional) random seed for reproducibility.
"media_type": "wav", # str.(optional) media type of the output audio, support "wav", "raw", "ogg", "aac".
"streaming_mode": false, # bool.(optional) whether to return a streaming response.
"parallel_infer": true, # bool.(optional) whether to use parallel inference.
"repetition_penalty": 1.35 # float.(optional) repetition penalty for the T2S model.
}
```
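A similar sketch for the v2 API, exercising the new `parallel_infer` and `repetition_penalty` fields (the `/tts` route and port 9880 are assumed defaults; the reference-audio path is a placeholder):

```python
import requests

req = {
    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
    "text_lang": "zh",
    "ref_audio_path": "ref/sample.wav",  # hypothetical reference audio
    "prompt_text": "一二三。",
    "prompt_lang": "zh",
    "media_type": "wav",
    "streaming_mode": False,
    "parallel_infer": True,       # disable to use serial (non-parallel) inference
    "repetition_penalty": 1.35,
}
resp = requests.post("http://127.0.0.1:9880/tts", json=req)
with open("output.wav", "wb") as f:
    f.write(resp.content)
```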
|
||||
|
||||
@ -159,6 +161,8 @@ class TTS_Request(BaseModel):
|
||||
seed:int = -1
|
||||
media_type:str = "wav"
|
||||
streaming_mode:bool = False
|
||||
parallel_infer:bool = True
|
||||
repetition_penalty:float = 1.35
|
||||
|
||||
### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
|
||||
def pack_ogg(io_buffer:BytesIO, data:np.ndarray, rate:int):
|
||||
@ -287,6 +291,8 @@ async def tts_handle(req:dict):
|
||||
"seed": -1, # int. random seed for reproducibility.
|
||||
"media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac".
|
||||
"streaming_mode": False, # bool. whether to return a streaming response.
|
||||
"parallel_infer": True, # bool.(optional) whether to use parallel inference.
|
||||
"repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model.
|
||||
}
|
||||
returns:
|
||||
StreamingResponse: audio stream response.
|
||||
@ -354,6 +360,8 @@ async def tts_get_endpoint(
|
||||
seed:int = -1,
|
||||
media_type:str = "wav",
|
||||
streaming_mode:bool = False,
|
||||
parallel_infer:bool = True,
|
||||
repetition_penalty:float = 1.35
|
||||
):
|
||||
req = {
|
||||
"text": text,
|
||||
@ -373,6 +381,8 @@ async def tts_get_endpoint(
|
||||
"seed":seed,
|
||||
"media_type":media_type,
|
||||
"streaming_mode":streaming_mode,
|
||||
"parallel_infer":parallel_infer,
|
||||
"repetition_penalty":float(repetition_penalty)
|
||||
}
|
||||
return await tts_handle(req)
|
||||
|
||||
|
@ -67,6 +67,7 @@
|
||||
"!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
|
||||
"# @title UVR5 pretrains 安装uvr5模型\n",
|
||||
"%cd /content/GPT-SoVITS/tools/uvr5\n",
|
||||
"%rm -r uvr5_weights\n",
|
||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||
"!git config core.sparseCheckout true\n",
|
||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||
|
@ -8,7 +8,7 @@
|
||||
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
||||
|
||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
|
||||
|
||||
[**English**](../../README.md) | [**中文简体**](./README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md)
|
||||
@ -41,16 +41,21 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
### 测试通过的环境
|
||||
|
||||
- Python 3.9、PyTorch 2.0.1 和 CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2 和 CUDA 12.3
|
||||
- Python 3.9、Pytorch 2.3.0.dev20240122 和 macOS 14.3(Apple 芯片)
|
||||
- Python 3.9,PyTorch 2.0.1,CUDA 11
|
||||
- Python 3.10.13,PyTorch 2.1.2,CUDA 12.3
|
||||
- Python 3.9,PyTorch 2.2.2,macOS 14.4.1(Apple 芯片)
|
||||
- Python 3.9,PyTorch 2.2.2,CPU 设备
|
||||
|
||||
_注意: numba==0.56.4 需要 python<3.11_
|
||||
_注: numba==0.56.4 需要 python<3.11_
|
||||
|
||||
### Windows
|
||||
|
||||
如果你是 Windows 用户(已在 win>=10 上测试),可以直接下载[预打包文件](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。
|
||||
|
||||
中国地区用户可以通过点击链接并选择“下载副本”来下载[0217版本包](https://www.icloud.com.cn/iclouddrive/061bfkcVJcBfsMfLF5R2XKdTQ#GPT-SoVITS-beta0217)或[0306fix2版本包](https://www.icloud.com.cn/iclouddrive/09aaTLf96aa92dbLe0fPNM5CQ#GPT-SoVITS-beta0306fix2)。
|
||||
|
||||
_注:0306fix2版本推理速度翻倍,节约生命。修复了无参考文本模式的所有问题。_
|
||||
|
||||
### Linux
|
||||
|
||||
```bash
|
||||
@ -63,7 +68,9 @@ bash install.sh
|
||||
|
||||
**注:在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型,所以我们暂时使用CPU进行训练。**
|
||||
|
||||
首先确保你已通过运行 `brew install ffmpeg` 或 `conda install ffmpeg` 安装 FFmpeg,然后运行以下命令安装:
|
||||
1. 运行 `xcode-select --install` 安装 Xcode command-line tools。
|
||||
2. 运行 `brew install ffmpeg` 或 `conda install ffmpeg` 安装 FFmpeg。
|
||||
3. 完成上述步骤后,运行以下的命令来安装本项目:
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
@ -139,7 +146,15 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
- [UVR5 Weights](https://www.icloud.com.cn/iclouddrive/0bekRKDiJXboFhbfm3lM2fVbA#UVR5_Weights)
|
||||
|
||||
对于中文自动语音识别(附加),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/damo_asr/models` 中。
|
||||
对于中文自动语音识别(附加),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 中。
|
||||
|
||||
对于英语与日语自动语音识别(附加),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将它们放置在 `tools/asr/models` 中。 此外,[其他模型](https://huggingface.co/Systran)可能具有类似效果,但占用更小的磁盘空间。
|
||||
|
||||
中国地区用户可以通过以下链接下载:
|
||||
- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/0c4pQxFs7oWyVU1iMTq2DbmLA#faster-whisper-large-v3)(点击“下载副本”)
|
||||
|
||||
- [Faster Whisper Large V3](https://hf-mirror.com/Systran/faster-whisper-large-v3)(Hugging Face镜像站)
|
||||
|
||||
|
||||
## 数据集格式
|
||||
|
||||
@ -202,13 +217,13 @@ python audio_slicer.py \
|
||||
````
|
||||
这是使用命令行完成数据集ASR处理的方式(仅限中文)
|
||||
````
|
||||
python tools/damo_asr/cmd-asr.py "<Path to the directory containing input audio files>"
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
````
|
||||
通过Faster_Whisper进行ASR处理(除中文之外的ASR标记)
|
||||
|
||||
(没有进度条,GPU性能可能会导致时间延迟)
|
||||
````
|
||||
python ./tools/damo_asr/WhisperASR.py -i <input> -o <output> -f <file_name.list> -l <language>
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language>
|
||||
````
|
||||
启用自定义列表保存路径
|
||||
## 致谢
|
||||
|
@ -8,7 +8,7 @@
|
||||
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
||||
|
||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
|
||||
|
||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](./README.md) | [**한국어**](../ko/README.md)
|
||||
@ -39,7 +39,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
- Python 3.9, PyTorch 2.0.1, CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
|
||||
- Python 3.9, PyTorch 2.3.0.dev20240122, macOS 14.3 (Apple silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, CPUデバイス
|
||||
|
||||
_注記: numba==0.56.4 は py<3.11 が必要です_
|
||||
|
||||
@ -59,7 +60,9 @@ bash install.sh
|
||||
|
||||
**注:MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練します。**
|
||||
|
||||
まず、`brew install ffmpeg`または`conda install ffmpeg`を実行してFFmpegをインストールしたことを確認してください。次に、以下のコマンドを使用してインストールします:
|
||||
1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。
|
||||
2. `brew install ffmpeg` または `conda install ffmpeg` を実行して、FFmpegをインストールします。
|
||||
3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
@ -127,7 +130,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
[GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` に置きます。
|
||||
|
||||
中国語 ASR(追加)については、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、[Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/damo_asr/models` に置いてください。
|
||||
中国語 ASR(追加)については、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、[Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` に置いてください。
|
||||
|
||||
UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードして `tools/uvr5/uvr5_weights` に置きます。
|
||||
|
||||
@ -156,7 +159,7 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||
- [ ] **優先度 高:**
|
||||
|
||||
- [x] 日本語と英語でのローカライズ。
|
||||
- [ ] ユーザーガイド。
|
||||
- [ ] ユーザーガイド。
|
||||
- [x] 日本語データセットと英語データセットのファインチューニングトレーニング。
|
||||
|
||||
- [ ] **機能:**
|
||||
@ -192,13 +195,13 @@ python audio_slicer.py \
|
||||
```
|
||||
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
||||
```
|
||||
python tools/damo_asr/cmd-asr.py "<Path to the directory containing input audio files>"
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASRマーキング)
|
||||
|
||||
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
||||
```
|
||||
python ./tools/damo_asr/WhisperASR.py -i <input> -o <output> -f <file_name.list> -l <language>
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language>
|
||||
```
|
||||
カスタムリストの保存パスが有効になっています
|
||||
## クレジット
|
||||
|
@ -89,6 +89,21 @@
|
||||
1. 참조 텍스트 입력을 지원합니다.
|
||||
2. 프론트엔드에 있던 중국어 텍스트 입력 버그를 수정하였습니다.
|
||||
|
||||
todolist :
|
||||
### 20240221 업데이트
|
||||
|
||||
1. 중국어 다음음자 추론 최적화
|
||||
1. 데이터 처리에 음성 노이즈 감소 옵션을 추가하였습니다. (노이즈 감소는 16k 샘플링률만 남기며, 노이즈가 크지 않다면 사용하지 마십시오.)
|
||||
2. 중국어 및 일본어 프론트엔드 처리를 최적화하였습니다. https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
|
||||
3. Mac에서 CPU 추론이 더 빨라졌으므로 추론 장치를 mps에서 CPU로 변경하였습니다.
|
||||
4. colab에서 공용 URL을 열지 않는 문제를 수정하였습니다.
|
||||
|
||||
### 20240306 업데이트
|
||||
|
||||
1. 추론 속도를 50% 빠르게 하였습니다. (RTX3090+pytorch2.2.1+cu11.8+win10+py39 테스트 완료) https://github.com/RVC-Boss/GPT-SoVITS/pull/672
|
||||
2. faster whisper를 사용할 때 중국어 ASR을 먼저 다운로드할 필요가 없습니다.
|
||||
3. uvr5의 잔향 제거 모델이 잔향이 있는지 여부를 반대로 반환하는 문제를 수정하였습니다.
|
||||
4. faster whisper가 CUDA를 사용할 수 없는 경우 자동으로 CPU 추론을 사용하도록 수정하였습니다.
|
||||
5. is_half의 판단을 수정하여 Mac에서 CPU 추론이 정상적으로 작동하도록 수정하였습니다.
|
||||
|
||||
todolist:
|
||||
|
||||
1. 중국어 다양한 발음 단어 추론 최적화(테스트 결과를 작성하시는 분은 pr 코멘트 영역에 작성해주시면 감사하겠습니다)
|
||||
|
@ -8,7 +8,7 @@
|
||||
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
||||
|
||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||
[](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
|
||||
|
||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](./README.md)
|
||||
@ -37,9 +37,10 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
|
||||
### 테스트 통과 환경
|
||||
|
||||
- Python 3.9, PyTorch 2.0.1 및 CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2 및 CUDA 12.3
|
||||
- Python 3.9, PyTorch 2.3.0.dev20240122 및 macOS 14.3 (Apple Silicon)
|
||||
- Python 3.9, PyTorch 2.0.1, CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
|
||||
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple Silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, CPU 장치
|
||||
|
||||
_참고: numba==0.56.4 는 python<3.11 을 필요로 합니다._
|
||||
|
||||
@ -57,9 +58,11 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
**주의: Mac에서 GPU로 훈련된 모델은 다른 장치에서 훈련된 모델에 비해 현저히 낮은 품질을 나타내므로, 우리는 일시적으로 CPU를 사용하여 훈련하고 있습니다.**
|
||||
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
|
||||
|
||||
먼저 `brew install ffmpeg` 또는 `conda install ffmpeg`를 실행하여 FFmpeg가 설치되었는지 확인한 다음, 다음 명령어를 사용하여 설치하세요:
|
||||
1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요.
|
||||
2. `brew install ffmpeg` 또는 `conda install ffmpeg`을 실행하여 FFmpeg를 설치하세요.
|
||||
3. 위의 단계를 완료한 후, 다음 명령어를 실행하여 이 프로젝트를 설치하세요.
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
@ -130,7 +133,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
[GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS)에서 사전 훈련된 모델을 다운로드하고 `GPT_SoVITS\pretrained_models`에 넣습니다.
|
||||
|
||||
중국어 자동 음성 인식(ASR), 음성 반주 분리 및 음성 제거를 위해 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files)을 다운로드하고 `tools/damo_asr/models`에 넣습니다.
|
||||
중국어 자동 음성 인식(ASR), 음성 반주 분리 및 음성 제거를 위해 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files)을 다운로드하고 `tools/asr/models`에 넣습니다.
|
||||
|
||||
UVR5(음성/반주 분리 및 잔향 제거)를 위해 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights)에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights`에 넣습니다.
|
||||
|
||||
@ -196,13 +199,13 @@ python audio_slicer.py \
|
||||
```
|
||||
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
||||
```
|
||||
python tools/damo_asr/cmd-asr.py "<Path to the directory containing input audio files>"
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
||||
|
||||
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
||||
```
|
||||
python ./tools/damo_asr/WhisperASR.py -i <input> -o <output> -f <file_name.list> -l <language>
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language>
|
||||
```
|
||||
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
||||
## 감사의 말
|
||||
|
@ -2,6 +2,18 @@
|
||||
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
|
||||
"UVR5已开启": "UVR5已开启",
|
||||
"UVR5已关闭": "UVR5已关闭",
|
||||
"输入文件夹路径": "输入文件夹路径",
|
||||
"输出文件夹路径": "输出文件夹路径",
|
||||
"ASR 模型": "ASR 模型",
|
||||
"ASR 模型尺寸": "ASR 模型尺寸",
|
||||
"ASR 语言设置": "ASR 语言设置",
|
||||
"模型切换": "模型切换",
|
||||
"是否开启dpo训练选项(实验性)": "是否开启dpo训练选项(实验性)",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "开启无参考文本模式。不填参考文本亦相当于开启。",
|
||||
"使用无参考文本模式时建议使用微调的GPT": "使用无参考文本模式时建议使用微调的GPT",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "后续将支持转音素、手工修改音素、语音合成分步执行。",
|
||||
"gpt采样参数(无参考文本时不要太低):": "gpt采样参数(无参考文本时不要太低):",
|
||||
"按标点符号切": "按标点符号切",
|
||||
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.",
|
||||
"0-前置数据集获取工具": "0-前置数据集获取工具",
|
||||
"0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5人声伴奏分离&去混响去延迟工具",
|
||||
|
@ -24,4 +24,5 @@ psutil
|
||||
jieba_fast
|
||||
jieba
|
||||
LangSegment>=0.2.0
|
||||
Faster_Whisper
|
||||
Faster_Whisper
|
||||
wordsegment
|
@ -1,18 +1,16 @@
|
||||
import argparse
|
||||
import os
|
||||
os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
|
||||
import traceback
|
||||
import requests
|
||||
from glob import glob
|
||||
import torch
|
||||
|
||||
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||
|
||||
import torch
|
||||
from faster_whisper import WhisperModel
|
||||
from tqdm import tqdm
|
||||
|
||||
from tools.asr.config import check_fw_local_models
|
||||
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
||||
|
||||
language_code_list = [
|
||||
"af", "am", "ar", "as", "az",
|
||||
"ba", "be", "bg", "bn", "bo",
|
||||
@ -36,7 +34,7 @@ language_code_list = [
|
||||
"vi", "yi", "yo", "zh", "yue",
|
||||
"auto"]
|
||||
|
||||
def execute_asr(input_folder, output_folder, model_size, language,precision):
|
||||
def execute_asr(input_folder, output_folder, model_size, language, precision):
|
||||
if '-local' in model_size:
|
||||
model_size = model_size[:-6]
|
||||
model_path = f'tools/asr/models/faster-whisper-{model_size}'
|
||||
@ -50,17 +48,18 @@ def execute_asr(input_folder, output_folder, model_size, language,precision):
|
||||
model = WhisperModel(model_path, device=device, compute_type=precision)
|
||||
except:
|
||||
return print(traceback.format_exc())
|
||||
|
||||
input_file_names = os.listdir(input_folder)
|
||||
input_file_names.sort()
|
||||
|
||||
output = []
|
||||
output_file_name = os.path.basename(input_folder)
|
||||
output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
|
||||
|
||||
if not os.path.exists(output_folder):
|
||||
os.makedirs(output_folder)
|
||||
|
||||
for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)):
|
||||
|
||||
for file_name in tqdm(input_file_names):
|
||||
try:
|
||||
file_path = os.path.join(input_folder, file_name)
|
||||
segments, info = model.transcribe(
|
||||
audio = file,
|
||||
audio = file_path,
|
||||
beam_size = 5,
|
||||
vad_filter = True,
|
||||
vad_parameters = dict(min_silence_duration_ms=700),
|
||||
@ -68,18 +67,23 @@ def execute_asr(input_folder, output_folder, model_size, language,precision):
|
||||
text = ''
|
||||
|
||||
if info.language == "zh":
|
||||
print("检测为中文文本,转funasr处理")
|
||||
print("检测为中文文本, 转 FunASR 处理")
|
||||
if("only_asr"not in globals()):
|
||||
from tools.asr.funasr_asr import only_asr##如果用英文就不需要导入下载模型
|
||||
text = only_asr(file)
|
||||
from tools.asr.funasr_asr import \
|
||||
only_asr # #如果用英文就不需要导入下载模型
|
||||
text = only_asr(file_path)
|
||||
|
||||
if text == '':
|
||||
for segment in segments:
|
||||
text += segment.text
|
||||
output.append(f"{file}|{output_file_name}|{info.language.upper()}|{text}")
|
||||
output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
|
||||
except:
|
||||
return print(traceback.format_exc())
|
||||
|
||||
|
||||
output_folder = output_folder or "output/asr_opt"
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
|
||||
|
||||
with open(output_file_path, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(output))
|
||||
print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
|
||||
|
@ -38,10 +38,11 @@ def execute_asr(input_folder, output_folder, model_size, language):
|
||||
output = []
|
||||
output_file_name = os.path.basename(input_folder)
|
||||
|
||||
for name in tqdm(input_file_names):
|
||||
for file_name in tqdm(input_file_names):
|
||||
try:
|
||||
text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"]
|
||||
output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}")
|
||||
file_path = os.path.join(input_folder, file_name)
|
||||
text = model.generate(input=file_path)[0]["text"]
|
||||
output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
|
||||
except:
|
||||
print(traceback.format_exc())
|
||||
|
||||
|
@ -73,8 +73,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
|
||||
os.path.basename(inp_path),
|
||||
)
|
||||
os.system(
|
||||
"ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
|
||||
% (inp_path, tmp_path)
|
||||
f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y'
|
||||
)
|
||||
inp_path = tmp_path
|
||||
try:
|
||||