推理功能增强和改进

This commit is contained in:
chasonjiang 2024-03-13 19:51:24 +08:00
parent d60d8ea3fb
commit 252c9b7eb6
3 changed files with 99 additions and 34 deletions

View File

@ -707,19 +707,33 @@ class Text2SemanticDecoder(nn.Module):
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
ref_free = True
x_attn_mask_pad = F.pad(
x_attn_mask,
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1(x,x+y)
value=True,
)
y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
##### create mask #####
bsz = x.shape[0]
src_len = x_len + y_len
y_lens = torch.LongTensor([y_len]*bsz).to(x.device)
y_mask = make_pad_mask(y_lens)
x_mask = make_pad_mask(x_lens)
# (bsz, x_len + y_len)
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
x_mask = F.pad(
x_attn_mask,
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1(x,x+y)
value=True,
)
y_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
(x_len, 0),
value=False,
)
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
x.device
)
xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz*self.num_head, -1, -1).to(x.device)
# xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(bsz, src_len, src_len).repeat(self.num_head, 1, 1)
xy_attn_mask = xy_mask.logical_or(xy_padding_mask)
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))
y_list = [None]*y.shape[0]
batch_idx_map = list(range(y.shape[0]))

View File

@ -3,6 +3,8 @@ import math
import os, sys
import random
import traceback
from tqdm import tqdm
now_dir = os.getcwd()
sys.path.append(now_dir)
import ffmpeg
@ -552,14 +554,15 @@ class TTS:
"prompt_text": "", # str. prompt text for the reference audio
"prompt_lang": "", # str. language of the prompt text for the reference audio
"top_k": 5, # int. top k sampling
"top_p": 1, # float. top p sampling
"temperature": 1, # float. temperature for sampling
"top_p": 1, # float. top p sampling
"temperature": 1, # float. temperature for sampling
"text_split_method": "", # str. text split method, see text_segmentaion_method.py for details.
"batch_size": 1, # int. batch size for inference
"batch_threshold": 0.75, # float. threshold for batch splitting.
"split_bucket: True, # bool. whether to split the batch into multiple buckets.
"return_fragment": False, # bool. step by step return the audio fragment.
"speed_factor":1.0, # float. control the speed of the synthesized audio.
"fragment_interval":0.3, # float. to control the interval of the audio fragment.
}
returns:
tulpe[int, np.ndarray]: sampling rate and audio data.
@ -580,9 +583,10 @@ class TTS:
speed_factor = inputs.get("speed_factor", 1.0)
split_bucket = inputs.get("split_bucket", True)
return_fragment = inputs.get("return_fragment", False)
fragment_interval = inputs.get("fragment_interval", 0.3)
if return_fragment:
split_bucket = False
# split_bucket = False
print(i18n("分段返回模式已开启"))
if split_bucket:
split_bucket = False
@ -590,7 +594,10 @@ class TTS:
if split_bucket:
print(i18n("分桶处理模式已开启"))
if fragment_interval<0.01:
fragment_interval = 0.01
print(i18n("分段间隔过小已自动设置为0.01"))
no_prompt_text = False
if prompt_text in [None, ""]:
@ -627,19 +634,52 @@ class TTS:
###### text preprocessing ########
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method)
if len(data) == 0:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate * 0.3),
dtype=np.int16)
return
t1 = ttime()
data, batch_index_list = self.to_batch(data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=split_bucket
)
data:list = None
if not return_fragment:
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method)
if len(data) == 0:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
dtype=np.int16)
return
batch_index_list:list = None
data, batch_index_list = self.to_batch(data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=split_bucket
)
else:
print(i18n("############ 切分文本 ############"))
texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method)
data = []
for i in range(len(texts)):
if i%batch_size == 0:
data.append([])
data[-1].append(texts[i])
def make_batch(batch_texts):
batch_data = []
print(i18n("############ 提取文本Bert特征 ############"))
for text in tqdm(batch_texts):
phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(text, text_lang)
if phones is None:
continue
res={
"phones": phones,
"bert_features": bert_features,
"norm_text": norm_text,
}
batch_data.append(res)
batch, _ = self.to_batch(batch_data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=False
)
return batch[0]
t2 = ttime()
try:
print("############ 推理 ############")
@ -649,6 +689,9 @@ class TTS:
audio = []
for item in data:
t3 = ttime()
if return_fragment:
item = make_batch(item)
batch_phones = item["phones"]
batch_phones_len = item["phones_len"]
all_phoneme_ids = item["all_phones"]
@ -734,14 +777,16 @@ class TTS:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
yield self.audio_postprocess([batch_audio_fragment],
self.configs.sampling_rate,
batch_index_list,
None,
speed_factor,
split_bucket)
False,
fragment_interval
)
else:
audio.append(batch_audio_fragment)
if self.stop_flag:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate * 0.3),
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
dtype=np.int16)
return
@ -751,7 +796,9 @@ class TTS:
self.configs.sampling_rate,
batch_index_list,
speed_factor,
split_bucket)
split_bucket,
fragment_interval
)
except Exception as e:
traceback.print_exc()
# 必须返回一个空音频, 否则会导致显存不释放。
@ -781,9 +828,11 @@ class TTS:
sr:int,
batch_index_list:list=None,
speed_factor:float=1.0,
split_bucket:bool=True)->tuple[int, np.ndarray]:
split_bucket:bool=True,
fragment_interval:float=0.3
)->tuple[int, np.ndarray]:
zero_wav = torch.zeros(
int(self.configs.sampling_rate * 0.3),
int(self.configs.sampling_rate * fragment_interval),
dtype=self.precison,
device=self.configs.device
)

View File

@ -91,7 +91,7 @@ def inference(text, text_lang,
top_p, temperature,
text_split_method, batch_size,
speed_factor, ref_text_free,
split_bucket
split_bucket,fragment_interval,
):
inputs={
"text": text,
@ -106,7 +106,8 @@ def inference(text, text_lang,
"batch_size":int(batch_size),
"speed_factor":float(speed_factor),
"split_bucket":split_bucket,
"return_fragment":False
"return_fragment":False,
"fragment_interval":fragment_interval,
}
for item in tts_pipline.run(inputs):
@ -188,6 +189,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Column():
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True)
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True)
speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="speed_factor",value=1.0,interactive=True)
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
@ -216,7 +218,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
top_k, top_p, temperature,
how_to_cut, batch_size,
speed_factor, ref_text_free,
split_bucket
split_bucket,fragment_interval,
],
[output],
)