mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-05 19:41:56 +08:00
Merge pull request #758 from ChasonJiang/fast_inference_
[fast inference] 推理功能增强和改进
This commit is contained in:
commit
099059de93
@ -504,18 +504,29 @@ class Text2SemanticDecoder(nn.Module):
|
||||
|
||||
def infer_panel_batch_infer_with_flash_attn(
|
||||
self,
|
||||
x, #####全部文本token
|
||||
x_lens,
|
||||
prompts, ####参考音频token
|
||||
bert_feature,
|
||||
x:List[torch.LongTensor], #####全部文本token
|
||||
x_lens:torch.LongTensor,
|
||||
prompts:torch.LongTensor, ####参考音频token
|
||||
bert_feature:List[torch.LongTensor],
|
||||
top_k: int = -100,
|
||||
top_p: int = 100,
|
||||
early_stop_num: int = -1,
|
||||
temperature: float = 1.0,
|
||||
):
|
||||
|
||||
bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
|
||||
x = self.ar_text_embedding(x)
|
||||
# 先对phones进行embedding、对bert_features进行project,再pad到相同长度,以缓解复读问题。(可能还有其他因素导致复读)
|
||||
max_len = 0
|
||||
for x_item, bert_item in zip(x, bert_feature):
|
||||
max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
|
||||
x_list = [self.ar_text_embedding(item) for item in x]
|
||||
x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
|
||||
x = torch.stack(x_list, dim=0)
|
||||
|
||||
bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
|
||||
bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
|
||||
bert_feature = torch.stack(bert_features_list, dim=0)
|
||||
|
||||
# bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
|
||||
# x = self.ar_text_embedding(x)
|
||||
x = x + bert_feature
|
||||
x = self.ar_text_position(x)
|
||||
|
||||
@ -658,17 +669,30 @@ class Text2SemanticDecoder(nn.Module):
|
||||
|
||||
def infer_panel_batch_only(
|
||||
self,
|
||||
x, #####全部文本token
|
||||
x_lens,
|
||||
prompts, ####参考音频token
|
||||
bert_feature,
|
||||
x:List[torch.LongTensor], #####全部文本token
|
||||
x_lens:torch.LongTensor,
|
||||
prompts:torch.LongTensor, ####参考音频token
|
||||
bert_feature:List[torch.LongTensor],
|
||||
top_k: int = -100,
|
||||
top_p: int = 100,
|
||||
early_stop_num: int = -1,
|
||||
temperature: float = 1.0,
|
||||
):
|
||||
x = self.ar_text_embedding(x)
|
||||
x = x + self.bert_proj(bert_feature.transpose(1, 2))
|
||||
# 先对phones进行embedding、对bert_features进行project,再pad到相同长度,以缓解复读问题。(可能还有其他因素导致复读)
|
||||
max_len = 0
|
||||
for x_item, bert_item in zip(x, bert_feature):
|
||||
max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
|
||||
x_list = [self.ar_text_embedding(item) for item in x]
|
||||
x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
|
||||
x = torch.stack(x_list, dim=0)
|
||||
|
||||
bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
|
||||
bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
|
||||
bert_feature = torch.stack(bert_features_list, dim=0)
|
||||
|
||||
# bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
|
||||
# x = self.ar_text_embedding(x)
|
||||
x = x + bert_feature
|
||||
x = self.ar_text_position(x)
|
||||
|
||||
# AR Decoder
|
||||
@ -707,19 +731,33 @@ class Text2SemanticDecoder(nn.Module):
|
||||
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
|
||||
ref_free = True
|
||||
|
||||
x_attn_mask_pad = F.pad(
|
||||
x_attn_mask,
|
||||
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y)
|
||||
value=True,
|
||||
)
|
||||
y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
|
||||
##### create mask #####
|
||||
bsz = x.shape[0]
|
||||
src_len = x_len + y_len
|
||||
y_lens = torch.LongTensor([y_len]*bsz).to(x.device)
|
||||
y_mask = make_pad_mask(y_lens)
|
||||
x_mask = make_pad_mask(x_lens)
|
||||
|
||||
# (bsz, x_len + y_len)
|
||||
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
|
||||
|
||||
x_mask = F.pad(
|
||||
x_attn_mask,
|
||||
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y)
|
||||
value=True,
|
||||
)
|
||||
y_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
|
||||
(x_len, 0),
|
||||
value=False,
|
||||
)
|
||||
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
|
||||
x.device
|
||||
)
|
||||
|
||||
xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz*self.num_head, -1, -1).to(x.device)
|
||||
# xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
|
||||
xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(bsz, src_len, src_len).repeat(self.num_head, 1, 1)
|
||||
xy_attn_mask = xy_mask.logical_or(xy_padding_mask)
|
||||
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
|
||||
xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))
|
||||
|
||||
y_list = [None]*y.shape[0]
|
||||
batch_idx_map = list(range(y.shape[0]))
|
||||
|
@ -3,6 +3,8 @@ import math
|
||||
import os, sys
|
||||
import random
|
||||
import traceback
|
||||
|
||||
from tqdm import tqdm
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
import ffmpeg
|
||||
@ -49,17 +51,24 @@ custom:
|
||||
|
||||
"""
|
||||
|
||||
# def set_seed(seed):
|
||||
# random.seed(seed)
|
||||
# os.environ['PYTHONHASHSEED'] = str(seed)
|
||||
# np.random.seed(seed)
|
||||
# torch.manual_seed(seed)
|
||||
# torch.cuda.manual_seed(seed)
|
||||
# torch.cuda.manual_seed_all(seed)
|
||||
# torch.backends.cudnn.deterministic = True
|
||||
# torch.backends.cudnn.benchmark = False
|
||||
# torch.backends.cudnn.enabled = True
|
||||
# set_seed(1234)
|
||||
def set_seed(seed:int):
    """Seed the random number generators used for inference and return the seed.

    Seeds Python's ``random`` module, NumPy and PyTorch (plus the CUDA
    generators when CUDA is available) and exports the value through the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Requested seed; any value accepted by ``int()``. ``-1`` asks
            for a freshly drawn seed in ``[0, 2**32)``.

    Returns:
        The integer seed that was actually applied.
    """
    seed = int(seed)
    if seed == -1:
        # -1 means "pick one for me"; the caller gets the drawn value back.
        seed = random.randrange(1 << 32)
    print(f"Set seed to {seed}")
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    try:
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            # The cuDNN deterministic/benchmark flags are not modified here.
    except Exception as exc:
        # CUDA seeding stays best-effort, but the failure is reported rather
        # than silently dropped, and KeyboardInterrupt/SystemExit propagate.
        print(f"Failed to seed CUDA generators: {exc}")
    return seed
|
||||
|
||||
class TTS_Config:
|
||||
default_configs={
|
||||
@ -226,7 +235,7 @@ class TTS:
|
||||
self.cnhuhbert_model = CNHubert(base_path)
|
||||
self.cnhuhbert_model=self.cnhuhbert_model.eval()
|
||||
self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device)
|
||||
if self.configs.is_half:
|
||||
if self.configs.is_half and str(self.configs.device)!="cpu":
|
||||
self.cnhuhbert_model = self.cnhuhbert_model.half()
|
||||
|
||||
|
||||
@ -237,7 +246,7 @@ class TTS:
|
||||
self.bert_model = AutoModelForMaskedLM.from_pretrained(base_path)
|
||||
self.bert_model=self.bert_model.eval()
|
||||
self.bert_model = self.bert_model.to(self.configs.device)
|
||||
if self.configs.is_half:
|
||||
if self.configs.is_half and str(self.configs.device)!="cpu":
|
||||
self.bert_model = self.bert_model.half()
|
||||
|
||||
|
||||
@ -270,7 +279,7 @@ class TTS:
|
||||
vits_model = vits_model.eval()
|
||||
vits_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
self.vits_model = vits_model
|
||||
if self.configs.is_half:
|
||||
if self.configs.is_half and str(self.configs.device)!="cpu":
|
||||
self.vits_model = self.vits_model.half()
|
||||
|
||||
|
||||
@ -288,7 +297,7 @@ class TTS:
|
||||
t2s_model = t2s_model.to(self.configs.device)
|
||||
t2s_model = t2s_model.eval()
|
||||
self.t2s_model = t2s_model
|
||||
if self.configs.is_half:
|
||||
if self.configs.is_half and str(self.configs.device)!="cpu":
|
||||
self.t2s_model = self.t2s_model.half()
|
||||
|
||||
def enable_half_precision(self, enable: bool = True):
|
||||
@ -298,7 +307,7 @@ class TTS:
|
||||
enable: bool, whether to enable half precision.
|
||||
|
||||
'''
|
||||
if self.configs.device == "cpu" and enable:
|
||||
if str(self.configs.device) == "cpu" and enable:
|
||||
print("Half precision is not supported on CPU.")
|
||||
return
|
||||
|
||||
@ -420,7 +429,14 @@ class TTS:
|
||||
batch = torch.stack(padded_sequences)
|
||||
return batch
|
||||
|
||||
def to_batch(self, data:list, prompt_data:dict=None, batch_size:int=5, threshold:float=0.75, split_bucket:bool=True):
|
||||
def to_batch(self, data:list,
|
||||
prompt_data:dict=None,
|
||||
batch_size:int=5,
|
||||
threshold:float=0.75,
|
||||
split_bucket:bool=True,
|
||||
device:torch.device=torch.device("cpu"),
|
||||
precison:torch.dtype=torch.float32,
|
||||
):
|
||||
|
||||
_data:list = []
|
||||
index_and_len_list = []
|
||||
@ -472,14 +488,14 @@ class TTS:
|
||||
for item in item_list:
|
||||
if prompt_data is not None:
|
||||
all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\
|
||||
.to(dtype=self.precison)
|
||||
all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"])
|
||||
phones = torch.LongTensor(item["phones"])
|
||||
.to(dtype=precison, device=device)
|
||||
all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"]).to(device)
|
||||
phones = torch.LongTensor(item["phones"]).to(device)
|
||||
# norm_text = prompt_data["norm_text"]+item["norm_text"]
|
||||
else:
|
||||
all_bert_features = item["bert_features"]\
|
||||
.to(dtype=self.precison)
|
||||
phones = torch.LongTensor(item["phones"])
|
||||
.to(dtype=precison, device=device)
|
||||
phones = torch.LongTensor(item["phones"]).to(device)
|
||||
all_phones = phones
|
||||
# norm_text = item["norm_text"]
|
||||
|
||||
@ -494,19 +510,33 @@ class TTS:
|
||||
norm_text_batch.append(item["norm_text"])
|
||||
|
||||
phones_batch = phones_list
|
||||
max_len = max(bert_max_len, phones_max_len)
|
||||
all_phones_batch = all_phones_list
|
||||
all_bert_features_batch = all_bert_features_list
|
||||
|
||||
|
||||
# max_len = max(bert_max_len, phones_max_len)
|
||||
# phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len)
|
||||
all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
|
||||
#### 直接对phones和bert_features进行pad,会增大复读概率。
|
||||
# all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
|
||||
# all_bert_features_batch = all_bert_features_list
|
||||
all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=self.precison)
|
||||
for idx, item in enumerate(all_bert_features_list):
|
||||
all_bert_features_batch[idx, :, : item.shape[-1]] = item
|
||||
# all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precison, device=device)
|
||||
# for idx, item in enumerate(all_bert_features_list):
|
||||
# all_bert_features_batch[idx, :, : item.shape[-1]] = item
|
||||
|
||||
# #### 先对phones进行embedding、对bert_features进行project,再pad到相同长度,以缓解复读问题。(可能还有其他因素导致复读)
|
||||
# all_phones_list = [self.t2s_model.model.ar_text_embedding(item.to(self.t2s_model.device)) for item in all_phones_list]
|
||||
# all_phones_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) for item in all_phones_list]
|
||||
# all_phones_batch = torch.stack(all_phones_list, dim=0)
|
||||
|
||||
# all_bert_features_list = [self.t2s_model.model.bert_proj(item.to(self.t2s_model.device).transpose(0, 1)) for item in all_bert_features_list]
|
||||
# all_bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) for item in all_bert_features_list]
|
||||
# all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)
|
||||
|
||||
batch = {
|
||||
"phones": phones_batch,
|
||||
"phones_len": torch.LongTensor(phones_len_list),
|
||||
"phones_len": torch.LongTensor(phones_len_list).to(device),
|
||||
"all_phones": all_phones_batch,
|
||||
"all_phones_len": torch.LongTensor(all_phones_len_list),
|
||||
"all_phones_len": torch.LongTensor(all_phones_len_list).to(device),
|
||||
"all_bert_features": all_bert_features_batch,
|
||||
"norm_text": norm_text_batch
|
||||
}
|
||||
@ -552,14 +582,16 @@ class TTS:
|
||||
"prompt_text": "", # str. prompt text for the reference audio
|
||||
"prompt_lang": "", # str. language of the prompt text for the reference audio
|
||||
"top_k": 5, # int. top k sampling
|
||||
"top_p": 1, # float. top p sampling
|
||||
"temperature": 1, # float. temperature for sampling
|
||||
"top_p": 1, # float. top p sampling
|
||||
"temperature": 1, # float. temperature for sampling
|
||||
"text_split_method": "", # str. text split method, see text_segmentaion_method.py for details.
|
||||
"batch_size": 1, # int. batch size for inference
|
||||
"batch_threshold": 0.75, # float. threshold for batch splitting.
|
||||
"split_bucket: True, # bool. whether to split the batch into multiple buckets.
|
||||
"return_fragment": False, # bool. step by step return the audio fragment.
|
||||
"speed_factor":1.0, # float. control the speed of the synthesized audio.
|
||||
"fragment_interval":0.3, # float. to control the interval of the audio fragment.
|
||||
"seed": -1, # int. random seed for reproducibility.
|
||||
}
|
||||
returns:
|
||||
tuple[int, np.ndarray]: sampling rate and audio data.
|
||||
@ -580,9 +612,13 @@ class TTS:
|
||||
speed_factor = inputs.get("speed_factor", 1.0)
|
||||
split_bucket = inputs.get("split_bucket", True)
|
||||
return_fragment = inputs.get("return_fragment", False)
|
||||
fragment_interval = inputs.get("fragment_interval", 0.3)
|
||||
seed = inputs.get("seed", -1)
|
||||
seed = -1 if seed in ["", None] else seed
|
||||
set_seed(seed)
|
||||
|
||||
if return_fragment:
|
||||
split_bucket = False
|
||||
# split_bucket = False
|
||||
print(i18n("分段返回模式已开启"))
|
||||
if split_bucket:
|
||||
split_bucket = False
|
||||
@ -590,7 +626,10 @@ class TTS:
|
||||
|
||||
if split_bucket:
|
||||
print(i18n("分桶处理模式已开启"))
|
||||
|
||||
|
||||
if fragment_interval<0.01:
|
||||
fragment_interval = 0.01
|
||||
print(i18n("分段间隔过小,已自动设置为0.01"))
|
||||
|
||||
no_prompt_text = False
|
||||
if prompt_text in [None, ""]:
|
||||
@ -627,19 +666,58 @@ class TTS:
|
||||
|
||||
|
||||
###### text preprocessing ########
|
||||
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method)
|
||||
if len(data) == 0:
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate * 0.3),
|
||||
dtype=np.int16)
|
||||
return
|
||||
|
||||
t1 = ttime()
|
||||
data, batch_index_list = self.to_batch(data,
|
||||
prompt_data=self.prompt_cache if not no_prompt_text else None,
|
||||
batch_size=batch_size,
|
||||
threshold=batch_threshold,
|
||||
split_bucket=split_bucket
|
||||
)
|
||||
data:list = None
|
||||
if not return_fragment:
|
||||
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method)
|
||||
if len(data) == 0:
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
|
||||
dtype=np.int16)
|
||||
return
|
||||
|
||||
batch_index_list:list = None
|
||||
data, batch_index_list = self.to_batch(data,
|
||||
prompt_data=self.prompt_cache if not no_prompt_text else None,
|
||||
batch_size=batch_size,
|
||||
threshold=batch_threshold,
|
||||
split_bucket=split_bucket,
|
||||
device=self.configs.device,
|
||||
precison=self.precison
|
||||
)
|
||||
else:
|
||||
print(i18n("############ 切分文本 ############"))
|
||||
texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method)
|
||||
data = []
|
||||
for i in range(len(texts)):
|
||||
if i%batch_size == 0:
|
||||
data.append([])
|
||||
data[-1].append(texts[i])
|
||||
|
||||
def make_batch(batch_texts):
    """Extract features for one group of text segments and collate them.

    Each segment is passed to the text preprocessor; segments for which it
    returns no phones are skipped. The remaining items go through
    ``self.to_batch`` with bucket splitting disabled, and the first batch
    it produces is returned. Returns ``None`` when no segment yields phones.
    """
    print(i18n("############ 提取文本Bert特征 ############"))
    features = []
    for segment in tqdm(batch_texts):
        phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(segment, text_lang)
        if phones is None:
            continue
        features.append({
            "phones": phones,
            "bert_features": bert_features,
            "norm_text": norm_text,
        })
    if not features:
        return None
    # The cached prompt is only attached when a prompt text was supplied.
    prompt = None if no_prompt_text else self.prompt_cache
    batches, _ = self.to_batch(
        features,
        prompt_data=prompt,
        batch_size=batch_size,
        threshold=batch_threshold,
        split_bucket=False,
        device=self.configs.device,
        precison=self.precison,
    )
    return batches[0]
|
||||
|
||||
t2 = ttime()
|
||||
try:
|
||||
print("############ 推理 ############")
|
||||
@ -649,26 +727,23 @@ class TTS:
|
||||
audio = []
|
||||
for item in data:
|
||||
t3 = ttime()
|
||||
batch_phones = item["phones"]
|
||||
batch_phones_len = item["phones_len"]
|
||||
all_phoneme_ids = item["all_phones"]
|
||||
all_phoneme_lens = item["all_phones_len"]
|
||||
all_bert_features = item["all_bert_features"]
|
||||
norm_text = item["norm_text"]
|
||||
|
||||
# batch_phones = batch_phones.to(self.configs.device)
|
||||
batch_phones_len = batch_phones_len.to(self.configs.device)
|
||||
all_phoneme_ids = all_phoneme_ids.to(self.configs.device)
|
||||
all_phoneme_lens = all_phoneme_lens.to(self.configs.device)
|
||||
all_bert_features = all_bert_features.to(self.configs.device)
|
||||
if self.configs.is_half:
|
||||
all_bert_features = all_bert_features.half()
|
||||
if return_fragment:
|
||||
item = make_batch(item)
|
||||
if item is None:
|
||||
continue
|
||||
|
||||
batch_phones:List[torch.LongTensor] = item["phones"]
|
||||
batch_phones_len:torch.LongTensor = item["phones_len"]
|
||||
all_phoneme_ids:List[torch.LongTensor] = item["all_phones"]
|
||||
all_phoneme_lens:torch.LongTensor = item["all_phones_len"]
|
||||
all_bert_features:List[torch.LongTensor] = item["all_bert_features"]
|
||||
norm_text:str = item["norm_text"]
|
||||
|
||||
print(i18n("前端处理后的文本(每句):"), norm_text)
|
||||
if no_prompt_text :
|
||||
prompt = None
|
||||
else:
|
||||
prompt = self.prompt_cache["prompt_semantic"].expand(all_phoneme_ids.shape[0], -1).to(self.configs.device)
|
||||
prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device)
|
||||
|
||||
with torch.no_grad():
|
||||
pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
|
||||
@ -734,14 +809,16 @@ class TTS:
|
||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
|
||||
yield self.audio_postprocess([batch_audio_fragment],
|
||||
self.configs.sampling_rate,
|
||||
batch_index_list,
|
||||
None,
|
||||
speed_factor,
|
||||
split_bucket)
|
||||
False,
|
||||
fragment_interval
|
||||
)
|
||||
else:
|
||||
audio.append(batch_audio_fragment)
|
||||
|
||||
if self.stop_flag:
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate * 0.3),
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
|
||||
dtype=np.int16)
|
||||
return
|
||||
|
||||
@ -751,7 +828,9 @@ class TTS:
|
||||
self.configs.sampling_rate,
|
||||
batch_index_list,
|
||||
speed_factor,
|
||||
split_bucket)
|
||||
split_bucket,
|
||||
fragment_interval
|
||||
)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
# 必须返回一个空音频, 否则会导致显存不释放。
|
||||
@ -769,7 +848,7 @@ class TTS:
|
||||
|
||||
def empty_cache(self):
|
||||
try:
|
||||
if str(self.configs.device) == "cuda":
|
||||
if "cuda" in str(self.configs.device):
|
||||
torch.cuda.empty_cache()
|
||||
elif str(self.configs.device) == "mps":
|
||||
torch.mps.empty_cache()
|
||||
@ -781,9 +860,11 @@ class TTS:
|
||||
sr:int,
|
||||
batch_index_list:list=None,
|
||||
speed_factor:float=1.0,
|
||||
split_bucket:bool=True)->tuple[int, np.ndarray]:
|
||||
split_bucket:bool=True,
|
||||
fragment_interval:float=0.3
|
||||
)->tuple[int, np.ndarray]:
|
||||
zero_wav = torch.zeros(
|
||||
int(self.configs.sampling_rate * 0.3),
|
||||
int(self.configs.sampling_rate * fragment_interval),
|
||||
dtype=self.precison,
|
||||
device=self.configs.device
|
||||
)
|
||||
|
@ -28,7 +28,7 @@ is_share = os.environ.get("is_share", "False")
|
||||
is_share = eval(is_share)
|
||||
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
||||
is_half = eval(os.environ.get("is_half", "True")) and not torch.backends.mps.is_available()
|
||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||
gpt_path = os.environ.get("gpt_path", None)
|
||||
sovits_path = os.environ.get("sovits_path", None)
|
||||
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
|
||||
@ -41,7 +41,7 @@ from tools.i18n.i18n import I18nAuto
|
||||
|
||||
i18n = I18nAuto()
|
||||
|
||||
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
|
||||
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
|
||||
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
@ -91,7 +91,8 @@ def inference(text, text_lang,
|
||||
top_p, temperature,
|
||||
text_split_method, batch_size,
|
||||
speed_factor, ref_text_free,
|
||||
split_bucket
|
||||
split_bucket,fragment_interval,
|
||||
seed,
|
||||
):
|
||||
inputs={
|
||||
"text": text,
|
||||
@ -106,7 +107,9 @@ def inference(text, text_lang,
|
||||
"batch_size":int(batch_size),
|
||||
"speed_factor":float(speed_factor),
|
||||
"split_bucket":split_bucket,
|
||||
"return_fragment":False
|
||||
"return_fragment":False,
|
||||
"fragment_interval":fragment_interval,
|
||||
"seed":seed,
|
||||
}
|
||||
|
||||
for item in tts_pipline.run(inputs):
|
||||
@ -188,6 +191,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
|
||||
with gr.Column():
|
||||
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True)
|
||||
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True)
|
||||
speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="speed_factor",value=1.0,interactive=True)
|
||||
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
|
||||
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
|
||||
@ -201,6 +205,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
)
|
||||
with gr.Row():
|
||||
split_bucket = gr.Checkbox(label=i18n("数据分桶(可能会降低一点计算量,选就对了)"), value=True, interactive=True, show_label=True)
|
||||
seed = gr.Number(label=i18n("随机种子"),value=-1)
|
||||
# with gr.Column():
|
||||
output = gr.Audio(label=i18n("输出的语音"))
|
||||
with gr.Row():
|
||||
@ -216,7 +221,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
top_k, top_p, temperature,
|
||||
how_to_cut, batch_size,
|
||||
speed_factor, ref_text_free,
|
||||
split_bucket
|
||||
split_bucket,fragment_interval,
|
||||
seed
|
||||
],
|
||||
[output],
|
||||
)
|
||||
|
@ -894,6 +894,7 @@ class SynthesizerTrn(nn.Module):
|
||||
if freeze_quantizer:
|
||||
self.ssl_proj.requires_grad_(False)
|
||||
self.quantizer.requires_grad_(False)
|
||||
#self.quantizer.eval()
|
||||
# self.enc_p.text_embedding.requires_grad_(False)
|
||||
# self.enc_p.encoder_text.requires_grad_(False)
|
||||
# self.enc_p.mrte.requires_grad_(False)
|
||||
|
@ -49,8 +49,8 @@ if os.path.exists(txt_path) == False:
|
||||
os.makedirs(bert_dir, exist_ok=True)
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda:0"
|
||||
elif torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
# elif torch.backends.mps.is_available():
|
||||
# device = "mps"
|
||||
else:
|
||||
device = "cpu"
|
||||
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
|
||||
|
@ -50,8 +50,8 @@ maxx=0.95
|
||||
alpha=0.5
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda:0"
|
||||
elif torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
# elif torch.backends.mps.is_available():
|
||||
# device = "mps"
|
||||
else:
|
||||
device = "cpu"
|
||||
model=cnhubert.get_model()
|
||||
|
@ -40,8 +40,8 @@ if os.path.exists(semantic_path) == False:
|
||||
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
elif torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
# elif torch.backends.mps.is_available():
|
||||
# device = "mps"
|
||||
else:
|
||||
device = "cpu"
|
||||
hps = utils.get_hparams_from_file(s2config_path)
|
||||
|
@ -118,16 +118,16 @@ def main(args):
|
||||
os.environ["MASTER_ADDR"]="localhost"
|
||||
trainer: Trainer = Trainer(
|
||||
max_epochs=config["train"]["epochs"],
|
||||
accelerator="gpu",
|
||||
accelerator="gpu" if torch.cuda.is_available() else "cpu",
|
||||
# val_check_interval=9999999999999999999999,###不要验证
|
||||
# check_val_every_n_epoch=None,
|
||||
limit_val_batches=0,
|
||||
devices=-1,
|
||||
devices=-1 if torch.cuda.is_available() else 1,
|
||||
benchmark=False,
|
||||
fast_dev_run=False,
|
||||
strategy = "auto" if torch.backends.mps.is_available() else DDPStrategy(
|
||||
strategy = DDPStrategy(
|
||||
process_group_backend="nccl" if platform.system() != "Windows" else "gloo"
|
||||
), # mps 不支持多节点训练
|
||||
) if torch.cuda.is_available() else "auto",
|
||||
precision=config["train"]["precision"],
|
||||
logger=logger,
|
||||
num_sanity_val_steps=0,
|
||||
|
@ -41,15 +41,15 @@ torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就
|
||||
# from config import pretrained_s2G,pretrained_s2D
|
||||
global_step = 0
|
||||
|
||||
device = "cpu" # cuda以外的设备,等mps优化后加入
|
||||
|
||||
|
||||
def main():
|
||||
"""Assume Single Node Multi GPUs Training Only"""
|
||||
assert torch.cuda.is_available() or torch.backends.mps.is_available(), "Only GPU training is allowed."
|
||||
|
||||
if torch.backends.mps.is_available():
|
||||
n_gpus = 1
|
||||
else:
|
||||
if torch.cuda.is_available():
|
||||
n_gpus = torch.cuda.device_count()
|
||||
else:
|
||||
n_gpus = 1
|
||||
os.environ["MASTER_ADDR"] = "localhost"
|
||||
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
|
||||
|
||||
@ -73,7 +73,7 @@ def run(rank, n_gpus, hps):
|
||||
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
|
||||
|
||||
dist.init_process_group(
|
||||
backend = "gloo" if os.name == "nt" or torch.backends.mps.is_available() else "nccl",
|
||||
backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
|
||||
init_method="env://",
|
||||
world_size=n_gpus,
|
||||
rank=rank,
|
||||
@ -137,9 +137,9 @@ def run(rank, n_gpus, hps):
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**hps.model,
|
||||
).to("mps")
|
||||
).to(device)
|
||||
|
||||
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to("mps")
|
||||
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device)
|
||||
for name, param in net_g.named_parameters():
|
||||
if not param.requires_grad:
|
||||
print(name, "not requires_grad")
|
||||
@ -187,8 +187,8 @@ def run(rank, n_gpus, hps):
|
||||
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
|
||||
net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
|
||||
else:
|
||||
net_g = net_g.to("mps")
|
||||
net_d = net_d.to("mps")
|
||||
net_g = net_g.to(device)
|
||||
net_d = net_d.to(device)
|
||||
|
||||
try: # 如果能加载自动resume
|
||||
_, _, _, epoch_str = utils.load_checkpoint(
|
||||
@ -320,12 +320,12 @@ def train_and_evaluate(
|
||||
rank, non_blocking=True
|
||||
)
|
||||
else:
|
||||
spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps")
|
||||
y, y_lengths = y.to("mps"), y_lengths.to("mps")
|
||||
ssl = ssl.to("mps")
|
||||
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
|
||||
y, y_lengths = y.to(device), y_lengths.to(device)
|
||||
ssl = ssl.to(device)
|
||||
ssl.requires_grad = False
|
||||
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
|
||||
text, text_lengths = text.to("mps"), text_lengths.to("mps")
|
||||
text, text_lengths = text.to(device), text_lengths.to(device)
|
||||
|
||||
with autocast(enabled=hps.train.fp16_run):
|
||||
(
|
||||
@ -532,10 +532,10 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
||||
ssl = ssl.cuda()
|
||||
text, text_lengths = text.cuda(), text_lengths.cuda()
|
||||
else:
|
||||
spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps")
|
||||
y, y_lengths = y.to("mps"), y_lengths.to("mps")
|
||||
ssl = ssl.to("mps")
|
||||
text, text_lengths = text.to("mps"), text_lengths.to("mps")
|
||||
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
|
||||
y, y_lengths = y.to(device), y_lengths.to(device)
|
||||
ssl = ssl.to(device)
|
||||
text, text_lengths = text.to(device), text_lengths.to(device)
|
||||
for test in [0, 1]:
|
||||
y_hat, mask, *_ = generator.module.infer(
|
||||
ssl, spec, spec_lengths, text, text_lengths, test=test
|
||||
|
13
README.md
13
README.md
@ -33,7 +33,7 @@ Unseen speakers few-shot fine-tuning demo:
|
||||
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
||||
|
||||
[教程中文版](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) [User guide (EN)](https://rentry.co/GPT-SoVITS-guide#/)
|
||||
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||
|
||||
## Installation
|
||||
|
||||
@ -61,13 +61,7 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
Only Macs that meet the following conditions can train models:
|
||||
|
||||
- Mac computers with Apple silicon
|
||||
- macOS 12.3 or later
|
||||
- Xcode command-line tools installed by running `xcode-select --install`
|
||||
|
||||
**All Macs can do inference with CPU, which has been demonstrated to outperform GPU inference.**
|
||||
**Note: Models trained with GPUs on Macs are of significantly lower quality than those trained on other devices, so we are temporarily using CPUs instead.**
|
||||
|
||||
First make sure you have installed FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`, then install by using the following commands:
|
||||
|
||||
@ -75,12 +69,9 @@ First make sure you have installed FFmpeg by running `brew install ffmpeg` or `c
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
_Note: Training models will only work if you've installed PyTorch Nightly._
|
||||
|
||||
### Install Manually
|
||||
|
||||
#### Install Dependencies
|
||||
|
7
api.py
7
api.py
@ -13,7 +13,7 @@
|
||||
`-dt` - `默认参考音频文本`
|
||||
`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"`
|
||||
|
||||
`-d` - `推理设备, "cuda","cpu","mps"`
|
||||
`-d` - `推理设备, "cuda","cpu"`
|
||||
`-a` - `绑定地址, 默认"0.0.0.0"`
|
||||
`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`
|
||||
`-fp` - `覆盖 config.py 使用全精度`
|
||||
@ -143,7 +143,7 @@ parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="
|
||||
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
|
||||
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
|
||||
|
||||
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps")
|
||||
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
|
||||
parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
|
||||
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
|
||||
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
|
||||
@ -482,9 +482,6 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
|
||||
wav.seek(0)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
if device == "mps":
|
||||
print('executed torch.mps.empty_cache()')
|
||||
torch.mps.empty_cache()
|
||||
return StreamingResponse(wav, media_type="audio/wav")
|
||||
|
||||
|
||||
|
@ -33,6 +33,8 @@
|
||||
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
||||
|
||||
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
||||
|
||||
## 安装
|
||||
|
||||
中国地区用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验。
|
||||
@ -59,13 +61,7 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
只有符合以下条件的 Mac 可以训练模型:
|
||||
|
||||
- 搭载 Apple 芯片的 Mac
|
||||
- 运行macOS 12.3 或更高版本
|
||||
- 已通过运行`xcode-select --install`安装 Xcode command-line tools
|
||||
|
||||
**所有 Mac 都可使用 CPU 进行推理,且已测试性能优于 GPU。**
|
||||
**注:在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型,所以我们暂时使用CPU进行训练。**
|
||||
|
||||
首先确保你已通过运行 `brew install ffmpeg` 或 `conda install ffmpeg` 安装 FFmpeg,然后运行以下命令安装:
|
||||
|
||||
@ -73,12 +69,9 @@ bash install.sh
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
_注:只有安装了 PyTorch Nightly 才可训练模型。_
|
||||
|
||||
### 手动安装
|
||||
|
||||
#### 安装依赖
|
||||
|
@ -57,13 +57,7 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
モデルをトレーニングできるMacは、以下の条件を満たす必要があります:
|
||||
|
||||
- Appleシリコンを搭載したMacコンピュータ
|
||||
- macOS 12.3以降
|
||||
- `xcode-select --install`を実行してインストールされたXcodeコマンドラインツール
|
||||
|
||||
**すべてのMacはCPUを使用して推論を行うことができ、GPU推論よりも優れていることが実証されています。**
|
||||
**注:MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練します。**
|
||||
|
||||
まず、`brew install ffmpeg`または`conda install ffmpeg`を実行してFFmpegをインストールしたことを確認してください。次に、以下のコマンドを使用してインストールします:
|
||||
|
||||
@ -71,12 +65,9 @@ bash install.sh
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
_注:PyTorch Nightlyをインストールした場合にのみ、モデルのトレーニングが可能です。_
|
||||
|
||||
### 手動インストール
|
||||
|
||||
#### 依存関係をインストールします
|
||||
|
@ -57,13 +57,7 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
다음 조건을 충족하는 Mac에서만 모델을 훈련할 수 있습니다:
|
||||
|
||||
- Apple 실리콘을 탑재한 Mac
|
||||
- macOS 12.3 이상 버전
|
||||
- `xcode-select --install`을 실행하여 Xcode 명령줄 도구가 설치됨
|
||||
|
||||
**모든 Mac은 CPU를 사용하여 추론할 수 있으며, GPU 추론보다 우수한 성능을 보여주었습니다.**
|
||||
**주의: Mac에서 GPU로 훈련된 모델은 다른 장치에서 훈련된 모델에 비해 현저히 낮은 품질을 나타내므로, 우리는 일시적으로 CPU를 사용하여 훈련하고 있습니다.**
|
||||
|
||||
먼저 `brew install ffmpeg` 또는 `conda install ffmpeg`를 실행하여 FFmpeg가 설치되었는지 확인한 다음, 다음 명령어를 사용하여 설치하세요:
|
||||
|
||||
@ -71,12 +65,9 @@ bash install.sh
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
_참고: PyTorch Nightly가 설치되어야만 모델을 훈련할 수 있습니다._
|
||||
|
||||
### 수동 설치
|
||||
|
||||
#### 의존성 설치
|
||||
|
17
webui.py
17
webui.py
@ -55,7 +55,7 @@ from scipy.io import wavfile
|
||||
from tools.my_utils import load_audio
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
|
||||
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
|
||||
|
||||
n_cpu=cpu_count()
|
||||
|
||||
@ -73,18 +73,19 @@ if torch.cuda.is_available() or ngpu != 0:
|
||||
if_gpu_ok = True # 至少有一张能用的N卡
|
||||
gpu_infos.append("%s\t%s" % (i, gpu_name))
|
||||
mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
|
||||
# 判断是否支持mps加速
|
||||
if torch.backends.mps.is_available():
|
||||
if_gpu_ok = True
|
||||
gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
|
||||
mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存
|
||||
# # 判断是否支持mps加速
|
||||
# if torch.backends.mps.is_available():
|
||||
# if_gpu_ok = True
|
||||
# gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
|
||||
# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存
|
||||
|
||||
if if_gpu_ok and len(gpu_infos) > 0:
|
||||
gpu_info = "\n".join(gpu_infos)
|
||||
default_batch_size = min(mem) // 2
|
||||
else:
|
||||
gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
|
||||
default_batch_size = 1
|
||||
gpu_info = ("%s\t%s" % ("0", "CPU"))
|
||||
gpu_infos.append("%s\t%s" % ("0", "CPU"))
|
||||
default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
|
||||
gpus = "-".join([i[0] for i in gpu_infos])
|
||||
|
||||
pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
|
||||
|
Loading…
x
Reference in New Issue
Block a user