Merge pull request #758 from ChasonJiang/fast_inference_

[fast inference] Inference feature enhancements and improvements
RVC-Boss 2024-03-16 21:16:43 +08:00 committed by GitHub
commit 099059de93
15 changed files with 266 additions and 176 deletions

View File

@ -504,18 +504,29 @@ class Text2SemanticDecoder(nn.Module):
def infer_panel_batch_infer_with_flash_attn(
self,
x, ##### all text tokens
x_lens,
prompts, #### reference audio tokens
bert_feature,
x:List[torch.LongTensor], ##### all text tokens
x_lens:torch.LongTensor,
prompts:torch.LongTensor, #### reference audio tokens
bert_feature:List[torch.LongTensor],
top_k: int = -100,
top_p: int = 100,
early_stop_num: int = -1,
temperature: float = 1.0,
):
bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_embedding(x)
# Embed the phones and project the bert_features first, then pad to a shared length, to mitigate the repetition problem. Other factors may also cause repetition.
max_len = 0
for x_item, bert_item in zip(x, bert_feature):
max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
x_list = [self.ar_text_embedding(item) for item in x]
x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
x = torch.stack(x_list, dim=0)
bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
bert_feature = torch.stack(bert_features_list, dim=0)
# bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
# x = self.ar_text_embedding(x)
x = x + bert_feature
x = self.ar_text_position(x)
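The key pattern in this hunk is to embed each variable-length sequence first and only then pad the embeddings to a common length, so pad positions are exact zero vectors rather than embeddings of a pad token. A minimal standalone sketch of that pattern (the helper name `embed_then_pad` and the toy `nn.Embedding` are illustrative, not part of the PR):

```python
import torch
import torch.nn.functional as F

def embed_then_pad(seqs, embed):
    """Embed each variable-length LongTensor, then right-pad to a shared length."""
    embedded = [embed(s) for s in seqs]                         # each: (len_i, dim)
    max_len = max(e.shape[0] for e in embedded)
    padded = [F.pad(e, (0, 0, 0, max_len - e.shape[0])) for e in embedded]
    return torch.stack(padded, dim=0)                           # (batch, max_len, dim)

# usage sketch
emb = torch.nn.Embedding(100, 512)
batch = embed_then_pad([torch.randint(0, 100, (7,)), torch.randint(0, 100, (11,))], emb)
assert batch.shape == (2, 11, 512)
```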
@ -658,17 +669,30 @@ class Text2SemanticDecoder(nn.Module):
def infer_panel_batch_only(
self,
x, ##### all text tokens
x_lens,
prompts, #### reference audio tokens
bert_feature,
x:List[torch.LongTensor], ##### all text tokens
x_lens:torch.LongTensor,
prompts:torch.LongTensor, #### reference audio tokens
bert_feature:List[torch.LongTensor],
top_k: int = -100,
top_p: int = 100,
early_stop_num: int = -1,
temperature: float = 1.0,
):
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
# Embed the phones and project the bert_features first, then pad to a shared length, to mitigate the repetition problem. Other factors may also cause repetition.
max_len = 0
for x_item, bert_item in zip(x, bert_feature):
max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
x_list = [self.ar_text_embedding(item) for item in x]
x_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) if item.shape[0]<max_len else item for item in x_list]
x = torch.stack(x_list, dim=0)
bert_features_list = [self.bert_proj(item.transpose(0, 1)) for item in bert_feature]
bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) if item.shape[0]<max_len else item for item in bert_features_list]
bert_feature = torch.stack(bert_features_list, dim=0)
# bert_feature = self.bert_proj(bert_feature.transpose(1, 2))
# x = self.ar_text_embedding(x)
x = x + bert_feature
x = self.ar_text_position(x)
# AR Decoder
@ -707,19 +731,33 @@ class Text2SemanticDecoder(nn.Module):
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
ref_free = True
x_attn_mask_pad = F.pad(
x_attn_mask,
(0, y_len), ### pad xx's all-0 block to all-0 xx + all-1 xy, shape (x, x+y)
value=True,
)
y_attn_mask = F.pad( ### pad yy's upper-right 1s with 0s for xy on the left, shape (y, x+y)
##### create mask #####
bsz = x.shape[0]
src_len = x_len + y_len
y_lens = torch.LongTensor([y_len]*bsz).to(x.device)
y_mask = make_pad_mask(y_lens)
x_mask = make_pad_mask(x_lens)
# (bsz, x_len + y_len)
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
x_mask = F.pad(
x_attn_mask,
(0, y_len), ### pad xx's all-0 block to all-0 xx + all-1 xy, shape (x, x+y)
value=True,
)
y_mask = F.pad( ### pad yy's upper-right 1s with 0s for xy on the left, shape (y, x+y)
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
(x_len, 0),
value=False,
)
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(
x.device
)
xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).expand(bsz*self.num_head, -1, -1).to(x.device)
# xy_mask = torch.triu(torch.ones(src_len, src_len, dtype=torch.bool, device=x.device), diagonal=1)
xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).expand(bsz, src_len, src_len).repeat(self.num_head, 1, 1)
xy_attn_mask = xy_mask.logical_or(xy_padding_mask)
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
xy_attn_mask = new_attn_mask.masked_fill(xy_attn_mask, float("-inf"))
y_list = [None]*y.shape[0]
batch_idx_map = list(range(y.shape[0]))
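The new mask code merges a block-causal mask (text fully visible, semantic tokens causal) with per-sample padding masks, then converts the boolean result into an additive float mask. A self-contained sketch of the same pattern, assuming a minimal `make_pad_mask` (the repo imports its own version):

```python
import torch
import torch.nn.functional as F

def make_pad_mask(lengths: torch.LongTensor, max_len: int) -> torch.Tensor:
    """True at padded positions; a minimal stand-in for the repo's make_pad_mask."""
    ar = torch.arange(max_len, device=lengths.device)
    return ar.unsqueeze(0) >= lengths.unsqueeze(1)              # (bsz, max_len)

def build_attn_mask(x_lens: torch.LongTensor, x_len: int, y_len: int,
                    num_head: int, dtype=torch.float32) -> torch.Tensor:
    bsz = x_lens.shape[0]
    src_len = x_len + y_len
    # causal structure: text attends to all text; semantic tokens attend causally
    x_part = F.pad(torch.zeros(x_len, x_len, dtype=torch.bool), (0, y_len), value=True)
    y_part = F.pad(torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
                   (x_len, 0), value=False)
    causal = torch.cat([x_part, y_part], dim=0)                 # (src_len, src_len)
    causal = causal.view(1, src_len, src_len).expand(bsz * num_head, -1, -1)
    # padding structure: every query must ignore padded text positions
    y_lens = torch.full((bsz,), y_len, dtype=torch.long, device=x_lens.device)
    pad = torch.cat([make_pad_mask(x_lens, x_len), make_pad_mask(y_lens, y_len)], dim=1)
    pad = pad.view(bsz, 1, src_len).expand(bsz, src_len, src_len).repeat(num_head, 1, 1)
    # additive float mask: 0 where attention is allowed, -inf where it is blocked
    blocked = causal.logical_or(pad)
    return torch.zeros_like(blocked, dtype=dtype).masked_fill(blocked, float("-inf"))
```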

View File

@ -3,6 +3,8 @@ import math
import os, sys
import random
import traceback
from tqdm import tqdm
now_dir = os.getcwd()
sys.path.append(now_dir)
import ffmpeg
@ -49,17 +51,24 @@ custom:
"""
# def set_seed(seed):
# random.seed(seed)
# os.environ['PYTHONHASHSEED'] = str(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.enabled = True
# set_seed(1234)
def set_seed(seed:int):
seed = int(seed)
seed = seed if seed != -1 else random.randrange(1 << 32)
print(f"Set seed to {seed}")
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
try:
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.enabled = True
except:
pass
return seed
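Because `set_seed` resolves `-1` to a concrete random seed and returns it, a caller can log the value and replay a generation later; a short usage sketch:

```python
used_seed = set_seed(-1)   # -1 -> a fresh random 32-bit seed, printed and returned
# ... run inference, store used_seed alongside the output ...
set_seed(used_seed)        # later: reproduce the same sampling
```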
class TTS_Config:
default_configs={
@ -226,7 +235,7 @@ class TTS:
self.cnhuhbert_model = CNHubert(base_path)
self.cnhuhbert_model=self.cnhuhbert_model.eval()
self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device)
if self.configs.is_half:
if self.configs.is_half and str(self.configs.device)!="cpu":
self.cnhuhbert_model = self.cnhuhbert_model.half()
@ -237,7 +246,7 @@ class TTS:
self.bert_model = AutoModelForMaskedLM.from_pretrained(base_path)
self.bert_model=self.bert_model.eval()
self.bert_model = self.bert_model.to(self.configs.device)
if self.configs.is_half:
if self.configs.is_half and str(self.configs.device)!="cpu":
self.bert_model = self.bert_model.half()
@ -270,7 +279,7 @@ class TTS:
vits_model = vits_model.eval()
vits_model.load_state_dict(dict_s2["weight"], strict=False)
self.vits_model = vits_model
if self.configs.is_half:
if self.configs.is_half and str(self.configs.device)!="cpu":
self.vits_model = self.vits_model.half()
@ -288,7 +297,7 @@ class TTS:
t2s_model = t2s_model.to(self.configs.device)
t2s_model = t2s_model.eval()
self.t2s_model = t2s_model
if self.configs.is_half:
if self.configs.is_half and str(self.configs.device)!="cpu":
self.t2s_model = self.t2s_model.half()
def enable_half_precision(self, enable: bool = True):
@ -298,7 +307,7 @@ class TTS:
enable: bool, whether to enable half precision.
'''
if self.configs.device == "cpu" and enable:
if str(self.configs.device) == "cpu" and enable:
print("Half precision is not supported on CPU.")
return
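The repeated `is_half and str(...) != "cpu"` checks above all encode one rule: cast to fp16 only when half precision is requested and the target device is not the CPU. A hypothetical helper illustrating the rule (not part of the PR):

```python
import torch

def maybe_half(model: torch.nn.Module, is_half: bool, device: torch.device) -> torch.nn.Module:
    """Cast to fp16 only off-CPU; fp16 inference is poorly supported on CPU."""
    if is_half and str(device) != "cpu":
        return model.half()
    return model
```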
@ -420,7 +429,14 @@ class TTS:
batch = torch.stack(padded_sequences)
return batch
def to_batch(self, data:list, prompt_data:dict=None, batch_size:int=5, threshold:float=0.75, split_bucket:bool=True):
def to_batch(self, data:list,
prompt_data:dict=None,
batch_size:int=5,
threshold:float=0.75,
split_bucket:bool=True,
device:torch.device=torch.device("cpu"),
precison:torch.dtype=torch.float32,
):
_data:list = []
index_and_len_list = []
@ -472,14 +488,14 @@ class TTS:
for item in item_list:
if prompt_data is not None:
all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1)\
.to(dtype=self.precison)
all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"])
phones = torch.LongTensor(item["phones"])
.to(dtype=precison, device=device)
all_phones = torch.LongTensor(prompt_data["phones"]+item["phones"]).to(device)
phones = torch.LongTensor(item["phones"]).to(device)
# norm_text = prompt_data["norm_text"]+item["norm_text"]
else:
all_bert_features = item["bert_features"]\
.to(dtype=self.precison)
phones = torch.LongTensor(item["phones"])
.to(dtype=precison, device=device)
phones = torch.LongTensor(item["phones"]).to(device)
all_phones = phones
# norm_text = item["norm_text"]
@ -494,19 +510,33 @@ class TTS:
norm_text_batch.append(item["norm_text"])
phones_batch = phones_list
max_len = max(bert_max_len, phones_max_len)
all_phones_batch = all_phones_list
all_bert_features_batch = all_bert_features_list
# max_len = max(bert_max_len, phones_max_len)
# phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len)
all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
#### Padding phones and bert_features directly increases the chance of repetition.
# all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
# all_bert_features_batch = all_bert_features_list
all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=self.precison)
for idx, item in enumerate(all_bert_features_list):
all_bert_features_batch[idx, :, : item.shape[-1]] = item
# all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precison, device=device)
# for idx, item in enumerate(all_bert_features_list):
# all_bert_features_batch[idx, :, : item.shape[-1]] = item
# #### Embed phones and project bert_features first, then pad to a shared length, to mitigate repetition. Other factors may also cause it.
# all_phones_list = [self.t2s_model.model.ar_text_embedding(item.to(self.t2s_model.device)) for item in all_phones_list]
# all_phones_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) for item in all_phones_list]
# all_phones_batch = torch.stack(all_phones_list, dim=0)
# all_bert_features_list = [self.t2s_model.model.bert_proj(item.to(self.t2s_model.device).transpose(0, 1)) for item in all_bert_features_list]
# all_bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) for item in all_bert_features_list]
# all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)
batch = {
"phones": phones_batch,
"phones_len": torch.LongTensor(phones_len_list),
"phones_len": torch.LongTensor(phones_len_list).to(device),
"all_phones": all_phones_batch,
"all_phones_len": torch.LongTensor(all_phones_len_list),
"all_phones_len": torch.LongTensor(all_phones_len_list).to(device),
"all_bert_features": all_bert_features_batch,
"norm_text": norm_text_batch
}
@ -552,14 +582,16 @@ class TTS:
"prompt_text": "", # str. prompt text for the reference audio
"prompt_lang": "", # str. language of the prompt text for the reference audio
"top_k": 5, # int. top k sampling
"top_p": 1, # float. top p sampling
"temperature": 1, # float. temperature for sampling
"top_p": 1, # float. top p sampling
"temperature": 1, # float. temperature for sampling
"text_split_method": "", # str. text split method, see text_segmentaion_method.py for details.
"batch_size": 1, # int. batch size for inference
"batch_threshold": 0.75, # float. threshold for batch splitting.
"split_bucket: True, # bool. whether to split the batch into multiple buckets.
"return_fragment": False, # bool. step by step return the audio fragment.
"speed_factor":1.0, # float. control the speed of the synthesized audio.
"fragment_interval":0.3, # float. to control the interval of the audio fragment.
"seed": -1, # int. random seed for reproducibility.
}
returns:
tuple[int, np.ndarray]: sampling rate and audio data.
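For reference, a plausible `inputs` dict for `TTS.run` assembled from the docstring above; the `ref_audio_path` key, the `"cut5"` split-method name, and the file paths are illustrative assumptions, not values confirmed by this diff:

```python
inputs = {
    "text": "Hello, world.",
    "text_lang": "en",
    "ref_audio_path": "ref.wav",          # assumed key/path for the reference audio
    "prompt_text": "reference transcript",
    "prompt_lang": "en",
    "top_k": 5,
    "top_p": 1.0,
    "temperature": 1.0,
    "text_split_method": "cut5",          # assumed method name
    "batch_size": 4,
    "batch_threshold": 0.75,
    "split_bucket": True,
    "return_fragment": False,
    "speed_factor": 1.0,
    "fragment_interval": 0.3,
    "seed": 1234,
}
# for sampling_rate, audio in tts_pipline.run(inputs): ...
```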
@ -580,9 +612,13 @@ class TTS:
speed_factor = inputs.get("speed_factor", 1.0)
split_bucket = inputs.get("split_bucket", True)
return_fragment = inputs.get("return_fragment", False)
fragment_interval = inputs.get("fragment_interval", 0.3)
seed = inputs.get("seed", -1)
seed = -1 if seed in ["", None] else seed
set_seed(seed)
if return_fragment:
split_bucket = False
# split_bucket = False
print(i18n("分段返回模式已开启"))
if split_bucket:
split_bucket = False
@ -590,7 +626,10 @@ class TTS:
if split_bucket:
print(i18n("分桶处理模式已开启"))
if fragment_interval<0.01:
fragment_interval = 0.01
print(i18n("分段间隔过小已自动设置为0.01"))
no_prompt_text = False
if prompt_text in [None, ""]:
@ -627,19 +666,58 @@ class TTS:
###### text preprocessing ########
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method)
if len(data) == 0:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate * 0.3),
dtype=np.int16)
return
t1 = ttime()
data, batch_index_list = self.to_batch(data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=split_bucket
)
data:list = None
if not return_fragment:
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method)
if len(data) == 0:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
dtype=np.int16)
return
batch_index_list:list = None
data, batch_index_list = self.to_batch(data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=split_bucket,
device=self.configs.device,
precison=self.precison
)
else:
print(i18n("############ 切分文本 ############"))
texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method)
data = []
for i in range(len(texts)):
if i%batch_size == 0:
data.append([])
data[-1].append(texts[i])
def make_batch(batch_texts):
batch_data = []
print(i18n("############ 提取文本Bert特征 ############"))
for text in tqdm(batch_texts):
phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(text, text_lang)
if phones is None:
continue
res={
"phones": phones,
"bert_features": bert_features,
"norm_text": norm_text,
}
batch_data.append(res)
if len(batch_data) == 0:
return None
batch, _ = self.to_batch(batch_data,
prompt_data=self.prompt_cache if not no_prompt_text else None,
batch_size=batch_size,
threshold=batch_threshold,
split_bucket=False,
device=self.configs.device,
precison=self.precison
)
return batch[0]
t2 = ttime()
try:
print("############ 推理 ############")
@ -649,26 +727,23 @@ class TTS:
audio = []
for item in data:
t3 = ttime()
batch_phones = item["phones"]
batch_phones_len = item["phones_len"]
all_phoneme_ids = item["all_phones"]
all_phoneme_lens = item["all_phones_len"]
all_bert_features = item["all_bert_features"]
norm_text = item["norm_text"]
# batch_phones = batch_phones.to(self.configs.device)
batch_phones_len = batch_phones_len.to(self.configs.device)
all_phoneme_ids = all_phoneme_ids.to(self.configs.device)
all_phoneme_lens = all_phoneme_lens.to(self.configs.device)
all_bert_features = all_bert_features.to(self.configs.device)
if self.configs.is_half:
all_bert_features = all_bert_features.half()
if return_fragment:
item = make_batch(item)
if item is None:
continue
batch_phones:List[torch.LongTensor] = item["phones"]
batch_phones_len:torch.LongTensor = item["phones_len"]
all_phoneme_ids:List[torch.LongTensor] = item["all_phones"]
all_phoneme_lens:torch.LongTensor = item["all_phones_len"]
all_bert_features:List[torch.LongTensor] = item["all_bert_features"]
norm_text:str = item["norm_text"]
print(i18n("前端处理后的文本(每句):"), norm_text)
if no_prompt_text :
prompt = None
else:
prompt = self.prompt_cache["prompt_semantic"].expand(all_phoneme_ids.shape[0], -1).to(self.configs.device)
prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device)
with torch.no_grad():
pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
@ -734,14 +809,16 @@ class TTS:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
yield self.audio_postprocess([batch_audio_fragment],
self.configs.sampling_rate,
batch_index_list,
None,
speed_factor,
split_bucket)
False,
fragment_interval
)
else:
audio.append(batch_audio_fragment)
if self.stop_flag:
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate * 0.3),
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
dtype=np.int16)
return
@ -751,7 +828,9 @@ class TTS:
self.configs.sampling_rate,
batch_index_list,
speed_factor,
split_bucket)
split_bucket,
fragment_interval
)
except Exception as e:
traceback.print_exc()
# Must return an empty audio clip, otherwise GPU memory will not be released.
@ -769,7 +848,7 @@ class TTS:
def empty_cache(self):
try:
if str(self.configs.device) == "cuda":
if "cuda" in str(self.configs.device):
torch.cuda.empty_cache()
elif str(self.configs.device) == "mps":
torch.mps.empty_cache()
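The substring check fixed above matters because `str(device)` can be `"cuda:0"` rather than `"cuda"`, so an equality test silently skips the cache flush. A condensed sketch of the corrected logic:

```python
import torch

def empty_cache(device: torch.device) -> None:
    d = str(device)
    if "cuda" in d:          # matches "cuda", "cuda:0", "cuda:1", ...
        torch.cuda.empty_cache()
    elif "mps" in d:
        torch.mps.empty_cache()
```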
@ -781,9 +860,11 @@ class TTS:
sr:int,
batch_index_list:list=None,
speed_factor:float=1.0,
split_bucket:bool=True)->tuple[int, np.ndarray]:
split_bucket:bool=True,
fragment_interval:float=0.3
)->tuple[int, np.ndarray]:
zero_wav = torch.zeros(
int(self.configs.sampling_rate * 0.3),
int(self.configs.sampling_rate * fragment_interval),
dtype=self.precison,
device=self.configs.device
)
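`audio_postprocess` now scales the inter-fragment silence by `fragment_interval` instead of a hard-coded 0.3 s. A simplified numpy analogue of that joining step (the real code works on torch tensors in the pipeline's precision and device):

```python
import numpy as np

def join_fragments(fragments, sr, fragment_interval=0.3):
    """Concatenate fragments with a configurable silence gap after each one."""
    gap = np.zeros(int(sr * fragment_interval), dtype=fragments[0].dtype)
    out = []
    for frag in fragments:
        out.extend((frag, gap))
    return np.concatenate(out)
```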

View File

@ -28,7 +28,7 @@ is_share = os.environ.get("is_share", "False")
is_share = eval(is_share)
if "_CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
is_half = eval(os.environ.get("is_half", "True")) and not torch.backends.mps.is_available()
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
gpt_path = os.environ.get("gpt_path", None)
sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
@ -41,7 +41,7 @@ from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # Ensure this is also set when the inference UI is launched directly.
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # Ensure this is also set when the inference UI is launched directly.
if torch.cuda.is_available():
device = "cuda"
@ -91,7 +91,8 @@ def inference(text, text_lang,
top_p, temperature,
text_split_method, batch_size,
speed_factor, ref_text_free,
split_bucket
split_bucket,fragment_interval,
seed,
):
inputs={
"text": text,
@ -106,7 +107,9 @@ def inference(text, text_lang,
"batch_size":int(batch_size),
"speed_factor":float(speed_factor),
"split_bucket":split_bucket,
"return_fragment":False
"return_fragment":False,
"fragment_interval":fragment_interval,
"seed":seed,
}
for item in tts_pipline.run(inputs):
@ -188,6 +191,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Column():
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True)
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True)
speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="speed_factor",value=1.0,interactive=True)
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
@ -201,6 +205,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
)
with gr.Row():
split_bucket = gr.Checkbox(label=i18n("数据分桶(可能会降低一点计算量,选就对了)"), value=True, interactive=True, show_label=True)
seed = gr.Number(label=i18n("随机种子"),value=-1)
# with gr.Column():
output = gr.Audio(label=i18n("输出的语音"))
with gr.Row():
@ -216,7 +221,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
top_k, top_p, temperature,
how_to_cut, batch_size,
speed_factor, ref_text_free,
split_bucket
split_bucket,fragment_interval,
seed
],
[output],
)

View File

@ -894,6 +894,7 @@ class SynthesizerTrn(nn.Module):
if freeze_quantizer:
self.ssl_proj.requires_grad_(False)
self.quantizer.requires_grad_(False)
#self.quantizer.eval()
# self.enc_p.text_embedding.requires_grad_(False)
# self.enc_p.encoder_text.requires_grad_(False)
# self.enc_p.mrte.requires_grad_(False)

View File

@ -49,8 +49,8 @@ if os.path.exists(txt_path) == False:
os.makedirs(bert_dir, exist_ok=True)
if torch.cuda.is_available():
device = "cuda:0"
elif torch.backends.mps.is_available():
device = "mps"
# elif torch.backends.mps.is_available():
# device = "mps"
else:
device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)

View File

@ -50,8 +50,8 @@ maxx=0.95
alpha=0.5
if torch.cuda.is_available():
device = "cuda:0"
elif torch.backends.mps.is_available():
device = "mps"
# elif torch.backends.mps.is_available():
# device = "mps"
else:
device = "cpu"
model=cnhubert.get_model()

View File

@ -40,8 +40,8 @@ if os.path.exists(semantic_path) == False:
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps"
# elif torch.backends.mps.is_available():
# device = "mps"
else:
device = "cpu"
hps = utils.get_hparams_from_file(s2config_path)

View File

@ -118,16 +118,16 @@ def main(args):
os.environ["MASTER_ADDR"]="localhost"
trainer: Trainer = Trainer(
max_epochs=config["train"]["epochs"],
accelerator="gpu",
accelerator="gpu" if torch.cuda.is_available() else "cpu",
# val_check_interval=9999999999999999999999,### do not validate
# check_val_every_n_epoch=None,
limit_val_batches=0,
devices=-1,
devices=-1 if torch.cuda.is_available() else 1,
benchmark=False,
fast_dev_run=False,
strategy = "auto" if torch.backends.mps.is_available() else DDPStrategy(
strategy = DDPStrategy(
process_group_backend="nccl" if platform.system() != "Windows" else "gloo"
), # MPS does not support multi-node training
) if torch.cuda.is_available() else "auto",
precision=config["train"]["precision"],
logger=logger,
num_sanity_val_steps=0,

View File

@ -41,15 +41,15 @@ torch.set_float32_matmul_precision("medium") # lowest precision but fastest (i.e.
# from config import pretrained_s2G,pretrained_s2D
global_step = 0
device = "cpu" # cuda以外的设备等mps优化后加入
def main():
"""Assume Single Node Multi GPUs Training Only"""
assert torch.cuda.is_available() or torch.backends.mps.is_available(), "Only GPU training is allowed."
if torch.backends.mps.is_available():
n_gpus = 1
else:
if torch.cuda.is_available():
n_gpus = torch.cuda.device_count()
else:
n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
@ -73,7 +73,7 @@ def run(rank, n_gpus, hps):
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
dist.init_process_group(
backend = "gloo" if os.name == "nt" or torch.backends.mps.is_available() else "nccl",
backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://",
world_size=n_gpus,
rank=rank,
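The backend choice above generalizes to: `gloo` whenever NCCL is unavailable (Windows or CPU-only), `nccl` otherwise. As a one-liner (sketch):

```python
import os
import torch

def ddp_backend() -> str:
    # NCCL requires CUDA and is not supported on Windows; otherwise fall back to gloo
    return "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
```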
@ -137,9 +137,9 @@ def run(rank, n_gpus, hps):
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).to("mps")
).to(device)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to("mps")
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device)
for name, param in net_g.named_parameters():
if not param.requires_grad:
print(name, "not requires_grad")
@ -187,8 +187,8 @@ def run(rank, n_gpus, hps):
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
else:
net_g = net_g.to("mps")
net_d = net_d.to("mps")
net_g = net_g.to(device)
net_d = net_d.to(device)
try: # auto-resume if a checkpoint can be loaded
_, _, _, epoch_str = utils.load_checkpoint(
@ -320,12 +320,12 @@ def train_and_evaluate(
rank, non_blocking=True
)
else:
spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps")
y, y_lengths = y.to("mps"), y_lengths.to("mps")
ssl = ssl.to("mps")
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
y, y_lengths = y.to(device), y_lengths.to(device)
ssl = ssl.to(device)
ssl.requires_grad = False
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
text, text_lengths = text.to("mps"), text_lengths.to("mps")
text, text_lengths = text.to(device), text_lengths.to(device)
with autocast(enabled=hps.train.fp16_run):
(
@ -532,10 +532,10 @@ def evaluate(hps, generator, eval_loader, writer_eval):
ssl = ssl.cuda()
text, text_lengths = text.cuda(), text_lengths.cuda()
else:
spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps")
y, y_lengths = y.to("mps"), y_lengths.to("mps")
ssl = ssl.to("mps")
text, text_lengths = text.to("mps"), text_lengths.to("mps")
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
y, y_lengths = y.to(device), y_lengths.to(device)
ssl = ssl.to(device)
text, text_lengths = text.to(device), text_lengths.to(device)
for test in [0, 1]:
y_hat, mask, *_ = generator.module.infer(
ssl, spec, spec_lengths, text, text_lengths, test=test

View File

@ -33,7 +33,7 @@ Unseen speakers few-shot fine-tuning demo:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
[教程中文版](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) [User guide (EN)](https://rentry.co/GPT-SoVITS-guide#/)
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
## Installation
@ -61,13 +61,7 @@ bash install.sh
### macOS
Only Macs that meet the following conditions can train models:
- Mac computers with Apple silicon
- macOS 12.3 or later
- Xcode command-line tools installed by running `xcode-select --install`
**All Macs can do inference with CPU, which has been demonstrated to outperform GPU inference.**
**Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.**
First make sure you have installed FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`, then install by using the following commands:
@ -75,12 +69,9 @@ First make sure you have installed FFmpeg by running `brew install ffmpeg` or `c
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt
```
_Note: Training models will only work if you've installed PyTorch Nightly._
### Install Manually
#### Install Dependences

api.py
View File

@ -13,7 +13,7 @@
`-dt` - `default reference audio text`
`-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
`-d` - `inference device, "cuda","cpu","mps"`
`-d` - `inference device, "cuda","cpu"`
`-a` - `bind address, default "127.0.0.1"`
`-p` - `bind port, default 9880, configurable in config.py`
`-fp` - `override config.py and use full precision`
@ -143,7 +143,7 @@ parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu / mps")
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
@ -482,9 +482,6 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
wav.seek(0)
torch.cuda.empty_cache()
if device == "mps":
print('executed torch.mps.empty_cache()')
torch.mps.empty_cache()
return StreamingResponse(wav, media_type="audio/wav")

View File

@ -33,6 +33,8 @@
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
## Installation
Users in China can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to try the AutoDL cloud image.
@ -59,13 +61,7 @@ bash install.sh
### macOS
Only Macs that meet the following conditions can train models:
- Mac computers with Apple silicon
- macOS 12.3 or later
- Xcode command-line tools installed by running `xcode-select --install`
**All Macs can do inference with CPU, which has been shown to outperform GPU inference.**
**Note: Models trained with GPUs on Macs are of significantly lower quality than those trained on other devices, so we are temporarily training on CPU.**
First make sure you have installed FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`, then install with the following commands:
@ -73,12 +69,9 @@ bash install.sh
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt
```
_Note: Training models will only work if you've installed PyTorch Nightly._
### Install Manually
#### Install Dependencies

View File

@ -57,13 +57,7 @@ bash install.sh
### macOS
Only Macs that meet the following conditions can train models:
- Mac computers with Apple silicon
- macOS 12.3 or later
- Xcode command-line tools installed by running `xcode-select --install`
**All Macs can do inference with CPU, which has been demonstrated to outperform GPU inference.**
**Note: Models trained with GPUs on Macs are of significantly lower quality than those trained on other devices, so we are temporarily training on CPU.**
First make sure you have installed FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`, then install with the following commands:
@ -71,12 +65,9 @@ bash install.sh
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt
```
_Note: Training models will only work if you've installed PyTorch Nightly._
### Install Manually
#### Install Dependencies

View File

@ -57,13 +57,7 @@ bash install.sh
### macOS
Only Macs that meet the following conditions can train models:
- Mac computers with Apple silicon
- macOS 12.3 or later
- Xcode command-line tools installed by running `xcode-select --install`
**All Macs can do inference with CPU, which has been demonstrated to outperform GPU inference.**
**Note: Models trained with GPUs on Macs are of significantly lower quality than those trained on other devices, so we are temporarily training on CPU.**
First make sure you have installed FFmpeg by running `brew install ffmpeg` or `conda install ffmpeg`, then install with the following commands:
@ -71,12 +65,9 @@ bash install.sh
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip3 install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt
```
_Note: Training models will only work if you've installed PyTorch Nightly._
### Install Manually
#### Install Dependencies

View File

@ -55,7 +55,7 @@ from scipy.io import wavfile
from tools.my_utils import load_audio
from multiprocessing import cpu_count
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # fall back to CPU for ops that MPS does not support
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # fall back to CPU for ops that MPS does not support
n_cpu=cpu_count()
@ -73,18 +73,19 @@ if torch.cuda.is_available() or ngpu != 0:
if_gpu_ok = True # at least one usable NVIDIA GPU
gpu_infos.append("%s\t%s" % (i, gpu_name))
mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
# check whether MPS acceleration is supported
if torch.backends.mps.is_available():
if_gpu_ok = True
gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # in practice, using system memory as VRAM does not run out of memory
# # check whether MPS acceleration is supported
# if torch.backends.mps.is_available():
# if_gpu_ok = True
# gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # in practice, using system memory as VRAM does not run out of memory
if if_gpu_ok and len(gpu_infos) > 0:
gpu_info = "\n".join(gpu_infos)
default_batch_size = min(mem) // 2
else:
gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
default_batch_size = 1
gpu_info = ("%s\t%s" % ("0", "CPU"))
gpu_infos.append("%s\t%s" % ("0", "CPU"))
default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
gpus = "-".join([i[0] for i in gpu_infos])
pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"