mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-08-09 17:47:30 +08:00
Fix bugs in install.sh, reduce log noise, and improve error reporting (#2464)
* Update Install.sh
* Format Code
* Delete dev null
* Update README, Support Dark Mode in CSS/JS
This commit is contained in:
parent
7dec5f5bb0
commit
6fdc67ca83
@ -28,7 +28,8 @@ class Text2SemanticLightningModule(LightningModule):
|
|||||||
self.load_state_dict(
|
self.load_state_dict(
|
||||||
torch.load(
|
torch.load(
|
||||||
pretrained_s1,
|
pretrained_s1,
|
||||||
map_location="cpu", weights_only=False,
|
map_location="cpu",
|
||||||
|
weights_only=False,
|
||||||
)["weight"],
|
)["weight"],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -32,19 +32,21 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|||||||
|
|
||||||
from tools.audio_sr import AP_BWE
|
from tools.audio_sr import AP_BWE
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
from tools.my_utils import load_audio
|
|
||||||
from TTS_infer_pack.text_segmentation_method import splits
|
from TTS_infer_pack.text_segmentation_method import splits
|
||||||
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
|
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
|
||||||
from sv import SV
|
from sv import SV
|
||||||
# Module-level cache of resampling transforms, keyed by the string
# "<sr0>-<sr1>-<device>", so each (rates, device) combination is built once.
resample_transform_dict = {}


def resample(audio_tensor, sr0, sr1, device):
    """Resample ``audio_tensor`` from sample rate ``sr0`` to ``sr1``.

    A ``torchaudio.transforms.Resample(sr0, sr1)`` instance is created on the
    first call for a given ``(sr0, sr1, device)`` combination, moved to
    ``device`` and stored in ``resample_transform_dict``; later calls with the
    same combination reuse the cached transform.

    Args:
        audio_tensor: Input passed unchanged to the cached transform.
            NOTE(review): presumably a waveform tensor already on ``device``
            -- confirm against callers.
        sr0: Source sample rate.
        sr1: Target sample rate.
        device: Device the transform is moved to; its ``str()`` form is part
            of the cache key.

    Returns:
        Whatever the cached transform returns for ``audio_tensor``.
    """
    # No ``global`` statement is needed: the dict is mutated, never rebound.
    key = f"{sr0}-{sr1}-{device!s}"
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
|
||||||
|
|
||||||
|
|
||||||
language = os.environ.get("language", "Auto")
|
language = os.environ.get("language", "Auto")
|
||||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||||
i18n = I18nAuto(language=language)
|
i18n = I18nAuto(language=language)
|
||||||
@ -111,6 +113,7 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
|
|||||||
|
|
||||||
return processed_audio
|
return processed_audio
|
||||||
|
|
||||||
|
|
||||||
class DictToAttrRecursive(dict):
|
class DictToAttrRecursive(dict):
|
||||||
def __init__(self, input_dict):
|
def __init__(self, input_dict):
|
||||||
super().__init__(input_dict)
|
super().__init__(input_dict)
|
||||||
@ -479,7 +482,7 @@ class TTS:
|
|||||||
def init_vits_weights(self, weights_path: str):
|
def init_vits_weights(self, weights_path: str):
|
||||||
self.configs.vits_weights_path = weights_path
|
self.configs.vits_weights_path = weights_path
|
||||||
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
|
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
|
||||||
if "Pro"in model_version:
|
if "Pro" in model_version:
|
||||||
self.init_sv_model()
|
self.init_sv_model()
|
||||||
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
|
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
|
||||||
|
|
||||||
@ -498,9 +501,9 @@ class TTS:
|
|||||||
else:
|
else:
|
||||||
hps["model"]["version"] = "v2"
|
hps["model"]["version"] = "v2"
|
||||||
version = hps["model"]["version"]
|
version = hps["model"]["version"]
|
||||||
v3v4set={"v3", "v4"}
|
v3v4set = {"v3", "v4"}
|
||||||
if model_version not in v3v4set:
|
if model_version not in v3v4set:
|
||||||
if "Pro"not in model_version:
|
if "Pro" not in model_version:
|
||||||
model_version = version
|
model_version = version
|
||||||
else:
|
else:
|
||||||
hps["model"]["version"] = model_version
|
hps["model"]["version"] = model_version
|
||||||
@ -542,7 +545,7 @@ class TTS:
|
|||||||
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
|
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
|
||||||
del vits_model.enc_q
|
del vits_model.enc_q
|
||||||
|
|
||||||
self.is_v2pro=model_version in {"v2Pro","v2ProPlus"}
|
self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
|
||||||
|
|
||||||
if if_lora_v3 == False:
|
if if_lora_v3 == False:
|
||||||
print(
|
print(
|
||||||
@ -632,7 +635,9 @@ class TTS:
|
|||||||
)
|
)
|
||||||
self.vocoder.remove_weight_norm()
|
self.vocoder.remove_weight_norm()
|
||||||
state_dict_g = torch.load(
|
state_dict_g = torch.load(
|
||||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
|
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
|
||||||
|
map_location="cpu",
|
||||||
|
weights_only=False,
|
||||||
)
|
)
|
||||||
print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
|
print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
|
||||||
|
|
||||||
@ -752,11 +757,13 @@ class TTS:
|
|||||||
|
|
||||||
if raw_sr != self.configs.sampling_rate:
|
if raw_sr != self.configs.sampling_rate:
|
||||||
audio = raw_audio.to(self.configs.device)
|
audio = raw_audio.to(self.configs.device)
|
||||||
if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
|
if audio.shape[0] == 2:
|
||||||
|
audio = audio.mean(0).unsqueeze(0)
|
||||||
audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
|
audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
|
||||||
else:
|
else:
|
||||||
audio = raw_audio.to(self.configs.device)
|
audio = raw_audio.to(self.configs.device)
|
||||||
if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
|
if audio.shape[0] == 2:
|
||||||
|
audio = audio.mean(0).unsqueeze(0)
|
||||||
|
|
||||||
maxx = audio.abs().max()
|
maxx = audio.abs().max()
|
||||||
if maxx > 1:
|
if maxx > 1:
|
||||||
@ -775,8 +782,9 @@ class TTS:
|
|||||||
audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
|
audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
|
||||||
if self.configs.is_half:
|
if self.configs.is_half:
|
||||||
audio = audio.half()
|
audio = audio.half()
|
||||||
else:audio=None
|
else:
|
||||||
return spec,audio
|
audio = None
|
||||||
|
return spec, audio
|
||||||
|
|
||||||
def _set_prompt_semantic(self, ref_wav_path: str):
|
def _set_prompt_semantic(self, ref_wav_path: str):
|
||||||
zero_wav = np.zeros(
|
zero_wav = np.zeros(
|
||||||
@ -1073,7 +1081,10 @@ class TTS:
|
|||||||
|
|
||||||
###### setting reference audio and prompt text preprocessing ########
|
###### setting reference audio and prompt text preprocessing ########
|
||||||
t0 = time.perf_counter()
|
t0 = time.perf_counter()
|
||||||
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"] or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)):
|
if (ref_audio_path is not None) and (
|
||||||
|
ref_audio_path != self.prompt_cache["ref_audio_path"]
|
||||||
|
or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)
|
||||||
|
):
|
||||||
if not os.path.exists(ref_audio_path):
|
if not os.path.exists(ref_audio_path):
|
||||||
raise ValueError(f"{ref_audio_path} not exists")
|
raise ValueError(f"{ref_audio_path} not exists")
|
||||||
self.set_ref_audio(ref_audio_path)
|
self.set_ref_audio(ref_audio_path)
|
||||||
@ -1212,9 +1223,10 @@ class TTS:
|
|||||||
t_34 += t4 - t3
|
t_34 += t4 - t3
|
||||||
|
|
||||||
refer_audio_spec = []
|
refer_audio_spec = []
|
||||||
if self.is_v2pro:sv_emb=[]
|
if self.is_v2pro:
|
||||||
for spec,audio_tensor in self.prompt_cache["refer_spec"]:
|
sv_emb = []
|
||||||
spec=spec.to(dtype=self.precision, device=self.configs.device)
|
for spec, audio_tensor in self.prompt_cache["refer_spec"]:
|
||||||
|
spec = spec.to(dtype=self.precision, device=self.configs.device)
|
||||||
refer_audio_spec.append(spec)
|
refer_audio_spec.append(spec)
|
||||||
if self.is_v2pro:
|
if self.is_v2pro:
|
||||||
sv_emb.append(self.sv_model.compute_embedding3(audio_tensor))
|
sv_emb.append(self.sv_model.compute_embedding3(audio_tensor))
|
||||||
@ -1249,10 +1261,14 @@ class TTS:
|
|||||||
torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||||
)
|
)
|
||||||
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
|
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
|
||||||
if self.is_v2pro!=True:
|
if self.is_v2pro != True:
|
||||||
_batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
|
_batch_audio_fragment = self.vits_model.decode(
|
||||||
|
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
|
||||||
|
).detach()[0, 0, :]
|
||||||
else:
|
else:
|
||||||
_batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
|
_batch_audio_fragment = self.vits_model.decode(
|
||||||
|
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
|
||||||
|
).detach()[0, 0, :]
|
||||||
audio_frag_end_idx.insert(0, 0)
|
audio_frag_end_idx.insert(0, 0)
|
||||||
batch_audio_fragment = [
|
batch_audio_fragment = [
|
||||||
_batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
|
_batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
|
||||||
@ -1266,9 +1282,13 @@ class TTS:
|
|||||||
pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
|
pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
|
||||||
) # .unsqueeze(0)#mq要多unsqueeze一次
|
) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||||
if self.is_v2pro != True:
|
if self.is_v2pro != True:
|
||||||
audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
|
audio_fragment = self.vits_model.decode(
|
||||||
|
_pred_semantic, phones, refer_audio_spec, speed=speed_factor
|
||||||
|
).detach()[0, 0, :]
|
||||||
else:
|
else:
|
||||||
audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
|
audio_fragment = self.vits_model.decode(
|
||||||
|
_pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
|
||||||
|
).detach()[0, 0, :]
|
||||||
batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分
|
batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分
|
||||||
else:
|
else:
|
||||||
if parallel_infer:
|
if parallel_infer:
|
||||||
@ -1410,7 +1430,7 @@ class TTS:
|
|||||||
raw_entry = self.prompt_cache["refer_spec"][0]
|
raw_entry = self.prompt_cache["refer_spec"][0]
|
||||||
if isinstance(raw_entry, tuple):
|
if isinstance(raw_entry, tuple):
|
||||||
raw_entry = raw_entry[0]
|
raw_entry = raw_entry[0]
|
||||||
refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device)
|
refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
|
||||||
|
|
||||||
fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
||||||
ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
|
ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
|
||||||
@ -1480,7 +1500,7 @@ class TTS:
|
|||||||
raw_entry = self.prompt_cache["refer_spec"][0]
|
raw_entry = self.prompt_cache["refer_spec"][0]
|
||||||
if isinstance(raw_entry, tuple):
|
if isinstance(raw_entry, tuple):
|
||||||
raw_entry = raw_entry[0]
|
raw_entry = raw_entry[0]
|
||||||
refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device)
|
refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
|
||||||
|
|
||||||
fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
||||||
ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
|
ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
|
||||||
|
@ -160,7 +160,9 @@ class TextPreprocessor:
|
|||||||
else:
|
else:
|
||||||
for tmp in LangSegmenter.getTexts(text):
|
for tmp in LangSegmenter.getTexts(text):
|
||||||
if langlist:
|
if langlist:
|
||||||
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
|
if (tmp["lang"] == "en" and langlist[-1] == "en") or (
|
||||||
|
tmp["lang"] != "en" and langlist[-1] != "en"
|
||||||
|
):
|
||||||
textlist[-1] += tmp["text"]
|
textlist[-1] += tmp["text"]
|
||||||
continue
|
continue
|
||||||
if tmp["lang"] == "en":
|
if tmp["lang"] == "en":
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||||
|
|
||||||
"""
|
|
||||||
Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
|
||||||
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
|
|
||||||
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
|
||||||
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
|
||||||
"""
|
"""
|
||||||
|
Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||||
|
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
|
||||||
|
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
||||||
|
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
||||||
|
"""
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import math
|
import math
|
||||||
@ -16,15 +15,14 @@ import torch.nn.functional as F
|
|||||||
import pooling_layers as pooling_layers
|
import pooling_layers as pooling_layers
|
||||||
from fusion import AFF
|
from fusion import AFF
|
||||||
|
|
||||||
class ReLU(nn.Hardtanh):
    """Clipped ReLU implemented as a ``Hardtanh`` bounded to ``[0, 20]``.

    Unlike ``torch.nn.ReLU``, outputs are capped at 20 as well as floored at 0,
    because the parent ``Hardtanh`` is constructed with ``min_val=0`` and
    ``max_val=20``.

    Args:
        inplace: Forwarded to ``nn.Hardtanh``; when true the activation is
            applied in place.
    """

    def __init__(self, inplace=False):
        # Positional order of Hardtanh.__init__ is (min_val, max_val, inplace).
        super().__init__(0, 20, inplace)

    def __repr__(self):
        """Return ``"ReLU (inplace)"`` or ``"ReLU ()"`` depending on ``inplace``."""
        inplace_str = "inplace" if self.inplace else ""
        return f"{self.__class__.__name__} ({inplace_str})"
|
|
||||||
|
|
||||||
|
|
||||||
class BasicBlockERes2Net(nn.Module):
|
class BasicBlockERes2Net(nn.Module):
|
||||||
@ -32,28 +30,28 @@ class BasicBlockERes2Net(nn.Module):
|
|||||||
|
|
||||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||||
super(BasicBlockERes2Net, self).__init__()
|
super(BasicBlockERes2Net, self).__init__()
|
||||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
|
||||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||||
self.nums = scale
|
self.nums = scale
|
||||||
|
|
||||||
convs=[]
|
convs = []
|
||||||
bns=[]
|
bns = []
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||||
bns.append(nn.BatchNorm2d(width))
|
bns.append(nn.BatchNorm2d(width))
|
||||||
self.convs = nn.ModuleList(convs)
|
self.convs = nn.ModuleList(convs)
|
||||||
self.bns = nn.ModuleList(bns)
|
self.bns = nn.ModuleList(bns)
|
||||||
self.relu = ReLU(inplace=True)
|
self.relu = ReLU(inplace=True)
|
||||||
|
|
||||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
|
||||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||||
self.shortcut = nn.Sequential()
|
self.shortcut = nn.Sequential()
|
||||||
if stride != 1 or in_planes != self.expansion * planes:
|
if stride != 1 or in_planes != self.expansion * planes:
|
||||||
self.shortcut = nn.Sequential(
|
self.shortcut = nn.Sequential(
|
||||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
|
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||||
stride=stride, bias=False),
|
nn.BatchNorm2d(self.expansion * planes),
|
||||||
nn.BatchNorm2d(self.expansion * planes))
|
)
|
||||||
self.stride = stride
|
self.stride = stride
|
||||||
self.width = width
|
self.width = width
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
@ -64,18 +62,18 @@ class BasicBlockERes2Net(nn.Module):
|
|||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
out = self.bn1(out)
|
out = self.bn1(out)
|
||||||
out = self.relu(out)
|
out = self.relu(out)
|
||||||
spx = torch.split(out,self.width,1)
|
spx = torch.split(out, self.width, 1)
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
if i==0:
|
if i == 0:
|
||||||
sp = spx[i]
|
sp = spx[i]
|
||||||
else:
|
else:
|
||||||
sp = sp + spx[i]
|
sp = sp + spx[i]
|
||||||
sp = self.convs[i](sp)
|
sp = self.convs[i](sp)
|
||||||
sp = self.relu(self.bns[i](sp))
|
sp = self.relu(self.bns[i](sp))
|
||||||
if i==0:
|
if i == 0:
|
||||||
out = sp
|
out = sp
|
||||||
else:
|
else:
|
||||||
out = torch.cat((out,sp),1)
|
out = torch.cat((out, sp), 1)
|
||||||
|
|
||||||
out = self.conv3(out)
|
out = self.conv3(out)
|
||||||
out = self.bn3(out)
|
out = self.bn3(out)
|
||||||
@ -86,22 +84,23 @@ class BasicBlockERes2Net(nn.Module):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
class BasicBlockERes2Net_diff_AFF(nn.Module):
|
class BasicBlockERes2Net_diff_AFF(nn.Module):
|
||||||
expansion = 2
|
expansion = 2
|
||||||
|
|
||||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||||
super(BasicBlockERes2Net_diff_AFF, self).__init__()
|
super(BasicBlockERes2Net_diff_AFF, self).__init__()
|
||||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
|
||||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||||
self.nums = scale
|
self.nums = scale
|
||||||
|
|
||||||
convs=[]
|
convs = []
|
||||||
fuse_models=[]
|
fuse_models = []
|
||||||
bns=[]
|
bns = []
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||||
bns.append(nn.BatchNorm2d(width))
|
bns.append(nn.BatchNorm2d(width))
|
||||||
for j in range(self.nums - 1):
|
for j in range(self.nums - 1):
|
||||||
fuse_models.append(AFF(channels=width))
|
fuse_models.append(AFF(channels=width))
|
||||||
|
|
||||||
@ -109,15 +108,15 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
|
|||||||
self.bns = nn.ModuleList(bns)
|
self.bns = nn.ModuleList(bns)
|
||||||
self.fuse_models = nn.ModuleList(fuse_models)
|
self.fuse_models = nn.ModuleList(fuse_models)
|
||||||
self.relu = ReLU(inplace=True)
|
self.relu = ReLU(inplace=True)
|
||||||
|
|
||||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
|
||||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||||
self.shortcut = nn.Sequential()
|
self.shortcut = nn.Sequential()
|
||||||
if stride != 1 or in_planes != self.expansion * planes:
|
if stride != 1 or in_planes != self.expansion * planes:
|
||||||
self.shortcut = nn.Sequential(
|
self.shortcut = nn.Sequential(
|
||||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
|
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||||
stride=stride, bias=False),
|
nn.BatchNorm2d(self.expansion * planes),
|
||||||
nn.BatchNorm2d(self.expansion * planes))
|
)
|
||||||
self.stride = stride
|
self.stride = stride
|
||||||
self.width = width
|
self.width = width
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
@ -128,19 +127,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
|
|||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
out = self.bn1(out)
|
out = self.bn1(out)
|
||||||
out = self.relu(out)
|
out = self.relu(out)
|
||||||
spx = torch.split(out,self.width,1)
|
spx = torch.split(out, self.width, 1)
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
if i==0:
|
if i == 0:
|
||||||
sp = spx[i]
|
sp = spx[i]
|
||||||
else:
|
else:
|
||||||
sp = self.fuse_models[i-1](sp, spx[i])
|
sp = self.fuse_models[i - 1](sp, spx[i])
|
||||||
|
|
||||||
sp = self.convs[i](sp)
|
sp = self.convs[i](sp)
|
||||||
sp = self.relu(self.bns[i](sp))
|
sp = self.relu(self.bns[i](sp))
|
||||||
if i==0:
|
if i == 0:
|
||||||
out = sp
|
out = sp
|
||||||
else:
|
else:
|
||||||
out = torch.cat((out,sp),1)
|
out = torch.cat((out, sp), 1)
|
||||||
|
|
||||||
out = self.conv3(out)
|
out = self.conv3(out)
|
||||||
out = self.bn3(out)
|
out = self.bn3(out)
|
||||||
@ -151,16 +150,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
class ERes2Net(nn.Module):
|
class ERes2Net(nn.Module):
|
||||||
def __init__(self,
|
def __init__(
|
||||||
block=BasicBlockERes2Net,
|
self,
|
||||||
block_fuse=BasicBlockERes2Net_diff_AFF,
|
block=BasicBlockERes2Net,
|
||||||
num_blocks=[3, 4, 6, 3],
|
block_fuse=BasicBlockERes2Net_diff_AFF,
|
||||||
m_channels=32,
|
num_blocks=[3, 4, 6, 3],
|
||||||
feat_dim=80,
|
m_channels=32,
|
||||||
embedding_size=192,
|
feat_dim=80,
|
||||||
pooling_func='TSTP',
|
embedding_size=192,
|
||||||
two_emb_layer=False):
|
pooling_func="TSTP",
|
||||||
|
two_emb_layer=False,
|
||||||
|
):
|
||||||
super(ERes2Net, self).__init__()
|
super(ERes2Net, self).__init__()
|
||||||
self.in_planes = m_channels
|
self.in_planes = m_channels
|
||||||
self.feat_dim = feat_dim
|
self.feat_dim = feat_dim
|
||||||
@ -176,20 +178,24 @@ class ERes2Net(nn.Module):
|
|||||||
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||||
|
|
||||||
# Downsampling module for each layer
|
# Downsampling module for each layer
|
||||||
self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False)
|
self.layer1_downsample = nn.Conv2d(
|
||||||
self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
|
m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False
|
||||||
self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
|
)
|
||||||
|
self.layer2_downsample = nn.Conv2d(
|
||||||
|
m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
|
||||||
|
)
|
||||||
|
self.layer3_downsample = nn.Conv2d(
|
||||||
|
m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
|
||||||
|
)
|
||||||
|
|
||||||
# Bottom-up fusion module
|
# Bottom-up fusion module
|
||||||
self.fuse_mode12 = AFF(channels=m_channels * 4)
|
self.fuse_mode12 = AFF(channels=m_channels * 4)
|
||||||
self.fuse_mode123 = AFF(channels=m_channels * 8)
|
self.fuse_mode123 = AFF(channels=m_channels * 8)
|
||||||
self.fuse_mode1234 = AFF(channels=m_channels * 16)
|
self.fuse_mode1234 = AFF(channels=m_channels * 16)
|
||||||
|
|
||||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
|
self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
|
||||||
self.pool = getattr(pooling_layers, pooling_func)(
|
self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
|
||||||
in_dim=self.stats_dim * block.expansion)
|
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
|
||||||
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
|
|
||||||
embedding_size)
|
|
||||||
if self.two_emb_layer:
|
if self.two_emb_layer:
|
||||||
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
||||||
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
||||||
@ -212,7 +218,7 @@ class ERes2Net(nn.Module):
|
|||||||
out1 = self.layer1(out)
|
out1 = self.layer1(out)
|
||||||
out2 = self.layer2(out1)
|
out2 = self.layer2(out1)
|
||||||
out1_downsample = self.layer1_downsample(out1)
|
out1_downsample = self.layer1_downsample(out1)
|
||||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||||
out3 = self.layer3(out2)
|
out3 = self.layer3(out2)
|
||||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||||
@ -243,18 +249,16 @@ class ERes2Net(nn.Module):
|
|||||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||||
out4 = self.layer4(out3)
|
out4 = self.layer4(out3)
|
||||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1)
|
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
|
||||||
return fuse_out1234
|
return fuse_out1234
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Smoke test: push an all-zero batch through a default ERes2Net and
    # report the output shape and the parameter count.
    dummy_input = torch.zeros(10, 300, 80)
    net = ERes2Net(feat_dim=80, embedding_size=192, pooling_func="TSTP")
    net.eval()
    result = net(dummy_input)
    print(result.shape)  # original author's note: torch.Size([10, 192])

    param_total = sum(p.numel() for p in net.parameters())
    print("{} M".format(param_total / 1e6))  # original author's note: 6.61M
|
||||||
|
|
||||||
|
@ -1,14 +1,12 @@
|
|||||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||||
|
|
||||||
"""
|
|
||||||
To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
|
|
||||||
within each stage. However, this modification also increases the number of model parameters and computational complexity.
|
|
||||||
To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
|
|
||||||
both the model parameters and its computational cost.
|
|
||||||
"""
|
"""
|
||||||
|
To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
|
||||||
|
within each stage. However, this modification also increases the number of model parameters and computational complexity.
|
||||||
|
To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
|
||||||
|
both the model parameters and its computational cost.
|
||||||
|
"""
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import math
|
import math
|
||||||
@ -17,47 +15,42 @@ import torch.nn.functional as F
|
|||||||
import pooling_layers as pooling_layers
|
import pooling_layers as pooling_layers
|
||||||
from fusion import AFF
|
from fusion import AFF
|
||||||
|
|
||||||
class ReLU(nn.Hardtanh):
    """Clipped ReLU: ``nn.Hardtanh`` with fixed bounds ``min_val=0``, ``max_val=20``.

    Despite the name this is not ``torch.nn.ReLU``: values above 20 are clamped
    to 20, and values below 0 are clamped to 0.

    Args:
        inplace: Passed through to ``nn.Hardtanh`` to select in-place operation.
    """

    def __init__(self, inplace=False):
        # Hardtanh.__init__ takes (min_val, max_val, inplace) positionally.
        super().__init__(0, 20, inplace)

    def __repr__(self):
        """Render as ``"<ClassName> (inplace)"`` or ``"<ClassName> ()"``."""
        inplace_str = "inplace" if self.inplace else ""
        return f"{self.__class__.__name__} ({inplace_str})"
|
|
||||||
|
|
||||||
|
|
||||||
class BasicBlockERes2NetV2(nn.Module):
|
class BasicBlockERes2NetV2(nn.Module):
|
||||||
|
|
||||||
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
|
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
|
||||||
super(BasicBlockERes2NetV2, self).__init__()
|
super(BasicBlockERes2NetV2, self).__init__()
|
||||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
|
||||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||||
self.nums = scale
|
self.nums = scale
|
||||||
self.expansion = expansion
|
self.expansion = expansion
|
||||||
|
|
||||||
convs=[]
|
convs = []
|
||||||
bns=[]
|
bns = []
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||||
bns.append(nn.BatchNorm2d(width))
|
bns.append(nn.BatchNorm2d(width))
|
||||||
self.convs = nn.ModuleList(convs)
|
self.convs = nn.ModuleList(convs)
|
||||||
self.bns = nn.ModuleList(bns)
|
self.bns = nn.ModuleList(bns)
|
||||||
self.relu = ReLU(inplace=True)
|
self.relu = ReLU(inplace=True)
|
||||||
|
|
||||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
|
||||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||||
self.shortcut = nn.Sequential()
|
self.shortcut = nn.Sequential()
|
||||||
if stride != 1 or in_planes != self.expansion * planes:
|
if stride != 1 or in_planes != self.expansion * planes:
|
||||||
self.shortcut = nn.Sequential(
|
self.shortcut = nn.Sequential(
|
||||||
nn.Conv2d(in_planes,
|
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||||
self.expansion * planes,
|
nn.BatchNorm2d(self.expansion * planes),
|
||||||
kernel_size=1,
|
)
|
||||||
stride=stride,
|
|
||||||
bias=False),
|
|
||||||
nn.BatchNorm2d(self.expansion * planes))
|
|
||||||
self.stride = stride
|
self.stride = stride
|
||||||
self.width = width
|
self.width = width
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
@ -68,18 +61,18 @@ class BasicBlockERes2NetV2(nn.Module):
|
|||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
out = self.bn1(out)
|
out = self.bn1(out)
|
||||||
out = self.relu(out)
|
out = self.relu(out)
|
||||||
spx = torch.split(out,self.width,1)
|
spx = torch.split(out, self.width, 1)
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
if i==0:
|
if i == 0:
|
||||||
sp = spx[i]
|
sp = spx[i]
|
||||||
else:
|
else:
|
||||||
sp = sp + spx[i]
|
sp = sp + spx[i]
|
||||||
sp = self.convs[i](sp)
|
sp = self.convs[i](sp)
|
||||||
sp = self.relu(self.bns[i](sp))
|
sp = self.relu(self.bns[i](sp))
|
||||||
if i==0:
|
if i == 0:
|
||||||
out = sp
|
out = sp
|
||||||
else:
|
else:
|
||||||
out = torch.cat((out,sp),1)
|
out = torch.cat((out, sp), 1)
|
||||||
|
|
||||||
out = self.conv3(out)
|
out = self.conv3(out)
|
||||||
out = self.bn3(out)
|
out = self.bn3(out)
|
||||||
@ -90,22 +83,22 @@ class BasicBlockERes2NetV2(nn.Module):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
class BasicBlockERes2NetV2AFF(nn.Module):
|
|
||||||
|
|
||||||
|
class BasicBlockERes2NetV2AFF(nn.Module):
|
||||||
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
|
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
|
||||||
super(BasicBlockERes2NetV2AFF, self).__init__()
|
super(BasicBlockERes2NetV2AFF, self).__init__()
|
||||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
|
||||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||||
self.nums = scale
|
self.nums = scale
|
||||||
self.expansion = expansion
|
self.expansion = expansion
|
||||||
|
|
||||||
convs=[]
|
convs = []
|
||||||
fuse_models=[]
|
fuse_models = []
|
||||||
bns=[]
|
bns = []
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||||
bns.append(nn.BatchNorm2d(width))
|
bns.append(nn.BatchNorm2d(width))
|
||||||
for j in range(self.nums - 1):
|
for j in range(self.nums - 1):
|
||||||
fuse_models.append(AFF(channels=width, r=4))
|
fuse_models.append(AFF(channels=width, r=4))
|
||||||
|
|
||||||
@ -113,18 +106,15 @@ class BasicBlockERes2NetV2AFF(nn.Module):
|
|||||||
self.bns = nn.ModuleList(bns)
|
self.bns = nn.ModuleList(bns)
|
||||||
self.fuse_models = nn.ModuleList(fuse_models)
|
self.fuse_models = nn.ModuleList(fuse_models)
|
||||||
self.relu = ReLU(inplace=True)
|
self.relu = ReLU(inplace=True)
|
||||||
|
|
||||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
|
||||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||||
self.shortcut = nn.Sequential()
|
self.shortcut = nn.Sequential()
|
||||||
if stride != 1 or in_planes != self.expansion * planes:
|
if stride != 1 or in_planes != self.expansion * planes:
|
||||||
self.shortcut = nn.Sequential(
|
self.shortcut = nn.Sequential(
|
||||||
nn.Conv2d(in_planes,
|
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||||
self.expansion * planes,
|
nn.BatchNorm2d(self.expansion * planes),
|
||||||
kernel_size=1,
|
)
|
||||||
stride=stride,
|
|
||||||
bias=False),
|
|
||||||
nn.BatchNorm2d(self.expansion * planes))
|
|
||||||
self.stride = stride
|
self.stride = stride
|
||||||
self.width = width
|
self.width = width
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
@ -135,19 +125,19 @@ class BasicBlockERes2NetV2AFF(nn.Module):
|
|||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
out = self.bn1(out)
|
out = self.bn1(out)
|
||||||
out = self.relu(out)
|
out = self.relu(out)
|
||||||
spx = torch.split(out,self.width,1)
|
spx = torch.split(out, self.width, 1)
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
if i==0:
|
if i == 0:
|
||||||
sp = spx[i]
|
sp = spx[i]
|
||||||
else:
|
else:
|
||||||
sp = self.fuse_models[i-1](sp, spx[i])
|
sp = self.fuse_models[i - 1](sp, spx[i])
|
||||||
|
|
||||||
sp = self.convs[i](sp)
|
sp = self.convs[i](sp)
|
||||||
sp = self.relu(self.bns[i](sp))
|
sp = self.relu(self.bns[i](sp))
|
||||||
if i==0:
|
if i == 0:
|
||||||
out = sp
|
out = sp
|
||||||
else:
|
else:
|
||||||
out = torch.cat((out,sp),1)
|
out = torch.cat((out, sp), 1)
|
||||||
|
|
||||||
out = self.conv3(out)
|
out = self.conv3(out)
|
||||||
out = self.bn3(out)
|
out = self.bn3(out)
|
||||||
@ -158,19 +148,22 @@ class BasicBlockERes2NetV2AFF(nn.Module):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
class ERes2NetV2(nn.Module):
|
class ERes2NetV2(nn.Module):
|
||||||
def __init__(self,
|
def __init__(
|
||||||
block=BasicBlockERes2NetV2,
|
self,
|
||||||
block_fuse=BasicBlockERes2NetV2AFF,
|
block=BasicBlockERes2NetV2,
|
||||||
num_blocks=[3, 4, 6, 3],
|
block_fuse=BasicBlockERes2NetV2AFF,
|
||||||
m_channels=64,
|
num_blocks=[3, 4, 6, 3],
|
||||||
feat_dim=80,
|
m_channels=64,
|
||||||
embedding_size=192,
|
feat_dim=80,
|
||||||
baseWidth=26,
|
embedding_size=192,
|
||||||
scale=2,
|
baseWidth=26,
|
||||||
expansion=2,
|
scale=2,
|
||||||
pooling_func='TSTP',
|
expansion=2,
|
||||||
two_emb_layer=False):
|
pooling_func="TSTP",
|
||||||
|
two_emb_layer=False,
|
||||||
|
):
|
||||||
super(ERes2NetV2, self).__init__()
|
super(ERes2NetV2, self).__init__()
|
||||||
self.in_planes = m_channels
|
self.in_planes = m_channels
|
||||||
self.feat_dim = feat_dim
|
self.feat_dim = feat_dim
|
||||||
@ -181,42 +174,29 @@ class ERes2NetV2(nn.Module):
|
|||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.expansion = expansion
|
self.expansion = expansion
|
||||||
|
|
||||||
self.conv1 = nn.Conv2d(1,
|
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
|
||||||
m_channels,
|
|
||||||
kernel_size=3,
|
|
||||||
stride=1,
|
|
||||||
padding=1,
|
|
||||||
bias=False)
|
|
||||||
self.bn1 = nn.BatchNorm2d(m_channels)
|
self.bn1 = nn.BatchNorm2d(m_channels)
|
||||||
self.layer1 = self._make_layer(block,
|
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
|
||||||
m_channels,
|
self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
|
||||||
num_blocks[0],
|
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
||||||
stride=1)
|
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||||
self.layer2 = self._make_layer(block,
|
|
||||||
m_channels * 2,
|
|
||||||
num_blocks[1],
|
|
||||||
stride=2)
|
|
||||||
self.layer3 = self._make_layer(block_fuse,
|
|
||||||
m_channels * 4,
|
|
||||||
num_blocks[2],
|
|
||||||
stride=2)
|
|
||||||
self.layer4 = self._make_layer(block_fuse,
|
|
||||||
m_channels * 8,
|
|
||||||
num_blocks[3],
|
|
||||||
stride=2)
|
|
||||||
|
|
||||||
# Downsampling module
|
# Downsampling module
|
||||||
self.layer3_ds = nn.Conv2d(m_channels * 4 * self.expansion, m_channels * 8 * self.expansion, kernel_size=3, \
|
self.layer3_ds = nn.Conv2d(
|
||||||
padding=1, stride=2, bias=False)
|
m_channels * 4 * self.expansion,
|
||||||
|
m_channels * 8 * self.expansion,
|
||||||
|
kernel_size=3,
|
||||||
|
padding=1,
|
||||||
|
stride=2,
|
||||||
|
bias=False,
|
||||||
|
)
|
||||||
|
|
||||||
# Bottom-up fusion module
|
# Bottom-up fusion module
|
||||||
self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
|
self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
|
||||||
|
|
||||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
|
self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
|
||||||
self.pool = getattr(pooling_layers, pooling_func)(
|
self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * self.expansion)
|
||||||
in_dim=self.stats_dim * self.expansion)
|
self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embedding_size)
|
||||||
self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats,
|
|
||||||
embedding_size)
|
|
||||||
if self.two_emb_layer:
|
if self.two_emb_layer:
|
||||||
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
||||||
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
||||||
@ -228,7 +208,11 @@ class ERes2NetV2(nn.Module):
|
|||||||
strides = [stride] + [1] * (num_blocks - 1)
|
strides = [stride] + [1] * (num_blocks - 1)
|
||||||
layers = []
|
layers = []
|
||||||
for stride in strides:
|
for stride in strides:
|
||||||
layers.append(block(self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion))
|
layers.append(
|
||||||
|
block(
|
||||||
|
self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion
|
||||||
|
)
|
||||||
|
)
|
||||||
self.in_planes = planes * self.expansion
|
self.in_planes = planes * self.expansion
|
||||||
return nn.Sequential(*layers)
|
return nn.Sequential(*layers)
|
||||||
|
|
||||||
@ -264,7 +248,7 @@ class ERes2NetV2(nn.Module):
|
|||||||
out3_ds = self.layer3_ds(out3)
|
out3_ds = self.layer3_ds(out3)
|
||||||
fuse_out34 = self.fuse34(out4, out3_ds)
|
fuse_out34 = self.fuse34(out4, out3_ds)
|
||||||
# print(111111111,fuse_out34.shape)#111111111 torch.Size([16, 2048, 10, 72])
|
# print(111111111,fuse_out34.shape)#111111111 torch.Size([16, 2048, 10, 72])
|
||||||
return fuse_out34.flatten(start_dim=1,end_dim=2).mean(-1)
|
return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)
|
||||||
# stats = self.pool(fuse_out34)
|
# stats = self.pool(fuse_out34)
|
||||||
#
|
#
|
||||||
# embed_a = self.seg_1(stats)
|
# embed_a = self.seg_1(stats)
|
||||||
@ -276,17 +260,13 @@ class ERes2NetV2(nn.Module):
|
|||||||
# else:
|
# else:
|
||||||
# return embed_a
|
# return embed_a
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
x = torch.randn(1, 300, 80)
|
x = torch.randn(1, 300, 80)
|
||||||
model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
|
model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
|
||||||
model.eval()
|
model.eval()
|
||||||
y = model(x)
|
y = model(x)
|
||||||
print(y.size())
|
print(y.size())
|
||||||
macs, num_params = profile(model, inputs=(x, ))
|
macs, num_params = profile(model, inputs=(x,))
|
||||||
print("Params: {} M".format(num_params / 1e6)) # 17.86 M
|
print("Params: {} M".format(num_params / 1e6)) # 17.86 M
|
||||||
print("MACs: {} G".format(macs / 1e9)) # 12.69 G
|
print("MACs: {} G".format(macs / 1e9)) # 12.69 G
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,14 +1,13 @@
|
|||||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||||
|
|
||||||
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
"""Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||||
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
|
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
|
||||||
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
||||||
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
||||||
ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
|
ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
|
||||||
recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
|
recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
|
||||||
"""
|
"""
|
||||||
import pdb
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import math
|
import math
|
||||||
@ -17,15 +16,14 @@ import torch.nn.functional as F
|
|||||||
import pooling_layers as pooling_layers
|
import pooling_layers as pooling_layers
|
||||||
from fusion import AFF
|
from fusion import AFF
|
||||||
|
|
||||||
class ReLU(nn.Hardtanh):
|
|
||||||
|
|
||||||
|
class ReLU(nn.Hardtanh):
|
||||||
def __init__(self, inplace=False):
|
def __init__(self, inplace=False):
|
||||||
super(ReLU, self).__init__(0, 20, inplace)
|
super(ReLU, self).__init__(0, 20, inplace)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
inplace_str = 'inplace' if self.inplace else ''
|
inplace_str = "inplace" if self.inplace else ""
|
||||||
return self.__class__.__name__ + ' (' \
|
return self.__class__.__name__ + " (" + inplace_str + ")"
|
||||||
+ inplace_str + ')'
|
|
||||||
|
|
||||||
|
|
||||||
class BasicBlockERes2Net(nn.Module):
|
class BasicBlockERes2Net(nn.Module):
|
||||||
@ -33,27 +31,28 @@ class BasicBlockERes2Net(nn.Module):
|
|||||||
|
|
||||||
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
|
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
|
||||||
super(BasicBlockERes2Net, self).__init__()
|
super(BasicBlockERes2Net, self).__init__()
|
||||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
|
||||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||||
self.nums = scale
|
self.nums = scale
|
||||||
|
|
||||||
convs=[]
|
convs = []
|
||||||
bns=[]
|
bns = []
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||||
bns.append(nn.BatchNorm2d(width))
|
bns.append(nn.BatchNorm2d(width))
|
||||||
self.convs = nn.ModuleList(convs)
|
self.convs = nn.ModuleList(convs)
|
||||||
self.bns = nn.ModuleList(bns)
|
self.bns = nn.ModuleList(bns)
|
||||||
self.relu = ReLU(inplace=True)
|
self.relu = ReLU(inplace=True)
|
||||||
|
|
||||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
|
||||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||||
self.shortcut = nn.Sequential()
|
self.shortcut = nn.Sequential()
|
||||||
if stride != 1 or in_planes != self.expansion * planes:
|
if stride != 1 or in_planes != self.expansion * planes:
|
||||||
self.shortcut = nn.Sequential(
|
self.shortcut = nn.Sequential(
|
||||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||||
nn.BatchNorm2d(self.expansion * planes))
|
nn.BatchNorm2d(self.expansion * planes),
|
||||||
|
)
|
||||||
self.stride = stride
|
self.stride = stride
|
||||||
self.width = width
|
self.width = width
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
@ -64,18 +63,18 @@ class BasicBlockERes2Net(nn.Module):
|
|||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
out = self.bn1(out)
|
out = self.bn1(out)
|
||||||
out = self.relu(out)
|
out = self.relu(out)
|
||||||
spx = torch.split(out,self.width,1)
|
spx = torch.split(out, self.width, 1)
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
if i==0:
|
if i == 0:
|
||||||
sp = spx[i]
|
sp = spx[i]
|
||||||
else:
|
else:
|
||||||
sp = sp + spx[i]
|
sp = sp + spx[i]
|
||||||
sp = self.convs[i](sp)
|
sp = self.convs[i](sp)
|
||||||
sp = self.relu(self.bns[i](sp))
|
sp = self.relu(self.bns[i](sp))
|
||||||
if i==0:
|
if i == 0:
|
||||||
out = sp
|
out = sp
|
||||||
else:
|
else:
|
||||||
out = torch.cat((out,sp),1)
|
out = torch.cat((out, sp), 1)
|
||||||
|
|
||||||
out = self.conv3(out)
|
out = self.conv3(out)
|
||||||
out = self.bn3(out)
|
out = self.bn3(out)
|
||||||
@ -86,22 +85,23 @@ class BasicBlockERes2Net(nn.Module):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
class BasicBlockERes2Net_diff_AFF(nn.Module):
|
class BasicBlockERes2Net_diff_AFF(nn.Module):
|
||||||
expansion = 4
|
expansion = 4
|
||||||
|
|
||||||
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
|
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
|
||||||
super(BasicBlockERes2Net_diff_AFF, self).__init__()
|
super(BasicBlockERes2Net_diff_AFF, self).__init__()
|
||||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
|
||||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||||
self.nums = scale
|
self.nums = scale
|
||||||
|
|
||||||
convs=[]
|
convs = []
|
||||||
fuse_models=[]
|
fuse_models = []
|
||||||
bns=[]
|
bns = []
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||||
bns.append(nn.BatchNorm2d(width))
|
bns.append(nn.BatchNorm2d(width))
|
||||||
for j in range(self.nums - 1):
|
for j in range(self.nums - 1):
|
||||||
fuse_models.append(AFF(channels=width))
|
fuse_models.append(AFF(channels=width))
|
||||||
|
|
||||||
@ -109,14 +109,15 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
|
|||||||
self.bns = nn.ModuleList(bns)
|
self.bns = nn.ModuleList(bns)
|
||||||
self.fuse_models = nn.ModuleList(fuse_models)
|
self.fuse_models = nn.ModuleList(fuse_models)
|
||||||
self.relu = ReLU(inplace=True)
|
self.relu = ReLU(inplace=True)
|
||||||
|
|
||||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
|
||||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||||
self.shortcut = nn.Sequential()
|
self.shortcut = nn.Sequential()
|
||||||
if stride != 1 or in_planes != self.expansion * planes:
|
if stride != 1 or in_planes != self.expansion * planes:
|
||||||
self.shortcut = nn.Sequential(
|
self.shortcut = nn.Sequential(
|
||||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||||
nn.BatchNorm2d(self.expansion * planes))
|
nn.BatchNorm2d(self.expansion * planes),
|
||||||
|
)
|
||||||
self.stride = stride
|
self.stride = stride
|
||||||
self.width = width
|
self.width = width
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
@ -127,20 +128,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
|
|||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
out = self.bn1(out)
|
out = self.bn1(out)
|
||||||
out = self.relu(out)
|
out = self.relu(out)
|
||||||
spx = torch.split(out,self.width,1)
|
spx = torch.split(out, self.width, 1)
|
||||||
for i in range(self.nums):
|
for i in range(self.nums):
|
||||||
if i==0:
|
if i == 0:
|
||||||
sp = spx[i]
|
sp = spx[i]
|
||||||
else:
|
else:
|
||||||
sp = self.fuse_models[i-1](sp, spx[i])
|
sp = self.fuse_models[i - 1](sp, spx[i])
|
||||||
|
|
||||||
sp = self.convs[i](sp)
|
sp = self.convs[i](sp)
|
||||||
sp = self.relu(self.bns[i](sp))
|
sp = self.relu(self.bns[i](sp))
|
||||||
if i==0:
|
if i == 0:
|
||||||
out = sp
|
out = sp
|
||||||
else:
|
else:
|
||||||
out = torch.cat((out,sp),1)
|
out = torch.cat((out, sp), 1)
|
||||||
|
|
||||||
|
|
||||||
out = self.conv3(out)
|
out = self.conv3(out)
|
||||||
out = self.bn3(out)
|
out = self.bn3(out)
|
||||||
@ -151,16 +151,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
class ERes2Net(nn.Module):
|
class ERes2Net(nn.Module):
|
||||||
def __init__(self,
|
def __init__(
|
||||||
block=BasicBlockERes2Net,
|
self,
|
||||||
block_fuse=BasicBlockERes2Net_diff_AFF,
|
block=BasicBlockERes2Net,
|
||||||
num_blocks=[3, 4, 6, 3],
|
block_fuse=BasicBlockERes2Net_diff_AFF,
|
||||||
m_channels=64,
|
num_blocks=[3, 4, 6, 3],
|
||||||
feat_dim=80,
|
m_channels=64,
|
||||||
embedding_size=192,
|
feat_dim=80,
|
||||||
pooling_func='TSTP',
|
embedding_size=192,
|
||||||
two_emb_layer=False):
|
pooling_func="TSTP",
|
||||||
|
two_emb_layer=False,
|
||||||
|
):
|
||||||
super(ERes2Net, self).__init__()
|
super(ERes2Net, self).__init__()
|
||||||
self.in_planes = m_channels
|
self.in_planes = m_channels
|
||||||
self.feat_dim = feat_dim
|
self.feat_dim = feat_dim
|
||||||
@ -176,17 +179,22 @@ class ERes2Net(nn.Module):
|
|||||||
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
||||||
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||||
|
|
||||||
self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
|
self.layer1_downsample = nn.Conv2d(
|
||||||
self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
|
m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
|
||||||
self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False)
|
)
|
||||||
|
self.layer2_downsample = nn.Conv2d(
|
||||||
|
m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
|
||||||
|
)
|
||||||
|
self.layer3_downsample = nn.Conv2d(
|
||||||
|
m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False
|
||||||
|
)
|
||||||
|
|
||||||
self.fuse_mode12 = AFF(channels=m_channels * 8)
|
self.fuse_mode12 = AFF(channels=m_channels * 8)
|
||||||
self.fuse_mode123 = AFF(channels=m_channels * 16)
|
self.fuse_mode123 = AFF(channels=m_channels * 16)
|
||||||
self.fuse_mode1234 = AFF(channels=m_channels * 32)
|
self.fuse_mode1234 = AFF(channels=m_channels * 32)
|
||||||
|
|
||||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
|
self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
|
||||||
self.pool = getattr(pooling_layers, pooling_func)(
|
self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
|
||||||
in_dim=self.stats_dim * block.expansion)
|
|
||||||
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
|
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
|
||||||
if self.two_emb_layer:
|
if self.two_emb_layer:
|
||||||
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
||||||
@ -229,7 +237,7 @@ class ERes2Net(nn.Module):
|
|||||||
else:
|
else:
|
||||||
return embed_a
|
return embed_a
|
||||||
|
|
||||||
def forward2(self, x,if_mean):
|
def forward2(self, x, if_mean):
|
||||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||||
|
|
||||||
x = x.unsqueeze_(1)
|
x = x.unsqueeze_(1)
|
||||||
@ -243,14 +251,13 @@ class ERes2Net(nn.Module):
|
|||||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||||
out4 = self.layer4(out3)
|
out4 = self.layer4(out3)
|
||||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2)#bs,20480,T
|
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2) # bs,20480,T
|
||||||
if(if_mean==False):
|
if if_mean == False:
|
||||||
mean=fuse_out1234[0].transpose(1,0)#(T,20480),bs=T
|
mean = fuse_out1234[0].transpose(1, 0) # (T,20480),bs=T
|
||||||
else:
|
else:
|
||||||
mean = fuse_out1234.mean(2)#bs,20480
|
mean = fuse_out1234.mean(2) # bs,20480
|
||||||
mean_std=torch.cat([mean,torch.zeros_like(mean)],1)
|
mean_std = torch.cat([mean, torch.zeros_like(mean)], 1)
|
||||||
return self.seg_1(mean_std)#(T,192)
|
return self.seg_1(mean_std) # (T,192)
|
||||||
|
|
||||||
|
|
||||||
# stats = self.pool(fuse_out1234)
|
# stats = self.pool(fuse_out1234)
|
||||||
# if self.two_emb_layer:
|
# if self.two_emb_layer:
|
||||||
@ -275,12 +282,8 @@ class ERes2Net(nn.Module):
|
|||||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||||
out4 = self.layer4(out3)
|
out4 = self.layer4(out3)
|
||||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1)
|
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
|
||||||
return fuse_out1234
|
return fuse_out1234
|
||||||
# print(fuse_out1234.shape)
|
# print(fuse_out1234.shape)
|
||||||
# print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape)
|
# print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape)
|
||||||
# pdb.set_trace()
|
# pdb.set_trace()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -6,7 +6,6 @@ import torch.nn as nn
|
|||||||
|
|
||||||
|
|
||||||
class AFF(nn.Module):
|
class AFF(nn.Module):
|
||||||
|
|
||||||
def __init__(self, channels=64, r=4):
|
def __init__(self, channels=64, r=4):
|
||||||
super(AFF, self).__init__()
|
super(AFF, self).__init__()
|
||||||
inter_channels = int(channels // r)
|
inter_channels = int(channels // r)
|
||||||
@ -23,7 +22,6 @@ class AFF(nn.Module):
|
|||||||
xa = torch.cat((x, ds_y), dim=1)
|
xa = torch.cat((x, ds_y), dim=1)
|
||||||
x_att = self.local_att(xa)
|
x_att = self.local_att(xa)
|
||||||
x_att = 1.0 + torch.tanh(x_att)
|
x_att = 1.0 + torch.tanh(x_att)
|
||||||
xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0-x_att)
|
xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
|
||||||
|
|
||||||
return xo
|
return xo
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ def _get_waveform_and_window_properties(
|
|||||||
)
|
)
|
||||||
assert 0 < window_shift, "`window_shift` must be greater than 0"
|
assert 0 < window_shift, "`window_shift` must be greater than 0"
|
||||||
assert padded_window_size % 2 == 0, (
|
assert padded_window_size % 2 == 0, (
|
||||||
"the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`"
|
"the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`"
|
||||||
)
|
)
|
||||||
assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
|
assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
|
||||||
assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
|
assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
|
||||||
@ -441,7 +441,9 @@ def get_mel_banks(
|
|||||||
high_freq: float,
|
high_freq: float,
|
||||||
vtln_low: float,
|
vtln_low: float,
|
||||||
vtln_high: float,
|
vtln_high: float,
|
||||||
vtln_warp_factor: float,device=None,dtype=None
|
vtln_warp_factor: float,
|
||||||
|
device=None,
|
||||||
|
dtype=None,
|
||||||
) -> Tuple[Tensor, Tensor]:
|
) -> Tuple[Tensor, Tensor]:
|
||||||
"""
|
"""
|
||||||
Returns:
|
Returns:
|
||||||
@ -457,9 +459,9 @@ def get_mel_banks(
|
|||||||
if high_freq <= 0.0:
|
if high_freq <= 0.0:
|
||||||
high_freq += nyquist
|
high_freq += nyquist
|
||||||
|
|
||||||
assert (
|
assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), (
|
||||||
(0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq)
|
"Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
|
||||||
), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
|
)
|
||||||
|
|
||||||
# fft-bin width [think of it as Nyquist-freq / half-window-length]
|
# fft-bin width [think of it as Nyquist-freq / half-window-length]
|
||||||
fft_bin_width = sample_freq / window_length_padded
|
fft_bin_width = sample_freq / window_length_padded
|
||||||
@ -475,7 +477,7 @@ def get_mel_banks(
|
|||||||
|
|
||||||
assert vtln_warp_factor == 1.0 or (
|
assert vtln_warp_factor == 1.0 or (
|
||||||
(low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
|
(low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
|
||||||
), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format(
|
), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format(
|
||||||
vtln_low, vtln_high, low_freq, high_freq
|
vtln_low, vtln_high, low_freq, high_freq
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -508,9 +510,12 @@ def get_mel_banks(
|
|||||||
bins[up_idx] = up_slope[up_idx]
|
bins[up_idx] = up_slope[up_idx]
|
||||||
bins[down_idx] = down_slope[down_idx]
|
bins[down_idx] = down_slope[down_idx]
|
||||||
|
|
||||||
return bins.to(device=device,dtype=dtype)#, center_freqs
|
return bins.to(device=device, dtype=dtype) # , center_freqs
|
||||||
|
|
||||||
|
|
||||||
|
cache = {}
|
||||||
|
|
||||||
|
|
||||||
cache={}
|
|
||||||
def fbank(
|
def fbank(
|
||||||
waveform: Tensor,
|
waveform: Tensor,
|
||||||
blackman_coeff: float = 0.42,
|
blackman_coeff: float = 0.42,
|
||||||
@ -620,14 +625,34 @@ def fbank(
|
|||||||
# size (num_mel_bins, padded_window_size // 2)
|
# size (num_mel_bins, padded_window_size // 2)
|
||||||
# print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
|
# print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
|
||||||
|
|
||||||
cache_key="%s-%s-%s-%s-%s-%s-%s-%s-%s-%s"%(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype)
|
cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
|
||||||
|
num_mel_bins,
|
||||||
|
padded_window_size,
|
||||||
|
sample_frequency,
|
||||||
|
low_freq,
|
||||||
|
high_freq,
|
||||||
|
vtln_low,
|
||||||
|
vtln_high,
|
||||||
|
vtln_warp,
|
||||||
|
device,
|
||||||
|
dtype,
|
||||||
|
)
|
||||||
if cache_key not in cache:
|
if cache_key not in cache:
|
||||||
mel_energies = get_mel_banks(
|
mel_energies = get_mel_banks(
|
||||||
num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype
|
num_mel_bins,
|
||||||
|
padded_window_size,
|
||||||
|
sample_frequency,
|
||||||
|
low_freq,
|
||||||
|
high_freq,
|
||||||
|
vtln_low,
|
||||||
|
vtln_high,
|
||||||
|
vtln_warp,
|
||||||
|
device,
|
||||||
|
dtype,
|
||||||
)
|
)
|
||||||
cache[cache_key]=mel_energies
|
cache[cache_key] = mel_energies
|
||||||
else:
|
else:
|
||||||
mel_energies=cache[cache_key]
|
mel_energies = cache[cache_key]
|
||||||
|
|
||||||
# pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
|
# pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
|
||||||
mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)
|
mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||||
|
|
||||||
""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
|
"""This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@ -11,6 +11,7 @@ class TAP(nn.Module):
|
|||||||
"""
|
"""
|
||||||
Temporal average pooling, only first-order mean is considered
|
Temporal average pooling, only first-order mean is considered
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
super(TAP, self).__init__()
|
super(TAP, self).__init__()
|
||||||
|
|
||||||
@ -25,6 +26,7 @@ class TSDP(nn.Module):
|
|||||||
"""
|
"""
|
||||||
Temporal standard deviation pooling, only second-order std is considered
|
Temporal standard deviation pooling, only second-order std is considered
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
super(TSDP, self).__init__()
|
super(TSDP, self).__init__()
|
||||||
|
|
||||||
@ -41,6 +43,7 @@ class TSTP(nn.Module):
|
|||||||
x-vector
|
x-vector
|
||||||
Comment: simple concatenation can not make full use of both statistics
|
Comment: simple concatenation can not make full use of both statistics
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
super(TSTP, self).__init__()
|
super(TSTP, self).__init__()
|
||||||
|
|
||||||
@ -56,9 +59,10 @@ class TSTP(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class ASTP(nn.Module):
|
class ASTP(nn.Module):
|
||||||
""" Attentive statistics pooling: Channel- and context-dependent
|
"""Attentive statistics pooling: Channel- and context-dependent
|
||||||
statistics pooling, first used in ECAPA_TDNN.
|
statistics pooling, first used in ECAPA_TDNN.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
|
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
|
||||||
super(ASTP, self).__init__()
|
super(ASTP, self).__init__()
|
||||||
self.global_context_att = global_context_att
|
self.global_context_att = global_context_att
|
||||||
@ -66,15 +70,10 @@ class ASTP(nn.Module):
|
|||||||
# Use Conv1d with stride == 1 rather than Linear, then we don't
|
# Use Conv1d with stride == 1 rather than Linear, then we don't
|
||||||
# need to transpose inputs.
|
# need to transpose inputs.
|
||||||
if global_context_att:
|
if global_context_att:
|
||||||
self.linear1 = nn.Conv1d(
|
self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1) # equals W and b in the paper
|
||||||
in_dim * 3, bottleneck_dim,
|
|
||||||
kernel_size=1) # equals W and b in the paper
|
|
||||||
else:
|
else:
|
||||||
self.linear1 = nn.Conv1d(
|
self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper
|
||||||
in_dim, bottleneck_dim,
|
self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper
|
||||||
kernel_size=1) # equals W and b in the paper
|
|
||||||
self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
|
|
||||||
kernel_size=1) # equals V and k in the paper
|
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""
|
"""
|
||||||
@ -88,15 +87,13 @@ class ASTP(nn.Module):
|
|||||||
|
|
||||||
if self.global_context_att:
|
if self.global_context_att:
|
||||||
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
|
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
|
||||||
context_std = torch.sqrt(
|
context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
|
||||||
torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
|
|
||||||
x_in = torch.cat((x, context_mean, context_std), dim=1)
|
x_in = torch.cat((x, context_mean, context_std), dim=1)
|
||||||
else:
|
else:
|
||||||
x_in = x
|
x_in = x
|
||||||
|
|
||||||
# DON'T use ReLU here! ReLU may be hard to converge.
|
# DON'T use ReLU here! ReLU may be hard to converge.
|
||||||
alpha = torch.tanh(
|
alpha = torch.tanh(self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
|
||||||
self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
|
|
||||||
alpha = torch.softmax(self.linear2(alpha), dim=2)
|
alpha = torch.softmax(self.linear2(alpha), dim=2)
|
||||||
mean = torch.sum(alpha * x, dim=2)
|
mean = torch.sum(alpha * x, dim=2)
|
||||||
var = torch.sum(alpha * (x**2), dim=2) - mean**2
|
var = torch.sum(alpha * (x**2), dim=2) - mean**2
|
||||||
|
@ -37,10 +37,13 @@ default_config = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
sv_cn_model = None
|
sv_cn_model = None
|
||||||
|
|
||||||
|
|
||||||
def init_sv_cn(device, is_half):
|
def init_sv_cn(device, is_half):
|
||||||
global sv_cn_model
|
global sv_cn_model
|
||||||
sv_cn_model = SV(device, is_half)
|
sv_cn_model = SV(device, is_half)
|
||||||
|
|
||||||
|
|
||||||
def load_sovits_new(sovits_path):
|
def load_sovits_new(sovits_path):
|
||||||
f = open(sovits_path, "rb")
|
f = open(sovits_path, "rb")
|
||||||
meta = f.read(2)
|
meta = f.read(2)
|
||||||
@ -129,7 +132,9 @@ def sample(
|
|||||||
|
|
||||||
|
|
||||||
@torch.jit.script
|
@torch.jit.script
|
||||||
def spectrogram_torch(hann_window:Tensor, y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False):
|
def spectrogram_torch(
|
||||||
|
hann_window: Tensor, y: Tensor, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool = False
|
||||||
|
):
|
||||||
# hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype)
|
# hann_window = torch.hann_window(win_size, device=y.device, dtype=y.dtype)
|
||||||
y = torch.nn.functional.pad(
|
y = torch.nn.functional.pad(
|
||||||
y.unsqueeze(1),
|
y.unsqueeze(1),
|
||||||
@ -380,8 +385,9 @@ class VitsModel(nn.Module):
|
|||||||
self.vq_model = self.vq_model.half()
|
self.vq_model = self.vq_model.half()
|
||||||
self.vq_model = self.vq_model.to(device)
|
self.vq_model = self.vq_model.to(device)
|
||||||
self.vq_model.eval()
|
self.vq_model.eval()
|
||||||
self.hann_window = torch.hann_window(self.hps.data.win_length, device=device, dtype= torch.float16 if is_half else torch.float32)
|
self.hann_window = torch.hann_window(
|
||||||
|
self.hps.data.win_length, device=device, dtype=torch.float16 if is_half else torch.float32
|
||||||
|
)
|
||||||
|
|
||||||
def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0, sv_emb=None):
|
def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0, sv_emb=None):
|
||||||
refer = spectrogram_torch(
|
refer = spectrogram_torch(
|
||||||
@ -667,7 +673,9 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
|
|||||||
ref_seq = torch.LongTensor([ref_seq_id]).to(device)
|
ref_seq = torch.LongTensor([ref_seq_id]).to(device)
|
||||||
ref_bert = ref_bert_T.T.to(ref_seq.device)
|
ref_bert = ref_bert_T.T.to(ref_seq.device)
|
||||||
text_seq_id, text_bert_T, norm_text = get_phones_and_bert(
|
text_seq_id, text_bert_T, norm_text = get_phones_and_bert(
|
||||||
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", "auto", "v2"
|
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
|
||||||
|
"auto",
|
||||||
|
"v2",
|
||||||
)
|
)
|
||||||
text_seq = torch.LongTensor([text_seq_id]).to(device)
|
text_seq = torch.LongTensor([text_seq_id]).to(device)
|
||||||
text_bert = text_bert_T.T.to(text_seq.device)
|
text_bert = text_bert_T.T.to(text_seq.device)
|
||||||
@ -675,7 +683,7 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
|
|||||||
ssl_content = ssl(ref_audio).to(device)
|
ssl_content = ssl(ref_audio).to(device)
|
||||||
|
|
||||||
# vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
|
# vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
|
||||||
vits = VitsModel(vits_path,device=device,is_half=False)
|
vits = VitsModel(vits_path, device=device, is_half=False)
|
||||||
vits.eval()
|
vits.eval()
|
||||||
|
|
||||||
# gpt_path = "GPT_weights_v2/xw-e15.ckpt"
|
# gpt_path = "GPT_weights_v2/xw-e15.ckpt"
|
||||||
@ -726,7 +734,7 @@ def export_prov2(
|
|||||||
is_half=True,
|
is_half=True,
|
||||||
):
|
):
|
||||||
if sv_cn_model == None:
|
if sv_cn_model == None:
|
||||||
init_sv_cn(device,is_half)
|
init_sv_cn(device, is_half)
|
||||||
|
|
||||||
if not os.path.exists(output_path):
|
if not os.path.exists(output_path):
|
||||||
os.makedirs(output_path)
|
os.makedirs(output_path)
|
||||||
@ -747,9 +755,7 @@ def export_prov2(
|
|||||||
|
|
||||||
print(f"device: {device}")
|
print(f"device: {device}")
|
||||||
|
|
||||||
ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(
|
ref_seq_id, ref_bert_T, ref_norm_text = get_phones_and_bert(ref_text, "all_zh", "v2")
|
||||||
ref_text, "all_zh", "v2"
|
|
||||||
)
|
|
||||||
ref_seq = torch.LongTensor([ref_seq_id]).to(device)
|
ref_seq = torch.LongTensor([ref_seq_id]).to(device)
|
||||||
ref_bert = ref_bert_T.T
|
ref_bert = ref_bert_T.T
|
||||||
if is_half:
|
if is_half:
|
||||||
@ -757,7 +763,9 @@ def export_prov2(
|
|||||||
ref_bert = ref_bert.to(ref_seq.device)
|
ref_bert = ref_bert.to(ref_seq.device)
|
||||||
|
|
||||||
text_seq_id, text_bert_T, norm_text = get_phones_and_bert(
|
text_seq_id, text_bert_T, norm_text = get_phones_and_bert(
|
||||||
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.", "auto", "v2"
|
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
|
||||||
|
"auto",
|
||||||
|
"v2",
|
||||||
)
|
)
|
||||||
text_seq = torch.LongTensor([text_seq_id]).to(device)
|
text_seq = torch.LongTensor([text_seq_id]).to(device)
|
||||||
text_bert = text_bert_T.T
|
text_bert = text_bert_T.T
|
||||||
@ -773,7 +781,7 @@ def export_prov2(
|
|||||||
sv_model = ExportERes2NetV2(sv_cn_model)
|
sv_model = ExportERes2NetV2(sv_cn_model)
|
||||||
|
|
||||||
# vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
|
# vits_path = "SoVITS_weights_v2/xw_e8_s216.pth"
|
||||||
vits = VitsModel(vits_path, version,is_half=is_half,device=device)
|
vits = VitsModel(vits_path, version, is_half=is_half, device=device)
|
||||||
vits.eval()
|
vits.eval()
|
||||||
|
|
||||||
# gpt_path = "GPT_weights_v2/xw-e15.ckpt"
|
# gpt_path = "GPT_weights_v2/xw-e15.ckpt"
|
||||||
@ -871,7 +879,7 @@ class GPT_SoVITS(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class ExportERes2NetV2(nn.Module):
|
class ExportERes2NetV2(nn.Module):
|
||||||
def __init__(self, sv_cn_model:SV):
|
def __init__(self, sv_cn_model: SV):
|
||||||
super(ExportERes2NetV2, self).__init__()
|
super(ExportERes2NetV2, self).__init__()
|
||||||
self.bn1 = sv_cn_model.embedding_model.bn1
|
self.bn1 = sv_cn_model.embedding_model.bn1
|
||||||
self.conv1 = sv_cn_model.embedding_model.conv1
|
self.conv1 = sv_cn_model.embedding_model.conv1
|
||||||
@ -898,11 +906,11 @@ class ExportERes2NetV2(nn.Module):
|
|||||||
out4 = self.layer4(out3)
|
out4 = self.layer4(out3)
|
||||||
out3_ds = self.layer3_ds(out3)
|
out3_ds = self.layer3_ds(out3)
|
||||||
fuse_out34 = self.fuse34(out4, out3_ds)
|
fuse_out34 = self.fuse34(out4, out3_ds)
|
||||||
return fuse_out34.flatten(start_dim=1,end_dim=2).mean(-1)
|
return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)
|
||||||
|
|
||||||
|
|
||||||
class GPT_SoVITS_V2Pro(nn.Module):
|
class GPT_SoVITS_V2Pro(nn.Module):
|
||||||
def __init__(self, t2s: T2SModel, vits: VitsModel,sv_model:ExportERes2NetV2):
|
def __init__(self, t2s: T2SModel, vits: VitsModel, sv_model: ExportERes2NetV2):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.t2s = t2s
|
self.t2s = t2s
|
||||||
self.vits = vits
|
self.vits = vits
|
||||||
@ -930,6 +938,7 @@ class GPT_SoVITS_V2Pro(nn.Module):
|
|||||||
audio = self.vits(text_seq, pred_semantic, ref_audio_sr, speed, sv_emb)
|
audio = self.vits(text_seq, pred_semantic, ref_audio_sr, speed, sv_emb)
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
|
parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
|
||||||
parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
|
parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
|
||||||
@ -1046,24 +1055,14 @@ def export_symbel(version="v2"):
|
|||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
|
parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
|
||||||
parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
|
parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
|
||||||
parser.add_argument(
|
parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file")
|
||||||
"--sovits_model", required=True, help="Path to the SoVITS model file"
|
parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file")
|
||||||
)
|
parser.add_argument("--ref_text", required=True, help="Path to the reference text file")
|
||||||
parser.add_argument(
|
parser.add_argument("--output_path", required=True, help="Path to the output directory")
|
||||||
"--ref_audio", required=True, help="Path to the reference audio file"
|
parser.add_argument("--export_common_model", action="store_true", help="Export Bert and SSL model")
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--ref_text", required=True, help="Path to the reference text file"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--output_path", required=True, help="Path to the output directory"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--export_common_model", action="store_true", help="Export Bert and SSL model"
|
|
||||||
)
|
|
||||||
parser.add_argument("--device", help="Device to use")
|
parser.add_argument("--device", help="Device to use")
|
||||||
parser.add_argument("--version", help="version of the model", default="v2")
|
parser.add_argument("--version", help="version of the model", default="v2")
|
||||||
parser.add_argument("--no-half", action="store_true", help = "Do not use half precision for model weights")
|
parser.add_argument("--no-half", action="store_true", help="Do not use half precision for model weights")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.version in ["v2Pro", "v2ProPlus"]:
|
if args.version in ["v2Pro", "v2ProPlus"]:
|
||||||
|
@ -406,7 +406,7 @@ class GPTSoVITSV3(torch.nn.Module):
|
|||||||
chunk_len = 934 - fea_ref.shape[2]
|
chunk_len = 934 - fea_ref.shape[2]
|
||||||
wav_gen_list = []
|
wav_gen_list = []
|
||||||
idx = 0
|
idx = 0
|
||||||
fea_todo = fea_todo[:,:,:-5]
|
fea_todo = fea_todo[:, :, :-5]
|
||||||
wav_gen_length = fea_todo.shape[2] * 256
|
wav_gen_length = fea_todo.shape[2] * 256
|
||||||
while 1:
|
while 1:
|
||||||
# current_time = datetime.now()
|
# current_time = datetime.now()
|
||||||
@ -438,7 +438,8 @@ class GPTSoVITSV3(torch.nn.Module):
|
|||||||
|
|
||||||
wav_gen = torch.cat(wav_gen_list, 2)
|
wav_gen = torch.cat(wav_gen_list, 2)
|
||||||
return wav_gen[0][0][:wav_gen_length]
|
return wav_gen[0][0][:wav_gen_length]
|
||||||
|
|
||||||
|
|
||||||
class GPTSoVITSV4(torch.nn.Module):
|
class GPTSoVITSV4(torch.nn.Module):
|
||||||
def __init__(self, gpt_sovits_half, cfm, hifigan):
|
def __init__(self, gpt_sovits_half, cfm, hifigan):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -465,7 +466,7 @@ class GPTSoVITSV4(torch.nn.Module):
|
|||||||
chunk_len = 1000 - fea_ref.shape[2]
|
chunk_len = 1000 - fea_ref.shape[2]
|
||||||
wav_gen_list = []
|
wav_gen_list = []
|
||||||
idx = 0
|
idx = 0
|
||||||
fea_todo = fea_todo[:,:,:-10]
|
fea_todo = fea_todo[:, :, :-10]
|
||||||
wav_gen_length = fea_todo.shape[2] * 480
|
wav_gen_length = fea_todo.shape[2] * 480
|
||||||
while 1:
|
while 1:
|
||||||
# current_time = datetime.now()
|
# current_time = datetime.now()
|
||||||
@ -581,6 +582,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
|||||||
|
|
||||||
v3v4set = {"v3", "v4"}
|
v3v4set = {"v3", "v4"}
|
||||||
|
|
||||||
|
|
||||||
def get_sovits_weights(sovits_path):
|
def get_sovits_weights(sovits_path):
|
||||||
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||||
@ -703,14 +705,13 @@ def export_cfm(
|
|||||||
return export_cfm
|
return export_cfm
|
||||||
|
|
||||||
|
|
||||||
def export_1(ref_wav_path,ref_wav_text,version="v3"):
|
def export_1(ref_wav_path, ref_wav_text, version="v3"):
|
||||||
if version == "v3":
|
if version == "v3":
|
||||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
|
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
|
||||||
init_bigvgan()
|
init_bigvgan()
|
||||||
else:
|
else:
|
||||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
|
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
|
||||||
init_hifigan()
|
init_hifigan()
|
||||||
|
|
||||||
|
|
||||||
dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt")
|
dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt")
|
||||||
raw_t2s = get_raw_t2s_model(dict_s1).to(device)
|
raw_t2s = get_raw_t2s_model(dict_s1).to(device)
|
||||||
@ -755,9 +756,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"):
|
|||||||
# phones1, bert1, norm_text1 = get_phones_and_bert(
|
# phones1, bert1, norm_text1 = get_phones_and_bert(
|
||||||
# "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3"
|
# "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3"
|
||||||
# )
|
# )
|
||||||
phones1, bert1, norm_text1 = get_phones_and_bert(
|
phones1, bert1, norm_text1 = get_phones_and_bert(ref_wav_text, "auto", "v3")
|
||||||
ref_wav_text, "auto", "v3"
|
|
||||||
)
|
|
||||||
phones2, bert2, norm_text2 = get_phones_and_bert(
|
phones2, bert2, norm_text2 = get_phones_and_bert(
|
||||||
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
|
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
|
||||||
"auto",
|
"auto",
|
||||||
@ -918,7 +917,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"):
|
|||||||
hifigan_model_ = torch.jit.trace(hifigan_model, optimize=True, example_inputs=(cmf_res_rand,))
|
hifigan_model_ = torch.jit.trace(hifigan_model, optimize=True, example_inputs=(cmf_res_rand,))
|
||||||
hifigan_model_.save("onnx/ad/hifigan_model.pt")
|
hifigan_model_.save("onnx/ad/hifigan_model.pt")
|
||||||
wav_gen = hifigan_model(cmf_res)
|
wav_gen = hifigan_model(cmf_res)
|
||||||
|
|
||||||
print("wav_gen:", wav_gen.shape, wav_gen.dtype)
|
print("wav_gen:", wav_gen.shape, wav_gen.dtype)
|
||||||
audio = wav_gen[0][0].cpu().detach().numpy()
|
audio = wav_gen[0][0].cpu().detach().numpy()
|
||||||
|
|
||||||
@ -1205,7 +1204,6 @@ def export_2(version="v3"):
|
|||||||
gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4
|
gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4
|
||||||
sr = 24000 if version == "v3" else 48000
|
sr = 24000 if version == "v3" else 48000
|
||||||
|
|
||||||
|
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
# print("thread:", torch.get_num_threads())
|
# print("thread:", torch.get_num_threads())
|
||||||
# print("thread:", torch.get_num_interop_threads())
|
# print("thread:", torch.get_num_interop_threads())
|
||||||
@ -1216,14 +1214,14 @@ def export_2(version="v3"):
|
|||||||
"汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....",
|
"汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....",
|
||||||
gpt_sovits_v3v4,
|
gpt_sovits_v3v4,
|
||||||
"out.wav",
|
"out.wav",
|
||||||
sr
|
sr,
|
||||||
)
|
)
|
||||||
|
|
||||||
test_export(
|
test_export(
|
||||||
"你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!",
|
"你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!",
|
||||||
gpt_sovits_v3v4,
|
gpt_sovits_v3v4,
|
||||||
"out2.wav",
|
"out2.wav",
|
||||||
sr
|
sr,
|
||||||
)
|
)
|
||||||
|
|
||||||
# test_export(
|
# test_export(
|
||||||
|
@ -143,9 +143,9 @@ class DiT(nn.Module):
|
|||||||
drop_audio_cond=False, # cfg for cond audio
|
drop_audio_cond=False, # cfg for cond audio
|
||||||
drop_text=False, # cfg for text
|
drop_text=False, # cfg for text
|
||||||
# mask: bool["b n"] | None = None, # noqa: F722
|
# mask: bool["b n"] | None = None, # noqa: F722
|
||||||
infer=False, # bool
|
infer=False, # bool
|
||||||
text_cache=None, # torch tensor as text_embed
|
text_cache=None, # torch tensor as text_embed
|
||||||
dt_cache=None, # torch tensor as dt
|
dt_cache=None, # torch tensor as dt
|
||||||
):
|
):
|
||||||
x = x0.transpose(2, 1)
|
x = x0.transpose(2, 1)
|
||||||
cond = cond0.transpose(2, 1)
|
cond = cond0.transpose(2, 1)
|
||||||
@ -191,4 +191,4 @@ class DiT(nn.Module):
|
|||||||
if infer:
|
if infer:
|
||||||
return output, text_embed, dt
|
return output, text_embed, dt
|
||||||
else:
|
else:
|
||||||
return output
|
return output
|
||||||
|
@ -21,7 +21,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
3) computes spectrograms from audio files.
|
3) computes spectrograms from audio files.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, hparams, version=None,val=False):
|
def __init__(self, hparams, version=None, val=False):
|
||||||
exp_dir = hparams.exp_dir
|
exp_dir = hparams.exp_dir
|
||||||
self.path2 = "%s/2-name2text.txt" % exp_dir
|
self.path2 = "%s/2-name2text.txt" % exp_dir
|
||||||
self.path4 = "%s/4-cnhubert" % exp_dir
|
self.path4 = "%s/4-cnhubert" % exp_dir
|
||||||
@ -29,7 +29,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
assert os.path.exists(self.path2)
|
assert os.path.exists(self.path2)
|
||||||
assert os.path.exists(self.path4)
|
assert os.path.exists(self.path4)
|
||||||
assert os.path.exists(self.path5)
|
assert os.path.exists(self.path5)
|
||||||
self.is_v2Pro=version in {"v2Pro","v2ProPlus"}
|
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
self.path7 = "%s/7-sv_cn" % exp_dir
|
self.path7 = "%s/7-sv_cn" % exp_dir
|
||||||
assert os.path.exists(self.path7)
|
assert os.path.exists(self.path7)
|
||||||
@ -118,7 +118,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||||
ssl.requires_grad = False
|
ssl.requires_grad = False
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
sv_emb=torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
|
sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
spec = torch.zeros(1025, 100)
|
spec = torch.zeros(1025, 100)
|
||||||
@ -126,10 +126,10 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
ssl = torch.zeros(1, 768, 100)
|
ssl = torch.zeros(1, 768, 100)
|
||||||
text = text[-1:]
|
text = text[-1:]
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
sv_emb=torch.zeros(1,20480)
|
sv_emb = torch.zeros(1, 20480)
|
||||||
print("load audio or ssl error!!!!!!", audiopath)
|
print("load audio or ssl error!!!!!!", audiopath)
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
return (ssl, spec, wav, text,sv_emb)
|
return (ssl, spec, wav, text, sv_emb)
|
||||||
else:
|
else:
|
||||||
return (ssl, spec, wav, text)
|
return (ssl, spec, wav, text)
|
||||||
|
|
||||||
@ -192,9 +192,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||||||
class TextAudioSpeakerCollate:
|
class TextAudioSpeakerCollate:
|
||||||
"""Zero-pads model inputs and targets"""
|
"""Zero-pads model inputs and targets"""
|
||||||
|
|
||||||
def __init__(self, return_ids=False,version=None):
|
def __init__(self, return_ids=False, version=None):
|
||||||
self.return_ids = return_ids
|
self.return_ids = return_ids
|
||||||
self.is_v2Pro=version in {"v2Pro","v2ProPlus"}
|
self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
|
||||||
|
|
||||||
def __call__(self, batch):
|
def __call__(self, batch):
|
||||||
"""Collate's training batch from normalized text, audio and speaker identities
|
"""Collate's training batch from normalized text, audio and speaker identities
|
||||||
@ -228,7 +228,7 @@ class TextAudioSpeakerCollate:
|
|||||||
text_padded.zero_()
|
text_padded.zero_()
|
||||||
|
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
sv_embs=torch.FloatTensor(len(batch),20480)
|
sv_embs = torch.FloatTensor(len(batch), 20480)
|
||||||
|
|
||||||
for i in range(len(ids_sorted_decreasing)):
|
for i in range(len(ids_sorted_decreasing)):
|
||||||
row = batch[ids_sorted_decreasing[i]]
|
row = batch[ids_sorted_decreasing[i]]
|
||||||
@ -250,11 +250,30 @@ class TextAudioSpeakerCollate:
|
|||||||
text_lengths[i] = text.size(0)
|
text_lengths[i] = text.size(0)
|
||||||
|
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
sv_embs[i]=row[4]
|
sv_embs[i] = row[4]
|
||||||
if self.is_v2Pro:
|
if self.is_v2Pro:
|
||||||
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths,sv_embs
|
return (
|
||||||
|
ssl_padded,
|
||||||
|
ssl_lengths,
|
||||||
|
spec_padded,
|
||||||
|
spec_lengths,
|
||||||
|
wav_padded,
|
||||||
|
wav_lengths,
|
||||||
|
text_padded,
|
||||||
|
text_lengths,
|
||||||
|
sv_embs,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
|
return (
|
||||||
|
ssl_padded,
|
||||||
|
ssl_lengths,
|
||||||
|
spec_padded,
|
||||||
|
spec_lengths,
|
||||||
|
wav_padded,
|
||||||
|
wav_lengths,
|
||||||
|
text_padded,
|
||||||
|
text_lengths,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||||
|
@ -586,12 +586,17 @@ class DiscriminatorS(torch.nn.Module):
|
|||||||
|
|
||||||
return x, fmap
|
return x, fmap
|
||||||
|
|
||||||
v2pro_set={"v2Pro","v2ProPlus"}
|
|
||||||
|
v2pro_set = {"v2Pro", "v2ProPlus"}
|
||||||
|
|
||||||
|
|
||||||
class MultiPeriodDiscriminator(torch.nn.Module):
|
class MultiPeriodDiscriminator(torch.nn.Module):
|
||||||
def __init__(self, use_spectral_norm=False,version=None):
|
def __init__(self, use_spectral_norm=False, version=None):
|
||||||
super(MultiPeriodDiscriminator, self).__init__()
|
super(MultiPeriodDiscriminator, self).__init__()
|
||||||
if version in v2pro_set:periods = [2, 3, 5, 7, 11,17,23]
|
if version in v2pro_set:
|
||||||
else:periods = [2, 3, 5, 7, 11]
|
periods = [2, 3, 5, 7, 11, 17, 23]
|
||||||
|
else:
|
||||||
|
periods = [2, 3, 5, 7, 11]
|
||||||
|
|
||||||
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
||||||
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
|
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
|
||||||
@ -787,6 +792,7 @@ class CodePredictor(nn.Module):
|
|||||||
|
|
||||||
return pred_codes.transpose(0, 1)
|
return pred_codes.transpose(0, 1)
|
||||||
|
|
||||||
|
|
||||||
class SynthesizerTrn(nn.Module):
|
class SynthesizerTrn(nn.Module):
|
||||||
"""
|
"""
|
||||||
Synthesizer for Training
|
Synthesizer for Training
|
||||||
@ -886,13 +892,13 @@ class SynthesizerTrn(nn.Module):
|
|||||||
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
|
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
|
||||||
self.freeze_quantizer = freeze_quantizer
|
self.freeze_quantizer = freeze_quantizer
|
||||||
|
|
||||||
self.is_v2pro=self.version in v2pro_set
|
self.is_v2pro = self.version in v2pro_set
|
||||||
if self.is_v2pro:
|
if self.is_v2pro:
|
||||||
self.sv_emb = nn.Linear(20480, gin_channels)
|
self.sv_emb = nn.Linear(20480, gin_channels)
|
||||||
self.ge_to512 = nn.Linear(gin_channels, 512)
|
self.ge_to512 = nn.Linear(gin_channels, 512)
|
||||||
self.prelu = nn.PReLU(num_parameters=gin_channels)
|
self.prelu = nn.PReLU(num_parameters=gin_channels)
|
||||||
|
|
||||||
def forward(self, ssl, y, y_lengths, text, text_lengths,sv_emb=None):
|
def forward(self, ssl, y, y_lengths, text, text_lengths, sv_emb=None):
|
||||||
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
|
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
|
||||||
if self.version == "v1":
|
if self.version == "v1":
|
||||||
ge = self.ref_enc(y * y_mask, y_mask)
|
ge = self.ref_enc(y * y_mask, y_mask)
|
||||||
@ -952,7 +958,7 @@ class SynthesizerTrn(nn.Module):
|
|||||||
return o, y_mask, (z, z_p, m_p, logs_p)
|
return o, y_mask, (z, z_p, m_p, logs_p)
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def decode(self, codes, text, refer,noise_scale=0.5, speed=1, sv_emb=None):
|
def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None):
|
||||||
def get_ge(refer, sv_emb):
|
def get_ge(refer, sv_emb):
|
||||||
ge = None
|
ge = None
|
||||||
if refer is not None:
|
if refer is not None:
|
||||||
@ -970,8 +976,8 @@ class SynthesizerTrn(nn.Module):
|
|||||||
|
|
||||||
if type(refer) == list:
|
if type(refer) == list:
|
||||||
ges = []
|
ges = []
|
||||||
for idx,_refer in enumerate(refer):
|
for idx, _refer in enumerate(refer):
|
||||||
ge = get_ge(_refer, sv_emb[idx]if self.is_v2pro else None)
|
ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None)
|
||||||
ges.append(ge)
|
ges.append(ge)
|
||||||
ge = torch.stack(ges, 0).mean(0)
|
ge = torch.stack(ges, 0).mean(0)
|
||||||
else:
|
else:
|
||||||
@ -983,7 +989,14 @@ class SynthesizerTrn(nn.Module):
|
|||||||
quantized = self.quantizer.decode(codes)
|
quantized = self.quantizer.decode(codes)
|
||||||
if self.semantic_frame_rate == "25hz":
|
if self.semantic_frame_rate == "25hz":
|
||||||
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
|
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
|
||||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, self.ge_to512(ge.transpose(2,1)).transpose(2,1)if self.is_v2pro else ge, speed)
|
x, m_p, logs_p, y_mask = self.enc_p(
|
||||||
|
quantized,
|
||||||
|
y_lengths,
|
||||||
|
text,
|
||||||
|
text_lengths,
|
||||||
|
self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
|
||||||
|
speed,
|
||||||
|
)
|
||||||
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
||||||
|
|
||||||
z = self.flow(z_p, y_mask, g=ge, reverse=True)
|
z = self.flow(z_p, y_mask, g=ge, reverse=True)
|
||||||
@ -996,6 +1009,7 @@ class SynthesizerTrn(nn.Module):
|
|||||||
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
|
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
|
||||||
return codes.transpose(0, 1)
|
return codes.transpose(0, 1)
|
||||||
|
|
||||||
|
|
||||||
class CFM(torch.nn.Module):
|
class CFM(torch.nn.Module):
|
||||||
def __init__(self, in_channels, dit):
|
def __init__(self, in_channels, dit):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -1029,7 +1043,18 @@ class CFM(torch.nn.Module):
|
|||||||
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
|
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
|
||||||
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
|
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
|
||||||
v_pred, text_emb, dt = self.estimator(
|
v_pred, text_emb, dt = self.estimator(
|
||||||
x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False, infer=True, text_cache=text_cache, dt_cache=dt_cache
|
x,
|
||||||
|
prompt_x,
|
||||||
|
x_lens,
|
||||||
|
t_tensor,
|
||||||
|
d_tensor,
|
||||||
|
mu,
|
||||||
|
use_grad_ckpt=False,
|
||||||
|
drop_audio_cond=False,
|
||||||
|
drop_text=False,
|
||||||
|
infer=True,
|
||||||
|
text_cache=text_cache,
|
||||||
|
dt_cache=dt_cache,
|
||||||
)
|
)
|
||||||
v_pred = v_pred.transpose(2, 1)
|
v_pred = v_pred.transpose(2, 1)
|
||||||
if self.use_conditioner_cache:
|
if self.use_conditioner_cache:
|
||||||
@ -1037,18 +1062,18 @@ class CFM(torch.nn.Module):
|
|||||||
dt_cache = dt
|
dt_cache = dt
|
||||||
if inference_cfg_rate > 1e-5:
|
if inference_cfg_rate > 1e-5:
|
||||||
neg, text_cfg_emb, _ = self.estimator(
|
neg, text_cfg_emb, _ = self.estimator(
|
||||||
x,
|
x,
|
||||||
prompt_x,
|
prompt_x,
|
||||||
x_lens,
|
x_lens,
|
||||||
t_tensor,
|
t_tensor,
|
||||||
d_tensor,
|
d_tensor,
|
||||||
mu,
|
mu,
|
||||||
use_grad_ckpt=False,
|
use_grad_ckpt=False,
|
||||||
drop_audio_cond=True,
|
drop_audio_cond=True,
|
||||||
drop_text=True,
|
drop_text=True,
|
||||||
infer=True,
|
infer=True,
|
||||||
text_cache=text_cfg_cache,
|
text_cache=text_cfg_cache,
|
||||||
dt_cache=dt_cache
|
dt_cache=dt_cache,
|
||||||
)
|
)
|
||||||
neg = neg.transpose(2, 1)
|
neg = neg.transpose(2, 1)
|
||||||
if self.use_conditioner_cache:
|
if self.use_conditioner_cache:
|
||||||
|
@ -762,7 +762,9 @@ class CodePredictor(nn.Module):
|
|||||||
|
|
||||||
return pred_codes.transpose(0, 1)
|
return pred_codes.transpose(0, 1)
|
||||||
|
|
||||||
v2pro_set={"v2Pro","v2ProPlus"}
|
|
||||||
|
v2pro_set = {"v2Pro", "v2ProPlus"}
|
||||||
|
|
||||||
|
|
||||||
class SynthesizerTrn(nn.Module):
|
class SynthesizerTrn(nn.Module):
|
||||||
"""
|
"""
|
||||||
@ -868,7 +870,7 @@ class SynthesizerTrn(nn.Module):
|
|||||||
# self.enc_p.text_embedding.requires_grad_(False)
|
# self.enc_p.text_embedding.requires_grad_(False)
|
||||||
# self.enc_p.encoder_text.requires_grad_(False)
|
# self.enc_p.encoder_text.requires_grad_(False)
|
||||||
# self.enc_p.mrte.requires_grad_(False)
|
# self.enc_p.mrte.requires_grad_(False)
|
||||||
self.is_v2pro=self.version in v2pro_set
|
self.is_v2pro = self.version in v2pro_set
|
||||||
if self.is_v2pro:
|
if self.is_v2pro:
|
||||||
self.sv_emb = nn.Linear(20480, gin_channels)
|
self.sv_emb = nn.Linear(20480, gin_channels)
|
||||||
self.ge_to512 = nn.Linear(gin_channels, 512)
|
self.ge_to512 = nn.Linear(gin_channels, 512)
|
||||||
@ -891,7 +893,7 @@ class SynthesizerTrn(nn.Module):
|
|||||||
quantized = dquantized.contiguous().view(1, self.ssl_dim, -1)
|
quantized = dquantized.contiguous().view(1, self.ssl_dim, -1)
|
||||||
|
|
||||||
if self.is_v2pro:
|
if self.is_v2pro:
|
||||||
ge_ = self.ge_to512(ge.transpose(2,1)).transpose(2,1)
|
ge_ = self.ge_to512(ge.transpose(2, 1)).transpose(2, 1)
|
||||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge_, speed)
|
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge_, speed)
|
||||||
else:
|
else:
|
||||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed)
|
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import math
|
import math
|
||||||
import pdb
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -720,10 +719,10 @@ class MelStyleEncoder(nn.Module):
|
|||||||
else:
|
else:
|
||||||
len_ = (~mask).sum(dim=1).unsqueeze(1)
|
len_ = (~mask).sum(dim=1).unsqueeze(1)
|
||||||
x = x.masked_fill(mask.unsqueeze(-1), 0)
|
x = x.masked_fill(mask.unsqueeze(-1), 0)
|
||||||
dtype=x.dtype
|
dtype = x.dtype
|
||||||
x = x.float()
|
x = x.float()
|
||||||
x=torch.div(x,len_.unsqueeze(1))
|
x = torch.div(x, len_.unsqueeze(1))
|
||||||
out=x.sum(dim=1).to(dtype)
|
out = x.sum(dim=1).to(dtype)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def forward(self, x, mask=None):
|
def forward(self, x, mask=None):
|
||||||
|
@ -10,7 +10,6 @@ i_part = os.environ.get("i_part")
|
|||||||
all_parts = os.environ.get("all_parts")
|
all_parts = os.environ.get("all_parts")
|
||||||
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
||||||
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
||||||
from feature_extractor import cnhubert
|
|
||||||
|
|
||||||
opt_dir = os.environ.get("opt_dir")
|
opt_dir = os.environ.get("opt_dir")
|
||||||
sv_path = os.environ.get("sv_path")
|
sv_path = os.environ.get("sv_path")
|
||||||
@ -19,19 +18,18 @@ import torch
|
|||||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
import numpy as np
|
|
||||||
from scipy.io import wavfile
|
|
||||||
import torchaudio
|
import torchaudio
|
||||||
|
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
sys.path.append(now_dir)
|
sys.path.append(now_dir)
|
||||||
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
|
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
|
||||||
from tools.my_utils import load_audio, clean_path
|
from tools.my_utils import clean_path
|
||||||
from time import time as ttime
|
from time import time as ttime
|
||||||
import shutil
|
import shutil
|
||||||
from ERes2NetV2 import ERes2NetV2
|
from ERes2NetV2 import ERes2NetV2
|
||||||
import kaldi as Kaldi
|
import kaldi as Kaldi
|
||||||
|
|
||||||
|
|
||||||
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
||||||
dir = os.path.dirname(path)
|
dir = os.path.dirname(path)
|
||||||
name = os.path.basename(path)
|
name = os.path.basename(path)
|
||||||
@ -56,37 +54,45 @@ if torch.cuda.is_available():
|
|||||||
else:
|
else:
|
||||||
device = "cpu"
|
device = "cpu"
|
||||||
|
|
||||||
|
|
||||||
class SV:
    """Speaker-embedding extractor built on a pretrained ERes2NetV2 model.

    The weights are read from the module-level ``sv_path`` and the input audio
    is resampled from 32 kHz to 16 kHz before filter-bank features are computed.
    """

    def __init__(self, device, is_half):
        """Load the pretrained model and place it (and the resampler) on ``device``.

        Args:
            device: Target handed to ``.to(...)`` for both the model and the resampler.
            is_half: When truthy, the model is converted with ``.half()`` and
                ``compute_embedding3`` converts its input the same way.
        """
        pretrained_state = torch.load(sv_path, map_location="cpu")
        embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
        embedding_model.load_state_dict(pretrained_state)
        embedding_model.eval()
        self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
        # A single truthiness test keeps the model precision and the input
        # precision (see compute_embedding3) in agreement for every value of
        # is_half, not only for real booleans.
        if is_half:
            embedding_model = embedding_model.half()
        self.embedding_model = embedding_model.to(device)
        self.is_half = is_half

    def compute_embedding3(self, wav):
        """Return the ``forward3`` embedding of ``wav``.

        Args:
            wav: Batch of 32 kHz waveforms; the original author's note gives the
                shape as (1, x) with samples in -1~1.

        Returns:
            The output of ``embedding_model.forward3`` on the stacked 80-bin
            filter-bank features, computed without gradient tracking.
        """
        with torch.no_grad():
            wav = self.res(wav)
            if self.is_half:
                wav = wav.half()
            # One fbank matrix per waveform in the batch, stacked along dim 0.
            feat = torch.stack(
                [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
            )
            sv_emb = self.embedding_model.forward3(feat)
        return sv_emb
|
||||||
|
|
||||||
sv=SV(device,is_half)
|
|
||||||
|
sv = SV(device, is_half)
|
||||||
|
|
||||||
|
|
||||||
def name2go(wav_name, wav_path):
    """Compute and store the speaker embedding for one 32 kHz wav file.

    Nothing is done when ``<sv_cn_dir>/<wav_name>.pt`` already exists, so an
    interrupted run can be resumed.

    Args:
        wav_name: File name of the clip inside ``wav32dir``; also used as the
            stem of the output ``.pt`` file.
        wav_path: Accepted for interface compatibility but not used; the audio
            is always read from ``wav32dir``.

    Raises:
        AssertionError: If the loaded file is not sampled at 32000 Hz.
    """
    sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
    if os.path.exists(sv_cn_path):
        return
    wav32_path = "%s/%s" % (wav32dir, wav_name)
    wav32k, sr0 = torchaudio.load(wav32_path)
    # Raised explicitly (same exception type as the former assert) so the check
    # is not stripped when Python runs with -O; the SV resampler is fixed at
    # 32 kHz -> 16 kHz.
    if sr0 != 32000:
        raise AssertionError("expected a 32000 Hz file, got %s Hz: %s" % (sr0, wav32_path))
    wav32k = wav32k.to(device)
    emb = sv.compute_embedding3(wav32k).cpu()  # torch.Size([1, 20480])
    my_save(emb, sv_cn_path)
|
||||||
|
|
||||||
|
|
||||||
|
@ -17,15 +17,16 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
|||||||
shutil.move(tmp_path, "%s/%s" % (dir, name))
|
shutil.move(tmp_path, "%s/%s" % (dir, name))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
# Two-byte tag for each model version that has one.
# NOTE(review): presumably consumed by my_save2 to mark the checkpoint header - confirm.
model_version2byte = {
    "v3": b"03",
    "v4": b"04",
    "v2Pro": b"05",
    "v2ProPlus": b"06",
}
|
||||||
|
|
||||||
|
|
||||||
def my_save2(fea, path, model_version):
|
def my_save2(fea, path, model_version):
|
||||||
bio = BytesIO()
|
bio = BytesIO()
|
||||||
torch.save(fea, bio)
|
torch.save(fea, bio)
|
||||||
@ -50,7 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
|
|||||||
if lora_rank:
|
if lora_rank:
|
||||||
opt["lora_rank"] = lora_rank
|
opt["lora_rank"] = lora_rank
|
||||||
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
|
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
|
||||||
elif (model_version!=None and "Pro"in model_version):
|
elif model_version != None and "Pro" in model_version:
|
||||||
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
|
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
|
||||||
else:
|
else:
|
||||||
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
|
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
|
||||||
@ -58,6 +59,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
|
|||||||
except:
|
except:
|
||||||
return traceback.format_exc()
|
return traceback.format_exc()
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
00:v1
|
00:v1
|
||||||
01:v2
|
01:v2
|
||||||
|
@ -36,7 +36,7 @@ from module.models import (
|
|||||||
MultiPeriodDiscriminator,
|
MultiPeriodDiscriminator,
|
||||||
SynthesizerTrn,
|
SynthesizerTrn,
|
||||||
)
|
)
|
||||||
from process_ckpt import savee,my_save2
|
from process_ckpt import savee
|
||||||
|
|
||||||
torch.backends.cudnn.benchmark = False
|
torch.backends.cudnn.benchmark = False
|
||||||
torch.backends.cudnn.deterministic = False
|
torch.backends.cudnn.deterministic = False
|
||||||
@ -87,11 +87,30 @@ def run(rank, n_gpus, hps):
|
|||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.set_device(rank)
|
torch.cuda.set_device(rank)
|
||||||
|
|
||||||
train_dataset = TextAudioSpeakerLoader(hps.data,version=hps.model.version)
|
train_dataset = TextAudioSpeakerLoader(hps.data, version=hps.model.version)
|
||||||
train_sampler = DistributedBucketSampler(
|
train_sampler = DistributedBucketSampler(
|
||||||
train_dataset,
|
train_dataset,
|
||||||
hps.train.batch_size,
|
hps.train.batch_size,
|
||||||
[32,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,],
|
[
|
||||||
|
32,
|
||||||
|
300,
|
||||||
|
400,
|
||||||
|
500,
|
||||||
|
600,
|
||||||
|
700,
|
||||||
|
800,
|
||||||
|
900,
|
||||||
|
1000,
|
||||||
|
1100,
|
||||||
|
1200,
|
||||||
|
1300,
|
||||||
|
1400,
|
||||||
|
1500,
|
||||||
|
1600,
|
||||||
|
1700,
|
||||||
|
1800,
|
||||||
|
1900,
|
||||||
|
],
|
||||||
num_replicas=n_gpus,
|
num_replicas=n_gpus,
|
||||||
rank=rank,
|
rank=rank,
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
@ -130,9 +149,9 @@ def run(rank, n_gpus, hps):
|
|||||||
)
|
)
|
||||||
|
|
||||||
net_d = (
|
net_d = (
|
||||||
MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).cuda(rank)
|
MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).cuda(rank)
|
||||||
if torch.cuda.is_available()
|
if torch.cuda.is_available()
|
||||||
else MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).to(device)
|
else MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).to(device)
|
||||||
)
|
)
|
||||||
for name, param in net_g.named_parameters():
|
for name, param in net_g.named_parameters():
|
||||||
if not param.requires_grad:
|
if not param.requires_grad:
|
||||||
@ -235,7 +254,7 @@ def run(rank, n_gpus, hps):
|
|||||||
print(
|
print(
|
||||||
"loaded pretrained %s" % hps.train.pretrained_s2D,
|
"loaded pretrained %s" % hps.train.pretrained_s2D,
|
||||||
net_d.module.load_state_dict(
|
net_d.module.load_state_dict(
|
||||||
torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],strict=False
|
torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"], strict=False
|
||||||
)
|
)
|
||||||
if torch.cuda.is_available()
|
if torch.cuda.is_available()
|
||||||
else net_d.load_state_dict(
|
else net_d.load_state_dict(
|
||||||
@ -310,17 +329,44 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
|||||||
net_g.train()
|
net_g.train()
|
||||||
net_d.train()
|
net_d.train()
|
||||||
for batch_idx, data in enumerate(tqdm(train_loader)):
|
for batch_idx, data in enumerate(tqdm(train_loader)):
|
||||||
if hps.model.version in {"v2Pro","v2ProPlus"}:
|
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
||||||
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths,sv_emb=data
|
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths, sv_emb = data
|
||||||
else:
|
else:
|
||||||
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths=data
|
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths = data
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
spec, spec_lengths = (spec.cuda(rank,non_blocking=True,),spec_lengths.cuda(rank,non_blocking=True,),)
|
spec, spec_lengths = (
|
||||||
y, y_lengths = (y.cuda(rank,non_blocking=True,),y_lengths.cuda(rank,non_blocking=True,),)
|
spec.cuda(
|
||||||
|
rank,
|
||||||
|
non_blocking=True,
|
||||||
|
),
|
||||||
|
spec_lengths.cuda(
|
||||||
|
rank,
|
||||||
|
non_blocking=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
y, y_lengths = (
|
||||||
|
y.cuda(
|
||||||
|
rank,
|
||||||
|
non_blocking=True,
|
||||||
|
),
|
||||||
|
y_lengths.cuda(
|
||||||
|
rank,
|
||||||
|
non_blocking=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
ssl = ssl.cuda(rank, non_blocking=True)
|
ssl = ssl.cuda(rank, non_blocking=True)
|
||||||
ssl.requires_grad = False
|
ssl.requires_grad = False
|
||||||
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
|
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
|
||||||
text, text_lengths = (text.cuda(rank,non_blocking=True,),text_lengths.cuda(rank,non_blocking=True,),)
|
text, text_lengths = (
|
||||||
|
text.cuda(
|
||||||
|
rank,
|
||||||
|
non_blocking=True,
|
||||||
|
),
|
||||||
|
text_lengths.cuda(
|
||||||
|
rank,
|
||||||
|
non_blocking=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
||||||
sv_emb = sv_emb.cuda(rank, non_blocking=True)
|
sv_emb = sv_emb.cuda(rank, non_blocking=True)
|
||||||
else:
|
else:
|
||||||
@ -334,9 +380,19 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
|||||||
sv_emb = sv_emb.to(device)
|
sv_emb = sv_emb.to(device)
|
||||||
with autocast(enabled=hps.train.fp16_run):
|
with autocast(enabled=hps.train.fp16_run):
|
||||||
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
||||||
(y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl) = net_g(ssl, spec, spec_lengths, text, text_lengths,sv_emb)
|
(y_hat, kl_ssl, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), stats_ssl) = net_g(
|
||||||
|
ssl, spec, spec_lengths, text, text_lengths, sv_emb
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
(y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl,) = net_g(ssl, spec, spec_lengths, text, text_lengths)
|
(
|
||||||
|
y_hat,
|
||||||
|
kl_ssl,
|
||||||
|
ids_slice,
|
||||||
|
x_mask,
|
||||||
|
z_mask,
|
||||||
|
(z, z_p, m_p, logs_p, m_q, logs_q),
|
||||||
|
stats_ssl,
|
||||||
|
) = net_g(ssl, spec, spec_lengths, text, text_lengths)
|
||||||
|
|
||||||
mel = spec_to_mel_torch(
|
mel = spec_to_mel_torch(
|
||||||
spec,
|
spec,
|
||||||
@ -508,7 +564,14 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
|||||||
% (
|
% (
|
||||||
hps.name,
|
hps.name,
|
||||||
epoch,
|
epoch,
|
||||||
savee(ckpt,hps.name + "_e%s_s%s" % (epoch, global_step),epoch,global_step,hps,model_version=None if hps.model.version not in {"v2Pro","v2ProPlus"}else hps.model.version),
|
savee(
|
||||||
|
ckpt,
|
||||||
|
hps.name + "_e%s_s%s" % (epoch, global_step),
|
||||||
|
epoch,
|
||||||
|
global_step,
|
||||||
|
hps,
|
||||||
|
model_version=None if hps.model.version not in {"v2Pro", "v2ProPlus"} else hps.model.version,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1,24 +1,32 @@
|
|||||||
import sys,os,torch
|
import sys
|
||||||
|
import os
|
||||||
|
import torch
|
||||||
|
|
||||||
sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net")
|
sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net")
|
||||||
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
|
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
|
||||||
from ERes2NetV2 import ERes2NetV2
|
from ERes2NetV2 import ERes2NetV2
|
||||||
import kaldi as Kaldi
|
import kaldi as Kaldi
|
||||||
|
|
||||||
|
|
||||||
class SV:
    """Speaker-embedding extractor built on a pretrained ERes2NetV2 model.

    The weights are read from the module-level ``sv_path`` checkpoint.
    """

    def __init__(self, device, is_half):
        """Load the pretrained model and move it to ``device``.

        Args:
            device: Target handed to ``.to(...)``.
            is_half: When truthy, the model is converted with ``.half()`` and
                ``compute_embedding3`` converts its input the same way.
        """
        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
        # objects; only point sv_path at a trusted checkpoint.
        pretrained_state = torch.load(sv_path, map_location="cpu", weights_only=False)
        embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
        embedding_model.load_state_dict(pretrained_state)
        embedding_model.eval()
        # A single truthiness test keeps the model precision and the input
        # precision (see compute_embedding3) in agreement for every value of
        # is_half, not only for real booleans.
        if is_half:
            embedding_model = embedding_model.half()
        self.embedding_model = embedding_model.to(device)
        self.is_half = is_half

    def compute_embedding3(self, wav):
        """Return the ``forward3`` embedding of ``wav``.

        Args:
            wav: Batch of waveforms; fbank is called with
                ``sample_frequency=16000``, so the audio is presumably already
                16 kHz - confirm against callers.

        Returns:
            The output of ``embedding_model.forward3`` on the stacked 80-bin
            filter-bank features, computed without gradient tracking.
        """
        with torch.no_grad():
            if self.is_half:
                wav = wav.half()
            # One fbank matrix per waveform in the batch, stacked along dim 0.
            feat = torch.stack(
                [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
            )
            sv_emb = self.embedding_model.forward3(feat)
        return sv_emb
|
||||||
|
@ -3,38 +3,44 @@ import re
|
|||||||
|
|
||||||
# jieba静音
|
# jieba静音
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
jieba.setLogLevel(logging.CRITICAL)
|
jieba.setLogLevel(logging.CRITICAL)
|
||||||
|
|
||||||
# 更改fast_langdetect大模型位置
|
# 更改fast_langdetect大模型位置
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import fast_langdetect
|
import fast_langdetect
|
||||||
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
|
|
||||||
|
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
|
||||||
|
fast_langdetect.infer.LangDetectConfig(
|
||||||
|
cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
from split_lang import LangSplitter
|
from split_lang import LangSplitter
|
||||||
|
|
||||||
|
|
||||||
def full_en(text: str) -> bool:
    """Return True when ``text`` has a Latin letter and only permitted characters.

    The pattern first requires at least one ``A-Za-z`` letter (lookahead) and
    then lets the whole string consist of ASCII letters and digits, whitespace,
    printable ASCII (U+0020-U+007E), General Punctuation (U+2000-U+206F), CJK
    Symbols and Punctuation (U+3000-U+303F) and Halfwidth and Fullwidth Forms
    (U+FF00-U+FFEF). An empty string yields False.
    """
    # NOTE(review): "." in the lookahead does not cross newlines, so the letter
    # must appear on the first line of ``text`` - confirm that is intended.
    pattern = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
    return bool(re.match(pattern, text))
|
||||||
|
|
||||||
|
|
||||||
def full_cjk(text):
|
def full_cjk(text):
|
||||||
# 来自wiki
|
# 来自wiki
|
||||||
cjk_ranges = [
|
cjk_ranges = [
|
||||||
(0x4E00, 0x9FFF), # CJK Unified Ideographs
|
(0x4E00, 0x9FFF), # CJK Unified Ideographs
|
||||||
(0x3400, 0x4DB5), # CJK Extension A
|
(0x3400, 0x4DB5), # CJK Extension A
|
||||||
(0x20000, 0x2A6DD), # CJK Extension B
|
(0x20000, 0x2A6DD), # CJK Extension B
|
||||||
(0x2A700, 0x2B73F), # CJK Extension C
|
(0x2A700, 0x2B73F), # CJK Extension C
|
||||||
(0x2B740, 0x2B81F), # CJK Extension D
|
(0x2B740, 0x2B81F), # CJK Extension D
|
||||||
(0x2B820, 0x2CEAF), # CJK Extension E
|
(0x2B820, 0x2CEAF), # CJK Extension E
|
||||||
(0x2CEB0, 0x2EBEF), # CJK Extension F
|
(0x2CEB0, 0x2EBEF), # CJK Extension F
|
||||||
(0x30000, 0x3134A), # CJK Extension G
|
(0x30000, 0x3134A), # CJK Extension G
|
||||||
(0x31350, 0x323AF), # CJK Extension H
|
(0x31350, 0x323AF), # CJK Extension H
|
||||||
(0x2EBF0, 0x2EE5D), # CJK Extension H
|
(0x2EBF0, 0x2EE5D), # CJK Extension H
|
||||||
]
|
]
|
||||||
|
|
||||||
pattern = r'[0-9、-〜。!?.!?… /]+$'
|
pattern = r"[0-9、-〜。!?.!?… /]+$"
|
||||||
|
|
||||||
cjk_text = ""
|
cjk_text = ""
|
||||||
for char in text:
|
for char in text:
|
||||||
@ -45,7 +51,7 @@ def full_cjk(text):
|
|||||||
return cjk_text
|
return cjk_text
|
||||||
|
|
||||||
|
|
||||||
def split_jako(tag_lang,item):
|
def split_jako(tag_lang, item):
|
||||||
if tag_lang == "ja":
|
if tag_lang == "ja":
|
||||||
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
|
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
|
||||||
else:
|
else:
|
||||||
@ -53,41 +59,40 @@ def split_jako(tag_lang,item):
|
|||||||
|
|
||||||
lang_list: list[dict] = []
|
lang_list: list[dict] = []
|
||||||
tag = 0
|
tag = 0
|
||||||
for match in re.finditer(pattern, item['text']):
|
for match in re.finditer(pattern, item["text"]):
|
||||||
if match.start() > tag:
|
if match.start() > tag:
|
||||||
lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
|
lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
|
||||||
|
|
||||||
tag = match.end()
|
tag = match.end()
|
||||||
lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
|
lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
|
||||||
|
|
||||||
if tag < len(item['text']):
|
if tag < len(item["text"]):
|
||||||
lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
|
lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
|
||||||
|
|
||||||
return lang_list
|
return lang_list
|
||||||
|
|
||||||
|
|
||||||
def merge_lang(lang_list: list[dict], item: dict) -> list[dict]:
    """Add ``item`` to ``lang_list``, fusing it with the last entry when possible.

    If the list is non-empty and its final entry has the same ``"lang"`` as
    ``item``, the item's ``"text"`` is appended to that entry's text; otherwise
    ``item`` itself is appended as a new entry.

    Args:
        lang_list: Segments collected so far; modified in place.
        item: Segment with ``"lang"`` and ``"text"`` keys.

    Returns:
        The same ``lang_list`` object, for call sites that reassign it.
    """
    if lang_list and item["lang"] == lang_list[-1]["lang"]:
        # Same language as the previous segment: extend it rather than add a new one.
        lang_list[-1]["text"] += item["text"]
    else:
        lang_list.append(item)
    return lang_list
|
||||||
|
|
||||||
|
|
||||||
class LangSegmenter():
|
class LangSegmenter:
|
||||||
# 默认过滤器, 基于gsv目前四种语言
|
# 默认过滤器, 基于gsv目前四种语言
|
||||||
DEFAULT_LANG_MAP = {
|
DEFAULT_LANG_MAP = {
|
||||||
"zh": "zh",
|
"zh": "zh",
|
||||||
"yue": "zh", # 粤语
|
"yue": "zh", # 粤语
|
||||||
"wuu": "zh", # 吴语
|
"wuu": "zh", # 吴语
|
||||||
"zh-cn": "zh",
|
"zh-cn": "zh",
|
||||||
"zh-tw": "x", # 繁体设置为x
|
"zh-tw": "x", # 繁体设置为x
|
||||||
"ko": "ko",
|
"ko": "ko",
|
||||||
"ja": "ja",
|
"ja": "ja",
|
||||||
"en": "en",
|
"en": "en",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def getTexts(text):
|
def getTexts(text):
|
||||||
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
|
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
|
||||||
substr = lang_splitter.split_by_lang(text=text)
|
substr = lang_splitter.split_by_lang(text=text)
|
||||||
@ -95,18 +100,18 @@ class LangSegmenter():
|
|||||||
lang_list: list[dict] = []
|
lang_list: list[dict] = []
|
||||||
|
|
||||||
for _, item in enumerate(substr):
|
for _, item in enumerate(substr):
|
||||||
dict_item = {'lang':item.lang,'text':item.text}
|
dict_item = {"lang": item.lang, "text": item.text}
|
||||||
|
|
||||||
# 处理短英文被识别为其他语言的问题
|
# 处理短英文被识别为其他语言的问题
|
||||||
if full_en(dict_item['text']):
|
if full_en(dict_item["text"]):
|
||||||
dict_item['lang'] = 'en'
|
dict_item["lang"] = "en"
|
||||||
lang_list = merge_lang(lang_list,dict_item)
|
lang_list = merge_lang(lang_list, dict_item)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 处理非日语夹日文的问题(不包含CJK)
|
# 处理非日语夹日文的问题(不包含CJK)
|
||||||
ja_list: list[dict] = []
|
ja_list: list[dict] = []
|
||||||
if dict_item['lang'] != 'ja':
|
if dict_item["lang"] != "ja":
|
||||||
ja_list = split_jako('ja',dict_item)
|
ja_list = split_jako("ja", dict_item)
|
||||||
|
|
||||||
if not ja_list:
|
if not ja_list:
|
||||||
ja_list.append(dict_item)
|
ja_list.append(dict_item)
|
||||||
@ -115,8 +120,8 @@ class LangSegmenter():
|
|||||||
ko_list: list[dict] = []
|
ko_list: list[dict] = []
|
||||||
temp_list: list[dict] = []
|
temp_list: list[dict] = []
|
||||||
for _, ko_item in enumerate(ja_list):
|
for _, ko_item in enumerate(ja_list):
|
||||||
if ko_item["lang"] != 'ko':
|
if ko_item["lang"] != "ko":
|
||||||
ko_list = split_jako('ko',ko_item)
|
ko_list = split_jako("ko", ko_item)
|
||||||
|
|
||||||
if ko_list:
|
if ko_list:
|
||||||
temp_list.extend(ko_list)
|
temp_list.extend(ko_list)
|
||||||
@ -126,50 +131,50 @@ class LangSegmenter():
|
|||||||
# 未存在非日韩文夹日韩文
|
# 未存在非日韩文夹日韩文
|
||||||
if len(temp_list) == 1:
|
if len(temp_list) == 1:
|
||||||
# 未知语言检查是否为CJK
|
# 未知语言检查是否为CJK
|
||||||
if dict_item['lang'] == 'x':
|
if dict_item["lang"] == "x":
|
||||||
cjk_text = full_cjk(dict_item['text'])
|
cjk_text = full_cjk(dict_item["text"])
|
||||||
if cjk_text:
|
if cjk_text:
|
||||||
dict_item = {'lang':'zh','text':cjk_text}
|
dict_item = {"lang": "zh", "text": cjk_text}
|
||||||
lang_list = merge_lang(lang_list,dict_item)
|
lang_list = merge_lang(lang_list, dict_item)
|
||||||
else:
|
else:
|
||||||
lang_list = merge_lang(lang_list,dict_item)
|
lang_list = merge_lang(lang_list, dict_item)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
lang_list = merge_lang(lang_list,dict_item)
|
lang_list = merge_lang(lang_list, dict_item)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 存在非日韩文夹日韩文
|
# 存在非日韩文夹日韩文
|
||||||
for _, temp_item in enumerate(temp_list):
|
for _, temp_item in enumerate(temp_list):
|
||||||
# 未知语言检查是否为CJK
|
# 未知语言检查是否为CJK
|
||||||
if temp_item['lang'] == 'x':
|
if temp_item["lang"] == "x":
|
||||||
cjk_text = full_cjk(dict_item['text'])
|
cjk_text = full_cjk(dict_item["text"])
|
||||||
if cjk_text:
|
if cjk_text:
|
||||||
dict_item = {'lang':'zh','text':cjk_text}
|
dict_item = {"lang": "zh", "text": cjk_text}
|
||||||
lang_list = merge_lang(lang_list,dict_item)
|
lang_list = merge_lang(lang_list, dict_item)
|
||||||
else:
|
else:
|
||||||
lang_list = merge_lang(lang_list,dict_item)
|
lang_list = merge_lang(lang_list, dict_item)
|
||||||
else:
|
else:
|
||||||
lang_list = merge_lang(lang_list,temp_item)
|
lang_list = merge_lang(lang_list, temp_item)
|
||||||
|
|
||||||
temp_list = lang_list
|
temp_list = lang_list
|
||||||
lang_list = []
|
lang_list = []
|
||||||
for _, temp_item in enumerate(temp_list):
|
for _, temp_item in enumerate(temp_list):
|
||||||
if temp_item['lang'] == 'x':
|
if temp_item["lang"] == "x":
|
||||||
if lang_list:
|
if lang_list:
|
||||||
temp_item['lang'] = lang_list[-1]['lang']
|
temp_item["lang"] = lang_list[-1]["lang"]
|
||||||
elif len(temp_list) > 1:
|
elif len(temp_list) > 1:
|
||||||
temp_item['lang'] = temp_list[1]['lang']
|
temp_item["lang"] = temp_list[1]["lang"]
|
||||||
else:
|
else:
|
||||||
temp_item['lang'] = 'zh'
|
temp_item["lang"] = "zh"
|
||||||
|
|
||||||
lang_list = merge_lang(lang_list,temp_item)
|
lang_list = merge_lang(lang_list, temp_item)
|
||||||
|
|
||||||
return lang_list
|
return lang_list
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
text = "MyGO?,你也喜欢まいご吗?"
|
text = "MyGO?,你也喜欢まいご吗?"
|
||||||
print(LangSegmenter.getTexts(text))
|
print(LangSegmenter.getTexts(text))
|
||||||
|
|
||||||
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
|
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
|
||||||
print(LangSegmenter.getTexts(text))
|
print(LangSegmenter.getTexts(text))
|
||||||
|
@ -3,7 +3,6 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import traceback
|
|
||||||
import warnings
|
import warnings
|
||||||
import zipfile
|
import zipfile
|
||||||
from typing import Any, Dict, List, Tuple
|
from typing import Any, Dict, List, Tuple
|
||||||
@ -23,8 +22,9 @@ from .utils import load_config
|
|||||||
onnxruntime.set_default_logger_severity(3)
|
onnxruntime.set_default_logger_severity(3)
|
||||||
try:
|
try:
|
||||||
onnxruntime.preload_dlls()
|
onnxruntime.preload_dlls()
|
||||||
except:pass
|
except:
|
||||||
#traceback.print_exc()
|
pass
|
||||||
|
# traceback.print_exc()
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
|
|
||||||
model_version = "1.1"
|
model_version = "1.1"
|
||||||
|
@ -655,11 +655,7 @@ class ToneSandhi:
|
|||||||
while i < len(seg):
|
while i < len(seg):
|
||||||
word, pos = seg[i]
|
word, pos = seg[i]
|
||||||
merged = False
|
merged = False
|
||||||
if (
|
if i - 1 >= 0 and word == "一" and i + 1 < len(seg):
|
||||||
i - 1 >= 0
|
|
||||||
and word == "一"
|
|
||||||
and i + 1 < len(seg)
|
|
||||||
):
|
|
||||||
last = new_seg[-1] if new_seg else seg[i - 1]
|
last = new_seg[-1] if new_seg else seg[i - 1]
|
||||||
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
|
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
|
||||||
combined = last[0] + "一" + seg[i + 1][0]
|
combined = last[0] + "一" + seg[i + 1][0]
|
||||||
|
17
README.md
17
README.md
@ -9,10 +9,14 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
|||||||
|
|
||||||
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
||||||
|
|
||||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
||||||
[](https://discord.gg/dnrgs5GHfG)
|
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
|
||||||
|
|
||||||
|
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
|
||||||
|
[](https://rentry.co/GPT-SoVITS-guide#/)
|
||||||
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
|
||||||
|
|
||||||
**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
|
**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
|
||||||
|
|
||||||
@ -128,8 +132,9 @@ Due to rapid development in the codebase and a slower Docker image release cycle
|
|||||||
|
|
||||||
- Check [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) for the latest available image tags
|
- Check [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) for the latest available image tags
|
||||||
- Choose an appropriate image tag for your environment
|
- Choose an appropriate image tag for your environment
|
||||||
- `Lite` means the Docker image does not include ASR models and UVR5 models. You can manually download the UVR5 models, while the program will automatically download the ASR models as needed
|
- `Lite` means the Docker image **does not include** ASR models and UVR5 models. You can manually download the UVR5 models, while the program will automatically download the ASR models as needed
|
||||||
- The appropriate architecture image (amd64/arm64) will be automatically pulled during Docker Compose
|
- The appropriate architecture image (amd64/arm64) will be automatically pulled during Docker Compose
|
||||||
|
- Docker Compose will mount **all files** in the current directory. Please switch to the project root directory and **pull the latest code** before using the Docker image
|
||||||
- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
|
- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes
|
||||||
|
|
||||||
#### Environment Variables
|
#### Environment Variables
|
||||||
@ -333,7 +338,7 @@ Use v4 from v1/v2/v3 environment:
|
|||||||
New Features:
|
New Features:
|
||||||
|
|
||||||
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
|
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
|
||||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
|
2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
|
||||||
|
|
||||||
|
84
api.py
84
api.py
@ -199,6 +199,8 @@ def is_full(*items): # 任意一项为空返回False
|
|||||||
|
|
||||||
|
|
||||||
bigvgan_model = hifigan_model = sv_cn_model = None
|
bigvgan_model = hifigan_model = sv_cn_model = None
|
||||||
|
|
||||||
|
|
||||||
def clean_hifigan_model():
|
def clean_hifigan_model():
|
||||||
global hifigan_model
|
global hifigan_model
|
||||||
if hifigan_model:
|
if hifigan_model:
|
||||||
@ -208,6 +210,8 @@ def clean_hifigan_model():
|
|||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def clean_bigvgan_model():
|
def clean_bigvgan_model():
|
||||||
global bigvgan_model
|
global bigvgan_model
|
||||||
if bigvgan_model:
|
if bigvgan_model:
|
||||||
@ -217,6 +221,8 @@ def clean_bigvgan_model():
|
|||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def clean_sv_cn_model():
|
def clean_sv_cn_model():
|
||||||
global sv_cn_model
|
global sv_cn_model
|
||||||
if sv_cn_model:
|
if sv_cn_model:
|
||||||
@ -229,7 +235,7 @@ def clean_sv_cn_model():
|
|||||||
|
|
||||||
|
|
||||||
def init_bigvgan():
|
def init_bigvgan():
|
||||||
global bigvgan_model, hifigan_model,sv_cn_model
|
global bigvgan_model, hifigan_model, sv_cn_model
|
||||||
from BigVGAN import bigvgan
|
from BigVGAN import bigvgan
|
||||||
|
|
||||||
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
||||||
@ -247,7 +253,7 @@ def init_bigvgan():
|
|||||||
|
|
||||||
|
|
||||||
def init_hifigan():
|
def init_hifigan():
|
||||||
global hifigan_model, bigvgan_model,sv_cn_model
|
global hifigan_model, bigvgan_model, sv_cn_model
|
||||||
hifigan_model = Generator(
|
hifigan_model = Generator(
|
||||||
initial_channel=100,
|
initial_channel=100,
|
||||||
resblock="1",
|
resblock="1",
|
||||||
@ -262,7 +268,9 @@ def init_hifigan():
|
|||||||
hifigan_model.eval()
|
hifigan_model.eval()
|
||||||
hifigan_model.remove_weight_norm()
|
hifigan_model.remove_weight_norm()
|
||||||
state_dict_g = torch.load(
|
state_dict_g = torch.load(
|
||||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
|
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
|
||||||
|
map_location="cpu",
|
||||||
|
weights_only=False,
|
||||||
)
|
)
|
||||||
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
||||||
if is_half == True:
|
if is_half == True:
|
||||||
@ -272,19 +280,21 @@ def init_hifigan():
|
|||||||
|
|
||||||
|
|
||||||
from sv import SV
|
from sv import SV
|
||||||
|
|
||||||
|
|
||||||
def init_sv_cn():
|
def init_sv_cn():
|
||||||
global hifigan_model, bigvgan_model, sv_cn_model
|
global hifigan_model, bigvgan_model, sv_cn_model
|
||||||
sv_cn_model = SV(device, is_half)
|
sv_cn_model = SV(device, is_half)
|
||||||
|
|
||||||
|
|
||||||
resample_transform_dict={}
|
resample_transform_dict = {}
|
||||||
def resample(audio_tensor, sr0,sr1,device):
|
|
||||||
|
|
||||||
|
def resample(audio_tensor, sr0, sr1, device):
|
||||||
global resample_transform_dict
|
global resample_transform_dict
|
||||||
key="%s-%s-%s"%(sr0,sr1,str(device))
|
key = "%s-%s-%s" % (sr0, sr1, str(device))
|
||||||
if key not in resample_transform_dict:
|
if key not in resample_transform_dict:
|
||||||
resample_transform_dict[key] = torchaudio.transforms.Resample(
|
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||||
sr0, sr1
|
|
||||||
).to(device)
|
|
||||||
return resample_transform_dict[key](audio_tensor)
|
return resample_transform_dict[key](audio_tensor)
|
||||||
|
|
||||||
|
|
||||||
@ -370,6 +380,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
|||||||
|
|
||||||
def get_sovits_weights(sovits_path):
|
def get_sovits_weights(sovits_path):
|
||||||
from config import pretrained_sovits_name
|
from config import pretrained_sovits_name
|
||||||
|
|
||||||
path_sovits_v3 = pretrained_sovits_name["v3"]
|
path_sovits_v3 = pretrained_sovits_name["v3"]
|
||||||
path_sovits_v4 = pretrained_sovits_name["v4"]
|
path_sovits_v4 = pretrained_sovits_name["v4"]
|
||||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||||
@ -628,15 +639,17 @@ class DictToAttrRecursive(dict):
|
|||||||
|
|
||||||
|
|
||||||
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
||||||
sr1=int(hps.data.sampling_rate)
|
sr1 = int(hps.data.sampling_rate)
|
||||||
audio, sr0=torchaudio.load(filename)
|
audio, sr0 = torchaudio.load(filename)
|
||||||
if sr0!=sr1:
|
if sr0 != sr1:
|
||||||
audio=audio.to(device)
|
audio = audio.to(device)
|
||||||
if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
|
if audio.shape[0] == 2:
|
||||||
audio=resample(audio,sr0,sr1,device)
|
audio = audio.mean(0).unsqueeze(0)
|
||||||
|
audio = resample(audio, sr0, sr1, device)
|
||||||
else:
|
else:
|
||||||
audio=audio.to(device)
|
audio = audio.to(device)
|
||||||
if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
|
if audio.shape[0] == 2:
|
||||||
|
audio = audio.mean(0).unsqueeze(0)
|
||||||
|
|
||||||
maxx = audio.abs().max()
|
maxx = audio.abs().max()
|
||||||
if maxx > 1:
|
if maxx > 1:
|
||||||
@ -649,10 +662,10 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
|||||||
hps.data.win_length,
|
hps.data.win_length,
|
||||||
center=False,
|
center=False,
|
||||||
)
|
)
|
||||||
spec=spec.to(dtype)
|
spec = spec.to(dtype)
|
||||||
if is_v2pro==True:
|
if is_v2pro == True:
|
||||||
audio=resample(audio,sr1,16000,device).to(dtype)
|
audio = resample(audio, sr1, 16000, device).to(dtype)
|
||||||
return spec,audio
|
return spec, audio
|
||||||
|
|
||||||
|
|
||||||
def pack_audio(audio_bytes, data, rate):
|
def pack_audio(audio_bytes, data, rate):
|
||||||
@ -872,29 +885,29 @@ def get_tts_wav(
|
|||||||
prompt_semantic = codes[0, 0]
|
prompt_semantic = codes[0, 0]
|
||||||
prompt = prompt_semantic.unsqueeze(0).to(device)
|
prompt = prompt_semantic.unsqueeze(0).to(device)
|
||||||
|
|
||||||
is_v2pro = version in {"v2Pro","v2ProPlus"}
|
is_v2pro = version in {"v2Pro", "v2ProPlus"}
|
||||||
if version not in {"v3", "v4"}:
|
if version not in {"v3", "v4"}:
|
||||||
refers = []
|
refers = []
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
sv_emb= []
|
sv_emb = []
|
||||||
if sv_cn_model == None:
|
if sv_cn_model == None:
|
||||||
init_sv_cn()
|
init_sv_cn()
|
||||||
if inp_refs:
|
if inp_refs:
|
||||||
for path in inp_refs:
|
for path in inp_refs:
|
||||||
try:#####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer
|
try: #####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer
|
||||||
refer,audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
|
refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
|
||||||
refers.append(refer)
|
refers.append(refer)
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
|
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
if len(refers) == 0:
|
if len(refers) == 0:
|
||||||
refers,audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
|
refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
|
||||||
refers=[refers]
|
refers = [refers]
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
sv_emb=[sv_cn_model.compute_embedding3(audio_tensor)]
|
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
|
||||||
else:
|
else:
|
||||||
refer,audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
|
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
|
||||||
|
|
||||||
t1 = ttime()
|
t1 = ttime()
|
||||||
# os.environ['version'] = version
|
# os.environ['version'] = version
|
||||||
@ -937,14 +950,22 @@ def get_tts_wav(
|
|||||||
if version not in {"v3", "v4"}:
|
if version not in {"v3", "v4"}:
|
||||||
if is_v2pro:
|
if is_v2pro:
|
||||||
audio = (
|
audio = (
|
||||||
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed,sv_emb=sv_emb)
|
vq_model.decode(
|
||||||
|
pred_semantic,
|
||||||
|
torch.LongTensor(phones2).to(device).unsqueeze(0),
|
||||||
|
refers,
|
||||||
|
speed=speed,
|
||||||
|
sv_emb=sv_emb,
|
||||||
|
)
|
||||||
.detach()
|
.detach()
|
||||||
.cpu()
|
.cpu()
|
||||||
.numpy()[0, 0]
|
.numpy()[0, 0]
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
audio = (
|
audio = (
|
||||||
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)
|
vq_model.decode(
|
||||||
|
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
|
||||||
|
)
|
||||||
.detach()
|
.detach()
|
||||||
.cpu()
|
.cpu()
|
||||||
.numpy()[0, 0]
|
.numpy()[0, 0]
|
||||||
@ -1108,7 +1129,6 @@ def handle(
|
|||||||
if not default_refer.is_ready():
|
if not default_refer.is_ready():
|
||||||
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
|
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
|
||||||
|
|
||||||
|
|
||||||
if cut_punc == None:
|
if cut_punc == None:
|
||||||
text = cut_text(text, default_cut_punc)
|
text = cut_text(text, default_cut_punc)
|
||||||
else:
|
else:
|
||||||
|
14
config.py
14
config.py
@ -144,7 +144,8 @@ webui_port_subfix = 9871
|
|||||||
|
|
||||||
api_port = 9880
|
api_port = 9880
|
||||||
|
|
||||||
#Thanks to the contribution of @Karasukaigan and @XXXXRT666
|
|
||||||
|
# Thanks to the contribution of @Karasukaigan and @XXXXRT666
|
||||||
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
|
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
|
||||||
cpu = torch.device("cpu")
|
cpu = torch.device("cpu")
|
||||||
cuda = torch.device(f"cuda:{idx}")
|
cuda = torch.device(f"cuda:{idx}")
|
||||||
@ -157,10 +158,13 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
|
|||||||
mem_gb = mem_bytes / (1024**3) + 0.4
|
mem_gb = mem_bytes / (1024**3) + 0.4
|
||||||
major, minor = capability
|
major, minor = capability
|
||||||
sm_version = major + minor / 10.0
|
sm_version = major + minor / 10.0
|
||||||
is_16_series = bool(re.search(r"16\d{2}", name))and sm_version == 7.5
|
is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
|
||||||
if mem_gb < 4 or sm_version < 5.3:return cpu, torch.float32, 0.0, 0.0
|
if mem_gb < 4 or sm_version < 5.3:
|
||||||
if sm_version == 6.1 or is_16_series==True:return cuda, torch.float32, sm_version, mem_gb
|
return cpu, torch.float32, 0.0, 0.0
|
||||||
if sm_version > 6.1:return cuda, torch.float16, sm_version, mem_gb
|
if sm_version == 6.1 or is_16_series == True:
|
||||||
|
return cuda, torch.float32, sm_version, mem_gb
|
||||||
|
if sm_version > 6.1:
|
||||||
|
return cuda, torch.float16, sm_version, mem_gb
|
||||||
return cpu, torch.float32, 0.0, 0.0
|
return cpu, torch.float32, 0.0, 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,10 +12,6 @@ services:
|
|||||||
- "9880:9880"
|
- "9880:9880"
|
||||||
volumes:
|
volumes:
|
||||||
- .:/workspace/GPT-SoVITS
|
- .:/workspace/GPT-SoVITS
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
|
||||||
environment:
|
environment:
|
||||||
- is_half=true
|
- is_half=true
|
||||||
tty: true
|
tty: true
|
||||||
@ -34,10 +30,6 @@ services:
|
|||||||
- "9880:9880"
|
- "9880:9880"
|
||||||
volumes:
|
volumes:
|
||||||
- .:/workspace/GPT-SoVITS
|
- .:/workspace/GPT-SoVITS
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
|
||||||
- tools/asr/models:/workspace/models/asr_models
|
- tools/asr/models:/workspace/models/asr_models
|
||||||
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
||||||
environment:
|
environment:
|
||||||
@ -58,10 +50,6 @@ services:
|
|||||||
- "9880:9880"
|
- "9880:9880"
|
||||||
volumes:
|
volumes:
|
||||||
- .:/workspace/GPT-SoVITS
|
- .:/workspace/GPT-SoVITS
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
|
||||||
environment:
|
environment:
|
||||||
- is_half=true
|
- is_half=true
|
||||||
tty: true
|
tty: true
|
||||||
@ -80,10 +68,6 @@ services:
|
|||||||
- "9880:9880"
|
- "9880:9880"
|
||||||
volumes:
|
volumes:
|
||||||
- .:/workspace/GPT-SoVITS
|
- .:/workspace/GPT-SoVITS
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/asr/models
|
|
||||||
- /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
|
|
||||||
- tools/asr/models:/workspace/models/asr_models
|
- tools/asr/models:/workspace/models/asr_models
|
||||||
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
- tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
|
||||||
environment:
|
environment:
|
||||||
|
@ -7,12 +7,14 @@
|
|||||||
|
|
||||||
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
|
||||||
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
|
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
||||||
|
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
|
||||||
|
|
||||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://rentry.co/GPT-SoVITS-guide#/)
|
||||||
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md)
|
||||||
[](https://discord.gg/dnrgs5GHfG)
|
|
||||||
|
|
||||||
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
||||||
|
|
||||||
@ -128,8 +130,9 @@ brew install ffmpeg
|
|||||||
|
|
||||||
- 前往 [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 查看最新可用的镜像标签(tags)
|
- 前往 [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 查看最新可用的镜像标签(tags)
|
||||||
- 根据你的运行环境选择合适的镜像标签
|
- 根据你的运行环境选择合适的镜像标签
|
||||||
- `Lite` Docker 镜像不包含 ASR 模型和 UVR5 模型. 你可以自行下载 UVR5 模型, ASR 模型则会在需要时由程序自动下载
|
- `Lite` Docker 镜像**不包含** ASR 模型和 UVR5 模型. 你可以自行下载 UVR5 模型, ASR 模型则会在需要时由程序自动下载
|
||||||
- 在使用 Docker Compose 时, 会自动拉取适配的架构镜像 (amd64 或 arm64)
|
- 在使用 Docker Compose 时, 会自动拉取适配的架构镜像 (amd64 或 arm64)
|
||||||
|
- Docker Compose 将会挂载当前目录的**所有文件**, 请在使用 Docker 镜像前先切换到项目根目录并**拉取代码更新**
|
||||||
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
|
- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像
|
||||||
|
|
||||||
#### 环境变量
|
#### 环境变量
|
||||||
@ -329,7 +332,7 @@ python webui.py
|
|||||||
新特性:
|
新特性:
|
||||||
|
|
||||||
1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
|
1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
|
||||||
[更多详情](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
|
[更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.
|
2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.
|
||||||
|
|
||||||
|
@ -5,12 +5,16 @@
|
|||||||
|
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
||||||
|
|
||||||
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
||||||
[](https://discord.gg/dnrgs5GHfG)
|
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
|
||||||
|
|
||||||
|
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
|
||||||
|
[](https://rentry.co/GPT-SoVITS-guide#/)
|
||||||
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ja/Changelog_JA.md)
|
||||||
|
|
||||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
||||||
|
|
||||||
@ -122,8 +126,9 @@ brew install ffmpeg
|
|||||||
|
|
||||||
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) で最新のイメージタグを確認してください
|
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) で最新のイメージタグを確認してください
|
||||||
- 環境に合った適切なイメージタグを選択してください
|
- 環境に合った適切なイメージタグを選択してください
|
||||||
- `Lite` とは、Docker イメージに ASR モデルおよび UVR5 モデルが含まれていないことを意味します. UVR5 モデルは手動でダウンロードし、ASR モデルは必要に応じてプログラムが自動的にダウンロードします
|
- `Lite` とは、Docker イメージに ASR モデルおよび UVR5 モデルが**含まれていない**ことを意味します. UVR5 モデルは手動でダウンロードし、ASR モデルは必要に応じてプログラムが自動的にダウンロードします
|
||||||
- Docker Compose 実行時に、対応するアーキテクチャ (amd64 または arm64) のイメージが自動的に取得されます
|
- Docker Compose 実行時に、対応するアーキテクチャ (amd64 または arm64) のイメージが自動的に取得されます
|
||||||
|
- Docker Compose は現在のディレクトリ内の**すべてのファイル**をマウントします. Docker イメージを使用する前に、プロジェクトのルートディレクトリに移動し、**コードを最新の状態に更新**してください
|
||||||
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
|
- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です
|
||||||
|
|
||||||
#### 環境変数
|
#### 環境変数
|
||||||
@ -304,7 +309,7 @@ v2 環境から v3 を使用する方法:
|
|||||||
新機能:
|
新機能:
|
||||||
|
|
||||||
1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力します(V3 はネイティブに 24kHz 音声のみ出力)**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
|
1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力します(V3 はネイティブに 24kHz 音声のみ出力)**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
|
||||||
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1/V2/V3 環境から V4 への移行方法:
|
V1/V2/V3 環境から V4 への移行方法:
|
||||||
|
|
||||||
@ -319,7 +324,7 @@ V1/V2/V3 環境から V4 への移行方法:
|
|||||||
新機能:
|
新機能:
|
||||||
|
|
||||||
1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します.**
|
1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します.**
|
||||||
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
|
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.
|
2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.
|
||||||
|
|
||||||
|
@ -5,12 +5,16 @@
|
|||||||
|
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
||||||
|
|
||||||
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
||||||
[](https://discord.gg/dnrgs5GHfG)
|
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
|
||||||
|
|
||||||
|
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
|
||||||
|
[](https://rentry.co/GPT-SoVITS-guide#/)
|
||||||
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/ko/Changelog_KO.md)
|
||||||
|
|
||||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
|
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
|
||||||
|
|
||||||
@ -122,8 +126,9 @@ brew install ffmpeg
|
|||||||
|
|
||||||
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits)에서 최신 이미지 태그를 확인하세요
|
- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits)에서 최신 이미지 태그를 확인하세요
|
||||||
- 환경에 맞는 적절한 이미지 태그를 선택하세요
|
- 환경에 맞는 적절한 이미지 태그를 선택하세요
|
||||||
- `Lite` 는 Docker 이미지에 ASR 모델과 UVR5 모델이 포함되어 있지 않음을 의미합니다. UVR5 모델은 사용자가 직접 다운로드해야 하며, ASR 모델은 필요 시 프로그램이 자동으로 다운로드합니다
|
- `Lite` 는 Docker 이미지에 ASR 모델과 UVR5 모델이 **포함되어 있지 않음**을 의미합니다. UVR5 모델은 사용자가 직접 다운로드해야 하며, ASR 모델은 필요 시 프로그램이 자동으로 다운로드합니다
|
||||||
- Docker Compose 실행 시, 해당 아키텍처에 맞는 이미지(amd64 또는 arm64)가 자동으로 다운로드됩니다
|
- Docker Compose 실행 시, 해당 아키텍처에 맞는 이미지(amd64 또는 arm64)가 자동으로 다운로드됩니다
|
||||||
|
- Docker Compose는 현재 디렉터리의 **모든 파일**을 마운트합니다. Docker 이미지를 사용하기 전에 프로젝트 루트 디렉터리로 이동하여 코드를 **최신 상태로 업데이트**하세요
|
||||||
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
|
- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다
|
||||||
|
|
||||||
#### 환경 변수
|
#### 환경 변수
|
||||||
@ -319,7 +324,7 @@ V1/V2/V3 환경에서 V4로 전환 방법:
|
|||||||
신규 기능:
|
신규 기능:
|
||||||
|
|
||||||
1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
|
1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
|
||||||
[자세히 보기](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
|
[자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.
|
2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.
|
||||||
|
|
||||||
|
@ -7,12 +7,14 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
|
|||||||
|
|
||||||
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
|
||||||
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
||||||
|
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
||||||
|
[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
|
||||||
|
|
||||||
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
|
||||||
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
[](https://rentry.co/GPT-SoVITS-guide#/)
|
||||||
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/tr/Changelog_TR.md)
|
||||||
[](https://discord.gg/dnrgs5GHfG)
|
|
||||||
|
|
||||||
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
|
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
|
||||||
|
|
||||||
@ -124,8 +126,9 @@ Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığ
|
|||||||
|
|
||||||
- En güncel kullanılabilir imaj etiketlerini görmek için [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) adresini kontrol edin
|
- En güncel kullanılabilir imaj etiketlerini görmek için [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) adresini kontrol edin
|
||||||
- Ortamınıza uygun bir imaj etiketi seçin
|
- Ortamınıza uygun bir imaj etiketi seçin
|
||||||
- `Lite`, Docker imajında ASR modelleri ve UVR5 modellerinin bulunmadığı anlamına gelir. UVR5 modellerini manuel olarak indirebilirsiniz; ASR modelleri ise gerektiğinde program tarafından otomatik olarak indirilir
|
- `Lite`, Docker imajında ASR modelleri ve UVR5 modellerinin **bulunmadığı** anlamına gelir. UVR5 modellerini manuel olarak indirebilirsiniz; ASR modelleri ise gerektiğinde program tarafından otomatik olarak indirilir
|
||||||
- Docker Compose sırasında, uygun mimariye (amd64 veya arm64) ait imaj otomatik olarak indirilir
|
- Docker Compose sırasında, uygun mimariye (amd64 veya arm64) ait imaj otomatik olarak indirilir
|
||||||
|
- Docker Compose, mevcut dizindeki **tüm dosyaları** bağlayacaktır. Docker imajını kullanmadan önce lütfen proje kök dizinine geçin ve **en son kodu çekin**
|
||||||
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
|
- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz
|
||||||
|
|
||||||
#### Ortam Değişkenleri
|
#### Ortam Değişkenleri
|
||||||
@ -323,7 +326,7 @@ V1/V2/V3 ortamından V4'e geçiş:
|
|||||||
Yeni Özellikler:
|
Yeni Özellikler:
|
||||||
|
|
||||||
1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
|
1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
|
||||||
[Daha fazla bilgi](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
|
[Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
|
2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
|
||||||
|
|
||||||
|
222
install.sh
222
install.sh
@ -5,14 +5,62 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
|||||||
|
|
||||||
cd "$SCRIPT_DIR" || exit 1
|
cd "$SCRIPT_DIR" || exit 1
|
||||||
|
|
||||||
set -e
|
RESET="\033[0m"
|
||||||
|
BOLD="\033[1m"
|
||||||
|
ERROR="\033[1;31m[ERROR]: $RESET"
|
||||||
|
WARNING="\033[1;33m[WARNING]: $RESET"
|
||||||
|
INFO="\033[1;32m[INFO]: $RESET"
|
||||||
|
SUCCESS="\033[1;34m[SUCCESS]: $RESET"
|
||||||
|
|
||||||
|
set -eE
|
||||||
|
set -o errtrace
|
||||||
|
|
||||||
|
trap 'on_error $LINENO "$BASH_COMMAND" $?' ERR
|
||||||
|
|
||||||
|
# shellcheck disable=SC2317
|
||||||
|
on_error() {
|
||||||
|
local lineno="$1"
|
||||||
|
local cmd="$2"
|
||||||
|
local code="$3"
|
||||||
|
|
||||||
|
echo -e "${ERROR}${BOLD}Command \"${cmd}\" Failed${RESET} at ${BOLD}Line ${lineno}${RESET} with Exit Code ${BOLD}${code}${RESET}"
|
||||||
|
echo -e "${ERROR}${BOLD}Call Stack:${RESET}"
|
||||||
|
for ((i = ${#FUNCNAME[@]} - 1; i >= 1; i--)); do
|
||||||
|
echo -e " in ${BOLD}${FUNCNAME[i]}()${RESET} at ${BASH_SOURCE[i]}:${BOLD}${BASH_LINENO[i - 1]}${RESET}"
|
||||||
|
done
|
||||||
|
exit "$code"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_conda_quiet() {
|
||||||
|
local output
|
||||||
|
output=$(conda install --yes --quiet "$@" 2>&1) || {
|
||||||
|
echo -e "${ERROR} Conda install failed:\n$output"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
run_pip_quiet() {
|
||||||
|
local output
|
||||||
|
output=$(pip install "$@" 2>&1) || {
|
||||||
|
echo -e "${ERROR} Pip install failed:\n$output"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
run_wget_quiet() {
|
||||||
|
local output
|
||||||
|
output=$(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$@" 2>&1) || {
|
||||||
|
echo -e "${ERROR} Wget failed:\n$output"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ! command -v conda &>/dev/null; then
|
if ! command -v conda &>/dev/null; then
|
||||||
echo "Conda Not Found"
|
echo -e "${ERROR}Conda Not Found"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
|
run_conda_quiet gcc
|
||||||
|
|
||||||
USE_CUDA=false
|
USE_CUDA=false
|
||||||
USE_ROCM=false
|
USE_ROCM=false
|
||||||
@ -34,8 +82,8 @@ print_help() {
|
|||||||
echo " -h, --help Show this help message and exit"
|
echo " -h, --help Show this help message and exit"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " bash install.sh --source HF --download-uvr5"
|
echo " bash install.sh --device CU128 --source HF --download-uvr5"
|
||||||
echo " bash install.sh --source ModelScope"
|
echo " bash install.sh --device MPS --source ModelScope"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Show help if no arguments provided
|
# Show help if no arguments provided
|
||||||
@ -59,8 +107,8 @@ while [[ $# -gt 0 ]]; do
|
|||||||
USE_MODELSCOPE=true
|
USE_MODELSCOPE=true
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Error: Invalid Download Source: $2"
|
echo -e "${ERROR}Error: Invalid Download Source: $2"
|
||||||
echo "Choose From: [HF, HF-Mirror, ModelScope]"
|
echo -e "${ERROR}Choose From: [HF, HF-Mirror, ModelScope]"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
@ -86,8 +134,8 @@ while [[ $# -gt 0 ]]; do
|
|||||||
USE_CPU=true
|
USE_CPU=true
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Error: Invalid Device: $2"
|
echo -e "${ERROR}Error: Invalid Device: $2"
|
||||||
echo "Choose From: [CU126, CU128, ROCM, MPS, CPU]"
|
echo -e "${ERROR}Choose From: [CU126, CU128, ROCM, MPS, CPU]"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
@ -102,22 +150,23 @@ while [[ $# -gt 0 ]]; do
|
|||||||
exit 0
|
exit 0
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown Argument: $1"
|
echo -e "${ERROR}Unknown Argument: $1"
|
||||||
echo "Use -h or --help to see available options."
|
echo ""
|
||||||
|
print_help
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then
|
if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then
|
||||||
echo "Error: Device is REQUIRED"
|
echo -e "${ERROR}Error: Device is REQUIRED"
|
||||||
echo ""
|
echo ""
|
||||||
print_help
|
print_help
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then
|
if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then
|
||||||
echo "Error: Download Source is REQUIRED"
|
echo -e "${ERROR}Error: Download Source is REQUIRED"
|
||||||
echo ""
|
echo ""
|
||||||
print_help
|
print_help
|
||||||
exit 1
|
exit 1
|
||||||
@ -125,55 +174,65 @@ fi
|
|||||||
|
|
||||||
# 安装构建工具
|
# 安装构建工具
|
||||||
# Install build tools
|
# Install build tools
|
||||||
|
echo -e "${INFO}Detected system: $(uname -s) $(uname -r) $(uname -m)"
|
||||||
if [ "$(uname)" != "Darwin" ]; then
|
if [ "$(uname)" != "Darwin" ]; then
|
||||||
gcc_major_version=$(command -v gcc >/dev/null 2>&1 && gcc -dumpversion | cut -d. -f1 || echo 0)
|
gcc_major_version=$(command -v gcc >/dev/null 2>&1 && gcc -dumpversion | cut -d. -f1 || echo 0)
|
||||||
if [ "$gcc_major_version" -lt 11 ]; then
|
if [ "$gcc_major_version" -lt 11 ]; then
|
||||||
echo "Installing GCC & G++..."
|
echo -e "${INFO}Installing GCC & G++..."
|
||||||
conda install -c conda-forge gcc=11 gxx=11 -q -y
|
run_conda_quiet gcc=11 gxx=11
|
||||||
|
echo -e "${SUCCESS}GCC & G++ Installed..."
|
||||||
else
|
else
|
||||||
echo "GCC >=11"
|
echo -e "${INFO}Detected GCC Version: $gcc_major_version"
|
||||||
|
echo -e "${INFO}Skip Installing GCC & G++ From Conda-Forge"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
if ! xcode-select -p &>/dev/null; then
|
if ! xcode-select -p &>/dev/null; then
|
||||||
echo "Installing Xcode Command Line Tools..."
|
echo -e "${INFO}Installing Xcode Command Line Tools..."
|
||||||
xcode-select --install
|
xcode-select --install
|
||||||
fi
|
echo -e "${INFO}Waiting For Xcode Command Line Tools Installation To Complete..."
|
||||||
echo "Waiting For Xcode Command Line Tools Installation Complete..."
|
while true; do
|
||||||
while true; do
|
sleep 20
|
||||||
sleep 20
|
|
||||||
|
|
||||||
if xcode-select -p &>/dev/null; then
|
if xcode-select -p &>/dev/null; then
|
||||||
echo "Xcode Command Line Tools Installed"
|
echo -e "${SUCCESS}Xcode Command Line Tools Installed"
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
echo "Installing,Please Wait..."
|
echo -e "${INFO}Installing, Please Wait..."
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
else
|
||||||
|
XCODE_PATH=$(xcode-select -p)
|
||||||
|
if [[ "$XCODE_PATH" == *"Xcode.app"* ]]; then
|
||||||
|
echo -e "${WARNING} Detected Xcode path: $XCODE_PATH"
|
||||||
|
echo -e "${WARNING} If your Xcode version does not match your macOS version, it may cause unexpected issues during compilation or package builds."
|
||||||
fi
|
fi
|
||||||
done
|
fi
|
||||||
conda install -c conda-forge -q -y
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Installing ffmpeg and cmake..."
|
echo -e "${INFO}Installing FFmpeg & CMake..."
|
||||||
conda install ffmpeg cmake make -q -y
|
run_conda_quiet ffmpeg cmake make
|
||||||
|
echo -e "${SUCCESS}FFmpeg & CMake Installed"
|
||||||
|
|
||||||
echo "Installing unzip..."
|
echo -e "${INFO}Installing unzip..."
|
||||||
conda install unzip -y --quiet
|
run_conda_quiet unzip
|
||||||
|
echo -e "${SUCCESS}unzip Installed"
|
||||||
|
|
||||||
if [ "$USE_HF" = "true" ]; then
|
if [ "$USE_HF" = "true" ]; then
|
||||||
echo "Download Model From HuggingFace"
|
echo -e "${INFO}Download Model From HuggingFace"
|
||||||
PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
|
PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
|
||||||
G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
|
G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
|
||||||
UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
|
UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
|
||||||
NLTK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
|
NLTK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
|
||||||
PYOPENJTALK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
|
PYOPENJTALK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
|
||||||
elif [ "$USE_HF_MIRROR" = "true" ]; then
|
elif [ "$USE_HF_MIRROR" = "true" ]; then
|
||||||
echo "Download Model From HuggingFace-Mirror"
|
echo -e "${INFO}Download Model From HuggingFace-Mirror"
|
||||||
PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
|
PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip"
|
||||||
G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
|
G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip"
|
||||||
UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
|
UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip"
|
||||||
NLTK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
|
NLTK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip"
|
||||||
PYOPENJTALK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
|
PYOPENJTALK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz"
|
||||||
elif [ "$USE_MODELSCOPE" = "true" ]; then
|
elif [ "$USE_MODELSCOPE" = "true" ]; then
|
||||||
echo "Download Model From ModelScope"
|
echo -e "${INFO}Download Model From ModelScope"
|
||||||
PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip"
|
PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip"
|
||||||
G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip"
|
G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip"
|
||||||
UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip"
|
UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip"
|
||||||
@ -181,118 +240,129 @@ elif [ "$USE_MODELSCOPE" = "true" ]; then
|
|||||||
PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
|
PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$WORKFLOW" = "true" ]; then
|
|
||||||
WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
|
|
||||||
else
|
|
||||||
WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
|
|
||||||
fi
|
|
||||||
|
|
||||||
if find -L "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
if find -L "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
||||||
echo "Pretrained Model Exists"
|
echo -e "${INFO}Pretrained Model Exists"
|
||||||
|
echo -e "${INFO}Skip Downloading Pretrained Models"
|
||||||
else
|
else
|
||||||
echo "Download Pretrained Models"
|
echo -e "${INFO}Downloading Pretrained Models..."
|
||||||
"${WGET_CMD[@]}" "$PRETRINED_URL"
|
rm -rf pretrained_models.zip
|
||||||
|
run_wget_quiet "$PRETRINED_URL"
|
||||||
|
|
||||||
unzip -q -o pretrained_models.zip -d GPT_SoVITS
|
unzip -q -o pretrained_models.zip -d GPT_SoVITS
|
||||||
rm -rf pretrained_models.zip
|
rm -rf pretrained_models.zip
|
||||||
|
echo -e "${SUCCESS}Pretrained Models Downloaded"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then
|
if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then
|
||||||
echo "Download G2PWModel"
|
echo -e "${INFO}Downloading G2PWModel..."
|
||||||
"${WGET_CMD[@]}" "$G2PW_URL"
|
rm -rf G2PWModel.zip
|
||||||
|
run_wget_quiet "$G2PW_URL"
|
||||||
|
|
||||||
unzip -q -o G2PWModel.zip -d GPT_SoVITS/text
|
unzip -q -o G2PWModel.zip -d GPT_SoVITS/text
|
||||||
rm -rf G2PWModel.zip
|
rm -rf G2PWModel.zip
|
||||||
|
echo -e "${SUCCESS}G2PWModel Downloaded"
|
||||||
else
|
else
|
||||||
echo "G2PWModel Exists"
|
echo -e "${INFO}G2PWModel Exists"
|
||||||
|
echo -e "${INFO}Skip Downloading G2PWModel"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$DOWNLOAD_UVR5" = "true" ]; then
|
if [ "$DOWNLOAD_UVR5" = "true" ]; then
|
||||||
if find -L "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
if find -L "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then
|
||||||
echo "UVR5 Model Exists"
|
echo -e "${INFO}UVR5 Models Exist"
|
||||||
|
echo -e "${INFO}Skip Downloading UVR5 Models"
|
||||||
else
|
else
|
||||||
echo "Download UVR5 Model"
|
echo -e "${INFO}Downloading UVR5 Models..."
|
||||||
"${WGET_CMD[@]}" "$UVR5_URL"
|
rm -rf uvr5_weights.zip
|
||||||
|
run_wget_quiet "$UVR5_URL"
|
||||||
|
|
||||||
unzip -q -o uvr5_weights.zip -d tools/uvr5
|
unzip -q -o uvr5_weights.zip -d tools/uvr5
|
||||||
rm -rf uvr5_weights.zip
|
rm -rf uvr5_weights.zip
|
||||||
|
echo -e "${SUCCESS}UVR5 Models Downloaded"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
|
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
|
||||||
echo "Checking for CUDA installation..."
|
echo -e "${INFO}Checking For Nvidia Driver Installation..."
|
||||||
if command -v nvidia-smi &>/dev/null; then
|
if command -v nvidia-smi &>/dev/null; then
|
||||||
echo "CUDA found."
|
echo -e "${INFO}Nvidia Driver Found"
|
||||||
else
|
else
|
||||||
|
echo -e "${WARNING}Nvidia Driver Not Found, Fallback to CPU"
|
||||||
USE_CUDA=false
|
USE_CUDA=false
|
||||||
USE_CPU=true
|
USE_CPU=true
|
||||||
echo "CUDA not found."
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
|
if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
|
||||||
echo "Checking for ROCm installation..."
|
echo -e "${INFO}Checking For ROCm Installation..."
|
||||||
if [ -d "/opt/rocm" ]; then
|
if [ -d "/opt/rocm" ]; then
|
||||||
echo "ROCm found."
|
echo -e "${INFO}ROCm Found"
|
||||||
if grep -qi "microsoft" /proc/version; then
|
if grep -qi "microsoft" /proc/version; then
|
||||||
echo "You are running WSL."
|
echo -e "${INFO}WSL2 Found"
|
||||||
IS_WSL=true
|
IS_WSL=true
|
||||||
else
|
else
|
||||||
echo "You are NOT running WSL."
|
|
||||||
IS_WSL=false
|
IS_WSL=false
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
echo -e "${WARNING}ROCm Not Found, Fallback to CPU"
|
||||||
USE_ROCM=false
|
USE_ROCM=false
|
||||||
USE_CPU=true
|
USE_CPU=true
|
||||||
echo "ROCm not found."
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
|
if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then
|
||||||
echo "Installing PyTorch with CUDA support..."
|
|
||||||
if [ "$CUDA" = 128 ]; then
|
if [ "$CUDA" = 128 ]; then
|
||||||
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128
|
echo -e "${INFO}Installing PyTorch For CUDA 12.8..."
|
||||||
|
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cu128"
|
||||||
elif [ "$CUDA" = 126 ]; then
|
elif [ "$CUDA" = 126 ]; then
|
||||||
pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu126
|
echo -e "${INFO}Installing PyTorch For CUDA 12.6..."
|
||||||
|
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cu126"
|
||||||
fi
|
fi
|
||||||
elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
|
elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then
|
||||||
echo "Installing PyTorch with ROCm support..."
|
echo -e "${INFO}Installing PyTorch For ROCm 6.2..."
|
||||||
pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
|
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/rocm6.2"
|
||||||
elif [ "$USE_CPU" = true ] && [ "$WORKFLOW" = false ]; then
|
elif [ "$USE_CPU" = true ] && [ "$WORKFLOW" = false ]; then
|
||||||
echo "Installing PyTorch for CPU..."
|
echo -e "${INFO}Installing PyTorch For CPU..."
|
||||||
pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cpu
|
run_pip_quiet torch torchaudio --index-url "https://download.pytorch.org/whl/cpu"
|
||||||
elif [ "$WORKFLOW" = false ]; then
|
elif [ "$WORKFLOW" = false ]; then
|
||||||
echo "Unknown Err"
|
echo -e "${ERROR}Unknown Err"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
echo -e "${SUCCESS}PyTorch Installed"
|
||||||
|
|
||||||
echo "Installing Python dependencies from requirements.txt..."
|
echo -e "${INFO}Installing Python Dependencies From requirements.txt..."
|
||||||
|
|
||||||
# 刷新环境
|
|
||||||
# Refresh environment
|
|
||||||
hash -r
|
hash -r
|
||||||
|
|
||||||
pip install -r extra-req.txt --no-deps --quiet
|
run_pip_quiet -r extra-req.txt --no-deps
|
||||||
|
|
||||||
pip install -r requirements.txt --quiet
|
run_pip_quiet -r requirements.txt
|
||||||
|
|
||||||
|
echo -e "${SUCCESS}Python Dependencies Installed"
|
||||||
|
|
||||||
PY_PREFIX=$(python -c "import sys; print(sys.prefix)")
|
PY_PREFIX=$(python -c "import sys; print(sys.prefix)")
|
||||||
PYOPENJTALK_PREFIX=$(python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))")
|
PYOPENJTALK_PREFIX=$(python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))")
|
||||||
|
|
||||||
"${WGET_CMD[@]}" "$NLTK_URL" -O nltk_data.zip
|
echo -e "${INFO}Downloading NLTK Data..."
|
||||||
|
rm -rf nltk_data.zip
|
||||||
|
run_wget_quiet "$NLTK_URL" -O nltk_data.zip
|
||||||
unzip -q -o nltk_data -d "$PY_PREFIX"
|
unzip -q -o nltk_data -d "$PY_PREFIX"
|
||||||
rm -rf nltk_data.zip
|
rm -rf nltk_data.zip
|
||||||
|
echo -e "${SUCCESS}NLTK Data Downloaded"
|
||||||
|
|
||||||
"${WGET_CMD[@]}" "$PYOPENJTALK_URL" -O open_jtalk_dic_utf_8-1.11.tar.gz
|
echo -e "${INFO}Downloading Open JTalk Dict..."
|
||||||
tar -xvzf open_jtalk_dic_utf_8-1.11.tar.gz -C "$PYOPENJTALK_PREFIX"
|
|
||||||
rm -rf open_jtalk_dic_utf_8-1.11.tar.gz
|
rm -rf open_jtalk_dic_utf_8-1.11.tar.gz
|
||||||
|
run_wget_quiet "$PYOPENJTALK_URL" -O open_jtalk_dic_utf_8-1.11.tar.gz
|
||||||
|
tar -xzf open_jtalk_dic_utf_8-1.11.tar.gz -C "$PYOPENJTALK_PREFIX"
|
||||||
|
rm -rf open_jtalk_dic_utf_8-1.11.tar.gz
|
||||||
|
echo -e "${SUCCESS}Open JTalk Dict Downloaded"
|
||||||
|
|
||||||
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
||||||
echo "Update to WSL compatible runtime lib..."
|
echo -e "${INFO}Updating WSL Compatible Runtime Lib For ROCm..."
|
||||||
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
|
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
|
||||||
cd "${location}"/torch/lib/ || exit
|
cd "${location}"/torch/lib/ || exit
|
||||||
rm libhsa-runtime64.so*
|
rm libhsa-runtime64.so*
|
||||||
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
|
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
|
||||||
|
echo -e "${SUCCESS}ROCm Runtime Lib Updated..."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Installation completed successfully!"
|
echo -e "${SUCCESS}Installation Completed"
|
||||||
|
@ -1,81 +1,38 @@
|
|||||||
js = """
|
js = """
|
||||||
function createGradioAnimation() {
|
function deleteTheme() {
|
||||||
|
|
||||||
const params = new URLSearchParams(window.location.search);
|
const params = new URLSearchParams(window.location.search);
|
||||||
if (params.get('__theme') !== 'light') {
|
if (params.has('__theme')) {
|
||||||
params.set('__theme', 'light'); // 仅当 __theme 不是 'light' 时设置为 'light'
|
params.delete('__theme');
|
||||||
window.location.search = params.toString(); // 更新 URL,触发页面刷新
|
const newUrl = `${window.location.pathname}?${params.toString()}`;
|
||||||
}
|
window.location.replace(newUrl);
|
||||||
|
}
|
||||||
var container = document.createElement('div');
|
|
||||||
container.id = 'gradio-animation';
|
|
||||||
container.style.fontSize = '2em';
|
|
||||||
container.style.fontWeight = '500';
|
|
||||||
container.style.textAlign = 'center';
|
|
||||||
container.style.marginBottom = '20px';
|
|
||||||
container.style.fontFamily = '-apple-system, sans-serif, Arial, Calibri';
|
|
||||||
|
|
||||||
var text = 'Welcome to GPT-SoVITS !';
|
|
||||||
for (var i = 0; i < text.length; i++) {
|
|
||||||
(function(i){
|
|
||||||
setTimeout(function(){
|
|
||||||
var letter = document.createElement('span');
|
|
||||||
letter.style.opacity = '0';
|
|
||||||
letter.style.transition = 'opacity 0.5s';
|
|
||||||
letter.innerText = text[i];
|
|
||||||
|
|
||||||
container.appendChild(letter);
|
|
||||||
|
|
||||||
setTimeout(function() {
|
|
||||||
letter.style.opacity = '1';
|
|
||||||
}, 50);
|
|
||||||
}, i * 250);
|
|
||||||
})(i);
|
|
||||||
}
|
|
||||||
return 'Animation created';
|
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
css = """
|
css = """
|
||||||
/* CSSStyleRule */
|
/* CSSStyleRule */
|
||||||
|
|
||||||
.markdown {
|
.markdown {
|
||||||
background-color: lightblue;
|
|
||||||
padding: 6px 10px;
|
padding: 6px 10px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.checkbox_info {
|
@media (prefers-color-scheme: light) {
|
||||||
color: var(--block-title-text-color) !important;
|
.markdown {
|
||||||
font-size: var(--block-title-text-size) !important;
|
background-color: lightblue;
|
||||||
font-weight: var(--block-title-text-weight) !important;
|
color: #000;
|
||||||
height: 22px;
|
}
|
||||||
margin-bottom: 8px !important;
|
}
|
||||||
|
|
||||||
|
@media (prefers-color-scheme: dark) {
|
||||||
|
.markdown {
|
||||||
|
background-color: #4b4b4b;
|
||||||
|
color: rgb(244, 244, 245);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
::selection {
|
::selection {
|
||||||
background: #ffc078; !important;
|
background: #ffc078 !important;
|
||||||
}
|
|
||||||
|
|
||||||
#checkbox_train_dpo input[type="checkbox"]{
|
|
||||||
margin-top: 6px;
|
|
||||||
}
|
|
||||||
|
|
||||||
#checkbox_train_dpo span {
|
|
||||||
margin-top: 6px;
|
|
||||||
}
|
|
||||||
|
|
||||||
#checkbox_align_train {
|
|
||||||
padding-top: 18px;
|
|
||||||
padding-bottom: 18px;
|
|
||||||
}
|
|
||||||
|
|
||||||
#checkbox_align_infer input[type="checkbox"] {
|
|
||||||
margin-top: 10px;
|
|
||||||
}
|
|
||||||
|
|
||||||
#checkbox_align_infer span {
|
|
||||||
margin-top: 10px;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
footer {
|
footer {
|
||||||
@ -91,6 +48,7 @@ footer * {
|
|||||||
}
|
}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
top_html = """
|
top_html = """
|
||||||
<div align="center">
|
<div align="center">
|
||||||
<div style="margin-bottom: 5px; font-size: 15px;">{}</div>
|
<div style="margin-bottom: 5px; font-size: 15px;">{}</div>
|
||||||
|
@ -109,7 +109,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
|
|||||||
if os.path.exists(wav_path):
|
if os.path.exists(wav_path):
|
||||||
...
|
...
|
||||||
else:
|
else:
|
||||||
gr.Warning(wav_path+i18n("路径错误"))
|
gr.Warning(wav_path + i18n("路径错误"))
|
||||||
return
|
return
|
||||||
if is_train:
|
if is_train:
|
||||||
path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
|
path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import sys
|
import sys
|
||||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||||
|
|
||||||
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
|
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
|
||||||
i18n = I18nAuto(language=language)
|
i18n = I18nAuto(language=language)
|
||||||
import argparse
|
import argparse
|
||||||
@ -309,7 +310,9 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
with gr.Blocks(analytics_enabled=False) as demo:
|
with gr.Blocks(analytics_enabled=False) as demo:
|
||||||
gr.Markdown(
|
gr.Markdown(
|
||||||
value=i18n("Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)")
|
value=i18n(
|
||||||
|
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
btn_change_index = gr.Button("Change Index")
|
btn_change_index = gr.Button("Change Index")
|
||||||
|
@ -190,14 +190,14 @@ class Predictor:
|
|||||||
opt_path_vocal = path_vocal[:-4] + ".%s" % format
|
opt_path_vocal = path_vocal[:-4] + ".%s" % format
|
||||||
opt_path_other = path_other[:-4] + ".%s" % format
|
opt_path_other = path_other[:-4] + ".%s" % format
|
||||||
if os.path.exists(path_vocal):
|
if os.path.exists(path_vocal):
|
||||||
os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_vocal, opt_path_vocal))
|
os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_vocal, opt_path_vocal))
|
||||||
if os.path.exists(opt_path_vocal):
|
if os.path.exists(opt_path_vocal):
|
||||||
try:
|
try:
|
||||||
os.remove(path_vocal)
|
os.remove(path_vocal)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
if os.path.exists(path_other):
|
if os.path.exists(path_other):
|
||||||
os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_other, opt_path_other))
|
os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_other, opt_path_other))
|
||||||
if os.path.exists(opt_path_other):
|
if os.path.exists(opt_path_other):
|
||||||
try:
|
try:
|
||||||
os.remove(path_other)
|
os.remove(path_other)
|
||||||
|
@ -140,7 +140,7 @@ class AudioPre:
|
|||||||
)
|
)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
opt_format_path = path[:-4] + ".%s" % format
|
opt_format_path = path[:-4] + ".%s" % format
|
||||||
cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
|
cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
|
||||||
print(cmd)
|
print(cmd)
|
||||||
os.system(cmd)
|
os.system(cmd)
|
||||||
if os.path.exists(opt_format_path):
|
if os.path.exists(opt_format_path):
|
||||||
@ -177,7 +177,7 @@ class AudioPre:
|
|||||||
)
|
)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
opt_format_path = path[:-4] + ".%s" % format
|
opt_format_path = path[:-4] + ".%s" % format
|
||||||
cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
|
cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
|
||||||
print(cmd)
|
print(cmd)
|
||||||
os.system(cmd)
|
os.system(cmd)
|
||||||
if os.path.exists(opt_format_path):
|
if os.path.exists(opt_format_path):
|
||||||
@ -307,7 +307,7 @@ class AudioPreDeEcho:
|
|||||||
)
|
)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
opt_format_path = path[:-4] + ".%s" % format
|
opt_format_path = path[:-4] + ".%s" % format
|
||||||
cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
|
cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
|
||||||
print(cmd)
|
print(cmd)
|
||||||
os.system(cmd)
|
os.system(cmd)
|
||||||
if os.path.exists(opt_format_path):
|
if os.path.exists(opt_format_path):
|
||||||
@ -340,7 +340,7 @@ class AudioPreDeEcho:
|
|||||||
)
|
)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
opt_format_path = path[:-4] + ".%s" % format
|
opt_format_path = path[:-4] + ".%s" % format
|
||||||
cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
|
cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
|
||||||
print(cmd)
|
print(cmd)
|
||||||
os.system(cmd)
|
os.system(cmd)
|
||||||
if os.path.exists(opt_format_path):
|
if os.path.exists(opt_format_path):
|
||||||
|
4
webui.py
4
webui.py
@ -507,7 +507,7 @@ def open1Ba(
|
|||||||
):
|
):
|
||||||
global p_train_SoVITS
|
global p_train_SoVITS
|
||||||
if p_train_SoVITS == None:
|
if p_train_SoVITS == None:
|
||||||
exp_name=exp_name.rstrip(" ")
|
exp_name = exp_name.rstrip(" ")
|
||||||
config_file = (
|
config_file = (
|
||||||
"GPT_SoVITS/configs/s2.json"
|
"GPT_SoVITS/configs/s2.json"
|
||||||
if version not in {"v2Pro", "v2ProPlus"}
|
if version not in {"v2Pro", "v2ProPlus"}
|
||||||
@ -604,7 +604,7 @@ def open1Bb(
|
|||||||
):
|
):
|
||||||
global p_train_GPT
|
global p_train_GPT
|
||||||
if p_train_GPT == None:
|
if p_train_GPT == None:
|
||||||
exp_name=exp_name.rstrip(" ")
|
exp_name = exp_name.rstrip(" ")
|
||||||
with open(
|
with open(
|
||||||
"GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml"
|
"GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml"
|
||||||
) as f:
|
) as f:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user