diff --git a/.github/build_windows_packages.ps1 b/.github/build_windows_packages.ps1
index 2e4acb2a..75918d35 100644
--- a/.github/build_windows_packages.ps1
+++ b/.github/build_windows_packages.ps1
@@ -162,7 +162,7 @@ Copy-Item -Path $curr -Destination $pkgName -Recurse
$7zPath = "$pkgName.7z"
$start = Get-Date
Write-Host "Compress Starting at $start"
-& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1
+& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -mmt=on -bsp1
$end = Get-Date
Write-Host "Elapsed time: $($end - $start)"
Get-ChildItem .
diff --git a/GPT_SoVITS/AR/models/t2s_lightning_module.py b/GPT_SoVITS/AR/models/t2s_lightning_module.py
index 97f3a084..fd357b94 100644
--- a/GPT_SoVITS/AR/models/t2s_lightning_module.py
+++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py
@@ -28,7 +28,8 @@ class Text2SemanticLightningModule(LightningModule):
self.load_state_dict(
torch.load(
pretrained_s1,
- map_location="cpu", weights_only=False,
+ map_location="cpu",
+ weights_only=False,
)["weight"],
)
)
diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index be936005..795b55dd 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -32,19 +32,21 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
from tools.audio_sr import AP_BWE
from tools.i18n.i18n import I18nAuto, scan_language_list
-from tools.my_utils import load_audio
from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
from sv import SV
-resample_transform_dict={}
-def resample(audio_tensor, sr0,sr1,device):
+
+resample_transform_dict = {}
+
+
+def resample(audio_tensor, sr0, sr1, device):
global resample_transform_dict
- key="%s-%s-%s"%(sr0,sr1,str(device))
+ key = "%s-%s-%s" % (sr0, sr1, str(device))
if key not in resample_transform_dict:
- resample_transform_dict[key] = torchaudio.transforms.Resample(
- sr0, sr1
- ).to(device)
+ resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
return resample_transform_dict[key](audio_tensor)
+
+
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
i18n = I18nAuto(language=language)
@@ -111,6 +113,7 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
return processed_audio
+
class DictToAttrRecursive(dict):
def __init__(self, input_dict):
super().__init__(input_dict)
@@ -479,7 +482,7 @@ class TTS:
def init_vits_weights(self, weights_path: str):
self.configs.vits_weights_path = weights_path
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
- if "Pro"in model_version:
+ if "Pro" in model_version:
self.init_sv_model()
path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]
@@ -498,9 +501,9 @@ class TTS:
else:
hps["model"]["version"] = "v2"
version = hps["model"]["version"]
- v3v4set={"v3", "v4"}
+ v3v4set = {"v3", "v4"}
if model_version not in v3v4set:
- if "Pro"not in model_version:
+ if "Pro" not in model_version:
model_version = version
else:
hps["model"]["version"] = model_version
@@ -542,7 +545,7 @@ class TTS:
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
del vits_model.enc_q
- self.is_v2pro=model_version in {"v2Pro","v2ProPlus"}
+ self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}
if if_lora_v3 == False:
print(
@@ -632,7 +635,9 @@ class TTS:
)
self.vocoder.remove_weight_norm()
state_dict_g = torch.load(
- "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
+ "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+ map_location="cpu",
+ weights_only=False,
)
print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
@@ -752,11 +757,13 @@ class TTS:
if raw_sr != self.configs.sampling_rate:
audio = raw_audio.to(self.configs.device)
- if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
+ if audio.shape[0] == 2:
+ audio = audio.mean(0).unsqueeze(0)
audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
else:
audio = raw_audio.to(self.configs.device)
- if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
+ if audio.shape[0] == 2:
+ audio = audio.mean(0).unsqueeze(0)
maxx = audio.abs().max()
if maxx > 1:
@@ -775,8 +782,9 @@ class TTS:
audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
if self.configs.is_half:
audio = audio.half()
- else:audio=None
- return spec,audio
+ else:
+ audio = None
+ return spec, audio
def _set_prompt_semantic(self, ref_wav_path: str):
zero_wav = np.zeros(
@@ -1073,7 +1081,10 @@ class TTS:
###### setting reference audio and prompt text preprocessing ########
t0 = time.perf_counter()
- if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"] or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)):
+ if (ref_audio_path is not None) and (
+ ref_audio_path != self.prompt_cache["ref_audio_path"]
+ or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)
+ ):
if not os.path.exists(ref_audio_path):
raise ValueError(f"{ref_audio_path} not exists")
self.set_ref_audio(ref_audio_path)
@@ -1212,9 +1223,10 @@ class TTS:
t_34 += t4 - t3
refer_audio_spec = []
- if self.is_v2pro:sv_emb=[]
- for spec,audio_tensor in self.prompt_cache["refer_spec"]:
- spec=spec.to(dtype=self.precision, device=self.configs.device)
+ if self.is_v2pro:
+ sv_emb = []
+ for spec, audio_tensor in self.prompt_cache["refer_spec"]:
+ spec = spec.to(dtype=self.precision, device=self.configs.device)
refer_audio_spec.append(spec)
if self.is_v2pro:
sv_emb.append(self.sv_model.compute_embedding3(audio_tensor))
@@ -1249,10 +1261,14 @@ class TTS:
torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
)
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
- if self.is_v2pro!=True:
- _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
+ if self.is_v2pro != True:
+ _batch_audio_fragment = self.vits_model.decode(
+ all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
+ ).detach()[0, 0, :]
else:
- _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
+ _batch_audio_fragment = self.vits_model.decode(
+ all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
+ ).detach()[0, 0, :]
audio_frag_end_idx.insert(0, 0)
batch_audio_fragment = [
_batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
@@ -1266,9 +1282,13 @@ class TTS:
pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
) # .unsqueeze(0)#mq要多unsqueeze一次
if self.is_v2pro != True:
- audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
+ audio_fragment = self.vits_model.decode(
+ _pred_semantic, phones, refer_audio_spec, speed=speed_factor
+ ).detach()[0, 0, :]
else:
- audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
+ audio_fragment = self.vits_model.decode(
+ _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
+ ).detach()[0, 0, :]
batch_audio_fragment.append(audio_fragment) ###试试重建不带上prompt部分
else:
if parallel_infer:
@@ -1410,7 +1430,7 @@ class TTS:
raw_entry = self.prompt_cache["refer_spec"][0]
if isinstance(raw_entry, tuple):
raw_entry = raw_entry[0]
- refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device)
+ refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
@@ -1480,7 +1500,7 @@ class TTS:
raw_entry = self.prompt_cache["refer_spec"][0]
if isinstance(raw_entry, tuple):
raw_entry = raw_entry[0]
- refer_audio_spec = raw_entry.to(dtype=self.precision,device=self.configs.device)
+ refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)
fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
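
Side note on the `resample` helper reformatted near the top of this file: it memoizes one `torchaudio.transforms.Resample` module per (source rate, target rate, device) key so repeated calls reuse the same kernel. A minimal standalone sketch of that pattern, assuming only `torch` and `torchaudio` (the cache name `_resamplers` is illustrative, not from the patch):

```python
import torch
import torchaudio

_resamplers = {}  # cache: "sr0-sr1-device" -> torchaudio.transforms.Resample


def resample(audio: torch.Tensor, sr0: int, sr1: int, device: torch.device) -> torch.Tensor:
    # Build the resampler once per (source rate, target rate, device) and reuse it.
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in _resamplers:
        _resamplers[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return _resamplers[key](audio)
```

Caching matters here because constructing a `Resample` transform precomputes its filter kernel; reusing the module avoids redoing that work for every reference or generated audio segment.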
diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
index f03183a1..9a478d43 100644
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -160,7 +160,9 @@ class TextPreprocessor:
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
- if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+ if (tmp["lang"] == "en" and langlist[-1] == "en") or (
+ tmp["lang"] != "en" and langlist[-1] != "en"
+ ):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
diff --git a/GPT_SoVITS/eres2net/ERes2Net.py b/GPT_SoVITS/eres2net/ERes2Net.py
index f728742d..1618c813 100644
--- a/GPT_SoVITS/eres2net/ERes2Net.py
+++ b/GPT_SoVITS/eres2net/ERes2Net.py
@@ -1,13 +1,12 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
- Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
- ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
- The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
- The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
"""
-
+Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
+ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
+The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
+The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
+"""
import torch
import math
@@ -16,15 +15,14 @@ import torch.nn.functional as F
import pooling_layers as pooling_layers
from fusion import AFF
-class ReLU(nn.Hardtanh):
+class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
- inplace_str = 'inplace' if self.inplace else ''
- return self.__class__.__name__ + ' (' \
- + inplace_str + ')'
+ inplace_str = "inplace" if self.inplace else ""
+ return self.__class__.__name__ + " (" + inplace_str + ")"
class BasicBlockERes2Net(nn.Module):
@@ -32,28 +30,28 @@ class BasicBlockERes2Net(nn.Module):
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
super(BasicBlockERes2Net, self).__init__()
- width = int(math.floor(planes*(baseWidth/64.0)))
- self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
- self.bn1 = nn.BatchNorm2d(width*scale)
+ width = int(math.floor(planes * (baseWidth / 64.0)))
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
+ self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
- convs=[]
- bns=[]
+ convs = []
+ bns = []
for i in range(self.nums):
- convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
- bns.append(nn.BatchNorm2d(width))
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
+ bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
-
- self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
- self.bn3 = nn.BatchNorm2d(planes*self.expansion)
+
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
- nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
- stride=stride, bias=False),
- nn.BatchNorm2d(self.expansion * planes))
+ nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(self.expansion * planes),
+ )
self.stride = stride
self.width = width
self.scale = scale
@@ -64,18 +62,18 @@ class BasicBlockERes2Net(nn.Module):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
- spx = torch.split(out,self.width,1)
+ spx = torch.split(out, self.width, 1)
for i in range(self.nums):
- if i==0:
- sp = spx[i]
- else:
- sp = sp + spx[i]
- sp = self.convs[i](sp)
- sp = self.relu(self.bns[i](sp))
- if i==0:
- out = sp
- else:
- out = torch.cat((out,sp),1)
+ if i == 0:
+ sp = spx[i]
+ else:
+ sp = sp + spx[i]
+ sp = self.convs[i](sp)
+ sp = self.relu(self.bns[i](sp))
+ if i == 0:
+ out = sp
+ else:
+ out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
@@ -86,22 +84,23 @@ class BasicBlockERes2Net(nn.Module):
return out
+
class BasicBlockERes2Net_diff_AFF(nn.Module):
expansion = 2
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
super(BasicBlockERes2Net_diff_AFF, self).__init__()
- width = int(math.floor(planes*(baseWidth/64.0)))
- self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
- self.bn1 = nn.BatchNorm2d(width*scale)
+ width = int(math.floor(planes * (baseWidth / 64.0)))
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
+ self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
- convs=[]
- fuse_models=[]
- bns=[]
+ convs = []
+ fuse_models = []
+ bns = []
for i in range(self.nums):
- convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
- bns.append(nn.BatchNorm2d(width))
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
+ bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width))
@@ -109,15 +108,15 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
-
- self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
- self.bn3 = nn.BatchNorm2d(planes*self.expansion)
+
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
- nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
- stride=stride, bias=False),
- nn.BatchNorm2d(self.expansion * planes))
+ nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(self.expansion * planes),
+ )
self.stride = stride
self.width = width
self.scale = scale
@@ -128,19 +127,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
- spx = torch.split(out,self.width,1)
+ spx = torch.split(out, self.width, 1)
for i in range(self.nums):
- if i==0:
+ if i == 0:
sp = spx[i]
else:
- sp = self.fuse_models[i-1](sp, spx[i])
-
+ sp = self.fuse_models[i - 1](sp, spx[i])
+
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
- if i==0:
+ if i == 0:
out = sp
else:
- out = torch.cat((out,sp),1)
+ out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
@@ -151,16 +150,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
return out
+
class ERes2Net(nn.Module):
- def __init__(self,
- block=BasicBlockERes2Net,
- block_fuse=BasicBlockERes2Net_diff_AFF,
- num_blocks=[3, 4, 6, 3],
- m_channels=32,
- feat_dim=80,
- embedding_size=192,
- pooling_func='TSTP',
- two_emb_layer=False):
+ def __init__(
+ self,
+ block=BasicBlockERes2Net,
+ block_fuse=BasicBlockERes2Net_diff_AFF,
+ num_blocks=[3, 4, 6, 3],
+ m_channels=32,
+ feat_dim=80,
+ embedding_size=192,
+ pooling_func="TSTP",
+ two_emb_layer=False,
+ ):
super(ERes2Net, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
@@ -176,20 +178,24 @@ class ERes2Net(nn.Module):
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
# Downsampling module for each layer
- self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False)
- self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
- self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
+ self.layer1_downsample = nn.Conv2d(
+ m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False
+ )
+ self.layer2_downsample = nn.Conv2d(
+ m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
+ )
+ self.layer3_downsample = nn.Conv2d(
+ m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
+ )
# Bottom-up fusion module
self.fuse_mode12 = AFF(channels=m_channels * 4)
self.fuse_mode123 = AFF(channels=m_channels * 8)
self.fuse_mode1234 = AFF(channels=m_channels * 16)
- self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
- self.pool = getattr(pooling_layers, pooling_func)(
- in_dim=self.stats_dim * block.expansion)
- self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
- embedding_size)
+ self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
+ self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
+ self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
self.seg_2 = nn.Linear(embedding_size, embedding_size)
@@ -212,7 +218,7 @@ class ERes2Net(nn.Module):
out1 = self.layer1(out)
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
- fuse_out12 = self.fuse_mode12(out2, out1_downsample)
+ fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
@@ -243,18 +249,16 @@ class ERes2Net(nn.Module):
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
- fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1)
+ fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
return fuse_out1234
-if __name__ == '__main__':
-
+if __name__ == "__main__":
x = torch.zeros(10, 300, 80)
- model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP')
+ model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func="TSTP")
model.eval()
out = model(x)
- print(out.shape) # torch.Size([10, 192])
+ print(out.shape) # torch.Size([10, 192])
num_params = sum(param.numel() for param in model.parameters())
- print("{} M".format(num_params / 1e6)) # 6.61M
-
+ print("{} M".format(num_params / 1e6)) # 6.61M
diff --git a/GPT_SoVITS/eres2net/ERes2NetV2.py b/GPT_SoVITS/eres2net/ERes2NetV2.py
index fdfd6db4..2e152a41 100644
--- a/GPT_SoVITS/eres2net/ERes2NetV2.py
+++ b/GPT_SoVITS/eres2net/ERes2NetV2.py
@@ -1,14 +1,12 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
- To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
- within each stage. However, this modification also increases the number of model parameters and computational complexity.
- To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
- both the model parameters and its computational cost.
"""
-
-
+To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
+within each stage. However, this modification also increases the number of model parameters and computational complexity.
+To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
+both the model parameters and its computational cost.
+"""
import torch
import math
@@ -17,47 +15,42 @@ import torch.nn.functional as F
import pooling_layers as pooling_layers
from fusion import AFF
-class ReLU(nn.Hardtanh):
+class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
- inplace_str = 'inplace' if self.inplace else ''
- return self.__class__.__name__ + ' (' \
- + inplace_str + ')'
+ inplace_str = "inplace" if self.inplace else ""
+ return self.__class__.__name__ + " (" + inplace_str + ")"
class BasicBlockERes2NetV2(nn.Module):
-
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
super(BasicBlockERes2NetV2, self).__init__()
- width = int(math.floor(planes*(baseWidth/64.0)))
- self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
- self.bn1 = nn.BatchNorm2d(width*scale)
+ width = int(math.floor(planes * (baseWidth / 64.0)))
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
+ self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
self.expansion = expansion
- convs=[]
- bns=[]
+ convs = []
+ bns = []
for i in range(self.nums):
- convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
- bns.append(nn.BatchNorm2d(width))
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
+ bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
-
- self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
- self.bn3 = nn.BatchNorm2d(planes*self.expansion)
+
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
- nn.Conv2d(in_planes,
- self.expansion * planes,
- kernel_size=1,
- stride=stride,
- bias=False),
- nn.BatchNorm2d(self.expansion * planes))
+ nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(self.expansion * planes),
+ )
self.stride = stride
self.width = width
self.scale = scale
@@ -68,18 +61,18 @@ class BasicBlockERes2NetV2(nn.Module):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
- spx = torch.split(out,self.width,1)
+ spx = torch.split(out, self.width, 1)
for i in range(self.nums):
- if i==0:
- sp = spx[i]
- else:
- sp = sp + spx[i]
- sp = self.convs[i](sp)
- sp = self.relu(self.bns[i](sp))
- if i==0:
- out = sp
- else:
- out = torch.cat((out,sp),1)
+ if i == 0:
+ sp = spx[i]
+ else:
+ sp = sp + spx[i]
+ sp = self.convs[i](sp)
+ sp = self.relu(self.bns[i](sp))
+ if i == 0:
+ out = sp
+ else:
+ out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
@@ -90,22 +83,22 @@ class BasicBlockERes2NetV2(nn.Module):
return out
-class BasicBlockERes2NetV2AFF(nn.Module):
+class BasicBlockERes2NetV2AFF(nn.Module):
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
super(BasicBlockERes2NetV2AFF, self).__init__()
- width = int(math.floor(planes*(baseWidth/64.0)))
- self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
- self.bn1 = nn.BatchNorm2d(width*scale)
+ width = int(math.floor(planes * (baseWidth / 64.0)))
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
+ self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
self.expansion = expansion
- convs=[]
- fuse_models=[]
- bns=[]
+ convs = []
+ fuse_models = []
+ bns = []
for i in range(self.nums):
- convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
- bns.append(nn.BatchNorm2d(width))
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
+ bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width, r=4))
@@ -113,18 +106,15 @@ class BasicBlockERes2NetV2AFF(nn.Module):
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
-
- self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
- self.bn3 = nn.BatchNorm2d(planes*self.expansion)
+
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
- nn.Conv2d(in_planes,
- self.expansion * planes,
- kernel_size=1,
- stride=stride,
- bias=False),
- nn.BatchNorm2d(self.expansion * planes))
+ nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(self.expansion * planes),
+ )
self.stride = stride
self.width = width
self.scale = scale
@@ -135,19 +125,19 @@ class BasicBlockERes2NetV2AFF(nn.Module):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
- spx = torch.split(out,self.width,1)
+ spx = torch.split(out, self.width, 1)
for i in range(self.nums):
- if i==0:
+ if i == 0:
sp = spx[i]
else:
- sp = self.fuse_models[i-1](sp, spx[i])
-
+ sp = self.fuse_models[i - 1](sp, spx[i])
+
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
- if i==0:
+ if i == 0:
out = sp
else:
- out = torch.cat((out,sp),1)
+ out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
@@ -158,19 +148,22 @@ class BasicBlockERes2NetV2AFF(nn.Module):
return out
+
class ERes2NetV2(nn.Module):
- def __init__(self,
- block=BasicBlockERes2NetV2,
- block_fuse=BasicBlockERes2NetV2AFF,
- num_blocks=[3, 4, 6, 3],
- m_channels=64,
- feat_dim=80,
- embedding_size=192,
- baseWidth=26,
- scale=2,
- expansion=2,
- pooling_func='TSTP',
- two_emb_layer=False):
+ def __init__(
+ self,
+ block=BasicBlockERes2NetV2,
+ block_fuse=BasicBlockERes2NetV2AFF,
+ num_blocks=[3, 4, 6, 3],
+ m_channels=64,
+ feat_dim=80,
+ embedding_size=192,
+ baseWidth=26,
+ scale=2,
+ expansion=2,
+ pooling_func="TSTP",
+ two_emb_layer=False,
+ ):
super(ERes2NetV2, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
@@ -181,42 +174,29 @@ class ERes2NetV2(nn.Module):
self.scale = scale
self.expansion = expansion
- self.conv1 = nn.Conv2d(1,
- m_channels,
- kernel_size=3,
- stride=1,
- padding=1,
- bias=False)
+ self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
- self.layer1 = self._make_layer(block,
- m_channels,
- num_blocks[0],
- stride=1)
- self.layer2 = self._make_layer(block,
- m_channels * 2,
- num_blocks[1],
- stride=2)
- self.layer3 = self._make_layer(block_fuse,
- m_channels * 4,
- num_blocks[2],
- stride=2)
- self.layer4 = self._make_layer(block_fuse,
- m_channels * 8,
- num_blocks[3],
- stride=2)
+ self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
+ self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
+ self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
+ self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
# Downsampling module
- self.layer3_ds = nn.Conv2d(m_channels * 4 * self.expansion, m_channels * 8 * self.expansion, kernel_size=3, \
- padding=1, stride=2, bias=False)
+ self.layer3_ds = nn.Conv2d(
+ m_channels * 4 * self.expansion,
+ m_channels * 8 * self.expansion,
+ kernel_size=3,
+ padding=1,
+ stride=2,
+ bias=False,
+ )
# Bottom-up fusion module
self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
- self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
- self.pool = getattr(pooling_layers, pooling_func)(
- in_dim=self.stats_dim * self.expansion)
- self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats,
- embedding_size)
+ self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
+ self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * self.expansion)
+ self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embedding_size)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
self.seg_2 = nn.Linear(embedding_size, embedding_size)
@@ -228,7 +208,11 @@ class ERes2NetV2(nn.Module):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
- layers.append(block(self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion))
+ layers.append(
+ block(
+ self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion
+ )
+ )
self.in_planes = planes * self.expansion
return nn.Sequential(*layers)
@@ -264,7 +248,7 @@ class ERes2NetV2(nn.Module):
out3_ds = self.layer3_ds(out3)
fuse_out34 = self.fuse34(out4, out3_ds)
# print(111111111,fuse_out34.shape)#111111111 torch.Size([16, 2048, 10, 72])
- return fuse_out34.flatten(start_dim=1,end_dim=2).mean(-1)
+ return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)
# stats = self.pool(fuse_out34)
#
# embed_a = self.seg_1(stats)
@@ -276,17 +260,13 @@ class ERes2NetV2(nn.Module):
# else:
# return embed_a
-if __name__ == '__main__':
+if __name__ == "__main__":
x = torch.randn(1, 300, 80)
model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
model.eval()
y = model(x)
print(y.size())
- macs, num_params = profile(model, inputs=(x, ))
- print("Params: {} M".format(num_params / 1e6)) # 17.86 M
- print("MACs: {} G".format(macs / 1e9)) # 12.69 G
-
-
-
-
+ macs, num_params = profile(model, inputs=(x,))
+ print("Params: {} M".format(num_params / 1e6)) # 17.86 M
+ print("MACs: {} G".format(macs / 1e9)) # 12.69 G
diff --git a/GPT_SoVITS/eres2net/ERes2Net_huge.py b/GPT_SoVITS/eres2net/ERes2Net_huge.py
index 0ea82d39..0f04236b 100644
--- a/GPT_SoVITS/eres2net/ERes2Net_huge.py
+++ b/GPT_SoVITS/eres2net/ERes2Net_huge.py
@@ -1,14 +1,13 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
- ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
- The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
- The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
- ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
- recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
+"""Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
+ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
+The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
+The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
+ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
+recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
"""
-import pdb
import torch
import math
@@ -17,15 +16,14 @@ import torch.nn.functional as F
import pooling_layers as pooling_layers
from fusion import AFF
-class ReLU(nn.Hardtanh):
+class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
- inplace_str = 'inplace' if self.inplace else ''
- return self.__class__.__name__ + ' (' \
- + inplace_str + ')'
+ inplace_str = "inplace" if self.inplace else ""
+ return self.__class__.__name__ + " (" + inplace_str + ")"
class BasicBlockERes2Net(nn.Module):
@@ -33,27 +31,28 @@ class BasicBlockERes2Net(nn.Module):
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
super(BasicBlockERes2Net, self).__init__()
- width = int(math.floor(planes*(baseWidth/64.0)))
- self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
- self.bn1 = nn.BatchNorm2d(width*scale)
+ width = int(math.floor(planes * (baseWidth / 64.0)))
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
+ self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
- convs=[]
- bns=[]
+ convs = []
+ bns = []
for i in range(self.nums):
- convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
- bns.append(nn.BatchNorm2d(width))
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
+ bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
-
- self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
- self.bn3 = nn.BatchNorm2d(planes*self.expansion)
+
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
- nn.BatchNorm2d(self.expansion * planes))
+ nn.BatchNorm2d(self.expansion * planes),
+ )
self.stride = stride
self.width = width
self.scale = scale
@@ -64,18 +63,18 @@ class BasicBlockERes2Net(nn.Module):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
- spx = torch.split(out,self.width,1)
+ spx = torch.split(out, self.width, 1)
for i in range(self.nums):
- if i==0:
- sp = spx[i]
- else:
- sp = sp + spx[i]
- sp = self.convs[i](sp)
- sp = self.relu(self.bns[i](sp))
- if i==0:
- out = sp
- else:
- out = torch.cat((out,sp),1)
+ if i == 0:
+ sp = spx[i]
+ else:
+ sp = sp + spx[i]
+ sp = self.convs[i](sp)
+ sp = self.relu(self.bns[i](sp))
+ if i == 0:
+ out = sp
+ else:
+ out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
@@ -86,22 +85,23 @@ class BasicBlockERes2Net(nn.Module):
return out
+
class BasicBlockERes2Net_diff_AFF(nn.Module):
expansion = 4
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
super(BasicBlockERes2Net_diff_AFF, self).__init__()
- width = int(math.floor(planes*(baseWidth/64.0)))
- self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
- self.bn1 = nn.BatchNorm2d(width*scale)
+ width = int(math.floor(planes * (baseWidth / 64.0)))
+ self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
+ self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
- convs=[]
- fuse_models=[]
- bns=[]
+ convs = []
+ fuse_models = []
+ bns = []
for i in range(self.nums):
- convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
- bns.append(nn.BatchNorm2d(width))
+ convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
+ bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width))
@@ -109,14 +109,15 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
-
- self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
- self.bn3 = nn.BatchNorm2d(planes*self.expansion)
+
+ self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
- nn.BatchNorm2d(self.expansion * planes))
+ nn.BatchNorm2d(self.expansion * planes),
+ )
self.stride = stride
self.width = width
self.scale = scale
@@ -127,20 +128,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
- spx = torch.split(out,self.width,1)
+ spx = torch.split(out, self.width, 1)
for i in range(self.nums):
- if i==0:
+ if i == 0:
sp = spx[i]
else:
- sp = self.fuse_models[i-1](sp, spx[i])
-
+ sp = self.fuse_models[i - 1](sp, spx[i])
+
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
- if i==0:
+ if i == 0:
out = sp
else:
- out = torch.cat((out,sp),1)
-
+ out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
@@ -151,16 +151,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
return out
+
class ERes2Net(nn.Module):
- def __init__(self,
- block=BasicBlockERes2Net,
- block_fuse=BasicBlockERes2Net_diff_AFF,
- num_blocks=[3, 4, 6, 3],
- m_channels=64,
- feat_dim=80,
- embedding_size=192,
- pooling_func='TSTP',
- two_emb_layer=False):
+ def __init__(
+ self,
+ block=BasicBlockERes2Net,
+ block_fuse=BasicBlockERes2Net_diff_AFF,
+ num_blocks=[3, 4, 6, 3],
+ m_channels=64,
+ feat_dim=80,
+ embedding_size=192,
+ pooling_func="TSTP",
+ two_emb_layer=False,
+ ):
super(ERes2Net, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
@@ -176,17 +179,22 @@ class ERes2Net(nn.Module):
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
- self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
- self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
- self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False)
+ self.layer1_downsample = nn.Conv2d(
+ m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
+ )
+ self.layer2_downsample = nn.Conv2d(
+ m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
+ )
+ self.layer3_downsample = nn.Conv2d(
+ m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False
+ )
self.fuse_mode12 = AFF(channels=m_channels * 8)
self.fuse_mode123 = AFF(channels=m_channels * 16)
self.fuse_mode1234 = AFF(channels=m_channels * 32)
- self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
- self.pool = getattr(pooling_layers, pooling_func)(
- in_dim=self.stats_dim * block.expansion)
+ self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
+ self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
@@ -229,7 +237,7 @@ class ERes2Net(nn.Module):
else:
return embed_a
- def forward2(self, x,if_mean):
+ def forward2(self, x, if_mean):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = x.unsqueeze_(1)
@@ -243,14 +251,13 @@ class ERes2Net(nn.Module):
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
- fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2)#bs,20480,T
- if(if_mean==False):
- mean=fuse_out1234[0].transpose(1,0)#(T,20480),bs=T
+ fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2) # bs,20480,T
+ if if_mean == False:
+ mean = fuse_out1234[0].transpose(1, 0) # (T,20480),bs=T
else:
- mean = fuse_out1234.mean(2)#bs,20480
- mean_std=torch.cat([mean,torch.zeros_like(mean)],1)
- return self.seg_1(mean_std)#(T,192)
-
+ mean = fuse_out1234.mean(2) # bs,20480
+ mean_std = torch.cat([mean, torch.zeros_like(mean)], 1)
+ return self.seg_1(mean_std) # (T,192)
# stats = self.pool(fuse_out1234)
# if self.two_emb_layer:
@@ -275,12 +282,8 @@ class ERes2Net(nn.Module):
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
- fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1)
+ fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
return fuse_out1234
# print(fuse_out1234.shape)
# print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape)
# pdb.set_trace()
-
-
-
-
diff --git a/GPT_SoVITS/eres2net/fusion.py b/GPT_SoVITS/eres2net/fusion.py
index 2aff7a72..d156a55c 100644
--- a/GPT_SoVITS/eres2net/fusion.py
+++ b/GPT_SoVITS/eres2net/fusion.py
@@ -6,7 +6,6 @@ import torch.nn as nn
class AFF(nn.Module):
-
def __init__(self, channels=64, r=4):
super(AFF, self).__init__()
inter_channels = int(channels // r)
@@ -23,7 +22,6 @@ class AFF(nn.Module):
xa = torch.cat((x, ds_y), dim=1)
x_att = self.local_att(xa)
x_att = 1.0 + torch.tanh(x_att)
- xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0-x_att)
+ xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
return xo
-
diff --git a/GPT_SoVITS/eres2net/kaldi.py b/GPT_SoVITS/eres2net/kaldi.py
index 58664c0c..a80e5e6b 100644
--- a/GPT_SoVITS/eres2net/kaldi.py
+++ b/GPT_SoVITS/eres2net/kaldi.py
@@ -144,7 +144,7 @@ def _get_waveform_and_window_properties(
)
assert 0 < window_shift, "`window_shift` must be greater than 0"
assert padded_window_size % 2 == 0, (
- "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`"
+ "the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`"
)
assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
@@ -441,7 +441,9 @@ def get_mel_banks(
high_freq: float,
vtln_low: float,
vtln_high: float,
- vtln_warp_factor: float,device=None,dtype=None
+ vtln_warp_factor: float,
+ device=None,
+ dtype=None,
) -> Tuple[Tensor, Tensor]:
"""
Returns:
@@ -457,9 +459,9 @@ def get_mel_banks(
if high_freq <= 0.0:
high_freq += nyquist
- assert (
- (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq)
- ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
+ assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), (
+ "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
+ )
# fft-bin width [think of it as Nyquist-freq / half-window-length]
fft_bin_width = sample_freq / window_length_padded
@@ -475,7 +477,7 @@ def get_mel_banks(
assert vtln_warp_factor == 1.0 or (
(low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
- ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format(
+ ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format(
vtln_low, vtln_high, low_freq, high_freq
)
@@ -508,9 +510,12 @@ def get_mel_banks(
bins[up_idx] = up_slope[up_idx]
bins[down_idx] = down_slope[down_idx]
- return bins.to(device=device,dtype=dtype)#, center_freqs
+ return bins.to(device=device, dtype=dtype) # , center_freqs
+
+
+cache = {}
+
-cache={}
def fbank(
waveform: Tensor,
blackman_coeff: float = 0.42,
@@ -620,14 +625,34 @@ def fbank(
# size (num_mel_bins, padded_window_size // 2)
# print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
- cache_key="%s-%s-%s-%s-%s-%s-%s-%s-%s-%s"%(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype)
+ cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
+ num_mel_bins,
+ padded_window_size,
+ sample_frequency,
+ low_freq,
+ high_freq,
+ vtln_low,
+ vtln_high,
+ vtln_warp,
+ device,
+ dtype,
+ )
if cache_key not in cache:
mel_energies = get_mel_banks(
- num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype
+ num_mel_bins,
+ padded_window_size,
+ sample_frequency,
+ low_freq,
+ high_freq,
+ vtln_low,
+ vtln_high,
+ vtln_warp,
+ device,
+ dtype,
)
- cache[cache_key]=mel_energies
+ cache[cache_key] = mel_energies
else:
- mel_energies=cache[cache_key]
+ mel_energies = cache[cache_key]
# pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)
diff --git a/GPT_SoVITS/eres2net/pooling_layers.py b/GPT_SoVITS/eres2net/pooling_layers.py
index 1fa0e7d6..c3e0eab6 100644
--- a/GPT_SoVITS/eres2net/pooling_layers.py
+++ b/GPT_SoVITS/eres2net/pooling_layers.py
@@ -1,7 +1,7 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
+"""This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
import torch
import torch.nn as nn
@@ -11,6 +11,7 @@ class TAP(nn.Module):
"""
Temporal average pooling, only first-order mean is considered
"""
+
def __init__(self, **kwargs):
super(TAP, self).__init__()
@@ -25,6 +26,7 @@ class TSDP(nn.Module):
"""
Temporal standard deviation pooling, only second-order std is considered
"""
+
def __init__(self, **kwargs):
super(TSDP, self).__init__()
@@ -41,6 +43,7 @@ class TSTP(nn.Module):
x-vector
Comment: simple concatenation can not make full use of both statistics
"""
+
def __init__(self, **kwargs):
super(TSTP, self).__init__()
@@ -56,9 +59,10 @@ class TSTP(nn.Module):
class ASTP(nn.Module):
- """ Attentive statistics pooling: Channel- and context-dependent
- statistics pooling, first used in ECAPA_TDNN.
+ """Attentive statistics pooling: Channel- and context-dependent
+ statistics pooling, first used in ECAPA_TDNN.
"""
+
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
super(ASTP, self).__init__()
self.global_context_att = global_context_att
@@ -66,15 +70,10 @@ class ASTP(nn.Module):
# Use Conv1d with stride == 1 rather than Linear, then we don't
# need to transpose inputs.
if global_context_att:
- self.linear1 = nn.Conv1d(
- in_dim * 3, bottleneck_dim,
- kernel_size=1) # equals W and b in the paper
+ self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1) # equals W and b in the paper
else:
- self.linear1 = nn.Conv1d(
- in_dim, bottleneck_dim,
- kernel_size=1) # equals W and b in the paper
- self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
- kernel_size=1) # equals V and k in the paper
+ self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper
+ self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper
def forward(self, x):
"""
@@ -88,15 +87,13 @@ class ASTP(nn.Module):
if self.global_context_att:
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
- context_std = torch.sqrt(
- torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
+ context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
x_in = torch.cat((x, context_mean, context_std), dim=1)
else:
x_in = x
# DON'T use ReLU here! ReLU may be hard to converge.
- alpha = torch.tanh(
- self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
+ alpha = torch.tanh(self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
alpha = torch.softmax(self.linear2(alpha), dim=2)
mean = torch.sum(alpha * x, dim=2)
var = torch.sum(alpha * (x**2), dim=2) - mean**2
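
The docstrings above describe TSTP as the concatenation of the per-channel temporal mean and standard deviation (the x-vector statistic). Its `forward` is not shown in this hunk; the following is only a minimal sketch of that statistic, under the assumption of a `(batch, channels, time)` input:

```python
import torch


def temporal_statistics_pooling(x: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    # x: (batch, channels, time) -> (batch, 2 * channels): [mean, std] over time.
    mean = x.mean(dim=-1)
    std = torch.sqrt(x.var(dim=-1) + eps)
    return torch.cat([mean, std], dim=-1)
```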
diff --git a/GPT_SoVITS/export_torch_script_v3v4.py b/GPT_SoVITS/export_torch_script_v3v4.py
index 55d27282..89cb4b03 100644
--- a/GPT_SoVITS/export_torch_script_v3v4.py
+++ b/GPT_SoVITS/export_torch_script_v3v4.py
@@ -402,7 +402,7 @@ class GPTSoVITSV3(torch.nn.Module):
chunk_len = 934 - fea_ref.shape[2]
wav_gen_list = []
idx = 0
- fea_todo = fea_todo[:,:,:-5]
+ fea_todo = fea_todo[:, :, :-5]
wav_gen_length = fea_todo.shape[2] * 256
while 1:
# current_time = datetime.now()
@@ -434,7 +434,8 @@ class GPTSoVITSV3(torch.nn.Module):
wav_gen = torch.cat(wav_gen_list, 2)
return wav_gen[0][0][:wav_gen_length]
-
+
+
class GPTSoVITSV4(torch.nn.Module):
def __init__(self, gpt_sovits_half, cfm, hifigan):
super().__init__()
@@ -461,7 +462,7 @@ class GPTSoVITSV4(torch.nn.Module):
chunk_len = 1000 - fea_ref.shape[2]
wav_gen_list = []
idx = 0
- fea_todo = fea_todo[:,:,:-10]
+ fea_todo = fea_todo[:, :, :-10]
wav_gen_length = fea_todo.shape[2] * 480
while 1:
# current_time = datetime.now()
@@ -577,6 +578,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
v3v4set = {"v3", "v4"}
+
def get_sovits_weights(sovits_path):
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@@ -699,14 +701,13 @@ def export_cfm(
return export_cfm
-def export_1(ref_wav_path,ref_wav_text,version="v3"):
+def export_1(ref_wav_path, ref_wav_text, version="v3"):
if version == "v3":
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
init_bigvgan()
else:
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
init_hifigan()
-
dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt")
raw_t2s = get_raw_t2s_model(dict_s1).to(device)
@@ -751,9 +752,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"):
# phones1, bert1, norm_text1 = get_phones_and_bert(
# "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3"
# )
- phones1, bert1, norm_text1 = get_phones_and_bert(
- ref_wav_text, "auto", "v3"
- )
+ phones1, bert1, norm_text1 = get_phones_and_bert(ref_wav_text, "auto", "v3")
phones2, bert2, norm_text2 = get_phones_and_bert(
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
"auto",
@@ -914,7 +913,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"):
hifigan_model_ = torch.jit.trace(hifigan_model, optimize=True, example_inputs=(cmf_res_rand,))
hifigan_model_.save("onnx/ad/hifigan_model.pt")
wav_gen = hifigan_model(cmf_res)
-
+
print("wav_gen:", wav_gen.shape, wav_gen.dtype)
audio = wav_gen[0][0].cpu().detach().numpy()
@@ -1201,7 +1200,6 @@ def export_2(version="v3"):
gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4
sr = 24000 if version == "v3" else 48000
-
time.sleep(5)
# print("thread:", torch.get_num_threads())
# print("thread:", torch.get_num_interop_threads())
@@ -1212,14 +1210,14 @@ def export_2(version="v3"):
"汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....",
gpt_sovits_v3v4,
"out.wav",
- sr
+ sr,
)
test_export(
"你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!",
gpt_sovits_v3v4,
"out2.wav",
- sr
+ sr,
)
# test_export(
@@ -1251,6 +1249,6 @@ def test_export_gpt_sovits_v3():
with torch.no_grad():
- export_1("onnx/ad/ref.wav","你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。","v4")
+ export_1("onnx/ad/ref.wav", "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "v4")
# export_2("v4")
# test_export_gpt_sovits_v3()
diff --git a/GPT_SoVITS/f5_tts/model/backbones/dit.py b/GPT_SoVITS/f5_tts/model/backbones/dit.py
index f64a3c39..4aa3b9ac 100644
--- a/GPT_SoVITS/f5_tts/model/backbones/dit.py
+++ b/GPT_SoVITS/f5_tts/model/backbones/dit.py
@@ -143,9 +143,9 @@ class DiT(nn.Module):
drop_audio_cond=False, # cfg for cond audio
drop_text=False, # cfg for text
# mask: bool["b n"] | None = None, # noqa: F722
- infer=False, # bool
- text_cache=None, # torch tensor as text_embed
- dt_cache=None, # torch tensor as dt
+ infer=False, # bool
+ text_cache=None, # torch tensor as text_embed
+ dt_cache=None, # torch tensor as dt
):
x = x0.transpose(2, 1)
cond = cond0.transpose(2, 1)
@@ -191,4 +191,4 @@ class DiT(nn.Module):
if infer:
return output, text_embed, dt
else:
- return output
\ No newline at end of file
+ return output
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 5c7d0103..6b9b35f7 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -1337,5 +1337,6 @@ if __name__ == "__main__":
inbrowser=True,
share=is_share,
server_port=infer_ttswebui,
+ show_api=False,
# quiet=True,
)
diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py
index 470b7bbd..608297d0 100644
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@@ -505,5 +505,6 @@ if __name__ == "__main__":
inbrowser=True,
share=is_share,
server_port=infer_ttswebui,
+ show_api=False,
# quiet=True,
)
diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py
index 81829683..46eff5fb 100644
--- a/GPT_SoVITS/module/data_utils.py
+++ b/GPT_SoVITS/module/data_utils.py
@@ -21,7 +21,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
3) computes spectrograms from audio files.
"""
- def __init__(self, hparams, version=None,val=False):
+ def __init__(self, hparams, version=None, val=False):
exp_dir = hparams.exp_dir
self.path2 = "%s/2-name2text.txt" % exp_dir
self.path4 = "%s/4-cnhubert" % exp_dir
@@ -29,7 +29,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
assert os.path.exists(self.path2)
assert os.path.exists(self.path4)
assert os.path.exists(self.path5)
- self.is_v2Pro=version in {"v2Pro","v2ProPlus"}
+ self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
if self.is_v2Pro:
self.path7 = "%s/7-sv_cn" % exp_dir
assert os.path.exists(self.path7)
@@ -118,7 +118,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
if self.is_v2Pro:
- sv_emb=torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
+ sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
except:
traceback.print_exc()
spec = torch.zeros(1025, 100)
@@ -126,10 +126,10 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
ssl = torch.zeros(1, 768, 100)
text = text[-1:]
if self.is_v2Pro:
- sv_emb=torch.zeros(1,20480)
+ sv_emb = torch.zeros(1, 20480)
print("load audio or ssl error!!!!!!", audiopath)
if self.is_v2Pro:
- return (ssl, spec, wav, text,sv_emb)
+ return (ssl, spec, wav, text, sv_emb)
else:
return (ssl, spec, wav, text)
@@ -192,9 +192,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
class TextAudioSpeakerCollate:
"""Zero-pads model inputs and targets"""
- def __init__(self, return_ids=False,version=None):
+ def __init__(self, return_ids=False, version=None):
self.return_ids = return_ids
- self.is_v2Pro=version in {"v2Pro","v2ProPlus"}
+ self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
def __call__(self, batch):
"""Collate's training batch from normalized text, audio and speaker identities
@@ -228,7 +228,7 @@ class TextAudioSpeakerCollate:
text_padded.zero_()
if self.is_v2Pro:
- sv_embs=torch.FloatTensor(len(batch),20480)
+ sv_embs = torch.FloatTensor(len(batch), 20480)
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
@@ -250,11 +250,30 @@ class TextAudioSpeakerCollate:
text_lengths[i] = text.size(0)
if self.is_v2Pro:
- sv_embs[i]=row[4]
+ sv_embs[i] = row[4]
if self.is_v2Pro:
- return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths,sv_embs
+ return (
+ ssl_padded,
+ ssl_lengths,
+ spec_padded,
+ spec_lengths,
+ wav_padded,
+ wav_lengths,
+ text_padded,
+ text_lengths,
+ sv_embs,
+ )
else:
- return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
+ return (
+ ssl_padded,
+ ssl_lengths,
+ spec_padded,
+ spec_lengths,
+ wav_padded,
+ wav_lengths,
+ text_padded,
+ text_lengths,
+ )
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
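
A toy illustration (not the repo's collate) of the version-gated return shape above: v2Pro/v2ProPlus batches carry an extra `(N, 20480)` speaker-verification block, while other versions keep the shorter tuple.

```python
import torch

def collate_sketch(batch, is_v2pro: bool):
    # Batch items are simplified here to (ssl, spec) plus an optional (20480,) sv embedding.
    ssl = torch.stack([item[0] for item in batch])
    spec = torch.stack([item[1] for item in batch])
    if is_v2pro:
        sv_embs = torch.stack([item[2] for item in batch])  # -> (N, 20480)
        return ssl, spec, sv_embs
    return ssl, spec

items = [(torch.zeros(768, 100), torch.zeros(1025, 100), torch.zeros(20480)) for _ in range(4)]
out = collate_sketch(items, is_v2pro=True)
print([t.shape for t in out])  # shapes: (4, 768, 100), (4, 1025, 100), (4, 20480)
```
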
diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py
index 4fbec59f..1c8e662f 100644
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -586,12 +586,17 @@ class DiscriminatorS(torch.nn.Module):
return x, fmap
-v2pro_set={"v2Pro","v2ProPlus"}
+
+v2pro_set = {"v2Pro", "v2ProPlus"}
+
+
class MultiPeriodDiscriminator(torch.nn.Module):
- def __init__(self, use_spectral_norm=False,version=None):
+ def __init__(self, use_spectral_norm=False, version=None):
super(MultiPeriodDiscriminator, self).__init__()
- if version in v2pro_set:periods = [2, 3, 5, 7, 11,17,23]
- else:periods = [2, 3, 5, 7, 11]
+ if version in v2pro_set:
+ periods = [2, 3, 5, 7, 11, 17, 23]
+ else:
+ periods = [2, 3, 5, 7, 11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
@@ -787,6 +792,7 @@ class CodePredictor(nn.Module):
return pred_codes.transpose(0, 1)
+
class SynthesizerTrn(nn.Module):
"""
Synthesizer for Training
@@ -886,13 +892,13 @@ class SynthesizerTrn(nn.Module):
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
self.freeze_quantizer = freeze_quantizer
- self.is_v2pro=self.version in v2pro_set
+ self.is_v2pro = self.version in v2pro_set
if self.is_v2pro:
self.sv_emb = nn.Linear(20480, gin_channels)
self.ge_to512 = nn.Linear(gin_channels, 512)
self.prelu = nn.PReLU(num_parameters=gin_channels)
- def forward(self, ssl, y, y_lengths, text, text_lengths,sv_emb=None):
+ def forward(self, ssl, y, y_lengths, text, text_lengths, sv_emb=None):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
if self.version == "v1":
ge = self.ref_enc(y * y_mask, y_mask)
@@ -952,7 +958,7 @@ class SynthesizerTrn(nn.Module):
return o, y_mask, (z, z_p, m_p, logs_p)
@torch.no_grad()
- def decode(self, codes, text, refer,noise_scale=0.5, speed=1, sv_emb=None):
+ def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=None):
def get_ge(refer, sv_emb):
ge = None
if refer is not None:
@@ -970,8 +976,8 @@ class SynthesizerTrn(nn.Module):
if type(refer) == list:
ges = []
- for idx,_refer in enumerate(refer):
- ge = get_ge(_refer, sv_emb[idx]if self.is_v2pro else None)
+ for idx, _refer in enumerate(refer):
+ ge = get_ge(_refer, sv_emb[idx] if self.is_v2pro else None)
ges.append(ge)
ge = torch.stack(ges, 0).mean(0)
else:
@@ -983,7 +989,14 @@ class SynthesizerTrn(nn.Module):
quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
- x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, self.ge_to512(ge.transpose(2,1)).transpose(2,1)if self.is_v2pro else ge, speed)
+ x, m_p, logs_p, y_mask = self.enc_p(
+ quantized,
+ y_lengths,
+ text,
+ text_lengths,
+ self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
+ speed,
+ )
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=ge, reverse=True)
@@ -996,6 +1009,7 @@ class SynthesizerTrn(nn.Module):
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0, 1)
+
class CFM(torch.nn.Module):
def __init__(self, in_channels, dit):
super().__init__()
@@ -1029,7 +1043,18 @@ class CFM(torch.nn.Module):
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
v_pred, text_emb, dt = self.estimator(
- x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False, infer=True, text_cache=text_cache, dt_cache=dt_cache
+ x,
+ prompt_x,
+ x_lens,
+ t_tensor,
+ d_tensor,
+ mu,
+ use_grad_ckpt=False,
+ drop_audio_cond=False,
+ drop_text=False,
+ infer=True,
+ text_cache=text_cache,
+ dt_cache=dt_cache,
)
v_pred = v_pred.transpose(2, 1)
if self.use_conditioner_cache:
@@ -1037,18 +1062,18 @@ class CFM(torch.nn.Module):
dt_cache = dt
if inference_cfg_rate > 1e-5:
neg, text_cfg_emb, _ = self.estimator(
- x,
- prompt_x,
- x_lens,
- t_tensor,
- d_tensor,
- mu,
- use_grad_ckpt=False,
- drop_audio_cond=True,
- drop_text=True,
- infer=True,
- text_cache=text_cfg_cache,
- dt_cache=dt_cache
+ x,
+ prompt_x,
+ x_lens,
+ t_tensor,
+ d_tensor,
+ mu,
+ use_grad_ckpt=False,
+ drop_audio_cond=True,
+ drop_text=True,
+ infer=True,
+ text_cache=text_cfg_cache,
+ dt_cache=dt_cache,
)
neg = neg.transpose(2, 1)
if self.use_conditioner_cache:
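
The `ge_to512(ge.transpose(2, 1)).transpose(2, 1)` expression in the decode hunk is simply a Linear layer applied along the channel axis of a `(B, C, 1)` tensor; a small standalone sketch, with `gin_channels = 512` assumed:

```python
import torch
import torch.nn as nn

gin_channels = 512                    # assumed value for the sketch
ge_to512 = nn.Linear(gin_channels, 512)

ge = torch.randn(2, gin_channels, 1)  # reference embedding as (B, channels, 1)
# nn.Linear acts on the last dimension, so swap the axes, project, and swap back.
ge_512 = ge_to512(ge.transpose(2, 1)).transpose(2, 1)
print(ge_512.shape)                   # torch.Size([2, 512, 1])
```
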
diff --git a/GPT_SoVITS/module/modules.py b/GPT_SoVITS/module/modules.py
index 9a94898f..6fa84a43 100644
--- a/GPT_SoVITS/module/modules.py
+++ b/GPT_SoVITS/module/modules.py
@@ -1,5 +1,4 @@
import math
-import pdb
import numpy as np
import torch
@@ -720,10 +719,10 @@ class MelStyleEncoder(nn.Module):
else:
len_ = (~mask).sum(dim=1).unsqueeze(1)
x = x.masked_fill(mask.unsqueeze(-1), 0)
- dtype=x.dtype
+ dtype = x.dtype
x = x.float()
- x=torch.div(x,len_.unsqueeze(1))
- out=x.sum(dim=1).to(dtype)
+ x = torch.div(x, len_.unsqueeze(1))
+ out = x.sum(dim=1).to(dtype)
return out
def forward(self, x, mask=None):
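
The MelStyleEncoder hunk above computes a masked temporal mean in float32 and casts back to the input dtype; the same logic in isolation:

```python
import torch

def masked_temporal_avg(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """x: (B, T, C); mask: (B, T) bool, True on padded frames."""
    len_ = (~mask).sum(dim=1).unsqueeze(1)      # (B, 1) valid frames per item
    x = x.masked_fill(mask.unsqueeze(-1), 0)
    dtype = x.dtype
    x = x.float()                               # accumulate in fp32 for numerical safety
    x = torch.div(x, len_.unsqueeze(1))         # divide before summing, as in the hunk
    return x.sum(dim=1).to(dtype)               # (B, C), back to the original dtype

x = torch.randn(2, 5, 8, dtype=torch.float16)
mask = torch.tensor([[False] * 5, [False, False, True, True, True]])
print(masked_temporal_avg(x, mask).shape)       # torch.Size([2, 8])
```
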
diff --git a/GPT_SoVITS/prepare_datasets/2-get-sv.py b/GPT_SoVITS/prepare_datasets/2-get-sv.py
index 8980833f..80b0ad69 100644
--- a/GPT_SoVITS/prepare_datasets/2-get-sv.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-sv.py
@@ -10,7 +10,6 @@ i_part = os.environ.get("i_part")
all_parts = os.environ.get("all_parts")
if "_CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
-from feature_extractor import cnhubert
opt_dir = os.environ.get("opt_dir")
sv_path = os.environ.get("sv_path")
@@ -19,19 +18,18 @@ import torch
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
import traceback
-import numpy as np
-from scipy.io import wavfile
import torchaudio
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
-from tools.my_utils import load_audio, clean_path
+from tools.my_utils import clean_path
from time import time as ttime
import shutil
from ERes2NetV2 import ERes2NetV2
import kaldi as Kaldi
+
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
dir = os.path.dirname(path)
name = os.path.basename(path)
@@ -56,37 +54,45 @@ if torch.cuda.is_available():
else:
device = "cpu"
+
class SV:
- def __init__(self,device,is_half):
- pretrained_state = torch.load(sv_path, map_location='cpu')
- embedding_model = ERes2NetV2(baseWidth=24,scale=4,expansion=4)
+ def __init__(self, device, is_half):
+ pretrained_state = torch.load(sv_path, map_location="cpu")
+ embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
embedding_model.load_state_dict(pretrained_state)
embedding_model.eval()
- self.embedding_model=embedding_model
- self.res=torchaudio.transforms.Resample(32000, 16000).to(device)
+ self.embedding_model = embedding_model
+ self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
if is_half == False:
- self.embedding_model=self.embedding_model.to(device)
+ self.embedding_model = self.embedding_model.to(device)
else:
- self.embedding_model=self.embedding_model.half().to(device)
- self.is_half=is_half
+ self.embedding_model = self.embedding_model.half().to(device)
+ self.is_half = is_half
- def compute_embedding3(self,wav):#(1,x)#-1~1
+ def compute_embedding3(self, wav): # (1,x)#-1~1
with torch.no_grad():
- wav=self.res(wav)
- if self.is_half==True:wav=wav.half()
- feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav])
+ wav = self.res(wav)
+ if self.is_half == True:
+ wav = wav.half()
+ feat = torch.stack(
+ [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
+ )
sv_emb = self.embedding_model.forward3(feat)
return sv_emb
-sv=SV(device,is_half)
+
+sv = SV(device, is_half)
+
+
def name2go(wav_name, wav_path):
sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
- if os.path.exists(sv_cn_path):return
- wav_path="%s/%s" % (wav32dir, wav_name)
- wav32k,sr0 = torchaudio.load(wav_path)
- assert sr0==32000
+ if os.path.exists(sv_cn_path):
+ return
+ wav_path = "%s/%s" % (wav32dir, wav_name)
+ wav32k, sr0 = torchaudio.load(wav_path)
+ assert sr0 == 32000
wav32k = wav32k.to(device)
- emb=sv.compute_embedding3(wav32k).cpu() # torch.Size([1, 20480])
+ emb = sv.compute_embedding3(wav32k).cpu() # torch.Size([1, 20480])
my_save(emb, sv_cn_path)
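
The feature path this script wires up, sketched without the ERes2NetV2 forward pass: resample the 32 kHz clip to 16 kHz, then compute 80-bin Kaldi fbank features per channel. `torchaudio.functional.resample` stands in for the cached `Resample` module; the fbank arguments mirror the hunk.

```python
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi

def fbank_from_32k(wav_32k: torch.Tensor) -> torch.Tensor:
    """wav_32k: (channels, samples), values in [-1, 1] at 32 kHz -> (channels, frames, 80)."""
    wav_16k = torchaudio.functional.resample(wav_32k, orig_freq=32000, new_freq=16000)
    return torch.stack(
        [kaldi.fbank(w.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0.0) for w in wav_16k]
    )

feat = fbank_from_32k(torch.randn(1, 32000) * 0.1)
print(feat.shape)  # roughly (1, 98, 80): 25 ms windows with a 10 ms hop
```
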
diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py
index ca30359c..20db9b19 100644
--- a/GPT_SoVITS/process_ckpt.py
+++ b/GPT_SoVITS/process_ckpt.py
@@ -17,15 +17,16 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
shutil.move(tmp_path, "%s/%s" % (dir, name))
-
from io import BytesIO
-model_version2byte={
- "v3":b"03",
- "v4":b"04",
- "v2Pro":b"05",
- "v2ProPlus":b"06",
+model_version2byte = {
+ "v3": b"03",
+ "v4": b"04",
+ "v2Pro": b"05",
+ "v2ProPlus": b"06",
}
+
+
def my_save2(fea, path, model_version):
bio = BytesIO()
torch.save(fea, bio)
@@ -50,7 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
if lora_rank:
opt["lora_rank"] = lora_rank
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
- elif (model_version!=None and "Pro"in model_version):
+ elif model_version != None and "Pro" in model_version:
my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
else:
my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
@@ -58,6 +59,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
except:
return traceback.format_exc()
+
"""
00:v1
01:v2
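
From the byte mapping and the version list above, `my_save2` evidently tags checkpoints with a two-byte model-version marker. The exact file layout is not shown in the hunk, so the sketch below (tag first, then the torch payload) is an assumption, and `save_tagged`/`read_tag` are illustrative names only.

```python
import torch
from io import BytesIO

MODEL_VERSION2BYTE = {"v3": b"03", "v4": b"04", "v2Pro": b"05", "v2ProPlus": b"06"}

def save_tagged(state: dict, path: str, model_version: str) -> None:
    bio = BytesIO()
    torch.save(state, bio)                          # serialize to memory first
    with open(path, "wb") as f:
        f.write(MODEL_VERSION2BYTE[model_version])  # assumed layout: 2-byte tag, then payload
        f.write(bio.getvalue())

def read_tag(path: str) -> bytes:
    with open(path, "rb") as f:
        return f.read(2)                            # e.g. b"05" -> v2Pro

save_tagged({"weight": {}}, "demo.pth", "v2Pro")
print(read_tag("demo.pth"))                         # b'05'
```
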
diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py
index 0a046049..4b9f6488 100644
--- a/GPT_SoVITS/s2_train.py
+++ b/GPT_SoVITS/s2_train.py
@@ -36,7 +36,7 @@ from module.models import (
MultiPeriodDiscriminator,
SynthesizerTrn,
)
-from process_ckpt import savee,my_save2
+from process_ckpt import savee
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = False
@@ -87,11 +87,30 @@ def run(rank, n_gpus, hps):
if torch.cuda.is_available():
torch.cuda.set_device(rank)
- train_dataset = TextAudioSpeakerLoader(hps.data,version=hps.model.version)
+ train_dataset = TextAudioSpeakerLoader(hps.data, version=hps.model.version)
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size,
- [32,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,],
+ [
+ 32,
+ 300,
+ 400,
+ 500,
+ 600,
+ 700,
+ 800,
+ 900,
+ 1000,
+ 1100,
+ 1200,
+ 1300,
+ 1400,
+ 1500,
+ 1600,
+ 1700,
+ 1800,
+ 1900,
+ ],
num_replicas=n_gpus,
rank=rank,
shuffle=True,
@@ -130,9 +149,9 @@ def run(rank, n_gpus, hps):
)
net_d = (
- MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).cuda(rank)
+ MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).cuda(rank)
if torch.cuda.is_available()
- else MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).to(device)
+ else MultiPeriodDiscriminator(hps.model.use_spectral_norm, version=hps.model.version).to(device)
)
for name, param in net_g.named_parameters():
if not param.requires_grad:
@@ -235,7 +254,7 @@ def run(rank, n_gpus, hps):
print(
"loaded pretrained %s" % hps.train.pretrained_s2D,
net_d.module.load_state_dict(
- torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],strict=False
+ torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"], strict=False
)
if torch.cuda.is_available()
else net_d.load_state_dict(
@@ -310,17 +329,44 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
net_g.train()
net_d.train()
for batch_idx, data in enumerate(tqdm(train_loader)):
- if hps.model.version in {"v2Pro","v2ProPlus"}:
- ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths,sv_emb=data
+ if hps.model.version in {"v2Pro", "v2ProPlus"}:
+ ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths, sv_emb = data
else:
- ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths=data
+ ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths = data
if torch.cuda.is_available():
- spec, spec_lengths = (spec.cuda(rank,non_blocking=True,),spec_lengths.cuda(rank,non_blocking=True,),)
- y, y_lengths = (y.cuda(rank,non_blocking=True,),y_lengths.cuda(rank,non_blocking=True,),)
+ spec, spec_lengths = (
+ spec.cuda(
+ rank,
+ non_blocking=True,
+ ),
+ spec_lengths.cuda(
+ rank,
+ non_blocking=True,
+ ),
+ )
+ y, y_lengths = (
+ y.cuda(
+ rank,
+ non_blocking=True,
+ ),
+ y_lengths.cuda(
+ rank,
+ non_blocking=True,
+ ),
+ )
ssl = ssl.cuda(rank, non_blocking=True)
ssl.requires_grad = False
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
- text, text_lengths = (text.cuda(rank,non_blocking=True,),text_lengths.cuda(rank,non_blocking=True,),)
+ text, text_lengths = (
+ text.cuda(
+ rank,
+ non_blocking=True,
+ ),
+ text_lengths.cuda(
+ rank,
+ non_blocking=True,
+ ),
+ )
if hps.model.version in {"v2Pro", "v2ProPlus"}:
sv_emb = sv_emb.cuda(rank, non_blocking=True)
else:
@@ -334,9 +380,19 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
sv_emb = sv_emb.to(device)
with autocast(enabled=hps.train.fp16_run):
if hps.model.version in {"v2Pro", "v2ProPlus"}:
- (y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl) = net_g(ssl, spec, spec_lengths, text, text_lengths,sv_emb)
+ (y_hat, kl_ssl, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), stats_ssl) = net_g(
+ ssl, spec, spec_lengths, text, text_lengths, sv_emb
+ )
else:
- (y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl,) = net_g(ssl, spec, spec_lengths, text, text_lengths)
+ (
+ y_hat,
+ kl_ssl,
+ ids_slice,
+ x_mask,
+ z_mask,
+ (z, z_p, m_p, logs_p, m_q, logs_q),
+ stats_ssl,
+ ) = net_g(ssl, spec, spec_lengths, text, text_lengths)
mel = spec_to_mel_torch(
spec,
@@ -508,7 +564,14 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
% (
hps.name,
epoch,
- savee(ckpt,hps.name + "_e%s_s%s" % (epoch, global_step),epoch,global_step,hps,model_version=None if hps.model.version not in {"v2Pro","v2ProPlus"}else hps.model.version),
+ savee(
+ ckpt,
+ hps.name + "_e%s_s%s" % (epoch, global_step),
+ epoch,
+ global_step,
+ hps,
+ model_version=None if hps.model.version not in {"v2Pro", "v2ProPlus"} else hps.model.version,
+ ),
)
)
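
The expanded `.cuda(rank, non_blocking=True)` calls above are the usual pinned-memory transfer pattern; a generic helper in the same spirit, not code from the trainer:

```python
import torch

def batch_to_cuda(batch: tuple, device_index: int = 0) -> tuple:
    """Move every tensor in a collated batch to one GPU. non_blocking=True only overlaps
    the copy with compute when the host tensors are pinned (DataLoader(pin_memory=True))."""
    if not torch.cuda.is_available():
        return batch
    return tuple(
        t.cuda(device_index, non_blocking=True) if torch.is_tensor(t) else t for t in batch
    )
```
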
diff --git a/GPT_SoVITS/sv.py b/GPT_SoVITS/sv.py
index fb5806ff..22e70369 100644
--- a/GPT_SoVITS/sv.py
+++ b/GPT_SoVITS/sv.py
@@ -1,24 +1,32 @@
-import sys,os,torch
+import sys
+import os
+import torch
+
sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net")
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
from ERes2NetV2 import ERes2NetV2
import kaldi as Kaldi
+
+
class SV:
- def __init__(self,device,is_half):
- pretrained_state = torch.load(sv_path, map_location='cpu', weights_only=False)
- embedding_model = ERes2NetV2(baseWidth=24,scale=4,expansion=4)
+ def __init__(self, device, is_half):
+ pretrained_state = torch.load(sv_path, map_location="cpu", weights_only=False)
+ embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
embedding_model.load_state_dict(pretrained_state)
embedding_model.eval()
- self.embedding_model=embedding_model
+ self.embedding_model = embedding_model
if is_half == False:
- self.embedding_model=self.embedding_model.to(device)
+ self.embedding_model = self.embedding_model.to(device)
else:
- self.embedding_model=self.embedding_model.half().to(device)
- self.is_half=is_half
+ self.embedding_model = self.embedding_model.half().to(device)
+ self.is_half = is_half
- def compute_embedding3(self,wav):
+ def compute_embedding3(self, wav):
with torch.no_grad():
- if self.is_half==True:wav=wav.half()
- feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav])
+ if self.is_half == True:
+ wav = wav.half()
+ feat = torch.stack(
+ [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
+ )
sv_emb = self.embedding_model.forward3(feat)
return sv_emb
diff --git a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
index 88b93794..0187ea69 100644
--- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
+++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
@@ -3,38 +3,44 @@ import re
# jieba静音
import jieba
+
jieba.setLogLevel(logging.CRITICAL)
# 更改fast_langdetect大模型位置
from pathlib import Path
import fast_langdetect
-fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
+
+fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
+ fast_langdetect.infer.LangDetectConfig(
+ cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
+ )
+)
from split_lang import LangSplitter
def full_en(text):
- pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
+ pattern = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
return bool(re.match(pattern, text))
def full_cjk(text):
# 来自wiki
cjk_ranges = [
- (0x4E00, 0x9FFF), # CJK Unified Ideographs
- (0x3400, 0x4DB5), # CJK Extension A
- (0x20000, 0x2A6DD), # CJK Extension B
- (0x2A700, 0x2B73F), # CJK Extension C
- (0x2B740, 0x2B81F), # CJK Extension D
- (0x2B820, 0x2CEAF), # CJK Extension E
- (0x2CEB0, 0x2EBEF), # CJK Extension F
- (0x30000, 0x3134A), # CJK Extension G
- (0x31350, 0x323AF), # CJK Extension H
- (0x2EBF0, 0x2EE5D), # CJK Extension H
+ (0x4E00, 0x9FFF), # CJK Unified Ideographs
+ (0x3400, 0x4DB5), # CJK Extension A
+ (0x20000, 0x2A6DD), # CJK Extension B
+ (0x2A700, 0x2B73F), # CJK Extension C
+ (0x2B740, 0x2B81F), # CJK Extension D
+ (0x2B820, 0x2CEAF), # CJK Extension E
+ (0x2CEB0, 0x2EBEF), # CJK Extension F
+ (0x30000, 0x3134A), # CJK Extension G
+ (0x31350, 0x323AF), # CJK Extension H
+ (0x2EBF0, 0x2EE5D), # CJK Extension H
]
- pattern = r'[0-9、-〜。!?.!?… /]+$'
+ pattern = r"[0-9、-〜。!?.!?… /]+$"
cjk_text = ""
for char in text:
@@ -45,7 +51,7 @@ def full_cjk(text):
return cjk_text
-def split_jako(tag_lang,item):
+def split_jako(tag_lang, item):
if tag_lang == "ja":
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
else:
@@ -53,41 +59,40 @@ def split_jako(tag_lang,item):
lang_list: list[dict] = []
tag = 0
- for match in re.finditer(pattern, item['text']):
+ for match in re.finditer(pattern, item["text"]):
if match.start() > tag:
- lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
+ lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
tag = match.end()
- lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
+ lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
- if tag < len(item['text']):
- lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
+ if tag < len(item["text"]):
+ lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
return lang_list
def merge_lang(lang_list, item):
- if lang_list and item['lang'] == lang_list[-1]['lang']:
- lang_list[-1]['text'] += item['text']
+ if lang_list and item["lang"] == lang_list[-1]["lang"]:
+ lang_list[-1]["text"] += item["text"]
else:
lang_list.append(item)
return lang_list
-class LangSegmenter():
+class LangSegmenter:
# 默认过滤器, 基于gsv目前四种语言
DEFAULT_LANG_MAP = {
"zh": "zh",
"yue": "zh", # 粤语
"wuu": "zh", # 吴语
"zh-cn": "zh",
- "zh-tw": "x", # 繁体设置为x
+ "zh-tw": "x", # 繁体设置为x
"ko": "ko",
"ja": "ja",
"en": "en",
}
-
def getTexts(text):
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
substr = lang_splitter.split_by_lang(text=text)
@@ -95,18 +100,18 @@ class LangSegmenter():
lang_list: list[dict] = []
for _, item in enumerate(substr):
- dict_item = {'lang':item.lang,'text':item.text}
+ dict_item = {"lang": item.lang, "text": item.text}
# 处理短英文被识别为其他语言的问题
- if full_en(dict_item['text']):
- dict_item['lang'] = 'en'
- lang_list = merge_lang(lang_list,dict_item)
+ if full_en(dict_item["text"]):
+ dict_item["lang"] = "en"
+ lang_list = merge_lang(lang_list, dict_item)
continue
# 处理非日语夹日文的问题(不包含CJK)
ja_list: list[dict] = []
- if dict_item['lang'] != 'ja':
- ja_list = split_jako('ja',dict_item)
+ if dict_item["lang"] != "ja":
+ ja_list = split_jako("ja", dict_item)
if not ja_list:
ja_list.append(dict_item)
@@ -115,8 +120,8 @@ class LangSegmenter():
ko_list: list[dict] = []
temp_list: list[dict] = []
for _, ko_item in enumerate(ja_list):
- if ko_item["lang"] != 'ko':
- ko_list = split_jako('ko',ko_item)
+ if ko_item["lang"] != "ko":
+ ko_list = split_jako("ko", ko_item)
if ko_list:
temp_list.extend(ko_list)
@@ -126,50 +131,50 @@ class LangSegmenter():
# 未存在非日韩文夹日韩文
if len(temp_list) == 1:
# 未知语言检查是否为CJK
- if dict_item['lang'] == 'x':
- cjk_text = full_cjk(dict_item['text'])
+ if dict_item["lang"] == "x":
+ cjk_text = full_cjk(dict_item["text"])
if cjk_text:
- dict_item = {'lang':'zh','text':cjk_text}
- lang_list = merge_lang(lang_list,dict_item)
+ dict_item = {"lang": "zh", "text": cjk_text}
+ lang_list = merge_lang(lang_list, dict_item)
else:
- lang_list = merge_lang(lang_list,dict_item)
+ lang_list = merge_lang(lang_list, dict_item)
continue
else:
- lang_list = merge_lang(lang_list,dict_item)
+ lang_list = merge_lang(lang_list, dict_item)
continue
# 存在非日韩文夹日韩文
for _, temp_item in enumerate(temp_list):
# 未知语言检查是否为CJK
- if temp_item['lang'] == 'x':
- cjk_text = full_cjk(dict_item['text'])
+ if temp_item["lang"] == "x":
+ cjk_text = full_cjk(dict_item["text"])
if cjk_text:
- dict_item = {'lang':'zh','text':cjk_text}
- lang_list = merge_lang(lang_list,dict_item)
+ dict_item = {"lang": "zh", "text": cjk_text}
+ lang_list = merge_lang(lang_list, dict_item)
else:
- lang_list = merge_lang(lang_list,dict_item)
+ lang_list = merge_lang(lang_list, dict_item)
else:
- lang_list = merge_lang(lang_list,temp_item)
+ lang_list = merge_lang(lang_list, temp_item)
temp_list = lang_list
lang_list = []
for _, temp_item in enumerate(temp_list):
- if temp_item['lang'] == 'x':
+ if temp_item["lang"] == "x":
if lang_list:
- temp_item['lang'] = lang_list[-1]['lang']
+ temp_item["lang"] = lang_list[-1]["lang"]
elif len(temp_list) > 1:
- temp_item['lang'] = temp_list[1]['lang']
+ temp_item["lang"] = temp_list[1]["lang"]
else:
- temp_item['lang'] = 'zh'
+ temp_item["lang"] = "zh"
- lang_list = merge_lang(lang_list,temp_item)
+ lang_list = merge_lang(lang_list, temp_item)
return lang_list
-
+
if __name__ == "__main__":
text = "MyGO?,你也喜欢まいご吗?"
print(LangSegmenter.getTexts(text))
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
- print(LangSegmenter.getTexts(text))
\ No newline at end of file
+ print(LangSegmenter.getTexts(text))
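
How `merge_lang` behaves once the reformat is applied, shown standalone (the function body is copied from the hunk; the input segments are invented):

```python
def merge_lang(lang_list: list[dict], item: dict) -> list[dict]:
    # Fold the new segment into the previous one when the language tag repeats.
    if lang_list and item["lang"] == lang_list[-1]["lang"]:
        lang_list[-1]["text"] += item["text"]
    else:
        lang_list.append(item)
    return lang_list

segments: list[dict] = []
for piece in (
    {"lang": "zh", "text": "你也喜欢"},
    {"lang": "ja", "text": "まい"},
    {"lang": "ja", "text": "ご"},
):
    segments = merge_lang(segments, piece)
print(segments)  # [{'lang': 'zh', 'text': '你也喜欢'}, {'lang': 'ja', 'text': 'まいご'}]
```
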
diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py
index a8268107..52eed443 100644
--- a/GPT_SoVITS/text/g2pw/onnx_api.py
+++ b/GPT_SoVITS/text/g2pw/onnx_api.py
@@ -3,7 +3,6 @@
import json
import os
-import traceback
import warnings
import zipfile
from typing import Any, Dict, List, Tuple
@@ -23,8 +22,9 @@ from .utils import load_config
onnxruntime.set_default_logger_severity(3)
try:
onnxruntime.preload_dlls()
-except:pass
- #traceback.print_exc()
+except:
+ pass
+ # traceback.print_exc()
warnings.filterwarnings("ignore")
model_version = "1.1"
diff --git a/GPT_SoVITS/text/tone_sandhi.py b/GPT_SoVITS/text/tone_sandhi.py
index e9a279c6..4ed73781 100644
--- a/GPT_SoVITS/text/tone_sandhi.py
+++ b/GPT_SoVITS/text/tone_sandhi.py
@@ -655,11 +655,7 @@ class ToneSandhi:
while i < len(seg):
word, pos = seg[i]
merged = False
- if (
- i - 1 >= 0
- and word == "一"
- and i + 1 < len(seg)
- ):
+ if i - 1 >= 0 and word == "一" and i + 1 < len(seg):
last = new_seg[-1] if new_seg else seg[i - 1]
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
combined = last[0] + "一" + seg[i + 1][0]
diff --git a/README.md b/README.md
index d03514f0..7c845800 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+#
+
GPT-SoVITS-WebUI
@@ -7,12 +9,17 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

-
+[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
+[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
+[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[](https://discord.gg/dnrgs5GHfG)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+
+[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
+[](https://rentry.co/GPT-SoVITS-guide#/)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
**English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
@@ -20,7 +27,7 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
---
-## Features:
+## Features
1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
@@ -34,13 +41,13 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
Unseen speakers few-shot fine-tuning demo:
-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
-**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+
## Installation
-For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
+For users in China, you can use [AutoDL Cloud Docker](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to experience the full functionality online.
### Tested Environments
@@ -171,7 +178,7 @@ docker exec -it
```
-if you want to switch to V1,then
-
-```bash
-python webui.py v1
-```
-
-Or maunally switch version in WebUI
-
### Finetune
#### Path Auto-filling is now supported
@@ -253,7 +247,7 @@ Or maunally switch version in WebUI
#### Integrated Package Users
-Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
+Double-click `go-webui.bat` or use `go-webui.ps1` , then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
#### Others
@@ -333,7 +327,7 @@ Use v4 from v1/v2/v3 environment:
New Features:
1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
- [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
+   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
2.v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timebre of v3/v4 lean more toward the reference audio rather than the overall training set.
@@ -373,11 +367,6 @@ Use the command line to open the WebUI for UVR5
python tools/uvr5/webui.py ""
```
-
-
This is how the audio segmentation of the dataset is done using the command line
```bash
@@ -453,5 +442,5 @@ Thankful to @Naozumi520 for providing the Cantonese training set and for the gui
## Thanks to all contributors for their efforts
-
+
diff --git a/api.py b/api.py
index b7e94e77..dc2e8826 100644
--- a/api.py
+++ b/api.py
@@ -199,6 +199,8 @@ def is_full(*items): # 任意一项为空返回False
bigvgan_model = hifigan_model = sv_cn_model = None
+
+
def clean_hifigan_model():
global hifigan_model
if hifigan_model:
@@ -208,6 +210,8 @@ def clean_hifigan_model():
torch.cuda.empty_cache()
except:
pass
+
+
def clean_bigvgan_model():
global bigvgan_model
if bigvgan_model:
@@ -217,6 +221,8 @@ def clean_bigvgan_model():
torch.cuda.empty_cache()
except:
pass
+
+
def clean_sv_cn_model():
global sv_cn_model
if sv_cn_model:
@@ -229,7 +235,7 @@ def clean_sv_cn_model():
def init_bigvgan():
- global bigvgan_model, hifigan_model,sv_cn_model
+ global bigvgan_model, hifigan_model, sv_cn_model
from BigVGAN import bigvgan
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
@@ -247,7 +253,7 @@ def init_bigvgan():
def init_hifigan():
- global hifigan_model, bigvgan_model,sv_cn_model
+ global hifigan_model, bigvgan_model, sv_cn_model
hifigan_model = Generator(
initial_channel=100,
resblock="1",
@@ -262,7 +268,9 @@ def init_hifigan():
hifigan_model.eval()
hifigan_model.remove_weight_norm()
state_dict_g = torch.load(
- "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
+ "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+ map_location="cpu",
+ weights_only=False,
)
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
if is_half == True:
@@ -272,19 +280,21 @@ def init_hifigan():
from sv import SV
+
+
def init_sv_cn():
global hifigan_model, bigvgan_model, sv_cn_model
sv_cn_model = SV(device, is_half)
-resample_transform_dict={}
-def resample(audio_tensor, sr0,sr1,device):
+resample_transform_dict = {}
+
+
+def resample(audio_tensor, sr0, sr1, device):
global resample_transform_dict
- key="%s-%s-%s"%(sr0,sr1,str(device))
+ key = "%s-%s-%s" % (sr0, sr1, str(device))
if key not in resample_transform_dict:
- resample_transform_dict[key] = torchaudio.transforms.Resample(
- sr0, sr1
- ).to(device)
+ resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
return resample_transform_dict[key](audio_tensor)
@@ -370,6 +380,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
def get_sovits_weights(sovits_path):
from config import pretrained_sovits_name
+
path_sovits_v3 = pretrained_sovits_name["v3"]
path_sovits_v4 = pretrained_sovits_name["v4"]
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@@ -628,15 +639,17 @@ class DictToAttrRecursive(dict):
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
- sr1=int(hps.data.sampling_rate)
- audio, sr0=torchaudio.load(filename)
- if sr0!=sr1:
- audio=audio.to(device)
- if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
- audio=resample(audio,sr0,sr1,device)
+ sr1 = int(hps.data.sampling_rate)
+ audio, sr0 = torchaudio.load(filename)
+ if sr0 != sr1:
+ audio = audio.to(device)
+ if audio.shape[0] == 2:
+ audio = audio.mean(0).unsqueeze(0)
+ audio = resample(audio, sr0, sr1, device)
else:
- audio=audio.to(device)
- if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
+ audio = audio.to(device)
+ if audio.shape[0] == 2:
+ audio = audio.mean(0).unsqueeze(0)
maxx = audio.abs().max()
if maxx > 1:
@@ -649,10 +662,10 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
hps.data.win_length,
center=False,
)
- spec=spec.to(dtype)
- if is_v2pro==True:
- audio=resample(audio,sr1,16000,device).to(dtype)
- return spec,audio
+ spec = spec.to(dtype)
+ if is_v2pro == True:
+ audio = resample(audio, sr1, 16000, device).to(dtype)
+ return spec, audio
def pack_audio(audio_bytes, data, rate):
@@ -872,29 +885,29 @@ def get_tts_wav(
prompt_semantic = codes[0, 0]
prompt = prompt_semantic.unsqueeze(0).to(device)
- is_v2pro = version in {"v2Pro","v2ProPlus"}
+ is_v2pro = version in {"v2Pro", "v2ProPlus"}
if version not in {"v3", "v4"}:
refers = []
if is_v2pro:
- sv_emb= []
+ sv_emb = []
if sv_cn_model == None:
init_sv_cn()
if inp_refs:
for path in inp_refs:
- try:#####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer
- refer,audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
+ try: #####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer
+ refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro)
refers.append(refer)
if is_v2pro:
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
except Exception as e:
logger.error(e)
if len(refers) == 0:
- refers,audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
- refers=[refers]
+ refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro)
+ refers = [refers]
if is_v2pro:
- sv_emb=[sv_cn_model.compute_embedding3(audio_tensor)]
+ sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
else:
- refer,audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
+ refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
t1 = ttime()
# os.environ['version'] = version
@@ -937,14 +950,22 @@ def get_tts_wav(
if version not in {"v3", "v4"}:
if is_v2pro:
audio = (
- vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed,sv_emb=sv_emb)
+ vq_model.decode(
+ pred_semantic,
+ torch.LongTensor(phones2).to(device).unsqueeze(0),
+ refers,
+ speed=speed,
+ sv_emb=sv_emb,
+ )
.detach()
.cpu()
.numpy()[0, 0]
)
else:
audio = (
- vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)
+ vq_model.decode(
+ pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
+ )
.detach()
.cpu()
.numpy()[0, 0]
@@ -1108,7 +1129,6 @@ def handle(
if not default_refer.is_ready():
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
-
if cut_punc == None:
text = cut_text(text, default_cut_punc)
else:
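
The reformatted `resample` above caches one `torchaudio.transforms.Resample` per (source rate, target rate, device) key; the same pattern in isolation:

```python
import torch
import torchaudio

_resamplers: dict[str, torchaudio.transforms.Resample] = {}

def cached_resample(audio: torch.Tensor, sr0: int, sr1: int, device: torch.device) -> torch.Tensor:
    # Building a Resample kernel is not free, so reuse one per (sr0, sr1, device) key.
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in _resamplers:
        _resamplers[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return _resamplers[key](audio.to(device))

wav = torch.randn(1, 32000)  # one second of fake 32 kHz audio
print(cached_resample(wav, 32000, 16000, torch.device("cpu")).shape)  # torch.Size([1, 16000])
```
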
diff --git a/config.py b/config.py
index 8f4be146..fdc11c0a 100644
--- a/config.py
+++ b/config.py
@@ -144,7 +144,8 @@ webui_port_subfix = 9871
api_port = 9880
-#Thanks to the contribution of @Karasukaigan and @XXXXRT666
+
+# Thanks to the contribution of @Karasukaigan and @XXXXRT666
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
cpu = torch.device("cpu")
cuda = torch.device(f"cuda:{idx}")
@@ -157,10 +158,13 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
mem_gb = mem_bytes / (1024**3) + 0.4
major, minor = capability
sm_version = major + minor / 10.0
- is_16_series = bool(re.search(r"16\d{2}", name))and sm_version == 7.5
- if mem_gb < 4 or sm_version < 5.3:return cpu, torch.float32, 0.0, 0.0
- if sm_version == 6.1 or is_16_series==True:return cuda, torch.float32, sm_version, mem_gb
- if sm_version > 6.1:return cuda, torch.float16, sm_version, mem_gb
+ is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
+ if mem_gb < 4 or sm_version < 5.3:
+ return cpu, torch.float32, 0.0, 0.0
+ if sm_version == 6.1 or is_16_series == True:
+ return cuda, torch.float32, sm_version, mem_gb
+ if sm_version > 6.1:
+ return cuda, torch.float16, sm_version, mem_gb
return cpu, torch.float32, 0.0, 0.0
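
The branch logic of `get_device_dtype_sm` after the reformat, restated without the CUDA probing (the function name and the sample calls are mine):

```python
import torch

def pick_device_dtype(mem_gb: float, sm_version: float, is_16_series: bool):
    if mem_gb < 4 or sm_version < 5.3:
        return "cpu", torch.float32   # too little VRAM or too old a GPU: fall back to CPU
    if sm_version == 6.1 or is_16_series:
        return "cuda", torch.float32  # Pascal GP10x and GTX 16xx stay fp32 (cf. the 16-series inf fix in the changelog)
    if sm_version > 6.1:
        return "cuda", torch.float16
    return "cpu", torch.float32

print(pick_device_dtype(6.0, 7.5, True))    # ('cuda', torch.float32), e.g. a GTX 1660
print(pick_device_dtype(24.0, 8.9, False))  # ('cuda', torch.float16), e.g. an RTX 4090
```
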
diff --git a/docker_build.ps1 b/docker_build.ps1
new file mode 100644
index 00000000..c0508fa6
--- /dev/null
+++ b/docker_build.ps1
@@ -0,0 +1,73 @@
+$ErrorActionPreference = "Stop"
+
+$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
+Set-Location $ScriptDir
+
+if (-not (Get-Command "docker" -ErrorAction SilentlyContinue)) {
+ Write-Host "Docker Not Found"
+ exit 1
+}
+
+$Lite = $false
+$CudaVersion = "12.6"
+
+function Write-Help {
+ Write-Host @"
+Usage: powershell -File docker_build.ps1 [OPTIONS]
+
+Options:
+ --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)
+ --lite Build a Lite Image
+ -h, --help Show this help message and exit
+
+Examples:
+ powershell -File docker_build.ps1 --cuda 12.6 --lite
+"@
+}
+
+if ($args.Count -eq 0) {
+ Write-Help
+ exit 0
+}
+
+for ($i = 0; $i -lt $args.Count; $i++) {
+ switch ($args[$i]) {
+ '--cuda' {
+ $i++
+ $val = $args[$i]
+ if ($val -ne "12.6" -and $val -ne "12.8") {
+ Write-Host "Error: Invalid CUDA_VERSION: $val"
+ Write-Host "Choose From: [12.6, 12.8]"
+ exit 1
+ }
+ $CudaVersion = $val
+ }
+ '--lite' {
+ $Lite = $true
+ }
+ '-h' { Write-Help; exit 0 }
+ '--help' { Write-Help; exit 0 }
+ default {
+ Write-Host "Unknown Argument: $($args[$i])"
+ Write-Host "Use -h or --help to see available options."
+ exit 1
+ }
+ }
+}
+
+$arch = (Get-CimInstance Win32_Processor).Architecture
+$TargetPlatform = if ($arch -eq 9) { "linux/amd64" } else { "linux/arm64" }
+
+if ($Lite) {
+ $TorchBase = "lite"
+} else {
+ $TorchBase = "full"
+}
+
+docker build `
+ --build-arg CUDA_VERSION=$CudaVersion `
+ --build-arg LITE=$Lite `
+ --build-arg TARGETPLATFORM=$TargetPlatform `
+ --build-arg TORCH_BASE=$TorchBase `
+ -t "$env:USERNAME/gpt-sovits:local" `
+ .
\ No newline at end of file
diff --git a/docker_build.sh b/docker_build.sh
index 354599d2..b6a803fb 100644
--- a/docker_build.sh
+++ b/docker_build.sh
@@ -25,7 +25,7 @@ print_help() {
echo " -h, --help Show this help message and exit"
echo ""
echo "Examples:"
- echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
+ echo " bash docker_build.sh --cuda 12.6"
}
# Show help if no arguments provided
diff --git a/docs/cn/Changelog_CN.md b/docs/cn/Changelog_CN.md
index 6c6dcdbe..b3d620e9 100644
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@@ -157,7 +157,7 @@
- 类型: 修复
- 提交: StaryLan
- 2024.02.07 [Commit#14a28510](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
- - 内容: 修复 UVR5 `inf everywhere` 报错的问题 (`is_half` 传参未转换布尔类型导致恒定半精度推理, 16系显卡会 `inf`).
+ - 内容: 修复 UVR5 `inf everywhere` 报错的问题 (`is_half` 传参未转换布尔类型导致恒定半精度推理, 16 系显卡会 `inf`).
- 类型: 修复
- 提交: RVC-Boss
- 2024.02.07 [Commit#d74f888e](https://github.com/RVC-Boss/GPT-SoVITS/commit/d74f888e7ac86063bfeacef95d0e6ddafe42b3b2)
@@ -230,6 +230,7 @@
## 202403
- 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)
+
- 内容: Faster Whisper 在没有 CUDA 可用时自动使用 CPU 推理.
- 类型: 优化
- 提交: ShiroDoMain
@@ -347,7 +348,7 @@
- 2024.07.23 [Commit#9588a3c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2), [PR#1340](https://github.com/RVC-Boss/GPT-SoVITS/pull/1340)
- 内容: 支持合成语速调节, 支持冻结随机性只调节语速, 并将其更新到`api.py` 上.
- 类型: 新功能
- - 提交: RVC-Boss, 红血球AE3803
+ - 提交: RVC-Boss, 红血球 AE3803
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356)
- 内容: 增加 BS-Roformer 人声伴奏分离模型支持.
- 类型: 新功能
@@ -409,7 +410,7 @@
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
- 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
- - 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
+  - 类型: 新功能 (特性参阅 [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
- 提交: RVC-Boss
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
- 内容: 更新项目多语言文档.
@@ -453,7 +454,7 @@
- 类型: 文档
- 提交: StaryLan
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122)
- - 内容: 对于模型无法判断的CJK短字符采用规则判断.
+ - 内容: 对于模型无法判断的 CJK 短字符采用规则判断.
- 类型: 修复
- 提交: KamioRinn
- 关联: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
@@ -475,6 +476,7 @@
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
+
- 内容: **为 SoVITS v3 适配并行推理**.
- 类型: 新功能
- 提交: ChasonJiang
diff --git a/docs/cn/README.md b/docs/cn/README.md
index e674d9cf..21ee3567 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -1,18 +1,25 @@
+#
+
GPT-SoVITS-WebUI
-强大的少样本语音转换与语音合成Web用户界面.
+强大的少样本语音转换与语音合成Web用户界面.
[](https://github.com/RVC-Boss/GPT-SoVITS)

-
+[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
+[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
+[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
-[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[](https://discord.gg/dnrgs5GHfG)
+[](https://github.com/RVC-Boss/gpt-sovits/releases)
+[](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+
+[](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
+[](https://rentry.co/GPT-SoVITS-guide#/)
+[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md)
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
@@ -36,7 +43,7 @@
-**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+
## 安装
@@ -171,7 +178,7 @@ docker exec -it
```
-若想使用 V1,则
-
-```bash
-python webui.py v1
-```
-
-或者在 webUI 内动态切换
-
### 微调
#### 现已支持自动填充路径
@@ -243,13 +241,13 @@ python webui.py v1
3. 进行降噪(可选)
4. 进行 ASR
5. 校对标注
-6. 前往下一个窗口,点击训练
+6. 前往下一个窗口, 点击训练
### 打开推理 WebUI
#### 整合包用户
-双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
+双击 `go-webui.bat` 或者使用 `go-webui.ps1` , 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
#### 其他
@@ -287,7 +285,7 @@ python webui.py
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下
- 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
+ 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型, 解压并重命名为`G2PWModel`, 将其放到`GPT_SoVITS/text`目录下)
## V3 更新说明
@@ -449,5 +447,5 @@ python ./tools/asr/fasterwhisper_asr.py -i -o