Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-08-19 00:09:48 +08:00)
Merge branch 'RVC-Boss:main' into feat/frontend-usability-enhancements
This commit is contained in: commit 7e6a607b9e
@@ -8,7 +8,7 @@ repos:
      # Run the linter.
      - id: ruff
        types_or: [ python, pyi ]
        args: [ --fix ]
        args: [ --fix , "--exit-zero" ]
      # Run the formatter.
      - id: ruff-format
        types_or: [ python, pyi ]
@@ -28,7 +28,7 @@ class Text2SemanticLightningModule(LightningModule):
            self.load_state_dict(
                torch.load(
                    pretrained_s1,
                    map_location="cpu",
                    map_location="cpu", weights_only=False,
                )["weight"],
            )
        )
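The weights_only=False additions in this commit matter because recent PyTorch releases default torch.load to weights_only=True, which refuses to unpickle arbitrary Python objects such as the config dicts bundled in these checkpoints. A minimal sketch of the loading pattern; ckpt.pth is a hypothetical path, not a file from the repository:

import torch

# GPT-SoVITS s1/s2 checkpoints store a config dict next to the "weight"
# state dict, so a tensors-only load would reject them.
ckpt_path = "ckpt.pth"  # hypothetical checkpoint path

# weights_only=False restores the older full-unpickle behaviour, which the
# commit opts into for trusted local checkpoint files.
state = torch.load(ckpt_path, map_location="cpu", weights_only=False)
model_weights = state["weight"]   # consumed by load_state_dict(...)
config = state.get("config")      # extra metadata a weights_only load would reject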
@@ -35,7 +35,16 @@ from tools.i18n.i18n import I18nAuto, scan_language_list
from tools.my_utils import load_audio
from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor

from sv import SV
resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(
            sr0, sr1
        ).to(device)
    return resample_transform_dict[key](audio_tensor)
language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
i18n = I18nAuto(language=language)
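The new module-level cache keys each torchaudio Resample transform by (source rate, target rate, device), so repeated calls reuse one module per combination instead of rebuilding it. A small usage sketch; the shapes and rates are illustrative and not taken from the commit:

import torch
import torchaudio

resample_transform_dict = {}

def resample(audio_tensor, sr0, sr1, device):
    # One cached torchaudio.transforms.Resample per (sr0, sr1, device) triple.
    global resample_transform_dict
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)

# Example: bring a mono 48 kHz reference clip down to the 32 kHz model rate.
wav = torch.randn(1, 48000)                         # one second of dummy audio
wav_32k = resample(wav, 48000, 32000, "cpu")        # builds and caches the transform
wav_32k_again = resample(wav, 48000, 32000, "cpu")  # reuses the cached module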
@@ -102,18 +111,6 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):

    return processed_audio


resample_transform_dict = {}


def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
    key = "%s-%s" % (sr0, sr1)
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)


class DictToAttrRecursive(dict):
    def __init__(self, input_dict):
        super().__init__(input_dict)
@@ -252,6 +249,24 @@ class TTS_Config:
            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        },
        "v2Pro": {
            "device": "cpu",
            "is_half": False,
            "version": "v2Pro",
            "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
            "vits_weights_path": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        },
        "v2ProPlus": {
            "device": "cpu",
            "is_half": False,
            "version": "v2ProPlus",
            "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
            "vits_weights_path": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        },
    }
    configs: dict = None
    v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
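With the two entries above, the default configuration table now covers the v2Pro and v2ProPlus variants. A hedged sketch of how the per-version defaults could be looked up, assuming default_configs is the class-level table extended above (the deepcopy mirrors what the constructor already does for the "custom" entry):

from copy import deepcopy

version = "v2Pro"  # the speaker-verification-augmented variant added in this commit
defaults = deepcopy(TTS_Config.default_configs[version])
print(defaults["vits_weights_path"])  # GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth
print(defaults["t2s_weights_path"])   # GPT_SoVITS/pretrained_models/s1v3.ckpt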
@@ -287,7 +302,7 @@ class TTS_Config:

        assert isinstance(configs, dict)
        version = configs.get("version", "v2").lower()
        assert version in ["v1", "v2", "v3", "v4"]
        assert version in ["v1", "v2", "v3", "v4", "v2Pro", "v2ProPlus"]
        self.default_configs[version] = configs.get(version, self.default_configs[version])
        self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))

@@ -403,6 +418,7 @@ class TTS:
        self.cnhuhbert_model: CNHubert = None
        self.vocoder = None
        self.sr_model: AP_BWE = None
        self.sv_model = None
        self.sr_model_not_exist: bool = False

        self.vocoder_configs: dict = {
@@ -463,6 +479,8 @@ class TTS:
    def init_vits_weights(self, weights_path: str):
        self.configs.vits_weights_path = weights_path
        version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
        if "Pro" in model_version:
            self.init_sv_model()
        path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]

        if if_lora_v3 == True and os.path.exists(path_sovits) == False:
@@ -472,7 +490,6 @@ class TTS:
        # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
        dict_s2 = load_sovits_new(weights_path)
        hps = dict_s2["config"]

        hps["model"]["semantic_frame_rate"] = "25hz"
        if "enc_p.text_embedding.weight" not in dict_s2["weight"]:
            hps["model"]["version"] = "v2"  # v3model,v2sybomls
@@ -480,7 +497,15 @@ class TTS:
            hps["model"]["version"] = "v1"
        else:
            hps["model"]["version"] = "v2"
        # version = hps["model"]["version"]
        version = hps["model"]["version"]
        v3v4set = {"v3", "v4"}
        if model_version not in v3v4set:
            if "Pro" not in model_version:
                model_version = version
            else:
                hps["model"]["version"] = model_version
        else:
            hps["model"]["version"] = model_version

        self.configs.filter_length = hps["data"]["filter_length"]
        self.configs.segment_size = hps["train"]["segment_size"]
@@ -496,7 +521,7 @@ class TTS:

        # print(f"model_version:{model_version}")
        # print(f'hps["model"]["version"]:{hps["model"]["version"]}')
        if model_version not in {"v3", "v4"}:
        if model_version not in v3v4set:
            vits_model = SynthesizerTrn(
                self.configs.filter_length // 2 + 1,
                self.configs.segment_size // self.configs.hop_length,
@@ -517,6 +542,8 @@ class TTS:
        if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
            del vits_model.enc_q

        self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}

        if if_lora_v3 == False:
            print(
                f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
@@ -551,7 +578,7 @@ class TTS:
        self.configs.t2s_weights_path = weights_path
        self.configs.save_configs()
        self.configs.hz = 50
        dict_s1 = torch.load(weights_path, map_location=self.configs.device)
        dict_s1 = torch.load(weights_path, map_location=self.configs.device, weights_only=False)
        config = dict_s1["config"]
        self.configs.max_sec = config["data"]["max_sec"]
        t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
@@ -605,7 +632,7 @@ class TTS:
            )
            self.vocoder.remove_weight_norm()
            state_dict_g = torch.load(
                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu"
                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
            )
            print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))

@@ -631,6 +658,11 @@ class TTS:
            print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好"))
            self.sr_model_not_exist = True

    def init_sv_model(self):
        if self.sv_model is not None:
            return
        self.sv_model = SV(self.configs.device, self.configs.is_half)

    def enable_half_precision(self, enable: bool = True, save: bool = True):
        """
        To enable half precision for the TTS model.
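init_sv_model uses a lazy-initialisation pattern: the speaker-verification model is constructed only the first time a Pro-series checkpoint asks for it, and later calls are no-ops. A stripped-down illustration of the same idea; SpeakerVerifierHolder and its fields are stand-ins, only SV comes from the `from sv import SV` added above:

class SpeakerVerifierHolder:
    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.sv_model = None  # created on demand, mirrors TTS.sv_model

    def init_sv_model(self):
        # Repeated calls return immediately, so init_vits_weights can call
        # this unconditionally whenever "Pro" appears in model_version.
        if self.sv_model is not None:
            return
        self.sv_model = SV(self.device, self.is_half)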
@@ -706,11 +738,11 @@ class TTS:
            self.prompt_cache["ref_audio_path"] = ref_audio_path

    def _set_ref_spec(self, ref_audio_path):
        spec = self._get_ref_spec(ref_audio_path)
        spec_audio = self._get_ref_spec(ref_audio_path)
        if self.prompt_cache["refer_spec"] in [[], None]:
            self.prompt_cache["refer_spec"] = [spec]
            self.prompt_cache["refer_spec"] = [spec_audio]
        else:
            self.prompt_cache["refer_spec"][0] = spec
            self.prompt_cache["refer_spec"][0] = spec_audio

    def _get_ref_spec(self, ref_audio_path):
        raw_audio, raw_sr = torchaudio.load(ref_audio_path)
@@ -718,25 +750,33 @@
        self.prompt_cache["raw_audio"] = raw_audio
        self.prompt_cache["raw_sr"] = raw_sr

        audio = load_audio(ref_audio_path, int(self.configs.sampling_rate))
        audio = torch.FloatTensor(audio)
        if raw_sr != self.configs.sampling_rate:
            audio = raw_audio.to(self.configs.device)
            if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
            audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
        else:
            audio = raw_audio.to(self.configs.device)
            if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)

        maxx = audio.abs().max()
        if maxx > 1:
            audio /= min(2, maxx)
        audio_norm = audio
        audio_norm = audio_norm.unsqueeze(0)
        spec = spectrogram_torch(
            audio_norm,
            audio,
            self.configs.filter_length,
            self.configs.sampling_rate,
            self.configs.hop_length,
            self.configs.win_length,
            center=False,
        )
        spec = spec.to(self.configs.device)
        if self.configs.is_half:
            spec = spec.half()
        return spec
        if self.is_v2pro == True:
            audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
            if self.configs.is_half:
                audio = audio.half()
        else: audio = None
        return spec, audio

    def _set_prompt_semantic(self, ref_wav_path: str):
        zero_wav = np.zeros(
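The reworked _get_ref_spec mixes stereo references down to mono, peak-limits anything above full scale, and resamples on the configured device before taking the spectrogram. A sketch of just those preprocessing steps, reusing the cached resample() from above; the function name and default rate are illustrative:

import torch

def prepare_reference(raw_audio, raw_sr, target_sr=32000, device="cpu"):
    audio = raw_audio.to(device)
    if audio.shape[0] == 2:           # stereo -> mono, as in the commit
        audio = audio.mean(0).unsqueeze(0)
    if raw_sr != target_sr:           # device-aware cached resample (see resample() earlier)
        audio = resample(audio, raw_sr, target_sr, device)
    maxx = audio.abs().max()
    if maxx > 1:                      # tame clipped references, dividing by at most 2
        audio /= min(2, maxx)
    return audio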
@@ -1171,10 +1211,13 @@ class TTS:
            t4 = time.perf_counter()
            t_34 += t4 - t3

            refer_audio_spec: torch.Tensor = [
                item.to(dtype=self.precision, device=self.configs.device)
                for item in self.prompt_cache["refer_spec"]
            ]
            refer_audio_spec = []
            if self.is_v2pro: sv_emb = []
            for spec, audio_tensor in self.prompt_cache["refer_spec"]:
                spec = spec.to(dtype=self.precision, device=self.configs.device)
                refer_audio_spec.append(spec)
                if self.is_v2pro:
                    sv_emb.append(self.sv_model.compute_embedding3(audio_tensor))

            batch_audio_fragment = []

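For v2Pro models the loop above also turns each cached 16 kHz reference clip into a speaker embedding via self.sv_model.compute_embedding3. A compact restatement of that bookkeeping with dummy tensors standing in for the real cache; all shapes, including the embedding dimension, are illustrative:

import torch

refer_spec_cache = [(torch.randn(1, 1025, 180), torch.randn(1, 16000))]  # (spec, 16 kHz audio)
is_v2pro = True

refer_audio_spec, sv_emb = [], []
for spec, audio_tensor in refer_spec_cache:
    refer_audio_spec.append(spec.to(dtype=torch.float16))  # move/cast like the real code
    if is_v2pro:
        # In the commit this is self.sv_model.compute_embedding3(audio_tensor);
        # here a random vector stands in for the speaker-verification embedding.
        sv_emb.append(torch.randn(1, 20480))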
@@ -1206,9 +1249,10 @@ class TTS:
                    torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
                )
                _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
                _batch_audio_fragment = self.vits_model.decode(
                    all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
                ).detach()[0, 0, :]
                if self.is_v2pro != True:
                    _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
                else:
                    _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb).detach()[0, 0, :]
                audio_frag_end_idx.insert(0, 0)
                batch_audio_fragment = [
                    _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
@@ -1221,9 +1265,10 @@ class TTS:
                    _pred_semantic = (
                        pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                    )  # .unsqueeze(0)  # mq needs one extra unsqueeze
                    audio_fragment = self.vits_model.decode(
                        _pred_semantic, phones, refer_audio_spec, speed=speed_factor
                    ).detach()[0, 0, :]
                    if self.is_v2pro != True:
                        audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
                    else:
                        audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb).detach()[0, 0, :]
                    batch_audio_fragment.append(audio_fragment)  ### try reconstructing without the prompt part
                else:
                    if parallel_infer:
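Both decode call sites now branch on self.is_v2pro so that the speaker embeddings reach the synthesizer's decode only for Pro models, while older versions keep the original signature. The same branch in isolation; the helper name is illustrative and vits_model, refer_audio_spec, sv_emb are the objects built above:

def decode_fragment(vits_model, pred_semantic, phones, refer_audio_spec,
                    speed_factor, is_v2pro, sv_emb=None):
    # Pro checkpoints accept an extra sv_emb keyword; passing it to older
    # models would fail, hence the explicit branch in the commit.
    if not is_v2pro:
        out = vits_model.decode(pred_semantic, phones, refer_audio_spec, speed=speed_factor)
    else:
        out = vits_model.decode(pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb)
    return out.detach()[0, 0, :]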
@@ -1362,7 +1407,10 @@ class TTS:
    ):
        prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
        prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
        raw_entry = self.prompt_cache["refer_spec"][0]
        if isinstance(raw_entry, tuple):
            raw_entry = raw_entry[0]
        refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)

        fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
        ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
@@ -1429,7 +1477,10 @@ class TTS:
    ) -> List[torch.Tensor]:
        prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
        prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
        raw_entry = self.prompt_cache["refer_spec"][0]
        if isinstance(raw_entry, tuple):
            raw_entry = raw_entry[0]
        refer_audio_spec = raw_entry.to(dtype=self.precision, device=self.configs.device)

        fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
        ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
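The two decode_encp call sites guard against both cache layouts: refer_spec entries may now be (spectrogram, audio) tuples written by the new _get_ref_spec, while older code paths still store a bare spectrogram tensor. The same guard in isolation; the helper name is illustrative:

def first_refer_spec(prompt_cache, precision, device):
    raw_entry = prompt_cache["refer_spec"][0]
    if isinstance(raw_entry, tuple):   # new (spec, sv_audio) layout
        raw_entry = raw_entry[0]
    # Either way the result is the spectrogram tensor decode_encp expects.
    return raw_entry.to(dtype=precision, device=device)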
91  GPT_SoVITS/configs/s2v2Pro.json  Normal file
@@ -0,0 +1,91 @@
{
    "train": {
        "log_interval": 100,
        "eval_interval": 500,
        "seed": 1234,
        "epochs": 100,
        "learning_rate": 0.0001,
        "betas": [0.8, 0.99],
        "eps": 1e-09,
        "batch_size": 32,
        "fp16_run": true,
        "lr_decay": 0.999875,
        "segment_size": 20480,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "text_low_lr_rate": 0.4,
        "grad_ckpt": false
    },
    "data": {
        "max_wav_value": 32768.0,
        "sampling_rate": 32000,
        "filter_length": 2048,
        "hop_length": 640,
        "win_length": 2048,
        "n_mel_channels": 128,
        "mel_fmin": 0.0,
        "mel_fmax": null,
        "add_blank": true,
        "n_speakers": 300,
        "cleaned_text": true
    },
    "model": {
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.0,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [10, 8, 2, 2, 2],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [16, 16, 8, 2, 2],
        "n_layers_q": 3,
        "use_spectral_norm": false,
        "gin_channels": 1024,
        "semantic_frame_rate": "25hz",
        "freeze_quantizer": true
    },
    "s2_ckpt_dir": "logs/s2/big2k1",
    "content_module": "cnhubert"
}
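A quick consistency check on the new v2Pro config: the decoder's upsample_rates are expected to multiply out to the hop_length, so that one spectrogram frame becomes exactly one hop of 32 kHz waveform. Worked numbers taken from the file above:

import math

upsample_rates = [10, 8, 2, 2, 2]   # model.upsample_rates in s2v2Pro.json
hop_length = 640                    # data.hop_length in the same file

assert math.prod(upsample_rates) == hop_length   # 10*8*2*2*2 = 640
print(32000 / hop_length, "frames per second")   # 50.0 at the 32 kHz sampling rate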
91  GPT_SoVITS/configs/s2v2ProPlus.json  Normal file
@@ -0,0 +1,91 @@
{
    "train": {
        "log_interval": 100,
        "eval_interval": 500,
        "seed": 1234,
        "epochs": 100,
        "learning_rate": 0.0001,
        "betas": [0.8, 0.99],
        "eps": 1e-09,
        "batch_size": 32,
        "fp16_run": true,
        "lr_decay": 0.999875,
        "segment_size": 20480,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "text_low_lr_rate": 0.4,
        "grad_ckpt": false
    },
    "data": {
        "max_wav_value": 32768.0,
        "sampling_rate": 32000,
        "filter_length": 2048,
        "hop_length": 640,
        "win_length": 2048,
        "n_mel_channels": 128,
        "mel_fmin": 0.0,
        "mel_fmax": null,
        "add_blank": true,
        "n_speakers": 300,
        "cleaned_text": true
    },
    "model": {
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.0,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [10, 8, 2, 2, 2],
        "upsample_initial_channel": 768,
        "upsample_kernel_sizes": [20, 16, 8, 2, 2],
        "n_layers_q": 3,
        "use_spectral_norm": false,
        "gin_channels": 1024,
        "semantic_frame_rate": "25hz",
        "freeze_quantizer": true
    },
    "s2_ckpt_dir": "logs/s2/big2k1",
    "content_module": "cnhubert"
}
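s2v2ProPlus.json is identical to s2v2Pro.json except for a wider decoder: upsample_initial_channel grows from 512 to 768 and the first upsample kernel from 16 to 20. A small sketch that would surface exactly those differences; the paths assume the repository layout used throughout this diff:

import json

with open("GPT_SoVITS/configs/s2v2Pro.json") as f:
    pro = json.load(f)
with open("GPT_SoVITS/configs/s2v2ProPlus.json") as f:
    plus = json.load(f)

for key in pro["model"]:
    if pro["model"][key] != plus["model"][key]:
        print(key, pro["model"][key], "->", plus["model"][key])
# Expected output, based on the two files above:
#   upsample_initial_channel 512 -> 768
#   upsample_kernel_sizes [16, 16, 8, 2, 2] -> [20, 16, 8, 2, 2]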
260  GPT_SoVITS/eres2net/ERes2Net.py  Normal file
@@ -0,0 +1,260 @@
|
||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
"""
|
||||
Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
|
||||
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
||||
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
||||
"""
|
||||
|
||||
|
||||
import torch
|
||||
import math
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import pooling_layers as pooling_layers
|
||||
from fusion import AFF
|
||||
|
||||
class ReLU(nn.Hardtanh):
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(ReLU, self).__init__(0, 20, inplace)
|
||||
|
||||
def __repr__(self):
|
||||
inplace_str = 'inplace' if self.inplace else ''
|
||||
return self.__class__.__name__ + ' (' \
|
||||
+ inplace_str + ')'
|
||||
|
||||
|
||||
class BasicBlockERes2Net(nn.Module):
|
||||
expansion = 2
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||
super(BasicBlockERes2Net, self).__init__()
|
||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
||||
self.nums = scale
|
||||
|
||||
convs=[]
|
||||
bns=[]
|
||||
for i in range(self.nums):
|
||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
|
||||
stride=stride, bias=False),
|
||||
nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out,self.width,1)
|
||||
for i in range(self.nums):
|
||||
if i==0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = sp + spx[i]
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i==0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out,sp),1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class BasicBlockERes2Net_diff_AFF(nn.Module):
|
||||
expansion = 2
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||
super(BasicBlockERes2Net_diff_AFF, self).__init__()
|
||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
||||
self.nums = scale
|
||||
|
||||
convs=[]
|
||||
fuse_models=[]
|
||||
bns=[]
|
||||
for i in range(self.nums):
|
||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
for j in range(self.nums - 1):
|
||||
fuse_models.append(AFF(channels=width))
|
||||
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.fuse_models = nn.ModuleList(fuse_models)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
|
||||
stride=stride, bias=False),
|
||||
nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out,self.width,1)
|
||||
for i in range(self.nums):
|
||||
if i==0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = self.fuse_models[i-1](sp, spx[i])
|
||||
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i==0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out,sp),1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class ERes2Net(nn.Module):
|
||||
def __init__(self,
|
||||
block=BasicBlockERes2Net,
|
||||
block_fuse=BasicBlockERes2Net_diff_AFF,
|
||||
num_blocks=[3, 4, 6, 3],
|
||||
m_channels=32,
|
||||
feat_dim=80,
|
||||
embedding_size=192,
|
||||
pooling_func='TSTP',
|
||||
two_emb_layer=False):
|
||||
super(ERes2Net, self).__init__()
|
||||
self.in_planes = m_channels
|
||||
self.feat_dim = feat_dim
|
||||
self.embedding_size = embedding_size
|
||||
self.stats_dim = int(feat_dim / 8) * m_channels * 8
|
||||
self.two_emb_layer = two_emb_layer
|
||||
|
||||
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(m_channels)
|
||||
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
|
||||
self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
|
||||
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
||||
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||
|
||||
# Downsampling module for each layer
|
||||
self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False)
|
||||
self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
|
||||
self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
|
||||
|
||||
# Bottom-up fusion module
|
||||
self.fuse_mode12 = AFF(channels=m_channels * 4)
|
||||
self.fuse_mode123 = AFF(channels=m_channels * 8)
|
||||
self.fuse_mode1234 = AFF(channels=m_channels * 16)
|
||||
|
||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
|
||||
self.pool = getattr(pooling_layers, pooling_func)(
|
||||
in_dim=self.stats_dim * block.expansion)
|
||||
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
|
||||
embedding_size)
|
||||
if self.two_emb_layer:
|
||||
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
||||
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
||||
else:
|
||||
self.seg_bn_1 = nn.Identity()
|
||||
self.seg_2 = nn.Identity()
|
||||
|
||||
def _make_layer(self, block, planes, num_blocks, stride):
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_planes, planes, stride))
|
||||
self.in_planes = planes * block.expansion
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
|
||||
stats = self.pool(fuse_out1234)
|
||||
|
||||
embed_a = self.seg_1(stats)
|
||||
if self.two_emb_layer:
|
||||
out = F.relu(embed_a)
|
||||
out = self.seg_bn_1(out)
|
||||
embed_b = self.seg_2(out)
|
||||
return embed_b
|
||||
else:
|
||||
return embed_a
|
||||
|
||||
def forward3(self, x):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1)
|
||||
return fuse_out1234
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
x = torch.zeros(10, 300, 80)
|
||||
model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP')
|
||||
model.eval()
|
||||
out = model(x)
|
||||
print(out.shape) # torch.Size([10, 192])
|
||||
|
||||
num_params = sum(param.numel() for param in model.parameters())
|
||||
print("{} M".format(num_params / 1e6)) # 6.61M
|
||||
|
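Besides the pooled utterance embedding exercised in the __main__ block above, ERes2Net also exposes forward3, which skips the pooling and linear head and returns a time-averaged feature from the fused multi-scale map. A hedged usage sketch on the same dummy input; the forward3 output width depends on m_channels and is not asserted here:

import torch

model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP')
model.eval()

feats = torch.zeros(10, 300, 80)        # (batch, frames, fbank bins), as in __main__
with torch.no_grad():
    utt_emb = model(feats)              # pooled 192-dim utterance embedding
    raw_emb = model.forward3(feats)     # pre-pooling fused features, mean over time
print(utt_emb.shape, raw_emb.shape)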
292  GPT_SoVITS/eres2net/ERes2NetV2.py  Normal file
@@ -0,0 +1,292 @@
|
||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
"""
|
||||
To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
|
||||
within each stage. However, this modification also increases the number of model parameters and computational complexity.
|
||||
To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, ultimately reducing
|
||||
both the model parameters and its computational cost.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
import torch
|
||||
import math
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import pooling_layers as pooling_layers
|
||||
from fusion import AFF
|
||||
|
||||
class ReLU(nn.Hardtanh):
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(ReLU, self).__init__(0, 20, inplace)
|
||||
|
||||
def __repr__(self):
|
||||
inplace_str = 'inplace' if self.inplace else ''
|
||||
return self.__class__.__name__ + ' (' \
|
||||
+ inplace_str + ')'
|
||||
|
||||
|
||||
class BasicBlockERes2NetV2(nn.Module):
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
|
||||
super(BasicBlockERes2NetV2, self).__init__()
|
||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
||||
self.nums = scale
|
||||
self.expansion = expansion
|
||||
|
||||
convs=[]
|
||||
bns=[]
|
||||
for i in range(self.nums):
|
||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_planes,
|
||||
self.expansion * planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False),
|
||||
nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out,self.width,1)
|
||||
for i in range(self.nums):
|
||||
if i==0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = sp + spx[i]
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i==0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out,sp),1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class BasicBlockERes2NetV2AFF(nn.Module):
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
|
||||
super(BasicBlockERes2NetV2AFF, self).__init__()
|
||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
||||
self.nums = scale
|
||||
self.expansion = expansion
|
||||
|
||||
convs=[]
|
||||
fuse_models=[]
|
||||
bns=[]
|
||||
for i in range(self.nums):
|
||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
for j in range(self.nums - 1):
|
||||
fuse_models.append(AFF(channels=width, r=4))
|
||||
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.fuse_models = nn.ModuleList(fuse_models)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_planes,
|
||||
self.expansion * planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False),
|
||||
nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out,self.width,1)
|
||||
for i in range(self.nums):
|
||||
if i==0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = self.fuse_models[i-1](sp, spx[i])
|
||||
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i==0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out,sp),1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class ERes2NetV2(nn.Module):
|
||||
def __init__(self,
|
||||
block=BasicBlockERes2NetV2,
|
||||
block_fuse=BasicBlockERes2NetV2AFF,
|
||||
num_blocks=[3, 4, 6, 3],
|
||||
m_channels=64,
|
||||
feat_dim=80,
|
||||
embedding_size=192,
|
||||
baseWidth=26,
|
||||
scale=2,
|
||||
expansion=2,
|
||||
pooling_func='TSTP',
|
||||
two_emb_layer=False):
|
||||
super(ERes2NetV2, self).__init__()
|
||||
self.in_planes = m_channels
|
||||
self.feat_dim = feat_dim
|
||||
self.embedding_size = embedding_size
|
||||
self.stats_dim = int(feat_dim / 8) * m_channels * 8
|
||||
self.two_emb_layer = two_emb_layer
|
||||
self.baseWidth = baseWidth
|
||||
self.scale = scale
|
||||
self.expansion = expansion
|
||||
|
||||
self.conv1 = nn.Conv2d(1,
|
||||
m_channels,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(m_channels)
|
||||
self.layer1 = self._make_layer(block,
|
||||
m_channels,
|
||||
num_blocks[0],
|
||||
stride=1)
|
||||
self.layer2 = self._make_layer(block,
|
||||
m_channels * 2,
|
||||
num_blocks[1],
|
||||
stride=2)
|
||||
self.layer3 = self._make_layer(block_fuse,
|
||||
m_channels * 4,
|
||||
num_blocks[2],
|
||||
stride=2)
|
||||
self.layer4 = self._make_layer(block_fuse,
|
||||
m_channels * 8,
|
||||
num_blocks[3],
|
||||
stride=2)
|
||||
|
||||
# Downsampling module
|
||||
self.layer3_ds = nn.Conv2d(m_channels * 4 * self.expansion, m_channels * 8 * self.expansion, kernel_size=3, \
|
||||
padding=1, stride=2, bias=False)
|
||||
|
||||
# Bottom-up fusion module
|
||||
self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
|
||||
|
||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
|
||||
self.pool = getattr(pooling_layers, pooling_func)(
|
||||
in_dim=self.stats_dim * self.expansion)
|
||||
self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats,
|
||||
embedding_size)
|
||||
if self.two_emb_layer:
|
||||
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
||||
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
||||
else:
|
||||
self.seg_bn_1 = nn.Identity()
|
||||
self.seg_2 = nn.Identity()
|
||||
|
||||
def _make_layer(self, block, planes, num_blocks, stride):
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion))
|
||||
self.in_planes = planes * self.expansion
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out3 = self.layer3(out2)
|
||||
out4 = self.layer4(out3)
|
||||
out3_ds = self.layer3_ds(out3)
|
||||
fuse_out34 = self.fuse34(out4, out3_ds)
|
||||
stats = self.pool(fuse_out34)
|
||||
|
||||
embed_a = self.seg_1(stats)
|
||||
if self.two_emb_layer:
|
||||
out = F.relu(embed_a)
|
||||
out = self.seg_bn_1(out)
|
||||
embed_b = self.seg_2(out)
|
||||
return embed_b
|
||||
else:
|
||||
return embed_a
|
||||
|
||||
def forward3(self, x):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out3 = self.layer3(out2)
|
||||
out4 = self.layer4(out3)
|
||||
out3_ds = self.layer3_ds(out3)
|
||||
fuse_out34 = self.fuse34(out4, out3_ds)
|
||||
# print(111111111,fuse_out34.shape)#111111111 torch.Size([16, 2048, 10, 72])
|
||||
return fuse_out34.flatten(start_dim=1,end_dim=2).mean(-1)
|
||||
# stats = self.pool(fuse_out34)
|
||||
#
|
||||
# embed_a = self.seg_1(stats)
|
||||
# if self.two_emb_layer:
|
||||
# out = F.relu(embed_a)
|
||||
# out = self.seg_bn_1(out)
|
||||
# embed_b = self.seg_2(out)
|
||||
# return embed_b
|
||||
# else:
|
||||
# return embed_a
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
x = torch.randn(1, 300, 80)
|
||||
model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
|
||||
model.eval()
|
||||
y = model(x)
|
||||
print(y.size())
|
||||
macs, num_params = profile(model, inputs=(x, ))
|
||||
print("Params: {} M".format(num_params / 1e6)) # 17.86 M
|
||||
print("MACs: {} G".format(macs / 1e9)) # 12.69 G
|
||||
|
||||
|
||||
|
||||
|
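The __main__ block of ERes2NetV2 calls profile(...) without importing it; it appears to rely on the thop package. A self-contained version of the same smoke test with the import made explicit; thop is an assumption, any FLOP counter would do:

import torch
from thop import profile   # assumed dependency; not imported in the file above

model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64,
                   baseWidth=26, scale=2, expansion=2)
model.eval()

x = torch.randn(1, 300, 80)
y = model(x)
print(y.size())                           # expected: torch.Size([1, 192])
macs, params = profile(model, inputs=(x,))
print("Params: {:.2f} M".format(params / 1e6))
print("MACs:   {:.2f} G".format(macs / 1e9))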
286  GPT_SoVITS/eres2net/ERes2Net_huge.py  Normal file
@@ -0,0 +1,286 @@
|
||||
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||
ERes2Net incorporates both local and global feature fusion techniques to improve the performance.
|
||||
The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
||||
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
||||
ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
|
||||
recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
|
||||
"""
|
||||
import pdb
|
||||
|
||||
import torch
|
||||
import math
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import pooling_layers as pooling_layers
|
||||
from fusion import AFF
|
||||
|
||||
class ReLU(nn.Hardtanh):
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(ReLU, self).__init__(0, 20, inplace)
|
||||
|
||||
def __repr__(self):
|
||||
inplace_str = 'inplace' if self.inplace else ''
|
||||
return self.__class__.__name__ + ' (' \
|
||||
+ inplace_str + ')'
|
||||
|
||||
|
||||
class BasicBlockERes2Net(nn.Module):
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
|
||||
super(BasicBlockERes2Net, self).__init__()
|
||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
||||
self.nums = scale
|
||||
|
||||
convs=[]
|
||||
bns=[]
|
||||
for i in range(self.nums):
|
||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||
nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out,self.width,1)
|
||||
for i in range(self.nums):
|
||||
if i==0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = sp + spx[i]
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i==0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out,sp),1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class BasicBlockERes2Net_diff_AFF(nn.Module):
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
|
||||
super(BasicBlockERes2Net_diff_AFF, self).__init__()
|
||||
width = int(math.floor(planes*(baseWidth/64.0)))
|
||||
self.conv1 = nn.Conv2d(in_planes, width*scale, kernel_size=1, stride=stride, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(width*scale)
|
||||
self.nums = scale
|
||||
|
||||
convs=[]
|
||||
fuse_models=[]
|
||||
bns=[]
|
||||
for i in range(self.nums):
|
||||
convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
for j in range(self.nums - 1):
|
||||
fuse_models.append(AFF(channels=width))
|
||||
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.fuse_models = nn.ModuleList(fuse_models)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = nn.Conv2d(width*scale, planes*self.expansion, kernel_size=1, bias=False)
|
||||
self.bn3 = nn.BatchNorm2d(planes*self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
|
||||
nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out,self.width,1)
|
||||
for i in range(self.nums):
|
||||
if i==0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = self.fuse_models[i-1](sp, spx[i])
|
||||
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i==0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out,sp),1)
|
||||
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class ERes2Net(nn.Module):
|
||||
def __init__(self,
|
||||
block=BasicBlockERes2Net,
|
||||
block_fuse=BasicBlockERes2Net_diff_AFF,
|
||||
num_blocks=[3, 4, 6, 3],
|
||||
m_channels=64,
|
||||
feat_dim=80,
|
||||
embedding_size=192,
|
||||
pooling_func='TSTP',
|
||||
two_emb_layer=False):
|
||||
super(ERes2Net, self).__init__()
|
||||
self.in_planes = m_channels
|
||||
self.feat_dim = feat_dim
|
||||
self.embedding_size = embedding_size
|
||||
self.stats_dim = int(feat_dim / 8) * m_channels * 8
|
||||
self.two_emb_layer = two_emb_layer
|
||||
|
||||
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(m_channels)
|
||||
|
||||
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
|
||||
self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
|
||||
self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
||||
self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||
|
||||
self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
|
||||
self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
|
||||
self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False)
|
||||
|
||||
self.fuse_mode12 = AFF(channels=m_channels * 8)
|
||||
self.fuse_mode123 = AFF(channels=m_channels * 16)
|
||||
self.fuse_mode1234 = AFF(channels=m_channels * 32)
|
||||
|
||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
|
||||
self.pool = getattr(pooling_layers, pooling_func)(
|
||||
in_dim=self.stats_dim * block.expansion)
|
||||
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
|
||||
if self.two_emb_layer:
|
||||
self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
|
||||
self.seg_2 = nn.Linear(embedding_size, embedding_size)
|
||||
else:
|
||||
self.seg_bn_1 = nn.Identity()
|
||||
self.seg_2 = nn.Identity()
|
||||
|
||||
def _make_layer(self, block, planes, num_blocks, stride):
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_planes, planes, stride))
|
||||
self.in_planes = planes * block.expansion
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
|
||||
stats = self.pool(fuse_out1234)
|
||||
|
||||
embed_a = self.seg_1(stats)
|
||||
if self.two_emb_layer:
|
||||
out = F.relu(embed_a)
|
||||
out = self.seg_bn_1(out)
|
||||
embed_b = self.seg_2(out)
|
||||
return embed_b
|
||||
else:
|
||||
return embed_a
|
||||
|
||||
def forward2(self, x,if_mean):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2)#bs,20480,T
|
||||
if(if_mean==False):
|
||||
mean=fuse_out1234[0].transpose(1,0)#(T,20480),bs=T
|
||||
else:
|
||||
mean = fuse_out1234.mean(2)#bs,20480
|
||||
mean_std=torch.cat([mean,torch.zeros_like(mean)],1)
|
||||
return self.seg_1(mean_std)#(T,192)
|
||||
|
||||
|
||||
# stats = self.pool(fuse_out1234)
|
||||
# if self.two_emb_layer:
|
||||
# out = F.relu(embed_a)
|
||||
# out = self.seg_bn_1(out)
|
||||
# embed_b = self.seg_2(out)
|
||||
# return embed_b
|
||||
# else:
|
||||
# return embed_a
|
||||
|
||||
def forward3(self, x):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1,end_dim=2).mean(-1)
|
||||
return fuse_out1234
|
||||
# print(fuse_out1234.shape)
|
||||
# print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape)
|
||||
# pdb.set_trace()
|
||||
|
||||
|
||||
|
||||
|
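ERes2Net_huge adds forward2, which returns either frame-level or utterance-level embeddings from the same trunk: with if_mean=False the first item's frames become the batch dimension, with if_mean=True the features are averaged over time before the linear layer. A hedged usage sketch (ERes2Net here means the class defined in ERes2Net_huge.py above; output shapes are approximate):

import torch

model = ERes2Net(feat_dim=80, embedding_size=192)   # the "huge" variant above
model.eval()

feats = torch.randn(1, 300, 80)                     # (batch, frames, fbank bins)
with torch.no_grad():
    utt = model.forward2(feats, if_mean=True)       # one embedding per utterance
    frames = model.forward2(feats, if_mean=False)   # one embedding per frame of item 0
print(utt.shape, frames.shape)                      # roughly (1, 192) and (T', 192)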
29  GPT_SoVITS/eres2net/fusion.py  Normal file
@@ -0,0 +1,29 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import torch
import torch.nn as nn


class AFF(nn.Module):

    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)

        self.local_att = nn.Sequential(
            nn.Conv2d(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.SiLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

    def forward(self, x, ds_y):
        xa = torch.cat((x, ds_y), dim=1)
        x_att = self.local_att(xa)
        x_att = 1.0 + torch.tanh(x_att)
        xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)

        return xo
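AFF fuses two equally shaped feature maps with a learned channel-wise gate: the gate 1 + tanh(.) lies in [0, 2], and the two branches receive complementary weights that always sum to 2, so overall feature scale is preserved. A small usage sketch with made-up shapes:

import torch

aff = AFF(channels=64, r=4)       # fuses two (B, 64, H, W) maps
x = torch.randn(2, 64, 20, 40)    # e.g. the current stage's output
y = torch.randn(2, 64, 20, 40)    # e.g. a downsampled earlier stage
out = aff(x, y)                   # same shape as the inputs
print(out.shape)                  # torch.Size([2, 64, 20, 40])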
819  GPT_SoVITS/eres2net/kaldi.py  Normal file
@@ -0,0 +1,819 @@
|
||||
import math
|
||||
from typing import Tuple
|
||||
|
||||
import torch
|
||||
import torchaudio
|
||||
from torch import Tensor
|
||||
|
||||
__all__ = [
|
||||
"get_mel_banks",
|
||||
"inverse_mel_scale",
|
||||
"inverse_mel_scale_scalar",
|
||||
"mel_scale",
|
||||
"mel_scale_scalar",
|
||||
"spectrogram",
|
||||
"fbank",
|
||||
"mfcc",
|
||||
"vtln_warp_freq",
|
||||
"vtln_warp_mel_freq",
|
||||
]
|
||||
|
||||
# numeric_limits<float>::epsilon() 1.1920928955078125e-07
|
||||
EPSILON = torch.tensor(torch.finfo(torch.float).eps)
|
||||
# 1 milliseconds = 0.001 seconds
|
||||
MILLISECONDS_TO_SECONDS = 0.001
|
||||
|
||||
# window types
|
||||
HAMMING = "hamming"
|
||||
HANNING = "hanning"
|
||||
POVEY = "povey"
|
||||
RECTANGULAR = "rectangular"
|
||||
BLACKMAN = "blackman"
|
||||
WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN]
|
||||
|
||||
|
||||
def _get_epsilon(device, dtype):
|
||||
return EPSILON.to(device=device, dtype=dtype)
|
||||
|
||||
|
||||
def _next_power_of_2(x: int) -> int:
|
||||
r"""Returns the smallest power of 2 that is greater than x"""
|
||||
return 1 if x == 0 else 2 ** (x - 1).bit_length()
|
||||
|
||||
|
||||
def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor:
|
||||
r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``)
|
||||
representing how the window is shifted along the waveform. Each row is a frame.
|
||||
|
||||
Args:
|
||||
waveform (Tensor): Tensor of size ``num_samples``
|
||||
window_size (int): Frame length
|
||||
window_shift (int): Frame shift
|
||||
snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
|
||||
in the file, and the number of frames depends on the frame_length. If False, the number of frames
|
||||
depends only on the frame_shift, and we reflect the data at the ends.
|
||||
|
||||
Returns:
|
||||
Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame
|
||||
"""
|
||||
assert waveform.dim() == 1
|
||||
num_samples = waveform.size(0)
|
||||
strides = (window_shift * waveform.stride(0), waveform.stride(0))
|
||||
|
||||
if snip_edges:
|
||||
if num_samples < window_size:
|
||||
return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device)
|
||||
else:
|
||||
m = 1 + (num_samples - window_size) // window_shift
|
||||
else:
|
||||
reversed_waveform = torch.flip(waveform, [0])
|
||||
m = (num_samples + (window_shift // 2)) // window_shift
|
||||
pad = window_size // 2 - window_shift // 2
|
||||
pad_right = reversed_waveform
|
||||
if pad > 0:
|
||||
# torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect'
|
||||
# but we want [2, 1, 0, 0, 1, 2]
|
||||
pad_left = reversed_waveform[-pad:]
|
||||
waveform = torch.cat((pad_left, waveform, pad_right), dim=0)
|
||||
else:
|
||||
# pad is negative so we want to trim the waveform at the front
|
||||
waveform = torch.cat((waveform[-pad:], pad_right), dim=0)
|
||||
|
||||
sizes = (m, window_size)
|
||||
return waveform.as_strided(sizes, strides)
|
||||
|
||||
|
||||
def _feature_window_function(
|
||||
window_type: str,
|
||||
window_size: int,
|
||||
blackman_coeff: float,
|
||||
device: torch.device,
|
||||
dtype: int,
|
||||
) -> Tensor:
|
||||
r"""Returns a window function with the given type and size"""
|
||||
if window_type == HANNING:
|
||||
return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
|
||||
elif window_type == HAMMING:
|
||||
return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
|
||||
elif window_type == POVEY:
|
||||
# like hanning but goes to zero at edges
|
||||
return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
|
||||
elif window_type == RECTANGULAR:
|
||||
return torch.ones(window_size, device=device, dtype=dtype)
|
||||
elif window_type == BLACKMAN:
|
||||
a = 2 * math.pi / (window_size - 1)
|
||||
window_function = torch.arange(window_size, device=device, dtype=dtype)
|
||||
# can't use torch.blackman_window as they use different coefficients
|
||||
return (
|
||||
blackman_coeff
|
||||
- 0.5 * torch.cos(a * window_function)
|
||||
+ (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)
|
||||
).to(device=device, dtype=dtype)
|
||||
else:
|
||||
raise Exception("Invalid window type " + window_type)
|
||||
|
||||
|
||||
def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor:
|
||||
r"""Returns the log energy of size (m) for a strided_input (m,*)"""
|
||||
device, dtype = strided_input.device, strided_input.dtype
|
||||
log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m)
|
||||
if energy_floor == 0.0:
|
||||
return log_energy
|
||||
return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype))
|
||||
|
||||
|
||||
def _get_waveform_and_window_properties(
|
||||
waveform: Tensor,
|
||||
channel: int,
|
||||
sample_frequency: float,
|
||||
frame_shift: float,
|
||||
frame_length: float,
|
||||
round_to_power_of_two: bool,
|
||||
preemphasis_coefficient: float,
|
||||
) -> Tuple[Tensor, int, int, int]:
|
||||
r"""Gets the waveform and window properties"""
|
||||
channel = max(channel, 0)
|
||||
assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0))
|
||||
waveform = waveform[channel, :] # size (n)
|
||||
window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
|
||||
window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
|
||||
padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
|
||||
|
||||
assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
|
||||
window_size, len(waveform)
|
||||
)
|
||||
assert 0 < window_shift, "`window_shift` must be greater than 0"
|
||||
assert padded_window_size % 2 == 0, (
|
||||
"the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`"
|
||||
)
|
||||
assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
|
||||
assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
|
||||
return waveform, window_shift, window_size, padded_window_size
|
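_get_waveform_and_window_properties converts the millisecond frame parameters into sample counts and, with round_to_power_of_two, rounds the FFT size up to a power of two. Worked numbers for the 25 ms / 10 ms defaults at 16 kHz, following directly from the formulas above:

sample_frequency = 16000.0
frame_shift, frame_length = 10.0, 25.0      # milliseconds, the defaults used below
MILLISECONDS_TO_SECONDS = 0.001

window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)   # 160 samples
window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)   # 400 samples

def _next_power_of_2(x: int) -> int:
    return 1 if x == 0 else 2 ** (x - 1).bit_length()

padded_window_size = _next_power_of_2(window_size)   # 512, the FFT size when rounding is enabled
print(window_shift, window_size, padded_window_size)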
||||
|
||||
|
||||
def _get_window(
    waveform: Tensor,
    padded_window_size: int,
    window_size: int,
    window_shift: int,
    window_type: str,
    blackman_coeff: float,
    snip_edges: bool,
    raw_energy: bool,
    energy_floor: float,
    dither: float,
    remove_dc_offset: bool,
    preemphasis_coefficient: float,
) -> Tuple[Tensor, Tensor]:
    r"""Gets a window and its log energy

    Returns:
        (Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m)
    """
    device, dtype = waveform.device, waveform.dtype
    epsilon = _get_epsilon(device, dtype)

    # size (m, window_size)
    strided_input = _get_strided(waveform, window_size, window_shift, snip_edges)

    if dither != 0.0:
        rand_gauss = torch.randn(strided_input.shape, device=device, dtype=dtype)
        strided_input = strided_input + rand_gauss * dither

    if remove_dc_offset:
        # Subtract each row/frame by its mean
        row_means = torch.mean(strided_input, dim=1).unsqueeze(1)  # size (m, 1)
        strided_input = strided_input - row_means

    if raw_energy:
        # Compute the log energy of each row/frame before applying preemphasis and
        # window function
        signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor)  # size (m)

    if preemphasis_coefficient != 0.0:
        # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j
        offset_strided_input = torch.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0), mode="replicate"
        ).squeeze(0)  # size (m, window_size + 1)
        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]

    # Apply window_function to each row/frame
    window_function = _feature_window_function(
        window_type, window_size, blackman_coeff, device, dtype
    ).unsqueeze(0)  # size (1, window_size)
    strided_input = strided_input * window_function  # size (m, window_size)

    # Pad columns with zero until we reach size (m, padded_window_size)
    if padded_window_size != window_size:
        padding_right = padded_window_size - window_size
        strided_input = torch.nn.functional.pad(
            strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0
        ).squeeze(0)

    # Compute energy after window function (not the raw one)
    if not raw_energy:
        signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor)  # size (m)

    return strided_input, signal_log_energy

def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    # subtracts the column mean of the tensor size (m, n) if subtract_mean=True
    # it returns size (m, n)
    if subtract_mean:
        col_means = torch.mean(tensor, dim=0).unsqueeze(0)
        tensor = tensor - col_means
    return tensor

def spectrogram(
    waveform: Tensor,
    blackman_coeff: float = 0.42,
    channel: int = -1,
    dither: float = 0.0,
    energy_floor: float = 1.0,
    frame_length: float = 25.0,
    frame_shift: float = 10.0,
    min_duration: float = 0.0,
    preemphasis_coefficient: float = 0.97,
    raw_energy: bool = True,
    remove_dc_offset: bool = True,
    round_to_power_of_two: bool = True,
    sample_frequency: float = 16000.0,
    snip_edges: bool = True,
    subtract_mean: bool = False,
    window_type: str = POVEY,
) -> Tensor:
    r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
    compute-spectrogram-feats.

    Args:
        waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
        channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
        dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
            the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
        energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
            this floor is applied to the zeroth component, representing the total signal energy. The floor on the
            individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
        frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
        frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
        min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
        preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
        raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
        remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. (Default: ``True``)
        sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
            specified there) (Default: ``16000.0``)
        snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
            in the file, and the number of frames depends on the frame_length. If False, the number of frames
            depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
        subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
            it this way. (Default: ``False``)
        window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
            (Default: ``'povey'``)

    Returns:
        Tensor: A spectrogram identical to what Kaldi would output. The shape is
        (m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided
    """
    device, dtype = waveform.device, waveform.dtype
    epsilon = _get_epsilon(device, dtype)

    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
    )

    if len(waveform) < min_duration * sample_frequency:
        # signal is too short
        return torch.empty(0)

    strided_input, signal_log_energy = _get_window(
        waveform,
        padded_window_size,
        window_size,
        window_shift,
        window_type,
        blackman_coeff,
        snip_edges,
        raw_energy,
        energy_floor,
        dither,
        remove_dc_offset,
        preemphasis_coefficient,
    )

    # size (m, padded_window_size // 2 + 1), complex
    fft = torch.fft.rfft(strided_input)

    # Convert the FFT into a power spectrum
    power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log()  # size (m, padded_window_size // 2 + 1)
    power_spectrum[:, 0] = signal_log_energy

    power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
    return power_spectrum

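# Usage sketch (the random signal is a placeholder, not repo data):
# >>> sig = torch.randn(1, 16000)  # (channels, samples), one second at 16 kHz
# >>> spectrogram(sig, dither=0.0).shape
# torch.Size([98, 257])  # 98 frames of 512 // 2 + 1 log-power bins, log energy in column 0
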
def inverse_mel_scale_scalar(mel_freq: float) -> float:
    return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)


def inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)


def mel_scale_scalar(freq: float) -> float:
    return 1127.0 * math.log(1.0 + freq / 700.0)


def mel_scale(freq: Tensor) -> Tensor:
    return 1127.0 * (1.0 + freq / 700.0).log()

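# The tensor variants are exact inverses of one another; a quick round-trip check (sketch):
# >>> freqs = torch.tensor([20.0, 1000.0, 7600.0])
# >>> bool(torch.allclose(inverse_mel_scale(mel_scale(freqs)), freqs, atol=1e-2))
# True
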
def vtln_warp_freq(
|
||||
vtln_low_cutoff: float,
|
||||
vtln_high_cutoff: float,
|
||||
low_freq: float,
|
||||
high_freq: float,
|
||||
vtln_warp_factor: float,
|
||||
freq: Tensor,
|
||||
) -> Tensor:
|
||||
r"""This computes a VTLN warping function that is not the same as HTK's one,
|
||||
but has similar inputs (this function has the advantage of never producing
|
||||
empty bins).
|
||||
|
||||
This function computes a warp function F(freq), defined between low_freq
|
||||
and high_freq inclusive, with the following properties:
|
||||
F(low_freq) == low_freq
|
||||
F(high_freq) == high_freq
|
||||
The function is continuous and piecewise linear with two inflection
|
||||
points.
|
||||
The lower inflection point (measured in terms of the unwarped
|
||||
frequency) is at frequency l, determined as described below.
|
||||
The higher inflection point is at a frequency h, determined as
|
||||
described below.
|
||||
If l <= f <= h, then F(f) = f/vtln_warp_factor.
|
||||
If the higher inflection point (measured in terms of the unwarped
|
||||
frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
|
||||
Since (by the last point) F(h) == h/vtln_warp_factor, then
|
||||
max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
|
||||
h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
|
||||
= vtln_high_cutoff * min(1, vtln_warp_factor).
|
||||
If the lower inflection point (measured in terms of the unwarped
|
||||
frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
|
||||
This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
|
||||
= vtln_low_cutoff * max(1, vtln_warp_factor)
|
||||
Args:
|
||||
vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
|
||||
vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
|
||||
low_freq (float): Lower frequency cutoffs in mel computation
|
||||
high_freq (float): Upper frequency cutoffs in mel computation
|
||||
vtln_warp_factor (float): Vtln warp factor
|
||||
freq (Tensor): given frequency in Hz
|
||||
|
||||
Returns:
|
||||
Tensor: Freq after vtln warp
|
||||
"""
|
||||
assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq"
|
||||
assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]"
|
||||
l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
|
||||
h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
|
||||
scale = 1.0 / vtln_warp_factor
|
||||
Fl = scale * l # F(l)
|
||||
Fh = scale * h # F(h)
|
||||
assert l > low_freq and h < high_freq
|
||||
# slope of left part of the 3-piece linear function
|
||||
scale_left = (Fl - low_freq) / (l - low_freq)
|
||||
# [slope of center part is just "scale"]
|
||||
|
||||
# slope of right part of the 3-piece linear function
|
||||
scale_right = (high_freq - Fh) / (high_freq - h)
|
||||
|
||||
res = torch.empty_like(freq)
|
||||
|
||||
outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq
|
||||
before_l = torch.lt(freq, l) # freq < l
|
||||
before_h = torch.lt(freq, h) # freq < h
|
||||
after_h = torch.ge(freq, h) # freq >= h
|
||||
|
||||
# order of operations matter here (since there is overlapping frequency regions)
|
||||
res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
|
||||
res[before_h] = scale * freq[before_h]
|
||||
res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
|
||||
res[outside_low_high_freq] = freq[outside_low_high_freq]
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def vtln_warp_mel_freq(
    vtln_low_cutoff: float,
    vtln_high_cutoff: float,
    low_freq,
    high_freq: float,
    vtln_warp_factor: float,
    mel_freq: Tensor,
) -> Tensor:
    r"""
    Args:
        vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
        vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
        low_freq (float): Lower frequency cutoffs in mel computation
        high_freq (float): Upper frequency cutoffs in mel computation
        vtln_warp_factor (float): Vtln warp factor
        mel_freq (Tensor): Given frequency in Mel

    Returns:
        Tensor: ``mel_freq`` after vtln warp
    """
    return mel_scale(
        vtln_warp_freq(
            vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq)
        )
    )

def get_mel_banks(
    num_bins: int,
    window_length_padded: int,
    sample_freq: float,
    low_freq: float,
    high_freq: float,
    vtln_low: float,
    vtln_high: float,
    vtln_warp_factor: float,
    device=None,
    dtype=None,
) -> Tuple[Tensor, Tensor]:
    """
    Returns:
        (Tensor, Tensor): The tuple consists of ``bins`` (which is
        melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is
        center frequencies of bins of size (``num_bins``)).
    """
    assert num_bins > 3, "Must have at least 3 mel bins"
    assert window_length_padded % 2 == 0
    num_fft_bins = window_length_padded / 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (
        (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq)
    ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)

    # fft-bin width [think of it as Nyquist-freq / half-window-length]
    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = mel_scale_scalar(low_freq)
    mel_high_freq = mel_scale_scalar(high_freq)

    # divide by num_bins+1 in next line because of end-effects where the bins
    # spread out to the sides.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or (
        (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
    ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format(
        vtln_low, vtln_high, low_freq, high_freq
    )

    bin = torch.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin * mel_freq_delta  # size(num_bins, 1)
    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # size(num_bins, 1)
    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # size(num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel)
        center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel)
        right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel)

    # center_freqs = inverse_mel_scale(center_mel)  # size (num_bins)
    # size(1, num_fft_bins)
    mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0)

    # size (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
        bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
    else:
        # warping can move the order of left_mel, center_mel, right_mel anywhere
        bins = torch.zeros_like(up_slope)
        up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel)  # left_mel < mel <= center_mel
        down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel)  # center_mel < mel < right_mel
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins.to(device=device, dtype=dtype)  # , center_freqs

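# Shape sketch for the values fbank() below would pass when asked for 23 bins over a 512-point
# padded window at 16 kHz (the trailing device/dtype arguments appear to be this copy's addition
# over the upstream torchaudio helper):
# >>> banks = get_mel_banks(23, 512, 16000.0, 20.0, 0.0, 100.0, -500.0, 1.0, torch.device("cpu"), torch.float32)
# >>> banks.shape
# torch.Size([23, 256])
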
cache = {}


def fbank(
    waveform: Tensor,
    blackman_coeff: float = 0.42,
    channel: int = -1,
    dither: float = 0.0,
    energy_floor: float = 1.0,
    frame_length: float = 25.0,
    frame_shift: float = 10.0,
    high_freq: float = 0.0,
    htk_compat: bool = False,
    low_freq: float = 20.0,
    min_duration: float = 0.0,
    num_mel_bins: int = 23,
    preemphasis_coefficient: float = 0.97,
    raw_energy: bool = True,
    remove_dc_offset: bool = True,
    round_to_power_of_two: bool = True,
    sample_frequency: float = 16000.0,
    snip_edges: bool = True,
    subtract_mean: bool = False,
    use_energy: bool = False,
    use_log_fbank: bool = True,
    use_power: bool = True,
    vtln_high: float = -500.0,
    vtln_low: float = 100.0,
    vtln_warp: float = 1.0,
    window_type: str = POVEY,
) -> Tensor:
    r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
    compute-fbank-feats.

    Args:
        waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
        channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
        dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
            the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
        energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
            this floor is applied to the zeroth component, representing the total signal energy. The floor on the
            individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
        frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
        frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
        high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
            (Default: ``0.0``)
        htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features
            (need to change other parameters). (Default: ``False``)
        low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
        min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
        num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
        preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
        raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
        remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. (Default: ``True``)
        sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
            specified there) (Default: ``16000.0``)
        snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
            in the file, and the number of frames depends on the frame_length. If False, the number of frames
            depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
        subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
            it this way. (Default: ``False``)
        use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
        use_log_fbank (bool, optional): If true, produce log-filterbank, else produce linear. (Default: ``True``)
        use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``)
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
            negative, offset from high-mel-freq) (Default: ``-500.0``)
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
        vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
        window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
            (Default: ``'povey'``)

    Returns:
        Tensor: A fbank identical to what Kaldi would output. The shape is (m, ``num_mel_bins + use_energy``)
        where m is calculated in _get_strided
    """
    device, dtype = waveform.device, waveform.dtype

    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
    )

    if len(waveform) < min_duration * sample_frequency:
        # signal is too short
        return torch.empty(0, device=device, dtype=dtype)

    # strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
    strided_input, signal_log_energy = _get_window(
        waveform,
        padded_window_size,
        window_size,
        window_shift,
        window_type,
        blackman_coeff,
        snip_edges,
        raw_energy,
        energy_floor,
        dither,
        remove_dc_offset,
        preemphasis_coefficient,
    )

    # size (m, padded_window_size // 2 + 1)
    spectrum = torch.fft.rfft(strided_input).abs()
    if use_power:
        spectrum = spectrum.pow(2.0)

    # size (num_mel_bins, padded_window_size // 2)
    # print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
    # the mel filterbank only depends on these parameters, so build it once per
    # parameter/device/dtype combination and reuse it across calls
    cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
        num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp, device, dtype,
    )
    if cache_key not in cache:
        mel_energies = get_mel_banks(
            num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp, device, dtype
        )
        cache[cache_key] = mel_energies
    else:
        mel_energies = cache[cache_key]

    # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
    mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)

    # sum with mel filterbanks over the power spectrum, size (m, num_mel_bins)
    mel_energies = torch.mm(spectrum, mel_energies.T)
    if use_log_fbank:
        # avoid log of zero (which should be prevented anyway by dithering)
        mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log()

    # if use_energy then add it as the last column for htk_compat == true else first column
    if use_energy:
        signal_log_energy = signal_log_energy.unsqueeze(1)  # size (m, 1)
        # returns size (m, num_mel_bins + 1)
        if htk_compat:
            mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1)
        else:
            mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)

    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
    return mel_energies

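# Usage sketch (the 80-bin, 16 kHz settings are assumed for illustration, not taken from a
# specific caller): repeated calls with the same parameters, device and dtype reuse the cached
# mel filterbank instead of rebuilding it per utterance.
# >>> wav = torch.randn(1, 32000)  # two seconds of mono audio at 16 kHz
# >>> fbank(wav, num_mel_bins=80, sample_frequency=16000.0, dither=0.0).shape
# torch.Size([198, 80])
# >>> len(cache)  # one (80, 256) filterbank stored under this parameter/device/dtype key
# 1
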
def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor:
    # returns a dct matrix of size (num_mel_bins, num_ceps)
    # size (num_mel_bins, num_mel_bins)
    dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho")
    # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins)
    # this would be the first column in the dct_matrix for torchaudio as it expects a
    # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi
    # expects a left multiply e.g. dct_matrix * vector).
    dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins))
    dct_matrix = dct_matrix[:, :num_ceps]
    return dct_matrix


def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor:
    # returns size (num_ceps)
    # Compute liftering coefficients (scaling on cepstral coeffs)
    # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
    i = torch.arange(num_ceps)
    return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter)

def mfcc(
|
||||
waveform: Tensor,
|
||||
blackman_coeff: float = 0.42,
|
||||
cepstral_lifter: float = 22.0,
|
||||
channel: int = -1,
|
||||
dither: float = 0.0,
|
||||
energy_floor: float = 1.0,
|
||||
frame_length: float = 25.0,
|
||||
frame_shift: float = 10.0,
|
||||
high_freq: float = 0.0,
|
||||
htk_compat: bool = False,
|
||||
low_freq: float = 20.0,
|
||||
num_ceps: int = 13,
|
||||
min_duration: float = 0.0,
|
||||
num_mel_bins: int = 23,
|
||||
preemphasis_coefficient: float = 0.97,
|
||||
raw_energy: bool = True,
|
||||
remove_dc_offset: bool = True,
|
||||
round_to_power_of_two: bool = True,
|
||||
sample_frequency: float = 16000.0,
|
||||
snip_edges: bool = True,
|
||||
subtract_mean: bool = False,
|
||||
use_energy: bool = False,
|
||||
vtln_high: float = -500.0,
|
||||
vtln_low: float = 100.0,
|
||||
vtln_warp: float = 1.0,
|
||||
window_type: str = POVEY,
|
||||
) -> Tensor:
|
||||
r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's
|
||||
compute-mfcc-feats.
|
||||
|
||||
Args:
|
||||
waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
|
||||
blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
|
||||
cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``)
|
||||
channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
|
||||
dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
|
||||
the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
|
||||
energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
|
||||
this floor is applied to the zeroth component, representing the total signal energy. The floor on the
|
||||
individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
|
||||
frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
|
||||
frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
|
||||
high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
|
||||
(Default: ``0.0``)
|
||||
htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible
|
||||
features (need to change other parameters). (Default: ``False``)
|
||||
low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
|
||||
num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``)
|
||||
min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
|
||||
num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
|
||||
preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
|
||||
raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
|
||||
remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
|
||||
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
|
||||
to FFT. (Default: ``True``)
|
||||
sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
|
||||
specified there) (Default: ``16000.0``)
|
||||
snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
|
||||
in the file, and the number of frames depends on the frame_length. If False, the number of frames
|
||||
depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
|
||||
subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
|
||||
it this way. (Default: ``False``)
|
||||
use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
|
||||
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
|
||||
negative, offset from high-mel-freq (Default: ``-500.0``)
|
||||
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
|
||||
vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
|
||||
window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
|
||||
(Default: ``"povey"``)
|
||||
|
||||
Returns:
|
||||
Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``)
|
||||
where m is calculated in _get_strided
|
||||
"""
|
||||
assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins)
|
||||
|
||||
device, dtype = waveform.device, waveform.dtype
|
||||
|
||||
# The mel_energies should not be squared (use_power=True), not have mean subtracted
|
||||
# (subtract_mean=False), and use log (use_log_fbank=True).
|
||||
# size (m, num_mel_bins + use_energy)
|
||||
feature = fbank(
|
||||
waveform=waveform,
|
||||
blackman_coeff=blackman_coeff,
|
||||
channel=channel,
|
||||
dither=dither,
|
||||
energy_floor=energy_floor,
|
||||
frame_length=frame_length,
|
||||
frame_shift=frame_shift,
|
||||
high_freq=high_freq,
|
||||
htk_compat=htk_compat,
|
||||
low_freq=low_freq,
|
||||
min_duration=min_duration,
|
||||
num_mel_bins=num_mel_bins,
|
||||
preemphasis_coefficient=preemphasis_coefficient,
|
||||
raw_energy=raw_energy,
|
||||
remove_dc_offset=remove_dc_offset,
|
||||
round_to_power_of_two=round_to_power_of_two,
|
||||
sample_frequency=sample_frequency,
|
||||
snip_edges=snip_edges,
|
||||
subtract_mean=False,
|
||||
use_energy=use_energy,
|
||||
use_log_fbank=True,
|
||||
use_power=True,
|
||||
vtln_high=vtln_high,
|
||||
vtln_low=vtln_low,
|
||||
vtln_warp=vtln_warp,
|
||||
window_type=window_type,
|
||||
)
|
||||
|
||||
if use_energy:
|
||||
# size (m)
|
||||
signal_log_energy = feature[:, num_mel_bins if htk_compat else 0]
|
||||
# offset is 0 if htk_compat==True else 1
|
||||
mel_offset = int(not htk_compat)
|
||||
feature = feature[:, mel_offset : (num_mel_bins + mel_offset)]
|
||||
|
||||
# size (num_mel_bins, num_ceps)
|
||||
dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device)
|
||||
|
||||
# size (m, num_ceps)
|
||||
feature = feature.matmul(dct_matrix)
|
||||
|
||||
if cepstral_lifter != 0.0:
|
||||
# size (1, num_ceps)
|
||||
lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0)
|
||||
feature *= lifter_coeffs.to(device=device, dtype=dtype)
|
||||
|
||||
# if use_energy then replace the last column for htk_compat == true else first column
|
||||
if use_energy:
|
||||
feature[:, 0] = signal_log_energy
|
||||
|
||||
if htk_compat:
|
||||
energy = feature[:, 0].unsqueeze(1) # size (m, 1)
|
||||
feature = feature[:, 1:] # size (m, num_ceps - 1)
|
||||
if not use_energy:
|
||||
# scale on C0 (actually removing a scale we previously added that's
|
||||
# part of one common definition of the cosine transform.)
|
||||
energy *= math.sqrt(2)
|
||||
|
||||
feature = torch.cat((feature, energy), dim=1)
|
||||
|
||||
feature = _subtract_column_mean(feature, subtract_mean)
|
||||
return feature
|
GPT_SoVITS/eres2net/pooling_layers.py (new file, 104 lines)
@ -0,0 +1,104 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""

import torch
import torch.nn as nn


class TAP(nn.Module):
    """
    Temporal average pooling, only first-order mean is considered
    """

    def __init__(self, **kwargs):
        super(TAP, self).__init__()

    def forward(self, x):
        pooling_mean = x.mean(dim=-1)
        # To be compatible with 2D input
        pooling_mean = pooling_mean.flatten(start_dim=1)
        return pooling_mean


class TSDP(nn.Module):
    """
    Temporal standard deviation pooling, only second-order std is considered
    """

    def __init__(self, **kwargs):
        super(TSDP, self).__init__()

    def forward(self, x):
        # The last dimension is the temporal axis
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
        pooling_std = pooling_std.flatten(start_dim=1)
        return pooling_std


class TSTP(nn.Module):
    """
    Temporal statistics pooling, concatenating mean and std, as used in the
    x-vector architecture.
    Comment: simple concatenation cannot make full use of both statistics
    """

    def __init__(self, **kwargs):
        super(TSTP, self).__init__()

    def forward(self, x):
        # The last dimension is the temporal axis
        pooling_mean = x.mean(dim=-1)
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
        pooling_mean = pooling_mean.flatten(start_dim=1)
        pooling_std = pooling_std.flatten(start_dim=1)

        stats = torch.cat((pooling_mean, pooling_std), 1)
        return stats


class ASTP(nn.Module):
    """
    Attentive statistics pooling: channel- and context-dependent
    statistics pooling, first used in ECAPA_TDNN.
    """

    def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
        super(ASTP, self).__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear, then we don't
        # need to transpose inputs.
        if global_context_att:
            self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
        else:
            self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        """
        x: a 3-dimensional tensor in tdnn-based architecture (B, F, T)
           or a 4-dimensional tensor in resnet architecture (B, C, F, T)
           0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
        """
        if len(x.shape) == 4:
            x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
        assert len(x.shape) == 3

        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            x_in = torch.cat((x, context_mean, context_std), dim=1)
        else:
            x_in = x

        # Don't use ReLU here: it can make this attention hard to converge.
        alpha = torch.tanh(self.linear1(x_in))  # alpha = F.relu(self.linear1(x_in))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        var = torch.sum(alpha * (x**2), dim=2) - mean**2
        std = torch.sqrt(var.clamp(min=1e-10))
        return torch.cat([mean, std], dim=1)
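# Shape sketch for these pooling heads (sizes are made up for illustration): on a (B, F, T)
# input, TSTP and ASTP both concatenate a mean-like and a std-like statistic over time,
# returning (B, 2 * F).
# >>> x = torch.randn(4, 192, 120)
# >>> TSTP()(x).shape
# torch.Size([4, 384])
# >>> ASTP(in_dim=192)(x).shape
# torch.Size([4, 384])
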
@ -331,7 +331,7 @@ class VitsModel(nn.Module):
    def __init__(self, vits_path):
        super().__init__()
        # dict_s2 = torch.load(vits_path,map_location="cpu")
        dict_s2 = torch.load(vits_path)
        dict_s2 = torch.load(vits_path, weights_only=False)
        self.hps = dict_s2["config"]
        if dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
            self.hps["model"]["version"] = "v1"
@ -645,7 +645,7 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be

    # gpt_path = "GPT_weights_v2/xw-e15.ckpt"
    # dict_s1 = torch.load(gpt_path, map_location=device)
    dict_s1 = torch.load(gpt_path)
    dict_s1 = torch.load(gpt_path, weights_only=False)
    raw_t2s = get_raw_t2s_model(dict_s1).to(device)
    print("#### get_raw_t2s_model ####")
    print(raw_t2s.config)

@ -10,7 +10,7 @@ from inference_webui import get_phones_and_bert
|
||||
import librosa
|
||||
from module import commons
|
||||
from module.mel_processing import mel_spectrogram_torch
|
||||
from module.models_onnx import CFM, SynthesizerTrnV3
|
||||
from module.models_onnx import CFM, Generator, SynthesizerTrnV3
|
||||
import numpy as np
|
||||
import torch._dynamo.config
|
||||
import torchaudio
|
||||
@ -46,7 +46,7 @@ class MelSpectrgram(torch.nn.Module):
|
||||
center=False,
|
||||
):
|
||||
super().__init__()
|
||||
self.hann_window = torch.hann_window(1024).to(device=device, dtype=dtype)
|
||||
self.hann_window = torch.hann_window(win_size).to(device=device, dtype=dtype)
|
||||
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
|
||||
self.mel_basis = torch.from_numpy(mel).to(dtype=dtype, device=device)
|
||||
self.n_fft: int = n_fft
|
||||
@ -189,6 +189,19 @@ mel_fn = lambda x: mel_spectrogram_torch(
|
||||
"center": False,
|
||||
},
|
||||
)
|
||||
mel_fn_v4 = lambda x: mel_spectrogram_torch(
|
||||
x,
|
||||
**{
|
||||
"n_fft": 1280,
|
||||
"win_size": 1280,
|
||||
"hop_size": 320,
|
||||
"num_mels": 100,
|
||||
"sampling_rate": 32000,
|
||||
"fmin": 0,
|
||||
"fmax": None,
|
||||
"center": False,
|
||||
},
|
||||
)
|
||||
|
||||
spec_min = -12
|
||||
spec_max = 2
|
||||
@ -285,6 +298,84 @@ class ExportGPTSovitsHalf(torch.nn.Module):
|
||||
return fea_ref, fea_todo, mel2
|
||||
|
||||
|
||||
class ExportGPTSovitsV4Half(torch.nn.Module):
|
||||
def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3):
|
||||
super().__init__()
|
||||
self.hps = hps
|
||||
self.t2s_m = t2s_m
|
||||
self.vq_model = vq_model
|
||||
self.mel2 = MelSpectrgram(
|
||||
dtype=torch.float32,
|
||||
device=device,
|
||||
n_fft=1280,
|
||||
num_mels=100,
|
||||
sampling_rate=32000,
|
||||
hop_size=320,
|
||||
win_size=1280,
|
||||
fmin=0,
|
||||
fmax=None,
|
||||
center=False,
|
||||
)
|
||||
# self.dtype = dtype
|
||||
self.filter_length: int = hps.data.filter_length
|
||||
self.sampling_rate: int = hps.data.sampling_rate
|
||||
self.hop_length: int = hps.data.hop_length
|
||||
self.win_length: int = hps.data.win_length
|
||||
|
||||
def forward(
|
||||
self,
|
||||
ssl_content,
|
||||
ref_audio_32k: torch.FloatTensor,
|
||||
phoneme_ids0,
|
||||
phoneme_ids1,
|
||||
bert1,
|
||||
bert2,
|
||||
top_k,
|
||||
):
|
||||
refer = spectrogram_torch(
|
||||
ref_audio_32k,
|
||||
self.filter_length,
|
||||
self.sampling_rate,
|
||||
self.hop_length,
|
||||
self.win_length,
|
||||
center=False,
|
||||
).to(ssl_content.dtype)
|
||||
|
||||
codes = self.vq_model.extract_latent(ssl_content)
|
||||
prompt_semantic = codes[0, 0]
|
||||
prompt = prompt_semantic.unsqueeze(0)
|
||||
# print('extract_latent',codes.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
pred_semantic = self.t2s_m(prompt, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k)
|
||||
# print('t2s_m',pred_semantic.shape,datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
ge = self.vq_model.create_ge(refer)
|
||||
# print('create_ge',datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
prompt_ = prompt.unsqueeze(0)
|
||||
fea_ref = self.vq_model(prompt_, phoneme_ids0, ge)
|
||||
# print('fea_ref',datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
# print(prompt_.shape, phoneme_ids0.shape, ge.shape)
|
||||
# print(fea_ref.shape)
|
||||
|
||||
ref_32k = ref_audio_32k
|
||||
mel2 = norm_spec(self.mel2(ref_32k)).to(ssl_content.dtype)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
mel2 = mel2[:, :, :T_min]
|
||||
fea_ref = fea_ref[:, :, :T_min]
|
||||
if T_min > 500:
|
||||
mel2 = mel2[:, :, -500:]
|
||||
fea_ref = fea_ref[:, :, -500:]
|
||||
T_min = 500
|
||||
|
||||
fea_todo = self.vq_model(pred_semantic, phoneme_ids1, ge)
|
||||
# print('fea_todo',datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
# print(pred_semantic.shape, phoneme_ids1.shape, ge.shape)
|
||||
# print(fea_todo.shape)
|
||||
|
||||
return fea_ref, fea_todo, mel2
|
||||
|
||||
|
||||
class GPTSoVITSV3(torch.nn.Module):
|
||||
def __init__(self, gpt_sovits_half, cfm, bigvgan):
|
||||
super().__init__()
|
||||
@ -311,6 +402,7 @@ class GPTSoVITSV3(torch.nn.Module):
|
||||
chunk_len = 934 - fea_ref.shape[2]
|
||||
wav_gen_list = []
|
||||
idx = 0
|
||||
fea_todo = fea_todo[:,:,:-5]
|
||||
wav_gen_length = fea_todo.shape[2] * 256
|
||||
while 1:
|
||||
# current_time = datetime.now()
|
||||
@ -342,6 +434,65 @@ class GPTSoVITSV3(torch.nn.Module):
|
||||
|
||||
wav_gen = torch.cat(wav_gen_list, 2)
|
||||
return wav_gen[0][0][:wav_gen_length]
|
||||
|
||||
class GPTSoVITSV4(torch.nn.Module):
    def __init__(self, gpt_sovits_half, cfm, hifigan):
        super().__init__()
        self.gpt_sovits_half = gpt_sovits_half
        self.cfm = cfm
        self.hifigan = hifigan

    def forward(
        self,
        ssl_content,
        ref_audio_32k: torch.FloatTensor,
        phoneme_ids0: torch.LongTensor,
        phoneme_ids1: torch.LongTensor,
        bert1,
        bert2,
        top_k: torch.LongTensor,
        sample_steps: torch.LongTensor,
    ):
        # current_time = datetime.now()
        # print("gpt_sovits_half",current_time.strftime("%Y-%m-%d %H:%M:%S"))
        fea_ref, fea_todo, mel2 = self.gpt_sovits_half(
            ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k
        )
        chunk_len = 1000 - fea_ref.shape[2]
        wav_gen_list = []
        idx = 0
        fea_todo = fea_todo[:, :, :-10]
        wav_gen_length = fea_todo.shape[2] * 480
        while True:
            # current_time = datetime.now()
            # print("idx:",idx,current_time.strftime("%Y-%m-%d %H:%M:%S"))
            fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len]
            if fea_todo_chunk.shape[-1] == 0:
                break

            # The exported model seems to recompile (or otherwise stall for about 10 s) whenever
            # the input shape changes, so pad with zeros here to keep the chunk shape constant.
            # The padding makes the generated audio too long, so it is trimmed at the end;
            # after hifigan the audio length is fea_todo.shape[2] * 480.
            complete_len = chunk_len - fea_todo_chunk.shape[-1]
            if complete_len != 0:
                fea_todo_chunk = torch.cat(
                    [
                        fea_todo_chunk,
                        torch.zeros(1, 512, complete_len).to(fea_todo_chunk.device).to(fea_todo_chunk.dtype),
                    ],
                    2,
                )

            cfm_res, fea_ref, mel2 = self.cfm(fea_ref, fea_todo_chunk, mel2, sample_steps)
            idx += chunk_len

            cfm_res = denorm_spec(cfm_res)
            hifigan_res = self.hifigan(cfm_res)
            wav_gen_list.append(hifigan_res)

        wav_gen = torch.cat(wav_gen_list, 2)
        return wav_gen[0][0][:wav_gen_length]

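# The chunk/trim bookkeeping above can be checked in isolation (made-up sizes; 480 is the
# total upsampling factor 10*6*2*2*2 of the v4 vocoder configured in init_hifigan below):
# >>> fea_len, chunk_len = 730, 620
# >>> n_chunks = -(-fea_len // chunk_len)   # 2 chunks, the last one zero-padded
# >>> n_chunks * chunk_len * 480            # samples produced before trimming
# 595200
# >>> fea_len * 480                         # length kept after the final slice
# 350400
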
def init_bigvgan():
|
||||
@ -361,6 +512,31 @@ def init_bigvgan():
|
||||
bigvgan_model = bigvgan_model.to(device)
|
||||
|
||||
|
||||
def init_hifigan():
|
||||
global hifigan_model, bigvgan_model
|
||||
hifigan_model = Generator(
|
||||
initial_channel=100,
|
||||
resblock="1",
|
||||
resblock_kernel_sizes=[3, 7, 11],
|
||||
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
||||
upsample_rates=[10, 6, 2, 2, 2],
|
||||
upsample_initial_channel=512,
|
||||
upsample_kernel_sizes=[20, 12, 4, 4, 4],
|
||||
gin_channels=0,
|
||||
is_bias=True,
|
||||
)
|
||||
hifigan_model.eval()
|
||||
hifigan_model.remove_weight_norm()
|
||||
state_dict_g = torch.load(
|
||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu"
|
||||
)
|
||||
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
||||
if is_half == True:
|
||||
hifigan_model = hifigan_model.half().to(device)
|
||||
else:
|
||||
hifigan_model = hifigan_model.to(device)
|
||||
|
||||
|
||||
class Sovits:
|
||||
def __init__(self, vq_model: SynthesizerTrnV3, cfm: CFM, hps):
|
||||
self.vq_model = vq_model
|
||||
@ -399,6 +575,7 @@ class DictToAttrRecursive(dict):
|
||||
|
||||
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
||||
|
||||
v3v4set = {"v3", "v4"}
|
||||
|
||||
def get_sovits_weights(sovits_path):
|
||||
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||
@ -419,8 +596,8 @@ def get_sovits_weights(sovits_path):
|
||||
else:
|
||||
hps.model.version = "v2"
|
||||
|
||||
if model_version == "v3":
|
||||
hps.model.version = "v3"
|
||||
if model_version in v3v4set:
|
||||
hps.model.version = model_version
|
||||
|
||||
logger.info(f"hps: {hps}")
|
||||
|
||||
@ -522,10 +699,14 @@ def export_cfm(
|
||||
return export_cfm
|
||||
|
||||
|
||||
def export():
|
||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
|
||||
|
||||
init_bigvgan()
|
||||
def export_1(ref_wav_path,ref_wav_text,version="v3"):
|
||||
if version == "v3":
|
||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
|
||||
init_bigvgan()
|
||||
else:
|
||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
|
||||
init_hifigan()
|
||||
|
||||
|
||||
dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt")
|
||||
raw_t2s = get_raw_t2s_model(dict_s1).to(device)
|
||||
@ -540,9 +721,9 @@ def export():
|
||||
script_t2s = torch.jit.script(t2s_m).to(device)
|
||||
|
||||
hps = sovits.hps
|
||||
ref_wav_path = "onnx/ad/ref.wav"
|
||||
# ref_wav_path = "onnx/ad/ref.wav"
|
||||
speed = 1.0
|
||||
sample_steps = 32
|
||||
sample_steps = 8
|
||||
dtype = torch.float16 if is_half == True else torch.float32
|
||||
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
|
||||
zero_wav = np.zeros(
|
||||
@ -567,8 +748,11 @@ def export():
|
||||
prompt_semantic = codes[0, 0]
|
||||
prompt = prompt_semantic.unsqueeze(0).to(device)
|
||||
|
||||
# phones1, bert1, norm_text1 = get_phones_and_bert(
|
||||
# "你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3"
|
||||
# )
|
||||
phones1, bert1, norm_text1 = get_phones_and_bert(
|
||||
"你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。", "all_zh", "v3"
|
||||
ref_wav_text, "auto", "v3"
|
||||
)
|
||||
phones2, bert2, norm_text2 = get_phones_and_bert(
|
||||
"这是一个简单的示例,真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
|
||||
@ -634,25 +818,33 @@ def export():
|
||||
# vq_model = sovits.vq_model
|
||||
vq_model = trace_vq_model
|
||||
|
||||
gpt_sovits_half = ExportGPTSovitsHalf(sovits.hps, script_t2s, trace_vq_model)
|
||||
torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v3_half.pt")
|
||||
if version == "v3":
|
||||
gpt_sovits_half = ExportGPTSovitsHalf(sovits.hps, script_t2s, trace_vq_model)
|
||||
torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v3_half.pt")
|
||||
else:
|
||||
gpt_sovits_half = ExportGPTSovitsV4Half(sovits.hps, script_t2s, trace_vq_model)
|
||||
torch.jit.script(gpt_sovits_half).save("onnx/ad/gpt_sovits_v4_half.pt")
|
||||
|
||||
ref_audio, sr = torchaudio.load(ref_wav_path)
|
||||
ref_audio = ref_audio.to(device).float()
|
||||
if ref_audio.shape[0] == 2:
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
if sr != 24000:
|
||||
ref_audio = resample(ref_audio, sr)
|
||||
tgt_sr = 24000 if version == "v3" else 32000
|
||||
if sr != tgt_sr:
|
||||
ref_audio = resample(ref_audio, sr, tgt_sr)
|
||||
# mel2 = mel_fn(ref_audio)
|
||||
mel2 = norm_spec(mel_fn(ref_audio))
|
||||
mel2 = mel_fn(ref_audio) if version == "v3" else mel_fn_v4(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
fea_ref = fea_ref[:, :, :T_min]
|
||||
print("fea_ref:", fea_ref.shape, T_min)
|
||||
if T_min > 468:
|
||||
mel2 = mel2[:, :, -468:]
|
||||
fea_ref = fea_ref[:, :, -468:]
|
||||
T_min = 468
|
||||
chunk_len = 934 - T_min
|
||||
Tref = 468 if version == "v3" else 500
|
||||
Tchunk = 934 if version == "v3" else 1000
|
||||
if T_min > Tref:
|
||||
mel2 = mel2[:, :, -Tref:]
|
||||
fea_ref = fea_ref[:, :, -Tref:]
|
||||
T_min = Tref
|
||||
chunk_len = Tchunk - T_min
|
||||
mel2 = mel2.to(dtype)
|
||||
|
||||
# fea_todo, ge = sovits.vq_model(pred_semantic,y_lengths, phoneme_ids1, ge)
|
||||
@ -714,13 +906,19 @@ def export():
|
||||
with torch.inference_mode():
|
||||
cmf_res_rand = torch.randn(1, 100, 934).to(device).to(dtype)
|
||||
torch._dynamo.mark_dynamic(cmf_res_rand, 2)
|
||||
bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,))
|
||||
bigvgan_model_.save("onnx/ad/bigvgan_model.pt")
|
||||
wav_gen = bigvgan_model(cmf_res)
|
||||
if version == "v3":
|
||||
bigvgan_model_ = torch.jit.trace(bigvgan_model, optimize=True, example_inputs=(cmf_res_rand,))
|
||||
bigvgan_model_.save("onnx/ad/bigvgan_model.pt")
|
||||
wav_gen = bigvgan_model(cmf_res)
|
||||
else:
|
||||
hifigan_model_ = torch.jit.trace(hifigan_model, optimize=True, example_inputs=(cmf_res_rand,))
|
||||
hifigan_model_.save("onnx/ad/hifigan_model.pt")
|
||||
wav_gen = hifigan_model(cmf_res)
|
||||
|
||||
print("wav_gen:", wav_gen.shape, wav_gen.dtype)
|
||||
audio = wav_gen[0][0].cpu().detach().numpy()
|
||||
|
||||
sr = 24000
|
||||
sr = 24000 if version == "v3" else 48000
|
||||
soundfile.write("out.export.wav", (audio * 32768).astype(np.int16), sr)
|
||||
|
||||
|
||||
@ -846,10 +1044,11 @@ def test_export(
|
||||
soundfile.write(output, (audio * 32768).astype(np.int16), sr)
|
||||
|
||||
|
||||
def test_export1(
|
||||
def test_export(
|
||||
todo_text,
|
||||
gpt_sovits_v3,
|
||||
gpt_sovits_v3v4,
|
||||
output,
|
||||
out_sr=24000,
|
||||
):
|
||||
# hps = sovits.hps
|
||||
ref_wav_path = "onnx/ad/ref.wav"
|
||||
@ -859,7 +1058,7 @@ def test_export1(
|
||||
dtype = torch.float16 if is_half == True else torch.float32
|
||||
|
||||
zero_wav = np.zeros(
|
||||
int(24000 * 0.3),
|
||||
int(out_sr * 0.3),
|
||||
dtype=np.float16 if is_half == True else np.float32,
|
||||
)
|
||||
|
||||
@ -894,7 +1093,7 @@ def test_export1(
|
||||
|
||||
bert1 = bert1.T.to(device)
|
||||
bert2 = bert2.T.to(device)
|
||||
top_k = torch.LongTensor([15]).to(device)
|
||||
top_k = torch.LongTensor([20]).to(device)
|
||||
|
||||
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
logger.info("start inference %s", current_time)
|
||||
@ -907,22 +1106,26 @@ def test_export1(
|
||||
bert2.shape,
|
||||
top_k.shape,
|
||||
)
|
||||
wav_gen = gpt_sovits_v3(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps)
|
||||
wav_gen = gpt_sovits_v3v4(ssl_content, ref_audio_32k, phoneme_ids0, phoneme_ids1, bert1, bert2, top_k, sample_steps)
|
||||
print("wav_gen:", wav_gen.shape, wav_gen.dtype)
|
||||
|
||||
wav_gen = torch.cat([wav_gen, zero_wav_torch], 0)
|
||||
|
||||
audio = wav_gen.cpu().detach().numpy()
|
||||
logger.info("end bigvgan %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
sr = 24000
|
||||
soundfile.write(output, (audio * 32768).astype(np.int16), sr)
|
||||
soundfile.write(output, (audio * 32768).astype(np.int16), out_sr)
|
||||
|
||||
|
||||
import time
|
||||
|
||||
|
||||
def test_():
|
||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
|
||||
def export_2(version="v3"):
|
||||
if version == "v3":
|
||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/s2Gv3.pth")
|
||||
# init_bigvgan()
|
||||
else:
|
||||
sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
|
||||
# init_hifigan()
|
||||
|
||||
# cfm = ExportCFM(sovits.cfm)
|
||||
# cfm.cfm.estimator = dit
|
||||
@ -963,25 +1166,41 @@ def test_():
|
||||
# gpt_sovits_v3_half = gpt_sovits_v3_half.half()
|
||||
# gpt_sovits_v3_half = gpt_sovits_v3_half.cuda()
|
||||
# gpt_sovits_v3_half.eval()
|
||||
gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model)
|
||||
logger.info("gpt_sovits_v3_half ok")
|
||||
if version == "v3":
|
||||
gpt_sovits_v3_half = ExportGPTSovitsHalf(sovits.hps, t2s_m, vq_model)
|
||||
logger.info("gpt_sovits_v3_half ok")
|
||||
# init_bigvgan()
|
||||
# global bigvgan_model
|
||||
bigvgan_model = torch.jit.load("onnx/ad/bigvgan_model.pt")
|
||||
# bigvgan_model = torch.jit.optimize_for_inference(bigvgan_model)
|
||||
bigvgan_model = bigvgan_model.half()
|
||||
bigvgan_model = bigvgan_model.cuda()
|
||||
bigvgan_model.eval()
|
||||
|
||||
# init_bigvgan()
|
||||
# global bigvgan_model
|
||||
bigvgan_model = torch.jit.load("onnx/ad/bigvgan_model.pt")
|
||||
# bigvgan_model = torch.jit.optimize_for_inference(bigvgan_model)
|
||||
bigvgan_model = bigvgan_model.half()
|
||||
bigvgan_model = bigvgan_model.cuda()
|
||||
bigvgan_model.eval()
|
||||
logger.info("bigvgan ok")
|
||||
gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model)
|
||||
gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3)
|
||||
gpt_sovits_v3.save("onnx/ad/gpt_sovits_v3.pt")
|
||||
gpt_sovits_v3 = gpt_sovits_v3.half().to(device)
|
||||
gpt_sovits_v3.eval()
|
||||
print("save gpt_sovits_v3 ok")
|
||||
else:
|
||||
gpt_sovits_v4_half = ExportGPTSovitsV4Half(sovits.hps, t2s_m, vq_model)
|
||||
logger.info("gpt_sovits_v4 ok")
|
||||
|
||||
logger.info("bigvgan ok")
|
||||
hifigan_model = torch.jit.load("onnx/ad/hifigan_model.pt")
|
||||
hifigan_model = hifigan_model.half()
|
||||
hifigan_model = hifigan_model.cuda()
|
||||
hifigan_model.eval()
|
||||
logger.info("hifigan ok")
|
||||
gpt_sovits_v4 = GPTSoVITSV4(gpt_sovits_v4_half, cfm, hifigan_model)
|
||||
gpt_sovits_v4 = torch.jit.script(gpt_sovits_v4)
|
||||
gpt_sovits_v4.save("onnx/ad/gpt_sovits_v4.pt")
|
||||
print("save gpt_sovits_v4 ok")
|
||||
|
||||
gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4
|
||||
sr = 24000 if version == "v3" else 48000
|
||||
|
||||
gpt_sovits_v3 = GPTSoVITSV3(gpt_sovits_v3_half, cfm, bigvgan_model)
|
||||
gpt_sovits_v3 = torch.jit.script(gpt_sovits_v3)
|
||||
gpt_sovits_v3.save("onnx/ad/gpt_sovits_v3.pt")
|
||||
gpt_sovits_v3 = gpt_sovits_v3.half().to(device)
|
||||
gpt_sovits_v3.eval()
|
||||
print("save gpt_sovits_v3 ok")
|
||||
|
||||
time.sleep(5)
|
||||
# print("thread:", torch.get_num_threads())
|
||||
@ -989,16 +1208,18 @@ def test_():
|
||||
# torch.set_num_interop_threads(1)
|
||||
# torch.set_num_threads(1)
|
||||
|
||||
test_export1(
|
||||
test_export(
|
||||
"汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....",
|
||||
gpt_sovits_v3,
|
||||
gpt_sovits_v3v4,
|
||||
"out.wav",
|
||||
sr
|
||||
)
|
||||
|
||||
test_export1(
|
||||
test_export(
|
||||
"你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!",
|
||||
gpt_sovits_v3,
|
||||
gpt_sovits_v3v4,
|
||||
"out2.wav",
|
||||
sr
|
||||
)
|
||||
|
||||
# test_export(
|
||||
@ -1022,7 +1243,7 @@ def test_export_gpt_sovits_v3():
|
||||
# gpt_sovits_v3,
|
||||
# "out4.wav",
|
||||
# )
|
||||
test_export1(
|
||||
test_export(
|
||||
"风萧萧兮易水寒,壮士一去兮不复还.",
|
||||
gpt_sovits_v3,
|
||||
"out5.wav",
|
||||
@ -1030,6 +1251,6 @@ def test_export_gpt_sovits_v3():
|
||||
|
||||
|
||||
with torch.no_grad():
|
||||
# export()
|
||||
test_()
|
||||
export_1("onnx/ad/ref.wav","你这老坏蛋,我找了你这么久,真没想到在这里找到你。他说。","v4")
|
||||
# export_2("v4")
|
||||
# test_export_gpt_sovits_v3()
|
@ -30,32 +30,14 @@ logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
|
||||
warnings.simplefilter(action="ignore", category=FutureWarning)
|
||||
|
||||
version = model_version = os.environ.get("version", "v2")
|
||||
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||
path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
|
||||
|
||||
from config import name2sovits_path, name2gpt_path, change_choices, get_weights_names
SoVITS_names, GPT_names = get_weights_names()
|
||||
from config import pretrained_sovits_name
|
||||
path_sovits_v3 = pretrained_sovits_name["v3"]
|
||||
path_sovits_v4 = pretrained_sovits_name["v4"]
|
||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
|
||||
pretrained_sovits_name = [
|
||||
"GPT_SoVITS/pretrained_models/s2G488k.pth",
|
||||
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
|
||||
"GPT_SoVITS/pretrained_models/s2Gv3.pth",
|
||||
"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
|
||||
]
|
||||
pretrained_gpt_name = [
|
||||
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
|
||||
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
|
||||
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
|
||||
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
|
||||
]
|
||||
|
||||
|
||||
_ = [[], []]
|
||||
for i in range(4):
|
||||
if os.path.exists(pretrained_gpt_name[i]):
|
||||
_[0].append(pretrained_gpt_name[i])
|
||||
if os.path.exists(pretrained_sovits_name[i]):
|
||||
_[-1].append(pretrained_sovits_name[i])
|
||||
pretrained_gpt_name, pretrained_sovits_name = _
|
||||
|
||||
|
||||
if os.path.exists("./weight.json"):
|
||||
pass
|
||||
@ -66,17 +48,22 @@ else:
|
||||
with open("./weight.json", "r", encoding="utf-8") as file:
|
||||
weight_data = file.read()
|
||||
weight_data = json.loads(weight_data)
|
||||
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name))
|
||||
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name))
|
||||
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
||||
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0]))
|
||||
if isinstance(gpt_path, list):
|
||||
gpt_path = gpt_path[0]
|
||||
if isinstance(sovits_path, list):
|
||||
sovits_path = sovits_path[0]
|
||||
|
||||
# gpt_path = os.environ.get(
|
||||
# "gpt_path", pretrained_gpt_name
|
||||
# )
|
||||
# sovits_path = os.environ.get("sovits_path", pretrained_sovits_name)
|
||||
# print(2333333)
|
||||
# print(os.environ["gpt_path"])
|
||||
# print(gpt_path)
|
||||
# print(GPT_names)
|
||||
# print(weight_data)
|
||||
# print(weight_data.get("GPT", {}))
|
||||
# print(version)###GPT version里没有s2的v2pro
|
||||
# print(weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
||||
|
||||
cnhubert_base_path = os.environ.get("cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base")
|
||||
bert_path = os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large")
|
||||
infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
|
||||
@ -215,33 +202,21 @@ if is_half == True:
|
||||
else:
|
||||
ssl_model = ssl_model.to(device)
|
||||
|
||||
resample_transform_dict = {}
|
||||
|
||||
|
||||
def resample(audio_tensor, sr0, sr1):
|
||||
global resample_transform_dict
|
||||
key = "%s-%s" % (sr0, sr1)
|
||||
if key not in resample_transform_dict:
|
||||
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
||||
return resample_transform_dict[key](audio_tensor)
|
||||
|
||||
|
||||
###todo:put them to process_ckpt and modify my_save func (save sovits weights), gpt save weights use my_save in process_ckpt
|
||||
# symbol_version-model_version-if_lora_v3
|
||||
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
||||
|
||||
v3v4set = {"v3", "v4"}
|
||||
|
||||
|
||||
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
|
||||
if "!"in sovits_path:sovits_path=name2sovits_path[sovits_path]
|
||||
global vq_model, hps, version, model_version, dict_language, if_lora_v3
|
||||
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
|
||||
print(sovits_path, version, model_version, if_lora_v3)
|
||||
is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
|
||||
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
|
||||
if if_lora_v3 == True and is_exist == False:
|
||||
info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n(
|
||||
"SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version
|
||||
)
|
||||
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
|
||||
gr.Warning(info)
|
||||
raise FileExistsError(info)
|
||||
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
||||
@ -297,13 +272,16 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
||||
version = hps.model.version
|
||||
# print("sovits版本:",hps.model.version)
|
||||
if model_version not in v3v4set:
|
||||
if "Pro"not in model_version:
|
||||
model_version = version
|
||||
else:
|
||||
hps.model.version = model_version
|
||||
vq_model = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**hps.model,
|
||||
)
|
||||
model_version = version
|
||||
else:
|
||||
hps.model.version = model_version
|
||||
vq_model = SynthesizerTrnV3(
|
||||
@ -377,9 +355,10 @@ except:
|
||||
|
||||
|
||||
def change_gpt_weights(gpt_path):
|
||||
if "!"in gpt_path:gpt_path=name2gpt_path[gpt_path]
|
||||
global hz, max_sec, t2s_model, config
|
||||
hz = 50
|
||||
dict_s1 = torch.load(gpt_path, map_location="cpu")
|
||||
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
|
||||
config = dict_s1["config"]
|
||||
max_sec = config["data"]["max_sec"]
|
||||
t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
||||
@ -404,9 +383,36 @@ import torch
|
||||
|
||||
now_dir = os.getcwd()
|
||||
|
||||
def clean_hifigan_model():
|
||||
global hifigan_model
|
||||
if hifigan_model:
|
||||
hifigan_model = hifigan_model.cpu()
|
||||
hifigan_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
def clean_bigvgan_model():
|
||||
global bigvgan_model
|
||||
if bigvgan_model:
|
||||
bigvgan_model = bigvgan_model.cpu()
|
||||
bigvgan_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
def clean_sv_cn_model():
|
||||
global sv_cn_model
|
||||
if sv_cn_model:
|
||||
sv_cn_model.embedding_model = sv_cn_model.embedding_model.cpu()
|
||||
sv_cn_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
|
||||
def init_bigvgan():
|
||||
global bigvgan_model, hifigan_model
|
||||
global bigvgan_model, hifigan_model,sv_cn_model
|
||||
from BigVGAN import bigvgan
|
||||
|
||||
bigvgan_model = bigvgan.BigVGAN.from_pretrained(
|
||||
@ -416,21 +422,15 @@ def init_bigvgan():
|
||||
# remove weight norm in the model and set to eval mode
|
||||
bigvgan_model.remove_weight_norm()
|
||||
bigvgan_model = bigvgan_model.eval()
|
||||
if hifigan_model:
|
||||
hifigan_model = hifigan_model.cpu()
|
||||
hifigan_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
clean_hifigan_model()
|
||||
clean_sv_cn_model()
|
||||
if is_half == True:
|
||||
bigvgan_model = bigvgan_model.half().to(device)
|
||||
else:
|
||||
bigvgan_model = bigvgan_model.to(device)
|
||||
|
||||
|
||||
def init_hifigan():
|
||||
global hifigan_model, bigvgan_model
|
||||
global hifigan_model, bigvgan_model,sv_cn_model
|
||||
hifigan_model = Generator(
|
||||
initial_channel=100,
|
||||
resblock="1",
|
||||
@ -445,47 +445,73 @@ def init_hifigan():
|
||||
hifigan_model.eval()
|
||||
hifigan_model.remove_weight_norm()
|
||||
state_dict_g = torch.load(
|
||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu"
|
||||
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
|
||||
)
|
||||
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
||||
if bigvgan_model:
|
||||
bigvgan_model = bigvgan_model.cpu()
|
||||
bigvgan_model = None
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
clean_bigvgan_model()
|
||||
clean_sv_cn_model()
|
||||
if is_half == True:
|
||||
hifigan_model = hifigan_model.half().to(device)
|
||||
else:
|
||||
hifigan_model = hifigan_model.to(device)
|
||||
|
||||
from sv import SV
|
||||
def init_sv_cn():
|
||||
global hifigan_model, bigvgan_model,sv_cn_model
|
||||
sv_cn_model = SV(device, is_half)
|
||||
clean_bigvgan_model()
|
||||
clean_hifigan_model()
|
||||
|
||||
bigvgan_model = hifigan_model = None
|
||||
|
||||
bigvgan_model = hifigan_model = sv_cn_model = None
|
||||
if model_version == "v3":
|
||||
init_bigvgan()
|
||||
if model_version == "v4":
|
||||
init_hifigan()
|
||||
if model_version in {"v2Pro","v2ProPlus"}:
|
||||
init_sv_cn()
|
||||
|
||||
resample_transform_dict = {}
def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)


def get_spepc(hps, filename):
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
    # audio = load_audio(filename, int(hps.data.sampling_rate))
    audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate))
    audio = torch.FloatTensor(audio)

    # audio, sampling_rate = librosa.load(filename, sr=int(hps.data.sampling_rate))
    # audio = torch.FloatTensor(audio)

    sr1 = int(hps.data.sampling_rate)
    audio, sr0 = torchaudio.load(filename)
    if sr0 != sr1:
        audio = audio.to(device)
        if audio.shape[0] == 2:
            audio = audio.mean(0).unsqueeze(0)
        audio = resample(audio, sr0, sr1, device)
    else:
        audio = audio.to(device)
        if audio.shape[0] == 2:
            audio = audio.mean(0).unsqueeze(0)

    maxx = audio.abs().max()
    if maxx > 1:
        audio /= min(2, maxx)
    audio_norm = audio
    audio_norm = audio_norm.unsqueeze(0)
    spec = spectrogram_torch(
        audio_norm,
        audio,
        hps.data.filter_length,
        hps.data.sampling_rate,
        hps.data.hop_length,
        hps.data.win_length,
        center=False,
    )
    return spec
    spec = spec.to(dtype)
    if is_v2pro == True:
        audio = resample(audio, sr1, 16000, device).to(dtype)
    return spec, audio
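A note on the v2Pro reference path above: `get_spepc` now also hands back a 16 kHz copy of the reference audio so a speaker-verification embedding can be computed from the same file. A minimal usage sketch, assuming the surrounding `inference_webui.py` globals (`hps`, `sv_cn_model`, `dtype`, `device`) are already initialized; "ref.wav" is a placeholder path:

```python
# Hedged sketch, not part of the diff: consuming the v2Pro branch of get_spepc.
# sv_cn_model is the SV wrapper created by init_sv_cn().
spec, audio_16k = get_spepc(hps, "ref.wav", dtype, device, is_v2pro=True)
sv_emb = sv_cn_model.compute_embedding3(audio_16k)  # speaker embedding, roughly (1, 20480)
refers, sv_embs = [spec], [sv_emb]
# later: vq_model.decode(pred_semantic, phoneme_ids, refers, speed=speed, sv_emb=sv_embs)
```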
|
||||
|
||||
def clean_text_inf(text, language, version):
|
||||
@ -718,6 +744,10 @@ def get_tts_wav(
|
||||
ref_free = False # s2v3暂不支持ref_free
|
||||
else:
|
||||
if_sr = False
|
||||
if model_version not in {"v3","v4","v2Pro","v2ProPlus"}:
|
||||
clean_bigvgan_model()
|
||||
clean_hifigan_model()
|
||||
clean_sv_cn_model()
|
||||
t0 = ttime()
|
||||
prompt_language = dict_language[prompt_language]
|
||||
text_language = dict_language[text_language]
|
||||
@ -821,26 +851,37 @@ def get_tts_wav(
|
||||
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
|
||||
cache[i_text] = pred_semantic
|
||||
t3 = ttime()
|
||||
is_v2pro=model_version in {"v2Pro","v2ProPlus"}
|
||||
# print(23333,is_v2pro,model_version)
|
||||
###v3不存在以下逻辑和inp_refs
|
||||
if model_version not in v3v4set:
|
||||
refers = []
|
||||
if is_v2pro:
|
||||
sv_emb=[]
|
||||
if sv_cn_model == None:
|
||||
init_sv_cn()
|
||||
if inp_refs:
|
||||
for path in inp_refs:
|
||||
try:
|
||||
refer = get_spepc(hps, path.name).to(dtype).to(device)
|
||||
try:#####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer
|
||||
refer,audio_tensor = get_spepc(hps, path.name,dtype,device,is_v2pro)
|
||||
refers.append(refer)
|
||||
if is_v2pro:
|
||||
sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor))
|
||||
except:
|
||||
traceback.print_exc()
|
||||
if len(refers) == 0:
|
||||
refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
|
||||
audio = vq_model.decode(
|
||||
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
|
||||
)[0][0] # .cpu().detach().numpy()
|
||||
refers,audio_tensor = get_spepc(hps, ref_wav_path,dtype,device,is_v2pro)
|
||||
refers=[refers]
|
||||
if is_v2pro:
|
||||
sv_emb=[sv_cn_model.compute_embedding3(audio_tensor)]
|
||||
if is_v2pro:
|
||||
audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed,sv_emb=sv_emb)[0][0]
|
||||
else:
|
||||
audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)[0][0]
|
||||
else:
|
||||
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
|
||||
refer,audio_tensor = get_spepc(hps, ref_wav_path,dtype,device)
|
||||
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
|
||||
phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0)
|
||||
# print(11111111, phoneme_ids0, phoneme_ids1)
|
||||
fea_ref, ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
|
||||
ref_audio, sr = torchaudio.load(ref_wav_path)
|
||||
ref_audio = ref_audio.to(device).float()
|
||||
@ -848,7 +889,7 @@ def get_tts_wav(
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
tgt_sr = 24000 if model_version == "v3" else 32000
|
||||
if sr != tgt_sr:
|
||||
ref_audio = resample(ref_audio, sr, tgt_sr)
|
||||
ref_audio = resample(ref_audio, sr, tgt_sr,device)
|
||||
# print("ref_audio",ref_audio.abs().mean())
|
||||
mel2 = mel_fn(ref_audio) if model_version == "v3" else mel_fn_v4(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
@ -901,7 +942,7 @@ def get_tts_wav(
|
||||
t1 = ttime()
|
||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
|
||||
audio_opt = torch.cat(audio_opt, 0) # np.concatenate
|
||||
if model_version in {"v1", "v2"}:
|
||||
if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
|
||||
opt_sr = 32000
|
||||
elif model_version == "v3":
|
||||
opt_sr = 24000
|
||||
@ -1035,38 +1076,6 @@ def process_text(texts):
|
||||
_text.append(text)
|
||||
return _text
|
||||
|
||||
|
||||
def change_choices():
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {
|
||||
"choices": sorted(GPT_names, key=custom_sort_key),
|
||||
"__type__": "update",
|
||||
}
|
||||
|
||||
|
||||
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
|
||||
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
|
||||
for path in SoVITS_weight_root + GPT_weight_root:
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
|
||||
def get_weights_names(GPT_weight_root, SoVITS_weight_root):
|
||||
SoVITS_names = [i for i in pretrained_sovits_name]
|
||||
for path in SoVITS_weight_root:
|
||||
for name in os.listdir(path):
|
||||
if name.endswith(".pth"):
|
||||
SoVITS_names.append("%s/%s" % (path, name))
|
||||
GPT_names = [i for i in pretrained_gpt_name]
|
||||
for path in GPT_weight_root:
|
||||
for name in os.listdir(path):
|
||||
if name.endswith(".ckpt"):
|
||||
GPT_names.append("%s/%s" % (path, name))
|
||||
return SoVITS_names, GPT_names
|
||||
|
||||
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
|
||||
|
||||
def html_center(text, label="p"):
|
||||
return f"""<div style="text-align: center; margin: 100; padding: 50;">
|
||||
<{label} style="margin: 0; padding: 0;">{text}</{label}>
|
||||
|
@ -98,13 +98,23 @@ cut_method = {
|
||||
i18n("按标点符号切"): "cut5",
|
||||
}
|
||||
|
||||
from config import name2sovits_path,name2gpt_path,change_choices,get_weights_names
|
||||
SoVITS_names, GPT_names = get_weights_names()
|
||||
from config import pretrained_sovits_name
|
||||
path_sovits_v3 = pretrained_sovits_name["v3"]
|
||||
path_sovits_v4 = pretrained_sovits_name["v4"]
|
||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
|
||||
|
||||
tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")
|
||||
tts_config.device = device
|
||||
tts_config.is_half = is_half
|
||||
tts_config.version = version
|
||||
if gpt_path is not None:
|
||||
if "!"in gpt_path:gpt_path=name2gpt_path[gpt_path]
|
||||
tts_config.t2s_weights_path = gpt_path
|
||||
if sovits_path is not None:
|
||||
if "!"in sovits_path:sovits_path=name2sovits_path[sovits_path]
|
||||
tts_config.vits_weights_path = sovits_path
|
||||
if cnhubert_base_path is not None:
|
||||
tts_config.cnhuhbert_base_path = cnhubert_base_path
|
||||
@ -179,41 +189,6 @@ def custom_sort_key(s):
|
||||
parts = [int(part) if part.isdigit() else part for part in parts]
|
||||
return parts
|
||||
|
||||
|
||||
def change_choices():
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {
|
||||
"choices": sorted(GPT_names, key=custom_sort_key),
|
||||
"__type__": "update",
|
||||
}
|
||||
|
||||
|
||||
path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||
path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
|
||||
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
||||
is_exist_s2gv4 = os.path.exists(path_sovits_v4)
|
||||
pretrained_sovits_name = [
|
||||
"GPT_SoVITS/pretrained_models/s2G488k.pth",
|
||||
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
|
||||
"GPT_SoVITS/pretrained_models/s2Gv3.pth",
|
||||
"GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
|
||||
]
|
||||
pretrained_gpt_name = [
|
||||
"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
|
||||
"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
|
||||
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
|
||||
"GPT_SoVITS/pretrained_models/s1v3.ckpt",
|
||||
]
|
||||
|
||||
|
||||
_ = [[], []]
|
||||
for i in range(4):
|
||||
if os.path.exists(pretrained_gpt_name[i]):
|
||||
_[0].append(pretrained_gpt_name[i])
|
||||
if os.path.exists(pretrained_sovits_name[i]):
|
||||
_[-1].append(pretrained_sovits_name[i])
|
||||
pretrained_gpt_name, pretrained_sovits_name = _
|
||||
|
||||
if os.path.exists("./weight.json"):
|
||||
pass
|
||||
else:
|
||||
@ -223,43 +198,17 @@ else:
|
||||
with open("./weight.json", "r", encoding="utf-8") as file:
|
||||
weight_data = file.read()
|
||||
weight_data = json.loads(weight_data)
|
||||
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name))
|
||||
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name))
|
||||
gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
|
||||
sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0]))
|
||||
if isinstance(gpt_path, list):
|
||||
gpt_path = gpt_path[0]
|
||||
if isinstance(sovits_path, list):
|
||||
sovits_path = sovits_path[0]
|
||||
|
||||
|
||||
SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"]
|
||||
GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"]
|
||||
for path in SoVITS_weight_root + GPT_weight_root:
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
|
||||
def get_weights_names(GPT_weight_root, SoVITS_weight_root):
|
||||
SoVITS_names = [i for i in pretrained_sovits_name]
|
||||
for path in SoVITS_weight_root:
|
||||
for name in os.listdir(path):
|
||||
if name.endswith(".pth"):
|
||||
SoVITS_names.append("%s/%s" % (path, name))
|
||||
GPT_names = [i for i in pretrained_gpt_name]
|
||||
for path in GPT_weight_root:
|
||||
for name in os.listdir(path):
|
||||
if name.endswith(".ckpt"):
|
||||
GPT_names.append("%s/%s" % (path, name))
|
||||
return SoVITS_names, GPT_names
|
||||
|
||||
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
|
||||
|
||||
from process_ckpt import get_sovits_version_from_path_fast
|
||||
|
||||
v3v4set = {"v3", "v4"}
|
||||
|
||||
|
||||
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
|
||||
if "!"in sovits_path:sovits_path=name2sovits_path[sovits_path]
|
||||
global version, model_version, dict_language, if_lora_v3
|
||||
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
|
||||
# print(sovits_path,version, model_version, if_lora_v3)
|
||||
|
@ -21,7 +21,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
3) computes spectrograms from audio files.
|
||||
"""
|
||||
|
||||
def __init__(self, hparams, val=False):
|
||||
def __init__(self, hparams, version=None,val=False):
|
||||
exp_dir = hparams.exp_dir
|
||||
self.path2 = "%s/2-name2text.txt" % exp_dir
|
||||
self.path4 = "%s/4-cnhubert" % exp_dir
|
||||
@ -29,8 +29,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
assert os.path.exists(self.path2)
|
||||
assert os.path.exists(self.path4)
|
||||
assert os.path.exists(self.path5)
|
||||
self.is_v2Pro=version in {"v2Pro","v2ProPlus"}
|
||||
if self.is_v2Pro:
|
||||
self.path7 = "%s/7-sv_cn" % exp_dir
|
||||
assert os.path.exists(self.path7)
|
||||
names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀
|
||||
names5 = set(os.listdir(self.path5))
|
||||
if self.is_v2Pro:
|
||||
names6 = set([name[:-3] for name in list(os.listdir(self.path7))]) # 去除.pt后缀
|
||||
self.phoneme_data = {}
|
||||
with open(self.path2, "r", encoding="utf8") as f:
|
||||
lines = f.read().strip("\n").split("\n")
|
||||
@ -40,8 +46,10 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
if len(tmp) != 4:
|
||||
continue
|
||||
self.phoneme_data[tmp[0]] = [tmp[1]]
|
||||
|
||||
self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
|
||||
if self.is_v2Pro:
|
||||
self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5 & names6)
|
||||
else:
|
||||
self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5)
|
||||
tmp = self.audiopaths_sid_text
|
||||
leng = len(tmp)
|
||||
min_num = 100
|
||||
@ -109,14 +117,21 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
typee = ssl.dtype
|
||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||
ssl.requires_grad = False
|
||||
if self.is_v2Pro:
|
||||
sv_emb=torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
|
||||
except:
|
||||
traceback.print_exc()
|
||||
spec = torch.zeros(1025, 100)
|
||||
wav = torch.zeros(1, 100 * self.hop_length)
|
||||
ssl = torch.zeros(1, 768, 100)
|
||||
text = text[-1:]
|
||||
if self.is_v2Pro:
|
||||
sv_emb=torch.zeros(1,20480)
|
||||
print("load audio or ssl error!!!!!!", audiopath)
|
||||
return (ssl, spec, wav, text)
|
||||
if self.is_v2Pro:
|
||||
return (ssl, spec, wav, text,sv_emb)
|
||||
else:
|
||||
return (ssl, spec, wav, text)
|
||||
|
||||
def get_audio(self, filename):
|
||||
audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768
|
||||
@ -177,8 +192,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
class TextAudioSpeakerCollate:
|
||||
"""Zero-pads model inputs and targets"""
|
||||
|
||||
def __init__(self, return_ids=False):
|
||||
def __init__(self, return_ids=False,version=None):
|
||||
self.return_ids = return_ids
|
||||
self.is_v2Pro=version in {"v2Pro","v2ProPlus"}
|
||||
|
||||
def __call__(self, batch):
|
||||
"""Collate's training batch from normalized text, audio and speaker identities
|
||||
@ -211,6 +227,9 @@ class TextAudioSpeakerCollate:
|
||||
ssl_padded.zero_()
|
||||
text_padded.zero_()
|
||||
|
||||
if self.is_v2Pro:
|
||||
sv_embs=torch.FloatTensor(len(batch),20480)
|
||||
|
||||
for i in range(len(ids_sorted_decreasing)):
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
|
||||
@ -230,7 +249,12 @@ class TextAudioSpeakerCollate:
|
||||
text_padded[i, : text.size(0)] = text
|
||||
text_lengths[i] = text.size(0)
|
||||
|
||||
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
|
||||
if self.is_v2Pro:
|
||||
sv_embs[i]=row[4]
|
||||
if self.is_v2Pro:
|
||||
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths,sv_embs
|
||||
else:
|
||||
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
|
||||
|
||||
|
||||
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
|
@ -586,11 +586,12 @@ class DiscriminatorS(torch.nn.Module):
|
||||
|
||||
return x, fmap
|
||||
|
||||
|
||||
v2pro_set={"v2Pro","v2ProPlus"}
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
def __init__(self, use_spectral_norm=False,version=None):
|
||||
super(MultiPeriodDiscriminator, self).__init__()
|
||||
periods = [2, 3, 5, 7, 11]
|
||||
if version in v2pro_set:periods = [2, 3, 5, 7, 11,17,23]
|
||||
else:periods = [2, 3, 5, 7, 11]
|
||||
|
||||
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
||||
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
|
||||
@ -786,7 +787,6 @@ class CodePredictor(nn.Module):
|
||||
|
||||
return pred_codes.transpose(0, 1)
|
||||
|
||||
|
||||
class SynthesizerTrn(nn.Module):
|
||||
"""
|
||||
Synthesizer for Training
|
||||
@ -886,12 +886,23 @@ class SynthesizerTrn(nn.Module):
|
||||
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
|
||||
self.freeze_quantizer = freeze_quantizer
|
||||
|
||||
def forward(self, ssl, y, y_lengths, text, text_lengths):
|
||||
self.is_v2pro=self.version in v2pro_set
|
||||
if self.is_v2pro:
|
||||
self.sv_emb = nn.Linear(20480, gin_channels)
|
||||
self.ge_to512 = nn.Linear(gin_channels, 512)
|
||||
self.prelu = nn.PReLU(num_parameters=gin_channels)
|
||||
|
||||
def forward(self, ssl, y, y_lengths, text, text_lengths,sv_emb=None):
|
||||
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
|
||||
if self.version == "v1":
|
||||
ge = self.ref_enc(y * y_mask, y_mask)
|
||||
else:
|
||||
ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
|
||||
if self.is_v2pro:
|
||||
sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
|
||||
ge += sv_emb.unsqueeze(-1)
|
||||
ge = self.prelu(ge)
|
||||
ge512 = self.ge_to512(ge.transpose(2, 1)).transpose(2, 1)
|
||||
with autocast(enabled=False):
|
||||
maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
|
||||
with maybe_no_grad:
|
||||
@ -904,7 +915,7 @@ class SynthesizerTrn(nn.Module):
|
||||
if self.semantic_frame_rate == "25hz":
|
||||
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
|
||||
|
||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
|
||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge512 if self.is_v2pro else ge)
|
||||
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
|
||||
z_p = self.flow(z, y_mask, g=ge)
|
||||
|
||||
@ -941,8 +952,8 @@ class SynthesizerTrn(nn.Module):
|
||||
return o, y_mask, (z, z_p, m_p, logs_p)
|
||||
|
||||
@torch.no_grad()
|
||||
def decode(self, codes, text, refer, noise_scale=0.5, speed=1):
|
||||
def get_ge(refer):
|
||||
def decode(self, codes, text, refer,noise_scale=0.5, speed=1, sv_emb=None):
|
||||
def get_ge(refer, sv_emb):
|
||||
ge = None
|
||||
if refer is not None:
|
||||
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
|
||||
@ -951,16 +962,20 @@ class SynthesizerTrn(nn.Module):
|
||||
ge = self.ref_enc(refer * refer_mask, refer_mask)
|
||||
else:
|
||||
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
|
||||
if self.is_v2pro:
|
||||
sv_emb = self.sv_emb(sv_emb) # B*20480->B*512
|
||||
ge += sv_emb.unsqueeze(-1)
|
||||
ge = self.prelu(ge)
|
||||
return ge
|
||||
|
||||
if type(refer) == list:
|
||||
ges = []
|
||||
for _refer in refer:
|
||||
ge = get_ge(_refer)
|
||||
for idx,_refer in enumerate(refer):
|
||||
ge = get_ge(_refer, sv_emb[idx]if self.is_v2pro else None)
|
||||
ges.append(ge)
|
||||
ge = torch.stack(ges, 0).mean(0)
|
||||
else:
|
||||
ge = get_ge(refer)
|
||||
ge = get_ge(refer, sv_emb)
|
||||
|
||||
y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
|
||||
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
|
||||
@ -968,7 +983,7 @@ class SynthesizerTrn(nn.Module):
|
||||
quantized = self.quantizer.decode(codes)
|
||||
if self.semantic_frame_rate == "25hz":
|
||||
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
|
||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
|
||||
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, self.ge_to512(ge.transpose(2,1)).transpose(2,1)if self.is_v2pro else ge, speed)
|
||||
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
||||
|
||||
z = self.flow(z_p, y_mask, g=ge, reverse=True)
|
||||
|
@ -391,6 +391,7 @@ class Generator(torch.nn.Module):
|
||||
upsample_initial_channel,
|
||||
upsample_kernel_sizes,
|
||||
gin_channels=0,
|
||||
is_bias=False,
|
||||
):
|
||||
super(Generator, self).__init__()
|
||||
self.num_kernels = len(resblock_kernel_sizes)
|
||||
@ -418,7 +419,7 @@ class Generator(torch.nn.Module):
|
||||
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
|
||||
self.resblocks.append(resblock(ch, k, d))
|
||||
|
||||
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
||||
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=is_bias)
|
||||
self.ups.apply(init_weights)
|
||||
|
||||
if gin_channels != 0:
|
||||
|
GPT_SoVITS/prepare_datasets/2-get-sv.py (new file, 109 lines)
@ -0,0 +1,109 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
inp_text = os.environ.get("inp_text")
|
||||
inp_wav_dir = os.environ.get("inp_wav_dir")
|
||||
exp_name = os.environ.get("exp_name")
|
||||
i_part = os.environ.get("i_part")
|
||||
all_parts = os.environ.get("all_parts")
|
||||
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
||||
from feature_extractor import cnhubert
|
||||
|
||||
opt_dir = os.environ.get("opt_dir")
|
||||
sv_path = os.environ.get("sv_path")
|
||||
import torch
|
||||
|
||||
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
|
||||
|
||||
import traceback
|
||||
import numpy as np
|
||||
from scipy.io import wavfile
|
||||
import torchaudio
|
||||
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
|
||||
from tools.my_utils import load_audio, clean_path
|
||||
from time import time as ttime
|
||||
import shutil
|
||||
from ERes2NetV2 import ERes2NetV2
|
||||
import kaldi as Kaldi
|
||||
|
||||
def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
|
||||
dir = os.path.dirname(path)
|
||||
name = os.path.basename(path)
|
||||
# tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
|
||||
tmp_path = "%s%s.pth" % (ttime(), i_part)
|
||||
torch.save(fea, tmp_path)
|
||||
shutil.move(tmp_path, "%s/%s" % (dir, name))
|
||||
|
||||
|
||||
sv_cn_dir = "%s/7-sv_cn" % (opt_dir)
|
||||
wav32dir = "%s/5-wav32k" % (opt_dir)
|
||||
os.makedirs(opt_dir, exist_ok=True)
|
||||
os.makedirs(sv_cn_dir, exist_ok=True)
|
||||
os.makedirs(wav32dir, exist_ok=True)
|
||||
|
||||
maxx = 0.95
|
||||
alpha = 0.5
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda:0"
|
||||
# elif torch.backends.mps.is_available():
|
||||
# device = "mps"
|
||||
else:
|
||||
device = "cpu"
|
||||
|
||||
class SV:
|
||||
def __init__(self,device,is_half):
|
||||
pretrained_state = torch.load(sv_path, map_location='cpu')
|
||||
embedding_model = ERes2NetV2(baseWidth=24,scale=4,expansion=4)
|
||||
embedding_model.load_state_dict(pretrained_state)
|
||||
embedding_model.eval()
|
||||
self.embedding_model=embedding_model
|
||||
self.res=torchaudio.transforms.Resample(32000, 16000).to(device)
|
||||
if is_half == False:
|
||||
self.embedding_model=self.embedding_model.to(device)
|
||||
else:
|
||||
self.embedding_model=self.embedding_model.half().to(device)
|
||||
self.is_half=is_half
|
||||
|
||||
def compute_embedding3(self,wav):#(1,x)#-1~1
|
||||
with torch.no_grad():
|
||||
wav=self.res(wav)
|
||||
if self.is_half==True:wav=wav.half()
|
||||
feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav])
|
||||
sv_emb = self.embedding_model.forward3(feat)
|
||||
return sv_emb
|
||||
|
||||
sv=SV(device,is_half)
|
||||
def name2go(wav_name, wav_path):
|
||||
sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
|
||||
if os.path.exists(sv_cn_path):return
|
||||
wav_path="%s/%s" % (wav32dir, wav_name)
|
||||
wav32k,sr0 = torchaudio.load(wav_path)
|
||||
assert sr0==32000
|
||||
wav32k = wav32k.to(device)
|
||||
emb=sv.compute_embedding3(wav32k).cpu() # torch.Size([1, 20480])
|
||||
my_save(emb, sv_cn_path)
|
||||
|
||||
|
||||
with open(inp_text, "r", encoding="utf8") as f:
|
||||
lines = f.read().strip("\n").split("\n")
|
||||
|
||||
for line in lines[int(i_part) :: int(all_parts)]:
|
||||
try:
|
||||
wav_name, spk_name, language, text = line.split("|")
|
||||
wav_name = clean_path(wav_name)
|
||||
if inp_wav_dir != "" and inp_wav_dir != None:
|
||||
wav_name = os.path.basename(wav_name)
|
||||
wav_path = "%s/%s" % (inp_wav_dir, wav_name)
|
||||
|
||||
else:
|
||||
wav_path = wav_name
|
||||
wav_name = os.path.basename(wav_name)
|
||||
name2go(wav_name, wav_path)
|
||||
except:
|
||||
print(line, traceback.format_exc())
|
@ -17,29 +17,27 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
    shutil.move(tmp_path, "%s/%s" % (dir, name))


"""
00:v1
01:v2
02:v3
03:v3lora
04:v4lora

"""
from io import BytesIO


def my_save2(fea, path, cfm_version):
model_version2byte = {
    "v3": b"03",
    "v4": b"04",
    "v2Pro": b"05",
    "v2ProPlus": b"06",
}
def my_save2(fea, path, model_version):
    bio = BytesIO()
    torch.save(fea, bio)
    bio.seek(0)
    data = bio.getvalue()
    byte = b"03" if cfm_version == "v3" else b"04"
    byte = model_version2byte[model_version]
    data = byte + data[2:]
    with open(path, "wb") as f:
        f.write(data)


def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None):
def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
    try:
        opt = OrderedDict()
        opt["weight"] = {}
@ -51,26 +49,40 @@ def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None):
        opt["info"] = "%sepoch_%siteration" % (epoch, steps)
        if lora_rank:
            opt["lora_rank"] = lora_rank
            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version)
            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
        elif model_version != None and "Pro" in model_version:
            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
        else:
            my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
        return "Success."
    except:
        return traceback.format_exc()


"""
00:v1
01:v2
02:v3
03:v3lora
04:v4lora
05:v2Pro
06:v2ProPlus
"""
head2version = {
    b"00": ["v1", "v1", False],
    b"01": ["v2", "v2", False],
    b"02": ["v2", "v3", False],
    b"03": ["v2", "v3", True],
    b"04": ["v2", "v4", True],
    b"05": ["v2", "v2Pro", False],
    b"06": ["v2", "v2ProPlus", False],
}
hash_pretrained_dict = {
    "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False],  # s2G488k.pth#sovits_v1_pretrained
    "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False],  # s2Gv3.pth#sovits_v3_pretrained
    "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False],  # s2G2333K.pth#sovits_v2_pretrained
    "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False],  # s2Gv4.pth#sovits_v4_pretrained
    "c7e9fce2223f3db685cdfa1e6368728a": ["v2", "v2Pro", False],  # s2Gv2Pro.pth#sovits_v2Pro_pretrained
    "66b313e39455b57ab1b0bc0b239c9d0a": ["v2", "v2ProPlus", False],  # s2Gv2ProPlus.pth#sovits_v2ProPlus_pretrained
}
import hashlib
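The 2-byte tag that `my_save2` prepends is what `head2version` decodes on load. A small illustrative helper, assuming the dictionaries above are in scope (`peek_sovits_version` is a hypothetical name, not part of the repo):

```python
# Hedged sketch: recover [symbol_version, model_version, is_lora] from a saved SoVITS weight
# by peeking at the 2-byte prefix written by my_save2; returns None for untagged files.
def peek_sovits_version(path):
    with open(path, "rb") as f:
        head = f.read(2)  # e.g. b"05" -> v2Pro, b"06" -> v2ProPlus
    return head2version.get(head)
```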
|
||||
|
@ -36,7 +36,7 @@ from module.models import (
|
||||
MultiPeriodDiscriminator,
|
||||
SynthesizerTrn,
|
||||
)
|
||||
from process_ckpt import savee
|
||||
from process_ckpt import savee,my_save2
|
||||
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = False
|
||||
@ -87,38 +87,19 @@ def run(rank, n_gpus, hps):
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.set_device(rank)
|
||||
|
||||
train_dataset = TextAudioSpeakerLoader(hps.data) ########
|
||||
train_dataset = TextAudioSpeakerLoader(hps.data,version=hps.model.version)
|
||||
train_sampler = DistributedBucketSampler(
|
||||
train_dataset,
|
||||
hps.train.batch_size,
|
||||
[
|
||||
32,
|
||||
300,
|
||||
400,
|
||||
500,
|
||||
600,
|
||||
700,
|
||||
800,
|
||||
900,
|
||||
1000,
|
||||
1100,
|
||||
1200,
|
||||
1300,
|
||||
1400,
|
||||
1500,
|
||||
1600,
|
||||
1700,
|
||||
1800,
|
||||
1900,
|
||||
],
|
||||
[32,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,],
|
||||
num_replicas=n_gpus,
|
||||
rank=rank,
|
||||
shuffle=True,
|
||||
)
|
||||
collate_fn = TextAudioSpeakerCollate()
|
||||
collate_fn = TextAudioSpeakerCollate(version=hps.model.version)
|
||||
train_loader = DataLoader(
|
||||
train_dataset,
|
||||
num_workers=6,
|
||||
num_workers=5,
|
||||
shuffle=False,
|
||||
pin_memory=True,
|
||||
collate_fn=collate_fn,
|
||||
@ -149,9 +130,9 @@ def run(rank, n_gpus, hps):
|
||||
)
|
||||
|
||||
net_d = (
|
||||
MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
|
||||
MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).cuda(rank)
|
||||
if torch.cuda.is_available()
|
||||
else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device)
|
||||
else MultiPeriodDiscriminator(hps.model.use_spectral_norm,version=hps.model.version).to(device)
|
||||
)
|
||||
for name, param in net_g.named_parameters():
|
||||
if not param.requires_grad:
|
||||
@ -235,12 +216,12 @@ def run(rank, n_gpus, hps):
|
||||
print(
|
||||
"loaded pretrained %s" % hps.train.pretrained_s2G,
|
||||
net_g.module.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
|
||||
strict=False,
|
||||
)
|
||||
if torch.cuda.is_available()
|
||||
else net_g.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
|
||||
strict=False,
|
||||
),
|
||||
) ##测试不加载优化器
|
||||
@ -254,11 +235,11 @@ def run(rank, n_gpus, hps):
|
||||
print(
|
||||
"loaded pretrained %s" % hps.train.pretrained_s2D,
|
||||
net_d.module.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],strict=False
|
||||
)
|
||||
if torch.cuda.is_available()
|
||||
else net_d.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2D, map_location="cpu", weights_only=False)["weight"],
|
||||
),
|
||||
)
|
||||
|
||||
@ -328,50 +309,20 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
||||
|
||||
net_g.train()
|
||||
net_d.train()
|
||||
for batch_idx, (
|
||||
ssl,
|
||||
ssl_lengths,
|
||||
spec,
|
||||
spec_lengths,
|
||||
y,
|
||||
y_lengths,
|
||||
text,
|
||||
text_lengths,
|
||||
) in enumerate(tqdm(train_loader)):
|
||||
for batch_idx, data in enumerate(tqdm(train_loader)):
|
||||
if hps.model.version in {"v2Pro","v2ProPlus"}:
|
||||
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths,sv_emb=data
|
||||
else:
|
||||
ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths=data
|
||||
if torch.cuda.is_available():
|
||||
spec, spec_lengths = (
|
||||
spec.cuda(
|
||||
rank,
|
||||
non_blocking=True,
|
||||
),
|
||||
spec_lengths.cuda(
|
||||
rank,
|
||||
non_blocking=True,
|
||||
),
|
||||
)
|
||||
y, y_lengths = (
|
||||
y.cuda(
|
||||
rank,
|
||||
non_blocking=True,
|
||||
),
|
||||
y_lengths.cuda(
|
||||
rank,
|
||||
non_blocking=True,
|
||||
),
|
||||
)
|
||||
spec, spec_lengths = (spec.cuda(rank,non_blocking=True,),spec_lengths.cuda(rank,non_blocking=True,),)
|
||||
y, y_lengths = (y.cuda(rank,non_blocking=True,),y_lengths.cuda(rank,non_blocking=True,),)
|
||||
ssl = ssl.cuda(rank, non_blocking=True)
|
||||
ssl.requires_grad = False
|
||||
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
|
||||
text, text_lengths = (
|
||||
text.cuda(
|
||||
rank,
|
||||
non_blocking=True,
|
||||
),
|
||||
text_lengths.cuda(
|
||||
rank,
|
||||
non_blocking=True,
|
||||
),
|
||||
)
|
||||
text, text_lengths = (text.cuda(rank,non_blocking=True,),text_lengths.cuda(rank,non_blocking=True,),)
|
||||
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
||||
sv_emb = sv_emb.cuda(rank, non_blocking=True)
|
||||
else:
|
||||
spec, spec_lengths = spec.to(device), spec_lengths.to(device)
|
||||
y, y_lengths = y.to(device), y_lengths.to(device)
|
||||
@ -379,17 +330,13 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
||||
ssl.requires_grad = False
|
||||
# ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
|
||||
text, text_lengths = text.to(device), text_lengths.to(device)
|
||||
|
||||
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
||||
sv_emb = sv_emb.to(device)
|
||||
with autocast(enabled=hps.train.fp16_run):
|
||||
(
|
||||
y_hat,
|
||||
kl_ssl,
|
||||
ids_slice,
|
||||
x_mask,
|
||||
z_mask,
|
||||
(z, z_p, m_p, logs_p, m_q, logs_q),
|
||||
stats_ssl,
|
||||
) = net_g(ssl, spec, spec_lengths, text, text_lengths)
|
||||
if hps.model.version in {"v2Pro", "v2ProPlus"}:
|
||||
(y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl) = net_g(ssl, spec, spec_lengths, text, text_lengths,sv_emb)
|
||||
else:
|
||||
(y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl,) = net_g(ssl, spec, spec_lengths, text, text_lengths)
|
||||
|
||||
mel = spec_to_mel_torch(
|
||||
spec,
|
||||
@ -561,13 +508,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
||||
% (
|
||||
hps.name,
|
||||
epoch,
|
||||
savee(
|
||||
ckpt,
|
||||
hps.name + "_e%s_s%s" % (epoch, global_step),
|
||||
epoch,
|
||||
global_step,
|
||||
hps,
|
||||
),
|
||||
savee(ckpt,hps.name + "_e%s_s%s" % (epoch, global_step),epoch,global_step,hps,model_version=None if hps.model.version not in {"v2Pro","v2ProPlus"}else hps.model.version),
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -204,12 +204,12 @@ def run(rank, n_gpus, hps):
|
||||
print(
|
||||
"loaded pretrained %s" % hps.train.pretrained_s2G,
|
||||
net_g.module.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
|
||||
strict=False,
|
||||
)
|
||||
if torch.cuda.is_available()
|
||||
else net_g.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
|
||||
strict=False,
|
||||
),
|
||||
) ##测试不加载优化器
|
||||
|
@ -189,7 +189,7 @@ def run(rank, n_gpus, hps):
|
||||
print(
|
||||
"loaded pretrained %s" % hps.train.pretrained_s2G,
|
||||
net_g.load_state_dict(
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"],
|
||||
torch.load(hps.train.pretrained_s2G, map_location="cpu", weights_only=False)["weight"],
|
||||
strict=False,
|
||||
),
|
||||
)
|
||||
@ -365,7 +365,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
||||
epoch,
|
||||
global_step,
|
||||
hps,
|
||||
cfm_version=hps.model.version,
|
||||
model_version=hps.model.version,
|
||||
lora_rank=lora_rank,
|
||||
),
|
||||
)
|
||||
|
GPT_SoVITS/sv.py (new file, 24 lines)
@ -0,0 +1,24 @@
import sys, os, torch

sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net")
sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
from ERes2NetV2 import ERes2NetV2
import kaldi as Kaldi


class SV:
    def __init__(self, device, is_half):
        pretrained_state = torch.load(sv_path, map_location="cpu", weights_only=False)
        embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
        embedding_model.load_state_dict(pretrained_state)
        embedding_model.eval()
        self.embedding_model = embedding_model
        if is_half == False:
            self.embedding_model = self.embedding_model.to(device)
        else:
            self.embedding_model = self.embedding_model.half().to(device)
        self.is_half = is_half

    def compute_embedding3(self, wav):
        with torch.no_grad():
            if self.is_half == True:
                wav = wav.half()
            feat = torch.stack(
                [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
            )
            sv_emb = self.embedding_model.forward3(feat)
        return sv_emb
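A short usage sketch for the new `GPT_SoVITS/sv.py` wrapper, assuming a mono 16 kHz reference clip (the webui resamples to 16 kHz before calling `compute_embedding3`); the file path is illustrative only:

```python
# Hedged sketch: computing a speaker-verification embedding with the SV wrapper above.
import torch
import torchaudio
from sv import SV

device = "cuda" if torch.cuda.is_available() else "cpu"
sv_model = SV(device, is_half=False)
wav, sr = torchaudio.load("reference_16k.wav")  # expected: mono, 16 kHz, values in -1..1
emb = sv_model.compute_embedding3(wav.to(device))
print(emb.shape)  # roughly torch.Size([1, 20480])
```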
@ -23,8 +23,8 @@ from .utils import load_config
onnxruntime.set_default_logger_severity(3)
try:
    onnxruntime.preload_dlls()
except:
    traceback.print_exc()
except: pass
    # traceback.print_exc()
warnings.filterwarnings("ignore")

model_version = "1.1"
|
@ -22,7 +22,7 @@ logger = logging

def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    iteration = checkpoint_dict["iteration"]
    learning_rate = checkpoint_dict["learning_rate"]
    if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None:
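The `weights_only=False` additions that recur throughout this commit track the change of `torch.load`'s default to `weights_only=True` in recent PyTorch releases, which rejects checkpoints that pickle plain Python objects such as the `config` dict stored in the GPT weights shown above. Illustrative only (the key layout follows `get_gpt_weights`/`change_gpt_weights` in this diff):

```python
# Hedged sketch: these checkpoints carry non-tensor metadata, so full unpickling is required.
ckpt = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt", map_location="cpu", weights_only=False)
print(ckpt["config"]["data"]["max_sec"])
```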
|
README.md (18 changed lines)
@ -328,6 +328,23 @@ Use v4 from v1/v2/v3 environment:

3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.

## V2Pro Release Notes

New Features:

1. Slightly higher VRAM usage than v2, with performance surpassing v4 while keeping v2's hardware cost and inference speed.
   [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))

2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio than toward the overall training set.

Use v2Pro from a v1/v2/v3/v4 environment:

1. `pip install -r requirements.txt` to update some packages

2. Clone the latest code from github.

3. Download v2Pro pretrained models (v2Pro/s2Dv2Pro.pth, v2Pro/s2Gv2Pro.pth, v2Pro/s2Dv2ProPlus.pth, v2Pro/s2Gv2ProPlus.pth, and sv/pretrained_eres2netv2w24s4ep4.ckpt) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.

## Todo List

- [x] **High Priority:**

@ -410,6 +427,7 @@ Special thanks to the following projects and contributors:

- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
- [eresnetv2](https://modelscope.cn/models/iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common)

### Text Frontend for Inference
|
||||
|
api.py (2 changed lines)
@ -374,7 +374,7 @@ hz = 50


def get_gpt_weights(gpt_path):
    dict_s1 = torch.load(gpt_path, map_location="cpu")
    dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
    config = dict_s1["config"]
    max_sec = config["data"]["max_sec"]
    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
|
batch_inference.py (new file, 442 lines)
@ -0,0 +1,442 @@
|
||||
import argparse
|
||||
import os
|
||||
import pdb
|
||||
import signal
|
||||
import sys
|
||||
from time import time as ttime
|
||||
import torch
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
import uvicorn
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
import numpy as np
|
||||
from feature_extractor import cnhubert
|
||||
from io import BytesIO
|
||||
from module.models import SynthesizerTrn
|
||||
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||
from text import cleaned_text_to_sequence
|
||||
from text.cleaner import clean_text
|
||||
from module.mel_processing import spectrogram_torch
|
||||
from my_utils import load_audio
|
||||
import config as global_config
|
||||
|
||||
g_config = global_config.Config()
|
||||
|
||||
# AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
parser = argparse.ArgumentParser(description="GPT-SoVITS api")
|
||||
|
||||
parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
|
||||
parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
|
||||
|
||||
parser.add_argument("-dr", "--default_refer_path", type=str, default="",
|
||||
help="默认参考音频路径, 请求缺少参考音频时调用")
|
||||
parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
|
||||
parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
|
||||
|
||||
parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
|
||||
parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
|
||||
parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
|
||||
parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
|
||||
parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
|
||||
# bool值的用法为 `python ./api.py -fp ...`
|
||||
# 此时 full_precision==True, half_precision==False
|
||||
|
||||
parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
|
||||
parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
sovits_path = args.sovits_path
|
||||
gpt_path = args.gpt_path
|
||||
|
||||
default_refer_path = args.default_refer_path
|
||||
default_refer_text = args.default_refer_text
|
||||
default_refer_language = args.default_refer_language
|
||||
has_preset = False
|
||||
|
||||
device = args.device
|
||||
port = args.port
|
||||
host = args.bind_addr
|
||||
|
||||
if sovits_path == "":
|
||||
sovits_path = g_config.pretrained_sovits_path
|
||||
print(f"[WARN] 未指定SoVITS模型路径, fallback后当前值: {sovits_path}")
|
||||
if gpt_path == "":
|
||||
gpt_path = g_config.pretrained_gpt_path
|
||||
print(f"[WARN] 未指定GPT模型路径, fallback后当前值: {gpt_path}")
|
||||
|
||||
# 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用
|
||||
if default_refer_path == "" or default_refer_text == "" or default_refer_language == "":
|
||||
default_refer_path, default_refer_text, default_refer_language = "", "", ""
|
||||
print("[INFO] 未指定默认参考音频")
|
||||
has_preset = False
|
||||
else:
|
||||
print(f"[INFO] 默认参考音频路径: {default_refer_path}")
|
||||
print(f"[INFO] 默认参考音频文本: {default_refer_text}")
|
||||
print(f"[INFO] 默认参考音频语种: {default_refer_language}")
|
||||
has_preset = True
|
||||
|
||||
is_half = g_config.is_half
|
||||
if args.full_precision:
|
||||
is_half = False
|
||||
if args.half_precision:
|
||||
is_half = True
|
||||
if args.full_precision and args.half_precision:
|
||||
is_half = g_config.is_half # 炒饭fallback
|
||||
|
||||
print(f"[INFO] 半精: {is_half}")
|
||||
|
||||
cnhubert_base_path = args.hubert_path
|
||||
bert_path = args.bert_path
|
||||
|
||||
cnhubert.cnhubert_base_path = cnhubert_base_path
|
||||
tokenizer = AutoTokenizer.from_pretrained(bert_path)
|
||||
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
|
||||
if is_half:
|
||||
bert_model = bert_model.half().to(device)
|
||||
else:
|
||||
bert_model = bert_model.to(device)
|
||||
|
||||
|
||||
def get_bert_feature(text, word2ph):
|
||||
with torch.no_grad():
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
for i in inputs:
|
||||
inputs[i] = inputs[i].to(device) #####输入是long不用管精度问题,精度随bert_model
|
||||
res = bert_model(**inputs, output_hidden_states=True)
|
||||
res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
|
||||
assert len(word2ph) == len(text)
|
||||
phone_level_feature = []
|
||||
for i in range(len(word2ph)):
|
||||
repeat_feature = res[i].repeat(word2ph[i], 1)
|
||||
phone_level_feature.append(repeat_feature)
|
||||
phone_level_feature = torch.cat(phone_level_feature, dim=0)
|
||||
# if(is_half==True):phone_level_feature=phone_level_feature.half()
|
||||
return phone_level_feature.T
|
||||
|
||||
|
||||
n_semantic = 1024
|
||||
dict_s2 = torch.load(sovits_path, map_location="cpu", weights_only=False)
|
||||
hps = dict_s2["config"]
|
||||
print(hps)
|
||||
|
||||
class DictToAttrRecursive(dict):
|
||||
def __init__(self, input_dict):
|
||||
super().__init__(input_dict)
|
||||
for key, value in input_dict.items():
|
||||
if isinstance(value, dict):
|
||||
value = DictToAttrRecursive(value)
|
||||
self[key] = value
|
||||
setattr(self, key, value)
|
||||
|
||||
def __getattr__(self, item):
|
||||
try:
|
||||
return self[item]
|
||||
except KeyError:
|
||||
raise AttributeError(f"Attribute {item} not found")
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
if isinstance(value, dict):
|
||||
value = DictToAttrRecursive(value)
|
||||
super(DictToAttrRecursive, self).__setitem__(key, value)
|
||||
super().__setattr__(key, value)
|
||||
|
||||
def __delattr__(self, item):
|
||||
try:
|
||||
del self[item]
|
||||
except KeyError:
|
||||
raise AttributeError(f"Attribute {item} not found")
|
||||
|
||||
|
||||
hps = DictToAttrRecursive(hps)
|
||||
hps.model.semantic_frame_rate = "25hz"
|
||||
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
|
||||
config = dict_s1["config"]
|
||||
ssl_model = cnhubert.get_model()
|
||||
if is_half:
|
||||
ssl_model = ssl_model.half().to(device)
|
||||
else:
|
||||
ssl_model = ssl_model.to(device)
|
||||
|
||||
vq_model = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**hps.model)
|
||||
if is_half:
|
||||
vq_model = vq_model.half().to(device)
|
||||
else:
|
||||
vq_model = vq_model.to(device)
|
||||
vq_model.eval()
|
||||
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
||||
hz = 50
|
||||
max_sec = config['data']['max_sec']
|
||||
t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
|
||||
t2s_model.load_state_dict(dict_s1["weight"])
|
||||
if is_half:
|
||||
t2s_model = t2s_model.half()
|
||||
t2s_model = t2s_model.to(device)
|
||||
t2s_model.eval()
|
||||
total = sum([param.nelement() for param in t2s_model.parameters()])
|
||||
print("Number of parameter: %.2fM" % (total / 1e6))
|
||||
|
||||
|
||||
def get_spepc(hps, filename):
|
||||
audio = load_audio(filename, int(hps.data.sampling_rate))
|
||||
audio = torch.FloatTensor(audio)
|
||||
audio_norm = audio
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length,
|
||||
hps.data.win_length, center=False)
|
||||
return spec
|
||||
|
||||
|
||||
dict_language = {
|
||||
"中文": "zh",
|
||||
"英文": "en",
|
||||
"日文": "ja",
|
||||
"ZH": "zh",
|
||||
"EN": "en",
|
||||
"JA": "ja",
|
||||
"zh": "zh",
|
||||
"en": "en",
|
||||
"ja": "ja"
|
||||
}
|
||||
|
||||
|
||||
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
    t0 = ttime()
    prompt_text = prompt_text.strip("\n")
    prompt_language, text = prompt_language, text.strip("\n")
    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half else np.float32)
    with torch.no_grad():
        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
        wav16k = torch.from_numpy(wav16k)
        zero_wav_torch = torch.from_numpy(zero_wav)
        if is_half:
            wav16k = wav16k.half().to(device)
            zero_wav_torch = zero_wav_torch.half().to(device)
        else:
            wav16k = wav16k.to(device)
            zero_wav_torch = zero_wav_torch.to(device)
        wav16k = torch.cat([wav16k, zero_wav_torch])
        ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
        codes = vq_model.extract_latent(ssl_content)
        prompt_semantic = codes[0, 0]
    t1 = ttime()
    prompt_language = dict_language[prompt_language]
    text_language = dict_language[text_language]
    phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
    phones1 = cleaned_text_to_sequence(phones1)
    texts = text.split("\n")
    audio_opt = []

    for text in texts:
        phones2, word2ph2, norm_text2 = clean_text(text, text_language)
        phones2 = cleaned_text_to_sequence(phones2)
        if prompt_language == "zh":
            bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
        else:
            bert1 = torch.zeros(
                (1024, len(phones1)), dtype=torch.float16 if is_half else torch.float32
            ).to(device)
        if text_language == "zh":
            bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
        else:
            bert2 = torch.zeros((1024, len(phones2))).to(bert1)
        bert = torch.cat([bert1, bert2], 1)

        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
        prompt = prompt_semantic.unsqueeze(0).to(device)
        t2 = ttime()
        with torch.no_grad():
            # pred_semantic = t2s_model.model.infer(
            pred_semantic, idx = t2s_model.model.infer_panel(
                all_phoneme_ids,
                all_phoneme_len,
                prompt,
                bert,
                # prompt_phone_len=ph_offset,
                top_k=config['inference']['top_k'],
                early_stop_num=hz * max_sec)
        t3 = ttime()
        # print(pred_semantic.shape,idx)
        pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # mq needs one extra unsqueeze
        refer = get_spepc(hps, ref_wav_path)  # .to(device)
        if is_half:
            refer = refer.half().to(device)
        else:
            refer = refer.to(device)
        # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
        # Decode with only the target phonemes, i.e. try reconstructing without the prompt part.
        audio = vq_model.decode(
            pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
        ).detach().cpu().numpy()[0, 0]
        audio_opt.append(audio)
        audio_opt.append(zero_wav)
        t4 = ttime()
    print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
    # yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
    return hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
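
# Example usage (a minimal sketch; the wav path, prompt text, and output file below are
# placeholders, not assets shipped with the repository):
#
#   sr, wav = get_tts_wav("ref.wav", "参考音频对应的文本。", "中文", "需要合成的文本。", "中文")
#   sf.write("output.wav", wav, sr)
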
def get_tts_wavs(ref_wav_path, prompt_text, prompt_language, textss, text_language):
    t0 = ttime()
    prompt_text = prompt_text.strip("\n")
    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half else np.float32)
    with torch.no_grad():
        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
        wav16k = torch.from_numpy(wav16k)
        zero_wav_torch = torch.from_numpy(zero_wav)
        if is_half:
            wav16k = wav16k.half().to(device)
            zero_wav_torch = zero_wav_torch.half().to(device)
        else:
            wav16k = wav16k.to(device)
            zero_wav_torch = zero_wav_torch.to(device)
        wav16k = torch.cat([wav16k, zero_wav_torch])
        ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
        codes = vq_model.extract_latent(ssl_content)
        prompt_semantic = codes[0, 0]
    t1 = ttime()
    prompt_language = dict_language[prompt_language]
    text_language = dict_language[text_language]
    phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
    phones1 = cleaned_text_to_sequence(phones1)
    audios_opt = []
    for text0 in textss:
        texts = text0.strip("\n").split("\n")
        audio_opt = []
        for text in texts:
            text = text.strip("。") + "。"
            phones2, word2ph2, norm_text2 = clean_text(text, text_language)
            phones2 = cleaned_text_to_sequence(phones2)
            if prompt_language == "zh":
                bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
            else:
                bert1 = torch.zeros(
                    (1024, len(phones1)), dtype=torch.float16 if is_half else torch.float32
                ).to(device)
            if text_language == "zh":
                bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
            else:
                bert2 = torch.zeros((1024, len(phones2))).to(bert1)
            bert = torch.cat([bert1, bert2], 1)

            all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
            bert = bert.to(device).unsqueeze(0)
            all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
            prompt = prompt_semantic.unsqueeze(0).to(device)
            t2 = ttime()
            with torch.no_grad():
                # pred_semantic = t2s_model.model.infer(
                pred_semantic, idx = t2s_model.model.infer_panel(
                    all_phoneme_ids,
                    all_phoneme_len,
                    prompt,
                    bert,
                    # prompt_phone_len=ph_offset,
                    top_k=config['inference']['top_k'],
                    early_stop_num=hz * max_sec)
            t3 = ttime()
            # print(pred_semantic.shape,idx)
            pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # mq needs one extra unsqueeze
            refer = get_spepc(hps, ref_wav_path)  # .to(device)
            if is_half:
                refer = refer.half().to(device)
            else:
                refer = refer.to(device)
            # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
            # Decode with only the target phonemes, i.e. try reconstructing without the prompt part.
            audio = vq_model.decode(
                pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
            ).detach().cpu().numpy()[0, 0]
            audio_opt.append(audio)
            audio_opt.append(zero_wav)
            t4 = ttime()
        print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
        audios_opt.append([text0, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)])
    return audios_opt


# get_tts_wav(r"D:\BaiduNetdiskDownload\gsv\speech\萧逸声音-你得先从滑雪的基本技巧学起.wav", "你得先从滑雪的基本技巧学起。", "中文", "我觉得还是该给喜欢的女孩子一场认真的告白。", "中文")
# with open(r"D:\BaiduNetdiskDownload\gsv\烟嗓-todo1.txt","r",encoding="utf8")as f:
# with open(r"D:\BaiduNetdiskDownload\gsv\年下-todo1.txt","r",encoding="utf8")as f:
# with open(r"D:\BaiduNetdiskDownload\gsv\萧逸3b.txt","r",encoding="utf8")as f:
with open(r"D:\BaiduNetdiskDownload\gsv\萧逸4.txt", "r", encoding="utf8") as f:
    textss = f.read().split("\n")
for idx, (text, audio) in enumerate(get_tts_wavs(r"D:\BaiduNetdiskDownload\gsv\speech\萧逸声音-你得先从滑雪的基本技巧学起.wav", "你得先从滑雪的基本技巧学起。", "中文", textss, "中文")):
# for idx,(text,audio)in enumerate(get_tts_wavs(r"D:\BaiduNetdiskDownload\gsv\足够的能力,去制定好自己的生活规划。低沉烟嗓.MP3_1940480_2095360.wav", "足够的能力,去制定好自己的生活规划。", "中文", textss, "中文")):
# for idx,(text,audio)in enumerate(get_tts_wavs(r"D:\BaiduNetdiskDownload\gsv\不会呀!你前几天才吃过你还说好吃来着。年下少年音.MP3_537600_711040.wav", "不会呀!你前几天才吃过你还说好吃来着。", "中文", textss, "中文")):
    print(idx, text)
    # sf.write(r"D:\BaiduNetdiskDownload\gsv\output\烟嗓第一批\%04d-%s.wav"%(idx,text),audio,32000)
    # sf.write(r"D:\BaiduNetdiskDownload\gsv\output\年下\%04d-%s.wav"%(idx,text),audio,32000)
    sf.write(r"D:\BaiduNetdiskDownload\gsv\output\萧逸第4批\%04d-%s.wav" % (idx, text), audio, 32000)


# def handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language):
#     if command == "/restart":
#         os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
#     elif command == "/exit":
#         os.kill(os.getpid(), signal.SIGTERM)
#         exit(0)
#
#     if (
#         refer_wav_path == "" or refer_wav_path is None
#         or prompt_text == "" or prompt_text is None
#         or prompt_language == "" or prompt_language is None
#     ):
#         refer_wav_path, prompt_text, prompt_language = (
#             default_refer_path,
#             default_refer_text,
#             default_refer_language,
#         )
#         if not has_preset:
#             raise HTTPException(status_code=400, detail="未指定参考音频且接口无预设")
#
#     with torch.no_grad():
#         gen = get_tts_wav(
#             refer_wav_path, prompt_text, prompt_language, text, text_language
#         )
#         sampling_rate, audio_data = next(gen)
#
#     wav = BytesIO()
#     sf.write(wav, audio_data, sampling_rate, format="wav")
#     wav.seek(0)
#
#     torch.cuda.empty_cache()
#     return StreamingResponse(wav, media_type="audio/wav")


# app = FastAPI()
#
#
# @app.post("/")
# async def tts_endpoint(request: Request):
#     json_post_raw = await request.json()
#     return handle(
#         json_post_raw.get("command"),
#         json_post_raw.get("refer_wav_path"),
#         json_post_raw.get("prompt_text"),
#         json_post_raw.get("prompt_language"),
#         json_post_raw.get("text"),
#         json_post_raw.get("text_language"),
#     )
#
#
# @app.get("/")
# async def tts_endpoint(
#     command: str = None,
#     refer_wav_path: str = None,
#     prompt_text: str = None,
#     prompt_language: str = None,
#     text: str = None,
#     text_language: str = None,
# ):
#     return handle(command, refer_wav_path, prompt_text, prompt_language, text, text_language)
#
#
# if __name__ == "__main__":
#     uvicorn.run(app, host=host, port=port, workers=1)
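
# If the FastAPI block above were re-enabled, the endpoint could be exercised like this
# (a sketch; the host/port and the JSON field values are assumptions, not repository defaults):
#
#   curl -X POST http://127.0.0.1:9880/ \
#        -H "Content-Type: application/json" \
#        -d '{"refer_wav_path": "ref.wav", "prompt_text": "...", "prompt_language": "中文",
#             "text": "...", "text_language": "中文"}' \
#        --output output.wav
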
config.py
@ -1,8 +1,126 @@
import sys
import os
import re
import sys

import torch

from tools.i18n.i18n import I18nAuto

i18n = I18nAuto(language=os.environ.get("language", "Auto"))


pretrained_sovits_name = {
    "v1": "GPT_SoVITS/pretrained_models/s2G488k.pth",
    "v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
    "v3": "GPT_SoVITS/pretrained_models/s2Gv3.pth",  # v3/v4 would also need the vocoder checked; left as-is for now
    "v4": "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
    "v2Pro": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
    "v2ProPlus": "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}

pretrained_gpt_name = {
    "v1": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
    "v2": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
    "v3": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
    "v4": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
    "v2Pro": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
    "v2ProPlus": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
name2sovits_path = {
    # i18n("不训练直接推v1底模!"): "GPT_SoVITS/pretrained_models/s2G488k.pth",
    i18n("不训练直接推v2底模!"): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
    # i18n("不训练直接推v3底模!"): "GPT_SoVITS/pretrained_models/s2Gv3.pth",
    # i18n("不训练直接推v4底模!"): "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
    i18n("不训练直接推v2Pro底模!"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2Pro.pth",
    i18n("不训练直接推v2ProPlus底模!"): "GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth",
}
name2gpt_path = {
    # i18n("不训练直接推v1底模!"):"GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
    i18n(
        "不训练直接推v2底模!"
    ): "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
    i18n("不训练直接推v3底模!"): "GPT_SoVITS/pretrained_models/s1v3.ckpt",
}
SoVITS_weight_root = [
    "SoVITS_weights",
    "SoVITS_weights_v2",
    "SoVITS_weights_v3",
    "SoVITS_weights_v4",
    "SoVITS_weights_v2Pro",
    "SoVITS_weights_v2ProPlus",
]
GPT_weight_root = [
    "GPT_weights",
    "GPT_weights_v2",
    "GPT_weights_v3",
    "GPT_weights_v4",
    "GPT_weights_v2Pro",
    "GPT_weights_v2ProPlus",
]
SoVITS_weight_version2root = {
    "v1": "SoVITS_weights",
    "v2": "SoVITS_weights_v2",
    "v3": "SoVITS_weights_v3",
    "v4": "SoVITS_weights_v4",
    "v2Pro": "SoVITS_weights_v2Pro",
    "v2ProPlus": "SoVITS_weights_v2ProPlus",
}
GPT_weight_version2root = {
    "v1": "GPT_weights",
    "v2": "GPT_weights_v2",
    "v3": "GPT_weights_v3",
    "v4": "GPT_weights_v4",
    "v2Pro": "GPT_weights_v2Pro",
    "v2ProPlus": "GPT_weights_v2ProPlus",
}


def custom_sort_key(s):
    # Use a regular expression to split the string into numeric and non-numeric parts
    parts = re.split("(\d+)", s)
    # Convert numeric parts to integers; leave non-numeric parts unchanged
    parts = [int(part) if part.isdigit() else part for part in parts]
    return parts
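
# Example (a sketch): sorted(["e10.pth", "e2.pth"], key=custom_sort_key) returns
# ["e2.pth", "e10.pth"], because the embedded numbers are compared numerically
# rather than character by character.

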
def get_weights_names():
    SoVITS_names = []
    for key in name2sovits_path:
        if os.path.exists(name2sovits_path[key]):
            SoVITS_names.append(key)
    for path in SoVITS_weight_root:
        if not os.path.exists(path):
            continue
        for name in os.listdir(path):
            if name.endswith(".pth"):
                SoVITS_names.append("%s/%s" % (path, name))
    if not SoVITS_names:
        SoVITS_names = [""]
    GPT_names = []
    for key in name2gpt_path:
        if os.path.exists(name2gpt_path[key]):
            GPT_names.append(key)
    for path in GPT_weight_root:
        if not os.path.exists(path):
            continue
        for name in os.listdir(path):
            if name.endswith(".ckpt"):
                GPT_names.append("%s/%s" % (path, name))
    SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
    GPT_names = sorted(GPT_names, key=custom_sort_key)
    if not GPT_names:
        GPT_names = [""]
    return SoVITS_names, GPT_names


def change_choices():
    SoVITS_names, GPT_names = get_weights_names()
    return {"choices": SoVITS_names, "__type__": "update"}, {
        "choices": GPT_names,
        "__type__": "update",
    }


# Models explicitly specified for inference
sovits_path = ""
gpt_path = ""
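
# Example wiring (a sketch; `gr`, the dropdowns, and the refresh button are assumptions
# about the surrounding WebUI, not definitions made in this file):
#
#   SoVITS_names, GPT_names = get_weights_names()
#   dropdown_sovits = gr.Dropdown(choices=SoVITS_names, value=sovits_path or None)
#   dropdown_gpt = gr.Dropdown(choices=GPT_names, value=gpt_path or None)
#   refresh_button.click(fn=change_choices, inputs=[], outputs=[dropdown_sovits, dropdown_gpt])
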
@ -18,10 +136,6 @@ pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=

exp_root = "logs"
python_exec = sys.executable or "python"
if torch.cuda.is_available():
    infer_device = "cuda"
else:
    infer_device = "cpu"

webui_port_main = 9874
webui_port_uvr5 = 9873
@ -30,20 +144,55 @@ webui_port_subfix = 9871

api_port = 9880

if infer_device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    if (
        ("16" in gpu_name and "V100" not in gpu_name.upper())
        or "P40" in gpu_name.upper()
        or "P10" in gpu_name.upper()
        or "1060" in gpu_name
        or "1070" in gpu_name
        or "1080" in gpu_name
    ):
        is_half = False

if infer_device == "cpu":
    is_half = False
def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
    cpu = torch.device("cpu")
    cuda = torch.device(f"cuda:{idx}")
    if not torch.cuda.is_available():
        return cpu, torch.float32, 0.0, 0.0
    device_idx = idx
    capability = torch.cuda.get_device_capability(device_idx)
    name = torch.cuda.get_device_name(device_idx)
    mem_bytes = torch.cuda.get_device_properties(device_idx).total_memory
    mem_gb = mem_bytes / (1024**3) + 0.4
    major, minor = capability
    sm_version = major + minor / 10.0
    is_16_series = bool(re.search(r"16\d{2}", name))
    if mem_gb < 4:
        return cpu, torch.float32, 0.0, 0.0
    if (sm_version >= 7.0 and sm_version != 7.5) or (5.3 <= sm_version <= 6.0):
        if is_16_series and sm_version == 7.5:
            return cuda, torch.float32, sm_version, mem_gb  # except 16-series cards
        else:
            return cuda, torch.float16, sm_version, mem_gb
    return cpu, torch.float32, 0.0, 0.0
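
# Example (a sketch, not a guarantee for every driver/setup): on a single RTX 3090
# (compute capability 8.6, ~24 GB) get_device_dtype_sm(0) returns roughly
# (device('cuda:0'), torch.float16, 8.6, ~24.4); on a CPU-only machine it returns
# (device('cpu'), torch.float32, 0.0, 0.0).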


IS_GPU = True
GPU_INFOS: list[str] = []
GPU_INDEX: set[int] = set()
GPU_COUNT = torch.cuda.device_count()
CPU_INFO: str = "0\tCPU " + i18n("CPU训练,较慢")
tmp: list[tuple[torch.device, torch.dtype, float, float]] = []
memset: set[float] = set()

for i in range(max(GPU_COUNT, 1)):
    tmp.append(get_device_dtype_sm(i))

for j in tmp:
    device = j[0]
    memset.add(j[3])
    if device.type != "cpu":
        GPU_INFOS.append(f"{device.index}\t{torch.cuda.get_device_name(device.index)}")
        GPU_INDEX.add(device.index)

if not GPU_INFOS:
    IS_GPU = False
    GPU_INFOS.append(CPU_INFO)
    GPU_INDEX.add(0)

infer_device = max(tmp, key=lambda x: (x[2], x[3]))[0]
is_half = any(dtype == torch.float16 for _, dtype, _, _ in tmp)


class Config:
|
@ -1,4 +1,6 @@
|
||||
### 20240121更新
|
||||
# 更新日志
|
||||
|
||||
## 20240121
|
||||
|
||||
1-config添加is_share, 诸如colab等场景可以将此改为True, 来使得webui映射到公网
|
||||
|
||||
@ -12,7 +14,7 @@
|
||||
|
||||
6-大幅削弱合成音频包含参考音频结尾的问题
|
||||
|
||||
### 20240122更新
|
||||
## 20240122
|
||||
|
||||
1-修复过短输出文件返回重复参考音频的问题.
|
||||
|
||||
@ -20,7 +22,7 @@
|
||||
|
||||
3-音频路径检查.如果尝试读取输入错的路径报错路径不存在, 而非ffmpeg错误.
|
||||
|
||||
### 20240123更新
|
||||
## 20240123
|
||||
|
||||
1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题
|
||||
|
||||
@ -30,7 +32,7 @@
|
||||
|
||||
4-中文分词使用jieba_fast代替jieba
|
||||
|
||||
### 20240126更新
|
||||
## 20240126
|
||||
|
||||
1-支持输出文本中英混合、日英混合
|
||||
|
||||
@ -46,7 +48,7 @@
|
||||
|
||||
7-自动识别不支持半精度的卡强制单精度.cpu推理下强制单精度.
|
||||
|
||||
### 20240128更新
|
||||
## 20240128
|
||||
|
||||
1-修复数字转汉字念法问题
|
||||
|
||||
@ -58,7 +60,7 @@
|
||||
|
||||
5-完善Dockerfile的下载模型流程
|
||||
|
||||
### 20240129更新
|
||||
## 20240129
|
||||
|
||||
1-16系等半精度训练有问题的显卡把训练配置改为单精度训练
|
||||
|
||||
@ -67,7 +69,7 @@
|
||||
3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题
|
||||
|
||||
|
||||
### 20240130更新
|
||||
## 20240130
|
||||
|
||||
1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错
|
||||
|
||||
@ -75,19 +77,19 @@
|
||||
|
||||
3-增加按标点符号切分
|
||||
|
||||
### 20240201更新
|
||||
## 20240201
|
||||
|
||||
1-修复uvr5读取格式错误导致分离失败的问题
|
||||
|
||||
2-支持中日英混合多种文本自动切分识别语种
|
||||
|
||||
### 20240202更新
|
||||
## 20240202
|
||||
|
||||
1-修复asr路径尾缀带/保存文件名报错
|
||||
|
||||
2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨,下划线不再会报错
|
||||
|
||||
### 20240207更新
|
||||
## 20240207
|
||||
|
||||
1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391
|
||||
|
||||
@ -103,29 +105,29 @@
|
||||
|
||||
7-集成faster whisper ASR日文英文
|
||||
|
||||
### 20240208更新
|
||||
## 20240208
|
||||
|
||||
1-GPT训练卡死 (win10 1909) 和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体) GPT训练报错, [尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b).
|
||||
|
||||
### 20240212更新
|
||||
## 20240212
|
||||
|
||||
1-faster whisper和funasr逻辑优化.faster whisper转镜像站下载, 规避huggingface连不上的问题.
|
||||
|
||||
2-DPO Loss实验性训练选项开启, 通过构造负样本训练缓解GPT重复漏字问题.推理界面公开几个推理参数. https://github.com/RVC-Boss/GPT-SoVITS/pull/457
|
||||
|
||||
### 20240214更新
|
||||
## 20240214
|
||||
|
||||
1-训练支持中文实验名 (原来会报错)
|
||||
|
||||
2-DPO训练改为可勾选选项而非必须.如勾选batch size自动减半.修复推理界面新参数不传参的问题.
|
||||
|
||||
### 20240216更新
|
||||
## 20240216
|
||||
|
||||
1-支持无参考文本输入
|
||||
|
||||
2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475
|
||||
|
||||
### 20240221更新
|
||||
## 20240221
|
||||
|
||||
1-数据处理添加语音降噪选项 (降噪为只剩16k采样率, 除非底噪很大先不急着用哦).
|
||||
|
||||
@ -135,7 +137,7 @@
|
||||
|
||||
4-colab修复不开启公网url
|
||||
|
||||
### 20240306更新
|
||||
## 20240306
|
||||
|
||||
1-推理加速50% (RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested) https://github.com/RVC-Boss/GPT-SoVITS/pull/672
|
||||
|
||||
@ -147,7 +149,7 @@
|
||||
|
||||
5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573
|
||||
|
||||
### 202403/202404/202405更新
|
||||
## 202403/202404/202405
|
||||
|
||||
2个重点
|
||||
|
||||
@ -169,9 +171,9 @@
|
||||
|
||||
6-nan自动转fp32阶段的hubert提取bug修复
|
||||
|
||||
### 20240610
|
||||
## 20240610
|
||||
|
||||
小问题修复:
|
||||
小问题修复:
|
||||
|
||||
1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169
|
||||
|
||||
@ -179,13 +181,13 @@
|
||||
|
||||
3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159
|
||||
|
||||
大问题修复:
|
||||
大问题修复:
|
||||
|
||||
4-修复了webui的GPT中文微调没读到bert导致和推理不一致, 训练太多可能效果还会变差的问题.如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
|
||||
|
||||
### 20240706
|
||||
## 20240706
|
||||
|
||||
小问题修复:
|
||||
小问题修复:
|
||||
|
||||
1-[修正CPU推理默认bs小数](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
|
||||
|
||||
@ -197,13 +199,13 @@
|
||||
|
||||
5-移除冗余my_utils https://github.com/RVC-Boss/GPT-SoVITS/pull/1251
|
||||
|
||||
重点:
|
||||
重点:
|
||||
|
||||
6-倍速推理代码经过验证后推理效果和base完全一致, 合并进main.使用的代码: https://github.com/RVC-Boss/GPT-SoVITS/pull/672 .支持无参考文本模式也倍速.
|
||||
|
||||
后面会逐渐验证快速推理分支的推理改动的一致性
|
||||
|
||||
### 20240727
|
||||
## 20240727
|
||||
|
||||
1-清理冗余i18n代码 https://github.com/RVC-Boss/GPT-SoVITS/pull/1298
|
||||
|
||||
@ -211,113 +213,184 @@
|
||||
|
||||
3-修复GPT训练的step计算逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/756
|
||||
|
||||
重点:
|
||||
重点:
|
||||
|
||||
4-[支持合成语速调节.支持冻结随机性只调节语速, ](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2)并将其更新到api.py上https://github.com/RVC-Boss/GPT-SoVITS/pull/1340
|
||||
|
||||
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356): 增加 BS-Roformer 人声伴奏分离模型支持.
|
||||
- 类型: 新功能
|
||||
- 提交: KamioRinn
|
||||
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351): 更好的中文文本前端.
|
||||
- 类型: 新功能
|
||||
- 提交: KamioRinn
|
||||
|
||||
### 20240806
|
||||
## 202408 (V2 版本)
|
||||
|
||||
1-增加bs-roformer人声伴奏分离模型支持. https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理.](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
|
||||
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355): 添加自动填充下一步文件路径的功能.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c): 支持 BS-Roformer 的 FP16 推理.
|
||||
- 类型: 性能优化
|
||||
- 提交: RVC-Boss
|
||||
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78): 增加用户友好逻辑, 对用户随意输入的显卡序号也能正常运行.
|
||||
- 类型: 杂项
|
||||
- 提交: RVC-Boss
|
||||
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758): **新增 GPT-SoVITS V2 模型.**
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss
|
||||
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3): 增加粤语 FunASR 支持.
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss
|
||||
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388): 优化界面, 优化计时逻辑.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488): 优化多音字逻辑 (V2 版本特供).
|
||||
- 类型: 修复, 新功能
|
||||
- 提交: KamioRinn, RVC-Boss
|
||||
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422): 修复参考音频混合只能上传一条的错误, 添加数据集检查, 缺失会弹出警告窗口.
|
||||
- 类型: 修复, 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508): 上游 LangSegment 库支持通过 SSML 标签优化数字、电话、时间日期等.
|
||||
- 类型: 新功能
|
||||
- 提交: juntaosun
|
||||
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503): 修复并优化 API.
|
||||
- 类型: 修复
|
||||
- 提交: KamioRinn
|
||||
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490): 合并 fast_inference 分支.
|
||||
- 类型: 重构
|
||||
- 提交: ChasonJiang
|
||||
- 2024.08.21 **正式发布 GPT-SoVITS V2 版本.**
|
||||
|
||||
2-更好的中文文本前端. https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑 (v2版本特供). https://github.com/RVC-Boss/GPT-SoVITS/pull/488
|
||||
## 202502 (V3 版本)
|
||||
|
||||
3-自动填充下一步的文件路径 https://github.com/RVC-Boss/GPT-SoVITS/pull/1355
|
||||
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4): **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
|
||||
- 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
|
||||
- 提交: RVC-Boss
|
||||
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032): 更新项目多语言文档.
|
||||
- 类型: 文档
|
||||
- 提交: StaryLan
|
||||
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033): 更新日语文档.
|
||||
- 类型: 文档
|
||||
- 提交: Fyphen
|
||||
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010): 优化注意力计算逻辑.
|
||||
- 类型: 性能优化
|
||||
- 提交: wzy3650
|
||||
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040): 微调添加梯度检查点支持, 需要 12G 显存进行微调.
|
||||
- 类型: 新功能
|
||||
- 提交: Kakaru Hayate
|
||||
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073): 切换新的语言分割工具, 优化多语种混合文本切分策略, 优化文本里的数字和英文处理逻辑.
|
||||
- 类型: 新功能
|
||||
- 提交: KamioRinn
|
||||
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e): **GPT-SoVITS V3 模型支持 LoRA 训练, 需要 8G 显存进行微调.**
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss
|
||||
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078): 人声背景音分离增加 Mel Band Roformer 模型支持.
|
||||
- 类型: 新功能
|
||||
- 提交: Sucial
|
||||
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114): 修复中文路径下 Mecab 的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错).
|
||||
- 类型: 修复
|
||||
- 提交: KamioRinn
|
||||
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc): **支持使用 24KHz 转 48kHz 的音频超分模型**, 缓解 V3 模型生成音频感觉闷的问题.
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss
|
||||
- 关联: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
|
||||
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123): 更新项目多语言文档
|
||||
- 类型: 文档
|
||||
- 提交: StaryLan
|
||||
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122): 对于模型无法判断的CJK短字符采用规则判断.
|
||||
- 类型: 修复
|
||||
- 提交: KamioRinn
|
||||
- 关联: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
|
||||
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5): 增加语速传参以支持调整合成语速.
|
||||
- 类型: 修复
|
||||
- 提交: RVC-Boss
|
||||
- 2025.02.28 **正式发布 GPT-SoVITS V3**.
|
||||
|
||||
4-增加喂饭逻辑, 用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
|
||||
## 202503
|
||||
|
||||
5-增加粤语ASR支持 [8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
|
||||
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236): 修复一批由依赖的库版本不对导致的问题.
|
||||
- 类型: 修复
|
||||
- 提交: XXXXRT666
|
||||
- 关联:
|
||||
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
|
||||
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
|
||||
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
|
||||
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
|
||||
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241): **为 SoVITS v3 适配并行推理**.
|
||||
- 类型: 新功能
|
||||
- 提交: ChasonJiang
|
||||
|
||||
6-GPT-SoVITS-v2支持
|
||||
- 修复其他若干错误.
|
||||
|
||||
7-计时逻辑优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/1387
|
||||
- 整合包修复 onnxruntime GPU 推理的支持
|
||||
- 类型: 修复
|
||||
- 内容:
|
||||
- G2PW 内的 ONNX 模型由 CPU 推理 换为 GPU, 显著降低推理的 CPU 瓶颈;
|
||||
- foxjoy 去混响模型现在可使用 GPU 推理
|
||||
|
||||
### 20240821
|
||||
## 202504 (V4 版本)
|
||||
|
||||
1-fast_inference分支合并进main: https://github.com/RVC-Boss/GPT-SoVITS/pull/1490
|
||||
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f): 解锁 SoVITS v3 并行推理, 修复模型加载异步逻辑.
|
||||
- 类型: 修复
|
||||
- 提交: RVC-Boss
|
||||
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255): Ruff 格式化代码, 更新 G2PW 链接.
|
||||
- 类型: 风格
|
||||
- 提交: XXXXRT666
|
||||
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290): 清理文档, 支持 Python 3.11, 更新安装文件.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300): 更新 Colab, 安装文件和模型下载.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf): **新增 GPT-SoVITS V4 模型**.
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss
|
||||
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307): 适配 V4 并行推理.
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309): 修复模型版本传参.
|
||||
- 类型: 修复
|
||||
- 提交: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310): 修复 Numpy 与 Numba 版本不匹配问题, 更新 librosa 版本.
|
||||
- 类型: 修复
|
||||
- 提交: RVC-Boss, XXXXRT666
|
||||
- 关联: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
|
||||
- **2024.04.22 正式发布 GPT-SoVITS V4**.
|
||||
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311): 更新 Gradio 参数.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322): 完善 Colab/Kaggle Notebook 脚本.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
|
||||
2-支持通过ssml标签优化数字、电话、时间日期等: https://github.com/RVC-Boss/GPT-SoVITS/issues/1508
|
||||
## 202505
|
||||
|
||||
3-api修复优化: https://github.com/RVC-Boss/GPT-SoVITS/pull/1503
|
||||
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351): 完善 Docker, Windows 自动构建脚本, Pre-Commit 格式化.
|
||||
- 类型: 杂项
|
||||
- 提交: XXXXRT666
|
||||
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408): 优化混合语种切分识别逻辑.
|
||||
- 类型: 修复
|
||||
- 提交: KamioRinn
|
||||
- 关联: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
|
||||
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377): 通过缓存策略使 SoVITS V3/V4 推理提速 10%.
|
||||
- 类型: 性能优化
|
||||
- 提交: Kakaru Hayate
|
||||
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba): 更新标注界面, 增加友情提示, 即标注完每一页都要点击 `Submit Text` 否则修改无效.
|
||||
- 类型: 修复
|
||||
- 提交: RVC-Boss
|
||||
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc): 修复 UVR5 和 ONNX 去混响模型使用 FFmpeg 编码 MP3 和 M4A 原路径带空格时的错误.
|
||||
- 类型: 修复
|
||||
- 提交: RVC-Boss
|
||||
|
||||
4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
|
||||
|
||||
5-增加了各种数据集检查,若缺失会弹出warning:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
|
||||
|
||||
### 20250211
|
||||
|
||||
增加gpt-sovits-v3模型, 需要14G显存可以微调
|
||||
|
||||
### 20250212
|
||||
|
||||
sovits-v3微调支持开启梯度检查点, 需要12G显存可以微调https://github.com/RVC-Boss/GPT-SoVITS/pull/2040
|
||||
|
||||
### 20250214
|
||||
|
||||
优化多语种混合文本切分策略a https://github.com/RVC-Boss/GPT-SoVITS/pull/2047
|
||||
|
||||
### 20250217
|
||||
|
||||
优化文本里的数字和英文处理逻辑https://github.com/RVC-Boss/GPT-SoVITS/pull/2062
|
||||
|
||||
### 20250218
|
||||
|
||||
优化多语种混合文本切分策略b https://github.com/RVC-Boss/GPT-SoVITS/pull/2073
|
||||
|
||||
### 20250223
|
||||
|
||||
1-sovits-v3微调支持lora训练, 需要8G显存可以微调, 效果比全参微调更好
|
||||
|
||||
2-人声背景音分离增加mel band roformer模型支持https://github.com/RVC-Boss/GPT-SoVITS/pull/2078
|
||||
|
||||
### 20250226
|
||||
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT-SoVITS/pull/2114
|
||||
|
||||
修复中文路径下mecab的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错)
|
||||
|
||||
### 20250227
|
||||
|
||||
针对v3生成24k音频感觉闷的问题https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 https://github.com/RVC-Boss/GPT-SoVITS/issues/2117 ,支持使用24k to 48k的音频超分模型缓解.
|
||||
## 202506 (V2Pro 系列)
|
||||
- 2025.06.04 [Commit#b7c0c5ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/b7c0c5ca878bcdd419fd86bf80dba431a6653356)~[Commit#298ebb03](https://github.com/RVC-Boss/GPT-SoVITS/commit/298ebb03c5a719388527ae6a586c7ea960344e70): **新增 GPT-SoVITS V2Pro 系列模型**.
|
||||
- 类型: 新功能
|
||||
- 提交: RVC-Boss
|
||||
- 2025.06.05 [PR#2426](https://github.com/RVC-Boss/GPT-SoVITS/pull/2426): config/inference_webui初始化bug修复.
|
||||
- 类型: 修复
|
||||
- 提交: SapphireLab
|
||||
- 2025.06.05 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427): 优化精度自动检测逻辑;给webui前端界面模块增加可收缩式支持.
|
||||
- 类型: 新功能
|
||||
- 提交: XXXXRT666
|
||||
|
||||
|
||||
### 20250228
|
||||
|
||||
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
|
||||
|
||||
修复v3sovits未传参以支持调节语速
|
||||
|
||||
### 202503
|
||||
|
||||
修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
|
||||
|
||||
修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
||||
|
||||
修复其他若干bug
|
||||
|
||||
重点更新:
|
||||
|
||||
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
||||
|
||||
2-整合包修复onnxruntime GPU推理的支持, 影响: (1) g2pw有个onnx模型原先是CPU推理现在用GPU, 显著降低推理的CPU瓶颈 (2) foxjoy去混响模型现在可使用GPU推理
|
||||
|
||||
### 202504/202505更新
|
||||
|
||||
1-修复uvr5和onnx去混响模型ffmpeg编码mp3和m4a原路径带空格会有bug的问题
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc
|
||||
|
||||
2-标注界面增加友情提示标注完每一面都要点submit text否则白忙活
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2
|
||||
|
||||
3-通过缓存策略使sovits推理提速10%
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/pull/2377
|
||||
|
||||
4-混合语种切分识别逻辑优化
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/pull/2408
|
||||
|
||||
5-完善colab/kaggle notebook脚本,完善linux环境配置脚本,docker环境,windows自动构建脚本
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/commit/ad7df5298bea51273c86c05b5b13f28ed7d9fe16
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/commit/d5e479dad6342222eb4887df627e69c048d2338c
|
||||
|
||||
预告:端午后基于V2版本进行重大优化更新!
|
||||
|
@ -1,4 +1,6 @@
|
||||
### 20240121 Update
|
||||
# Changelog
|
||||
|
||||
## 20240121
|
||||
|
||||
1. Added `is_share` to the `config`. In scenarios like Colab, this can be set to `True` to map the WebUI to the public network.
|
||||
2. Added English system translation support to WebUI.
|
||||
@ -7,20 +9,20 @@
|
||||
5. Cleaned up cached audio files and other files in the `TEMP` folder.
|
||||
6. Significantly reduced the issue of synthesized audio containing the end of the reference audio.
|
||||
|
||||
### 20240122 Update
|
||||
## 20240122
|
||||
|
||||
1. Fixed the issue where excessively short output files resulted in repeating the reference audio.
|
||||
2. Tested native support for English and Japanese training (Japanese training requires the root directory to be free of non-English special characters).
|
||||
3. Improved audio path checking. If an attempt is made to read from an incorrect input path, it will report that the path does not exist instead of an ffmpeg error.
|
||||
|
||||
### 20240123 Update
|
||||
## 20240123
|
||||
|
||||
1. Resolved the issue where Hubert extraction caused NaN errors, leading to SoVITS/GPT training ZeroDivisionError.
|
||||
2. Added support for quick model switching in the inference WebUI.
|
||||
3. Optimized the model file sorting logic.
|
||||
4. Replaced `jieba` with `jieba_fast` for Chinese word segmentation.
|
||||
|
||||
### 20240126 Update
|
||||
## 20240126
|
||||
|
||||
1. Added support for Chinese-English mixed and Japanese-English mixed output texts.
|
||||
2. Added an optional segmentation mode for output.
|
||||
@ -30,7 +32,7 @@
|
||||
6. Supported training and inference on Mac.
|
||||
7. Automatically forced single precision for GPU that do not support half precision; enforced single precision under CPU inference.
|
||||
|
||||
### 20240128 Update
|
||||
## 20240128
|
||||
|
||||
1. Fixed the issue with the pronunciation of numbers converting to Chinese characters.
|
||||
2. Fixed the issue of swallowing a few characters at the beginning of sentences.
|
||||
@ -38,29 +40,29 @@
|
||||
4. Fixed the issue where GPT training did not save checkpoints.
|
||||
5. Completed model downloading process in the Dockerfile.
|
||||
|
||||
### 20240129 Update
|
||||
## 20240129
|
||||
|
||||
1. Changed training configurations to single precision for GPUs like the 16 series, which have issues with half precision training.
|
||||
2. Tested and updated the available Colab version.
|
||||
3. Fixed the issue of git cloning the ModelScope FunASR repository with older versions of FunASR causing interface misalignment errors.
|
||||
|
||||
### 20240130 Update
|
||||
## 20240130
|
||||
|
||||
1. Automatically removed double quotes from all path-related entries to prevent errors from novice users copying paths with double quotes.
|
||||
2. Fixed issues with splitting Chinese and English punctuation and added punctuation at the beginning and end of sentences.
|
||||
3. Added splitting by punctuation.
|
||||
|
||||
### 20240201 Update
|
||||
## 20240201
|
||||
|
||||
1. Fixed the UVR5 format reading error causing separation failures.
|
||||
2. Supported automatic segmentation and language recognition for mixed Chinese-Japanese-English texts.
|
||||
|
||||
### 20240202 Update
|
||||
## 20240202
|
||||
|
||||
1. Fixed the issue where an ASR path ending with `/` caused an error in saving the filename.
|
||||
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) introduced PaddleSpeech's Normalizer to fix issues like reading "xx.xx%" (percent symbols) and "元/吨" being read as "元吨" instead of "元每吨", and fixed underscore errors.
|
||||
|
||||
### 20240207 Update
|
||||
## 20240207
|
||||
|
||||
1. Corrected language parameter confusion causing decreased Chinese inference quality reported in [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391).
|
||||
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) adapted UVR5 to higher versions of librosa.
|
||||
@ -70,33 +72,33 @@
|
||||
6. Supported automatic reading of `.list` full paths if the root directory is left blank during dataset preparation.
|
||||
7. Integrated Faster Whisper ASR for Japanese and English.
|
||||
|
||||
### 20240208 Update
|
||||
## 20240208
|
||||
|
||||
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) attempted to fix GPT training hang on Windows 10 1909 and [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (Traditional Chinese System Language).
|
||||
|
||||
### 20240212 Update
|
||||
## 20240212
|
||||
|
||||
1. Optimized logic for Faster Whisper and FunASR, switching Faster Whisper to mirror downloads to avoid issues with Hugging Face connections.
|
||||
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) enabled experimental DPO Loss training option to mitigate GPT repetition and missing characters by constructing negative samples during training and made several inference parameters available in the inference WebUI.
|
||||
|
||||
### 20240214 Update
|
||||
## 20240214
|
||||
|
||||
1. Supported Chinese experiment names in training (previously caused errors).
|
||||
2. Made DPO training an optional feature instead of mandatory. If selected, the batch size is automatically halved. Fixed issues with new parameters not being passed in the inference WebUI.
|
||||
|
||||
### 20240216 Update
|
||||
## 20240216
|
||||
|
||||
1. Supported input without reference text.
|
||||
2. Fixed bugs in Chinese frontend reported in [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475).
|
||||
|
||||
### 20240221 Update
|
||||
## 20240221
|
||||
|
||||
1. Added a noise reduction option during data processing (noise reduction leaves only 16kHz sampling rate; use only if the background noise is significant).
|
||||
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) optimized Chinese and Japanese frontend processing.
|
||||
3. Switched Mac CPU inference to use CPU instead of MPS for faster performance.
|
||||
4. Fixed Colab public URL issue.
|
||||
|
||||
### 20240306 Update
|
||||
## 20240306
|
||||
|
||||
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) accelerated inference by 50% (tested on RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39) .
|
||||
2. No longer requires downloading the Chinese FunASR model first when using Faster Whisper non-Chinese ASR.
|
||||
@ -104,9 +106,9 @@
|
||||
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) enabled automatic CPU inference for Faster Whisper if no CUDA is available.
|
||||
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) modified `is_half` check to ensure proper CPU inference on Mac.
|
||||
|
||||
### 202403/202404/202405 Update
|
||||
## 202403/202404/202405
|
||||
|
||||
#### Minor Fixes:
|
||||
### Minor Fixes:
|
||||
|
||||
1. Fixed issues with the no-reference text mode.
|
||||
2. Optimized the Chinese and English text frontend.
|
||||
@ -115,27 +117,27 @@
|
||||
5. Added error prompts for unsupported languages during training data processing.
|
||||
6. Fixed the bug in Hubert extraction.
|
||||
|
||||
#### Major Fixes:
|
||||
### Major Fixes:
|
||||
|
||||
1. Fixed the issue of SoVITS training without freezing VQ (which could cause quality degradation).
|
||||
2. Added a quick inference branch.
|
||||
|
||||
### 20240610 Update
|
||||
## 20240610
|
||||
|
||||
#### Minor Fixes:
|
||||
### Minor Fixes:
|
||||
|
||||
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) improved the logic for pure punctuation and multi-punctuation text input.
|
||||
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) fixed CMD format for MDXNet de-reverb in UVR5, supporting paths with spaces.
|
||||
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) fixed progress bar logic for SoVITS training in `s2_train.py`.
|
||||
|
||||
#### Major Fixes:
|
||||
### Major Fixes:
|
||||
|
||||
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation.
|
||||
**Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.**
|
||||
|
||||
### 20240706 Update
|
||||
## 20240706
|
||||
|
||||
#### Minor Fixes:
|
||||
### Minor Fixes:
|
||||
|
||||
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) fixed default batch size decimal issue in CPU inference.
|
||||
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) fixed issues where denoising or ASR encountering exceptions would exit all pending audio files.
|
||||
@ -143,80 +145,189 @@
|
||||
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) fixed multi-process save logic for multi-GPU training.
|
||||
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) removed redundant `my_utils`.
|
||||
|
||||
#### Major Fixes:
|
||||
### Major Fixes:
|
||||
|
||||
6. The accelerated inference code from [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) has been validated and merged into the main branch, ensuring consistent inference effects with the base.
|
||||
It also supports accelerated inference in no-reference text mode.
|
||||
|
||||
**Future updates will continue to verify the consistency of changes in the `fast_inference` branch**.
|
||||
|
||||
### 20240727 Update
|
||||
## 20240727
|
||||
|
||||
#### Minor Fixes:
|
||||
### Minor Fixes:
|
||||
|
||||
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) cleaned up redundant i18n code.
|
||||
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) fixed issues where trailing slashes in user file paths caused command line errors.
|
||||
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) fixed the step calculation logic in GPT training.
|
||||
|
||||
#### Major Fixes:
|
||||
### Major Fixes:
|
||||
|
||||
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) supported speech rate adjustment for synthesis.
|
||||
Enabled freezing randomness while only adjusting the speech rate.
|
||||
|
||||
### 20240806 Update
|
||||
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356): Added support for the BS-RoFormer vocal accompaniment separation model.
|
||||
- Type: New Feature
|
||||
- Contributor: KamioRinn
|
||||
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351): Improved Chinese text frontend.
|
||||
- Type: New Feature
|
||||
- Contributor: KamioRinn
|
||||
|
||||
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) Added support for the BS RoFormer vocal accompaniment separation model. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) Enabled FP16 inference.
|
||||
2. Improved Chinese text frontend.
|
||||
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) added support for polyphonic characters (v2 only);
|
||||
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) added quantifier;
|
||||
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) supports arithmetic and basic math formulas;
|
||||
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) fixed mixed text errors.
|
||||
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) automatically filled in the paths when processing audio in the WebUI.
|
||||
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) optimized GPU recognition logic.
|
||||
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) added support for Cantonese ASR.
|
||||
6. Added support for GPT-SoVITS v2.
|
||||
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) optimized timing logic.
|
||||
## 202408 (V2 Version)
|
||||
|
||||
### 20240821 Update
|
||||
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355): Automatically fill in the paths when processing files in the WebUI.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c): Enabled FP16 inference support for BS-Roformer.
|
||||
- Type: Performance Optimization
|
||||
- Contributor: RVC-Boss
|
||||
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78): Optimized GPU recognition logic, added user-friendly logic to handle arbitrary GPU indices entered by users.
|
||||
- Type: Chore
|
||||
- Contributor: RVC-Boss
|
||||
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758): **Added GPT-SoVITS V2 model.**
|
||||
- Type: New Feature
|
||||
- Contributor: RVC-Boss
|
||||
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3): Added support for Cantonese ASR by using FunASR.
|
||||
- Type: New Feature
|
||||
- Contributor: RVC-Boss
|
||||
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388): Optimized UI and timing logic.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488): Optimized polyphonic character handling logic (V2 Only).
|
||||
- Type: Fix, New Feature
|
||||
- Contributor: KamioRinn, RVC-Boss
|
||||
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422): Fixed bug where only one reference audio could be uploaded; added dataset validation with warning popups for missing files.
|
||||
- Type: Fix, Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508): Upstream LangSegment library now supports optimizing numbers, phone numbers, dates, and times using SSML tags.
|
||||
- Type: New Feature
|
||||
- Contributor: juntaosun
|
||||
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503): Fixed and optimized API.
|
||||
- Type: Fix
|
||||
- Contributor: KamioRinn
|
||||
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490): Merged `fast_inference` branch into the main branch.
|
||||
- Type: Refactor
|
||||
- Contributor: ChasonJiang
|
||||
- 2024.08.21 **Officially released GPT-SoVITS V2 version.**
|
||||
|
||||
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) Merge the `fast_inference` branch into the main branch.
|
||||
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) Support for optimizing numbers, phone numbers, dates, and times using SSML tags.
|
||||
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) Fixed and optimized API.
|
||||
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Fixed the bug where only one reference audio could be uploaded for mixing, Added various dataset checks with warnings popping up if missing files.
|
||||
## 202502 (V3 Version)
|
||||
|
||||
### 20250211 Update
|
||||
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4): **Added GPT-SoVITS V3 model, which requires 14GB VRAM for fine-tuning.**
|
||||
- Type: New Feature (Refer to [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
|
||||
- Contributor: RVC-Boss
|
||||
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032): Updated multilingual project documentation.
|
||||
- Type: Documentation
|
||||
- Contributor: StaryLan
|
||||
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033): Updated Japanese documentation.
|
||||
- Type: Documentation
|
||||
- Contributor: Fyphen
|
||||
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010): Optimized attention calculation logic.
|
||||
- Type: Performance Optimization
|
||||
- Contributor: wzy3650
|
||||
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040): Added gradient checkpointing support for fine-tuning, requiring 12GB VRAM.
|
||||
- Type: New Feature
|
||||
- Contributor: Kakaru Hayate
|
||||
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073): Switched to a new language segmentation tool, improved multilingual mixed-text splitting strategy, and optimized number and English processing logic.
|
||||
- Type: New Feature
|
||||
- Contributor: KamioRinn
|
||||
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e): **GPT-SoVITS V3 model now supports LoRA training, requiring 8GB GPU Memory for fine-tuning.**
|
||||
- Type: New Feature
|
||||
- Contributor: RVC-Boss
|
||||
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078): Added Mel Band Roformer model support for vocal and Instrument separation.
|
||||
- Type: New Feature
|
||||
- Contributor: Sucial
|
||||
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114): Fixed MeCab error under Chinese paths (specifically for Japanese/Korean or multilingual text splitting).
|
||||
- Type: Fix
|
||||
- Contributor: KamioRinn
|
||||
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc): **Added 24kHz to 48kHz audio super-resolution models** to alleviate the "muffled" audio issue when generating 24K audio with V3 model.
|
||||
- Type: New Feature
|
||||
- Contributor: RVC-Boss
|
||||
- Related: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
|
||||
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123): Updated multilingual project documentation.
|
||||
- Type: Documentation
|
||||
- Contributor: StaryLan
|
||||
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122): Applied rule-based detection for short CJK characters when the model cannot identify them.
|
||||
- Type: Fix
|
||||
- Contributor: KamioRinn
|
||||
- Related: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
|
||||
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5): Added a speech rate parameter to control synthesis speed.
|
||||
- Type: Fix
|
||||
- Contributor: RVC-Boss
|
||||
- 2025.02.28 **Officially released GPT-SoVITS V3**.
|
||||
|
||||
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) Added the GPT-SoVITS v3 model; fine-tuning SoVITS v3 requires 14GB of GPU memory.
|
||||
## 202503
|
||||
|
||||
### 20250212 Update
|
||||
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236): Fixed issues caused by incorrect versions of dependencies.
|
||||
- Type: Fix
|
||||
- Contributor: XXXXRT666
|
||||
- Related:
|
||||
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
|
||||
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
|
||||
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
|
||||
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
|
||||
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241): **Enabled parallel inference for SoVITS v3.**
|
||||
- Type: New Feature
|
||||
- Contributor: ChasonJiang
|
||||
|
||||
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) Added gradient checkpointing for fine-tuning SoVITS v3; requires 12GB of GPU memory.
|
||||
- Fixed other minor bugs.
|
||||
|
||||
### 20250214 Update
|
||||
- Integrated package fixes for ONNX runtime GPU inference support:
|
||||
- Type: Fix
|
||||
- Details:
|
||||
- ONNX models within G2PW switched from CPU to GPU inference, significantly reducing the CPU bottleneck;
|
||||
- foxjoy dereverberation model now supports GPU inference.
|
||||
|
||||
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Optimized multilingual mixed-text segmentation strategy **A**.
|
||||
- Added `split-lang` as a language segmentation tool to improve segmentation of multilingual mixed text.
|
||||
## 202504 (V4 Version)
|
||||
|
||||
### 20250217 Update
|
||||
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f): Unlocked SoVITS v3 parallel inference; fixed asynchronous model loading logic.
|
||||
- Type: Fix
|
||||
- Contributor: RVC-Boss
|
||||
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255): Code formatting using Ruff; updated G2PW link.
|
||||
- Type: Style
|
||||
- Contributor: XXXXRT666
|
||||
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290): Cleaned up documentation; added Python 3.11 support; updated installers.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300): Updated Colab, installation files, and model downloads.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf): **Added GPT-SoVITS V4 model.**
|
||||
- Type: New Feature
|
||||
- Contributor: RVC-Boss
|
||||
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307): Enabled parallel inference for V4.
|
||||
- Type: New Feature
|
||||
- Contributor: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309): Fixed model version parameter passing.
|
||||
- Type: Fix
|
||||
- Contributor: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310): Fixed Numpy and Numba version mismatch issue; updated librosa version.
|
||||
- Type: Fix
|
||||
- Contributor: RVC-Boss, XXXXRT666
|
||||
- Related: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
|
||||
- **2025.04.22 Officially released GPT-SoVITS V4**.
|
||||
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311): Updated Gradio parameters.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322): Improved Colab/Kaggle notebook scripts.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
|
||||
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Optimized the logic for handling numbers and English in the text.
|
||||
## 202505
|
||||
|
||||
### 20250218 Update
|
||||
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351): Improved Docker and Windows auto-build scripts; added pre-commit formatting.
|
||||
- Type: Chore
|
||||
- Contributor: XXXXRT666
|
||||
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408): Optimized multilingual text splitting and recognition logic.
|
||||
- Type: Fix
|
||||
- Contributor: KamioRinn
|
||||
- Related: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
|
||||
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377): Implemented caching strategies to improve SoVITS V3/V4 inference speed by 10%.
|
||||
- Type: Performance Optimization
|
||||
- Contributor: Kakaru Hayate
|
||||
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba): Updated the annotation interface with a reminder: click Submit Text after completing each page, or changes will not be saved.
|
||||
- Type: Fix
|
||||
- Contributor: RVC-Boss
|
||||
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc): Fixed UVR5 and ONNX dereverberation model errors when FFmpeg encodes MP3/M4A files whose original paths contain spaces.
|
||||
- Type: Fix
|
||||
- Contributor: RVC-Boss
|
||||
|
||||
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Optimized multilingual mixed-text segmentation strategy **B**.
|
||||
|
||||
### 20250223 Update
|
||||
|
||||
1. LoRA training is supported for fine-tuning with SoVITS V3. It requires 8GB GPU Memory and the results are better than full parameter fine-tuning.
|
||||
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Added Mel Band RoFormer model for Vocal & Instrument Separation.
|
||||
|
||||
### 20250226 Update
|
||||
|
||||
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Fixed issues caused by non-English directories on Windows.
|
||||
- Using `langsegmenter` for Korean.
|
||||
2. [PR 2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Fixed issues caused by non-English directories on Windows.
|
||||
- Using `langsegmenter` for Korean/Japanese.
|
||||
|
||||
### 20250227 Update
|
||||
|
||||
- Added 24K to 48K audio super-resolution models to alleviate the muffled issue when generating 24K audio with the V3 model, as reported in [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085) and [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117).
|
||||
**Preview: Major optimization update based on V2 version coming after the Dragon Boat Festival!**
|
@ -1,4 +1,6 @@
|
||||
### 20240121 更新
|
||||
# 更新履歴
|
||||
|
||||
## 20240121
|
||||
|
||||
1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます.
|
||||
2. WebUI に英語システムの英語翻訳を追加しました.
|
||||
@ -7,20 +9,20 @@
|
||||
5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します.
|
||||
6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました.
|
||||
|
||||
### 20240122 更新
|
||||
## 20240122
|
||||
|
||||
1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました.
|
||||
2. 英語-日本語学習がスムーズに進む QA を完了しました. (ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります)
|
||||
3. オーディオパスをチェックします.間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます.これは ffmpeg モジュールのエラーではありません.
|
||||
|
||||
### 20240123 更新
|
||||
## 20240123
|
||||
|
||||
1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました.
|
||||
2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました.
|
||||
3. モデルファイルのソートロジックを最適化しました.
|
||||
4. 中国語の分析に `jieba_fast` を `jieba` に置き換えました.
|
||||
|
||||
### 20240126 更新
|
||||
## 20240126
|
||||
|
||||
1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします.
|
||||
2. 出力で選択的な分割モードをサポートします.
|
||||
@ -30,7 +32,7 @@
|
||||
6. MacOS での学習と推論をサポートします.
|
||||
7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します.
|
||||
|
||||
### 20240128 更新
|
||||
## 20240128
|
||||
|
||||
1. 数字を漢字で読む問題を修正しました.
|
||||
2. 文章の先頭の一部の単語が欠落する問題を修正しました.
|
||||
@ -38,29 +40,29 @@
|
||||
4. GPT 学習時の ckpt が保存されない問題を修正しました.
|
||||
5. Dockerfile のモデルダウンロードプロセスを改善しました.
|
||||
|
||||
### 20240129 更新
|
||||
## 20240129
|
||||
|
||||
1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました.
|
||||
2. Colab でも使用可能なバージョンをテストして更新しました.
|
||||
3. ModelScope FunASR リポジトリの古いバージョンで git クローンを行う際のインターフェース不整合エラーの問題を修正しました.
|
||||
|
||||
### 20240130 更新
|
||||
## 20240130
|
||||
|
||||
1. パスと関連する文字列を解析して、二重引用符を自動的に削除します.また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません.
|
||||
2. 中国語と英語、日本語と英語の混合出力をサポートします.
|
||||
3. 出力で選択的な分割モードをサポートします.
|
||||
|
||||
### 20240201 更新
|
||||
## 20240201
|
||||
|
||||
1. UVR5 形式の読み取りエラーによる分離失敗を修正しました.
|
||||
2. 中国語・日本語・英語の混合テキストに対する自動分割と言語認識をサポートしました.
|
||||
|
||||
### 20240202 更新
|
||||
## 20240202
|
||||
|
||||
1. ASRパスが `/` で終わることによるファイル名保存エラーの問題を修正しました.
|
||||
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) で PaddleSpeech の Normalizer を導入し、"xx.xx%" (パーセント記号) の読み取りや"元/吨"が"元吨"ではなく"元每吨"と読まれる問題、アンダースコアエラーを修正しました.
|
||||
|
||||
### 20240207 更新
|
||||
## 20240207
|
||||
|
||||
1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) で報告された中国語推論品質の低下を引き起こした言語パラメータの混乱を修正しました.
|
||||
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) で UVR5 を librosa のより高いバージョンに適応させました.
|
||||
@ -70,32 +72,32 @@
|
||||
6. データセット準備中にルートディレクトリが空白の場合、`.list` フルパスの自動読み取りをサポートしました.
|
||||
7. 日本語と英語のために Faster Whisper ASR を統合しました.
|
||||
|
||||
### 20240208 更新
|
||||
## 20240208
|
||||
|
||||
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) で、Windows 10 1909 および [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (繁体字中国語システム言語) での GPT トレーニングのハングを修正する試みを行いました.
|
||||
|
||||
### 20240212 更新
|
||||
## 20240212
|
||||
|
||||
1. Faster Whisper と FunASR のロジックを最適化し、Faster Whisper をミラーダウンロードに切り替えて Hugging Face の接続問題を回避しました.
|
||||
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) で、GPT の繰り返しと文字欠落を軽減するために、トレーニング中に負のサンプルを構築する実験的なDPO Lossトレーニングオプションを有効にし、いくつかの推論パラメータを推論WebUIで利用可能にしました.
|
||||
|
||||
### 20240214 更新
|
||||
## 20240214
|
||||
|
||||
1. トレーニングで中国語の実験名をサポート (以前はエラーが発生していました).
|
||||
2. DPOトレーニングを必須ではなくオプション機能に変更.選択された場合、バッチサイズは自動的に半分になります.推論 WebUI で新しいパラメータが渡されない問題を修正しました.
|
||||
|
||||
### 20240216 更新
|
||||
## 20240216
|
||||
|
||||
1. 参照テキストなしでの入力をサポート.
|
||||
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) で報告された中国語フロントエンドのバグを修正しました.
|
||||
|
||||
### 20240221 更新
|
||||
## 20240221
|
||||
|
||||
1. データ処理中のノイズ低減オプションを追加 (ノイズ低減は16kHzサンプリングレートのみを残します;背景ノイズが大きい場合にのみ使用してください).
|
||||
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) で中国語と日本語のフロントエンド処理を最適化しました.
|
||||
3. Mac CPU 推論を MPS ではなく CPU を使用するように切り替え、パフォーマンスを向上させました.
|
||||
4. Colab のパブリック URL の問題を修正しました.
|
||||
### 20240306 更新
|
||||
## 20240306
|
||||
|
||||
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) で推論速度を50%向上させました (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 でテスト).
|
||||
2. Faster Whisper非中国語ASRを使用する際、最初に中国語FunASRモデルをダウンロードする必要がなくなりました.
|
||||
@ -103,9 +105,9 @@
|
||||
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) で、CUDA が利用できない場合に Faster Whisper の自動 CPU 推論を有効にしました.
|
||||
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) で、Mac での適切なCPU推論を確保するために `is_half` チェックを修正しました.
|
||||
|
||||
### 202403/202404/202405 更新
|
||||
## 202403/202404/202405
|
||||
|
||||
#### マイナー修正:
|
||||
### マイナー修正:
|
||||
|
||||
1. 参照テキストなしモードの問題を修正しました.
|
||||
2. 中国語と英語のテキストフロントエンドを最適化しました.
|
||||
@ -114,27 +116,27 @@
|
||||
5. トレーニングデータ処理中のサポートされていない言語に対するエラープロンプトを追加しました.
|
||||
6. Hubert 抽出のバグを修正しました.
|
||||
|
||||
#### メジャー修正:
|
||||
### メジャー修正:
|
||||
|
||||
1. SoVITS トレーニングで VQ を凍結せずに品質低下を引き起こす問題を修正しました.
|
||||
2. クイック推論ブランチを追加しました.
|
||||
|
||||
### 20240610 更新
|
||||
## 20240610
|
||||
|
||||
#### マイナー修正:
|
||||
### マイナー修正:
|
||||
|
||||
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)で、純粋な句読点および複数の句読点を含むテキスト入力のロジックを改善しました.
|
||||
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)で、UVR5 の MDXNet デリバブをサポートする CMD フォーマットを修正し、スペースを含むパスをサポートしました.
|
||||
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)で、`s2_train.py` の SoVITS トレーニングのプログレスバーロジックを修正しました.
|
||||
|
||||
#### メジャー修正:
|
||||
### メジャー修正:
|
||||
|
||||
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) で、WebUI の GPT ファインチューニングが中国語入力テキストの BERT 特徴を読み取らず、推論との不一致や品質低下の可能性を修正しました.
|
||||
**注意: 以前に大量のデータでファインチューニングを行った場合、品質向上のためにモデルを再調整することをお勧めします.**
|
||||
|
||||
### 20240706 更新
|
||||
## 20240706
|
||||
|
||||
#### マイナー修正:
|
||||
### マイナー修正:
|
||||
|
||||
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) で、CPU 推論のデフォルトバッチサイズの小数点問題を修正しました.
|
||||
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) で、ノイズ除去またはASRが例外に遭遇した場合に、すべての保留中のオーディオファイルが終了する問題を修正しました.
|
||||
@ -142,80 +144,189 @@
|
||||
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) で、マルチGPUトレーニングのマルチプロセス保存ロジックを修正しました.
|
||||
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) で、不要な `my_utils` を削除しました.
|
||||
|
||||
#### メジャー修正:
|
||||
### メジャー修正:
|
||||
|
||||
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) の加速推論コードが検証され、メインブランチにマージされ、ベースとの推論効果の一貫性が確保されました.
|
||||
また、参照テキストなしモードでの加速推論もサポートしています.
|
||||
|
||||
**今後の更新では、`fast_inference`ブランチの変更の一貫性を継続的に検証します**.
|
||||
|
||||
### 20240727 更新
|
||||
## 20240727
|
||||
|
||||
#### マイナー修正:
|
||||
### マイナー修正:
|
||||
|
||||
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) で、不要な i18n コードをクリーンアップしました.
|
||||
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) で、ユーザーファイルパスの末尾のスラッシュがコマンドラインエラーを引き起こす問題を修正しました.
|
||||
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) で、GPT トレーニングのステップ計算ロジックを修正しました.
|
||||
|
||||
#### メジャー修正:
|
||||
### メジャー修正:
|
||||
|
||||
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) で、合成のスピーチレート調整をサポートしました.
|
||||
スピーチレートのみを調整しながらランダム性を固定できるようになりました.
|
||||
|
||||
### 20240806 更新
|
||||
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356): BS-RoFormerボーカル・伴奏分離モデルのサポートを追加。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: KamioRinn
|
||||
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351): 中国語テキストフロントエンドの改善。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: KamioRinn
|
||||
|
||||
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306)、[PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer ボーカル・伴奏分離モデルのサポートを追加しました.[Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 推論を有効にしました.
|
||||
2. 中国語テキストフロントエンドを改善しました.
|
||||
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 多音字のサポートを追加 (v2 のみ);
|
||||
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 量詞を追加;
|
||||
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 四則演算と基本数式のサポート;
|
||||
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 混合テキストエラーを修正.
|
||||
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) WebUIでオーディオ処理時にパスを自動入力しました.
|
||||
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 認識ロジックを最適化しました.
|
||||
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました.
|
||||
6. GPT-SoVITS v2 のサポートを追加しました.
|
||||
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました.
|
||||
## 202408 (V2 バージョン)
|
||||
|
||||
### 20240821 更新
|
||||
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355): WebUIでファイル処理時にパスを自動入力するように変更。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c): BS-RoformerのFP16推論サポートを有効化。
|
||||
- タイプ: パフォーマンス最適化
|
||||
- 貢献者: RVC-Boss
|
||||
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78): GPU認識ロジックを最適化、ユーザーが入力した任意のGPUインデックスを処理するユーザーフレンドリーなロジックを追加。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: RVC-Boss
|
||||
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758): **GPT-SoVITS V2モデルを追加。**
|
||||
- タイプ: 新機能
|
||||
- 貢献者: RVC-Boss
|
||||
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3): FunASRを使用して広東語ASRをサポート。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: RVC-Boss
|
||||
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388): UIとタイミングロジックを最適化。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488): 多音字処理ロジックを最適化(V2のみ)。
|
||||
- タイプ: 修正、新機能
|
||||
- 貢献者: KamioRinn、RVC-Boss
|
||||
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422): 参照音声が1つしかアップロードできないバグを修正。欠損ファイルがある場合に警告ポップアップを表示するデータセット検証を追加。
|
||||
- タイプ: 修正、雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508): 上流のLangSegmentライブラリがSSMLタグを使用した数字、電話番号、日付、時刻の最適化をサポート。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: juntaosun
|
||||
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503): APIを修正・最適化。
|
||||
- タイプ: 修正
|
||||
- 貢献者: KamioRinn
|
||||
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490): `fast_inference`ブランチをメインブランチにマージ。
|
||||
- タイプ: リファクタリング
|
||||
- 貢献者: ChasonJiang
|
||||
- 2024.08.21 **GPT-SoVITS V2バージョンを正式リリース。**
|
||||
|
||||
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました.
|
||||
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました.
|
||||
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました.
|
||||
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました.
|
||||
## 202502 (V3 バージョン)
|
||||
|
||||
### 20250211 更新
|
||||
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4): **GPT-SoVITS V3モデルを追加。ファインチューニングには14GBのVRAMが必要。**
|
||||
- タイプ: 新機能([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))参照)
|
||||
- 貢献者: RVC-Boss
|
||||
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032): 多言語プロジェクトドキュメントを更新。
|
||||
- タイプ: ドキュメント
|
||||
- 貢献者: StaryLan
|
||||
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033): 日本語ドキュメントを更新。
|
||||
- タイプ: ドキュメント
|
||||
- 貢献者: Fyphen
|
||||
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010): アテンション計算ロジックを最適化。
|
||||
- タイプ: パフォーマンス最適化
|
||||
- 貢献者: wzy3650
|
||||
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040): ファインチューニング用に勾配チェックポイントサポートを追加。12GB VRAMが必要。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: Kakaru Hayate
|
||||
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073): 新しい言語セグメンテーションツールに切り替え、多言語混合テキストの分割戦略を改善。数字と英語の処理ロジックを最適化。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: KamioRinn
|
||||
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e): **GPT-SoVITS V3モデルがLoRAトレーニングをサポート。ファインチューニングに8GB GPUメモリが必要。**
|
||||
- タイプ: 新機能
|
||||
- 貢献者: RVC-Boss
|
||||
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078): ボーカルと楽器分離のためのMel Band Roformerモデルサポートを追加。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: Sucial
|
||||
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114): 中国語パス下でのMeCabエラーを修正(日本語/韓国語または多言語テキスト分割用)。
|
||||
- タイプ: 修正
|
||||
- 貢献者: KamioRinn
|
||||
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc): **24kHzから48kHzへのオーディオ超解像モデルを追加**。V3モデルで24Kオーディオを生成する際の「こもった」オーディオ問題を緩和。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: RVC-Boss
|
||||
- 関連: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
|
||||
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123): 多言語プロジェクトドキュメントを更新。
|
||||
- タイプ: ドキュメント
|
||||
- 貢献者: StaryLan
|
||||
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122): モデルが識別できない短いCJK文字に対してルールベースの検出を適用。
|
||||
- タイプ: 修正
|
||||
- 貢献者: KamioRinn
|
||||
- 関連: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
|
||||
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5): 合成速度を制御するための発話速度パラメータを追加。
|
||||
- タイプ: 修正
|
||||
- 貢献者: RVC-Boss
|
||||
- 2025.02.28 **GPT-SoVITS V3を正式リリース**。
|
||||
|
||||
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました.SoVITS v3のファインチューニングには14GBのGPUメモリが必要です.
|
||||
## 202503
|
||||
|
||||
### 20250212 更新
|
||||
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236): 依存関係の不正なバージョンによる問題を修正。
|
||||
- タイプ: 修正
|
||||
- 貢献者: XXXXRT666
|
||||
- 関連:
|
||||
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
|
||||
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
|
||||
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
|
||||
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
|
||||
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241): **SoVITS v3の並列推論を有効化。**
|
||||
- タイプ: 新機能
|
||||
- 貢献者: ChasonJiang
|
||||
|
||||
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3のファインチューニングにグラデーションチェックポイントを追加、12GBのGPUメモリが必要です.
|
||||
- その他の軽微なバグを修正。
|
||||
|
||||
### 20250214 更新
|
||||
- ONNXランタイムGPU推論サポートのための統合パッケージ修正:
|
||||
- タイプ: 修正
|
||||
- 詳細:
|
||||
- G2PW内のONNXモデルをCPUからGPU推論に切り替え、CPUボトルネックを大幅に削減;
|
||||
- foxjoy dereverberationモデルがGPU推論をサポート。
|
||||
|
||||
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 多言語混合テキスト分割戦略の最適化 **A**.
|
||||
- `split-lang`を言語分割ツールとして追加し、多言語混合テキストの分割能力を向上させました.
|
||||
## 202504 (V4 バージョン)
|
||||
|
||||
### 20250217 更新
|
||||
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f): SoVITS v3並列推論のロックを解除。非同期モデル読み込みロジックを修正。
|
||||
- タイプ: 修正
|
||||
- 貢献者: RVC-Boss
|
||||
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255): Ruffを使用したコードフォーマット。G2PWリンクを更新。
|
||||
- タイプ: スタイル
|
||||
- 貢献者: XXXXRT666
|
||||
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290): ドキュメントを整理。Python 3.11サポートを追加。インストーラーを更新。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300): Colab、インストールファイル、モデルダウンロードを更新。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf): **GPT-SoVITS V4モデルを追加。**
|
||||
- タイプ: 新機能
|
||||
- 貢献者: RVC-Boss
|
||||
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307): V4の並列推論を有効化。
|
||||
- タイプ: 新機能
|
||||
- 貢献者: RVC-Boss、ChasonJiang
|
||||
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309): モデルバージョンパラメータの受け渡しを修正。
|
||||
- タイプ: 修正
|
||||
- 貢献者: RVC-Boss、ChasonJiang
|
||||
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310): NumpyとNumbaのバージョン不一致問題を修正。librosaバージョンを更新。
|
||||
- タイプ: 修正
|
||||
- 貢献者: RVC-Boss、XXXXRT666
|
||||
- 関連: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
|
||||
- **2025.04.22 GPT-SoVITS V4を正式リリース**。
|
||||
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311): Gradioパラメータを更新。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322): Colab/Kaggleノートブックスクリプトを改善。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
|
||||
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) テキスト内の数字と英語の処理ロジックを最適化.
|
||||
## 202505
|
||||
|
||||
### 20250218 更新
|
||||
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351): DockerとWindows自動ビルドスクリプトを改善。pre-commitフォーマットを追加。
|
||||
- タイプ: 雑務
|
||||
- 貢献者: XXXXRT666
|
||||
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408): 多言語テキスト分割と認識ロジックを最適化。
|
||||
- タイプ: 修正
|
||||
- 貢献者: KamioRinn
|
||||
- 関連: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
|
||||
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377): キャッシュ戦略を実装し、SoVITS V3/V4推論速度を10%向上。
|
||||
- タイプ: パフォーマンス最適化
|
||||
- 貢献者: Kakaru Hayate
|
||||
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba): アノテーションインターフェースを更新し、以下の注意事項を追加しました:各ページの編集が終わったら必ず「Submit Text」をクリックしてください。さもなくば変更は保存されません。
|
||||
- タイプ: 修正
|
||||
- 貢献者: RVC-Boss
|
||||
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc): UVR5およびONNX dereverberationモデルのエラーを修正。FFmpegが元のパスにスペースを含むMP3/M4Aファイルをエンコードする場合の問題を解決。
|
||||
- タイプ: 修正
|
||||
- 貢献者: RVC-Boss
|
||||
|
||||
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 多言語混合テキスト分割戦略の最適化 **B**.
|
||||
|
||||
### 20250223 更新
|
||||
|
||||
1. LoRAトレーニングがSoVITS V3のファインチューニングに対応しました.8GBのGPUメモリが必要で、結果はフルパラメータファインチューニングより優れています.
|
||||
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) ボーカルと楽器分離のためにMel Band RoFormerモデルを追加しました.
|
||||
|
||||
### 20250226 更新
|
||||
|
||||
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windowsでの非英語ディレクトリによる問題を修正しました.
|
||||
- `langsegmenter`を使用して韓国語の問題を修正.
|
||||
2. [PR 2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windowsでの非英語ディレクトリによる問題を修正しました.
|
||||
- `langsegmenter`を使用して韓国語/日本語の問題を修正.
|
||||
|
||||
### 20250227 更新
|
||||
|
||||
- V3モデルで24Kオーディオを生成する際に発生するこもった音の問題を緩和するために、24Kから48Kのオーディオ超解像モデルを追加しました.[Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085)、[Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)で報告されています.
|
||||
**プレビュー: 端午節後にV2バージョンを基にした大規模な最適化アップデートを予定!**
|
@ -1,4 +1,6 @@
|
||||
### 20240121 업데이트
|
||||
# 변경 내역
|
||||
|
||||
## 20240121
|
||||
|
||||
1. `config`에 `is_share`를 추가했습니다. Colab과 같은 시나리오에서는 이 값을 `True`로 설정하여 WebUI를 공개 네트워크에 매핑할 수 있습니다.
|
||||
2. WebUI에 영어 시스템 번역 지원을 추가했습니다.
|
||||
@ -7,20 +9,20 @@
|
||||
5. `TEMP` 폴더의 캐시된 오디오 파일 및 기타 파일을 정리했습니다.
|
||||
6. 참조 오디오의 끝이 포함된 합성 오디오 문제를 크게 줄였습니다.
|
||||
|
||||
### 20240122 업데이트
|
||||
## 20240122
|
||||
|
||||
1. 지나치게 짧은 출력 파일로 인해 참조 오디오가 반복되는 문제를 수정했습니다.
|
||||
2. 영어 및 일본어 훈련의 네이티브 지원을 테스트했습니다 (일본어 훈련 시 루트 디렉토리에 비영어 특수 문자가 없어야 합니다).
|
||||
3. 오디오 경로 확인을 개선했습니다. 잘못된 입력 경로에서 읽으려는 시도가 있을 경우, ffmpeg 오류 대신 경로가 존재하지 않는다고 보고합니다.
|
||||
|
||||
### 20240123 업데이트
|
||||
## 20240123
|
||||
|
||||
1. Hubert 추출로 인해 NaN 오류가 발생하여 SoVITS/GPT 훈련에서 ZeroDivisionError가 발생하는 문제를 해결했습니다.
|
||||
2. 추론 WebUI에서 빠른 모델 전환 지원을 추가했습니다.
|
||||
3. 모델 파일 정렬 로직을 최적화했습니다.
|
||||
4. 중국어 단어 분할을 위해 `jieba`를 `jieba_fast`로 교체했습니다.
|
||||
|
||||
### 20240126 업데이트
|
||||
## 20240126
|
||||
|
||||
1. 중국어-영어 혼합 및 일본어-영어 혼합 출력 텍스트를 지원합니다.
|
||||
2. 출력에 대한 선택적 분할 모드를 추가했습니다.
|
||||
@ -30,7 +32,7 @@
|
||||
6. Mac에서 훈련 및 추론을 지원합니다.
|
||||
7. 절반 정밀도를 지원하지 않는 GPU에 대해 자동으로 단정밀도를 강제하며, CPU 추론 시 단정밀도를 적용합니다.
|
||||
|
||||
### 20240128 업데이트
|
||||
## 20240128
|
||||
|
||||
1. 숫자의 발음이 중국어 문자로 변환되는 문제를 수정했습니다.
|
||||
2. 문장 시작 부분에서 몇 개의 문자가 누락되는 문제를 수정했습니다.
|
||||
@ -38,29 +40,29 @@
|
||||
4. GPT 훈련 시 체크포인트가 저장되지 않는 문제를 수정했습니다.
|
||||
5. Dockerfile 에서 모델 다운로드 프로세스를 완료했습니다.
|
||||
|
||||
### 20240129 업데이트
|
||||
## 20240129
|
||||
|
||||
1. 절반 정밀도 훈련에 문제가 있는 16 시리즈와 같은 GPU의 훈련 구성을 단정밀도로 변경했습니다.
|
||||
2. 사용 가능한 Colab 버전을 테스트하고 업데이트했습니다.
|
||||
3. 이전 버전의 FunASR 로 인해 인터페이스 정렬 오류가 발생하는 ModelScope FunASR 저장소의 git 클로닝 문제를 수정했습니다.
|
||||
|
||||
### 20240130 업데이트
|
||||
## 20240130
|
||||
|
||||
1. 모든 경로 관련 항목에서 이중 따옴표를 자동으로 제거하여 초보자가 이중 따옴표가 포함된 경로를 복사하는 오류를 방지했습니다.
|
||||
2. 중국어 및 영어 문장 부호 분할 문제를 수정하고 문장 시작과 끝에 부호를 추가했습니다.
|
||||
3. 부호에 의한 분할을 추가했습니다.
|
||||
|
||||
### 20240201 업데이트
|
||||
## 20240201
|
||||
|
||||
1. 분리 실패를 일으킨 UVR5 형식 읽기 오류를 수정했습니다.
|
||||
2. 혼합된 중국어-일본어-영어 텍스트에 대한 자동 분할 및 언어 인식을 지원합니다.
|
||||
|
||||
### 20240202 업데이트
|
||||
## 20240202
|
||||
|
||||
1. `/` 로 끝나는 ASR 경로가 파일 이름 저장 시 오류를 발생시키는 문제를 수정했습니다.
|
||||
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) 에서는 PaddleSpeech 의 Normalizer 를 도입하여 "xx.xx%" (백분율 기호)와 "元/吨"이 "元吨"으로 읽히는 문제를 "元每吨"으로 수정하고, 밑줄 오류를 수정했습니다.
|
||||
|
||||
### 20240207 업데이트
|
||||
## 20240207
|
||||
|
||||
1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) 에서 보고된 중국어 추론 품질 저하를 일으킨 언어 매개변수 혼동을 수정했습니다.
|
||||
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) 에서는 UVR5 를 높은 버전의 librosa에 맞게 조정했습니다.
|
||||
@ -70,33 +72,33 @@
|
||||
6. 데이터셋 준비 시 루트 디렉토리를 비워두면 `.list` 전체 경로를 자동으로 읽도록 지원합니다.
|
||||
7. 일본어와 영어에 대한 Faster Whisper ASR을 통합했습니다.
|
||||
|
||||
### 20240208 업데이트
|
||||
## 20240208
|
||||
|
||||
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)에서는 Windows 10 1909와 [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (전통 중국어 시스템 언어)에서 GPT 훈련 멈춤 문제를 수정하려고 했습니다.
|
||||
|
||||
### 20240212 업데이트
|
||||
## 20240212
|
||||
|
||||
1. Faster Whisper와 FunASR의 로직을 최적화하고, Faster Whisper를 미러 다운로드로 전환하여 Hugging Face 연결 문제를 피했습니다.
|
||||
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)은 DPO Loss 실험적 훈련 옵션을 활성화하여 GPT의 반복 및 문자 누락 문제를 완화하고, 훈련 중 부정 샘플을 구성하며 여러 추론 매개변수를 추론 WebUI에서 사용할 수 있게 했습니다.
|
||||
|
||||
### 20240214 업데이트
|
||||
## 20240214
|
||||
|
||||
1. 훈련 시 중국어 실험 이름을 지원합니다 (이전에는 오류가 발생했습니다).
|
||||
2. DPO 훈련을 필수 기능 대신 선택적 기능으로 변경했습니다. 선택 시, 배치 크기가 자동으로 절반으로 줄어듭니다. 추론 WebUI에서 새로운 매개변수가 전달되지 않는 문제를 수정했습니다.
|
||||
|
||||
### 20240216 업데이트
|
||||
## 20240216
|
||||
|
||||
1. 참조 텍스트 없이 입력을 지원합니다.
|
||||
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)에서 보고된 중국어 프론트엔드의 버그를 수정했습니다.
|
||||
|
||||
### 20240221 업데이트
|
||||
## 20240221
|
||||
|
||||
1. 데이터 처리 중 노이즈 감소 옵션을 추가했습니다 (노이즈 감소는 16kHz 샘플링 비율만 남깁니다; 배경 노이즈가 심한 경우에만 사용하십시오).
|
||||
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) 중국어 및 일본어 프론트엔드 처리를 최적화했습니다.
|
||||
3. Mac CPU 추론을 MPS 대신 CPU를 사용하도록 전환하여 성능을 향상시켰습니다.
|
||||
4. Colab 공개 URL 문제를 수정했습니다.
|
||||
|
||||
### 20240306 업데이트
|
||||
## 20240306
|
||||
|
||||
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)는 추론 속도를 50% 가속화했습니다 (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39에서 테스트됨).
|
||||
2. Faster Whisper의 비중국어 ASR을 사용할 때 중국어 FunASR 모델을 먼저 다운로드할 필요가 없습니다.
|
||||
@ -104,9 +106,9 @@
|
||||
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)는 CUDA가 없는 경우 Faster Whisper의 자동 CPU 추론을 가능하게 했습니다.
|
||||
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)은 Mac에서 올바른 CPU 추론을 보장하기 위해 `is_half` 체크를 수정했습니다.
|
||||
|
||||
### 202403/202404/202405 업데이트
|
||||
## 202403/202404/202405
|
||||
|
||||
#### 사소한 수정:
|
||||
### 사소한 수정:
|
||||
|
||||
1. 참조 텍스트 없는 모드의 문제를 수정했습니다.
|
||||
2. 중국어 및 영어 텍스트 프론트엔드를 최적화했습니다.
|
||||
@ -115,27 +117,27 @@
|
||||
5. 훈련 데이터 처리 중 지원되지 않는 언어에 대한 오류 프롬프트를 추가했습니다.
|
||||
6. Hubert 추출의 버그를 수정했습니다.
|
||||
|
||||
#### 주요 수정:
|
||||
### 주요 수정:
|
||||
|
||||
1. VQ를 고정하지 않고 SoVITS 훈련의 문제를 수정했습니다(품질 저하를 일으킬 수 있음).
|
||||
2. 빠른 추론 분기를 추가했습니다.
|
||||
|
||||
### 20240610 업데이트
|
||||
## 20240610
|
||||
|
||||
#### 사소한 수정:
|
||||
### 사소한 수정:
|
||||
|
||||
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) 순수 구두점 및 다중 구두점 텍스트 입력 로직을 개선했습니다.
|
||||
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5에서 MDXNet 디러버브를 위한 CMD 형식을 수정하고 공백이 있는 경로를 지원했습니다.
|
||||
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py`에서 SoVITS 훈련을 위한 진행률 표시줄 로직을 수정했습니다.
|
||||
|
||||
#### 주요 수정:
|
||||
### 주요 수정:
|
||||
|
||||
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI의 GPT 미세 조정이 중국어 입력 텍스트의 BERT 기능을 읽지 않아 추론과 불일치 및 잠재적 품질 저하를 일으키는 문제를 수정했습니다.
|
||||
**주의: 이전에 많은 양의 데이터로 미세 조정한 경우 품질을 향상시키기 위해 모델을 다시 조정하는 것이 좋습니다.**
|
||||
|
||||
### 20240706 업데이트
|
||||
## 20240706
|
||||
|
||||
#### 사소한 수정:
|
||||
### 사소한 수정:
|
||||
|
||||
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU 추론에서 기본 배치 크기 소수점 문제를 수정했습니다.
|
||||
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) 노이즈 제거 또는 ASR이 예외를 만나면 모든 보류 중인 오디오 파일이 종료되는 문제를 수정했습니다.
|
||||
@ -143,80 +145,189 @@
|
||||
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) 다중 GPU 훈련을 위한 다중 프로세스 저장 로직을 수정했습니다.
|
||||
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) 불필요한 `my_utils`를 제거했습니다.
|
||||
|
||||
#### 주요 수정:
|
||||
### 주요 수정:
|
||||
|
||||
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)의 가속 추론 코드가 검증되어 메인 브랜치에 병합되었으며, 기본과 일관된 추론 효과를 보장합니다.
|
||||
또한 참조 텍스트 없는 모드에서 가속 추론을 지원합니다.
|
||||
|
||||
**향후 업데이트에서는 `fast_inference` 브랜치의 변경 사항의 일관성을 계속 검증할 것입니다**.
|
||||
|
||||
### 20240727 업데이트
|
||||
## 20240727
|
||||
|
||||
#### 사소한 수정:
|
||||
### 사소한 수정:
|
||||
|
||||
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) 불필요한 i18n 코드를 정리했습니다.
|
||||
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) 사용자 파일 경로의 후행 슬래시가 명령줄 오류를 일으키는 문제를 수정했습니다.
|
||||
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT 훈련의 단계 계산 로직을 수정했습니다.
|
||||
|
||||
#### 주요 수정:
|
||||
### 주요 수정:
|
||||
|
||||
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) 합성을 위한 음성 속도 조절을 지원했습니다.
|
||||
음성 속도만 조절하면서 무작위성을 고정할 수 있습니다.
|
||||
|
||||
### 20240806 업데이트
|
||||
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356): BS-RoFormer 보컬 분리 모델 지원 추가.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: KamioRinn
|
||||
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351): 중국어 텍스트 프론트엔드 개선.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: KamioRinn
|
||||
|
||||
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer 보컬 반주 분리 모델에 대한 지원을 추가했습니다. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 추론을 활성화했습니다.
|
||||
2. 중국어 텍스트 프론트엔드를 개선했습니다.
|
||||
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 중국어 다의자 지원 (v2 전용);
|
||||
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 추가된 양자;
|
||||
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 사칙연산 및 기본 수학 공식을 지원합니다;
|
||||
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 혼합 텍스트 오류를 수정했습니다.
|
||||
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) WebUI 에서 오디오를 처리할 때 경로를 자동으로 채웠습니다.
|
||||
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 인식 로직을 최적화했습니다.
|
||||
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 광동어 ASR 지원을 추가했습니다.
|
||||
6. GPT-SoVITS v2 지원을 추가했습니다.
|
||||
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) 타이밍 로직을 최적화했습니다.
|
||||
## 202408 (V2 버전)
|
||||
|
||||
### 20240821 업데이트
|
||||
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355): WebUI에서 파일 처리 시 경로 자동 입력 기능 추가.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c): BS-Roformer FP16 추론 지원 활성화.
|
||||
- 유형: 성능 최적화
|
||||
- 기여자: RVC-Boss
|
||||
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78): GPU 인식 로직 최적화, 사용자 입력 GPU 인덱스 처리 로직 추가.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: RVC-Boss
|
||||
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758): **GPT-SoVITS V2 모델 추가.**
|
||||
- 유형: 신규 기능
|
||||
- 기여자: RVC-Boss
|
||||
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3): FunASR을 이용한 광둥어 ASR 지원 추가.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: RVC-Boss
|
||||
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388): UI 및 타이밍 로직 최적화.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488): 다중 발음 문자 처리 로직 최적화 (V2 전용).
|
||||
- 유형: 수정, 신규 기능
|
||||
- 기여자: KamioRinn, RVC-Boss
|
||||
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422): 참조 오디오 1개만 업로드 가능한 버그 수정; 누락 파일 경고 팝업 추가.
|
||||
- 유형: 수정, 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508): 상위 LangSegment 라이브러리에서 SSML 태그로 숫자, 전화번호, 날짜, 시간 최적화 지원.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: juntaosun
|
||||
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503): API 수정 및 최적화.
|
||||
- 유형: 수정
|
||||
- 기여자: KamioRinn
|
||||
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490): `fast_inference` 브랜치를 메인 브랜치로 병합.
|
||||
- 유형: 리팩토링
|
||||
- 기여자: ChasonJiang
|
||||
- 2024.08.21 **GPT-SoVITS V2 버전 정식 출시.**
|
||||
|
||||
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` 브랜치를 메인 브랜치에 병합.
|
||||
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML 태그를 사용하여 숫자, 전화번호, 날짜 및 시간 최적화 지원.
|
||||
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API 수정 및 최적화.
|
||||
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 믹싱을 위한 참조 오디오를 하나만 업로드할 수 있는 버그 수정, 다양한 데이터셋 검사 추가 및 파일이 누락된 경우 경고 팝업.
|
||||
## 202502 (V3 버전)
|
||||
|
||||
### 20250211 업데이트
|
||||
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4): **GPT-SoVITS V3 모델 추가, 파인튜닝 시 14GB VRAM 필요.**
|
||||
- 유형: 신규 기능 ([위키 참조](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
|
||||
- 기여자: RVC-Boss
|
||||
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032): 다국어 프로젝트 문서 업데이트.
|
||||
- 유형: 문서화
|
||||
- 기여자: StaryLan
|
||||
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033): 일본어 문서 업데이트.
|
||||
- 유형: 문서화
|
||||
- 기여자: Fyphen
|
||||
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010): 어텐션 계산 로직 최적화.
|
||||
- 유형: 성능 최적화
|
||||
- 기여자: wzy3650
|
||||
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040): 파인튜닝 시 그래디언트 체크포인팅 지원 추가, 12GB VRAM 필요.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: Kakaru Hayate
|
||||
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073): 새로운 언어 분할 도구 전환, 다국어 혼합 텍스트 분할 전략 개선, 숫자 및 영어 처리 로직 최적화.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: KamioRinn
|
||||
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e): **GPT-SoVITS V3 모델 LoRA 학습 지원 추가, 파인튜닝 시 8GB GPU 메모리 필요.**
|
||||
- 유형: 신규 기능
|
||||
- 기여자: RVC-Boss
|
||||
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078): 보컬 및 악기 분리를 위한 Mel Band Roformer 모델 지원 추가.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: Sucial
|
||||
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114): 중국어 경로에서 MeCab 오류 수정 (일본어/한국어 또는 다국어 텍스트 분할 전용).
|
||||
- 유형: 수정
|
||||
- 기여자: KamioRinn
|
||||
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc): **24kHz에서 48kHz 오디오 초해상도 모델 추가** (V3 모델로 24K 오디오 생성 시 "뭉개지는" 현상 완화).
|
||||
- 유형: 신규 기능
|
||||
- 기여자: RVC-Boss
|
||||
- 관련: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
|
||||
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123): 다국어 프로젝트 문서 업데이트.
|
||||
- 유형: 문서화
|
||||
- 기여자: StaryLan
|
||||
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122): 모델이 인식하지 못하는 짧은 CJK 문자에 대해 규칙 기반 검출 적용.
|
||||
- 유형: 수정
|
||||
- 기여자: KamioRinn
|
||||
- 관련: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
|
||||
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5): 음성 속도 제어 매개변수 추가.
|
||||
- 유형: 수정
|
||||
- 기여자: RVC-Boss
|
||||
- 2025.02.28 **GPT-SoVITS V3 정식 출시**.
|
||||
|
||||
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 모델 추가, SoVITS v3의 파인튜닝에는 14GB GPU 메모리가 필요합니다.
|
||||
## 202503
|
||||
|
||||
### 20250212 업데이트
|
||||
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236): 의존성 버전 오류로 인한 문제 수정.
|
||||
- 유형: 수정
|
||||
- 기여자: XXXXRT666
|
||||
- 관련:
|
||||
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
|
||||
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
|
||||
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
|
||||
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
|
||||
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241): **SoVITS v3 병렬 추론 지원 활성화.**
|
||||
- 유형: 신규 기능
|
||||
- 기여자: ChasonJiang
|
||||
|
||||
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3의 파인튜닝에 그라디언트 체크포인트 추가, 12GB GPU 메모리가 필요합니다.
|
||||
- 기타 사소한 버그 수정.
|
||||
|
||||
### 20250214 업데이트
|
||||
- ONNX 런타임 GPU 추론 지원을 위한 패키지 통합 수정:
|
||||
- 유형: 수정
|
||||
- 상세:
|
||||
- G2PW 내 ONNX 모델이 CPU에서 GPU 추론으로 전환, CPU 병목 현상 크게 감소;
|
||||
- foxjoy dereverberation 모델이 GPU 추론 지원.
|
||||
|
||||
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 다국어 혼합 텍스트 분할 전략 **A** 최적화.
|
||||
- `split-lang`을 언어 분할 도구로 추가하여 다국어 혼합 텍스트의 분할 능력을 향상시켰습니다.
|
||||
## 202504 (V4 버전)
|
||||
|
||||
### 20250217 업데이트
|
||||
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f): SoVITS v3 병렬 추론 잠금 해제; 비동기 모델 로딩 로직 수정.
|
||||
- 유형: 수정
|
||||
- 기여자: RVC-Boss
|
||||
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255): Ruff를 이용한 코드 포맷팅; G2PW 링크 업데이트.
|
||||
- 유형: 스타일
|
||||
- 기여자: XXXXRT666
|
||||
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290): 문서 정리; Python 3.11 지원 추가; 설치 프로그램 업데이트.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300): Colab, 설치 파일 및 모델 다운로드 업데이트.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf): **GPT-SoVITS V4 모델 추가.**
|
||||
- 유형: 신규 기능
|
||||
- 기여자: RVC-Boss
|
||||
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307): V4 병렬 추론 지원 활성화.
|
||||
- 유형: 신규 기능
|
||||
- 기여자: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309): 모델 버전 매개변수 전달 오류 수정.
|
||||
- 유형: 수정
|
||||
- 기여자: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310): Numpy와 Numba 버전 불일치 문제 수정; librosa 버전 업데이트.
|
||||
- 유형: 수정
|
||||
- 기여자: RVC-Boss, XXXXRT666
|
||||
- 관련: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
|
||||
- **2025.04.22 GPT-SoVITS V4 정식 출시**.
|
||||
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311): Gradio 매개변수 업데이트.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322): Colab/Kaggle 노트북 스크립트 개선.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
|
||||
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) 텍스트 내 숫자와 영어 처리 로직 최적화.
|
||||
## 202505
|
||||
|
||||
### 20250218 업데이트
|
||||
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351): Docker 및 Windows 자동 빌드 스크립트 개선; pre-commit 포맷팅 추가.
|
||||
- 유형: 정리 작업
|
||||
- 기여자: XXXXRT666
|
||||
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408): 다국어 텍스트 분할 및 인식 로직 최적화.
|
||||
- 유형: 수정
|
||||
- 기여자: KamioRinn
|
||||
- 관련: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
|
||||
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377): 캐싱 전략 구현으로 SoVITS V3/V4 추론 속도 10% 향상.
|
||||
- 유형: 성능 최적화
|
||||
- 기여자: Kakaru Hayate
|
||||
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba): 어노테이션 인터페이스를 업데이트하여 안내 문구를 추가했습니다: 각 페이지 편집 후 반드시 'Submit Text'를 클릭해 주세요. 그렇지 않으면 변경 사항이 저장되지 않습니다.
|
||||
- 유형: 수정
|
||||
- 기여자: RVC-Boss
|
||||
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc): UVR5 및 ONNX dereverberation 모델에서 FFmpeg이 공백 포함 원본 경로로 MP3/M4A 파일 인코딩 시 오류 수정.
|
||||
- 유형: 수정
|
||||
- 기여자: RVC-Boss
|
||||
|
||||
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 다국어 혼합 텍스트 분할 전략 **B** 최적화.
|
||||
|
||||
### 20250223 업데이트
|
||||
|
||||
1. SoVITS V3의 파인튜닝에 LoRA 훈련이 지원됩니다. 8GB GPU 메모리가 필요하며, 전체 매개변수 파인튜닝보다 더 나은 결과를 제공합니다.
|
||||
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) 보컬 및 악기 분리를 위해 Mel Band RoFormer 모델 추가.
|
||||
|
||||
### 20250226 업데이트
|
||||
|
||||
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows에서 비영어 디렉토리로 인한 문제 수정.
|
||||
- 한국어에 대한 `langsegmenter` 사용 문제 수정.
|
||||
2. [PR 2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows에서 비영어 디렉토리로 인한 문제 수정.
|
||||
- 한국어/일본어에 대한 `langsegmenter` 사용 문제 수정.
|
||||
|
||||
### 20250227 업데이트
|
||||
|
||||
- V3 모델로 24K 오디오를 생성할 때 발생하는 음성 뭉침 문제를 완화하기 위해, 24K에서 48K로의 오디오 초해상도 모델을 추가했습니다. [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)에서 보고된 문제입니다.
|
||||
**미리보기: 단오절 이후 V2 버전 기반 대규모 최적화 업데이트 예정!**
|
@ -1,4 +1,6 @@
|
||||
### 20240121 Güncellemesi
|
||||
# Güncelleme Günlüğü
|
||||
|
||||
## 20240121
|
||||
|
||||
1. `config`e `is_share` eklendi. Colab gibi senaryolarda, WebUI'yi halka açık ağa yönlendirmek için bu değeri `True` olarak ayarlayabilirsiniz.
|
||||
2. WebUI'ye İngilizce sistem çeviri desteği eklendi.
|
||||
@ -7,20 +9,20 @@
|
||||
5. `TEMP` klasöründeki önbelleğe alınmış ses dosyaları ve diğer dosyaları temizledik.
|
||||
6. Referans sesinin sonunu içeren sentezlenmiş ses sorununu önemli ölçüde azalttık.
|
||||
|
||||
### 20240122 Güncellemesi
|
||||
## 20240122
|
||||
|
||||
1. Aşırı kısa çıktı dosyalarının referans sesini tekrarlamasına neden olan sorun giderildi.
|
||||
2. İngilizce ve Japonca eğitim için yerel destek test edildi (Japonca eğitim için kök dizinin İngilizce olmayan özel karakterlerden arındırılmış olması gerekir).
|
||||
3. Ses yolu denetimi iyileştirildi. Yanlış bir giriş yolundan okumaya çalışıldığında, ffmpeg hatası yerine yolun mevcut olmadığını bildirir.
|
||||
|
||||
### 20240123 Güncellemesi
|
||||
## 20240123
|
||||
|
||||
1. Hubert çıkarımının NaN hatalarına neden olup SoVITS/GPT eğitiminde ZeroDivisionError'a yol açtığı sorun çözüldü.
|
||||
2. İnferans WebUI'de hızlı model değiştirme desteği eklendi.
|
||||
3. Model dosyası sıralama mantığı optimize edildi.
|
||||
4. Çince kelime ayrımı için `jieba` `jieba_fast` ile değiştirildi.
|
||||
|
||||
### 20240126 Güncellemesi
|
||||
## 20240126
|
||||
|
||||
1. Çince-İngilizce ve Japonca-İngilizce karışık çıktı metinleri için destek eklendi.
|
||||
2. Çıktı için isteğe bağlı bir bölme modu eklendi.
|
||||
@ -30,7 +32,7 @@
|
||||
6. Mac'te eğitim ve çıkarım desteği eklendi.
|
||||
7. Yarım hassasiyeti desteklemeyen GPU'lar için otomatik olarak tek hassasiyet zorlandı; CPU çıkarımında tek hassasiyet uygulandı.
|
||||
|
||||
### 20240128 Güncellemesi
|
||||
## 20240128
|
||||
|
||||
1. Sayıların Çince karakterlere dönüştürülmesiyle ilgili sorunu düzelttik.
|
||||
2. Cümlelerin başındaki birkaç karakterin yutulması sorununu düzelttik.
|
||||
@ -38,29 +40,29 @@
|
||||
4. GPT eğitiminin kontrol noktalarını kaydetmemesi sorununu düzelttik.
|
||||
5. Dockerfile'da model indirme sürecini tamamladık.
|
||||
|
||||
### 20240129 Güncellemesi
|
||||
## 20240129
|
||||
|
||||
1. Yarım hassasiyet eğitimi ile ilgili sorun yaşayan 16 serisi gibi GPU'lar için eğitim yapılandırmalarını tek hassasiyete değiştirdik.
|
||||
2. Mevcut Colab sürümünü test ettik ve güncelledik.
|
||||
3. Eski sürüm FunASR ile ModelScope FunASR deposunun git klonlanmasıyla oluşan arayüz hizalama hatalarını düzelttik.
|
||||
|
||||
### 20240130 Güncellemesi
|
||||
## 20240130
|
||||
|
||||
1. Çift tırnaklarla yol kopyalama hatalarını önlemek için tüm yol ile ilgili girdilerden otomatik olarak çift tırnakları kaldırdık.
|
||||
2. Çince ve İngilizce noktalama işaretlerini ayırma sorunlarını düzelttik ve cümlelerin başına ve sonuna noktalama işaretleri ekledik.
|
||||
3. Noktalama işaretlerine göre ayırma özelliğini ekledik.
|
||||
|
||||
### 20240201 Güncellemesi
|
||||
## 20240201
|
||||
|
||||
1. Ayrılma hatalarına neden olan UVR5 format okuma hatasını düzelttik.
|
||||
2. Karışık Çince-Japonca-İngilizce metinler için otomatik segmentasyon ve dil tanıma desteği sağladık.
|
||||
|
||||
### 20240202 Güncellemesi
|
||||
## 20240202
|
||||
|
||||
1. `/` ile biten bir ASR yolunun dosya adını kaydetme hatasına neden olma sorununu düzelttik.
|
||||
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) PaddleSpeech'in Normalizer'ını tanıtarak "xx.xx%" (yüzde sembolleri) ve "元/吨" ifadesinin "元吨" yerine "元每吨" olarak okunması gibi sorunları düzelttik ve alt çizgi hatalarını giderdik.
|
||||
|
||||
### 20240207 Güncellemesi
|
||||
## 20240207
|
||||
|
||||
1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)'de bildirilen, dil parametresi karışıklığının Çince çıkarım kalitesini düşürmesi sorununu düzelttik.
|
||||
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) ile UVR5'i daha yüksek versiyonlarda librosa'ya uyarladık.
|
||||
@ -70,33 +72,33 @@
|
||||
6. Veri seti hazırlığı sırasında kök dizini boş bırakıldığında `.list` tam yollarının otomatik olarak okunmasını destekledik.
|
||||
7. Japonca ve İngilizce için Faster Whisper ASR'yi entegre ettik.
|
||||
|
||||
### 20240208 Güncellemesi
|
||||
## 20240208
|
||||
|
||||
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) ile Windows 10 1909'da ve [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)'de (Geleneksel Çince Sistem Dili) bildirilen GPT eğitiminin durması sorununu düzeltmeye çalıştık.
|
||||
|
||||
### 20240212 Güncellemesi
|
||||
## 20240212
|
||||
|
||||
1. Faster Whisper ve FunASR için mantığı optimize ettik, Hugging Face bağlantı sorunlarını önlemek için Faster Whisper'ı ayna indirmelere yönlendirdik.
|
||||
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) GPT tekrarı ve eksik karakterleri azaltmak için eğitim sırasında negatif örnekler oluşturarak deneysel DPO Loss eğitim seçeneğini etkinleştirdi ve çıkarım WebUI'de çeşitli çıkarım parametrelerini kullanılabilir hale getirdi.
|
||||
|
||||
### 20240214 Güncellemesi
|
||||
## 20240214
|
||||
|
||||
1. Eğitimde Çince deney adlarını destekledik (önceden hatalara neden oluyordu).
|
||||
2. DPO eğitimini zorunlu yerine isteğe bağlı bir özellik yaptık. Seçilirse, parti boyutu otomatik olarak yarıya indirilir. Çıkarım WebUI'de yeni parametrelerin iletilmemesi sorunlarını düzelttik.
|
||||
|
||||
### 20240216 Güncellemesi
|
||||
## 20240216
|
||||
|
||||
1. Referans metin olmadan girişi destekledik.
|
||||
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)'te bildirilen Çince önyüz hatalarını düzelttik.
|
||||
|
||||
### 20240221 Güncellemesi
|
||||
## 20240221
|
||||
|
||||
1. Veri işleme sırasında bir gürültü azaltma seçeneği ekledik (gürültü azaltma sadece 16kHz örnekleme hızını bırakır; yalnızca arka plan gürültüsü önemliyse kullanın).
|
||||
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) ile Çince ve Japonca önyüz işlemesini optimize ettik.
|
||||
3. Mac CPU çıkarımını daha hızlı performans için MPS yerine CPU kullanacak şekilde değiştirdik.
|
||||
4. Colab genel URL sorununu düzelttik.
|
||||
|
||||
### 20240306 Güncellemesi
|
||||
## 20240306
|
||||
|
||||
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) çıkarımı %50 hızlandırdı (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 üzerinde test edildi).
|
||||
2. Faster Whisper'ın Çince olmayan ASR'si kullanılırken artık önce Çince FunASR modelinin indirilmesi gerekmiyor.
|
||||
@ -104,9 +106,9 @@
|
||||
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) CUDA mevcut olmadığında Faster Whisper için otomatik CPU çıkarımını etkinleştirdi.
|
||||
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) Mac'te doğru CPU çıkarımı sağlamak için `is_half` kontrolünü değiştirdi.
|
||||
|
||||
### 202403/202404/202405 Güncellemeleri
|
||||
## 202403/202404/202405 Güncellemeleri
|
||||
|
||||
#### Küçük Düzeltmeler:
|
||||
### Küçük Düzeltmeler:
|
||||
|
||||
1. Referans metin olmayan mod ile ilgili sorunlar düzeltildi.
|
||||
2. Çince ve İngilizce metin önyüzü optimize edildi.
|
||||
@ -115,27 +117,27 @@
|
||||
5. Eğitim verisi işleme sırasında desteklenmeyen diller için hata uyarıları eklendi.
|
||||
6. Hubert çıkarımındaki hata düzeltildi.
|
||||
|
||||
#### Büyük Düzeltmeler:
|
||||
### Büyük Düzeltmeler:
|
||||
|
||||
1. VQ'yu dondurmadan yapılan SoVITS eğitimi sorunu (bu kalite düşüşüne neden olabilir) düzeltildi.
|
||||
2. Hızlı çıkarım dalı eklendi.
|
||||
|
||||
### 20240610 Güncellemesi
|
||||
## 20240610
|
||||
|
||||
#### Küçük Düzeltmeler:
|
||||
### Küçük Düzeltmeler:
|
||||
|
||||
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) saf noktalama işareti ve çoklu noktalama işareti metin girdisi için mantığı geliştirdi.
|
||||
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5'teki MDXNet yankı giderme için CMD formatını düzeltti, boşluk içeren yolları destekledi.
|
||||
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py` içindeki SoVITS eğitimi için ilerleme çubuğu mantığını düzeltti.
|
||||
|
||||
#### Büyük Düzeltmeler:
|
||||
### Büyük Düzeltmeler:
|
||||
|
||||
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI'nin GPT ince ayarının, Çince giriş metinlerinin BERT özelliğini okumaması sorununu düzeltti, bu da çıkarım ile tutarsızlığa ve potansiyel kalite düşüşüne neden oluyordu.
|
||||
**Dikkat: Daha önce büyük miktarda veri ile ince ayar yaptıysanız, modelin kalitesini artırmak için yeniden ayar yapmanız önerilir.**
|
||||
|
||||
### 20240706 Güncellemesi
|
||||
## 20240706
|
||||
|
||||
#### Küçük Düzeltmeler:
|
||||
### Küçük Düzeltmeler:
|
||||
|
||||
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU çıkarımında varsayılan yığın boyutu ondalık sorununu düzeltti.
|
||||
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) gürültü giderme veya ASR ile ilgili istisnalarla karşılaşıldığında bekleyen tüm ses dosyalarının çıkış yapmasına neden olan sorunları düzeltti.
|
||||
@ -143,80 +145,189 @@
|
||||
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) çoklu GPU eğitimi için çoklu işlem kaydetme mantığını düzeltti.
|
||||
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) gereksiz `my_utils`'ı kaldırdı.
|
||||
|
||||
#### Büyük Düzeltmeler:
|
||||
### Büyük Düzeltmeler:
|
||||
|
||||
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) hızlandırılmış çıkarım kodu doğrulandı ve ana dala birleştirildi, taban ile tutarlı çıkarım etkileri sağlandı.
|
||||
Ayrıca referans metni olmayan modda hızlandırılmış çıkarımı destekler.
|
||||
|
||||
**Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.**
|
||||
|
||||
### 20240727 Güncellemesi
|
||||
## 20240727
|
||||
|
||||
#### Küçük Düzeltmeler:
|
||||
### Küçük Düzeltmeler:
|
||||
|
||||
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) gereksiz i18n kodlarını temizledi.
|
||||
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) kullanıcı dosya yollarındaki sonlandırma eğik çizgilerinin komut satırı hatalarına neden olduğu sorunları düzeltti.
|
||||
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT eğitimindeki adım hesaplama mantığını düzeltti.
|
||||
|
||||
#### Büyük Düzeltmeler:
|
||||
### Büyük Düzeltmeler:
|
||||
|
||||
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) sentez için konuşma hızı ayarlamasını destekledi.
|
||||
Konuşma hızını ayarlarken rastgeleliği dondurmayı etkinleştirdi.
|
||||
|
||||
### 20240806 Güncellemesi
|
||||
- 2024.07.27 [PR#1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR#1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356): BS-RoFormer vokal eşlik ayırma modeli desteği eklendi.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
- 2024.07.27 [PR#1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351): Çince metin ön işleme iyileştirildi.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
|
||||
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer vokal eşlik ayırma modelini desteklemeye başladı. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 çıkarımı etkinleştirdi.
|
||||
2. Çince metin ön yüzünü geliştirdi.
|
||||
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) çoklu heceli karakterler için destek ekledi (yalnızca v2);
|
||||
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) sayı belirleyici ekledi;
|
||||
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) aritmetik ve temel matematik formüllerini destekler;
|
||||
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) karışık metin hatalarını düzeltti.
|
||||
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355) WebUI'de ses işlenirken yolları otomatik olarak doldurdu.
|
||||
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU tanıma mantığını optimize etti.
|
||||
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) Kantonca ASR desteği ekledi.
|
||||
6. GPT-SoVITS v2 desteği eklendi.
|
||||
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) zamanlama mantığını optimize etti.
|
||||
## 202408 (V2 Sürümü)
|
||||
|
||||
### 20240821 Güncelleme
|
||||
- 2024.08.01 [PR#1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1355): WebUI'de dosya işlerken yolların otomatik doldurulması.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2024.08.01 [Commit#e62e9653](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c): BS-Roformer için FP16 çıkarım desteği etkinleştirildi.
|
||||
- Tür: Performans Optimizasyonu
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2024.08.01 [Commit#bce451a2](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit#4c8b7612](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78): GPU tanıma mantığı optimize edildi, kullanıcıların girdiği rastgele GPU indekslerini işlemek için kullanıcı dostu mantık eklendi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2024.08.02 [Commit#ff6c193f](https://github.com/RVC-Boss/GPT-SoVITS/commit/ff6c193f6fb99d44eea3648d82ebcee895860a22)~[Commit#de7ee7c7](https://github.com/RVC-Boss/GPT-SoVITS/commit/de7ee7c7c15a2ec137feb0693b4ff3db61fad758): **GPT-SoVITS V2 modeli eklendi.**
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2024.08.03 [Commit#8a101474](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3): FunASR kullanarak Kantonca ASR desteği eklendi.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2024.08.03 [PR#1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387), [PR#1388](https://github.com/RVC-Boss/GPT-SoVITS/pull/1388): UI ve zamanlama mantığı optimize edildi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2024.08.06 [PR#1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404), [PR#987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987), [PR#488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488): Çok sesli karakter işleme mantığı optimize edildi (Yalnızca V2).
|
||||
- Tür: Düzeltme, Yeni Özellik
|
||||
- Katkıda Bulunan: KamioRinn, RVC-Boss
|
||||
- 2024.08.13 [PR#1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422): Yalnızca bir referans ses yüklenebilme hatası düzeltildi; eksik dosyalar için uyarı açılır pencereleriyle veri seti doğrulama eklendi.
|
||||
- Tür: Düzeltme, Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2024.08.20 [Issue#1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508): Yukarı akış LangSegment kütüphanesi artık SSML etiketleri kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize ediyor.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: juntaosun
|
||||
- 2024.08.20 [PR#1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503): API düzeltildi ve optimize edildi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
- 2024.08.20 [PR#1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490): `fast_inference` dalı ana dala birleştirildi.
|
||||
- Tür: Yeniden Yapılandırma
|
||||
- Katkıda Bulunan: ChasonJiang
|
||||
- 2024.08.21 **GPT-SoVITS V2 sürümü resmi olarak yayınlandı.**
|
||||
|
||||
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` dalını ana dala birleştir.
|
||||
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML etiketlerini kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize etme desteği.
|
||||
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API düzeltildi ve optimize edildi.
|
||||
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Karıştırmak için yalnızca bir referans sesi yüklenebiliyordu hatası düzeltildi, çeşitli veri seti kontrolleri eklendi ve eksik dosyalar için uyarılar çıkar.
|
||||
## 202502 (V3 Sürümü)
|
||||
|
||||
### 20250211 Güncellemesi
|
||||
- 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4): **İnce ayar için 14GB VRAM gerektiren GPT-SoVITS V3 modeli eklendi.**
|
||||
- Tür: Yeni Özellik ([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) referans)
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032): Çok dilli proje dokümantasyonu güncellendi.
|
||||
- Tür: Dokümantasyon
|
||||
- Katkıda Bulunan: StaryLan
|
||||
- 2025.02.12 [PR#2033](https://github.com/RVC-Boss/GPT-SoVITS/pull/2033): Japonca dokümantasyon güncellendi.
|
||||
- Tür: Dokümantasyon
|
||||
- Katkıda Bulunan: Fyphen
|
||||
- 2025.02.12 [PR#2010](https://github.com/RVC-Boss/GPT-SoVITS/pull/2010): Dikkat hesaplama mantığı optimize edildi.
|
||||
- Tür: Performans Optimizasyonu
|
||||
- Katkıda Bulunan: wzy3650
|
||||
- 2025.02.12 [PR#2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040): İnce ayar için gradyan kontrol noktası desteği eklendi (12GB VRAM gerektirir).
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: Kakaru Hayate
|
||||
- 2025.02.14 [PR#2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047), [PR#2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062), [PR#2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073): Yeni dil bölümleme aracına geçildi, çok dilli karışık metin bölme stratejisi iyileştirildi, sayı ve İngilizce işleme mantığı optimize edildi.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
- 2025.02.23 [Commit#56509a17](https://github.com/RVC-Boss/GPT-SoVITS/commit/56509a17c918c8d149c48413a672b8ddf437495b)~[Commit#514fb692](https://github.com/RVC-Boss/GPT-SoVITS/commit/514fb692db056a06ed012bc3a5bca2a5b455703e): **GPT-SoVITS V3 modeli artık LoRA eğitimini destekliyor (ince ayar için 8GB GPU Belleği gerektirir).**
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2025.02.23 [PR#2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078): Vokal ve enstrüman ayırma için Mel Band Roformer model desteği eklendi.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: Sucial
|
||||
- 2025.02.26 [PR#2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112), [PR#2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114): Çince yollarda MeCab hatası düzeltildi (özel olarak Japonca/Korece veya çok dilli metin bölme için).
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
- 2025.02.27 [Commit#92961c3f](https://github.com/RVC-Boss/GPT-SoVITS/commit/92961c3f68b96009ff2cd00ce614a11b6c4d026f)~[Commit#250b1c73](https://github.com/RVC-Boss/GPT-SoVITS/commit/250b1c73cba60db18148b21ec5fbce01fd9d19bc): V3 modeliyle 24K ses üretirken "boğuk" ses sorununu hafifletmek için **24kHz'den 48kHz'e ses süper çözünürlük modelleri eklendi**.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- İlgili: [Issue#2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue#2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)
|
||||
- 2025.02.28 [PR#2123](https://github.com/RVC-Boss/GPT-SoVITS/pull/2123): Çok dilli proje dokümantasyonu güncellendi.
|
||||
- Tür: Dokümantasyon
|
||||
- Katkıda Bulunan: StaryLan
|
||||
- 2025.02.28 [PR#2122](https://github.com/RVC-Boss/GPT-SoVITS/pull/2122): Model tanımlayamadığında kısa CJK karakterleri için kural tabanlı tespit uygulandı.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
- İlgili: [Issue#2116](https://github.com/RVC-Boss/GPT-SoVITS/issues/2116)
|
||||
- 2025.02.28 [Commit#c38b1690](https://github.com/RVC-Boss/GPT-SoVITS/commit/c38b16901978c1db79491e16905ea3a37a7cf686), [Commit#a32a2b89](https://github.com/RVC-Boss/GPT-SoVITS/commit/a32a2b893436fad56cc82409121c7fa36a1815d5): Sentez hızını kontrol etmek için konuşma hızı parametresi eklendi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2025.02.28 **GPT-SoVITS V3 resmi olarak yayınlandı**.
|
||||
|
||||
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 modeli eklendi; SoVITS v3'e ince ayar yapmak için 14GB GPU belleği gereklidir.
|
||||
## 202503
|
||||
|
||||
### 20250212 Güncellemesi
|
||||
- 2025.03.31 [PR#2236](https://github.com/RVC-Boss/GPT-SoVITS/pull/2236): Bağımlılıkların yanlış sürümlerinden kaynaklanan sorunlar düzeltildi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- İlgili:
|
||||
- PyOpenJTalk: [Issue#1131](https://github.com/RVC-Boss/GPT-SoVITS/issues/1131), [Issue#2231](https://github.com/RVC-Boss/GPT-SoVITS/issues/2231), [Issue#2233](https://github.com/RVC-Boss/GPT-SoVITS/issues/2233).
|
||||
- ONNX: [Issue#492](https://github.com/RVC-Boss/GPT-SoVITS/issues/492), [Issue#671](https://github.com/RVC-Boss/GPT-SoVITS/issues/671), [Issue#1192](https://github.com/RVC-Boss/GPT-SoVITS/issues/1192), [Issue#1819](https://github.com/RVC-Boss/GPT-SoVITS/issues/1819), [Issue#1841](https://github.com/RVC-Boss/GPT-SoVITS/issues/1841).
|
||||
- Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
|
||||
- PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
|
||||
- 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241): **SoVITS v3 için paralel çıkarım etkinleştirildi.**
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: ChasonJiang
|
||||
|
||||
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3'ü ince ayar yapmak için gradyan kontrol noktası ekledi, 12GB GPU belleği gereklidir.
|
||||
- Diğer küçük hatalar düzeltildi.
|
||||
|
||||
### 20250214 Güncellemesi
|
||||
- ONNX çalışma zamanı GPU çıkarım desteği için entegre paket düzeltmeleri:
|
||||
- Tür: Düzeltme
|
||||
- Detaylar:
|
||||
- G2PW içindeki ONNX modelleri CPU'dan GPU çıkarımına geçirildi, CPU darboğazı önemli ölçüde azaltıldı;
|
||||
- foxjoy yankı giderme modeli artık GPU çıkarımını destekliyor (genel yaklaşım için aşağıdaki taslağa bakınız).
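Aşağıdaki küçük taslak, bu geçişin arkasındaki genel fikri göstermektedir: ONNX Runtime oturumları, sağlayıcı (execution provider) listesi aracılığıyla önce CUDA'ya, CUDA mevcut değilse CPU'ya yönlendirilebilir. Buradaki `create_session` fonksiyonu ve `g2pw_model.onnx` dosya adı varsayımsaldır; depodaki gerçek G2PW veya foxjoy kodunu birebir yansıtmaz.

```python
# Varsayımsal taslak: ONNX Runtime oturumunu mümkünse CUDA üzerinde, değilse CPU üzerinde çalıştırır.
import onnxruntime as ort


def create_session(model_path: str) -> ort.InferenceSession:
    # Sağlayıcı sırası önceliği belirler: CUDAExecutionProvider kuruluysa GPU kullanılır,
    # aksi halde listedeki CPUExecutionProvider'a düşülür.
    wanted = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    available = ort.get_available_providers()
    providers = [p for p in wanted if p in available]
    return ort.InferenceSession(model_path, providers=providers)


if __name__ == "__main__":
    sess = create_session("g2pw_model.onnx")  # varsayımsal model dosyası adı
    print("Aktif sağlayıcılar:", sess.get_providers())
```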
|
||||
|
||||
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Çok dilli karışık metin segmentasyon stratejisi **A**'yı optimize etti.
|
||||
- `split-lang` bir dil segmentasyon aracı olarak eklendi ve çok dilli karışık metinlerin segmentasyon yeteneklerini iyileştirdi.
|
||||
## 202504 (V4 Sürümü)
|
||||
|
||||
### 20250217 Güncellemesi
|
||||
- 2025.04.01 [Commit#6a60e5ed](https://github.com/RVC-Boss/GPT-SoVITS/commit/6a60e5edb1817af4a61c7a5b196c0d0f1407668f): SoVITS v3 paralel çıkarımının kilidi açıldı; asenkron model yükleme mantığı düzeltildi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2025.04.07 [PR#2255](https://github.com/RVC-Boss/GPT-SoVITS/pull/2255): Ruff ile kod biçimlendirme; G2PW bağlantısı güncellendi.
|
||||
- Tür: Stil
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2025.04.15 [PR#2290](https://github.com/RVC-Boss/GPT-SoVITS/pull/2290): Dokümantasyon temizlendi; Python 3.11 desteği eklendi; yükleyiciler güncellendi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2025.04.20 [PR#2300](https://github.com/RVC-Boss/GPT-SoVITS/pull/2300): Colab, kurulum dosyaları ve model indirmeleri güncellendi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2025.04.20 [Commit#e0c452f0](https://github.com/RVC-Boss/GPT-SoVITS/commit/e0c452f0078e8f7eb560b79a54d75573fefa8355)~[Commit#9d481da6](https://github.com/RVC-Boss/GPT-SoVITS/commit/9d481da610aa4b0ef8abf5651fd62800d2b4e8bf): **GPT-SoVITS V4 modeli eklendi.**
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2025.04.21 [Commit#8b394a15](https://github.com/RVC-Boss/GPT-SoVITS/commit/8b394a15bce8e1d85c0b11172442dbe7a6017ca2)~[Commit#bc2fe5ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/bc2fe5ec86536c77bb3794b4be263ac87e4fdae6), [PR#2307](https://github.com/RVC-Boss/GPT-SoVITS/pull/2307): V4 için paralel çıkarım etkinleştirildi.
|
||||
- Tür: Yeni Özellik
|
||||
- Katkıda Bulunan: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#7405427a](https://github.com/RVC-Boss/GPT-SoVITS/commit/7405427a0ab2a43af63205df401fd6607a408d87)~[Commit#590c83d7](https://github.com/RVC-Boss/GPT-SoVITS/commit/590c83d7667c8d4908f5bdaf2f4c1ba8959d29ff), [PR#2309](https://github.com/RVC-Boss/GPT-SoVITS/pull/2309): Model sürümü parametre aktarımı düzeltildi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: RVC-Boss, ChasonJiang
|
||||
- 2025.04.22 [Commit#fbdab94e](https://github.com/RVC-Boss/GPT-SoVITS/commit/fbdab94e17d605d85841af6f94f40a45976dd1d9), [PR#2310](https://github.com/RVC-Boss/GPT-SoVITS/pull/2310): Numpy ve Numba sürüm uyumsuzluğu sorunu düzeltildi; librosa sürümü güncellendi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: RVC-Boss, XXXXRT666
|
||||
- İlgili: [Issue#2308](https://github.com/RVC-Boss/GPT-SoVITS/issues/2308)
|
||||
- **2025.04.22 GPT-SoVITS V4 resmi olarak yayınlandı**.
|
||||
- 2025.04.22 [PR#2311](https://github.com/RVC-Boss/GPT-SoVITS/pull/2311): Gradio parametreleri güncellendi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2025.04.25 [PR#2322](https://github.com/RVC-Boss/GPT-SoVITS/pull/2322): Colab/Kaggle notebook betikleri iyileştirildi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
|
||||
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Metindeki sayılar ve İngilizceyi işleme mantığını optimize etti.
|
||||
## 202505
|
||||
|
||||
### 20250218 Güncellemesi
|
||||
- 2025.05.26 [PR#2351](https://github.com/RVC-Boss/GPT-SoVITS/pull/2351): Docker ve Windows otomatik derleme betikleri iyileştirildi; ön işleme biçimlendirme eklendi.
|
||||
- Tür: Chore
|
||||
- Katkıda Bulunan: XXXXRT666
|
||||
- 2025.05.26 [PR#2408](https://github.com/RVC-Boss/GPT-SoVITS/pull/2408): Çok dilli metin bölme ve tanıma mantığı optimize edildi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: KamioRinn
|
||||
- İlgili: [Issue#2404](https://github.com/RVC-Boss/GPT-SoVITS/issues/2404)
|
||||
- 2025.05.26 [PR#2377](https://github.com/RVC-Boss/GPT-SoVITS/pull/2377): SoVITS V3/V4 çıkarım hızını %10 artırmak için önbellekleme stratejileri uygulandı.
|
||||
- Tür: Performans Optimizasyonu
|
||||
- Katkıda Bulunan: Kakaru Hayate
|
||||
- 2025.05.26 [Commit#4d9d56b1](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d9d56b19638dc434d6eefd9545e4d8639a3e072), [Commit#8c705784](https://github.com/RVC-Boss/GPT-SoVITS/commit/8c705784c50bf438c7b6d0be33a9e5e3cb90e6b2), [Commit#fafe4e7f](https://github.com/RVC-Boss/GPT-SoVITS/commit/fafe4e7f120fba56c5f053c6db30aa675d5951ba): Açıklama arayüzü uyarı ile güncellendi: her sayfa tamamlandıktan sonra "Metni Gönder"e tıklayın, aksi takdirde değişiklikler kaydedilmez.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
- 2025.05.29 [Commit#1934fc1e](https://github.com/RVC-Boss/GPT-SoVITS/commit/1934fc1e1b22c4c162bba1bbe7d7ebb132944cdc): UVR5 ve ONNX yankı giderme modellerinde, FFmpeg'in orijinal yollarında boşluk bulunan MP3/M4A dosyalarını kodlarken oluşan hatalar düzeltildi.
|
||||
- Tür: Düzeltme
|
||||
- Katkıda Bulunan: RVC-Boss
|
||||
|
||||
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Çok dilli karışık metin segmentasyon stratejisi **B**'yi optimize etti.
|
||||
|
||||
### 20250223 Güncellemesi
|
||||
|
||||
1. SoVITS V3'ün ince ayarı için LoRA eğitimi desteklenmektedir. 8GB GPU belleği gerektirir ve tam parametreli ince ayardan daha iyi sonuçlar verir.
|
||||
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Mel Band RoFormer modelini vokal ve enstrüman ayrımı için ekledi.
|
||||
|
||||
### 20250226 Güncellemesi
|
||||
|
||||
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti.
|
||||
- Korece için `langsegmenter` kullanımı ile ilgili sorun düzeltildi.
|
||||
2. [PR 2114](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti.
|
||||
- Korece/Japonca için `langsegmenter` kullanımı ile ilgili sorun düzeltildi.
|
||||
|
||||
### 20250227 Güncellemesi
|
||||
|
||||
- V3 modeliyle 24K ses oluştururken ortaya çıkan boğukluk sorununu hafifletmek için, 24K'dan 48K'ya ses süper çözünürlük modelleri eklendi. Sorunlar [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085) ve [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)'de bildirilmiştir.
|
||||
**Önizleme: Ejderha Teknesi Festivali'nden sonra V2 sürümüne dayalı büyük optimizasyon güncellemesi gelecek!**
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb): Best choice for dual-channel reverberation, cannot remove single-channel reverberation;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverberation, can remove mono reverberation, but does not clean heavily high-frequency plate reverberation.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(Unstable, do not use yet; may degrade model performance!)",
|
||||
"*实验/模型名": "*Experiment/model name",
|
||||
"*文本标注文件": "*Text labelling file",
|
||||
"*训练集音频文件目录": "*Audio dataset folder",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU number, can only input ONE integer",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ",
|
||||
"LoRA秩": "LoRA Rank",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained SoVITS V3 Model, Cannot Load LoRA Weights",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Training: Model Weights saved in SoVITS_weights/",
|
||||
"SoVITS模型列表": "SoVITS weight list",
|
||||
"SoVITS训练": "SoVITS Training",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Submit Text: Manually save all text box contents on the current page to memory and file (If you don't click this button before switching pages or exiting the labeling page, the data will be rolled back when you return, which would be a waste of work.)",
|
||||
"TTS推理WebUI": "TTS Inference WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 WebUI (Vocal Separation/Deecho/Dereverb)",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3 does not support the no-reference-text mode. Please provide reference text!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
|
||||
"batch_size": "Batch Size",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Pause Duration between Sentences (Seconds)",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.",
|
||||
"合成语音": "Start inference",
|
||||
"合成音频": "Synthesize Audio",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": " is Closed",
|
||||
"已完成": " Finished",
|
||||
"已开启": " is Opened",
|
||||
"并行合成中": "Parallel Synthesis in Progress",
|
||||
"并行推理": "Parallel Inference",
|
||||
"并行推理模式已关闭": "Parallel Inference Mode Disabled",
|
||||
"并行推理模式已开启": "Parallel Inference Mode Enabled",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained Model, Cannot Load LoRA Weights",
|
||||
"开启": "Open ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "When parallel inference mode is enabled, SoVITS V3/4 models do not support bucket processing; bucket processing has been automatically disabled.",
|
||||
"微调训练": "Fine-Tuning",
|
||||
"怎么切": "How to slice the sentence",
|
||||
"总训练轮数total_epoch": "Total training epochs (total_epoch):",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "Model",
|
||||
"模型分为三类:": "Models are categorized into three types:",
|
||||
"模型切换": "Model switch",
|
||||
"模型加载中,请等待": "Model is loading, please wait...",
|
||||
"每张显卡的batch_size": "Batch size per GPU:",
|
||||
"版本": "Version",
|
||||
"粤英混合": "Yue-English Mixed",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": " Process Terminated",
|
||||
"进程输出信息": " Process Output Information",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. The default one is a pretrain, so you can experience zero shot TTS.",
|
||||
"采样步数(仅对V3/4生效)": "Sampling Steps (V3/V4 Only)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing",
|
||||
"重复惩罚": "Repetition Penalty",
|
||||
"随机种子": "Random Seed",
|
||||
@ -203,6 +211,9 @@
|
||||
"音频标注WebUI": "Audio Labeling WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)",
|
||||
"音频超分中": "Running Audio Super-Resolution",
|
||||
"音频超采样": "Audio Upsampling",
|
||||
"音频超采样(仅对V3生效))": "Audio Upsampling (V3 Only)",
|
||||
"预测语义Token": "Predict Semantic Token",
|
||||
"预训练GPT模型路径": "Pretrained GPT Model Path",
|
||||
"预训练SSL模型路径": "Pretrained SSL Model Path",
|
||||
"预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path",
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): reverberación estéreo, la mejor opción; no puede eliminar reverberación mono",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Eliminar el efecto de retardo. Aggressive elimina más que Normal, DeReverb elimina reverberación adicional, puede eliminar reverberación mono, pero no limpia bien la reverberación de placa de alta frecuencia",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(¡Inestable! No lo uses aún, podría degradar el rendimiento del modelo)",
|
||||
"*实验/模型名": "*Nombre del experimento/modelo",
|
||||
"*文本标注文件": "*Archivo de etiquetado de texto",
|
||||
"*训练集音频文件目录": "*Directorio de archivos de audio de entrenamiento",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "Número de tarjeta GPU, solo se puede ingresar un número entero",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "Número de tarjeta GPU separado por '-', cada número de tarjeta es un proceso",
|
||||
"LoRA秩": "Rango de LoRA",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Falta el modelo base de SoVITS V3, no se pueden cargar los pesos de LoRA correspondientes",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entrenamiento de SoVITS: los archivos de pesos del modelo están en SoVITS_weights/",
|
||||
"SoVITS模型列表": "Lista de modelos SoVITS",
|
||||
"SoVITS训练": "Entrenamiento de SoVITS",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Enviar texto: Guarda manualmente el contenido de todos los campos de texto en la página actual en memoria y archivo (si no haces clic en este botón antes o después de cambiar de página o salir de la página de etiquetado, al regresar se desharán los cambios, ¡todo ese trabajo se perderá!).",
|
||||
"TTS推理WebUI": "WebUI de inferencia TTS",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "Herramienta de separación de voz y acompañamiento UVR5 y eliminación de reverberación y retardo",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "¡V3 no admite el modo sin texto de referencia! Por favor, introduce el texto de referencia.",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra",
|
||||
"batch_size": "Tamaño de lote",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Segundos de pausa entre frases",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Sube varios archivos de audio de referencia arrastrándolos y soltándolos (se recomienda que sean del mismo género) y promedia sus tonos. Si esta opción se deja en blanco, el tono será controlado por el único audio de referencia a la izquierda. Si se está afinando el modelo, se recomienda que todos los archivos de audio de referencia tengan tonos dentro del conjunto de entrenamiento de ajuste fino; se puede ignorar el modelo preentrenado.",
|
||||
"合成语音": "Síntesis de voz",
|
||||
"合成音频": "Sintetizar Audio",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Ejemplo de formato de ruta de carpeta válida: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simplemente copie desde la barra de direcciones del administrador de archivos).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Se añadirá soporte para conversión de fonemas, edición manual de fonemas y síntesis de voz por pasos en el futuro.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si el audio de referencia no es claro o no sabe qué escribir, habilite esta opción para ignorar el texto de referencia.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": " Desactivado",
|
||||
"已完成": " Completado",
|
||||
"已开启": " Activado",
|
||||
"并行合成中": "Síntesis en paralelo en curso",
|
||||
"并行推理": "Inferencia paralela",
|
||||
"并行推理模式已关闭": "Modo de inferencia paralela deshabilitado",
|
||||
"并行推理模式已开启": "Modo de inferencia paralela habilitado",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "Falta el modelo base, no se pueden cargar los pesos de LoRA correspondientes",
|
||||
"开启": "Activar ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Habilitar el modo sin texto de referencia. No llenar el texto de referencia también lo habilita.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "Al activar el modo de inferencia paralela, los modelos SoVITS V3/4 no admiten el procesamiento por lotes, por lo que este ha sido desactivado automáticamente.",
|
||||
"微调训练": "Entrenamiento de ajuste fino",
|
||||
"怎么切": "Cómo cortar",
|
||||
"总训练轮数total_epoch": "Número total de épocas de entrenamiento",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "Modelo",
|
||||
"模型分为三类:": "Los modelos se dividen en tres categorías:",
|
||||
"模型切换": "Cambio de modelo",
|
||||
"模型加载中,请等待": "El modelo se está cargando, por favor espera...",
|
||||
"每张显卡的batch_size": "Tamaño de lote por tarjeta gráfica",
|
||||
"版本": "Versión",
|
||||
"粤英混合": "Mezcla Cantonés-Inglés",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": " Proceso terminado",
|
||||
"进程输出信息": " Información de salida del proceso",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Seleccione el modelo almacenado en SoVITS_weights y GPT_weights después del entrenamiento. Uno de ellos es el modelo base, útil para experimentar con TTS de 5 segundos sin entrenamiento.",
|
||||
"采样步数(仅对V3/4生效)": "Pasos de muestreo (solo efectivo para V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Pasos de muestreo: si se siente ruidoso, intente aumentarlo; si es lento, intente reducirlo",
|
||||
"重复惩罚": "Penalización por repetición",
|
||||
"随机种子": "Semilla aleatoria",
|
||||
@ -203,9 +211,12 @@
|
||||
"音频标注WebUI": "WebUI de etiquetado de audio",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
|
||||
"音频超分中": "Superresolución de audio en proceso",
|
||||
"音频超采样": "Muestreo superior del audio",
|
||||
"音频超采样(仅对V3生效))": "Muestreo superior del audio (solo efectivo para V3)",
|
||||
"预测语义Token": "Predecir token semántico",
|
||||
"预训练GPT模型路径": "Ruta del modelo GPT preentrenado",
|
||||
"预训练SSL模型路径": "Ruta del modelo SSL preentrenado",
|
||||
"预训练SoVITS-D模型路径": "Ruta del modelo SoVITS-D preentrenado",
|
||||
"预训练SoVITS-G模型路径": "Ruta del modelo SoVITS-G preentrenado",
|
||||
"预训练中文BERT模型路径": "Ruta del modelo BERT en Chino preentrenado"
|
||||
"预训练中文BERT模型路径": "Ruta del modelo BERT Chino Preentrenado"
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1) MDX-Net (onnx_dereverb) : C'est le meilleur choix pour la réverbération à deux canaux, mais il ne peut pas éliminer la réverbération à un seul canal;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho : Supprime les effets de délai. Aggressive est plus exhaustif que Normal dans la suppression, DeReverb élimine également la réverbération, peut supprimer la réverbération monocanal, mais n'élimine pas complètement la réverbération de plaque à haute fréquence.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(Instable, à ne pas utiliser encore, pourrait dégrader les performances du modèle !)",
|
||||
"*实验/模型名": "*Nom de l'expérience/modèle",
|
||||
"*文本标注文件": "*Fichier d'annotation de texte",
|
||||
"*训练集音频文件目录": "*Répertoire des fichiers audio d'entraînement",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "Numéro de carte GPU, ne peut contenir qu'un seul entier",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "Numéro de carte GPU séparé par des tirets, un processus par numéro de carte",
|
||||
"LoRA秩": "Rang LoRA",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Modèle de base SoVITS V3 manquant, impossible de charger les poids LoRA correspondants",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entraînement SoVITS : les poids du modèle sont dans SoVITS_weights/",
|
||||
"SoVITS模型列表": "Liste des modèles SoVITS",
|
||||
"SoVITS训练": "Entraînement SoVITS",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Soumettre le texte : Enregistrez manuellement le contenu de tous les champs textuels de la page actuelle en mémoire et dans un fichier (si vous ne cliquez pas sur ce bouton avant ou après avoir changé de page, ou avant de quitter la page d'étiquetage, vos modifications seront annulées lorsque vous reviendrez, tout votre travail sera perdu).",
|
||||
"TTS推理WebUI": "Interface Web d'inférence TTS",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "Outil UVR5 de séparation voix/accompagnement & suppression de réverbération et de latence",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "La version V3 ne prend pas en charge le mode sans texte de référence. Veuillez fournir un texte de référence !",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion d'audio normalisé mélangé",
|
||||
"batch_size": "Taille de lot",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: comment calculer la courbe de volume, plus petit pour une précision plus élevée mais une charge de calcul plus élevée (ce n'est pas une meilleure précision)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Temps de pause entre les phrases (secondes)",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optionnel : Téléchargez plusieurs fichiers audio de référence en les faisant glisser (recommandé d'être du même genre) et fusionnez leur tonalité. Si cette option est laissée vide, la tonalité sera contrôlée par l'unique fichier audio de référence à gauche. Si vous ajustez le modèle, il est recommandé que tous les fichiers audio de référence aient des tonalités dans l'ensemble d'entraînement d'ajustement ; le modèle pré-entrainé peut être ignoré.",
|
||||
"合成语音": "Synthèse vocale",
|
||||
"合成音频": "Synthétiser l'audio",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemple de format de chemin de dossier valide : E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copiez-le depuis la barre d'adresse de l'explorateur de fichiers).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Le support pour la conversion phonémique, l’édition manuelle des phonèmes et la synthèse vocale par étapes sera ajouté ultérieurement.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si vous ne comprenez pas bien l'audio de référence (vous ne savez pas quoi écrire), vous pouvez activer cette option. Une fois activée, le texte de référence sera ignoré.",
|
||||
@ -104,11 +107,13 @@
|
||||
"已关闭": " Fermé",
|
||||
"已完成": " Terminé",
|
||||
"已开启": " Activé",
|
||||
"并行合成中": "Synthèse parallèle en cours",
|
||||
"并行推理": "Inférence parallèle",
|
||||
"并行推理模式已关闭": "Mode d’inférence parallèle désactivé",
|
||||
"并行推理模式已开启": "Mode d’inférence parallèle activé",
|
||||
"开启": "Activer ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Activer le mode sans texte de référence. Laisser le texte de référence vide équivaut également à activer le mode.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "Lorsque le mode d'inférence parallèle est activé, les modèles SoVITS V3/4 ne prennent pas en charge le traitement par lots, qui a donc été désactivé automatiquement.",
|
||||
"微调训练": "Entraînement de fine-tuning",
|
||||
"怎么切": "Comment découper",
|
||||
"总训练轮数total_epoch": "Nombre total d'époques d'entraînement",
|
||||
@ -140,6 +145,7 @@
|
||||
"模型": "Modèle",
|
||||
"模型分为三类:": "Les modèles sont classés en trois catégories:",
|
||||
"模型切换": "Changement de modèle",
|
||||
"模型加载中,请等待": "Le modèle est en cours de chargement, veuillez patienter...",
|
||||
"每张显卡的batch_size": "Taille de lot par carte graphique",
|
||||
"版本": "Version",
|
||||
"粤英混合": "Mélange Cantonais-Anglais",
|
||||
@ -188,6 +194,7 @@
|
||||
"进程已终止": " Processus terminé",
|
||||
"进程输出信息": " Sortie du processus",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choisissez le modèle entraîné stocké sous SoVITS_weights et GPT_weights. Par défaut, l'un d'eux est un modèle de base pour l'expérience de TTS Zero Shot de 5 secondes.",
|
||||
"采样步数(仅对V3/4生效)": "Nombre d'étapes d'échantillonnage (uniquement effectif pour V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Nombre d’étapes d’échantillonnage : si le son est bruité, essayez d’augmenter, si c’est lent, essayez de réduire",
|
||||
"重复惩罚": "Pénalité de répétition",
|
||||
"随机种子": "Graine aléatoire",
|
||||
@ -202,10 +209,14 @@
|
||||
"音频文件不存在,跳过:": "Fichier audio introuvable, passage : ",
|
||||
"音频标注WebUI": "Interface Web d'annotation audio",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Chemin d'entrée automatique de découpage audio, peut être un fichier ou un dossier",
|
||||
"音频超分中": "Upscaling audio en cours",
|
||||
"音频超分中": "Super-résolution audio en cours",
|
||||
"音频超采样": "Suréchantillonnage audio",
|
||||
"音频超采样(仅对V3生效))": "Suréchantillonnage audio (uniquement effectif pour V3)",
|
||||
"预测语义Token": "Prédire le jeton sémantique",
|
||||
"预训练GPT模型路径": "Chemin du modèle GPT pré-entraîné",
|
||||
"预训练SSL模型路径": "Chemin du modèle SSL pré-entraîné",
|
||||
"预训练SoVITS-D模型路径": "Chemin du modèle SoVITS-D pré-entraîné",
|
||||
"预训练SoVITS-G模型路径": "Chemin du modèle SoVITS-G pré-entraîné",
|
||||
"预训练中文BERT模型路径": "Chemin du modèle BERT chinois pré-entraîné"
|
||||
"预训练中文BERT模型路径": "Chemin du modèle BERT chinois pré-entraîné",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "#!底模缺失,无法加载相应 LoRA 权重"
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): È la scelta migliore per la riverberazione a due canali, ma non può rimuovere la riverberazione a canale singolo;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Rimuove gli effetti di ritardo. Aggressive è più completo di Normal nella rimozione, DeReverb rimuove ulteriormente la riverberazione, può rimuovere la riverberazione a canale singolo, ma non rimuove completamente la riverberazione a piastra ad alta frequenza.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(Instabile, non utilizzare ancora, potrebbe peggiorare le prestazioni del modello!)",
|
||||
"*实验/模型名": "*Nome dell'esperimento/modello",
|
||||
"*文本标注文件": "*File di annotazione del testo",
|
||||
"*训练集音频文件目录": "*Directory dei file audio del set di addestramento",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "Numero della scheda grafica, può essere inserito solo un numero intero",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "Numero di GPU separati da '-'; ogni numero corrisponde a un processo",
|
||||
"LoRA秩": "Rango LoRA",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Modello base SoVITS V3 mancante, impossibile caricare i pesi LoRA corrispondenti",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Addestramento SoVITS: i pesi del modello sono in SoVITS_weights/",
|
||||
"SoVITS模型列表": "Elenco dei modelli SoVITS",
|
||||
"SoVITS训练": "Addestramento SoVITS",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Invia testo: Salva manualmente i contenuti di tutti i campi testuali della pagina corrente in memoria e su file (se non premi questo pulsante prima o dopo aver girato pagina oppure prima di uscire dalla pagina di annotazione, tornando indietro i dati saranno ripristinati e avrai lavorato invano).",
|
||||
"TTS推理WebUI": "Interfaccia Web per inferenza TTS",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "Strumento UVR5 per separazione voce/accompagnamento & rimozione riverbero e latenza",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3 non supporta la modalità senza testo di riferimento! Inserisci il testo di riferimento!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Quanta proporzione dell'audio normalizzato deve essere miscelata",
|
||||
"batch_size": "Dimensione del batch",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: Come calcolare la curva del volume. Più piccolo è, maggiore è la precisione ma aumenta la complessità computazionale (non significa che una maggiore precisione dà risultati migliori)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Durata pausa tra le frasi (secondi)",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opzionale: Carica più file audio di riferimento trascinandoli (si consiglia dello stesso genere) e media il loro tono. Se questa opzione è lasciata vuota, il tono sarà controllato dal singolo file audio di riferimento a sinistra. Se si sta perfezionando il modello, è consigliato che tutti i file audio di riferimento abbiano toni presenti nel set di addestramento per il perfezionamento; il modello pre-addestrato può essere ignorato.",
|
||||
"合成语音": "Sintesi vocale",
|
||||
"合成音频": "Sintesi audio",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Formato di percorso della cartella valido: E:\\codes\\py39\\vits_vc_gpu\\Esempio di test di BaiLuShuangHua (copiare direttamente dalla barra degli indirizzi del gestore file).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "In futuro verrà aggiunto il supporto per la conversione dei fonemi, la modifica manuale dei fonemi e la sintesi vocale passo dopo passo.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Se l'audio di riferimento non è chiaro o non sai cosa scrivere, abilita questa opzione per ignorare il testo di riferimento.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": " Chiuso",
|
||||
"已完成": " Completato",
|
||||
"已开启": " Attivato",
|
||||
"并行合成中": "Sintesi parallela in corso",
|
||||
"并行推理": "Inferenza parallela",
|
||||
"并行推理模式已关闭": "Modalità di inferenza parallela disabilitata",
|
||||
"并行推理模式已开启": "Modalità di inferenza parallela abilitata",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "Mancano il modello base, non è possibile caricare i pesi LoRA corrispondenti",
|
||||
"开启": "Attiva ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Attivare la modalità senza testo di riferimento. Anche se non inserisci un testo di riferimento, la modalità verrà attivata.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "Quando la modalità di inferenza parallela è attiva, i modelli SoVITS V3/4 non supportano l'elaborazione suddivisa in gruppi e questa è stata automaticamente disattivata.",
|
||||
"微调训练": "Addestramento fine-tuning",
|
||||
"怎么切": "Come tagliare",
|
||||
"总训练轮数total_epoch": "Numero totale di epoche di addestramento",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "Modello",
|
||||
"模型分为三类:": "I modelli sono divisi in tre categorie:",
|
||||
"模型切换": "Cambio del modello",
|
||||
"模型加载中,请等待": "Il modello si sta caricando, attendere prego...",
|
||||
"每张显卡的batch_size": "Batch size per ogni scheda grafica",
|
||||
"版本": "Versione",
|
||||
"粤英混合": "Misto Cantonese-Inglese",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": " Processo terminato",
|
||||
"进程输出信息": " Output del processo",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Scegli il modello salvato in SoVITS_weights e GPT_weights dopo l'addestramento. Uno di default è il modello di base, utilizzato per l'esperienza di Zero Shot TTS in 5 secondi.",
|
||||
"采样步数(仅对V3/4生效)": "Numero di passaggi di campionamento (valido solo per V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passi di campionamento: se sembra rumoroso, prova a aumentarlo, se è lento, prova a diminuirlo",
|
||||
"重复惩罚": "Penalità di ripetizione",
|
||||
"随机种子": "Seme casuale",
|
||||
@ -203,6 +211,9 @@
|
||||
"音频标注WebUI": "Interfaccia Web per annotazione audio",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Percorso di input per la segmentazione automatica dell'audio, può essere un file o una cartella",
|
||||
"音频超分中": "Super-risoluzione audio in corso",
|
||||
"音频超采样": "Upsampling audio",
|
||||
"音频超采样(仅对V3生效))": "Upsampling audio (valido solo per V3)",
|
||||
"预测语义Token": "Predici token semantico",
|
||||
"预训练GPT模型路径": "Percorso del modello GPT pre-addestrato",
|
||||
"预训练SSL模型路径": "Percorso del modello SSL pre-addestrato",
|
||||
"预训练SoVITS-D模型路径": "Percorso del modello SoVITS-D pre-addestrato",
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):二重チャンネルのリバーブに最適な選択ですが、単一チャンネルのリバーブは除去できません;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:遅延効果を除去します。AggressiveはNormalよりも徹底的に除去し、DeReverbは追加でリバーブを除去し、モノラルリバーブを除去できますが、高周波数のプレートリバーブは完全には除去できません。",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(※不安定な機能です。使用は避けてください。モデル性能が低下する可能性があります!)",
|
||||
"*实验/模型名": "*実験/モデル名",
|
||||
"*文本标注文件": "*テキスト注釈ファイル",
|
||||
"*训练集音频文件目录": "*トレーニングデータのオーディオファイルディレクトリ",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU番号、1つの整数しか入力できません",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPUカード番号はハイフンで区切り、各カード番号ごとに1つのプロセスが実行されます",
|
||||
"LoRA秩": "LoRAランク",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 のベースモデルが不足しているため、対応する LoRA の重みをロードできません",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS トレーニング: モデルの重みファイルは SoVITS_weights/ にあります",
|
||||
"SoVITS模型列表": "SoVITSモデルリスト",
|
||||
"SoVITS训练": "SoVITSトレーニング",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "現在のページにあるすべてのテキストフィールドの内容を手動で保存します(メモリとファイルに反映)。ページ切り替えやアノテーション画面の終了前にこのボタンを押さないと、再度戻った際に変更が破棄され、作業が無駄になります。",
|
||||
"TTS推理WebUI": "TTS推論WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5ボーカルアカンパニメント分離&リバーブおよびディレイ除去ツール",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3は参照テキストなしのモードをサポートしていません。必ず参照テキストを入力してください!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:正規化後のオーディオが入る割合",
|
||||
"batch_size": "バッチサイズ",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: 音量曲線の計算方法、小さいほど精度が高くなりますが、計算量が増加します(精度が高いほど必ずしも効果が良いわけではありません)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "文間のポーズ秒数",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "オプション:複数の参照オーディオファイルをドラッグ&ドロップしてアップロードし、それらのトーンを平均化します(同性推奨)。このオプションを空白のままにした場合、トーンは左側の単一の参照オーディオによって制御されます。モデルを微調整する場合、すべての参照オーディオファイルが微調整のトレーニングセット内のトーンを持つことをお勧めします。プリトレーニングモデルは無視しても構いません。",
|
||||
"合成语音": "推論を開始",
|
||||
"合成音频": "音声を合成する",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "適切なフォルダパスの例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华テストサンプル(ファイルマネージャのアドレスバーからコピーしてください)。",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "今後、フォンメ转换、手動フォンメ編集、音声合成のステップバイステップ実行をサポートします。",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "参照音声がはっきり、または何を書くかわからない場合は、このオプションを有効にして参照テキストを無視します。",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": "閉じました",
|
||||
"已完成": "完了しました",
|
||||
"已开启": "有効化しました",
|
||||
"并行合成中": "並列合成処理中",
|
||||
"并行推理": "並列推論",
|
||||
"并行推理模式已关闭": "並列推論モードを無効化",
|
||||
"并行推理模式已开启": "並列推論モードを有効化",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "ベースモデルが不足しているため、対応する LoRA の重みをロードできません",
|
||||
"开启": "有効化",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "参照テキストなしモードを有効にします。参照テキストを入力しない場合も同様に有効になります。",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "並列推論モードが有効な場合、SoVITS V3/4モデルはバケット処理をサポートしないため、自動的に無効になっています。",
|
||||
"微调训练": "ファインチューニング",
|
||||
"怎么切": "どうやって切るか",
|
||||
"总训练轮数total_epoch": "総トレーニングエポック数total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "モデル",
|
||||
"模型分为三类:": "モデルは3種類に分かれています:",
|
||||
"模型切换": "モデル切り替え",
|
||||
"模型加载中,请等待": "モデルを読み込み中です。しばらくお待ちください...",
|
||||
"每张显卡的batch_size": "各グラフィックカードのバッチサイズ",
|
||||
"版本": "バージョン",
|
||||
"粤英混合": "粤英混合",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": "プロセスが終了しました",
|
||||
"进程输出信息": "プロセスの出力情報",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "SoVITS_weightsおよびGPT_weightsに保存されたモデルを選択します。デフォルトのものはプレトレインであり、ゼロショットTTSを体験できます。",
|
||||
"采样步数(仅对V3/4生效)": "サンプリングステップ数(V3/V4のみ有効)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "サンプリングステップ数:ノイズが強い場合は増やし、遅い場合は減らしてみてください",
|
||||
"重复惩罚": "繰り返しペナルティ",
|
||||
"随机种子": "ランダムシード",
|
||||
@ -199,13 +207,16 @@
|
||||
"韩文": "韓国語",
|
||||
"韩英混合": "韓英混合",
|
||||
"音频加载失败": "音声の読み込みに失敗しました",
|
||||
"音频文件不存在,跳过:": "オーディオファイルが見つからない。スキップ:",
|
||||
"音频标注WebUI": "音声ラベリングWebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "オーディオの自動分割入力パス、ファイルまたはフォルダを指定できます",
|
||||
"音频超分中": "音声超解像中",
|
||||
"预训练GPT模型路径": "事前にトレーニングされたGPTモデルのパス",
|
||||
"预训练SSL模型路径": "事前にトレーニングされたSSLモデルのパス",
|
||||
"预训练SoVITS-D模型路径": "事前にトレーニングされたSoVITS-Dモデルのパス",
|
||||
"预训练SoVITS-G模型路径": "事前にトレーニングされたSoVITS-Gモデルのパス",
|
||||
"预训练中文BERT模型路径": "事前にトレーニングされた中国語BERTモデルのパス"
|
||||
"音频文件不存在,跳过:": "音声ファイルが見つかりません。スキップします:",
|
||||
"音频标注WebUI": "音声アノテーション用WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音声自動分割の入力パス(ファイルまたはフォルダ指定可)",
|
||||
"音频超分中": "音声スーパーレゾリューション処理中",
|
||||
"音频超采样": "音声アップサンプリング",
|
||||
"音频超采样(仅对V3生效))": "音声アップサンプリング(V3のみ有効)",
|
||||
"预测语义Token": "意味的トークンを予測する",
|
||||
"预训练GPT模型路径": "事前学習済みGPTモデルのパス",
|
||||
"预训练SSL模型路径": "事前学習済みSSLモデルのパス",
|
||||
"预训练SoVITS-D模型路径": "事前学習済みSoVITS-Dモデルのパス",
|
||||
"预训练SoVITS-G模型路径": "事前学習済みSoVITS-Gモデルのパス",
|
||||
"预训练中文BERT模型路径": "事前学習済み中国語BERTモデルのパス"
|
||||
}
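Each of these locale files is a flat JSON dictionary: the keys are the original Chinese UI strings and the values are the translations for one language. The following is a minimal sketch of loading such a file and looking a string up, falling back to the key itself when no translation exists; the directory path and helper names are illustrative assumptions, not the project's actual loader.

import json
import os

def load_locale(lang: str, locale_dir: str = "tools/i18n/locale") -> dict:
    # Hypothetical helper: read one per-language JSON file like the ones in this diff.
    path = os.path.join(locale_dir, f"{lang}.json")
    if not os.path.exists(path):
        return {}
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

translations = load_locale("ja_JP")

def t(text: str) -> str:
    # Keys are the source (Chinese) strings; fall back to the key so that
    # untranslated entries still render the original text.
    return translations.get(text, text)

print(t("模型加载中,请等待"))  # "モデルを読み込み中です。しばらくお待ちください..."

With a fallback like this, entries that are missing or left untranslated in a locale file simply show the source string in the UI.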
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): 듀얼 채널 리버브에는 가장 적합하지만, 싱글 채널 리버브는 제거할 수 없습니다",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:지연 효과를 제거합니다. Aggressive는 Normal보다 더 철저하게 제거하며, DeReverb는 추가로 리버브를 제거하여 단일 채널 리버브를 제거할 수 있지만 고주파 리버브는 완전히 제거하지 못합니다.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(불안정 상태입니다. 사용하지 마세요. 모델 성능 저하가 발생할 수 있습니다!)",
|
||||
"*实验/模型名": "*실험/모델 이름",
|
||||
"*文本标注文件": "*텍스트 주석 파일",
|
||||
"*训练集音频文件目录": "*훈련 세트 오디오 파일 디렉터리",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU 카드 번호, 1개의 정수만 입력 가능",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU 카드 번호는 -로 구분되며 각 카드 번호에 하나의 프로세스가 있어야 함",
|
||||
"LoRA秩": "LoRA 랭크",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 기본 모델이 없어서 해당 LoRA 가중치를 로드할 수 없습니다",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 훈련: 모델 가중치 파일은 SoVITS_weights/에 있습니다",
|
||||
"SoVITS模型列表": "SoVITS 모델 목록",
|
||||
"SoVITS训练": "SoVITS훈련",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "현재 페이지의 모든 텍스트 상자 내용을 수동으로 메모리와 파일에 저장합니다. (페이지 전환 전후 또는 주석 작업을 종료하기 전에 이 버튼을 누르지 않으면, 다시 돌아왔을 때 변경 사항이 롤백되어 작업이 무효가 됩니다.)",
|
||||
"TTS推理WebUI": "TTS 추론 WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 보컬 및 반주 분리 & 리버브 제거 및 딜레이 제거 도구",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3는 참조 텍스트 없이 작동할 수 없습니다. 반드시 참조 텍스트를 입력해주세요!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "알파 믹스: 정규화된 오디오가 들어오는 비율",
|
||||
"batch_size": "배치 크기",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop 크기: 볼륨 곡선을 계산하는 방법. 작을수록 정확도가 높아지지만 계산량이 높아집니다 (정확도가 높다고 효과가 좋아지지 않음)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "문장 간 정지 시간 (초)",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "선택 사항: 여러 참조 오디오 파일을 드래그하여 업로드하고 (동일한 성별을 권장), 그들의 톤을 평균화합니다. 이 옵션을 비워두면 톤은 왼쪽의 단일 참조 오디오로 제어됩니다. 모델을 미세 조정하는 경우 모든 참조 오디오 파일이 미세 조정 훈련 세트 내의 톤을 가지고 있는 것이 좋으며, 사전 훈련된 모델은 무시할 수 있습니다.",
|
||||
"合成语音": "합성 음성",
|
||||
"合成音频": "오디오 생성",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "적절한 폴더 경로 형식 예: E:\\codes\\py39\\vits_vc_gpu\\백로서리 테스트 샘플 (파일 관리자 주소 표시줄에서 복사하면 됩니다).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "향후 음소 변환, 수동 음소 편집, 단계별 음성 합성 지원이 추가될 예정입니다.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "참고 오디오의 내용을 정확히 알아들을 수 없을 경우 이 옵션을 활성화하세요. 활성화하면 입력한 참고 텍스트를 무시합니다.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": "닫힘",
|
||||
"已完成": "완료됨",
|
||||
"已开启": "켜짐",
|
||||
"并行合成中": "병렬 오디오 생성 중",
|
||||
"并行推理": "병렬 추론",
|
||||
"并行推理模式已关闭": "병렬 추론 모드 비활성화됨",
|
||||
"并行推理模式已开启": "병렬 추론 모드 활성화됨",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "기본 모델이 없어서 해당 LoRA 가중치를 로드할 수 없습니다",
|
||||
"开启": "켜기",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "참고 텍스트 없이 모드를 활성화합니다. 참고 텍스트를 입력하지 않으면 자동으로 활성화됩니다.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "병렬 추론 모드가 활성화된 경우, SoVITS V3/4 모델은 버킷 처리를 지원하지 않으며, 자동으로 비활성화됩니다.",
|
||||
"微调训练": "미세 조정 훈련",
|
||||
"怎么切": "자르기 옵션",
|
||||
"总训练轮数total_epoch": "총 훈련 라운드 수 (total_epoch)",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "모델",
|
||||
"模型分为三类:": "모델은 3가지로 나뉩니다:",
|
||||
"模型切换": "모델 전환",
|
||||
"模型加载中,请等待": "모델을 불러오는 중입니다. 잠시 기다려주세요...",
|
||||
"每张显卡的batch_size": "각 그래픽 카드의 배치 크기",
|
||||
"版本": "버전",
|
||||
"粤英混合": "粤영 혼합",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": "프로세스 종료됨",
|
||||
"进程输出信息": "프로세스 출력 정보",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "SoVITS_weights 및 GPT_weights에 저장된 훈련 완료된 모델 중 선택. 기본적으로 하나는 기본 모델이며 5초 Zero Shot TTS를 체험할 수 있습니다.",
|
||||
"采样步数(仅对V3/4生效)": "샘플링 단계 수 (V3/V4에만 적용됨)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "샘플링 스텝: 노이즈가 느껴지면 증가, 느리다면 감소 시도",
|
||||
"重复惩罚": "반복 패널티",
|
||||
"随机种子": "랜덤 시드",
|
||||
@ -202,10 +210,13 @@
|
||||
"音频文件不存在,跳过:": "오디오 파일이 존재하지 않음, 건너뜀: ",
|
||||
"音频标注WebUI": "오디오 주석 WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "오디오 자동 분리 입력 경로, 파일 또는 폴더 가능",
|
||||
"音频超分中": "오디오 슈퍼 레졸루션 중",
|
||||
"预训练GPT模型路径": "사전 훈련된 GPT 모델 경로",
|
||||
"预训练SSL模型路径": "사전 훈련된 SSL 모델 경로",
|
||||
"预训练SoVITS-D模型路径": "사전 훈련된 SoVITS-D 모델 경로",
|
||||
"预训练SoVITS-G模型路径": "사전 훈련된 SoVITS-G 모델 경로",
|
||||
"预训练中文BERT模型路径": "사전 훈련된 중국어 BERT 모델 경로"
|
||||
"音频超分中": "오디오 슈퍼 레졸루션 처리 중",
|
||||
"音频超采样": "오디오 업샘플링",
|
||||
"音频超采样(仅对V3生效))": "오디오 업샘플링 (V3에만 적용됨)",
|
||||
"预测语义Token": "의미 기반 토큰 예측",
|
||||
"预训练GPT模型路径": "사전 학습된 GPT 모델 경로",
|
||||
"预训练SSL模型路径": "사전 학습된 SSL 모델 경로",
|
||||
"预训练SoVITS-D模型路径": "사전 학습된 SoVITS-D 모델 경로",
|
||||
"预训练SoVITS-G模型路径": "사전 학습된 SoVITS-G 모델 경로",
|
||||
"预训练中文BERT模型路径": "사전 학습된 중국어 BERT 모델 경로"
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): É a melhor opção para reverberação de dois canais, mas não pode remover a reverberação de um único canal;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:Remove os efeitos de atraso. Aggressive é mais completo que Normal na remoção, DeReverb remove adicionalmente a reverberação, pode remover a reverberação de um canal único, mas não remove completamente a reverberação de placa de alta frequência.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(Instável! Não utilize ainda, pode degradar o desempenho do modelo!)",
|
||||
"*实验/模型名": "*Nome do experimento/modelo",
|
||||
"*文本标注文件": "*Arquivo de marcação de texto",
|
||||
"*训练集音频文件目录": "*Diretório de arquivos de áudio do conjunto de treinamento",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "Número da placa de vídeo, só é possível preencher com um número inteiro",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "Número da placa de vídeo dividido por-, cada número de placa é um processo",
|
||||
"LoRA秩": "Classificação LoRA",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "O modelo base do SoVITS V3 está ausente, impossibilitando o carregamento dos pesos do LoRA",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Treinamento SoVITS: O arquivo de pesos do modelo está em SoVITS_weights/",
|
||||
"SoVITS模型列表": "Lista de modelos SoVITS",
|
||||
"SoVITS训练": "Treinamento SoVITS",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Enviar texto: Salve manualmente o conteúdo de todos os campos de texto da página atual na memória e no arquivo. Se você não clicar neste botão antes ou depois de mudar de página ou sair da tela de anotação, ao retornar as alterações serão desfeitas e todo o trabalho será perdido.",
|
||||
"TTS推理WebUI": "Inferência TTS WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 Separação de voz e acompanhamento & remoção de reverberação e atraso",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "O modo sem texto de referência não é suportado pelo V3. Por favor, forneça um texto de referência!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Em que proporção o áudio normalizado é misturado de volta",
|
||||
"batch_size": "Tamanho do Lote",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "HOP_SIZE: Como calcular a curva de volume, quanto menor a precisão, maior a quantidade de cálculos (não significa que quanto maior a precisão, melhor o efeito)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Tempo de pausa entre frases (segundos)",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Faça upload de vários arquivos de áudio de referência arrastando e soltando-os (recomendado que sejam do mesmo gênero) e faça uma média dos seus tons. Se essa opção for deixada em branco, o tom será controlado pelo único áudio de referência à esquerda. Se estiver ajustando o modelo, é recomendado que todos os arquivos de áudio de referência tenham tons dentro do conjunto de treinamento de ajuste; o modelo pré-treinado pode ser ignorado.",
|
||||
"合成语音": "Voz sintetizada",
|
||||
"合成音频": "Sintetizar áudio",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemplo de formato de caminho de pasta válido: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copie do endereço da barra do gerenciador de arquivos).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Suporte para Conversão de Fonemas, Edição Manual de Fonemas e Síntese de Fase por Fase será adicionado no futuro.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Se não conseguir entender claramente o áudio de referência, ative esta opção. Quando ativada, o texto de referência inserido será ignorado.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": " Fechado",
|
||||
"已完成": " Concluído",
|
||||
"已开启": " Ativado",
|
||||
"并行合成中": "Síntese em paralelo em andamento",
|
||||
"并行推理": "Inferência Paralela",
|
||||
"并行推理模式已关闭": "Modo de Inferência Paralela Desativado",
|
||||
"并行推理模式已开启": "Modo de Inferência Paralela Ativado",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "Falta o modelo base, não foi possível carregar os pesos LoRA correspondentes",
|
||||
"开启": "Ativar ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativar.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "Quando o modo de inferência paralela está ativado, os modelos SoVITS V3/4 não suportam processamento por lotes e esta funcionalidade foi automaticamente desativada.",
|
||||
"微调训练": "Treinamento de ajuste fino",
|
||||
"怎么切": "Como cortar",
|
||||
"总训练轮数total_epoch": "Total de epoch de treinamento",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "Modelo",
|
||||
"模型分为三类:": "Modelos dividem-se em três categorias:",
|
||||
"模型切换": "Troca de modelo",
|
||||
"模型加载中,请等待": "Carregando o modelo, por favor aguarde...",
|
||||
"每张显卡的batch_size": "Tamanho do lote de cada placa de vídeo",
|
||||
"版本": "Versão",
|
||||
"粤英混合": "Mistura Yue-Inglês",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": " Processo encerrado",
|
||||
"进程输出信息": " Informações de saída do processo",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Selecione os modelos armazenados em Sovits_weights e GPT_WEIGHTS. O padrão é o modelo inferior, experiência para 5 segundos de Zero Shot TTS",
|
||||
"采样步数(仅对V3/4生效)": "Número de passos de amostragem (apenas válido para V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Passos de Amostragem: Se parecer ruidoso, tente aumentar; se parecer lento, tente diminuir",
|
||||
"重复惩罚": "Penalidade de Repetição",
|
||||
"随机种子": "Semente Aleatória",
|
||||
@ -199,10 +207,13 @@
|
||||
"韩文": "Coreano",
|
||||
"韩英混合": "Mistura Coreano-Inglês",
|
||||
"音频加载失败": "Falha ao Carregar o Áudio",
|
||||
"音频文件不存在,跳过:": "Arquivo de Áudio Não Encontrado, Pulando: ",
|
||||
"音频标注WebUI": "WebUI de anotação de áudio",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Caminho de entrada automático de corte de áudio, pode ser um arquivo ou uma pasta",
|
||||
"音频超分中": "Super-resolução de áudio em andamento",
|
||||
"音频文件不存在,跳过:": "Arquivo de áudio não encontrado, pulando: ",
|
||||
"音频标注WebUI": "WebUI de Anotação de Áudio",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Caminho de entrada para divisão automática de áudio (arquivo ou pasta)",
|
||||
"音频超分中": "Executando Super-Resolução de Áudio",
|
||||
"音频超采样": "Superamostragem de áudio",
|
||||
"音频超采样(仅对V3生效))": "Superamostragem de áudio (apenas válida para V3)",
|
||||
"预测语义Token": "Prever token semântico",
|
||||
"预训练GPT模型路径": "Caminho do modelo GPT pré-treinado",
|
||||
"预训练SSL模型路径": "Caminho do modelo SSL pré-treinado",
|
||||
"预训练SoVITS-D模型路径": "Caminho do modelo SoVITS-D pré-treinado",
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):Это лучший выбор для реверберации с двумя каналами, но он не может устранить реверберацию с одним каналом;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:Устраняет эффект задержки. Aggressive устраняет более тщательно, чем Normal, DeReverb дополнительно устраняет реверберацию, может устранить реверберацию с одного канала, но не полностью устраняет высокочастотную реверберацию.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(Нестабильная версия, пока не используйте — возможно ухудшение качества модели!)",
|
||||
"*实验/模型名": "*Название эксперимента/модели",
|
||||
"*文本标注文件": "*Файл текстовой аннотации",
|
||||
"*训练集音频文件目录": "*Директория аудиофайлов обучающего набора",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "Номер GPU, можно указать только одно целое число",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "Номера GPU разделяются дефисом, на каждый номер отдельный процесс",
|
||||
"LoRA秩": "Ранг LoRA",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Отсутствует базовая модель SoVITS V3, невозможно загрузить соответствующие веса LoRA",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Обучение SoVITS: файлы весов модели находятся в SoVITS_weights/",
|
||||
"SoVITS模型列表": "Список моделей SoVITS",
|
||||
"SoVITS训练": "Обучение SoVITS",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Отправить текст: Вручную сохраните содержимое всех текстовых полей текущей страницы в память и файл. Если вы не нажмете эту кнопку до или после смены страницы или перед выходом из интерфейса разметки, при возврате все изменения будут отменены — работа пропадет зря.",
|
||||
"TTS推理WebUI": "TTS WebUI для инференса",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 Инструмент разделения вокала и сопровождения & удаления реверберации и задержки",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3 не поддерживает режим без опорного текста. Пожалуйста, укажите опорный текст!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Какая доля нормализованного аудио смешивается",
|
||||
"batch_size": "размер пакета",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Как рассчитывается кривая громкости, чем меньше, тем выше точность и больше вычислительная нагрузка (большая точность не всегда означает лучший результат)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Время паузы между предложениями (в секундах)",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Необязательно: загрузите несколько файлов с эталонными аудиозаписями, перетащив их (рекомендуется одного пола), и усредните их тон. Если этот параметр не заполнен, тон будет контролироваться одной эталонной аудиозаписью слева. При тонкой настройке модели рекомендуется, чтобы все эталонные аудиозаписи имели тон в пределах обучающего набора для тонкой настройки; предварительно обученную модель можно игнорировать.",
|
||||
"合成语音": "Синтезированный голос",
|
||||
"合成音频": "Синтезировать аудио",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Пример допустимого формата пути к папке: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (просто скопируйте из адресной строки файлового менеджера).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "В будущем будет добавлена поддержка преобразования в фонемы, ручного редактирования фонемов и пошагового выполнения синтеза речи.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Если невозможно разобрать речь в эталонном аудио (и непонятно, что писать), можно включить эту опцию. При включении вводимый эталонный текст будет игнорироваться.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": " Закрыто",
|
||||
"已完成": " Завершено",
|
||||
"已开启": " Включено",
|
||||
"并行合成中": "Синтез в параллельном режиме",
|
||||
"并行推理": "Параллельный вывод",
|
||||
"并行推理模式已关闭": "Режим параллельного вывода отключен",
|
||||
"并行推理模式已开启": "Режим параллельного вывода включен",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "Отсутствует базовая модель, не удалось загрузить соответствующие веса LoRA.",
|
||||
"开启": "Включить ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Включить режим без референтного текста. Не заполняя референтный текст, вы также включаете этот режим.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "При включенном режиме параллельного вывода модель SoVits V3/4 не поддерживает обработку по бакетам, эта функция была автоматически отключена.",
|
||||
"微调训练": "Обучение с тонкой настройкой",
|
||||
"怎么切": "Как разрезать",
|
||||
"总训练轮数total_epoch": "Общее количество эпох обучения total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "Модели",
|
||||
"模型分为三类:": "Модели делятся на три типа:",
|
||||
"模型切换": "Переключение модели",
|
||||
"模型加载中,请等待": "Модель загружается, пожалуйста, подождите...",
|
||||
"每张显卡的batch_size": "Размер пакета для каждой видеокарты",
|
||||
"版本": "Версия",
|
||||
"粤英混合": "Кантоно-английская смесь",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": " Процесс завершён",
|
||||
"进程输出信息": " Выходные данные процесса",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Выберите модель, сохраненную в SoVITS_weights и GPT_weights после обучения. По умолчанию используется базовая модель для 5-секундного Zero Shot TTS.",
|
||||
"采样步数(仅对V3/4生效)": "Число шагов выборки (действительно только для V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Количество шагов выборки: если кажется, что шумно, попробуйте увеличить, если кажется, что медленно, попробуйте уменьшить",
|
||||
"重复惩罚": "Штраф за повторение",
|
||||
"随机种子": "Случайное начальное значение",
|
||||
@ -199,10 +207,13 @@
|
||||
"韩文": "Корейский",
|
||||
"韩英混合": "Корейско-английская смесь",
|
||||
"音频加载失败": "Не удалось загрузить аудио",
|
||||
"音频文件不存在,跳过:": "Аудиофайл не найден, пропускается: ",
|
||||
"音频标注WebUI": "WebUI для аннотирования аудиофайлов",
|
||||
"音频文件不存在,跳过:": "Файл аудио не найден, пропускается: ",
|
||||
"音频标注WebUI": "Веб-интерфейс разметки аудио",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Путь ввода для автоматического разделения аудио, может быть файлом или папкой",
|
||||
"音频超分中": "Супер-разрешение аудио в процессе",
|
||||
"音频超采样": "Апсэмплирование аудио",
|
||||
"音频超采样(仅对V3生效))": "Апсэмплирование аудио (действительно только для V3)",
|
||||
"预测语义Token": "Предсказать семантический токен",
|
||||
"预训练GPT模型路径": "Путь к предобученной модели GPT",
|
||||
"预训练SSL模型路径": "Путь к предобученной модели SSL",
|
||||
"预训练SoVITS-D模型路径": "Путь к предобученной модели SoVITS-D",
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):İki kanallı yankılar için en iyi seçimdir, ancak tek kanallı yankıları ortadan kaldıramaz;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:Gecikme etkilerini giderir. Aggressive, Normal'dan daha kapsamlı bir şekilde giderir, DeReverb ek olarak yankıyı giderir, tek kanallı yankıyı giderebilir, ancak yüksek frekanslı plaka yankısını tamamen gideremez.",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(Kararsız durumda, henüz kullanmayın! Model performansını düşürebilir!)",
|
||||
"*实验/模型名": "*Deney/model adı",
|
||||
"*文本标注文件": "*Metin etiketleme dosyası",
|
||||
"*训练集音频文件目录": "*Eğitim seti ses dosyası dizini",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU kart numarası, sadece bir tamsayı girilebilir",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU kart numaraları - ile ayrılır, her kart numarası için bir işlem",
|
||||
"LoRA秩": "LoRA Derecesi",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 temel modeli eksik, ilgili LoRA ağırlıkları yüklenemiyor",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Eğitimi: Model ağırlık dosyaları SoVITS_weights/ içinde",
|
||||
"SoVITS模型列表": "SoVITS model listesi",
|
||||
"SoVITS训练": "SoVITS Eğitimi",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Geçerli sayfadaki tüm metin kutusu içeriklerini manuel olarak belleğe ve dosyaya kaydedin. Bu butona sayfa değiştirmeden önce ya da etiketleme sayfasından çıkmadan önce tıklamazsanız, geri döndüğünüzde değişiklikler geri alınıp tüm işlemler boşa gidecektir.",
|
||||
"TTS推理WebUI": "TTS Çıkarım WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 İnsan Sesli ve Enstrümantal Ayrım & Reverb ve Gecikme Giderme Aracı",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3, referans metin olmadan çalışmayı desteklememektedir! Lütfen bir referans metin giriniz!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Normalizasyondan sonraki sesin ne kadarlık bir oranı karıştırılsın",
|
||||
"batch_size": "Toplu Boyut",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Ses seviyesi eğrisi nasıl hesaplanır, ne kadar küçükse hassasiyet o kadar yüksek ve hesaplama yükü o kadar artar (hassasiyet arttıkça etki mutlaka daha iyi olmaz)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "Cümleler Arası Duraklama Süresi",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "İsteğe bağlı: Birden fazla referans ses dosyasını sürükleyip bırakarak yükleyin (aynı cinsiyetten olmaları önerilir) ve tonlarını ortalayın. Bu seçenek boş bırakılırsa, ton soldaki tek referans ses dosyası tarafından kontrol edilir. Modeli ince ayar yapıyorsanız, tüm referans ses dosyalarının ince ayar eğitim seti içindeki tonlara sahip olması önerilir; önceden eğitilmiş model dikkate alınmayabilir.",
|
||||
"合成语音": "Ses sentezi",
|
||||
"合成音频": "Ses Sentezleme",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Geçerli klasör yolu formatı örneği: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (dosya yöneticisi adres çubuğundan kopyalayabilirsiniz).",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Sonraki sürümlerde fonem dönüşümü, el ile fonem düzenleme ve adım adım konuşma sentezi desteği eklenecek.",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Referans ses kaydını anlamıyorsanız (ne yazacağınızı bilmiyorsanız) açabilirsiniz. Açıldığında yazılmış olan referans metni göz ardı edilir.",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": " Kapalı",
|
||||
"已完成": " Tamamlandı",
|
||||
"已开启": " Açık",
|
||||
"并行合成中": "Paralel Sentezleme Yapılıyor",
|
||||
"并行推理": "Paralel Çıkarım",
|
||||
"并行推理模式已关闭": "Paralel Çıkarım Modu Kapalı",
|
||||
"并行推理模式已开启": "Paralel Çıkarım Modu Etkin",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "Temel model eksik, ilgili LoRA ağırlıkları yüklenemedi.",
|
||||
"开启": "Aç ",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "Referans metni olmayan mod açık. Referans metni doldurulmazsa bu mod otomatik olarak açılır.",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "Paralel çıkarım modu etkinleştirildiğinde, SoVITS V3/4 modelleri bölme işlemeyi desteklemez ve bu işlem otomatik olarak devre dışı bırakılır.",
|
||||
"微调训练": "İnce Ayar Eğitimi",
|
||||
"怎么切": "Nasıl kesilir",
|
||||
"总训练轮数total_epoch": "Toplam eğitim turu sayısı total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "Model",
|
||||
"模型分为三类:": "Modeller üç türdedir:",
|
||||
"模型切换": "Model değiştirme",
|
||||
"模型加载中,请等待": "Model yükleniyor, lütfen bekleyin...",
|
||||
"每张显卡的batch_size": "Her bir ekran kartı için batch_size",
|
||||
"版本": "Versiyon",
|
||||
"粤英混合": "Yue-İngilizce Karışık",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": " İşlem Sonlandırıldı",
|
||||
"进程输出信息": " İşlem Çıktı Bilgisi",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Eğitimi tamamlanmış ve SoVITS_weights ile GPT_weights altına kaydedilmiş modeli seçin. Varsayılan bir temel modeldir, 5 saniyelik Zero Shot TTS deneyimi için kullanılır.",
|
||||
"采样步数(仅对V3/4生效)": "Örnekleme Adım Sayısı (Sadece V3/4 için geçerli)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Örnekleme Adımları: Eğer gürültülü görünüyorsa, adımları artırın; eğer yavaş görünüyorsa, adımları azaltın",
|
||||
"重复惩罚": "Tekrarlama Cezası",
|
||||
"随机种子": "Rastgele Tohum",
|
||||
@ -199,10 +207,13 @@
|
||||
"韩文": "Korece",
|
||||
"韩英混合": "Korece-İngilizce Karışık",
|
||||
"音频加载失败": "Ses Yüklenemedi",
|
||||
"音频文件不存在,跳过:": "Ses Dosyası Bulunamadı, Atlanıyor: ",
|
||||
"音频文件不存在,跳过:": "Ses dosyası bulunamadı, atlanıyor: ",
|
||||
"音频标注WebUI": "Ses Etiketleme WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "Ses otomatik bölme giriş yolu, dosya veya klasör olabilir",
|
||||
"音频超分中": "Ses Süper Çözünürlük Yapılıyor",
|
||||
"音频超分中": "Ses Süper Çözünürlük İşlemi Devam Ediyor",
|
||||
"音频超采样": "Ses Üst-örnekleme",
|
||||
"音频超采样(仅对V3生效))": "Ses Üst-örnekleme (Sadece V3 için geçerli)",
|
||||
"预测语义Token": "Anlamsal Token Tahmini",
|
||||
"预训练GPT模型路径": "Önceden Eğitilmiş GPT Modeli Yolu",
|
||||
"预训练SSL模型路径": "Önceden Eğitilmiş SSL Modeli Yolu",
|
||||
"预训练SoVITS-D模型路径": "Önceden Eğitilmiş SoVITS-D Modeli Yolu",
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:去除延迟效果。Aggressive 比 Normal 去除得更彻底,DeReverb 额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(不稳定,先别用,可能劣化模型效果!)",
|
||||
"*实验/模型名": "*实验/模型名",
|
||||
"*文本标注文件": "*文本标注文件",
|
||||
"*训练集音频文件目录": "*训练集音频文件目录",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU卡号,只能填1个整数",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU卡号以-分割,每个卡号一个进程",
|
||||
"LoRA秩": "LoRA秩",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,无法加载相应 LoRA 权重",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 训练: 模型权重文件在 SoVITS_weights/",
|
||||
"SoVITS模型列表": "SoVITS模型列表",
|
||||
"SoVITS训练": "SoVITS训练",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)",
|
||||
"TTS推理WebUI": "TTS推理WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人声伴奏分离&去混响去延迟工具",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3不支持无参考文本模式,请填写参考文本!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例归一化后音频进来",
|
||||
"batch_size": "batch_size",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "句间停顿秒数",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。",
|
||||
"合成语音": "合成语音",
|
||||
"合成音频": "合成音频",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "后续将支持转音素、手工修改音素、语音合成分步执行。",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": "已关闭",
|
||||
"已完成": "已完成",
|
||||
"已开启": "已开启",
|
||||
"并行合成中": "并行合成中",
|
||||
"并行推理": "并行推理",
|
||||
"并行推理模式已关闭": "并行推理模式已关闭",
|
||||
"并行推理模式已开启": "并行推理模式已开启",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,无法加载相应 LoRA 权重",
|
||||
"开启": "开启",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "开启无参考文本模式。不填参考文本亦相当于开启。",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理",
|
||||
"微调训练": "微调训练",
|
||||
"怎么切": "怎么切",
|
||||
"总训练轮数total_epoch": "总训练轮数total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "模型",
|
||||
"模型分为三类:": "模型分为三类:",
|
||||
"模型切换": "模型切换",
|
||||
"模型加载中,请等待": "模型加载中,请等待",
|
||||
"每张显卡的batch_size": "每张显卡的batch_size",
|
||||
"版本": "版本",
|
||||
"粤英混合": "粤英混合",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": "进程已终止",
|
||||
"进程输出信息": "进程输出信息",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。",
|
||||
"采样步数(仅对V3/4生效)": "采样步数(仅对V3/4生效)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试",
|
||||
"重复惩罚": "重复惩罚",
|
||||
"随机种子": "随机种子",
|
||||
@ -203,6 +211,9 @@
|
||||
"音频标注WebUI": "音频标注WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹",
|
||||
"音频超分中": "音频超分中",
|
||||
"音频超采样": "音频超采样",
|
||||
"音频超采样(仅对V3生效))": "音频超采样(仅对V3生效))",
|
||||
"预测语义Token": "预测语义Token",
|
||||
"预训练GPT模型路径": "预训练GPT模型路径",
|
||||
"预训练SSL模型路径": "预训练SSL模型路径",
|
||||
"预训练SoVITS-D模型路径": "预训练SoVITS-D模型路径",
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最佳選擇,但不能去除單通道混響;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: 去除延遲效果。Aggressive 比 Normal 去除得更徹底,DeReverb 額外去除混響,可去除單聲道混響,但對高頻重的板式混響去不乾淨。",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(不穩定,暫時勿用,可能會導致模型效能下降!)",
|
||||
"*实验/模型名": "*實驗/模型名",
|
||||
"*文本标注文件": "*文本標注文件",
|
||||
"*训练集音频文件目录": "*訓練集音頻文件目錄",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
|
||||
"LoRA秩": "LoRA秩",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,無法加載相應 LoRA 權重",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重檔案在 SoVITS_weights/",
|
||||
"SoVITS模型列表": "SoVITS模型列表",
|
||||
"SoVITS训练": "SoVITS訓練",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "提交文字:手動儲存目前頁面所有文字方塊內容至記憶體及檔案。若在換頁前後或離開標註頁面前未按下此按鈕,當你返回時變更將會還原,工作便會白費。",
|
||||
"TTS推理WebUI": "TTS推理WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3 不支援無參考文字模式,請填寫參考文字!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
|
||||
"batch_size": "批次大小",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "句間停頓秒數",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
|
||||
"合成语音": "合成語音",
|
||||
"合成音频": "合成音訊",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試樣例(去文件管理器地址欄拷就行了)。",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開。開啟後無視填寫的參考文本。",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": "已關閉",
|
||||
"已完成": "已完成",
|
||||
"已开启": "已開啟",
|
||||
"并行合成中": "平行合成中",
|
||||
"并行推理": "並行推理",
|
||||
"并行推理模式已关闭": "並行推理模式已關閉",
|
||||
"并行推理模式已开启": "並行推理模式已開啟",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,無法加載相應 LoRA 權重",
|
||||
"开启": "開啟",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "當啟用平行推論模式時,SoVITS V3/4 模型不支援分倉處理,已自動關閉分倉處理。",
|
||||
"微调训练": "微調訓練",
|
||||
"怎么切": "怎麼切",
|
||||
"总训练轮数total_epoch": "總訓練輪數total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "模型",
|
||||
"模型分为三类:": "模型分為三類:",
|
||||
"模型切换": "模型切換",
|
||||
"模型加载中,请等待": "模型載入中,請稍等",
|
||||
"每张显卡的batch_size": "每張顯卡的batch_size",
|
||||
"版本": "版本",
|
||||
"粤英混合": "粵英混合",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": "進程已終止",
|
||||
"进程输出信息": "進程輸出信息",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模,體驗5秒Zero Shot TTS用。",
|
||||
"采样步数(仅对V3/4生效)": "取樣步數(僅適用於 V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
|
||||
"重复惩罚": "重複懲罰",
|
||||
"随机种子": "隨機種子",
|
||||
@ -200,12 +208,15 @@
|
||||
"韩英混合": "韓英混合",
|
||||
"音频加载失败": "無法加載音頻",
|
||||
"音频文件不存在,跳过:": "音頻檔案不存在,跳過:",
|
||||
"音频标注WebUI": "音頻標註WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
|
||||
"音频超分中": "音頻超分中",
|
||||
"预训练GPT模型路径": "預訓練GPT模型路徑",
|
||||
"预训练SSL模型路径": "預訓練SSL模型路徑",
|
||||
"预训练SoVITS-D模型路径": "預訓練SoVITS-D模型路徑",
|
||||
"预训练SoVITS-G模型路径": "預訓練SoVITS-G模型路徑",
|
||||
"预训练中文BERT模型路径": "預訓練中文BERT模型路徑"
|
||||
"音频标注WebUI": "音頻標註 WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音頻自動分割輸入路徑,可為檔案或資料夾",
|
||||
"音频超分中": "音頻超高解像度處理中",
|
||||
"音频超采样": "音頻超取樣",
|
||||
"音频超采样(仅对V3生效))": "音頻超取樣(僅適用於 V3)",
|
||||
"预测语义Token": "預測語意 Token",
|
||||
"预训练GPT模型路径": "預訓練 GPT 模型路徑",
|
||||
"预训练SSL模型路径": "預訓練 SSL 模型路徑",
|
||||
"预训练SoVITS-D模型路径": "預訓練 SoVITS-D 模型路徑",
|
||||
"预训练SoVITS-G模型路径": "預訓練 SoVITS-G 模型路徑",
|
||||
"预训练中文BERT模型路径": "預訓練中文 BERT 模型路徑"
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最好的選擇,不能去除單通道混響;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:去除延遲效果。Aggressive 比 Normal 去除得更徹底,DeReverb 額外去除混響,可去除單聲道混響,但是對高頻重的板式混響去不乾淨。",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(不稳定,暂时别用,可能会导致模型效果下降!)",
|
||||
"*实验/模型名": "*實驗/模型名",
|
||||
"*文本标注文件": "*文本標註文件",
|
||||
"*训练集音频文件目录": "*訓練集音頻文件目錄",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
|
||||
"LoRA秩": "LoRA秩",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,無法加載相應 LoRA 權重",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重文件在 SoVITS_weights/ 目錄下",
|
||||
"SoVITS模型列表": "SoVITS模型列表",
|
||||
"SoVITS训练": "SoVITS訓練",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "提交文字:手动保存当前页面所有文本框内容至内存和文件。若您在换页前后或退出标注页面前没有点击此按钮,再次返回时更改将会被撤销,您的工作便会白费。",
|
||||
"TTS推理WebUI": "TTS推理WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3 不支持无参考文本模式,请填写参考文本!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
|
||||
"batch_size": "批次大小",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "句間停頓秒數",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
|
||||
"合成语音": "合成語音",
|
||||
"合成音频": "合成音频",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開。開啟後無視填寫的參考文本。",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": "已關閉",
|
||||
"已完成": "已完成",
|
||||
"已开启": "已開啟",
|
||||
"并行合成中": "并行合成中",
|
||||
"并行推理": "並行推理",
|
||||
"并行推理模式已关闭": "並行推理模式已關閉",
|
||||
"并行推理模式已开启": "並行推理模式已開啟",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,无法加载相应 LoRA 权重",
|
||||
"开启": "開啟",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "当启用并行推理模式时,SoVits V3/4 模型不支持分桶处理,已自动关闭分桶处理。",
|
||||
"微调训练": "微調訓練",
|
||||
"怎么切": "怎麼切",
|
||||
"总训练轮数total_epoch": "總訓練輪數total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "模型",
|
||||
"模型分为三类:": "模型分為三類:",
|
||||
"模型切换": "模型切換",
|
||||
"模型加载中,请等待": "模型加载中,请等待",
|
||||
"每张显卡的batch_size": "每張顯卡的batch_size",
|
||||
"版本": "版本",
|
||||
"粤英混合": "粵英混合",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": "進程已終止",
|
||||
"进程输出信息": "進程輸出信息",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模,體驗5秒Zero Shot TTS用。",
|
||||
"采样步数(仅对V3/4生效)": "采样步数(仅适用于 V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
|
||||
"重复惩罚": "重複懲罰",
|
||||
"随机种子": "隨機種子",
|
||||
@ -199,13 +207,16 @@
|
||||
"韩文": "韓文",
|
||||
"韩英混合": "韓英混合",
|
||||
"音频加载失败": "無法加載音頻",
|
||||
"音频文件不存在,跳过:": "音檔不存在,跳過:",
|
||||
"音频标注WebUI": "音頻標註WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
|
||||
"音频超分中": "音頻超分中",
|
||||
"预训练GPT模型路径": "預訓練GPT模型路徑",
|
||||
"预训练SSL模型路径": "預訓練SSL模型路徑",
|
||||
"预训练SoVITS-D模型路径": "預訓練SoVITS-D模型路徑",
|
||||
"预训练SoVITS-G模型路径": "預訓練SoVITS-G模型路徑",
|
||||
"预训练中文BERT模型路径": "預訓練中文BERT模型路徑"
|
||||
"音频文件不存在,跳过:": "音频文件不存在,跳过:",
|
||||
"音频标注WebUI": "音频标注 WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹",
|
||||
"音频超分中": "音频超分辨率处理中",
|
||||
"音频超采样": "音频超采样",
|
||||
"音频超采样(仅对V3生效))": "音频超采样(仅适用于 V3)",
|
||||
"预测语义Token": "预测语义 Token",
|
||||
"预训练GPT模型路径": "预训练 GPT 模型路径",
|
||||
"预训练SSL模型路径": "预训练 SSL 模型路径",
|
||||
"预训练SoVITS-D模型路径": "预训练 SoVITS-D 模型路径",
|
||||
"预训练SoVITS-G模型路径": "预训练 SoVITS-G 模型路径",
|
||||
"预训练中文BERT模型路径": "预训练中文 BERT 模型路径"
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
{
|
||||
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最好的選擇,不能去除單通道混響;",
|
||||
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:去除延遲效果。Aggressive 比 Normal 去除得更徹底,DeReverb 額外去除混響,可去除單聲道混響,但是對高頻重的板式混響去不乾淨。",
|
||||
"(不稳定,先别用,可能劣化模型效果!)": "(不穩定,請暫時勿用,可能導致模型效果變差!)",
|
||||
"*实验/模型名": "*實驗/模型名",
|
||||
"*文本标注文件": "*文本標注文件",
|
||||
"*训练集音频文件目录": "*訓練集音頻文件目錄",
|
||||
@ -25,12 +26,13 @@
|
||||
"GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
|
||||
"GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
|
||||
"LoRA秩": "LoRA階",
|
||||
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "SoVITS V3 底模缺失,無法加載相應 LoRA 權重",
|
||||
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS 訓練: 模型權重文件在 SoVITS_weights/",
|
||||
"SoVITS模型列表": "SoVITS模型列表",
|
||||
"SoVITS训练": "SoVITS訓練",
|
||||
"Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "送出文字:手動儲存目前頁面所有文字欄位內容至記憶體與檔案。若您在換頁前後或離開標註頁面前未按下此按鈕,當您返回時異動將會復原,辛苦的工作便會白費。",
|
||||
"TTS推理WebUI": "TTS推理WebUI",
|
||||
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5人聲伴奏分離&去混響去延遲工具",
|
||||
"V3不支持无参考文本模式,请填写参考文本!": "V3 不支援無參考文字模式,請填寫參考文字!",
|
||||
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
|
||||
"batch_size": "批次大小",
|
||||
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
|
||||
@ -87,6 +89,7 @@
|
||||
"句间停顿秒数": "句間停頓秒數",
|
||||
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "可選項:通過拖曳多個文件上傳多個參考音頻(建議同性),平均融合他們的音色。如不填寫此項,音色由左側單個參考音頻控制。如是微調模型,建議參考音頻全部在微調訓練集音色內,底模不用管。",
|
||||
"合成语音": "合成語音",
|
||||
"合成音频": "合成音訊",
|
||||
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
|
||||
"后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支援轉音素、手工修改音素、語音合成分步執行。",
|
||||
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。",
|
||||
@ -104,11 +107,14 @@
|
||||
"已关闭": "已關閉",
|
||||
"已完成": "已完成",
|
||||
"已开启": "已開啟",
|
||||
"并行合成中": "平行合成中",
|
||||
"并行推理": "並行推理",
|
||||
"并行推理模式已关闭": "並行推理模式已關閉",
|
||||
"并行推理模式已开启": "並行推理模式已開啟",
|
||||
"底模缺失,无法加载相应 LoRA 权重": "底模缺失,無法載入相應 LoRA 權重",
|
||||
"开启": "開啟",
|
||||
"开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
|
||||
"当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "啟用平行推論模式時,SoVITS V3/4 模型不支援分倉處理,已自動關閉該功能。",
|
||||
"微调训练": "微調訓練",
|
||||
"怎么切": "怎麼切",
|
||||
"总训练轮数total_epoch": "總訓練輪數total_epoch",
|
||||
@ -140,6 +146,7 @@
|
||||
"模型": "模型",
|
||||
"模型分为三类:": "模型分為三類:",
|
||||
"模型切换": "模型切換",
|
||||
"模型加载中,请等待": "模型載入中,請稍候",
|
||||
"每张显卡的batch_size": "每張顯卡的batch_size",
|
||||
"版本": "版本",
|
||||
"粤英混合": "粵英混合",
|
||||
@ -188,6 +195,7 @@
|
||||
"进程已终止": "進程已終止",
|
||||
"进程输出信息": "進程輸出資訊",
|
||||
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模,體驗5秒Zero Shot TTS用。",
|
||||
"采样步数(仅对V3/4生效)": "取樣步數(僅適用於 V3/4)",
|
||||
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "採樣步數,如果覺得電,提高試試,如果覺得慢,降低試試",
|
||||
"重复惩罚": "重複懲罰",
|
||||
"随机种子": "隨機種子",
|
||||
@ -199,13 +207,16 @@
|
||||
"韩文": "韓文",
|
||||
"韩英混合": "韓英混合",
|
||||
"音频加载失败": "無法加載音頻",
|
||||
"音频文件不存在,跳过:": "音檔不存在,跳過:",
|
||||
"音频标注WebUI": "音頻標註WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
|
||||
"音频超分中": "音頻超分中",
|
||||
"预训练GPT模型路径": "預訓練GPT模型路徑",
|
||||
"预训练SSL模型路径": "預訓練SSL模型路徑",
|
||||
"预训练SoVITS-D模型路径": "預訓練SoVITS-D模型路徑",
|
||||
"预训练SoVITS-G模型路径": "預訓練SoVITS-G模型路徑",
|
||||
"预训练中文BERT模型路径": "預訓練中文BERT模型路徑"
|
||||
"音频文件不存在,跳过:": "音訊檔案不存在,跳過:",
|
||||
"音频标注WebUI": "音訊標註 WebUI",
|
||||
"音频自动切分输入路径,可文件可文件夹": "音訊自動切割輸入路徑,可為檔案或資料夾",
|
||||
"音频超分中": "音訊超高解析度處理中",
|
||||
"音频超采样": "音訊超取樣",
|
||||
"音频超采样(仅对V3生效))": "音訊超取樣(僅適用於 V3)",
|
||||
"预测语义Token": "預測語意 Token",
|
||||
"预训练GPT模型路径": "預訓練 GPT 模型路徑",
|
||||
"预训练SSL模型路径": "預訓練 SSL 模型路徑",
|
||||
"预训练SoVITS-D模型路径": "預訓練 SoVITS-D 模型路徑",
|
||||
"预训练SoVITS-G模型路径": "預訓練 SoVITS-G 模型路徑",
|
||||
"预训练中文BERT模型路径": "預訓練中文 BERT 模型路徑"
|
||||
}
@@ -109,7 +109,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
         if os.path.exists(wav_path):
             ...
         else:
-            gr.Warning(i18n("路径错误"))
+            gr.Warning(wav_path+i18n("路径错误"))
             return
     if is_train:
         path_list.append(os.path.join(path_list[0], "2-name2text.txt"))
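For context, a minimal sketch of the pattern the hunk above changes: when a referenced audio path does not exist, the warning now has the offending path prepended to the localized "路径错误" message, so the user can tell which entry failed. The function and argument names below are illustrative assumptions; only gr.Warning and the message pattern come from the hunk.

import os
import gradio as gr

def warn_if_missing(wav_path: str, i18n) -> bool:
    # Same pattern as the hunk above: prepend the missing path to the
    # localized error so the warning identifies the broken entry.
    if os.path.exists(wav_path):
        return True
    gr.Warning(wav_path + i18n("路径错误"))
    return False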